Merge tag 'kvm-3.10-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
author    Linus Torvalds <torvalds@linux-foundation.org>
          Sun, 5 May 2013 21:47:31 +0000 (14:47 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Sun, 5 May 2013 21:47:31 +0000 (14:47 -0700)
Pull kvm updates from Gleb Natapov:
 "Highlights of the updates are:

  general:
   - new emulated device API
   - legacy device assignment is now optional
   - irqfd interface is more generic and can be shared between arches

  x86:
   - VMCS shadow support and other nested VMX improvements
   - APIC virtualization and Posted Interrupt hardware support
   - Optimize mmio spte zapping

  ppc:
    - BookE: in-kernel MPIC emulation with irqfd support
    - Book3S: in-kernel XICS emulation (incomplete)
    - Book3S: HV: migration fixes
    - BookE: more debug support preparation
    - BookE: e6500 support

  ARM:
   - reworking of Hyp idmaps

  s390:
   - ioeventfd for virtio-ccw

  And many other bug fixes, cleanups and improvements"

* tag 'kvm-3.10-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (204 commits)
  kvm: Add compat_ioctl for device control API
  KVM: x86: Account for failing enable_irq_window for NMI window request
  KVM: PPC: Book3S: Add API for in-kernel XICS emulation
  kvm/ppc/mpic: fix missing unlock in set_base_addr()
  kvm/ppc: Hold srcu lock when calling kvm_io_bus_read/write
  kvm/ppc/mpic: remove users
  kvm/ppc/mpic: fix mmio region lists when multiple guests used
  kvm/ppc/mpic: remove default routes from documentation
  kvm: KVM_CAP_IOMMU only available with device assignment
  ARM: KVM: iterate over all CPUs for CPU compatibility check
  KVM: ARM: Fix spelling in error message
  ARM: KVM: define KVM_ARM_MAX_VCPUS unconditionally
  KVM: ARM: Fix API documentation for ONE_REG encoding
  ARM: KVM: promote vfp_host pointer to generic host cpu context
  ARM: KVM: add architecture specific hook for capabilities
  ARM: KVM: perform HYP initilization for hotplugged CPUs
  ARM: KVM: switch to a dual-step HYP init code
  ARM: KVM: rework HYP page table freeing
  ARM: KVM: enforce maximum size for identity mapped code
  ARM: KVM: move to a KVM provided HYP idmap
  ...

110 files changed:
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/devices/README [new file with mode: 0644]
Documentation/virtual/kvm/devices/mpic.txt [new file with mode: 0644]
Documentation/virtual/kvm/devices/xics.txt [new file with mode: 0644]
arch/arm/include/asm/idmap.h
arch/arm/include/asm/kvm_host.h
arch/arm/include/asm/kvm_mmu.h
arch/arm/kernel/asm-offsets.c
arch/arm/kernel/vmlinux.lds.S
arch/arm/kvm/Kconfig
arch/arm/kvm/Makefile
arch/arm/kvm/arch_timer.c
arch/arm/kvm/arm.c
arch/arm/kvm/init.S
arch/arm/kvm/mmu.c
arch/arm/kvm/perf.c [new file with mode: 0644]
arch/arm/mm/idmap.c
arch/ia64/include/asm/kvm_host.h
arch/ia64/include/uapi/asm/kvm.h
arch/ia64/kvm/Kconfig
arch/ia64/kvm/Makefile
arch/ia64/kvm/kvm-ia64.c
arch/ia64/kvm/lapic.h
arch/powerpc/include/asm/hvcall.h
arch/powerpc/include/asm/kvm_book3s.h
arch/powerpc/include/asm/kvm_book3s_64.h
arch/powerpc/include/asm/kvm_book3s_asm.h
arch/powerpc/include/asm/kvm_booke.h
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/uapi/asm/kvm.h
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kvm/44x.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s.c
arch/powerpc/kvm/book3s_64_mmu_hv.c
arch/powerpc/kvm/book3s_emulate.c
arch/powerpc/kvm/book3s_hv.c
arch/powerpc/kvm/book3s_hv_rm_mmu.c
arch/powerpc/kvm/book3s_hv_rm_xics.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_hv_rmhandlers.S
arch/powerpc/kvm/book3s_pr.c
arch/powerpc/kvm/book3s_pr_papr.c
arch/powerpc/kvm/book3s_rtas.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_xics.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_xics.h [new file with mode: 0644]
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/booke_interrupts.S
arch/powerpc/kvm/e500.c
arch/powerpc/kvm/e500.h
arch/powerpc/kvm/e500_emulate.c
arch/powerpc/kvm/e500_mmu.c
arch/powerpc/kvm/e500mc.c
arch/powerpc/kvm/emulate.c
arch/powerpc/kvm/irq.h [new file with mode: 0644]
arch/powerpc/kvm/mpic.c [new file with mode: 0644]
arch/powerpc/kvm/powerpc.c
arch/powerpc/sysdev/xics/icp-native.c
arch/s390/include/uapi/asm/Kbuild
arch/s390/include/uapi/asm/virtio-ccw.h [new file with mode: 0644]
arch/s390/kvm/Kconfig
arch/s390/kvm/Makefile
arch/s390/kvm/diag.c
arch/s390/kvm/gaccess.h
arch/s390/kvm/intercept.c
arch/s390/kvm/interrupt.c
arch/s390/kvm/kvm-s390.c
arch/s390/kvm/kvm-s390.h
arch/s390/kvm/priv.c
arch/x86/include/asm/entry_arch.h
arch/x86/include/asm/hardirq.h
arch/x86/include/asm/hw_irq.h
arch/x86/include/asm/irq_vectors.h
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/vmx.h
arch/x86/include/uapi/asm/kvm.h
arch/x86/include/uapi/asm/msr-index.h
arch/x86/include/uapi/asm/vmx.h
arch/x86/kernel/entry_64.S
arch/x86/kernel/irq.c
arch/x86/kernel/irqinit.c
arch/x86/kernel/kvmclock.c
arch/x86/kvm/Kconfig
arch/x86/kvm/Makefile
arch/x86/kvm/emulate.c
arch/x86/kvm/i8254.c
arch/x86/kvm/lapic.c
arch/x86/kvm/lapic.h
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/pmu.c
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
drivers/s390/kvm/kvm_virtio.c
drivers/s390/kvm/virtio_ccw.c
include/linux/kvm_host.h
include/trace/events/kvm.h
include/uapi/linux/kvm.h
virt/kvm/Kconfig
virt/kvm/assigned-dev.c
virt/kvm/eventfd.c
virt/kvm/ioapic.c
virt/kvm/ioapic.h
virt/kvm/irq_comm.c
virt/kvm/irqchip.c [new file with mode: 0644]
virt/kvm/kvm_main.c

index 119358dfb74295af7e95c6ecdcda35e9cf1d8f17..5f91eda9164713faa1f66a613a998d36b44f5191 100644 (file)
@@ -1486,15 +1486,23 @@ struct kvm_ioeventfd {
        __u8  pad[36];
 };
 
+For the special case of virtio-ccw devices on s390, the ioevent is matched
+to a subchannel/virtqueue tuple instead.
+
 The following flags are defined:
 
 #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
 #define KVM_IOEVENTFD_FLAG_PIO       (1 << kvm_ioeventfd_flag_nr_pio)
 #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
+#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
+       (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
 
 If datamatch flag is set, the event will be signaled only if the written value
 to the registered address is equal to datamatch in struct kvm_ioeventfd.
 
+For virtio-ccw devices, addr contains the subchannel id and datamatch the
+virtqueue index.
+
 
 4.60 KVM_DIRTY_TLB
 
@@ -1780,27 +1788,48 @@ registers, find a list below:
   PPC   | KVM_REG_PPC_VPA_DTL   | 128
   PPC   | KVM_REG_PPC_EPCR     | 32
   PPC   | KVM_REG_PPC_EPR      | 32
+  PPC   | KVM_REG_PPC_TCR      | 32
+  PPC   | KVM_REG_PPC_TSR      | 32
+  PPC   | KVM_REG_PPC_OR_TSR   | 32
+  PPC   | KVM_REG_PPC_CLEAR_TSR        | 32
+  PPC   | KVM_REG_PPC_MAS0     | 32
+  PPC   | KVM_REG_PPC_MAS1     | 32
+  PPC   | KVM_REG_PPC_MAS2     | 64
+  PPC   | KVM_REG_PPC_MAS7_3   | 64
+  PPC   | KVM_REG_PPC_MAS4     | 32
+  PPC   | KVM_REG_PPC_MAS6     | 32
+  PPC   | KVM_REG_PPC_MMUCFG   | 32
+  PPC   | KVM_REG_PPC_TLB0CFG  | 32
+  PPC   | KVM_REG_PPC_TLB1CFG  | 32
+  PPC   | KVM_REG_PPC_TLB2CFG  | 32
+  PPC   | KVM_REG_PPC_TLB3CFG  | 32
+  PPC   | KVM_REG_PPC_TLB0PS   | 32
+  PPC   | KVM_REG_PPC_TLB1PS   | 32
+  PPC   | KVM_REG_PPC_TLB2PS   | 32
+  PPC   | KVM_REG_PPC_TLB3PS   | 32
+  PPC   | KVM_REG_PPC_EPTCFG   | 32
+  PPC   | KVM_REG_PPC_ICP_STATE | 64
 
 ARM registers are mapped using the lower 32 bits.  The upper 16 of that
 is the register group type, or coprocessor number:
 
 ARM core registers have the following id bit patterns:
-  0x4002 0000 0010 <index into the kvm_regs struct:16>
+  0x4020 0000 0010 <index into the kvm_regs struct:16>
 
 ARM 32-bit CP15 registers have the following id bit patterns:
-  0x4002 0000 000F <zero:1> <crn:4> <crm:4> <opc1:4> <opc2:3>
+  0x4020 0000 000F <zero:1> <crn:4> <crm:4> <opc1:4> <opc2:3>
 
 ARM 64-bit CP15 registers have the following id bit patterns:
-  0x4003 0000 000F <zero:1> <zero:4> <crm:4> <opc1:4> <zero:3>
+  0x4030 0000 000F <zero:1> <zero:4> <crm:4> <opc1:4> <zero:3>
 
 ARM CCSIDR registers are demultiplexed by CSSELR value:
-  0x4002 0000 0011 00 <csselr:8>
+  0x4020 0000 0011 00 <csselr:8>
 
 ARM 32-bit VFP control registers have the following id bit patterns:
-  0x4002 0000 0012 1 <regno:12>
+  0x4020 0000 0012 1 <regno:12>
 
 ARM 64-bit FP registers have the following id bit patterns:
-  0x4002 0000 0012 0 <regno:12>
+  0x4030 0000 0012 0 <regno:12>
 
 4.69 KVM_GET_ONE_REG
 
@@ -2161,6 +2190,76 @@ header; first `n_valid' valid entries with contents from the data
 written, then `n_invalid' invalid entries, invalidating any previously
 valid entries found.
 
+4.79 KVM_CREATE_DEVICE
+
+Capability: KVM_CAP_DEVICE_CTRL
+Type: vm ioctl
+Parameters: struct kvm_create_device (in/out)
+Returns: 0 on success, -1 on error
+Errors:
+  ENODEV: The device type is unknown or unsupported
+  EEXIST: Device already created, and this type of device may not
+          be instantiated multiple times
+
+  Other error conditions may be defined by individual device types or
+  have their standard meanings.
+
+Creates an emulated device in the kernel.  The file descriptor returned
+in fd can be used with KVM_SET/GET/HAS_DEVICE_ATTR.
+
+If the KVM_CREATE_DEVICE_TEST flag is set, only test whether the
+device type is supported (not necessarily whether it can be created
+in the current vm).
+
+Individual devices should not define flags.  Attributes should be used
+for specifying any behavior that is not implied by the device type
+number.
+
+struct kvm_create_device {
+       __u32   type;   /* in: KVM_DEV_TYPE_xxx */
+       __u32   fd;     /* out: device handle */
+       __u32   flags;  /* in: KVM_CREATE_DEVICE_xxx */
+};
+
+4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
+
+Capability: KVM_CAP_DEVICE_CTRL
+Type: device ioctl
+Parameters: struct kvm_device_attr
+Returns: 0 on success, -1 on error
+Errors:
+  ENXIO:  The group or attribute is unknown/unsupported for this device
+  EPERM:  The attribute cannot (currently) be accessed this way
+          (e.g. read-only attribute, or attribute that only makes
+          sense when the device is in a different state)
+
+  Other error conditions may be defined by individual device types.
+
+Gets/sets a specified piece of device configuration and/or state.  The
+semantics are device-specific.  See individual device documentation in
+the "devices" directory.  As with ONE_REG, the size of the data
+transferred is defined by the particular attribute.
+
+struct kvm_device_attr {
+       __u32   flags;          /* no flags currently defined */
+       __u32   group;          /* device-defined */
+       __u64   attr;           /* group-defined */
+       __u64   addr;           /* userspace address of attr data */
+};
+
+4.81 KVM_HAS_DEVICE_ATTR
+
+Capability: KVM_CAP_DEVICE_CTRL
+Type: device ioctl
+Parameters: struct kvm_device_attr
+Returns: 0 on success, -1 on error
+Errors:
+  ENXIO:  The group or attribute is unknown/unsupported for this device
+
+Tests whether a device supports a particular attribute.  A successful
+return indicates the attribute is implemented.  It does not necessarily
+indicate that the attribute can be read or written in the device's
+current state.  "addr" is ignored.
 
 4.77 KVM_ARM_VCPU_INIT
 
@@ -2243,6 +2342,25 @@ and distributor interface, the ioctl must be called after calling
 KVM_CREATE_IRQCHIP, but before calling KVM_RUN on any of the VCPUs.  Calling
 this ioctl twice for any of the base addresses will return -EEXIST.
 
+4.82 KVM_PPC_RTAS_DEFINE_TOKEN
+
+Capability: KVM_CAP_PPC_RTAS
+Architectures: ppc
+Type: vm ioctl
+Parameters: struct kvm_rtas_token_args
+Returns: 0 on success, -1 on error
+
+Defines a token value for a RTAS (Run Time Abstraction Services)
+service in order to allow it to be handled in the kernel.  The
+argument struct gives the name of the service, which must be the name
+of a service that has a kernel-side implementation.  If the token
+value is non-zero, it will be associated with that service, and
+subsequent RTAS calls by the guest specifying that token will be
+handled by the kernel.  If the token value is 0, then any token
+associated with the service will be forgotten, and subsequent RTAS
+calls by the guest for that service will be passed to userspace to be
+handled.
+
 
 5. The kvm_run structure
 ------------------------
@@ -2646,3 +2764,19 @@ to receive the topmost interrupt vector.
 When disabled (args[0] == 0), behavior is as if this facility is unsupported.
 
 When this capability is enabled, KVM_EXIT_EPR can occur.
+
+6.6 KVM_CAP_IRQ_MPIC
+
+Architectures: ppc
+Parameters: args[0] is the MPIC device fd
+            args[1] is the MPIC CPU number for this vcpu
+
+This capability connects the vcpu to an in-kernel MPIC device.
+
+6.7 KVM_CAP_IRQ_XICS
+
+Architectures: ppc
+Parameters: args[0] is the XICS device fd
+            args[1] is the XICS CPU number (server ID) for this vcpu
+
+This capability connects the vcpu to an in-kernel XICS device.
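
The device control API documented in the api.txt additions above boils down to
KVM_CREATE_DEVICE on the vm fd followed by KVM_SET/GET/HAS_DEVICE_ATTR on the
returned device fd.  The following is a minimal userspace sketch using only the
structures and error conventions from the hunk above; it assumes vm_fd was
obtained via KVM_CREATE_VM, and the helper names (create_kvm_device,
set_device_attr) are illustrative, not part of the patch.

/*
 * Sketch of the KVM_CREATE_DEVICE / KVM_*_DEVICE_ATTR flow documented above.
 * Assumes vm_fd comes from KVM_CREATE_VM; error handling is abbreviated.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Return a device fd for "type", or -1 if the type is unsupported. */
static int create_kvm_device(int vm_fd, uint32_t type)
{
	struct kvm_create_device cd;

	/* First ask whether the type is known at all (no device is created). */
	memset(&cd, 0, sizeof(cd));
	cd.type = type;
	cd.flags = KVM_CREATE_DEVICE_TEST;
	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
		return -1;			/* ENODEV: unknown/unsupported type */

	/* Now create it for real; cd.fd is the out parameter. */
	memset(&cd, 0, sizeof(cd));
	cd.type = type;
	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
		return -1;			/* e.g. EEXIST for single-instance devices */

	return cd.fd;
}

/* Set one device attribute; "data" points to the attribute-defined payload. */
static int set_device_attr(int dev_fd, uint32_t group, uint64_t attr, void *data)
{
	struct kvm_device_attr da = {
		.flags = 0,			/* no flags currently defined */
		.group = group,			/* device-defined */
		.attr  = attr,			/* group-defined */
		.addr  = (uint64_t)(unsigned long)data,
	};

	if (ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &da) < 0)
		return -1;			/* ENXIO: group/attribute not supported */
	return ioctl(dev_fd, KVM_SET_DEVICE_ATTR, &da);
}

The KVM_CREATE_DEVICE_TEST probe is optional; creating the device directly and
checking for ENODEV achieves the same thing.
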
diff --git a/Documentation/virtual/kvm/devices/README b/Documentation/virtual/kvm/devices/README
new file mode 100644 (file)
index 0000000..34a6983
--- /dev/null
@@ -0,0 +1 @@
+This directory contains specific device bindings for KVM_CAP_DEVICE_CTRL.
diff --git a/Documentation/virtual/kvm/devices/mpic.txt b/Documentation/virtual/kvm/devices/mpic.txt
new file mode 100644 (file)
index 0000000..8257397
--- /dev/null
@@ -0,0 +1,53 @@
+MPIC interrupt controller
+=========================
+
+Device types supported:
+  KVM_DEV_TYPE_FSL_MPIC_20     Freescale MPIC v2.0
+  KVM_DEV_TYPE_FSL_MPIC_42     Freescale MPIC v4.2
+
+Only one MPIC instance, of any type, may be instantiated.  The created
+MPIC will act as the system interrupt controller, connecting to each
+vcpu's interrupt inputs.
+
+Groups:
+  KVM_DEV_MPIC_GRP_MISC
+  Attributes:
+    KVM_DEV_MPIC_BASE_ADDR (rw, 64-bit)
+      Base address of the 256 KiB MPIC register space.  Must be
+      naturally aligned.  A value of zero disables the mapping.
+      Reset value is zero.
+
+  KVM_DEV_MPIC_GRP_REGISTER (rw, 32-bit)
+    Access an MPIC register, as if the access were made from the guest.
+    "attr" is the byte offset into the MPIC register space.  Accesses
+    must be 4-byte aligned.
+
+    MSIs may be signaled by using this attribute group to write
+    to the relevant MSIIR.
+
+  KVM_DEV_MPIC_GRP_IRQ_ACTIVE (rw, 32-bit)
+    IRQ input line for each standard openpic source.  0 is inactive and 1
+    is active, regardless of interrupt sense.
+
+    For edge-triggered interrupts:  Writing 1 is considered an activating
+    edge, and writing 0 is ignored.  Reading returns 1 if a previously
+    signaled edge has not been acknowledged, and 0 otherwise.
+
+    "attr" is the IRQ number.  IRQ numbers for standard sources are the
+    byte offset of the relevant IVPR from EIVPR0, divided by 32.
+
+IRQ Routing:
+
+  The MPIC emulation supports IRQ routing. Only a single MPIC device can
+  be instantiated. Once that device has been created, it's available as
+  irqchip id 0.
+
+  This irqchip 0 has 256 interrupt pins, which expose the interrupts in
+  the main array of interrupt sources (a.k.a. "SRC" interrupts).
+
+  The numbering is the same as the MPIC device tree binding -- based on
+  the register offset from the beginning of the sources array, without
+  regard to any subdivisions in chip documentation such as "internal"
+  or "external" interrupts.
+
+  Access to non-SRC interrupts is not implemented through IRQ routing mechanisms.
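
For illustration, a hedged sketch of how the KVM_DEV_MPIC_GRP_MISC group above
might be driven from userspace, reusing the create_kvm_device()/set_device_attr()
helpers sketched after the api.txt changes; the base address 0xffe40000 is an
arbitrary example value, not taken from the patch.

/*
 * Illustrative only: instantiate a Freescale MPIC v2.0 and program the base
 * of its 256 KiB register space via KVM_DEV_MPIC_GRP_MISC / KVM_DEV_MPIC_BASE_ADDR.
 * Real base addresses come from the machine model.
 */
static int setup_mpic(int vm_fd)
{
	uint64_t base = 0xffe40000;	/* example value; must be naturally aligned */
	int mpic_fd;

	mpic_fd = create_kvm_device(vm_fd, KVM_DEV_TYPE_FSL_MPIC_20);
	if (mpic_fd < 0)
		return -1;

	/* A value of zero would leave the register space unmapped. */
	if (set_device_attr(mpic_fd, KVM_DEV_MPIC_GRP_MISC,
			    KVM_DEV_MPIC_BASE_ADDR, &base) < 0)
		return -1;

	return mpic_fd;
}
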
diff --git a/Documentation/virtual/kvm/devices/xics.txt b/Documentation/virtual/kvm/devices/xics.txt
new file mode 100644 (file)
index 0000000..4286493
--- /dev/null
@@ -0,0 +1,66 @@
+XICS interrupt controller
+
+Device type supported: KVM_DEV_TYPE_XICS
+
+Groups:
+  KVM_DEV_XICS_SOURCES
+  Attributes: One per interrupt source, indexed by the source number.
+
+This device emulates the XICS (eXternal Interrupt Controller
+Specification) defined in PAPR.  The XICS has a set of interrupt
+sources, each identified by a 20-bit source number, and a set of
+Interrupt Control Presentation (ICP) entities, also called "servers",
+each associated with a virtual CPU.
+
+The ICP entities are created by enabling the KVM_CAP_IRQ_ARCH
+capability for each vcpu, specifying KVM_CAP_IRQ_XICS in args[0] and
+the interrupt server number (i.e. the vcpu number from the XICS's
+point of view) in args[1] of the kvm_enable_cap struct.  Each ICP has
+64 bits of state which can be read and written using the
+KVM_GET_ONE_REG and KVM_SET_ONE_REG ioctls on the vcpu.  The 64 bit
+state word has the following bitfields, starting at the
+least-significant end of the word:
+
+* Unused, 16 bits
+
+* Pending interrupt priority, 8 bits
+  Zero is the highest priority, 255 means no interrupt is pending.
+
+* Pending IPI (inter-processor interrupt) priority, 8 bits
+  Zero is the highest priority, 255 means no IPI is pending.
+
+* Pending interrupt source number, 24 bits
+  Zero means no interrupt pending, 2 means an IPI is pending
+
+* Current processor priority, 8 bits
+  Zero is the highest priority, meaning no interrupts can be
+  delivered, and 255 is the lowest priority.
+
+Each source has 64 bits of state that can be read and written using
+the KVM_GET_DEVICE_ATTR and KVM_SET_DEVICE_ATTR ioctls, specifying the
+KVM_DEV_XICS_SOURCES attribute group, with the attribute number being
+the interrupt source number.  The 64 bit state word has the following
+bitfields, starting from the least-significant end of the word:
+
+* Destination (server number), 32 bits
+  This specifies where the interrupt should be sent, and is the
+  interrupt server number specified for the destination vcpu.
+
+* Priority, 8 bits
+  This is the priority specified for this interrupt source, where 0 is
+  the highest priority and 255 is the lowest.  An interrupt with a
+  priority of 255 will never be delivered.
+
+* Level sensitive flag, 1 bit
+  This bit is 1 for a level-sensitive interrupt source, or 0 for
+  edge-sensitive (or MSI).
+
+* Masked flag, 1 bit
+  This bit is set to 1 if the interrupt is masked (cannot be delivered
+  regardless of its priority), for example by the ibm,int-off RTAS
+  call, or 0 if it is not masked.
+
+* Pending flag, 1 bit
+  This bit is 1 if the source has a pending interrupt, otherwise 0.
+
+Only one XICS instance may be created per VM.
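
To make the per-source state layout above concrete, a small decoding sketch;
the field positions follow the text of this file (destination in the low 32
bits, then 8 bits of priority, then the level-sensitive, masked and pending
flags), and the struct/function names are illustrative.

/*
 * Decode the 64-bit XICS source state word as described above, starting
 * from the least-significant end of the word.
 */
#include <stdbool.h>
#include <stdint.h>

struct xics_source_state {
	uint32_t server;	/* destination server number */
	uint8_t  priority;	/* 0 = highest, 255 = never delivered */
	bool     level;		/* level-sensitive vs. edge/MSI */
	bool     masked;	/* e.g. masked via the ibm,int-off RTAS call */
	bool     pending;	/* source has a pending interrupt */
};

static struct xics_source_state xics_decode_source(uint64_t state)
{
	struct xics_source_state s;

	s.server   = state & 0xffffffffULL;	/* bits 0-31: destination */
	s.priority = (state >> 32) & 0xff;	/* bits 32-39: priority */
	s.level    = (state >> 40) & 1;		/* bit 40: level sensitive */
	s.masked   = (state >> 41) & 1;		/* bit 41: masked */
	s.pending  = (state >> 42) & 1;		/* bit 42: pending */
	return s;
}
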
index 1a66f907e5cca0f8df1d3cbef5ab3372bf000c4b..bf863edb517dd162066f34b5537e1f99db6482ef 100644 (file)
@@ -8,7 +8,6 @@
 #define __idmap __section(.idmap.text) noinline notrace
 
 extern pgd_t *idmap_pgd;
-extern pgd_t *hyp_pgd;
 
 void setup_mm_for_reboot(void);
 
index 0c4e643d939ea12be15ad3a63254e25ff6c4be76..57cb786a6203de0d6444d1f265344a59abc0de09 100644 (file)
@@ -87,7 +87,7 @@ struct kvm_vcpu_fault_info {
        u32 hyp_pc;             /* PC when exception was taken from Hyp mode */
 };
 
-typedef struct vfp_hard_struct kvm_kernel_vfp_t;
+typedef struct vfp_hard_struct kvm_cpu_context_t;
 
 struct kvm_vcpu_arch {
        struct kvm_regs regs;
@@ -105,8 +105,10 @@ struct kvm_vcpu_arch {
        struct kvm_vcpu_fault_info fault;
 
        /* Floating point registers (VFP and Advanced SIMD/NEON) */
-       kvm_kernel_vfp_t vfp_guest;
-       kvm_kernel_vfp_t *vfp_host;
+       struct vfp_hard_struct vfp_guest;
+
+       /* Host FP context */
+       kvm_cpu_context_t *host_cpu_context;
 
        /* VGIC state */
        struct vgic_cpu vgic_cpu;
@@ -188,23 +190,38 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
                int exception_index);
 
-static inline void __cpu_init_hyp_mode(unsigned long long pgd_ptr,
+static inline void __cpu_init_hyp_mode(unsigned long long boot_pgd_ptr,
+                                      unsigned long long pgd_ptr,
                                       unsigned long hyp_stack_ptr,
                                       unsigned long vector_ptr)
 {
-       unsigned long pgd_low, pgd_high;
-
-       pgd_low = (pgd_ptr & ((1ULL << 32) - 1));
-       pgd_high = (pgd_ptr >> 32ULL);
-
        /*
-        * Call initialization code, and switch to the full blown
-        * HYP code. The init code doesn't need to preserve these registers as
-        * r1-r3 and r12 are already callee save according to the AAPCS.
-        * Note that we slightly misuse the prototype by casing the pgd_low to
-        * a void *.
+        * Call initialization code, and switch to the full blown HYP
+        * code. The init code doesn't need to preserve these
+        * registers as r0-r3 are already callee saved according to
+        * the AAPCS.
+        * Note that we slightly misuse the prototype by casing the
+        * stack pointer to a void *.
+        *
+        * We don't have enough registers to perform the full init in
+        * one go.  Install the boot PGD first, and then install the
+        * runtime PGD, stack pointer and vectors. The PGDs are always
+        * passed as the third argument, in order to be passed into
+        * r2-r3 to the init code (yes, this is compliant with the
+        * PCS!).
         */
-       kvm_call_hyp((void *)pgd_low, pgd_high, hyp_stack_ptr, vector_ptr);
+
+       kvm_call_hyp(NULL, 0, boot_pgd_ptr);
+
+       kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr);
 }
 
+static inline int kvm_arch_dev_ioctl_check_extension(long ext)
+{
+       return 0;
+}
+
+int kvm_perf_init(void);
+int kvm_perf_teardown(void);
+
 #endif /* __ARM_KVM_HOST_H__ */
index 970f3b5fa109492ed2c46de8c9239bdfb1e5872c..472ac7091003ac0cbae073ec1794413d80a04262 100644 (file)
 #ifndef __ARM_KVM_MMU_H__
 #define __ARM_KVM_MMU_H__
 
-#include <asm/cacheflush.h>
-#include <asm/pgalloc.h>
-#include <asm/idmap.h>
+#include <asm/memory.h>
+#include <asm/page.h>
 
 /*
  * We directly use the kernel VA for the HYP, as we can directly share
  * the mapping (HTTBR "covers" TTBR1).
  */
-#define HYP_PAGE_OFFSET_MASK   (~0UL)
+#define HYP_PAGE_OFFSET_MASK   UL(~0)
 #define HYP_PAGE_OFFSET                PAGE_OFFSET
 #define KERN_TO_HYP(kva)       (kva)
 
+/*
+ * Our virtual mapping for the boot-time MMU-enable code. Must be
+ * shared across all the page-tables. Conveniently, we use the vectors
+ * page, where no kernel data will ever be shared with HYP.
+ */
+#define TRAMPOLINE_VA          UL(CONFIG_VECTORS_BASE)
+
+#ifndef __ASSEMBLY__
+
+#include <asm/cacheflush.h>
+#include <asm/pgalloc.h>
+
 int create_hyp_mappings(void *from, void *to);
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
-void free_hyp_pmds(void);
+void free_boot_hyp_pgd(void);
+void free_hyp_pgds(void);
 
 int kvm_alloc_stage2_pgd(struct kvm *kvm);
 void kvm_free_stage2_pgd(struct kvm *kvm);
@@ -45,6 +57,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
 void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
 
 phys_addr_t kvm_mmu_get_httbr(void);
+phys_addr_t kvm_mmu_get_boot_httbr(void);
+phys_addr_t kvm_get_idmap_vector(void);
 int kvm_mmu_init(void);
 void kvm_clear_hyp_idmap(void);
 
@@ -114,4 +128,8 @@ static inline void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
        }
 }
 
+#define kvm_flush_dcache_to_poc(a,l)   __cpuc_flush_dcache_area((a), (l))
+
+#endif /* !__ASSEMBLY__ */
+
 #endif /* __ARM_KVM_MMU_H__ */
index a53efa9936906128661b4abf189c4ffe3994c037..ee68cce6b48e4cfc5a65609b199b55ed6642594d 100644 (file)
@@ -158,7 +158,7 @@ int main(void)
   DEFINE(VCPU_MIDR,            offsetof(struct kvm_vcpu, arch.midr));
   DEFINE(VCPU_CP15,            offsetof(struct kvm_vcpu, arch.cp15));
   DEFINE(VCPU_VFP_GUEST,       offsetof(struct kvm_vcpu, arch.vfp_guest));
-  DEFINE(VCPU_VFP_HOST,                offsetof(struct kvm_vcpu, arch.vfp_host));
+  DEFINE(VCPU_VFP_HOST,                offsetof(struct kvm_vcpu, arch.host_cpu_context));
   DEFINE(VCPU_REGS,            offsetof(struct kvm_vcpu, arch.regs));
   DEFINE(VCPU_USR_REGS,                offsetof(struct kvm_vcpu, arch.regs.usr_regs));
   DEFINE(VCPU_SVC_REGS,                offsetof(struct kvm_vcpu, arch.regs.svc_regs));
index b571484e9f0388133429cde749c96cee3d1dfb47..a871b8e00fca7d67141859bbb2415dadabbc1ad5 100644 (file)
@@ -20,7 +20,7 @@
        VMLINUX_SYMBOL(__idmap_text_start) = .;                         \
        *(.idmap.text)                                                  \
        VMLINUX_SYMBOL(__idmap_text_end) = .;                           \
-       ALIGN_FUNCTION();                                               \
+       . = ALIGN(32);                                                  \
        VMLINUX_SYMBOL(__hyp_idmap_text_start) = .;                     \
        *(.hyp.idmap.text)                                              \
        VMLINUX_SYMBOL(__hyp_idmap_text_end) = .;
@@ -315,3 +315,8 @@ SECTIONS
  */
 ASSERT((__proc_info_end - __proc_info_begin), "missing CPU support")
 ASSERT((__arch_info_end - __arch_info_begin), "no machine record defined")
+/*
+ * The HYP init code can't be more than a page long.
+ * The above comment applies as well.
+ */
+ASSERT(((__hyp_idmap_text_end - __hyp_idmap_text_start) <= PAGE_SIZE), "HYP init code too big")
index 49dd64e579c2da533b1cd43235aef3a9d286b909..370e1a8af6ac0663974b2fdc186dd05c7f3b996a 100644 (file)
@@ -41,9 +41,9 @@ config KVM_ARM_HOST
          Provides host support for ARM processors.
 
 config KVM_ARM_MAX_VCPUS
-       int "Number maximum supported virtual CPUs per VM"
-       depends on KVM_ARM_HOST
-       default 4
+       int "Number maximum supported virtual CPUs per VM" if KVM_ARM_HOST
+       default 4 if KVM_ARM_HOST
+       default 0
        help
          Static number of max supported virtual CPUs per VM.
 
index 8dc5e76cb789dcf9cf562e31595ddd840979c49e..53c5ed83d16fc47455073d96355e3ac787ee5190 100644 (file)
@@ -18,6 +18,6 @@ kvm-arm-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
 
 obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
-obj-y += coproc.o coproc_a15.o mmio.o psci.o
+obj-y += coproc.o coproc_a15.o mmio.o psci.o perf.o
 obj-$(CONFIG_KVM_ARM_VGIC) += vgic.o
 obj-$(CONFIG_KVM_ARM_TIMER) += arch_timer.o
index 6ac938d46297d27a7217cee66ea1a245e9fead3a..c55b6089e923a1273a94c30856a2af8e9fd0dd79 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/kvm_host.h>
 #include <linux/interrupt.h>
 
+#include <clocksource/arm_arch_timer.h>
 #include <asm/arch_timer.h>
 
 #include <asm/kvm_vgic.h>
@@ -64,7 +65,7 @@ static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
 {
        struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
-       timer->cntv_ctl |= 1 << 1; /* Mask the interrupt in the guest */
+       timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK;
        kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
                            vcpu->arch.timer_cpu.irq->irq,
                            vcpu->arch.timer_cpu.irq->level);
@@ -133,8 +134,8 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
        cycle_t cval, now;
        u64 ns;
 
-       /* Check if the timer is enabled and unmasked first */
-       if ((timer->cntv_ctl & 3) != 1)
+       if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
+               !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE))
                return;
 
        cval = timer->cntv_cval;
index a0dfc2a53f91135b8c8c7deaf853d59095ab5295..37d216d814cdd62d12040666e70192b0a33fe350 100644 (file)
@@ -16,6 +16,7 @@
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  */
 
+#include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kvm_host.h>
@@ -48,7 +49,7 @@ __asm__(".arch_extension      virt");
 #endif
 
 static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
-static kvm_kernel_vfp_t __percpu *kvm_host_vfp_state;
+static kvm_cpu_context_t __percpu *kvm_host_cpu_state;
 static unsigned long hyp_default_vectors;
 
 /* Per-CPU variable containing the currently running vcpu. */
@@ -206,7 +207,7 @@ int kvm_dev_ioctl_check_extension(long ext)
                r = KVM_MAX_VCPUS;
                break;
        default:
-               r = 0;
+               r = kvm_arch_dev_ioctl_check_extension(ext);
                break;
        }
        return r;
@@ -218,27 +219,18 @@ long kvm_arch_dev_ioctl(struct file *filp,
        return -EINVAL;
 }
 
-int kvm_arch_set_memory_region(struct kvm *kvm,
-                              struct kvm_userspace_memory_region *mem,
-                              struct kvm_memory_slot old,
-                              int user_alloc)
-{
-       return 0;
-}
-
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot,
-                                  struct kvm_memory_slot old,
                                   struct kvm_userspace_memory_region *mem,
-                                  bool user_alloc)
+                                  enum kvm_mr_change change)
 {
        return 0;
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
                                   struct kvm_userspace_memory_region *mem,
-                                  struct kvm_memory_slot old,
-                                  bool user_alloc)
+                                  const struct kvm_memory_slot *old,
+                                  enum kvm_mr_change change)
 {
 }
 
@@ -326,7 +318,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
        vcpu->cpu = cpu;
-       vcpu->arch.vfp_host = this_cpu_ptr(kvm_host_vfp_state);
+       vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
 
        /*
         * Check whether this vcpu requires the cache to be flushed on
@@ -639,7 +631,8 @@ static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
        return 0;
 }
 
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level)
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
+                         bool line_status)
 {
        u32 irq = irq_level->irq;
        unsigned int irq_type, vcpu_idx, irq_num;
@@ -794,30 +787,48 @@ long kvm_arch_vm_ioctl(struct file *filp,
        }
 }
 
-static void cpu_init_hyp_mode(void *vector)
+static void cpu_init_hyp_mode(void *dummy)
 {
+       unsigned long long boot_pgd_ptr;
        unsigned long long pgd_ptr;
        unsigned long hyp_stack_ptr;
        unsigned long stack_page;
        unsigned long vector_ptr;
 
        /* Switch from the HYP stub to our own HYP init vector */
-       __hyp_set_vectors((unsigned long)vector);
+       __hyp_set_vectors(kvm_get_idmap_vector());
 
+       boot_pgd_ptr = (unsigned long long)kvm_mmu_get_boot_httbr();
        pgd_ptr = (unsigned long long)kvm_mmu_get_httbr();
        stack_page = __get_cpu_var(kvm_arm_hyp_stack_page);
        hyp_stack_ptr = stack_page + PAGE_SIZE;
        vector_ptr = (unsigned long)__kvm_hyp_vector;
 
-       __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
+       __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+}
+
+static int hyp_init_cpu_notify(struct notifier_block *self,
+                              unsigned long action, void *cpu)
+{
+       switch (action) {
+       case CPU_STARTING:
+       case CPU_STARTING_FROZEN:
+               cpu_init_hyp_mode(NULL);
+               break;
+       }
+
+       return NOTIFY_OK;
 }
 
+static struct notifier_block hyp_init_cpu_nb = {
+       .notifier_call = hyp_init_cpu_notify,
+};
+
 /**
  * Inits Hyp-mode on all online CPUs
  */
 static int init_hyp_mode(void)
 {
-       phys_addr_t init_phys_addr;
        int cpu;
        int err = 0;
 
@@ -849,24 +860,6 @@ static int init_hyp_mode(void)
                per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
        }
 
-       /*
-        * Execute the init code on each CPU.
-        *
-        * Note: The stack is not mapped yet, so don't do anything else than
-        * initializing the hypervisor mode on each CPU using a local stack
-        * space for temporary storage.
-        */
-       init_phys_addr = virt_to_phys(__kvm_hyp_init);
-       for_each_online_cpu(cpu) {
-               smp_call_function_single(cpu, cpu_init_hyp_mode,
-                                        (void *)(long)init_phys_addr, 1);
-       }
-
-       /*
-        * Unmap the identity mapping
-        */
-       kvm_clear_hyp_idmap();
-
        /*
         * Map the Hyp-code called directly from the host
         */
@@ -890,33 +883,38 @@ static int init_hyp_mode(void)
        }
 
        /*
-        * Map the host VFP structures
+        * Map the host CPU structures
         */
-       kvm_host_vfp_state = alloc_percpu(kvm_kernel_vfp_t);
-       if (!kvm_host_vfp_state) {
+       kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
+       if (!kvm_host_cpu_state) {
                err = -ENOMEM;
-               kvm_err("Cannot allocate host VFP state\n");
+               kvm_err("Cannot allocate host CPU state\n");
                goto out_free_mappings;
        }
 
        for_each_possible_cpu(cpu) {
-               kvm_kernel_vfp_t *vfp;
+               kvm_cpu_context_t *cpu_ctxt;
 
-               vfp = per_cpu_ptr(kvm_host_vfp_state, cpu);
-               err = create_hyp_mappings(vfp, vfp + 1);
+               cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu);
+               err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1);
 
                if (err) {
-                       kvm_err("Cannot map host VFP state: %d\n", err);
-                       goto out_free_vfp;
+                       kvm_err("Cannot map host CPU state: %d\n", err);
+                       goto out_free_context;
                }
        }
 
+       /*
+        * Execute the init code on each CPU.
+        */
+       on_each_cpu(cpu_init_hyp_mode, NULL, 1);
+
        /*
         * Init HYP view of VGIC
         */
        err = kvm_vgic_hyp_init();
        if (err)
-               goto out_free_vfp;
+               goto out_free_context;
 
 #ifdef CONFIG_KVM_ARM_VGIC
                vgic_present = true;
@@ -929,12 +927,19 @@ static int init_hyp_mode(void)
        if (err)
                goto out_free_mappings;
 
+#ifndef CONFIG_HOTPLUG_CPU
+       free_boot_hyp_pgd();
+#endif
+
+       kvm_perf_init();
+
        kvm_info("Hyp mode initialized successfully\n");
+
        return 0;
-out_free_vfp:
-       free_percpu(kvm_host_vfp_state);
+out_free_context:
+       free_percpu(kvm_host_cpu_state);
 out_free_mappings:
-       free_hyp_pmds();
+       free_hyp_pgds();
 out_free_stack_pages:
        for_each_possible_cpu(cpu)
                free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
@@ -943,27 +948,42 @@ out_err:
        return err;
 }
 
+static void check_kvm_target_cpu(void *ret)
+{
+       *(int *)ret = kvm_target_cpu();
+}
+
 /**
  * Initialize Hyp-mode and memory mappings on all CPUs.
  */
 int kvm_arch_init(void *opaque)
 {
        int err;
+       int ret, cpu;
 
        if (!is_hyp_mode_available()) {
                kvm_err("HYP mode not available\n");
                return -ENODEV;
        }
 
-       if (kvm_target_cpu() < 0) {
-               kvm_err("Target CPU not supported!\n");
-               return -ENODEV;
+       for_each_online_cpu(cpu) {
+               smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1);
+               if (ret < 0) {
+                       kvm_err("Error, CPU %d not supported!\n", cpu);
+                       return -ENODEV;
+               }
        }
 
        err = init_hyp_mode();
        if (err)
                goto out_err;
 
+       err = register_cpu_notifier(&hyp_init_cpu_nb);
+       if (err) {
+               kvm_err("Cannot register HYP init CPU notifier (%d)\n", err);
+               goto out_err;
+       }
+
        kvm_coproc_table_init();
        return 0;
 out_err:
@@ -973,6 +993,7 @@ out_err:
 /* NOP: Compiling as a module not supported */
 void kvm_arch_exit(void)
 {
+       kvm_perf_teardown();
 }
 
 static int arm_init(void)
index 9f37a79b880b85705c3b524f08d80a7e866a3479..f048338135f7a5b20bf52371d03a3af0b922b68f 100644 (file)
 #include <asm/asm-offsets.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
 
 /********************************************************************
  * Hypervisor initialization
  *   - should be called with:
- *       r0,r1 = Hypervisor pgd pointer
- *       r2 = top of Hyp stack (kernel VA)
- *       r3 = pointer to hyp vectors
+ *       r0 = top of Hyp stack (kernel VA)
+ *       r1 = pointer to hyp vectors
+ *       r2,r3 = Hypervisor pgd pointer
+ *
+ * The init scenario is:
+ * - We jump in HYP with four parameters: boot HYP pgd, runtime HYP pgd,
+ *   runtime stack, runtime vectors
+ * - Enable the MMU with the boot pgd
+ * - Jump to a target into the trampoline page (remember, this is the same
+ *   physical page!)
+ * - Now switch to the runtime pgd (same VA, and still the same physical
+ *   page!)
+ * - Invalidate TLBs
+ * - Set stack and vectors
+ * - Profit! (or eret, if you only care about the code).
+ *
+ * As we only have four registers available to pass parameters (and we
+ * need six), we split the init in two phases:
+ * - Phase 1: r0 = 0, r1 = 0, r2,r3 contain the boot PGD.
+ *   Provides the basic HYP init, and enable the MMU.
+ * - Phase 2: r0 = ToS, r1 = vectors, r2,r3 contain the runtime PGD.
+ *   Switches to the runtime PGD, set stack and vectors.
  */
 
        .text
@@ -47,22 +67,25 @@ __kvm_hyp_init:
        W(b)    .
 
 __do_hyp_init:
+       cmp     r0, #0                  @ We have a SP?
+       bne     phase2                  @ Yes, second stage init
+
        @ Set the HTTBR to point to the hypervisor PGD pointer passed
-       mcrr    p15, 4, r0, r1, c2
+       mcrr    p15, 4, r2, r3, c2
 
        @ Set the HTCR and VTCR to the same shareability and cacheability
        @ settings as the non-secure TTBCR and with T0SZ == 0.
        mrc     p15, 4, r0, c2, c0, 2   @ HTCR
-       ldr     r12, =HTCR_MASK
-       bic     r0, r0, r12
+       ldr     r2, =HTCR_MASK
+       bic     r0, r0, r2
        mrc     p15, 0, r1, c2, c0, 2   @ TTBCR
        and     r1, r1, #(HTCR_MASK & ~TTBCR_T0SZ)
        orr     r0, r0, r1
        mcr     p15, 4, r0, c2, c0, 2   @ HTCR
 
        mrc     p15, 4, r1, c2, c1, 2   @ VTCR
-       ldr     r12, =VTCR_MASK
-       bic     r1, r1, r12
+       ldr     r2, =VTCR_MASK
+       bic     r1, r1, r2
        bic     r0, r0, #(~VTCR_HTCR_SH)        @ clear non-reusable HTCR bits
        orr     r1, r0, r1
        orr     r1, r1, #(KVM_VTCR_SL0 | KVM_VTCR_T0SZ | KVM_VTCR_S)
@@ -85,24 +108,41 @@ __do_hyp_init:
        @  - Memory alignment checks: enabled
        @  - MMU: enabled (this code must be run from an identity mapping)
        mrc     p15, 4, r0, c1, c0, 0   @ HSCR
-       ldr     r12, =HSCTLR_MASK
-       bic     r0, r0, r12
+       ldr     r2, =HSCTLR_MASK
+       bic     r0, r0, r2
        mrc     p15, 0, r1, c1, c0, 0   @ SCTLR
-       ldr     r12, =(HSCTLR_EE | HSCTLR_FI | HSCTLR_I | HSCTLR_C)
-       and     r1, r1, r12
- ARM(  ldr     r12, =(HSCTLR_M | HSCTLR_A)                     )
- THUMB(        ldr     r12, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE)         )
-       orr     r1, r1, r12
+       ldr     r2, =(HSCTLR_EE | HSCTLR_FI | HSCTLR_I | HSCTLR_C)
+       and     r1, r1, r2
+ ARM(  ldr     r2, =(HSCTLR_M | HSCTLR_A)                      )
+ THUMB(        ldr     r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE)          )
+       orr     r1, r1, r2
        orr     r0, r0, r1
        isb
        mcr     p15, 4, r0, c1, c0, 0   @ HSCR
-       isb
 
-       @ Set stack pointer and return to the kernel
-       mov     sp, r2
+       @ End of init phase-1
+       eret
+
+phase2:
+       @ Set stack pointer
+       mov     sp, r0
 
        @ Set HVBAR to point to the HYP vectors
-       mcr     p15, 4, r3, c12, c0, 0  @ HVBAR
+       mcr     p15, 4, r1, c12, c0, 0  @ HVBAR
+
+       @ Jump to the trampoline page
+       ldr     r0, =TRAMPOLINE_VA
+       adr     r1, target
+       bfi     r0, r1, #0, #PAGE_SHIFT
+       mov     pc, r0
+
+target:        @ We're now in the trampoline code, switch page tables
+       mcrr    p15, 4, r2, r3, c2
+       isb
+
+       @ Invalidate the old TLBs
+       mcr     p15, 4, r0, c8, c7, 0   @ TLBIALLH
+       dsb
 
        eret
 
index 2f12e4056408e93c43976bd24d6a4109ac27753b..965706578f13bdb0ff062f9df2502608de7a3ae9 100644 (file)
 
 extern char  __hyp_idmap_text_start[], __hyp_idmap_text_end[];
 
+static pgd_t *boot_hyp_pgd;
+static pgd_t *hyp_pgd;
 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
 
+static void *init_bounce_page;
+static unsigned long hyp_idmap_start;
+static unsigned long hyp_idmap_end;
+static phys_addr_t hyp_idmap_vector;
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
        kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
@@ -71,172 +78,224 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
        return p;
 }
 
-static void free_ptes(pmd_t *pmd, unsigned long addr)
+static void clear_pud_entry(pud_t *pud)
 {
-       pte_t *pte;
-       unsigned int i;
+       pmd_t *pmd_table = pmd_offset(pud, 0);
+       pud_clear(pud);
+       pmd_free(NULL, pmd_table);
+       put_page(virt_to_page(pud));
+}
 
-       for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) {
-               if (!pmd_none(*pmd) && pmd_table(*pmd)) {
-                       pte = pte_offset_kernel(pmd, addr);
-                       pte_free_kernel(NULL, pte);
-               }
-               pmd++;
+static void clear_pmd_entry(pmd_t *pmd)
+{
+       pte_t *pte_table = pte_offset_kernel(pmd, 0);
+       pmd_clear(pmd);
+       pte_free_kernel(NULL, pte_table);
+       put_page(virt_to_page(pmd));
+}
+
+static bool pmd_empty(pmd_t *pmd)
+{
+       struct page *pmd_page = virt_to_page(pmd);
+       return page_count(pmd_page) == 1;
+}
+
+static void clear_pte_entry(pte_t *pte)
+{
+       if (pte_present(*pte)) {
+               kvm_set_pte(pte, __pte(0));
+               put_page(virt_to_page(pte));
        }
 }
 
-static void free_hyp_pgd_entry(unsigned long addr)
+static bool pte_empty(pte_t *pte)
+{
+       struct page *pte_page = virt_to_page(pte);
+       return page_count(pte_page) == 1;
+}
+
+static void unmap_range(pgd_t *pgdp, unsigned long long start, u64 size)
 {
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
-       unsigned long hyp_addr = KERN_TO_HYP(addr);
+       pte_t *pte;
+       unsigned long long addr = start, end = start + size;
+       u64 range;
+
+       while (addr < end) {
+               pgd = pgdp + pgd_index(addr);
+               pud = pud_offset(pgd, addr);
+               if (pud_none(*pud)) {
+                       addr += PUD_SIZE;
+                       continue;
+               }
 
-       pgd = hyp_pgd + pgd_index(hyp_addr);
-       pud = pud_offset(pgd, hyp_addr);
+               pmd = pmd_offset(pud, addr);
+               if (pmd_none(*pmd)) {
+                       addr += PMD_SIZE;
+                       continue;
+               }
 
-       if (pud_none(*pud))
-               return;
-       BUG_ON(pud_bad(*pud));
+               pte = pte_offset_kernel(pmd, addr);
+               clear_pte_entry(pte);
+               range = PAGE_SIZE;
 
-       pmd = pmd_offset(pud, hyp_addr);
-       free_ptes(pmd, addr);
-       pmd_free(NULL, pmd);
-       pud_clear(pud);
+               /* If we emptied the pte, walk back up the ladder */
+               if (pte_empty(pte)) {
+                       clear_pmd_entry(pmd);
+                       range = PMD_SIZE;
+                       if (pmd_empty(pmd)) {
+                               clear_pud_entry(pud);
+                               range = PUD_SIZE;
+                       }
+               }
+
+               addr += range;
+       }
 }
 
 /**
- * free_hyp_pmds - free a Hyp-mode level-2 tables and child level-3 tables
+ * free_boot_hyp_pgd - free HYP boot page tables
  *
- * Assumes this is a page table used strictly in Hyp-mode and therefore contains
- * either mappings in the kernel memory area (above PAGE_OFFSET), or
- * device mappings in the vmalloc range (from VMALLOC_START to VMALLOC_END).
+ * Free the HYP boot page tables. The bounce page is also freed.
  */
-void free_hyp_pmds(void)
+void free_boot_hyp_pgd(void)
 {
-       unsigned long addr;
-
        mutex_lock(&kvm_hyp_pgd_mutex);
-       for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
-               free_hyp_pgd_entry(addr);
-       for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
-               free_hyp_pgd_entry(addr);
+
+       if (boot_hyp_pgd) {
+               unmap_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
+               unmap_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+               kfree(boot_hyp_pgd);
+               boot_hyp_pgd = NULL;
+       }
+
+       if (hyp_pgd)
+               unmap_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+
+       kfree(init_bounce_page);
+       init_bounce_page = NULL;
+
        mutex_unlock(&kvm_hyp_pgd_mutex);
 }
 
-static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
-                                   unsigned long end)
+/**
+ * free_hyp_pgds - free Hyp-mode page tables
+ *
+ * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
+ * therefore contains either mappings in the kernel memory area (above
+ * PAGE_OFFSET), or device mappings in the vmalloc range (from
+ * VMALLOC_START to VMALLOC_END).
+ *
+ * boot_hyp_pgd should only map two pages for the init code.
+ */
+void free_hyp_pgds(void)
 {
-       pte_t *pte;
        unsigned long addr;
-       struct page *page;
 
-       for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
-               unsigned long hyp_addr = KERN_TO_HYP(addr);
+       free_boot_hyp_pgd();
+
+       mutex_lock(&kvm_hyp_pgd_mutex);
 
-               pte = pte_offset_kernel(pmd, hyp_addr);
-               BUG_ON(!virt_addr_valid(addr));
-               page = virt_to_page(addr);
-               kvm_set_pte(pte, mk_pte(page, PAGE_HYP));
+       if (hyp_pgd) {
+               for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
+                       unmap_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+               for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
+                       unmap_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+               kfree(hyp_pgd);
+               hyp_pgd = NULL;
        }
+
+       mutex_unlock(&kvm_hyp_pgd_mutex);
 }
 
-static void create_hyp_io_pte_mappings(pmd_t *pmd, unsigned long start,
-                                      unsigned long end,
-                                      unsigned long *pfn_base)
+static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
+                                   unsigned long end, unsigned long pfn,
+                                   pgprot_t prot)
 {
        pte_t *pte;
        unsigned long addr;
 
-       for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
-               unsigned long hyp_addr = KERN_TO_HYP(addr);
-
-               pte = pte_offset_kernel(pmd, hyp_addr);
-               BUG_ON(pfn_valid(*pfn_base));
-               kvm_set_pte(pte, pfn_pte(*pfn_base, PAGE_HYP_DEVICE));
-               (*pfn_base)++;
-       }
+       addr = start;
+       do {
+               pte = pte_offset_kernel(pmd, addr);
+               kvm_set_pte(pte, pfn_pte(pfn, prot));
+               get_page(virt_to_page(pte));
+               kvm_flush_dcache_to_poc(pte, sizeof(*pte));
+               pfn++;
+       } while (addr += PAGE_SIZE, addr != end);
 }
 
 static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
-                                  unsigned long end, unsigned long *pfn_base)
+                                  unsigned long end, unsigned long pfn,
+                                  pgprot_t prot)
 {
        pmd_t *pmd;
        pte_t *pte;
        unsigned long addr, next;
 
-       for (addr = start; addr < end; addr = next) {
-               unsigned long hyp_addr = KERN_TO_HYP(addr);
-               pmd = pmd_offset(pud, hyp_addr);
+       addr = start;
+       do {
+               pmd = pmd_offset(pud, addr);
 
                BUG_ON(pmd_sect(*pmd));
 
                if (pmd_none(*pmd)) {
-                       pte = pte_alloc_one_kernel(NULL, hyp_addr);
+                       pte = pte_alloc_one_kernel(NULL, addr);
                        if (!pte) {
                                kvm_err("Cannot allocate Hyp pte\n");
                                return -ENOMEM;
                        }
                        pmd_populate_kernel(NULL, pmd, pte);
+                       get_page(virt_to_page(pmd));
+                       kvm_flush_dcache_to_poc(pmd, sizeof(*pmd));
                }
 
                next = pmd_addr_end(addr, end);
 
-               /*
-                * If pfn_base is NULL, we map kernel pages into HYP with the
-                * virtual address. Otherwise, this is considered an I/O
-                * mapping and we map the physical region starting at
-                * *pfn_base to [start, end[.
-                */
-               if (!pfn_base)
-                       create_hyp_pte_mappings(pmd, addr, next);
-               else
-                       create_hyp_io_pte_mappings(pmd, addr, next, pfn_base);
-       }
+               create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
+               pfn += (next - addr) >> PAGE_SHIFT;
+       } while (addr = next, addr != end);
 
        return 0;
 }
 
-static int __create_hyp_mappings(void *from, void *to, unsigned long *pfn_base)
+static int __create_hyp_mappings(pgd_t *pgdp,
+                                unsigned long start, unsigned long end,
+                                unsigned long pfn, pgprot_t prot)
 {
-       unsigned long start = (unsigned long)from;
-       unsigned long end = (unsigned long)to;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        unsigned long addr, next;
        int err = 0;
 
-       if (start >= end)
-               return -EINVAL;
-       /* Check for a valid kernel memory mapping */
-       if (!pfn_base && (!virt_addr_valid(from) || !virt_addr_valid(to - 1)))
-               return -EINVAL;
-       /* Check for a valid kernel IO mapping */
-       if (pfn_base && (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1)))
-               return -EINVAL;
-
        mutex_lock(&kvm_hyp_pgd_mutex);
-       for (addr = start; addr < end; addr = next) {
-               unsigned long hyp_addr = KERN_TO_HYP(addr);
-               pgd = hyp_pgd + pgd_index(hyp_addr);
-               pud = pud_offset(pgd, hyp_addr);
+       addr = start & PAGE_MASK;
+       end = PAGE_ALIGN(end);
+       do {
+               pgd = pgdp + pgd_index(addr);
+               pud = pud_offset(pgd, addr);
 
                if (pud_none_or_clear_bad(pud)) {
-                       pmd = pmd_alloc_one(NULL, hyp_addr);
+                       pmd = pmd_alloc_one(NULL, addr);
                        if (!pmd) {
                                kvm_err("Cannot allocate Hyp pmd\n");
                                err = -ENOMEM;
                                goto out;
                        }
                        pud_populate(NULL, pud, pmd);
+                       get_page(virt_to_page(pud));
+                       kvm_flush_dcache_to_poc(pud, sizeof(*pud));
                }
 
                next = pgd_addr_end(addr, end);
-               err = create_hyp_pmd_mappings(pud, addr, next, pfn_base);
+               err = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
                if (err)
                        goto out;
-       }
+               pfn += (next - addr) >> PAGE_SHIFT;
+       } while (addr = next, addr != end);
 out:
        mutex_unlock(&kvm_hyp_pgd_mutex);
        return err;
@@ -250,27 +309,41 @@ out:
  * The same virtual address as the kernel virtual address is also used
  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
  * physical pages.
- *
- * Note: Wrapping around zero in the "to" address is not supported.
  */
 int create_hyp_mappings(void *from, void *to)
 {
-       return __create_hyp_mappings(from, to, NULL);
+       unsigned long phys_addr = virt_to_phys(from);
+       unsigned long start = KERN_TO_HYP((unsigned long)from);
+       unsigned long end = KERN_TO_HYP((unsigned long)to);
+
+       /* Check for a valid kernel memory mapping */
+       if (!virt_addr_valid(from) || !virt_addr_valid(to - 1))
+               return -EINVAL;
+
+       return __create_hyp_mappings(hyp_pgd, start, end,
+                                    __phys_to_pfn(phys_addr), PAGE_HYP);
 }
 
 /**
  * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode
  * @from:      The kernel start VA of the range
  * @to:                The kernel end VA of the range (exclusive)
- * @addr:      The physical start address which gets mapped
+ * @phys_addr: The physical start address which gets mapped
  *
  * The resulting HYP VA is the same as the kernel VA, modulo
  * HYP_PAGE_OFFSET.
  */
-int create_hyp_io_mappings(void *from, void *to, phys_addr_t addr)
+int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
 {
-       unsigned long pfn = __phys_to_pfn(addr);
-       return __create_hyp_mappings(from, to, &pfn);
+       unsigned long start = KERN_TO_HYP((unsigned long)from);
+       unsigned long end = KERN_TO_HYP((unsigned long)to);
+
+       /* Check for a valid kernel IO mapping */
+       if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
+               return -EINVAL;
+
+       return __create_hyp_mappings(hyp_pgd, start, end,
+                                    __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
 }
 
 /**
@@ -307,42 +380,6 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
        return 0;
 }
 
-static void clear_pud_entry(pud_t *pud)
-{
-       pmd_t *pmd_table = pmd_offset(pud, 0);
-       pud_clear(pud);
-       pmd_free(NULL, pmd_table);
-       put_page(virt_to_page(pud));
-}
-
-static void clear_pmd_entry(pmd_t *pmd)
-{
-       pte_t *pte_table = pte_offset_kernel(pmd, 0);
-       pmd_clear(pmd);
-       pte_free_kernel(NULL, pte_table);
-       put_page(virt_to_page(pmd));
-}
-
-static bool pmd_empty(pmd_t *pmd)
-{
-       struct page *pmd_page = virt_to_page(pmd);
-       return page_count(pmd_page) == 1;
-}
-
-static void clear_pte_entry(pte_t *pte)
-{
-       if (pte_present(*pte)) {
-               kvm_set_pte(pte, __pte(0));
-               put_page(virt_to_page(pte));
-       }
-}
-
-static bool pte_empty(pte_t *pte)
-{
-       struct page *pte_page = virt_to_page(pte);
-       return page_count(pte_page) == 1;
-}
-
 /**
  * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
  * @kvm:   The VM pointer
@@ -356,43 +393,7 @@ static bool pte_empty(pte_t *pte)
  */
 static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
 {
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd;
-       pte_t *pte;
-       phys_addr_t addr = start, end = start + size;
-       u64 range;
-
-       while (addr < end) {
-               pgd = kvm->arch.pgd + pgd_index(addr);
-               pud = pud_offset(pgd, addr);
-               if (pud_none(*pud)) {
-                       addr += PUD_SIZE;
-                       continue;
-               }
-
-               pmd = pmd_offset(pud, addr);
-               if (pmd_none(*pmd)) {
-                       addr += PMD_SIZE;
-                       continue;
-               }
-
-               pte = pte_offset_kernel(pmd, addr);
-               clear_pte_entry(pte);
-               range = PAGE_SIZE;
-
-               /* If we emptied the pte, walk back up the ladder */
-               if (pte_empty(pte)) {
-                       clear_pmd_entry(pmd);
-                       range = PMD_SIZE;
-                       if (pmd_empty(pmd)) {
-                               clear_pud_entry(pud);
-                               range = PUD_SIZE;
-                       }
-               }
-
-               addr += range;
-       }
+       unmap_range(kvm->arch.pgd, start, size);
 }
 
 /**
@@ -728,47 +729,105 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 
 phys_addr_t kvm_mmu_get_httbr(void)
 {
-       VM_BUG_ON(!virt_addr_valid(hyp_pgd));
        return virt_to_phys(hyp_pgd);
 }
 
+phys_addr_t kvm_mmu_get_boot_httbr(void)
+{
+       return virt_to_phys(boot_hyp_pgd);
+}
+
+phys_addr_t kvm_get_idmap_vector(void)
+{
+       return hyp_idmap_vector;
+}
+
 int kvm_mmu_init(void)
 {
-       if (!hyp_pgd) {
+       int err;
+
+       hyp_idmap_start = virt_to_phys(__hyp_idmap_text_start);
+       hyp_idmap_end = virt_to_phys(__hyp_idmap_text_end);
+       hyp_idmap_vector = virt_to_phys(__kvm_hyp_init);
+
+       if ((hyp_idmap_start ^ hyp_idmap_end) & PAGE_MASK) {
+               /*
+                * Our init code is crossing a page boundary. Allocate
+                * a bounce page, copy the code over and use that.
+                */
+               size_t len = __hyp_idmap_text_end - __hyp_idmap_text_start;
+               phys_addr_t phys_base;
+
+               init_bounce_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+               if (!init_bounce_page) {
+                       kvm_err("Couldn't allocate HYP init bounce page\n");
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               memcpy(init_bounce_page, __hyp_idmap_text_start, len);
+               /*
+                * Warning: the code we just copied to the bounce page
+                * must be flushed to the point of coherency.
+                * Otherwise, the data may be sitting in L2, and HYP
+                * mode won't be able to observe it as it runs with
+                * caches off at that point.
+                */
+               kvm_flush_dcache_to_poc(init_bounce_page, len);
+
+               phys_base = virt_to_phys(init_bounce_page);
+               hyp_idmap_vector += phys_base - hyp_idmap_start;
+               hyp_idmap_start = phys_base;
+               hyp_idmap_end = phys_base + len;
+
+               kvm_info("Using HYP init bounce page @%lx\n",
+                        (unsigned long)phys_base);
+       }
+
+       hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL);
+       boot_hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL);
+       if (!hyp_pgd || !boot_hyp_pgd) {
                kvm_err("Hyp mode PGD not allocated\n");
-               return -ENOMEM;
+               err = -ENOMEM;
+               goto out;
        }
 
-       return 0;
-}
+       /* Create the idmap in the boot page tables */
+       err =   __create_hyp_mappings(boot_hyp_pgd,
+                                     hyp_idmap_start, hyp_idmap_end,
+                                     __phys_to_pfn(hyp_idmap_start),
+                                     PAGE_HYP);
 
-/**
- * kvm_clear_idmap - remove all idmaps from the hyp pgd
- *
- * Free the underlying pmds for all pgds in range and clear the pgds (but
- * don't free them) afterwards.
- */
-void kvm_clear_hyp_idmap(void)
-{
-       unsigned long addr, end;
-       unsigned long next;
-       pgd_t *pgd = hyp_pgd;
-       pud_t *pud;
-       pmd_t *pmd;
+       if (err) {
+               kvm_err("Failed to idmap %lx-%lx\n",
+                       hyp_idmap_start, hyp_idmap_end);
+               goto out;
+       }
 
-       addr = virt_to_phys(__hyp_idmap_text_start);
-       end = virt_to_phys(__hyp_idmap_text_end);
+       /* Map the very same page at the trampoline VA */
+       err =   __create_hyp_mappings(boot_hyp_pgd,
+                                     TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
+                                     __phys_to_pfn(hyp_idmap_start),
+                                     PAGE_HYP);
+       if (err) {
+               kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
+                       TRAMPOLINE_VA);
+               goto out;
+       }
 
-       pgd += pgd_index(addr);
-       do {
-               next = pgd_addr_end(addr, end);
-               if (pgd_none_or_clear_bad(pgd))
-                       continue;
-               pud = pud_offset(pgd, addr);
-               pmd = pmd_offset(pud, addr);
+       /* Map the same page again into the runtime page tables */
+       err =   __create_hyp_mappings(hyp_pgd,
+                                     TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
+                                     __phys_to_pfn(hyp_idmap_start),
+                                     PAGE_HYP);
+       if (err) {
+               kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
+                       TRAMPOLINE_VA);
+               goto out;
+       }
 
-               pud_clear(pud);
-               kvm_clean_pmd_entry(pmd);
-               pmd_free(NULL, (pmd_t *)((unsigned long)pmd & PAGE_MASK));
-       } while (pgd++, addr = next, addr < end);
+       return 0;
+out:
+       free_hyp_pgds();
+       return err;
 }
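
The bounce-page decision in kvm_mmu_init() above relies on the fact that two addresses lie in the same page exactly when their bits above the page offset agree, which is what the (hyp_idmap_start ^ hyp_idmap_end) & PAGE_MASK test checks. A small self-contained illustration, assuming 4 KiB pages (so PAGE_MASK is ~0xfffUL); this snippet is not part of the patch.

#include <stdio.h>
#include <stdint.h>

#define EXAMPLE_PAGE_MASK (~0xfffUL)	/* 4 KiB pages assumed */

static int same_page(uint64_t a, uint64_t b)
{
	/* Mirrors the (start ^ end) & PAGE_MASK test used by kvm_mmu_init(). */
	return ((a ^ b) & EXAMPLE_PAGE_MASK) == 0;
}

int main(void)
{
	printf("%d\n", same_page(0x80000f80, 0x80000fe0));	/* 1: same page */
	printf("%d\n", same_page(0x80000f80, 0x80001020));	/* 0: crosses a boundary */
	return 0;
}

The test is conservative: an exclusive end address that lands exactly on the next page boundary also trips it, even though the code still fits in one page.
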
diff --git a/arch/arm/kvm/perf.c b/arch/arm/kvm/perf.c
new file mode 100644 (file)
index 0000000..1a3849d
--- /dev/null
+++ b/arch/arm/kvm/perf.c
@@ -0,0 +1,68 @@
+/*
+ * Based on the x86 implementation.
+ *
+ * Copyright (C) 2012 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+
+static int kvm_is_in_guest(void)
+{
+        return kvm_arm_get_running_vcpu() != NULL;
+}
+
+static int kvm_is_user_mode(void)
+{
+       struct kvm_vcpu *vcpu;
+
+       vcpu = kvm_arm_get_running_vcpu();
+
+       if (vcpu)
+               return !vcpu_mode_priv(vcpu);
+
+       return 0;
+}
+
+static unsigned long kvm_get_guest_ip(void)
+{
+       struct kvm_vcpu *vcpu;
+
+       vcpu = kvm_arm_get_running_vcpu();
+
+       if (vcpu)
+               return *vcpu_pc(vcpu);
+
+       return 0;
+}
+
+static struct perf_guest_info_callbacks kvm_guest_cbs = {
+       .is_in_guest    = kvm_is_in_guest,
+       .is_user_mode   = kvm_is_user_mode,
+       .get_guest_ip   = kvm_get_guest_ip,
+};
+
+int kvm_perf_init(void)
+{
+       return perf_register_guest_info_callbacks(&kvm_guest_cbs);
+}
+
+int kvm_perf_teardown(void)
+{
+       return perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+}
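
The three callbacks registered above are consumed on the perf side when a PMU sample is taken while a guest is running: the sample's instruction pointer is replaced by the guest PC and its user/kernel attribution by the guest mode. Below is a simplified sketch of that consumer, modeled on the x86 arrangement the file header refers to; perf_guest_cbs is the perf core's pointer to the registered callbacks, and this fragment is illustrative rather than code from this commit.

/* Illustrative consumer of the callbacks registered by kvm_perf_init(). */
unsigned long example_sample_ip(struct pt_regs *regs)
{
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
		return perf_guest_cbs->get_guest_ip();	/* guest PC */

	return instruction_pointer(regs);		/* host PC */
}
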
index 5ee505c937d171902839369f3cd98c13b93b11ca..83cb3ac27095146f3f60c04047c6b212856a73b2 100644 (file)
@@ -8,7 +8,6 @@
 #include <asm/pgtable.h>
 #include <asm/sections.h>
 #include <asm/system_info.h>
-#include <asm/virt.h>
 
 pgd_t *idmap_pgd;
 
@@ -83,37 +82,10 @@ static void identity_mapping_add(pgd_t *pgd, const char *text_start,
        } while (pgd++, addr = next, addr != end);
 }
 
-#if defined(CONFIG_ARM_VIRT_EXT) && defined(CONFIG_ARM_LPAE)
-pgd_t *hyp_pgd;
-
-extern char  __hyp_idmap_text_start[], __hyp_idmap_text_end[];
-
-static int __init init_static_idmap_hyp(void)
-{
-       hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL);
-       if (!hyp_pgd)
-               return -ENOMEM;
-
-       pr_info("Setting up static HYP identity map for 0x%p - 0x%p\n",
-               __hyp_idmap_text_start, __hyp_idmap_text_end);
-       identity_mapping_add(hyp_pgd, __hyp_idmap_text_start,
-                            __hyp_idmap_text_end, PMD_SECT_AP1);
-
-       return 0;
-}
-#else
-static int __init init_static_idmap_hyp(void)
-{
-       return 0;
-}
-#endif
-
 extern char  __idmap_text_start[], __idmap_text_end[];
 
 static int __init init_static_idmap(void)
 {
-       int ret;
-
        idmap_pgd = pgd_alloc(&init_mm);
        if (!idmap_pgd)
                return -ENOMEM;
@@ -123,12 +95,10 @@ static int __init init_static_idmap(void)
        identity_mapping_add(idmap_pgd, __idmap_text_start,
                             __idmap_text_end, 0);
 
-       ret = init_static_idmap_hyp();
-
        /* Flush L1 for the hardware to see this page table content */
        flush_cache_louis();
 
-       return ret;
+       return 0;
 }
 early_initcall(init_static_idmap);
 
index cfa74983c6751a78fdd998fcc1aada6e2fd67f6a..989dd3fe8de19d9fc40de248f5788f359eb3ebc6 100644 (file)
@@ -26,6 +26,7 @@
 #define KVM_USER_MEM_SLOTS 32
 
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+#define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS
 
 /* define exit reasons from vmm to kvm*/
 #define EXIT_REASON_VM_PANIC           0
index ec6c6b3012389402e9d530b305136e5c3855ad42..99503c2844000fe5580c3aa9530928f3d97e6f28 100644 (file)
@@ -27,7 +27,6 @@
 /* Select x86 specific features in <linux/kvm.h> */
 #define __KVM_HAVE_IOAPIC
 #define __KVM_HAVE_IRQ_LINE
-#define __KVM_HAVE_DEVICE_ASSIGNMENT
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
index 2cd225f8c68d288f50eabda6a3528fc3f9e3bdc5..990b86420cc64638b542b95dd6a9059ba8cd37f9 100644 (file)
@@ -21,12 +21,11 @@ config KVM
        tristate "Kernel-based Virtual Machine (KVM) support"
        depends on BROKEN
        depends on HAVE_KVM && MODULES
-       # for device assignment:
-       depends on PCI
        depends on BROKEN
        select PREEMPT_NOTIFIERS
        select ANON_INODES
        select HAVE_KVM_IRQCHIP
+       select HAVE_KVM_IRQ_ROUTING
        select KVM_APIC_ARCHITECTURE
        select KVM_MMIO
        ---help---
@@ -50,6 +49,17 @@ config KVM_INTEL
          Provides support for KVM on Itanium 2 processors equipped with the VT
          extensions.
 
+config KVM_DEVICE_ASSIGNMENT
+       bool "KVM legacy PCI device assignment support"
+       depends on KVM && PCI && IOMMU_API
+       default y
+       ---help---
+         Provide support for legacy PCI device assignment through KVM.  The
+         kernel now also supports a full featured userspace device driver
+         framework through VFIO, which supersedes much of this support.
+
+         If unsure, say Y.
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
index db3d7c5d10711a1d0357a0c179e752f1d6f0df58..1a4053789d0167ad6cb70dcb60a09e56020eaeb3 100644 (file)
@@ -49,10 +49,10 @@ ccflags-y := -Ivirt/kvm -Iarch/ia64/kvm/
 asflags-y := -Ivirt/kvm -Iarch/ia64/kvm/
 
 common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
-               coalesced_mmio.o irq_comm.o assigned-dev.o)
+               coalesced_mmio.o irq_comm.o)
 
-ifeq ($(CONFIG_IOMMU_API),y)
-common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
+ifeq ($(CONFIG_KVM_DEVICE_ASSIGNMENT),y)
+common-objs += $(addprefix ../../../virt/kvm/, assigned-dev.o iommu.o)
 endif
 
 kvm-objs := $(common-objs) kvm-ia64.o kvm_fw.o
index ad3126a5864403eb328cdb911eff1ba223ee45fc..5b2dc0d10c8f4211d28e044a2071306d1ef955ed 100644 (file)
@@ -204,9 +204,11 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_COALESCED_MMIO:
                r = KVM_COALESCED_MMIO_PAGE_OFFSET;
                break;
+#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
        case KVM_CAP_IOMMU:
                r = iommu_present(&pci_bus_type);
                break;
+#endif
        default:
                r = 0;
        }
@@ -924,13 +926,15 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
        return 0;
 }
 
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+               bool line_status)
 {
        if (!irqchip_in_kernel(kvm))
                return -ENXIO;
 
        irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
-                                       irq_event->irq, irq_event->level);
+                                       irq_event->irq, irq_event->level,
+                                       line_status);
        return 0;
 }
 
@@ -942,24 +946,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
        int r = -ENOTTY;
 
        switch (ioctl) {
-       case KVM_SET_MEMORY_REGION: {
-               struct kvm_memory_region kvm_mem;
-               struct kvm_userspace_memory_region kvm_userspace_mem;
-
-               r = -EFAULT;
-               if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
-                       goto out;
-               kvm_userspace_mem.slot = kvm_mem.slot;
-               kvm_userspace_mem.flags = kvm_mem.flags;
-               kvm_userspace_mem.guest_phys_addr =
-                                       kvm_mem.guest_phys_addr;
-               kvm_userspace_mem.memory_size = kvm_mem.memory_size;
-               r = kvm_vm_ioctl_set_memory_region(kvm,
-                                       &kvm_userspace_mem, false);
-               if (r)
-                       goto out;
-               break;
-               }
        case KVM_CREATE_IRQCHIP:
                r = -EFAULT;
                r = kvm_ioapic_init(kvm);
@@ -1384,9 +1370,7 @@ void kvm_arch_sync_events(struct kvm *kvm)
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
        kvm_iommu_unmap_guest(kvm);
-#ifdef  KVM_CAP_DEVICE_ASSIGNMENT
        kvm_free_all_assigned_devices(kvm);
-#endif
        kfree(kvm->arch.vioapic);
        kvm_release_vm_pages(kvm);
 }
@@ -1578,9 +1562,8 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                struct kvm_memory_slot *memslot,
-               struct kvm_memory_slot old,
                struct kvm_userspace_memory_region *mem,
-               bool user_alloc)
+               enum kvm_mr_change change)
 {
        unsigned long i;
        unsigned long pfn;
@@ -1610,8 +1593,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
                struct kvm_userspace_memory_region *mem,
-               struct kvm_memory_slot old,
-               bool user_alloc)
+               const struct kvm_memory_slot *old,
+               enum kvm_mr_change change)
 {
        return;
 }
index c3e2935b6db4f56741cf5afd362c4aa458db35ab..c5f92a926a9a27702a95edd0c9425350b0a1e730 100644 (file)
@@ -27,10 +27,4 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
 #define kvm_apic_present(x) (true)
 #define kvm_lapic_enabled(x) (true)
 
-static inline bool kvm_apic_vid_enabled(void)
-{
-       /* IA64 has no apicv supporting, do nothing here */
-       return false;
-}
-
 #endif
index 4bc2c3dad6adf4045de3c9749c1c7fec1b777bb5..cf4df8e2139af0a1a0c083ba3ad2618b2efc57c4 100644 (file)
 #define H_SET_MODE             0x31C
 #define MAX_HCALL_OPCODE       H_SET_MODE
 
+/* Platform specific hcalls, used by KVM */
+#define H_RTAS                 0xf000
+
 #ifndef __ASSEMBLY__
 
 /**
index 5a56e1c5f8517fde0b4005c0c456d14a0b4ce7bc..349ed85c7d61e00dff7358e70933197704f9b9db 100644 (file)
@@ -142,6 +142,8 @@ extern int kvmppc_mmu_hv_init(void);
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+extern void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+                                         unsigned int vec);
 extern void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags);
 extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
                           bool upper, u32 val);
@@ -156,7 +158,8 @@ void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
                        unsigned long pte_index);
 extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr,
                        unsigned long *nb_ret);
-extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr);
+extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr,
+                       unsigned long gpa, bool dirty);
 extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
                        long pte_index, unsigned long pteh, unsigned long ptel);
 extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
@@ -458,6 +461,8 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 #define OSI_SC_MAGIC_R4                        0x77810F9B
 
 #define INS_DCBZ                       0x7c0007ec
+/* TO = 31 for unconditional trap */
+#define INS_TW                         0x7fe00008
 
 /* LPIDs we support with this build -- runtime limit may be lower */
 #define KVMPPC_NR_LPIDS                        (LPID_RSVD + 1)
index 38bec1dc99281f682ddf13209c296be0a8da056b..9c1ff330c8053563545b9a0e7cfc72e62798365e 100644 (file)
@@ -268,4 +268,17 @@ static inline int is_vrma_hpte(unsigned long hpte_v)
                (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)));
 }
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+/*
+ * Note modification of an HPTE; set the HPTE modified bit
+ * if anyone is interested.
+ */
+static inline void note_hpte_modification(struct kvm *kvm,
+                                         struct revmap_entry *rev)
+{
+       if (atomic_read(&kvm->arch.hpte_mod_interest))
+               rev->guest_rpte |= HPTE_GR_MODIFIED;
+}
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
index cdc3d2717cc6e0feb9858e2cf1502958e8b3d711..9039d3c97eecd2a5324ab759c7511d448ed1a49c 100644 (file)
 #ifndef __ASM_KVM_BOOK3S_ASM_H__
 #define __ASM_KVM_BOOK3S_ASM_H__
 
+/* XICS ICP register offsets */
+#define XICS_XIRR              4
+#define XICS_MFRR              0xc
+#define XICS_IPI               2       /* interrupt source # for IPIs */
+
 #ifdef __ASSEMBLY__
 
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
@@ -81,10 +86,11 @@ struct kvmppc_host_state {
 #ifdef CONFIG_KVM_BOOK3S_64_HV
        u8 hwthread_req;
        u8 hwthread_state;
-
+       u8 host_ipi;
        struct kvm_vcpu *kvm_vcpu;
        struct kvmppc_vcore *kvm_vcore;
        unsigned long xics_phys;
+       u32 saved_xirr;
        u64 dabr;
        u64 host_mmcr[3];
        u32 host_pmc[8];
index b7cd3356a532d7c76d53dd9e1c418c40adb35275..d3c1eb34c986470af2f7f4baa0bfc226b1ca1488 100644 (file)
@@ -26,6 +26,8 @@
 /* LPIDs we support with this build -- runtime limit may be lower */
 #define KVMPPC_NR_LPIDS                        64
 
+#define KVMPPC_INST_EHPRIV     0x7c00021c
+
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 {
        vcpu->arch.gpr[num] = val;
index d1bb86074721cf5394da11aca171fee1981bd72b..af326cde7cb62bf2f07c70e6d0992e154504d5e0 100644 (file)
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #endif
 
+/* These values are internal and can be increased later */
+#define KVM_NR_IRQCHIPS          1
+#define KVM_IRQCHIP_NUM_PINS     256
+
 #if !defined(CONFIG_KVM_440)
 #include <linux/mmu_notifier.h>
 
@@ -188,6 +192,10 @@ struct kvmppc_linear_info {
        int              type;
 };
 
+/* XICS components, defined in book3s_xics.c */
+struct kvmppc_xics;
+struct kvmppc_icp;
+
 /*
  * The reverse mapping array has one entry for each HPTE,
  * which stores the guest's view of the second word of the HPTE
@@ -255,6 +263,13 @@ struct kvm_arch {
 #endif /* CONFIG_KVM_BOOK3S_64_HV */
 #ifdef CONFIG_PPC_BOOK3S_64
        struct list_head spapr_tce_tables;
+       struct list_head rtas_tokens;
+#endif
+#ifdef CONFIG_KVM_MPIC
+       struct openpic *mpic;
+#endif
+#ifdef CONFIG_KVM_XICS
+       struct kvmppc_xics *xics;
 #endif
 };
 
@@ -301,11 +316,13 @@ struct kvmppc_vcore {
  * that a guest can register.
  */
 struct kvmppc_vpa {
+       unsigned long gpa;      /* Current guest phys addr */
        void *pinned_addr;      /* Address in kernel linear mapping */
        void *pinned_end;       /* End of region */
        unsigned long next_gpa; /* Guest phys addr for update */
        unsigned long len;      /* Number of bytes required */
        u8 update_pending;      /* 1 => update pinned_addr from next_gpa */
+       bool dirty;             /* true => area has been modified by kernel */
 };
 
 struct kvmppc_pte {
@@ -359,6 +376,11 @@ struct kvmppc_slb {
 #define KVMPPC_BOOKE_MAX_IAC   4
 #define KVMPPC_BOOKE_MAX_DAC   2
 
+/* KVMPPC_EPR_USER takes precedence over KVMPPC_EPR_KERNEL */
+#define KVMPPC_EPR_NONE                0 /* EPR not supported */
+#define KVMPPC_EPR_USER                1 /* exit to userspace to fill EPR */
+#define KVMPPC_EPR_KERNEL      2 /* in-kernel irqchip */
+
 struct kvmppc_booke_debug_reg {
        u32 dbcr0;
        u32 dbcr1;
@@ -370,6 +392,12 @@ struct kvmppc_booke_debug_reg {
        u64 dac[KVMPPC_BOOKE_MAX_DAC];
 };
 
+#define KVMPPC_IRQ_DEFAULT     0
+#define KVMPPC_IRQ_MPIC                1
+#define KVMPPC_IRQ_XICS                2
+
+struct openpic;
+
 struct kvm_vcpu_arch {
        ulong host_stack;
        u32 host_pid;
@@ -502,8 +530,11 @@ struct kvm_vcpu_arch {
        spinlock_t wdt_lock;
        struct timer_list wdt_timer;
        u32 tlbcfg[4];
+       u32 tlbps[4];
        u32 mmucfg;
+       u32 eptcfg;
        u32 epr;
+       u32 crit_save;
        struct kvmppc_booke_debug_reg dbg_reg;
 #endif
        gpa_t paddr_accessed;
@@ -521,7 +552,7 @@ struct kvm_vcpu_arch {
        u8 sane;
        u8 cpu_type;
        u8 hcall_needed;
-       u8 epr_enabled;
+       u8 epr_flags; /* KVMPPC_EPR_xxx */
        u8 epr_needed;
 
        u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
@@ -548,6 +579,13 @@ struct kvm_vcpu_arch {
        unsigned long magic_page_pa; /* phys addr to map the magic page to */
        unsigned long magic_page_ea; /* effect. addr to map the magic page to */
 
+       int irq_type;           /* one of KVM_IRQ_* */
+       int irq_cpu_id;
+       struct openpic *mpic;   /* KVM_IRQ_MPIC */
+#ifdef CONFIG_KVM_XICS
+       struct kvmppc_icp *icp; /* XICS presentation controller */
+#endif
+
 #ifdef CONFIG_KVM_BOOK3S_64_HV
        struct kvm_vcpu_arch_shared shregs;
 
@@ -588,5 +626,6 @@ struct kvm_vcpu_arch {
 #define KVM_MMIO_REG_FQPR      0x0060
 
 #define __KVM_HAVE_ARCH_WQP
+#define __KVM_HAVE_CREATE_DEVICE
 
 #endif /* __POWERPC_KVM_HOST_H__ */
index 44a657adf41606c3219a0577ac78952ee6b9c64a..a5287fe03d773e7d541e26757b8b5fad4a9a3af3 100644 (file)
@@ -44,7 +44,7 @@ enum emulation_result {
        EMULATE_DO_DCR,       /* kvm_run filled with DCR request */
        EMULATE_FAIL,         /* can't emulate this instruction */
        EMULATE_AGAIN,        /* something went wrong. go again */
-       EMULATE_DO_PAPR,      /* kvm_run filled with PAPR request */
+       EMULATE_EXIT_USER,    /* emulation requires exit to user-space */
 };
 
 extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
@@ -104,8 +104,7 @@ extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                        struct kvm_interrupt *irq);
-extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
-                                         struct kvm_interrupt *irq);
+extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu);
 
 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
@@ -131,6 +130,7 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm,
 extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
                        struct kvm_memory_slot *memslot, unsigned long porder);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
                                struct kvm_create_spapr_tce *args);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
@@ -152,7 +152,7 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
                                struct kvm_userspace_memory_region *mem);
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
                                struct kvm_userspace_memory_region *mem,
-                               struct kvm_memory_slot old);
+                               const struct kvm_memory_slot *old);
 extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
                                      struct kvm_ppc_smmu_info *info);
 extern void kvmppc_core_flush_memslot(struct kvm *kvm,
@@ -165,6 +165,18 @@ extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
 
 extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
 
+int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
+
+extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp);
+extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu);
+extern void kvmppc_rtas_tokens_free(struct kvm *kvm);
+extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server,
+                               u32 priority);
+extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
+                               u32 *priority);
+extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq);
+extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq);
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
@@ -246,12 +258,29 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *);
 
 void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
 
+struct openpic;
+
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 {
        paca[cpu].kvm_hstate.xics_phys = addr;
 }
 
+static inline u32 kvmppc_get_xics_latch(void)
+{
+       u32 xirr = get_paca()->kvm_hstate.saved_xirr;
+
+       get_paca()->kvm_hstate.saved_xirr = 0;
+
+       return xirr;
+}
+
+static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
+{
+       paca[cpu].kvm_hstate.host_ipi = host_ipi;
+}
+
+extern void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu);
 extern void kvm_linear_init(void);
 
 #else
@@ -260,6 +289,46 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
 
 static inline void kvm_linear_init(void)
 {}
+
+static inline u32 kvmppc_get_xics_latch(void)
+{
+       return 0;
+}
+
+static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
+{}
+
+static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+       kvm_vcpu_kick(vcpu);
+}
+#endif
+
+#ifdef CONFIG_KVM_XICS
+static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
+}
+extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
+extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
+extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
+extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
+extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
+extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
+                       struct kvm_vcpu *vcpu, u32 cpu);
+#else
+static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
+       { return 0; }
+static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
+static inline int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu,
+                                        unsigned long server)
+       { return -EINVAL; }
+static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm,
+                                       struct kvm_irq_level *args)
+       { return -ENOTTY; }
+static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
+       { return 0; }
 #endif
 
 static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
@@ -271,6 +340,32 @@ static inline void kvmppc_set_epr(struct kvm_vcpu *vcpu, u32 epr)
 #endif
 }
 
+#ifdef CONFIG_KVM_MPIC
+
+void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu);
+int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
+                            u32 cpu);
+void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu);
+
+#else
+
+static inline void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline int kvmppc_mpic_connect_vcpu(struct kvm_device *dev,
+               struct kvm_vcpu *vcpu, u32 cpu)
+{
+       return -EINVAL;
+}
+
+static inline void kvmppc_mpic_disconnect_vcpu(struct openpic *opp,
+               struct kvm_vcpu *vcpu)
+{
+}
+
+#endif /* CONFIG_KVM_MPIC */
+
 int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
                              struct kvm_config_tlb *cfg);
 int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
@@ -283,8 +378,15 @@ void kvmppc_init_lpid(unsigned long nr_lpids);
 
 static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
 {
-       /* Clear i-cache for new pages */
        struct page *page;
+       /*
+        * We can only access pages that the kernel maps
+        * as memory. Bail out for unmapped ones.
+        */
+       if (!pfn_valid(pfn))
+               return;
+
+       /* Clear i-cache for new pages */
        page = pfn_to_page(pfn);
        if (!test_bit(PG_arch_1, &page->flags)) {
                flush_dcache_icache_page(page);
@@ -324,4 +426,6 @@ static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb)
        return ea;
 }
 
+extern void xics_wake_cpu(int cpu);
+
 #endif /* __POWERPC_KVM_PPC_H__ */
index 3d17427e4fd7016ceccf7e0069bc1625567fd4ea..a6136515c7f2952d82843eb47586901b3747f22f 100644 (file)
 #define     LPCR_PECE1 0x00002000      /* decrementer can cause exit */
 #define     LPCR_PECE2 0x00001000      /* machine check etc can cause exit */
 #define   LPCR_MER     0x00000800      /* Mediated External Exception */
+#define   LPCR_MER_SH  11
 #define   LPCR_LPES    0x0000000c
 #define   LPCR_LPES0   0x00000008      /* LPAR Env selector 0 */
 #define   LPCR_LPES1   0x00000004      /* LPAR Env selector 1 */
index 16064d00adb900057039423fc548772eec5c938c..0fb1a6e9ff908c58517ccc991b4c6fcab5b534fe 100644 (file)
@@ -25,6 +25,8 @@
 /* Select powerpc specific features in <linux/kvm.h> */
 #define __KVM_HAVE_SPAPR_TCE
 #define __KVM_HAVE_PPC_SMT
+#define __KVM_HAVE_IRQCHIP
+#define __KVM_HAVE_IRQ_LINE
 
 struct kvm_regs {
        __u64 pc;
@@ -272,8 +274,31 @@ struct kvm_debug_exit_arch {
 
 /* for KVM_SET_GUEST_DEBUG */
 struct kvm_guest_debug_arch {
+       struct {
+               /* H/W breakpoint/watchpoint address */
+               __u64 addr;
+               /*
+                * Type denotes h/w breakpoint, read watchpoint, write
+                * watchpoint or watchpoint (both read and write).
+                */
+#define KVMPPC_DEBUG_NONE              0x0
+#define KVMPPC_DEBUG_BREAKPOINT                (1UL << 1)
+#define KVMPPC_DEBUG_WATCH_WRITE       (1UL << 2)
+#define KVMPPC_DEBUG_WATCH_READ                (1UL << 3)
+               __u32 type;
+               __u32 reserved;
+       } bp[16];
 };
 
+/* Debug related defines */
+/*
+ * kvm_guest_debug->control is a 32 bit field. The lower 16 bits are generic
+ * and upper 16 bits are architecture specific. Architecture specific defines
+ * that ioctl is for setting hardware breakpoint or software breakpoint.
+ */
+#define KVM_GUESTDBG_USE_SW_BP         0x00010000
+#define KVM_GUESTDBG_USE_HW_BP         0x00020000
+
 /* definition of registers in kvm_run */
 struct kvm_sync_regs {
 };
@@ -299,6 +324,12 @@ struct kvm_allocate_rma {
        __u64 rma_size;
 };
 
+/* for KVM_CAP_PPC_RTAS */
+struct kvm_rtas_token_args {
+       char name[120];
+       __u64 token;    /* Use a token of 0 to undefine a mapping */
+};
+
 struct kvm_book3e_206_tlb_entry {
        __u32 mas8;
        __u32 mas1;
@@ -359,6 +390,26 @@ struct kvm_get_htab_header {
        __u16   n_invalid;
 };
 
+/* Per-vcpu XICS interrupt controller state */
+#define KVM_REG_PPC_ICP_STATE  (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c)
+
+#define  KVM_REG_PPC_ICP_CPPR_SHIFT    56      /* current proc priority */
+#define  KVM_REG_PPC_ICP_CPPR_MASK     0xff
+#define  KVM_REG_PPC_ICP_XISR_SHIFT    32      /* interrupt status field */
+#define  KVM_REG_PPC_ICP_XISR_MASK     0xffffff
+#define  KVM_REG_PPC_ICP_MFRR_SHIFT    24      /* pending IPI priority */
+#define  KVM_REG_PPC_ICP_MFRR_MASK     0xff
+#define  KVM_REG_PPC_ICP_PPRI_SHIFT    16      /* pending irq priority */
+#define  KVM_REG_PPC_ICP_PPRI_MASK     0xff
+
+/* Device control API: PPC-specific devices */
+#define KVM_DEV_MPIC_GRP_MISC          1
+#define   KVM_DEV_MPIC_BASE_ADDR       0       /* 64-bit */
+
+#define KVM_DEV_MPIC_GRP_REGISTER      2       /* 32-bit */
+#define KVM_DEV_MPIC_GRP_IRQ_ACTIVE    3       /* 32-bit */
+
+/* One-Reg API: PPC-specific registers */
 #define KVM_REG_PPC_HIOR       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1)
 #define KVM_REG_PPC_IAC1       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2)
 #define KVM_REG_PPC_IAC2       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3)
@@ -417,4 +468,47 @@ struct kvm_get_htab_header {
 #define KVM_REG_PPC_EPCR       (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85)
 #define KVM_REG_PPC_EPR                (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86)
 
+/* Timer Status Register OR/CLEAR interface */
+#define KVM_REG_PPC_OR_TSR     (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x87)
+#define KVM_REG_PPC_CLEAR_TSR  (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x88)
+#define KVM_REG_PPC_TCR                (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x89)
+#define KVM_REG_PPC_TSR                (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8a)
+
+/* Debugging: Special instruction for software breakpoint */
+#define KVM_REG_PPC_DEBUG_INST (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8b)
+
+/* MMU registers */
+#define KVM_REG_PPC_MAS0       (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8c)
+#define KVM_REG_PPC_MAS1       (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x8d)
+#define KVM_REG_PPC_MAS2       (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8e)
+#define KVM_REG_PPC_MAS7_3     (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8f)
+#define KVM_REG_PPC_MAS4       (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x90)
+#define KVM_REG_PPC_MAS6       (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x91)
+#define KVM_REG_PPC_MMUCFG     (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x92)
+/*
+ * TLBnCFG fields TLBnCFG_N_ENTRY and TLBnCFG_ASSOC can be changed only using
+ * KVM_CAP_SW_TLB ioctl
+ */
+#define KVM_REG_PPC_TLB0CFG    (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x93)
+#define KVM_REG_PPC_TLB1CFG    (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x94)
+#define KVM_REG_PPC_TLB2CFG    (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x95)
+#define KVM_REG_PPC_TLB3CFG    (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x96)
+#define KVM_REG_PPC_TLB0PS     (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x97)
+#define KVM_REG_PPC_TLB1PS     (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x98)
+#define KVM_REG_PPC_TLB2PS     (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x99)
+#define KVM_REG_PPC_TLB3PS     (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9a)
+#define KVM_REG_PPC_EPTCFG     (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x9b)
+
+/* PPC64 eXternal Interrupt Controller Specification */
+#define KVM_DEV_XICS_GRP_SOURCES       1       /* 64-bit source attributes */
+
+/* Layout of 64-bit source attribute values */
+#define  KVM_XICS_DESTINATION_SHIFT    0
+#define  KVM_XICS_DESTINATION_MASK     0xffffffffULL
+#define  KVM_XICS_PRIORITY_SHIFT       32
+#define  KVM_XICS_PRIORITY_MASK                0xff
+#define  KVM_XICS_LEVEL_SENSITIVE      (1ULL << 40)
+#define  KVM_XICS_MASKED               (1ULL << 41)
+#define  KVM_XICS_PENDING              (1ULL << 42)
+
 #endif /* __LINUX_KVM_POWERPC_H */
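
KVM_REG_PPC_ICP_STATE packs a vcpu's interrupt-presentation state into a single 64-bit value, retrieved and restored through the one-reg interface. A hedged userspace sketch of reading and decoding it with the shift/mask definitions above; the vcpu_fd and the minimal error handling are assumptions for illustration.

#include <stdio.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* vcpu_fd: an open KVM vcpu file descriptor (assumed). */
int example_read_icp_state(int vcpu_fd)
{
	uint64_t val = 0;
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_ICP_STATE,
		.addr = (uintptr_t)&val,
	};

	if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
		return -1;

	unsigned long cppr = (val >> KVM_REG_PPC_ICP_CPPR_SHIFT) & KVM_REG_PPC_ICP_CPPR_MASK;
	unsigned long xisr = (val >> KVM_REG_PPC_ICP_XISR_SHIFT) & KVM_REG_PPC_ICP_XISR_MASK;
	unsigned long mfrr = (val >> KVM_REG_PPC_ICP_MFRR_SHIFT) & KVM_REG_PPC_ICP_MFRR_MASK;
	unsigned long ppri = (val >> KVM_REG_PPC_ICP_PPRI_SHIFT) & KVM_REG_PPC_ICP_PPRI_MASK;

	printf("cppr=%lu xisr=%lu mfrr=%lu ppri=%lu\n", cppr, xisr, mfrr, ppri);
	return 0;
}
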
index 172233eab79945800feed84fdd0cddd9298d5a28..b51a97cfedf88ff2b7884ef19485ff081d18cf78 100644 (file)
@@ -480,6 +480,7 @@ int main(void)
        DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
        DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
        DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
+       DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
        DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
@@ -576,6 +577,8 @@ int main(void)
        HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
        HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
        HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
+       HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
+       HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
        HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
        HSTATE_FIELD(HSTATE_PMC, host_pmc);
        HSTATE_FIELD(HSTATE_PURR, host_purr);
@@ -599,6 +602,7 @@ int main(void)
        DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
        DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
        DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
+       DEFINE(VCPU_CRIT_SAVE, offsetof(struct kvm_vcpu, arch.crit_save));
 #endif /* CONFIG_PPC_BOOK3S */
 #endif /* CONFIG_KVM */
 
index 3d7fd21c65f92eeeb26ce19fc9d7fe07d0cdb36c..2f5c6b6d687709ecd7a71f0f7985a48974b075f5 100644 (file)
@@ -124,6 +124,18 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        return kvmppc_set_sregs_ivor(vcpu, sregs);
 }
 
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
+                       union kvmppc_one_reg *val)
+{
+       return -EINVAL;
+}
+
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
+                      union kvmppc_one_reg *val)
+{
+       return -EINVAL;
+}
+
 struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 {
        struct kvmppc_vcpu_44x *vcpu_44x;
index 63c67ec72e437011d639a394392ae6ae48c22923..eb643f8625796711f93fbad6d92022f957517df9 100644 (file)
@@ -136,21 +136,41 @@ config KVM_E500V2
          If unsure, say N.
 
 config KVM_E500MC
-       bool "KVM support for PowerPC E500MC/E5500 processors"
+       bool "KVM support for PowerPC E500MC/E5500/E6500 processors"
        depends on PPC_E500MC
        select KVM
        select KVM_MMIO
        select KVM_BOOKE_HV
        select MMU_NOTIFIER
        ---help---
-         Support running unmodified E500MC/E5500 (32-bit) guest kernels in
-         virtual machines on E500MC/E5500 host processors.
+         Support running unmodified E500MC/E5500/E6500 guest kernels in
+         virtual machines on E500MC/E5500/E6500 host processors.
 
          This module provides access to the hardware capabilities through
          a character device node named /dev/kvm.
 
          If unsure, say N.
 
+config KVM_MPIC
+       bool "KVM in-kernel MPIC emulation"
+       depends on KVM && E500
+       select HAVE_KVM_IRQCHIP
+       select HAVE_KVM_IRQ_ROUTING
+       select HAVE_KVM_MSI
+       help
+         Enable support for emulating MPIC devices inside the
+          host kernel, rather than relying on userspace to emulate.
+          Currently, support is limited to certain versions of
+          Freescale's MPIC implementation.
+
+config KVM_XICS
+       bool "KVM in-kernel XICS emulation"
+       depends on KVM_BOOK3S_64 && !KVM_MPIC
+       ---help---
+         Include support for the XICS (eXternal Interrupt Controller
+         Specification) interrupt controller architecture used on
+         IBM POWER (pSeries) servers.
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
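
KVM_MPIC only builds the in-kernel model; a VMM still has to instantiate it, which happens through the device control API whose MPIC attribute groups (KVM_DEV_MPIC_GRP_MISC, KVM_DEV_MPIC_BASE_ADDR) appear in the powerpc uapi header earlier in this diff. A hedged sketch of that flow; vm_fd, the KVM_DEV_TYPE_FSL_MPIC_20 device type, and the minimal error handling are assumptions, not things shown by this hunk.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* vm_fd: an open KVM VM file descriptor (assumed). */
int example_create_inkernel_mpic(int vm_fd, uint64_t base_addr)
{
	struct kvm_create_device cd = {
		.type = KVM_DEV_TYPE_FSL_MPIC_20,	/* assumed device type */
	};
	struct kvm_device_attr attr = { 0 };

	if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
		return -1;

	/* Tell the in-kernel MPIC where its register block lives. */
	attr.group = KVM_DEV_MPIC_GRP_MISC;
	attr.attr  = KVM_DEV_MPIC_BASE_ADDR;
	attr.addr  = (uintptr_t)&base_addr;	/* 64-bit attribute, see header */

	return ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
}
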
index b772eded8c26b53e1d65d644aa7988bf17e85874..422de3f4d46cd129dd14c06dd1852dab9bf12463 100644 (file)
@@ -72,12 +72,18 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
        book3s_hv.o \
        book3s_hv_interrupts.o \
        book3s_64_mmu_hv.o
+kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
+       book3s_hv_rm_xics.o
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
        book3s_hv_rmhandlers.o \
        book3s_hv_rm_mmu.o \
        book3s_64_vio_hv.o \
        book3s_hv_ras.o \
-       book3s_hv_builtin.o
+       book3s_hv_builtin.o \
+       $(kvm-book3s_64-builtin-xics-objs-y)
+
+kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
+       book3s_xics.o
 
 kvm-book3s_64-module-objs := \
        ../../../virt/kvm/kvm_main.o \
@@ -86,6 +92,7 @@ kvm-book3s_64-module-objs := \
        emulate.o \
        book3s.o \
        book3s_64_vio.o \
+       book3s_rtas.o \
        $(kvm-book3s_64-objs-y)
 
 kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
@@ -103,6 +110,9 @@ kvm-book3s_32-objs := \
        book3s_32_mmu.o
 kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
 
+kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o
+kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(addprefix ../../../virt/kvm/, irqchip.o)
+
 kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
 
 obj-$(CONFIG_KVM_440) += kvm.o
index a4b64528524098633bcec777404c2fcbf5ac2fe6..700df6f1d32c5a6ad88542db1a0c8ccd1bbe26d4 100644 (file)
@@ -104,7 +104,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
        return prio;
 }
 
-static void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
+void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu,
                                          unsigned int vec)
 {
        unsigned long old_pending = vcpu->arch.pending_exceptions;
@@ -160,8 +160,7 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
        kvmppc_book3s_queue_irqprio(vcpu, vec);
 }
 
-void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
-                                  struct kvm_interrupt *irq)
+void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
 {
        kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
        kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
@@ -530,6 +529,21 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
                        val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]);
                        break;
 #endif /* CONFIG_ALTIVEC */
+               case KVM_REG_PPC_DEBUG_INST: {
+                       u32 opcode = INS_TW;
+                       r = copy_to_user((u32 __user *)(long)reg->addr,
+                                        &opcode, sizeof(u32));
+                       break;
+               }
+#ifdef CONFIG_KVM_XICS
+               case KVM_REG_PPC_ICP_STATE:
+                       if (!vcpu->arch.icp) {
+                               r = -ENXIO;
+                               break;
+                       }
+                       val = get_reg_val(reg->id, kvmppc_xics_get_icp(vcpu));
+                       break;
+#endif /* CONFIG_KVM_XICS */
                default:
                        r = -EINVAL;
                        break;
@@ -592,6 +606,16 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
                        vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val);
                        break;
 #endif /* CONFIG_ALTIVEC */
+#ifdef CONFIG_KVM_XICS
+               case KVM_REG_PPC_ICP_STATE:
+                       if (!vcpu->arch.icp) {
+                               r = -ENXIO;
+                               break;
+                       }
+                       r = kvmppc_xics_set_icp(vcpu,
+                                               set_reg_val(reg->id, val));
+                       break;
+#endif /* CONFIG_KVM_XICS */
                default:
                        r = -EINVAL;
                        break;
@@ -607,6 +631,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
        return 0;
 }
 
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+                                       struct kvm_guest_debug *dbg)
+{
+       return -EINVAL;
+}
+
 void kvmppc_decrementer_func(unsigned long data)
 {
        struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
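
The KVM_REG_PPC_DEBUG_INST handler added above hands userspace the trap opcode (INS_TW) to patch over a guest instruction when it wants a software breakpoint; the KVM_GUESTDBG_USE_SW_BP flag defined in the uapi header marks that mode for KVM_SET_GUEST_DEBUG, although the Book3S kvm_arch_vcpu_ioctl_set_guest_debug() stub introduced here still returns -EINVAL. A hedged userspace sketch of querying the opcode; vcpu_fd is an assumption.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Returns the software-breakpoint opcode to patch into guest memory, 0 on error. */
uint32_t example_sw_breakpoint_opcode(int vcpu_fd)
{
	uint32_t opcode = 0;
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_DEBUG_INST,
		.addr = (uintptr_t)&opcode,
	};

	if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
		return 0;

	return opcode;	/* INS_TW (0x7fe00008) on Book3S */
}
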
index da98e26f6e454c95be4596a12f85da4840b6f1e9..5880dfb31074895816af634620e736cd51985bc2 100644 (file)
@@ -893,7 +893,10 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
                        /* Harvest R and C */
                        rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
                        *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
-                       rev[i].guest_rpte = ptel | rcbits;
+                       if (rcbits & ~rev[i].guest_rpte) {
+                               rev[i].guest_rpte = ptel | rcbits;
+                               note_hpte_modification(kvm, &rev[i]);
+                       }
                }
                unlock_rmap(rmapp);
                hptep[0] &= ~HPTE_V_HVLOCK;
@@ -976,7 +979,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
                /* Now check and modify the HPTE */
                if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) {
                        kvmppc_clear_ref_hpte(kvm, hptep, i);
-                       rev[i].guest_rpte |= HPTE_R_R;
+                       if (!(rev[i].guest_rpte & HPTE_R_R)) {
+                               rev[i].guest_rpte |= HPTE_R_R;
+                               note_hpte_modification(kvm, &rev[i]);
+                       }
                        ret = 1;
                }
                hptep[0] &= ~HPTE_V_HVLOCK;
@@ -1080,7 +1086,10 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
                        hptep[1] &= ~HPTE_R_C;
                        eieio();
                        hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
-                       rev[i].guest_rpte |= HPTE_R_C;
+                       if (!(rev[i].guest_rpte & HPTE_R_C)) {
+                               rev[i].guest_rpte |= HPTE_R_C;
+                               note_hpte_modification(kvm, &rev[i]);
+                       }
                        ret = 1;
                }
                hptep[0] &= ~HPTE_V_HVLOCK;
@@ -1090,11 +1099,30 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
        return ret;
 }
 
+static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
+                             struct kvm_memory_slot *memslot,
+                             unsigned long *map)
+{
+       unsigned long gfn;
+
+       if (!vpa->dirty || !vpa->pinned_addr)
+               return;
+       gfn = vpa->gpa >> PAGE_SHIFT;
+       if (gfn < memslot->base_gfn ||
+           gfn >= memslot->base_gfn + memslot->npages)
+               return;
+
+       vpa->dirty = false;
+       if (map)
+               __set_bit_le(gfn - memslot->base_gfn, map);
+}
+
 long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
                             unsigned long *map)
 {
        unsigned long i;
        unsigned long *rmapp;
+       struct kvm_vcpu *vcpu;
 
        preempt_disable();
        rmapp = memslot->arch.rmap;
@@ -1103,6 +1131,15 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
                        __set_bit_le(i, map);
                ++rmapp;
        }
+
+       /* Harvest dirty bits from VPA and DTL updates */
+       /* Note: we never modify the SLB shadow buffer areas */
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               spin_lock(&vcpu->arch.vpa_update_lock);
+               harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map);
+               harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map);
+               spin_unlock(&vcpu->arch.vpa_update_lock);
+       }
        preempt_enable();
        return 0;
 }
@@ -1114,7 +1151,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
        unsigned long gfn = gpa >> PAGE_SHIFT;
        struct page *page, *pages[1];
        int npages;
-       unsigned long hva, psize, offset;
+       unsigned long hva, offset;
        unsigned long pa;
        unsigned long *physp;
        int srcu_idx;
@@ -1146,14 +1183,9 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
        }
        srcu_read_unlock(&kvm->srcu, srcu_idx);
 
-       psize = PAGE_SIZE;
-       if (PageHuge(page)) {
-               page = compound_head(page);
-               psize <<= compound_order(page);
-       }
-       offset = gpa & (psize - 1);
+       offset = gpa & (PAGE_SIZE - 1);
        if (nb_ret)
-               *nb_ret = psize - offset;
+               *nb_ret = PAGE_SIZE - offset;
        return page_address(page) + offset;
 
  err:
@@ -1161,11 +1193,31 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
        return NULL;
 }
 
-void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
+void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
+                            bool dirty)
 {
        struct page *page = virt_to_page(va);
+       struct kvm_memory_slot *memslot;
+       unsigned long gfn;
+       unsigned long *rmap;
+       int srcu_idx;
 
        put_page(page);
+
+       if (!dirty || !kvm->arch.using_mmu_notifiers)
+               return;
+
+       /* We need to mark this page dirty in the rmap chain */
+       gfn = gpa >> PAGE_SHIFT;
+       srcu_idx = srcu_read_lock(&kvm->srcu);
+       memslot = gfn_to_memslot(kvm, gfn);
+       if (memslot) {
+               rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+               lock_rmap(rmap);
+               *rmap |= KVMPPC_RMAP_CHANGED;
+               unlock_rmap(rmap);
+       }
+       srcu_read_unlock(&kvm->srcu, srcu_idx);
 }
 
 /*
@@ -1193,16 +1245,36 @@ struct kvm_htab_ctx {
 
 #define HPTE_SIZE      (2 * sizeof(unsigned long))
 
+/*
+ * Returns 1 if this HPT entry has been modified or has pending
+ * R/C bit changes.
+ */
+static int hpte_dirty(struct revmap_entry *revp, unsigned long *hptp)
+{
+       unsigned long rcbits_unset;
+
+       if (revp->guest_rpte & HPTE_GR_MODIFIED)
+               return 1;
+
+       /* Also need to consider changes in reference and changed bits */
+       rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
+       if ((hptp[0] & HPTE_V_VALID) && (hptp[1] & rcbits_unset))
+               return 1;
+
+       return 0;
+}
+
 static long record_hpte(unsigned long flags, unsigned long *hptp,
                        unsigned long *hpte, struct revmap_entry *revp,
                        int want_valid, int first_pass)
 {
        unsigned long v, r;
+       unsigned long rcbits_unset;
        int ok = 1;
        int valid, dirty;
 
        /* Unmodified entries are uninteresting except on the first pass */
-       dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+       dirty = hpte_dirty(revp, hptp);
        if (!first_pass && !dirty)
                return 0;
 
@@ -1223,16 +1295,28 @@ static long record_hpte(unsigned long flags, unsigned long *hptp,
                while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
                        cpu_relax();
                v = hptp[0];
+
+               /* re-evaluate valid and dirty from synchronized HPTE value */
+               valid = !!(v & HPTE_V_VALID);
+               dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+
+               /* Harvest R and C into guest view if necessary */
+               rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
+               if (valid && (rcbits_unset & hptp[1])) {
+                       revp->guest_rpte |= (hptp[1] & (HPTE_R_R | HPTE_R_C)) |
+                               HPTE_GR_MODIFIED;
+                       dirty = 1;
+               }
+
                if (v & HPTE_V_ABSENT) {
                        v &= ~HPTE_V_ABSENT;
                        v |= HPTE_V_VALID;
+                       valid = 1;
                }
-               /* re-evaluate valid and dirty from synchronized HPTE value */
-               valid = !!(v & HPTE_V_VALID);
                if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
                        valid = 0;
-               r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C));
-               dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+
+               r = revp->guest_rpte;
                /* only clear modified if this is the right sort of entry */
                if (valid == want_valid && dirty) {
                        r &= ~HPTE_GR_MODIFIED;
@@ -1288,7 +1372,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
                /* Skip uninteresting entries, i.e. clean on not-first pass */
                if (!first_pass) {
                        while (i < kvm->arch.hpt_npte &&
-                              !(revp->guest_rpte & HPTE_GR_MODIFIED)) {
+                              !hpte_dirty(revp, hptp)) {
                                ++i;
                                hptp += 2;
                                ++revp;
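
Both hpte_dirty() and record_hpte() above lean on rcbits_unset = ~guest_rpte & (HPTE_R_R | HPTE_R_C): the set of hardware reference/changed bits the guest's view has not absorbed yet. A tiny standalone illustration of that mask; the concrete HPTE_R_R/HPTE_R_C values (0x100 and 0x80) are assumptions taken from the usual hash-MMU definitions, not from this hunk.

#include <stdio.h>
#include <stdint.h>

#define EX_HPTE_R_R 0x100UL	/* referenced bit (assumed value) */
#define EX_HPTE_R_C 0x080UL	/* changed bit    (assumed value) */

int main(void)
{
	uint64_t guest_rpte = EX_HPTE_R_R;			/* guest already saw R */
	uint64_t hpte_lo    = EX_HPTE_R_R | EX_HPTE_R_C;	/* hardware set R and C */

	uint64_t rcbits_unset = ~guest_rpte & (EX_HPTE_R_R | EX_HPTE_R_C);

	/* Only C is outstanding, so this entry must be treated as dirty. */
	printf("outstanding = %#llx, dirty = %d\n",
	       (unsigned long long)(hpte_lo & rcbits_unset),
	       (hpte_lo & rcbits_unset) != 0);
	return 0;
}
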
index 836c56975e21523577822c91b6ae68c1333702a5..1f6344c4408d6deba440fefab77829486f424d6c 100644 (file)
@@ -194,7 +194,9 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                run->papr_hcall.args[i] = gpr;
                        }
 
-                       emulated = EMULATE_DO_PAPR;
+                       run->exit_reason = KVM_EXIT_PAPR_HCALL;
+                       vcpu->arch.hcall_needed = 1;
+                       emulated = EMULATE_EXIT_USER;
                        break;
                }
 #endif
index f5416934932b17078fb60fdd4b0302df9e7c0e62..9de24f8e03c71b44e0407b65bbe137c3506650e3 100644 (file)
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
+void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+       int me;
+       int cpu = vcpu->cpu;
+       wait_queue_head_t *wqp;
+
+       wqp = kvm_arch_vcpu_wq(vcpu);
+       if (waitqueue_active(wqp)) {
+               wake_up_interruptible(wqp);
+               ++vcpu->stat.halt_wakeup;
+       }
+
+       me = get_cpu();
+
+       /* CPU points to the first thread of the core */
+       if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) {
+               int real_cpu = cpu + vcpu->arch.ptid;
+               if (paca[real_cpu].kvm_hstate.xics_phys)
+                       xics_wake_cpu(real_cpu);
+               else if (cpu_online(cpu))
+                       smp_send_reschedule(cpu);
+       }
+       put_cpu();
+}
+
 /*
  * We use the vcpu_load/put functions to measure stolen time.
  * Stolen time is counted as time when either the vcpu is able to
@@ -259,7 +284,7 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
                        len = ((struct reg_vpa *)va)->length.hword;
                else
                        len = ((struct reg_vpa *)va)->length.word;
-               kvmppc_unpin_guest_page(kvm, va);
+               kvmppc_unpin_guest_page(kvm, va, vpa, false);
 
                /* Check length */
                if (len > nb || len < sizeof(struct reg_vpa))
@@ -359,13 +384,13 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
                va = NULL;
                nb = 0;
                if (gpa)
-                       va = kvmppc_pin_guest_page(kvm, vpap->next_gpa, &nb);
+                       va = kvmppc_pin_guest_page(kvm, gpa, &nb);
                spin_lock(&vcpu->arch.vpa_update_lock);
                if (gpa == vpap->next_gpa)
                        break;
                /* sigh... unpin that one and try again */
                if (va)
-                       kvmppc_unpin_guest_page(kvm, va);
+                       kvmppc_unpin_guest_page(kvm, va, gpa, false);
        }
 
        vpap->update_pending = 0;
@@ -375,12 +400,15 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
                 * has changed the mappings underlying guest memory,
                 * so unregister the region.
                 */
-               kvmppc_unpin_guest_page(kvm, va);
+               kvmppc_unpin_guest_page(kvm, va, gpa, false);
                va = NULL;
        }
        if (vpap->pinned_addr)
-               kvmppc_unpin_guest_page(kvm, vpap->pinned_addr);
+               kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa,
+                                       vpap->dirty);
+       vpap->gpa = gpa;
        vpap->pinned_addr = va;
+       vpap->dirty = false;
        if (va)
                vpap->pinned_end = va + vpap->len;
 }
@@ -472,6 +500,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
        /* order writing *dt vs. writing vpa->dtl_idx */
        smp_wmb();
        vpa->dtl_idx = ++vcpu->arch.dtl_index;
+       vcpu->arch.dtl.dirty = true;
 }
 
 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
@@ -479,7 +508,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
        unsigned long req = kvmppc_get_gpr(vcpu, 3);
        unsigned long target, ret = H_SUCCESS;
        struct kvm_vcpu *tvcpu;
-       int idx;
+       int idx, rc;
 
        switch (req) {
        case H_ENTER:
@@ -515,6 +544,28 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
                                        kvmppc_get_gpr(vcpu, 5),
                                        kvmppc_get_gpr(vcpu, 6));
                break;
+       case H_RTAS:
+               if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+                       return RESUME_HOST;
+
+               rc = kvmppc_rtas_hcall(vcpu);
+
+               if (rc == -ENOENT)
+                       return RESUME_HOST;
+               else if (rc == 0)
+                       break;
+
+               /* Send the error out to userspace via KVM_RUN */
+               return rc;
+
+       case H_XIRR:
+       case H_CPPR:
+       case H_EOI:
+       case H_IPI:
+               if (kvmppc_xics_enabled(vcpu)) {
+                       ret = kvmppc_xics_hcall(vcpu, req);
+                       break;
+               } /* fallthrough */
        default:
                return RESUME_HOST;
        }
@@ -913,15 +964,19 @@ out:
        return ERR_PTR(err);
 }
 
+static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
+{
+       if (vpa->pinned_addr)
+               kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
+                                       vpa->dirty);
+}
+
 void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 {
        spin_lock(&vcpu->arch.vpa_update_lock);
-       if (vcpu->arch.dtl.pinned_addr)
-               kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl.pinned_addr);
-       if (vcpu->arch.slb_shadow.pinned_addr)
-               kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow.pinned_addr);
-       if (vcpu->arch.vpa.pinned_addr)
-               kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa.pinned_addr);
+       unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
+       unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
+       unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
        spin_unlock(&vcpu->arch.vpa_update_lock);
        kvm_vcpu_uninit(vcpu);
        kmem_cache_free(kvm_vcpu_cache, vcpu);
@@ -955,7 +1010,6 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
 }
 
 extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
-extern void xics_wake_cpu(int cpu);
 
 static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
                                   struct kvm_vcpu *vcpu)
@@ -1330,9 +1384,12 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                        break;
                vc->runner = vcpu;
                n_ceded = 0;
-               list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
+               list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
                        if (!v->arch.pending_exceptions)
                                n_ceded += v->arch.ceded;
+                       else
+                               v->arch.ceded = 0;
+               }
                if (n_ceded == vc->n_runnable)
                        kvmppc_vcore_blocked(vc);
                else
@@ -1645,12 +1702,12 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
                                      struct kvm_userspace_memory_region *mem,
-                                     struct kvm_memory_slot old)
+                                     const struct kvm_memory_slot *old)
 {
        unsigned long npages = mem->memory_size >> PAGE_SHIFT;
        struct kvm_memory_slot *memslot;
 
-       if (npages && old.npages) {
+       if (npages && old->npages) {
                /*
                 * If modifying a memslot, reset all the rmap dirty bits.
                 * If this is a new memslot, we don't need to do anything
@@ -1827,6 +1884,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
        cpumask_setall(&kvm->arch.need_tlb_flush);
 
        INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+       INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
 
        kvm->arch.rma = NULL;
 
@@ -1872,6 +1930,8 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
                kvm->arch.rma = NULL;
        }
 
+       kvmppc_rtas_tokens_free(kvm);
+
        kvmppc_free_hpt(kvm);
        WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
index 19c93bae1aea5781a421957c3ff941d39427f7f5..6dcbb49105a4667353ee745d4a3ea6c23db72082 100644 (file)
@@ -97,17 +97,6 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
 }
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 
-/*
- * Note modification of an HPTE; set the HPTE modified bit
- * if anyone is interested.
- */
-static inline void note_hpte_modification(struct kvm *kvm,
-                                         struct revmap_entry *rev)
-{
-       if (atomic_read(&kvm->arch.hpte_mod_interest))
-               rev->guest_rpte |= HPTE_GR_MODIFIED;
-}
-
 /* Remove this HPTE from the chain for a real page */
 static void remove_revmap_chain(struct kvm *kvm, long pte_index,
                                struct revmap_entry *rev,
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
new file mode 100644 (file)
index 0000000..b4b0082
--- /dev/null
@@ -0,0 +1,406 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+
+#include "book3s_xics.h"
+
+#define DEBUG_PASSUP
+
+static inline void rm_writeb(unsigned long paddr, u8 val)
+{
+       __asm__ __volatile__("sync; stbcix %0,0,%1"
+               : : "r" (val), "r" (paddr) : "memory");
+}
+
+static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
+                               struct kvm_vcpu *this_vcpu)
+{
+       struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
+       unsigned long xics_phys;
+       int cpu;
+
+       /* Mark the target VCPU as having an interrupt pending */
+       vcpu->stat.queue_intr++;
+       set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
+
+       /* Kick self ? Just set MER and return */
+       if (vcpu == this_vcpu) {
+               mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_MER);
+               return;
+       }
+
+       /* Check if the core is loaded, if not, too hard */
+       cpu = vcpu->cpu;
+       if (cpu < 0 || cpu >= nr_cpu_ids) {
+               this_icp->rm_action |= XICS_RM_KICK_VCPU;
+               this_icp->rm_kick_target = vcpu;
+               return;
+       }
+       /* In SMT cpu will always point to thread 0, we adjust it */
+       cpu += vcpu->arch.ptid;
+
+       /* Not too hard, then poke the target */
+       xics_phys = paca[cpu].kvm_hstate.xics_phys;
+       rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+}
+
+static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
+{
+       /* Note: Only called on self ! */
+       clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
+                 &vcpu->arch.pending_exceptions);
+       mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER);
+}
+
+static inline bool icp_rm_try_update(struct kvmppc_icp *icp,
+                                    union kvmppc_icp_state old,
+                                    union kvmppc_icp_state new)
+{
+       struct kvm_vcpu *this_vcpu = local_paca->kvm_hstate.kvm_vcpu;
+       bool success;
+
+       /* Calculate new output value */
+       new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+       /* Attempt atomic update */
+       success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+       if (!success)
+               goto bail;
+
+       /*
+        * Check for output state update
+        *
+        * Note that this is racy since another processor could be updating
+        * the state already. This is why we never clear the interrupt output
+        * here, we only ever set it. The clear only happens prior to doing
+        * an update and only by the processor itself. Currently we do it
+        * in Accept (H_XIRR) and Up_Cppr (H_XPPR).
+        *
+        * We also do not try to figure out whether the EE state has changed,
+        * we unconditionally set it if the new state calls for it. The reason
+        * for that is that we opportunistically remove the pending interrupt
+        * flag when raising CPPR, so we need to set it back here if an
+        * interrupt is still pending.
+        */
+       if (new.out_ee)
+               icp_rm_set_vcpu_irq(icp->vcpu, this_vcpu);
+
+       /* Expose the state change for debug purposes */
+       this_vcpu->arch.icp->rm_dbgstate = new;
+       this_vcpu->arch.icp->rm_dbgtgt = icp->vcpu;
+
+ bail:
+       return success;
+}
+
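
icp_rm_try_update() works because the entire ICP (CPPR, MFRR, pending priority, XISR and the output flag) is packed into a single 64-bit word, so one cmpxchg either commits the whole new state or fails and the caller retries with a fresh snapshot. A rough userspace sketch of that pattern; the bit layout below is only illustrative and is not the real kvmppc_icp_state definition from book3s_xics.h:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    union icp_state {                        /* illustrative packing only */
            uint64_t raw;
            struct {
                    uint8_t cppr;            /* current processor priority */
                    uint8_t mfrr;            /* most favored request (IPI) priority */
                    uint8_t pending_pri;     /* priority of the pending xisr */
                    unsigned int out_ee:1;   /* should the vcpu see an external irq? */
                    unsigned int need_resend:1;
                    unsigned int xisr:24;    /* pending interrupt source number */
            };
    };

    static _Atomic uint64_t icp_raw;

    static bool icp_try_update(union icp_state old, union icp_state new)
    {
            uint64_t expected = old.raw;

            /* recompute the output line from the new state, then swap atomically */
            new.out_ee = (new.xisr && new.pending_pri < new.cppr);
            return atomic_compare_exchange_strong(&icp_raw, &expected, new.raw);
    }

    /* caller pattern, as in icp_rm_down_cppr(): reload and retry until it lands */
    void icp_set_cppr(uint8_t cppr)
    {
            union icp_state old, new;

            do {
                    old.raw = new.raw = atomic_load(&icp_raw);
                    new.cppr = cppr;
            } while (!icp_try_update(old, new));
    }
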
+static inline int check_too_hard(struct kvmppc_xics *xics,
+                                struct kvmppc_icp *icp)
+{
+       return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS;
+}
+
+static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+                            u8 new_cppr)
+{
+       union kvmppc_icp_state old_state, new_state;
+       bool resend;
+
+       /*
+        * This handles several related states in one operation:
+        *
+        * ICP State: Down_CPPR
+        *
+        * Load CPPR with new value and if the XISR is 0
+        * then check for resends:
+        *
+        * ICP State: Resend
+        *
+        * If MFRR is more favored than CPPR, check for IPIs
+        * and notify ICS of a potential resend. This is done
+        * asynchronously (when used in real mode, we will have
+        * to exit here).
+        *
+        * We do not handle the complete Check_IPI as documented
+        * here. In the PAPR, this state will be used for both
+        * Set_MFRR and Down_CPPR. However, we know that we aren't
+        * changing the MFRR state here so we don't need to handle
+        * the case of an MFRR causing a reject of a pending irq,
+        * this will have been handled when the MFRR was set in the
+        * first place.
+        *
+        * Thus we don't have to handle rejects, only resends.
+        *
+        * When implementing real mode for HV KVM, resend will lead to
+        * a H_TOO_HARD return and the whole transaction will be handled
+        * in virtual mode.
+        */
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               /* Down_CPPR */
+               new_state.cppr = new_cppr;
+
+               /*
+                * Cut down Resend / Check_IPI / IPI
+                *
+                * The logic is that we cannot have a pending interrupt
+                * trumped by an IPI at this point (see above), so we
+                * know that either the pending interrupt is already an
+                * IPI (in which case we don't care to override it) or
+                * it's either more favored than us or non-existent
+                */
+               if (new_state.mfrr < new_cppr &&
+                   new_state.mfrr <= new_state.pending_pri) {
+                       new_state.pending_pri = new_state.mfrr;
+                       new_state.xisr = XICS_IPI;
+               }
+
+               /* Latch/clear resend bit */
+               resend = new_state.need_resend;
+               new_state.need_resend = 0;
+
+       } while (!icp_rm_try_update(icp, old_state, new_state));
+
+       /*
+        * Now handle resend checks. Those are asynchronous to the ICP
+        * state update in HW (ie bus transactions) so we can handle them
+        * separately here as well.
+        */
+       if (resend)
+               icp->rm_action |= XICS_RM_CHECK_RESEND;
+}
+
+
+unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
+{
+       union kvmppc_icp_state old_state, new_state;
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+       u32 xirr;
+
+       if (!xics || !xics->real_mode)
+               return H_TOO_HARD;
+
+       /* First clear the interrupt */
+       icp_rm_clr_vcpu_irq(icp->vcpu);
+
+       /*
+        * ICP State: Accept_Interrupt
+        *
+        * Return the pending interrupt (if any) along with the
+        * current CPPR, then clear the XISR & set CPPR to the
+        * pending priority
+        */
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+               if (!old_state.xisr)
+                       break;
+               new_state.cppr = new_state.pending_pri;
+               new_state.pending_pri = 0xff;
+               new_state.xisr = 0;
+
+       } while (!icp_rm_try_update(icp, old_state, new_state));
+
+       /* Return the result in GPR4 */
+       vcpu->arch.gpr[4] = xirr;
+
+       return check_too_hard(xics, icp);
+}
+
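
The value returned for H_XIRR packs the pending source number into the low 24 bits and the pre-accept CPPR into the top byte; the guest later hands the same word back to H_EOI, and kvmppc_rm_h_eoi() below uses xirr >> 24 for Down_CPPR and xirr & 0x00ffffff to locate the ICS source. A tiny illustration of the packing with made-up values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t xisr = 0x001005;        /* hypothetical source number */
            uint8_t  cppr = 0xff;            /* priority before the accept */
            uint32_t xirr = xisr | ((uint32_t)cppr << 24);

            printf("xirr=%#x  source=%#x  old cppr=%#x\n",
                   xirr, xirr & 0x00ffffff, xirr >> 24);
            return 0;
    }
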
+int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+                   unsigned long mfrr)
+{
+       union kvmppc_icp_state old_state, new_state;
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       struct kvmppc_icp *icp, *this_icp = vcpu->arch.icp;
+       u32 reject;
+       bool resend;
+       bool local;
+
+       if (!xics || !xics->real_mode)
+               return H_TOO_HARD;
+
+       local = this_icp->server_num == server;
+       if (local)
+               icp = this_icp;
+       else
+               icp = kvmppc_xics_find_server(vcpu->kvm, server);
+       if (!icp)
+               return H_PARAMETER;
+
+       /*
+        * ICP state: Set_MFRR
+        *
+        * If the CPPR is more favored than the new MFRR, then
+        * nothing needs to be done as there can be no XISR to
+        * reject.
+        *
+        * If the CPPR is less favored, then we might be replacing
+        * an interrupt, and thus need to possibly reject it as in
+        *
+        * ICP state: Check_IPI
+        */
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               /* Set_MFRR */
+               new_state.mfrr = mfrr;
+
+               /* Check_IPI */
+               reject = 0;
+               resend = false;
+               if (mfrr < new_state.cppr) {
+                       /* Reject a pending interrupt if not an IPI */
+                       if (mfrr <= new_state.pending_pri)
+                               reject = new_state.xisr;
+                       new_state.pending_pri = mfrr;
+                       new_state.xisr = XICS_IPI;
+               }
+
+               if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
+                       resend = new_state.need_resend;
+                       new_state.need_resend = 0;
+               }
+       } while (!icp_rm_try_update(icp, old_state, new_state));
+
+       /* Pass rejects to virtual mode */
+       if (reject && reject != XICS_IPI) {
+               this_icp->rm_action |= XICS_RM_REJECT;
+               this_icp->rm_reject = reject;
+       }
+
+       /* Pass resends to virtual mode */
+       if (resend)
+               this_icp->rm_action |= XICS_RM_CHECK_RESEND;
+
+       return check_too_hard(xics, this_icp);
+}
+
+int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+       union kvmppc_icp_state old_state, new_state;
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+       u32 reject;
+
+       if (!xics || !xics->real_mode)
+               return H_TOO_HARD;
+
+       /*
+        * ICP State: Set_CPPR
+        *
+        * We can safely compare the new value with the current
+        * value outside of the transaction as the CPPR is only
+        * ever changed by the processor on itself
+        */
+       if (cppr > icp->state.cppr) {
+               icp_rm_down_cppr(xics, icp, cppr);
+               goto bail;
+       } else if (cppr == icp->state.cppr)
+               return H_SUCCESS;
+
+       /*
+        * ICP State: Up_CPPR
+        *
+        * The processor is raising its priority, this can result
+        * in a rejection of a pending interrupt:
+        *
+        * ICP State: Reject_Current
+        *
+        * We can remove EE from the current processor, the update
+        * transaction will set it again if needed
+        */
+       icp_rm_clr_vcpu_irq(icp->vcpu);
+
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               reject = 0;
+               new_state.cppr = cppr;
+
+               if (cppr <= new_state.pending_pri) {
+                       reject = new_state.xisr;
+                       new_state.xisr = 0;
+                       new_state.pending_pri = 0xff;
+               }
+
+       } while (!icp_rm_try_update(icp, old_state, new_state));
+
+       /* Pass rejects to virtual mode */
+       if (reject && reject != XICS_IPI) {
+               icp->rm_action |= XICS_RM_REJECT;
+               icp->rm_reject = reject;
+       }
+ bail:
+       return check_too_hard(xics, icp);
+}
+
+int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+       struct kvmppc_ics *ics;
+       struct ics_irq_state *state;
+       u32 irq = xirr & 0x00ffffff;
+       u16 src;
+
+       if (!xics || !xics->real_mode)
+               return H_TOO_HARD;
+
+       /*
+        * ICP State: EOI
+        *
+        * Note: If EOI is incorrectly used by SW to lower the CPPR
+        * value (ie more favored), we do not check for rejection of
+        * a pending interrupt, this is a SW error and PAPR specifies
+        * that we don't have to deal with it.
+        *
+        * The sending of an EOI to the ICS is handled after the
+        * CPPR update
+        *
+        * ICP State: Down_CPPR which we handle
+        * in a separate function as it's shared with H_CPPR.
+        */
+       icp_rm_down_cppr(xics, icp, xirr >> 24);
+
+       /* IPIs have no EOI */
+       if (irq == XICS_IPI)
+               goto bail;
+       /*
+        * EOI handling: If the interrupt is still asserted, we need to
+        * resend it. We can take a lockless "peek" at the ICS state here.
+        *
+        * "Message" interrupts will never have "asserted" set
+        */
+       ics = kvmppc_xics_find_ics(xics, irq, &src);
+       if (!ics)
+               goto bail;
+       state = &ics->irq_state[src];
+
+       /* Still asserted, resend it, we make it look like a reject */
+       if (state->asserted) {
+               icp->rm_action |= XICS_RM_REJECT;
+               icp->rm_reject = irq;
+       }
+ bail:
+       return check_too_hard(xics, icp);
+}
index e33d11f1b977c2ea46e3494500c17255cca9daf5..b02f91e4c70dc5341bc450fa68f22b102ccd462e 100644 (file)
@@ -79,10 +79,6 @@ _GLOBAL(kvmppc_hv_entry_trampoline)
  *                                                                            *
  *****************************************************************************/
 
-#define XICS_XIRR              4
-#define XICS_QIRR              0xc
-#define XICS_IPI               2       /* interrupt source # for IPIs */
-
 /*
  * We come in here when wakened from nap mode on a secondary hw thread.
  * Relocation is off and most register values are lost.
@@ -101,50 +97,51 @@ kvm_start_guest:
        li      r0,1
        stb     r0,PACA_NAPSTATELOST(r13)
 
-       /* get vcpu pointer, NULL if we have no vcpu to run */
-       ld      r4,HSTATE_KVM_VCPU(r13)
-       cmpdi   cr1,r4,0
+       /* were we napping due to cede? */
+       lbz     r0,HSTATE_NAPPING(r13)
+       cmpwi   r0,0
+       bne     kvm_end_cede
+
+       /*
+        * We weren't napping due to cede, so this must be a secondary
+        * thread being woken up to run a guest, or being woken up due
+        * to a stray IPI.  (Or due to some machine check or hypervisor
+        * maintenance interrupt while the core is in KVM.)
+        */
 
        /* Check the wake reason in SRR1 to see why we got here */
        mfspr   r3,SPRN_SRR1
        rlwinm  r3,r3,44-31,0x7         /* extract wake reason field */
        cmpwi   r3,4                    /* was it an external interrupt? */
-       bne     27f
-
-       /*
-        * External interrupt - for now assume it is an IPI, since we
-        * should never get any other interrupts sent to offline threads.
-        * Only do this for secondary threads.
-        */
-       beq     cr1,25f
-       lwz     r3,VCPU_PTID(r4)
-       cmpwi   r3,0
-       beq     27f
-25:    ld      r5,HSTATE_XICS_PHYS(r13)
-       li      r0,0xff
-       li      r6,XICS_QIRR
-       li      r7,XICS_XIRR
+       bne     27f                     /* if not */
+       ld      r5,HSTATE_XICS_PHYS(r13)
+       li      r7,XICS_XIRR            /* if it was an external interrupt, */
        lwzcix  r8,r5,r7                /* get and ack the interrupt */
        sync
        clrldi. r9,r8,40                /* get interrupt source ID. */
-       beq     27f                     /* none there? */
-       cmpwi   r9,XICS_IPI
-       bne     26f
+       beq     28f                     /* none there? */
+       cmpwi   r9,XICS_IPI             /* was it an IPI? */
+       bne     29f
+       li      r0,0xff
+       li      r6,XICS_MFRR
        stbcix  r0,r5,r6                /* clear IPI */
-26:    stwcix  r8,r5,r7                /* EOI the interrupt */
-
-27:    /* XXX should handle hypervisor maintenance interrupts etc. here */
+       stwcix  r8,r5,r7                /* EOI the interrupt */
+       sync                            /* order loading of vcpu after that */
 
-       /* reload vcpu pointer after clearing the IPI */
+       /* get vcpu pointer, NULL if we have no vcpu to run */
        ld      r4,HSTATE_KVM_VCPU(r13)
        cmpdi   r4,0
        /* if we have no vcpu to run, go back to sleep */
        beq     kvm_no_guest
+       b       kvmppc_hv_entry
 
-       /* were we napping due to cede? */
-       lbz     r0,HSTATE_NAPPING(r13)
-       cmpwi   r0,0
-       bne     kvm_end_cede
+27:    /* XXX should handle hypervisor maintenance interrupts etc. here */
+       b       kvm_no_guest
+28:    /* SRR1 said external but ICP said nope?? */
+       b       kvm_no_guest
+29:    /* External non-IPI interrupt to offline secondary thread? help?? */
+       stw     r8,HSTATE_SAVED_XIRR(r13)
+       b       kvm_no_guest
 
 .global kvmppc_hv_entry
 kvmppc_hv_entry:
@@ -260,6 +257,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
        lwz     r5, LPPACA_YIELDCOUNT(r3)
        addi    r5, r5, 1
        stw     r5, LPPACA_YIELDCOUNT(r3)
+       li      r6, 1
+       stb     r6, VCPU_VPA_DIRTY(r4)
 25:
        /* Load up DAR and DSISR */
        ld      r5, VCPU_DAR(r4)
@@ -485,20 +484,20 @@ toc_tlbie_lock:
        mtctr   r6
        mtxer   r7
 
+       ld      r10, VCPU_PC(r4)
+       ld      r11, VCPU_MSR(r4)
 kvmppc_cede_reentry:           /* r4 = vcpu, r13 = paca */
        ld      r6, VCPU_SRR0(r4)
        ld      r7, VCPU_SRR1(r4)
-       ld      r10, VCPU_PC(r4)
-       ld      r11, VCPU_MSR(r4)       /* r11 = vcpu->arch.msr & ~MSR_HV */
 
+       /* r11 = vcpu->arch.msr & ~MSR_HV */
        rldicl  r11, r11, 63 - MSR_HV_LG, 1
        rotldi  r11, r11, 1 + MSR_HV_LG
        ori     r11, r11, MSR_ME
 
        /* Check if we can deliver an external or decrementer interrupt now */
        ld      r0,VCPU_PENDING_EXC(r4)
-       li      r8,(1 << BOOK3S_IRQPRIO_EXTERNAL)
-       oris    r8,r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
+       lis     r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
        and     r0,r0,r8
        cmpdi   cr1,r0,0
        andi.   r0,r11,MSR_EE
@@ -526,10 +525,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
        /* Move SRR0 and SRR1 into the respective regs */
 5:     mtspr   SPRN_SRR0, r6
        mtspr   SPRN_SRR1, r7
-       li      r0,0
-       stb     r0,VCPU_CEDED(r4)       /* cancel cede */
 
 fast_guest_return:
+       li      r0,0
+       stb     r0,VCPU_CEDED(r4)       /* cancel cede */
        mtspr   SPRN_HSRR0,r10
        mtspr   SPRN_HSRR1,r11
 
@@ -676,17 +675,99 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
        cmpwi   r12,BOOK3S_INTERRUPT_SYSCALL
        beq     hcall_try_real_mode
 
-       /* Check for mediated interrupts (could be done earlier really ...) */
+       /* Only handle external interrupts here on arch 206 and later */
 BEGIN_FTR_SECTION
-       cmpwi   r12,BOOK3S_INTERRUPT_EXTERNAL
-       bne+    1f
-       andi.   r0,r11,MSR_EE
-       beq     1f
-       mfspr   r5,SPRN_LPCR
-       andi.   r0,r5,LPCR_MER
-       bne     bounce_ext_interrupt
-1:
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+       b       ext_interrupt_to_host
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
+
+       /* External interrupt ? */
+       cmpwi   r12, BOOK3S_INTERRUPT_EXTERNAL
+       bne+    ext_interrupt_to_host
+
+       /* External interrupt, first check for host_ipi. If this is
+        * set, we know the host wants us out so let's do it now
+        */
+do_ext_interrupt:
+       lbz     r0, HSTATE_HOST_IPI(r13)
+       cmpwi   r0, 0
+       bne     ext_interrupt_to_host
+
+       /* Now read the interrupt from the ICP */
+       ld      r5, HSTATE_XICS_PHYS(r13)
+       li      r7, XICS_XIRR
+       cmpdi   r5, 0
+       beq-    ext_interrupt_to_host
+       lwzcix  r3, r5, r7
+       rlwinm. r0, r3, 0, 0xffffff
+       sync
+       beq     3f              /* if nothing pending in the ICP */
+
+       /* We found something in the ICP...
+        *
+        * If it's not an IPI, stash it in the PACA and return to
+        * the host, we don't (yet) handle directing real external
+        * interrupts directly to the guest
+        */
+       cmpwi   r0, XICS_IPI
+       bne     ext_stash_for_host
+
+       /* It's an IPI, clear the MFRR and EOI it */
+       li      r0, 0xff
+       li      r6, XICS_MFRR
+       stbcix  r0, r5, r6              /* clear the IPI */
+       stwcix  r3, r5, r7              /* EOI it */
+       sync
+
+       /* We need to re-check host IPI now in case it got set in the
+        * meantime. If it's clear, we bounce the interrupt to the
+        * guest
+        */
+       lbz     r0, HSTATE_HOST_IPI(r13)
+       cmpwi   r0, 0
+       bne-    1f
+
+       /* All right, looks like an IPI for the guest, we need to set MER */
+3:
+       /* Check if any CPU is heading out to the host, if so head out too */
+       ld      r5, HSTATE_KVM_VCORE(r13)
+       lwz     r0, VCORE_ENTRY_EXIT(r5)
+       cmpwi   r0, 0x100
+       bge     ext_interrupt_to_host
+
+       /* See if there is a pending interrupt for the guest */
+       mfspr   r8, SPRN_LPCR
+       ld      r0, VCPU_PENDING_EXC(r9)
+       /* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */
+       rldicl. r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63
+       rldimi  r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH
+       beq     2f
+
+       /* And if the guest EE is set, we can deliver immediately, else
+        * we return to the guest with MER set
+        */
+       andi.   r0, r11, MSR_EE
+       beq     2f
+       mtspr   SPRN_SRR0, r10
+       mtspr   SPRN_SRR1, r11
+       li      r10, BOOK3S_INTERRUPT_EXTERNAL
+       li      r11, (MSR_ME << 1) | 1  /* synthesize MSR_SF | MSR_ME */
+       rotldi  r11, r11, 63
+2:     mr      r4, r9
+       mtspr   SPRN_LPCR, r8
+       b       fast_guest_return
+
+       /* We raced with the host, we need to resend that IPI, bummer */
+1:     li      r0, IPI_PRIORITY
+       stbcix  r0, r5, r6              /* set the IPI */
+       sync
+       b       ext_interrupt_to_host
+
+ext_stash_for_host:
+       /* It's not an IPI and it's for the host, stash it in the PACA
+        * before exit, it will be picked up by the host ICP driver
+        */
+       stw     r3, HSTATE_SAVED_XIRR(r13)
+ext_interrupt_to_host:
 
 guest_exit_cont:               /* r9 = vcpu, r12 = trap, r13 = paca */
        /* Save DEC */
@@ -829,7 +910,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
        beq     44f
        ld      r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */
        li      r0,IPI_PRIORITY
-       li      r7,XICS_QIRR
+       li      r7,XICS_MFRR
        stbcix  r0,r7,r8                /* trigger the IPI */
 44:    srdi.   r3,r3,1
        addi    r6,r6,PACA_SIZE
@@ -1018,6 +1099,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
        lwz     r3, LPPACA_YIELDCOUNT(r8)
        addi    r3, r3, 1
        stw     r3, LPPACA_YIELDCOUNT(r8)
+       li      r3, 1
+       stb     r3, VCPU_VPA_DIRTY(r9)
 25:
        /* Save PMU registers if requested */
        /* r8 and cr0.eq are live here */
@@ -1350,11 +1433,19 @@ hcall_real_table:
        .long   0               /* 0x58 */
        .long   0               /* 0x5c */
        .long   0               /* 0x60 */
-       .long   0               /* 0x64 */
-       .long   0               /* 0x68 */
-       .long   0               /* 0x6c */
-       .long   0               /* 0x70 */
-       .long   0               /* 0x74 */
+#ifdef CONFIG_KVM_XICS
+       .long   .kvmppc_rm_h_eoi - hcall_real_table
+       .long   .kvmppc_rm_h_cppr - hcall_real_table
+       .long   .kvmppc_rm_h_ipi - hcall_real_table
+       .long   0               /* 0x70 - H_IPOLL */
+       .long   .kvmppc_rm_h_xirr - hcall_real_table
+#else
+       .long   0               /* 0x64 - H_EOI */
+       .long   0               /* 0x68 - H_CPPR */
+       .long   0               /* 0x6c - H_IPI */
+       .long   0               /* 0x70 - H_IPOLL */
+       .long   0               /* 0x74 - H_XIRR */
+#endif
        .long   0               /* 0x78 */
        .long   0               /* 0x7c */
        .long   0               /* 0x80 */
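
Each entry in hcall_real_table is the offset of a real-mode handler from the start of the table, with one slot per multiple of 4 of the hcall number; a zero slot, like 0x70 (H_IPOLL) above, means the call is not handled in real mode and falls back to virtual mode. A runnable, purely illustrative C rendering of that dispatch; it uses function pointers where the assembly stores 32-bit offsets from the table base:

    #include <stdio.h>

    #define H_SUCCESS       0
    #define H_TOO_HARD      9999     /* "redo this hcall in virtual mode" */
    #define H_IPI           0x6c
    #define H_IPOLL         0x70
    #define H_XIRR          0x74
    #define MAX_REAL_HCALL  0x74     /* hypothetical last slot in this sketch */

    typedef long (*hcall_fn)(void *vcpu);

    static long rm_h_ipi(void *vcpu)  { (void)vcpu; return H_SUCCESS; }
    static long rm_h_xirr(void *vcpu) { (void)vcpu; return H_SUCCESS; }

    /* one slot per multiple of 4; NULL means "not handled in real mode" */
    static hcall_fn hcall_real_table[MAX_REAL_HCALL / 4 + 1] = {
            [H_IPI / 4]  = rm_h_ipi,
            [H_XIRR / 4] = rm_h_xirr,
            /* [H_IPOLL / 4] left NULL, like the 0x70 slot above */
    };

    static long try_real_mode_hcall(void *vcpu, unsigned long req)
    {
            hcall_fn fn;

            if (req > MAX_REAL_HCALL || (req & 3))
                    return H_TOO_HARD;
            fn = hcall_real_table[req / 4];
            return fn ? fn(vcpu) : H_TOO_HARD;
    }

    int main(void)
    {
            printf("H_XIRR -> %ld, H_IPOLL -> %ld\n",
                   try_real_mode_hcall(NULL, H_XIRR),
                   try_real_mode_hcall(NULL, H_IPOLL));
            return 0;
    }
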
@@ -1405,15 +1496,6 @@ ignore_hdec:
        mr      r4,r9
        b       fast_guest_return
 
-bounce_ext_interrupt:
-       mr      r4,r9
-       mtspr   SPRN_SRR0,r10
-       mtspr   SPRN_SRR1,r11
-       li      r10,BOOK3S_INTERRUPT_EXTERNAL
-       li      r11,(MSR_ME << 1) | 1   /* synthesize MSR_SF | MSR_ME */
-       rotldi  r11,r11,63
-       b       fast_guest_return
-
 _GLOBAL(kvmppc_h_set_dabr)
        std     r4,VCPU_DABR(r3)
        /* Work around P7 bug where DABR can get corrupted on mtspr */
@@ -1519,6 +1601,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
        b       .
 
 kvm_end_cede:
+       /* get vcpu pointer */
+       ld      r4, HSTATE_KVM_VCPU(r13)
+
        /* Woken by external or decrementer interrupt */
        ld      r1, HSTATE_HOST_R1(r13)
 
@@ -1558,6 +1643,16 @@ kvm_end_cede:
        li      r0,0
        stb     r0,HSTATE_NAPPING(r13)
 
+       /* Check the wake reason in SRR1 to see why we got here */
+       mfspr   r3, SPRN_SRR1
+       rlwinm  r3, r3, 44-31, 0x7      /* extract wake reason field */
+       cmpwi   r3, 4                   /* was it an external interrupt? */
+       li      r12, BOOK3S_INTERRUPT_EXTERNAL
+       mr      r9, r4
+       ld      r10, VCPU_PC(r9)
+       ld      r11, VCPU_MSR(r9)
+       beq     do_ext_interrupt        /* if so */
+
        /* see if any other thread is already exiting */
        lwz     r0,VCORE_ENTRY_EXIT(r5)
        cmpwi   r0,0x100
@@ -1577,8 +1672,7 @@ kvm_cede_prodded:
 
        /* we've ceded but we want to give control to the host */
 kvm_cede_exit:
-       li      r3,H_TOO_HARD
-       blr
+       b       hcall_real_fallback
 
        /* Try to handle a machine check in real mode */
 machine_check_realmode:
@@ -1626,7 +1720,7 @@ secondary_nap:
        beq     37f
        sync
        li      r0, 0xff
-       li      r6, XICS_QIRR
+       li      r6, XICS_MFRR
        stbcix  r0, r5, r6              /* clear the IPI */
        stwcix  r3, r5, r7              /* EOI it */
 37:    sync
index dbdc15aa8127f2c13e29359236e7b0869ad899fc..bdc40b8e77d9ecc6c5da88e076a53e97ffe52277 100644 (file)
@@ -762,9 +762,7 @@ program_interrupt:
                        run->exit_reason = KVM_EXIT_MMIO;
                        r = RESUME_HOST_NV;
                        break;
-               case EMULATE_DO_PAPR:
-                       run->exit_reason = KVM_EXIT_PAPR_HCALL;
-                       vcpu->arch.hcall_needed = 1;
+               case EMULATE_EXIT_USER:
                        r = RESUME_HOST_NV;
                        break;
                default:
@@ -1283,7 +1281,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
                                struct kvm_userspace_memory_region *mem,
-                               struct kvm_memory_slot old)
+                               const struct kvm_memory_slot *old)
 {
 }
 
@@ -1298,6 +1296,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 {
 #ifdef CONFIG_PPC64
        INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+       INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
 #endif
 
        if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
index ee02b30878ed4bec733af6cbd9aa152eefe22d0f..b24309c6c2d507d3f2c008358c3c0728d450a7cd 100644 (file)
@@ -227,6 +227,13 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
        return EMULATE_DONE;
 }
 
+static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
+{
+       long rc = kvmppc_xics_hcall(vcpu, cmd);
+       kvmppc_set_gpr(vcpu, 3, rc);
+       return EMULATE_DONE;
+}
+
 int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 {
        switch (cmd) {
@@ -246,6 +253,20 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
                clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
                vcpu->stat.halt_wakeup++;
                return EMULATE_DONE;
+       case H_XIRR:
+       case H_CPPR:
+       case H_EOI:
+       case H_IPI:
+               if (kvmppc_xics_enabled(vcpu))
+                       return kvmppc_h_pr_xics_hcall(vcpu, cmd);
+               break;
+       case H_RTAS:
+               if (list_empty(&vcpu->kvm->arch.rtas_tokens))
+                       return RESUME_HOST;
+               if (kvmppc_rtas_hcall(vcpu))
+                       break;
+               kvmppc_set_gpr(vcpu, 3, 0);
+               return EMULATE_DONE;
        }
 
        return EMULATE_FAIL;
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
new file mode 100644 (file)
index 0000000..3219ba8
--- /dev/null
@@ -0,0 +1,274 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/err.h>
+
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/rtas.h>
+
+#ifdef CONFIG_KVM_XICS
+static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+       u32 irq, server, priority;
+       int rc;
+
+       if (args->nargs != 3 || args->nret != 1) {
+               rc = -3;
+               goto out;
+       }
+
+       irq = args->args[0];
+       server = args->args[1];
+       priority = args->args[2];
+
+       rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
+       if (rc)
+               rc = -3;
+out:
+       args->rets[0] = rc;
+}
+
+static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+       u32 irq, server, priority;
+       int rc;
+
+       if (args->nargs != 1 || args->nret != 3) {
+               rc = -3;
+               goto out;
+       }
+
+       irq = args->args[0];
+
+       server = priority = 0;
+       rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
+       if (rc) {
+               rc = -3;
+               goto out;
+       }
+
+       args->rets[1] = server;
+       args->rets[2] = priority;
+out:
+       args->rets[0] = rc;
+}
+
+static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+       u32 irq;
+       int rc;
+
+       if (args->nargs != 1 || args->nret != 1) {
+               rc = -3;
+               goto out;
+       }
+
+       irq = args->args[0];
+
+       rc = kvmppc_xics_int_off(vcpu->kvm, irq);
+       if (rc)
+               rc = -3;
+out:
+       args->rets[0] = rc;
+}
+
+static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args)
+{
+       u32 irq;
+       int rc;
+
+       if (args->nargs != 1 || args->nret != 1) {
+               rc = -3;
+               goto out;
+       }
+
+       irq = args->args[0];
+
+       rc = kvmppc_xics_int_on(vcpu->kvm, irq);
+       if (rc)
+               rc = -3;
+out:
+       args->rets[0] = rc;
+}
+#endif /* CONFIG_KVM_XICS */
+
+struct rtas_handler {
+       void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args);
+       char *name;
+};
+
+static struct rtas_handler rtas_handlers[] = {
+#ifdef CONFIG_KVM_XICS
+       { .name = "ibm,set-xive", .handler = kvm_rtas_set_xive },
+       { .name = "ibm,get-xive", .handler = kvm_rtas_get_xive },
+       { .name = "ibm,int-off",  .handler = kvm_rtas_int_off },
+       { .name = "ibm,int-on",   .handler = kvm_rtas_int_on },
+#endif
+};
+
+struct rtas_token_definition {
+       struct list_head list;
+       struct rtas_handler *handler;
+       u64 token;
+};
+
+static int rtas_name_matches(char *s1, char *s2)
+{
+       struct kvm_rtas_token_args args;
+       return !strncmp(s1, s2, sizeof(args.name));
+}
+
+static int rtas_token_undefine(struct kvm *kvm, char *name)
+{
+       struct rtas_token_definition *d, *tmp;
+
+       lockdep_assert_held(&kvm->lock);
+
+       list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
+               if (rtas_name_matches(d->handler->name, name)) {
+                       list_del(&d->list);
+                       kfree(d);
+                       return 0;
+               }
+       }
+
+       /* It's not an error to undefine an undefined token */
+       return 0;
+}
+
+static int rtas_token_define(struct kvm *kvm, char *name, u64 token)
+{
+       struct rtas_token_definition *d;
+       struct rtas_handler *h = NULL;
+       bool found;
+       int i;
+
+       lockdep_assert_held(&kvm->lock);
+
+       list_for_each_entry(d, &kvm->arch.rtas_tokens, list) {
+               if (d->token == token)
+                       return -EEXIST;
+       }
+
+       found = false;
+       for (i = 0; i < ARRAY_SIZE(rtas_handlers); i++) {
+               h = &rtas_handlers[i];
+               if (rtas_name_matches(h->name, name)) {
+                       found = true;
+                       break;
+               }
+       }
+
+       if (!found)
+               return -ENOENT;
+
+       d = kzalloc(sizeof(*d), GFP_KERNEL);
+       if (!d)
+               return -ENOMEM;
+
+       d->handler = h;
+       d->token = token;
+
+       list_add_tail(&d->list, &kvm->arch.rtas_tokens);
+
+       return 0;
+}
+
+int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp)
+{
+       struct kvm_rtas_token_args args;
+       int rc;
+
+       if (copy_from_user(&args, argp, sizeof(args)))
+               return -EFAULT;
+
+       mutex_lock(&kvm->lock);
+
+       if (args.token)
+               rc = rtas_token_define(kvm, args.name, args.token);
+       else
+               rc = rtas_token_undefine(kvm, args.name);
+
+       mutex_unlock(&kvm->lock);
+
+       return rc;
+}
+
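
kvm_vm_ioctl_rtas_define_token() is driven from userspace: the VMM binds each RTAS service name it wants handled in-kernel to the token it advertises to the guest in the device tree, and a token of 0 removes the binding. A hedged userspace sketch, assuming the KVM_PPC_RTAS_DEFINE_TOKEN vm ioctl and struct kvm_rtas_token_args that accompany this code; vm_fd and the token value are made up:

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Assumes ppc64 kernel headers that provide KVM_PPC_RTAS_DEFINE_TOKEN. */
    static int define_rtas_token(int vm_fd, const char *name, unsigned long token)
    {
            struct kvm_rtas_token_args args;

            memset(&args, 0, sizeof(args));
            strncpy(args.name, name, sizeof(args.name) - 1);
            args.token = token;              /* 0 would undefine the binding */

            return ioctl(vm_fd, KVM_PPC_RTAS_DEFINE_TOKEN, &args);
    }

    /* e.g. define_rtas_token(vm_fd, "ibm,set-xive", 0x2000); */
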
+int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
+{
+       struct rtas_token_definition *d;
+       struct rtas_args args;
+       rtas_arg_t *orig_rets;
+       gpa_t args_phys;
+       int rc;
+
+       /* r4 contains the guest physical address of the RTAS args */
+       args_phys = kvmppc_get_gpr(vcpu, 4);
+
+       rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args));
+       if (rc)
+               goto fail;
+
+       /*
+        * args->rets is a pointer into args->args. Now that we've
+        * copied args we need to fix it up to point into our copy,
+        * not the guest args. We also need to save the original
+        * value so we can restore it on the way out.
+        */
+       orig_rets = args.rets;
+       args.rets = &args.args[args.nargs];
+
+       mutex_lock(&vcpu->kvm->lock);
+
+       rc = -ENOENT;
+       list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) {
+               if (d->token == args.token) {
+                       d->handler->handler(vcpu, &args);
+                       rc = 0;
+                       break;
+               }
+       }
+
+       mutex_unlock(&vcpu->kvm->lock);
+
+       if (rc == 0) {
+               args.rets = orig_rets;
+               rc = kvm_write_guest(vcpu->kvm, args_phys, &args, sizeof(args));
+               if (rc)
+                       goto fail;
+       }
+
+       return rc;
+
+fail:
+       /*
+        * We only get here if the guest has called RTAS with a bogus
+        * args pointer. That means we can't get to the args, and so we
+        * can't fail the RTAS call. So fail right out to userspace,
+        * which should kill the guest.
+        */
+       return rc;
+}
+
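
The rets fixup in kvmppc_rtas_hcall() matters because the guest's rtas_args block keeps return values in the same args[] array, immediately after the inputs; once the block has been copied in, rets must point into the local copy rather than at guest memory. A small worked illustration for a hypothetical 3-argument, 1-return call such as ibm,set-xive (the struct below is a simplified stand-in, not the real rtas_args layout):

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t rtas_arg_t;

    struct rtas_args_sketch {                /* simplified stand-in for rtas_args */
            uint32_t token;
            uint32_t nargs;
            uint32_t nret;
            rtas_arg_t args[16];
            rtas_arg_t *rets;                /* points into args[], after the inputs */
    };

    int main(void)
    {
            struct rtas_args_sketch a = {
                    .token = 42, .nargs = 3, .nret = 1,
                    .args  = { 0x1005, 7, 5 },       /* irq, server, priority */
            };

            /* what kvmppc_rtas_hcall() does after copying the guest's block */
            a.rets = &a.args[a.nargs];               /* rets[0] aliases args[3] */
            a.rets[0] = 0;                           /* a handler writes its status here */

            printf("status slot is args[%u] = %u\n", a.nargs, a.args[3]);
            return 0;
    }
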
+void kvmppc_rtas_tokens_free(struct kvm *kvm)
+{
+       struct rtas_token_definition *d, *tmp;
+
+       lockdep_assert_held(&kvm->lock);
+
+       list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
+               list_del(&d->list);
+               kfree(d);
+       }
+}
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
new file mode 100644 (file)
index 0000000..f7a1037
--- /dev/null
@@ -0,0 +1,1270 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/anon_inodes.h>
+
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xics.h>
+#include <asm/debug.h>
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "book3s_xics.h"
+
+#if 1
+#define XICS_DBG(fmt...) do { } while (0)
+#else
+#define XICS_DBG(fmt...) trace_printk(fmt)
+#endif
+
+#define ENABLE_REALMODE        true
+#define DEBUG_REALMODE false
+
+/*
+ * LOCKING
+ * =======
+ *
+ * Each ICS has a mutex protecting the information about the IRQ
+ * sources and avoiding simultaneous deliveries of the same interrupt.
+ *
+ * ICP operations are done via a single compare & swap transaction
+ * (most ICP state fits in the union kvmppc_icp_state)
+ */
+
+/*
+ * TODO
+ * ====
+ *
+ * - To speed up resends, keep a bitmap of "resend" set bits in the
+ *   ICS
+ *
+ * - Speed up server# -> ICP lookup (array ? hash table ?)
+ *
+ * - Make ICS lockless as well, or at least a per-interrupt lock or hashed
+ *   locks array to improve scalability
+ */
+
+/* -- ICS routines -- */
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+                           u32 new_irq);
+
+static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level,
+                          bool report_status)
+{
+       struct ics_irq_state *state;
+       struct kvmppc_ics *ics;
+       u16 src;
+
+       XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
+
+       ics = kvmppc_xics_find_ics(xics, irq, &src);
+       if (!ics) {
+               XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq);
+               return -EINVAL;
+       }
+       state = &ics->irq_state[src];
+       if (!state->exists)
+               return -EINVAL;
+
+       if (report_status)
+               return state->asserted;
+
+       /*
+        * We set state->asserted locklessly. This should be fine as
+        * we are the only setter, thus concurrent access is undefined
+        * to begin with.
+        */
+       if (level == KVM_INTERRUPT_SET_LEVEL)
+               state->asserted = 1;
+       else if (level == KVM_INTERRUPT_UNSET) {
+               state->asserted = 0;
+               return 0;
+       }
+
+       /* Attempt delivery */
+       icp_deliver_irq(xics, NULL, irq);
+
+       return state->asserted;
+}
+
+static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+                            struct kvmppc_icp *icp)
+{
+       int i;
+
+       mutex_lock(&ics->lock);
+
+       for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+               struct ics_irq_state *state = &ics->irq_state[i];
+
+               if (!state->resend)
+                       continue;
+
+               XICS_DBG("resend %#x prio %#x\n", state->number,
+                             state->priority);
+
+               mutex_unlock(&ics->lock);
+               icp_deliver_irq(xics, icp, state->number);
+               mutex_lock(&ics->lock);
+       }
+
+       mutex_unlock(&ics->lock);
+}
+
+static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
+                      struct ics_irq_state *state,
+                      u32 server, u32 priority, u32 saved_priority)
+{
+       bool deliver;
+
+       mutex_lock(&ics->lock);
+
+       state->server = server;
+       state->priority = priority;
+       state->saved_priority = saved_priority;
+       deliver = false;
+       if ((state->masked_pending || state->resend) && priority != MASKED) {
+               state->masked_pending = 0;
+               deliver = true;
+       }
+
+       mutex_unlock(&ics->lock);
+
+       return deliver;
+}
+
+int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+       struct kvmppc_icp *icp;
+       struct kvmppc_ics *ics;
+       struct ics_irq_state *state;
+       u16 src;
+
+       if (!xics)
+               return -ENODEV;
+
+       ics = kvmppc_xics_find_ics(xics, irq, &src);
+       if (!ics)
+               return -EINVAL;
+       state = &ics->irq_state[src];
+
+       icp = kvmppc_xics_find_server(kvm, server);
+       if (!icp)
+               return -EINVAL;
+
+       XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n",
+                irq, server, priority,
+                state->masked_pending, state->resend);
+
+       if (write_xive(xics, ics, state, server, priority, priority))
+               icp_deliver_irq(xics, icp, irq);
+
+       return 0;
+}
+
+int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+       struct kvmppc_ics *ics;
+       struct ics_irq_state *state;
+       u16 src;
+
+       if (!xics)
+               return -ENODEV;
+
+       ics = kvmppc_xics_find_ics(xics, irq, &src);
+       if (!ics)
+               return -EINVAL;
+       state = &ics->irq_state[src];
+
+       mutex_lock(&ics->lock);
+       *server = state->server;
+       *priority = state->priority;
+       mutex_unlock(&ics->lock);
+
+       return 0;
+}
+
+int kvmppc_xics_int_on(struct kvm *kvm, u32 irq)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+       struct kvmppc_icp *icp;
+       struct kvmppc_ics *ics;
+       struct ics_irq_state *state;
+       u16 src;
+
+       if (!xics)
+               return -ENODEV;
+
+       ics = kvmppc_xics_find_ics(xics, irq, &src);
+       if (!ics)
+               return -EINVAL;
+       state = &ics->irq_state[src];
+
+       icp = kvmppc_xics_find_server(kvm, state->server);
+       if (!icp)
+               return -EINVAL;
+
+       if (write_xive(xics, ics, state, state->server, state->saved_priority,
+                      state->saved_priority))
+               icp_deliver_irq(xics, icp, irq);
+
+       return 0;
+}
+
+int kvmppc_xics_int_off(struct kvm *kvm, u32 irq)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+       struct kvmppc_ics *ics;
+       struct ics_irq_state *state;
+       u16 src;
+
+       if (!xics)
+               return -ENODEV;
+
+       ics = kvmppc_xics_find_ics(xics, irq, &src);
+       if (!ics)
+               return -EINVAL;
+       state = &ics->irq_state[src];
+
+       write_xive(xics, ics, state, state->server, MASKED, state->priority);
+
+       return 0;
+}
+
+/* -- ICP routines, including hcalls -- */
+
+static inline bool icp_try_update(struct kvmppc_icp *icp,
+                                 union kvmppc_icp_state old,
+                                 union kvmppc_icp_state new,
+                                 bool change_self)
+{
+       bool success;
+
+       /* Calculate new output value */
+       new.out_ee = (new.xisr && (new.pending_pri < new.cppr));
+
+       /* Attempt atomic update */
+       success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw;
+       if (!success)
+               goto bail;
+
+       XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+                icp->server_num,
+                old.cppr, old.mfrr, old.pending_pri, old.xisr,
+                old.need_resend, old.out_ee);
+       XICS_DBG("UPD        - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n",
+                new.cppr, new.mfrr, new.pending_pri, new.xisr,
+                new.need_resend, new.out_ee);
+       /*
+        * Check for output state update
+        *
+        * Note that this is racy since another processor could be updating
+        * the state already. This is why we never clear the interrupt output
+        * here, we only ever set it. The clear only happens prior to doing
+        * an update and only by the processor itself. Currently we do it
+        * in Accept (H_XIRR) and Up_Cppr (H_XPPR).
+        *
+        * We also do not try to figure out whether the EE state has changed,
+        * we unconditionally set it if the new state calls for it. The reason
+        * for that is that we opportunistically remove the pending interrupt
+        * flag when raising CPPR, so we need to set it back here if an
+        * interrupt is still pending.
+        */
+       if (new.out_ee) {
+               kvmppc_book3s_queue_irqprio(icp->vcpu,
+                                           BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+               if (!change_self)
+                       kvmppc_fast_vcpu_kick(icp->vcpu);
+       }
+ bail:
+       return success;
+}
+
+static void icp_check_resend(struct kvmppc_xics *xics,
+                            struct kvmppc_icp *icp)
+{
+       u32 icsid;
+
+       /* Order this load with the test for need_resend in the caller */
+       smp_rmb();
+       for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+               struct kvmppc_ics *ics = xics->ics[icsid];
+
+               if (!test_and_clear_bit(icsid, icp->resend_map))
+                       continue;
+               if (!ics)
+                       continue;
+               ics_check_resend(xics, ics, icp);
+       }
+}
+
+static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+                              u32 *reject)
+{
+       union kvmppc_icp_state old_state, new_state;
+       bool success;
+
+       XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority,
+                icp->server_num);
+
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               *reject = 0;
+
+               /* See if we can deliver */
+               success = new_state.cppr > priority &&
+                       new_state.mfrr > priority &&
+                       new_state.pending_pri > priority;
+
+               /*
+                * If we can, check for a rejection and perform the
+                * delivery
+                */
+               if (success) {
+                       *reject = new_state.xisr;
+                       new_state.xisr = irq;
+                       new_state.pending_pri = priority;
+               } else {
+                       /*
+                        * If we failed to deliver we set need_resend
+                        * so a subsequent CPPR state change causes us
+                        * to try a new delivery.
+                        */
+                       new_state.need_resend = true;
+               }
+
+       } while (!icp_try_update(icp, old_state, new_state, false));
+
+       return success;
+}
+
+static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+                           u32 new_irq)
+{
+       struct ics_irq_state *state;
+       struct kvmppc_ics *ics;
+       u32 reject;
+       u16 src;
+
+       /*
+        * This is used both for initial delivery of an interrupt and
+        * for subsequent rejection.
+        *
+        * Rejection can be racy vs. resends. We have evaluated the
+        * rejection in an atomic ICP transaction which is now complete,
+        * so potentially the ICP can already accept the interrupt again.
+        *
+        * So we need to retry the delivery. Essentially the reject path
+        * boils down to a failed delivery. Always.
+        *
+        * Now the interrupt could also have moved to a different target,
+        * thus we may need to re-do the ICP lookup as well
+        */
+
+ again:
+       /* Get the ICS state and lock it */
+       ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+       if (!ics) {
+               XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq);
+               return;
+       }
+       state = &ics->irq_state[src];
+
+       /* Get a lock on the ICS */
+       mutex_lock(&ics->lock);
+
+       /* Get our server */
+       if (!icp || state->server != icp->server_num) {
+               icp = kvmppc_xics_find_server(xics->kvm, state->server);
+               if (!icp) {
+                       pr_warn("icp_deliver_irq: IRQ 0x%06x server 0x%x not found !\n",
+                               new_irq, state->server);
+                       goto out;
+               }
+       }
+
+       /* Clear the resend bit of that interrupt */
+       state->resend = 0;
+
+       /*
+        * If masked, bail out
+        *
+        * Note: PAPR doesn't mention anything about masked pending
+        * when doing a resend, only when doing a delivery.
+        *
+        * However that would have the effect of losing a masked
+        * interrupt that was rejected and isn't consistent with
+        * the whole masked_pending business which is about not
+        * losing interrupts that occur while masked.
+        *
+        * I don't differentiate between normal deliveries and resends; this
+        * implementation will differ from PAPR and not lose such
+        * interrupts.
+        */
+       if (state->priority == MASKED) {
+               XICS_DBG("irq %#x masked pending\n", new_irq);
+               state->masked_pending = 1;
+               goto out;
+       }
+
+       /*
+        * Try the delivery, this will set the need_resend flag
+        * in the ICP as part of the atomic transaction if the
+        * delivery is not possible.
+        *
+        * Note that if successful, the new delivery might have itself
+        * rejected an interrupt that was "delivered" before we took the
+        * icp mutex.
+        *
+        * In this case we do the whole sequence all over again for the
+        * new guy. We cannot assume that the rejected interrupt is less
+        * favored than the new one, and thus doesn't need to be delivered,
+        * because by the time we exit icp_try_to_deliver() the target
+        * processor may well have already consumed & completed it, and thus
+        * the rejected interrupt might actually be already acceptable.
+        */
+       if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+               /*
+                * Delivery was successful, did we reject somebody else ?
+                */
+               if (reject && reject != XICS_IPI) {
+                       mutex_unlock(&ics->lock);
+                       new_irq = reject;
+                       goto again;
+               }
+       } else {
+               /*
+                * We failed to deliver the interrupt we need to set the
+                * resend map bit and mark the ICS state as needing a resend
+                */
+               set_bit(ics->icsid, icp->resend_map);
+               state->resend = 1;
+
+               /*
+                * If the need_resend flag got cleared in the ICP some time
+                * between icp_try_to_deliver() atomic update and now, then
+                * we know it might have missed the resend_map bit. So we
+                * retry
+                */
+               smp_mb();
+               if (!icp->state.need_resend) {
+                       mutex_unlock(&ics->lock);
+                       goto again;
+               }
+       }
+ out:
+       mutex_unlock(&ics->lock);
+}
+
+static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+                         u8 new_cppr)
+{
+       union kvmppc_icp_state old_state, new_state;
+       bool resend;
+
+       /*
+        * This handles several related states in one operation:
+        *
+        * ICP State: Down_CPPR
+        *
+        * Load CPPR with new value and if the XISR is 0
+        * then check for resends:
+        *
+        * ICP State: Resend
+        *
+        * If MFRR is more favored than CPPR, check for IPIs
+        * and notify ICS of a potential resend. This is done
+        * asynchronously (when used in real mode, we will have
+        * to exit here).
+        *
+        * We do not handle the complete Check_IPI as documented
+        * here. In the PAPR, this state will be used for both
+        * Set_MFRR and Down_CPPR. However, we know that we aren't
+        * changing the MFRR state here so we don't need to handle
+        * the case of an MFRR causing a reject of a pending irq,
+        * this will have been handled when the MFRR was set in the
+        * first place.
+        *
+        * Thus we don't have to handle rejects, only resends.
+        *
+        * When implementing real mode for HV KVM, resend will lead to
+        * a H_TOO_HARD return and the whole transaction will be handled
+        * in virtual mode.
+        */
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               /* Down_CPPR */
+               new_state.cppr = new_cppr;
+
+               /*
+                * Cut down Resend / Check_IPI / IPI
+                *
+                * The logic is that we cannot have a pending interrupt
+                * trumped by an IPI at this point (see above), so we
+                * know that either the pending interrupt is already an
+                * IPI (in which case we don't care to override it) or
+                * it's either more favored than us or non-existent
+                */
+               if (new_state.mfrr < new_cppr &&
+                   new_state.mfrr <= new_state.pending_pri) {
+                       WARN_ON(new_state.xisr != XICS_IPI &&
+                               new_state.xisr != 0);
+                       new_state.pending_pri = new_state.mfrr;
+                       new_state.xisr = XICS_IPI;
+               }
+
+               /* Latch/clear resend bit */
+               resend = new_state.need_resend;
+               new_state.need_resend = 0;
+
+       } while (!icp_try_update(icp, old_state, new_state, true));
+
+       /*
+        * Now handle resend checks. Those are asynchronous to the ICP
+        * state update in HW (ie bus transactions) so we can handle them
+        * separately here too
+        */
+       if (resend)
+               icp_check_resend(xics, icp);
+}
+
+static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu)
+{
+       union kvmppc_icp_state old_state, new_state;
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+       u32 xirr;
+
+       /* First, remove EE from the processor */
+       kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+                                     BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+       /*
+        * ICP State: Accept_Interrupt
+        *
+        * Return the pending interrupt (if any) along with the
+        * current CPPR, then clear the XISR & set CPPR to the
+        * pending priority
+        */
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               xirr = old_state.xisr | (((u32)old_state.cppr) << 24);
+               if (!old_state.xisr)
+                       break;
+               new_state.cppr = new_state.pending_pri;
+               new_state.pending_pri = 0xff;
+               new_state.xisr = 0;
+
+       } while (!icp_try_update(icp, old_state, new_state, true));
+
+       XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr);
+
+       return xirr;
+}
+
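For reference, the XIRR word assembled above is simply the 8-bit CPPR in the top byte and the 24-bit XISR in the low bytes. A small illustration (not part of the patch):

    /* how kvmppc_h_xirr() packs the XIRR it returns to the guest */
    static inline u32 xirr_pack(u8 cppr, u32 xisr)
    {
            return (xisr & 0x00ffffff) | ((u32)cppr << 24);
    }
    /* e.g. cppr = 0x05, xisr = 0x1001  ->  xirr = 0x05001001 */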
+static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
+                                unsigned long mfrr)
+{
+       union kvmppc_icp_state old_state, new_state;
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       struct kvmppc_icp *icp;
+       u32 reject;
+       bool resend;
+       bool local;
+
+       XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n",
+                vcpu->vcpu_id, server, mfrr);
+
+       icp = vcpu->arch.icp;
+       local = icp->server_num == server;
+       if (!local) {
+               icp = kvmppc_xics_find_server(vcpu->kvm, server);
+               if (!icp)
+                       return H_PARAMETER;
+       }
+
+       /*
+        * ICP state: Set_MFRR
+        *
+        * If the CPPR is more favored than the new MFRR, then
+        * nothing needs to be rejected as there can be no XISR to
+        * reject.  If the MFRR is being made less favored then
+        * there might be a previously-rejected interrupt needing
+        * to be resent.
+        *
+        * If the CPPR is less favored, then we might be replacing
+        * an interrupt, and thus need to possibly reject it as in
+        *
+        * ICP state: Check_IPI
+        */
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               /* Set_MFRR */
+               new_state.mfrr = mfrr;
+
+               /* Check_IPI */
+               reject = 0;
+               resend = false;
+               if (mfrr < new_state.cppr) {
+                       /* Reject a pending interrupt if not an IPI */
+                       if (mfrr <= new_state.pending_pri)
+                               reject = new_state.xisr;
+                       new_state.pending_pri = mfrr;
+                       new_state.xisr = XICS_IPI;
+               }
+
+               if (mfrr > old_state.mfrr && mfrr > new_state.cppr) {
+                       resend = new_state.need_resend;
+                       new_state.need_resend = 0;
+               }
+       } while (!icp_try_update(icp, old_state, new_state, local));
+
+       /* Handle reject */
+       if (reject && reject != XICS_IPI)
+               icp_deliver_irq(xics, icp, reject);
+
+       /* Handle resend */
+       if (resend)
+               icp_check_resend(xics, icp);
+
+       return H_SUCCESS;
+}
+
+static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
+{
+       union kvmppc_icp_state old_state, new_state;
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+       u32 reject;
+
+       XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr);
+
+       /*
+        * ICP State: Set_CPPR
+        *
+        * We can safely compare the new value with the current
+        * value outside of the transaction as the CPPR is only
+        * ever changed by the processor itself
+        */
+       if (cppr > icp->state.cppr)
+               icp_down_cppr(xics, icp, cppr);
+       else if (cppr == icp->state.cppr)
+               return;
+
+       /*
+        * ICP State: Up_CPPR
+        *
+        * The processor is raising its priority, this can result
+        * in a rejection of a pending interrupt:
+        *
+        * ICP State: Reject_Current
+        *
+        * We can remove EE from the current processor, the update
+        * transaction will set it again if needed
+        */
+       kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+                                     BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+       do {
+               old_state = new_state = ACCESS_ONCE(icp->state);
+
+               reject = 0;
+               new_state.cppr = cppr;
+
+               if (cppr <= new_state.pending_pri) {
+                       reject = new_state.xisr;
+                       new_state.xisr = 0;
+                       new_state.pending_pri = 0xff;
+               }
+
+       } while (!icp_try_update(icp, old_state, new_state, true));
+
+       /*
+        * Check for rejects. They are handled by doing a new delivery
+        * attempt (see comments in icp_deliver_irq).
+        */
+       if (reject && reject != XICS_IPI)
+               icp_deliver_irq(xics, icp, reject);
+}
+
+static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+       struct kvmppc_ics *ics;
+       struct ics_irq_state *state;
+       u32 irq = xirr & 0x00ffffff;
+       u16 src;
+
+       XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
+
+       /*
+        * ICP State: EOI
+        *
+        * Note: If EOI is incorrectly used by SW to lower the CPPR
+        * value (ie more favored), we do not check for rejection of
+        * a pending interrupt; this is a SW error and PAPR specifies
+        * that we don't have to deal with it.
+        *
+        * The sending of an EOI to the ICS is handled after the
+        * CPPR update
+        *
+        * ICP State: Down_CPPR, which we handle in a separate
+        * function as it's shared with H_CPPR.
+        */
+       icp_down_cppr(xics, icp, xirr >> 24);
+
+       /* IPIs have no EOI */
+       if (irq == XICS_IPI)
+               return H_SUCCESS;
+       /*
+        * EOI handling: If the interrupt is still asserted, we need to
+        * resend it. We can take a lockless "peek" at the ICS state here.
+        *
+        * "Message" interrupts will never have "asserted" set
+        */
+       ics = kvmppc_xics_find_ics(xics, irq, &src);
+       if (!ics) {
+               XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq);
+               return H_PARAMETER;
+       }
+       state = &ics->irq_state[src];
+
+       /* Still asserted, resend it */
+       if (state->asserted)
+               icp_deliver_irq(xics, icp, irq);
+
+       return H_SUCCESS;
+}
+
+static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+{
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+
+       XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n",
+                hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt);
+
+       if (icp->rm_action & XICS_RM_KICK_VCPU)
+               kvmppc_fast_vcpu_kick(icp->rm_kick_target);
+       if (icp->rm_action & XICS_RM_CHECK_RESEND)
+               icp_check_resend(xics, icp);
+       if (icp->rm_action & XICS_RM_REJECT)
+               icp_deliver_irq(xics, icp, icp->rm_reject);
+
+       icp->rm_action = 0;
+
+       return H_SUCCESS;
+}
+
+int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
+{
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       unsigned long res;
+       int rc = H_SUCCESS;
+
+       /* Check if we have an ICP */
+       if (!xics || !vcpu->arch.icp)
+               return H_HARDWARE;
+
+       /* Check for real mode returning too hard */
+       if (xics->real_mode)
+               return kvmppc_xics_rm_complete(vcpu, req);
+
+       switch (req) {
+       case H_XIRR:
+               res = kvmppc_h_xirr(vcpu);
+               kvmppc_set_gpr(vcpu, 4, res);
+               break;
+       case H_CPPR:
+               kvmppc_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
+               break;
+       case H_EOI:
+               rc = kvmppc_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
+               break;
+       case H_IPI:
+               rc = kvmppc_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
+                                 kvmppc_get_gpr(vcpu, 5));
+               break;
+       }
+
+       return rc;
+}
+
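To put the dispatcher above in context, a PAPR guest's external-interrupt path looks roughly like the sketch below. This is illustrative only; hcall(), handle_device_irq() and handle_ipi() are hypothetical stand-ins for the guest's real hypercall wrapper and interrupt handlers.

    void guest_external_interrupt(void)
    {
            unsigned long xirr = hcall(H_XIRR);      /* Accept_Interrupt */
            unsigned long irq  = xirr & 0x00ffffff;  /* source number    */

            if (irq == XICS_IPI)
                    handle_ipi();
            else
                    handle_device_irq(irq);

            hcall(H_EOI, xirr);     /* restores CPPR, then EOIs the source */
    }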
+
+/* -- Initialisation code etc. -- */
+
+static int xics_debug_show(struct seq_file *m, void *private)
+{
+       struct kvmppc_xics *xics = m->private;
+       struct kvm *kvm = xics->kvm;
+       struct kvm_vcpu *vcpu;
+       int icsid, i;
+
+       if (!kvm)
+               return 0;
+
+       seq_printf(m, "=========\nICP state\n=========\n");
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               struct kvmppc_icp *icp = vcpu->arch.icp;
+               union kvmppc_icp_state state;
+
+               if (!icp)
+                       continue;
+
+               state.raw = ACCESS_ONCE(icp->state.raw);
+               seq_printf(m, "cpu server %#lx XIRR:%#x PPRI:%#x CPPR:%#x MFRR:%#x OUT:%d NR:%d\n",
+                          icp->server_num, state.xisr,
+                          state.pending_pri, state.cppr, state.mfrr,
+                          state.out_ee, state.need_resend);
+       }
+
+       for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
+               struct kvmppc_ics *ics = xics->ics[icsid];
+
+               if (!ics)
+                       continue;
+
+               seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
+                          icsid);
+
+               mutex_lock(&ics->lock);
+
+               for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+                       struct ics_irq_state *irq = &ics->irq_state[i];
+
+                       seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n",
+                                  irq->number, irq->server, irq->priority,
+                                  irq->saved_priority, irq->asserted,
+                                  irq->resend, irq->masked_pending);
+
+               }
+               mutex_unlock(&ics->lock);
+       }
+       return 0;
+}
+
+static int xics_debug_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, xics_debug_show, inode->i_private);
+}
+
+static const struct file_operations xics_debug_fops = {
+       .open = xics_debug_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = single_release,
+};
+
+static void xics_debugfs_init(struct kvmppc_xics *xics)
+{
+       char *name;
+
+       name = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics);
+       if (!name) {
+               pr_err("%s: no memory for name\n", __func__);
+               return;
+       }
+
+       xics->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root,
+                                          xics, &xics_debug_fops);
+
+       pr_debug("%s: created %s\n", __func__, name);
+       kfree(name);
+}
+
+static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
+                                       struct kvmppc_xics *xics, int irq)
+{
+       struct kvmppc_ics *ics;
+       int i, icsid;
+
+       icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+
+       mutex_lock(&kvm->lock);
+
+       /* ICS already exists - somebody else got here first */
+       if (xics->ics[icsid])
+               goto out;
+
+       /* Create the ICS */
+       ics = kzalloc(sizeof(struct kvmppc_ics), GFP_KERNEL);
+       if (!ics)
+               goto out;
+
+       mutex_init(&ics->lock);
+       ics->icsid = icsid;
+
+       for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+               ics->irq_state[i].number = (icsid << KVMPPC_XICS_ICS_SHIFT) | i;
+               ics->irq_state[i].priority = MASKED;
+               ics->irq_state[i].saved_priority = MASKED;
+       }
+       smp_wmb();
+       xics->ics[icsid] = ics;
+
+       if (icsid > xics->max_icsid)
+               xics->max_icsid = icsid;
+
+ out:
+       mutex_unlock(&kvm->lock);
+       return xics->ics[icsid];
+}
+
+int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
+{
+       struct kvmppc_icp *icp;
+
+       if (!vcpu->kvm->arch.xics)
+               return -ENODEV;
+
+       if (kvmppc_xics_find_server(vcpu->kvm, server_num))
+               return -EEXIST;
+
+       icp = kzalloc(sizeof(struct kvmppc_icp), GFP_KERNEL);
+       if (!icp)
+               return -ENOMEM;
+
+       icp->vcpu = vcpu;
+       icp->server_num = server_num;
+       icp->state.mfrr = MASKED;
+       icp->state.pending_pri = MASKED;
+       vcpu->arch.icp = icp;
+
+       XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id);
+
+       return 0;
+}
+
+u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+       union kvmppc_icp_state state;
+
+       if (!icp)
+               return 0;
+       state = icp->state;
+       return ((u64)state.cppr << KVM_REG_PPC_ICP_CPPR_SHIFT) |
+               ((u64)state.xisr << KVM_REG_PPC_ICP_XISR_SHIFT) |
+               ((u64)state.mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT) |
+               ((u64)state.pending_pri << KVM_REG_PPC_ICP_PPRI_SHIFT);
+}
+
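The 64-bit value built here is what userspace reads for the ICP state register; decoding it is the mirror image of kvmppc_xics_set_icp() below. A sketch, using the shift/mask names added to the uapi header by this series:

    /* illustration: unpack the value produced by kvmppc_xics_get_icp() */
    static void icp_decode(u64 icpval, u8 *cppr, u32 *xisr, u8 *mfrr, u8 *ppri)
    {
            *cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT;
            *xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) &
                    KVM_REG_PPC_ICP_XISR_MASK;
            *mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT;
            *ppri = icpval >> KVM_REG_PPC_ICP_PPRI_SHIFT;
    }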
+int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
+{
+       struct kvmppc_icp *icp = vcpu->arch.icp;
+       struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+       union kvmppc_icp_state old_state, new_state;
+       struct kvmppc_ics *ics;
+       u8 cppr, mfrr, pending_pri;
+       u32 xisr;
+       u16 src;
+       bool resend;
+
+       if (!icp || !xics)
+               return -ENOENT;
+
+       cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT;
+       xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) &
+               KVM_REG_PPC_ICP_XISR_MASK;
+       mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT;
+       pending_pri = icpval >> KVM_REG_PPC_ICP_PPRI_SHIFT;
+
+       /* Require the new state to be internally consistent */
+       if (xisr == 0) {
+               if (pending_pri != 0xff)
+                       return -EINVAL;
+       } else if (xisr == XICS_IPI) {
+               if (pending_pri != mfrr || pending_pri >= cppr)
+                       return -EINVAL;
+       } else {
+               if (pending_pri >= mfrr || pending_pri >= cppr)
+                       return -EINVAL;
+               ics = kvmppc_xics_find_ics(xics, xisr, &src);
+               if (!ics)
+                       return -EINVAL;
+       }
+
+       new_state.raw = 0;
+       new_state.cppr = cppr;
+       new_state.xisr = xisr;
+       new_state.mfrr = mfrr;
+       new_state.pending_pri = pending_pri;
+
+       /*
+        * Deassert the CPU interrupt request.
+        * icp_try_update will reassert it if necessary.
+        */
+       kvmppc_book3s_dequeue_irqprio(icp->vcpu,
+                                     BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+
+       /*
+        * Note that if we displace an interrupt from old_state.xisr,
+        * we don't mark it as rejected.  We expect userspace to set
+        * the state of the interrupt sources to be consistent with
+        * the ICP states (either before or afterwards, which doesn't
+        * matter).  We do handle resends due to CPPR becoming less
+        * favoured because that is necessary to end up with a
+        * consistent state in the situation where userspace restores
+        * the ICS states before the ICP states.
+        */
+       do {
+               old_state = ACCESS_ONCE(icp->state);
+
+               if (new_state.mfrr <= old_state.mfrr) {
+                       resend = false;
+                       new_state.need_resend = old_state.need_resend;
+               } else {
+                       resend = old_state.need_resend;
+                       new_state.need_resend = 0;
+               }
+       } while (!icp_try_update(icp, old_state, new_state, false));
+
+       if (resend)
+               icp_check_resend(xics, icp);
+
+       return 0;
+}
+
+static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
+{
+       int ret;
+       struct kvmppc_ics *ics;
+       struct ics_irq_state *irqp;
+       u64 __user *ubufp = (u64 __user *) addr;
+       u16 idx;
+       u64 val, prio;
+
+       ics = kvmppc_xics_find_ics(xics, irq, &idx);
+       if (!ics)
+               return -ENOENT;
+
+       irqp = &ics->irq_state[idx];
+       mutex_lock(&ics->lock);
+       ret = -ENOENT;
+       if (irqp->exists) {
+               val = irqp->server;
+               prio = irqp->priority;
+               if (prio == MASKED) {
+                       val |= KVM_XICS_MASKED;
+                       prio = irqp->saved_priority;
+               }
+               val |= prio << KVM_XICS_PRIORITY_SHIFT;
+               if (irqp->asserted)
+                       val |= KVM_XICS_LEVEL_SENSITIVE | KVM_XICS_PENDING;
+               else if (irqp->masked_pending || irqp->resend)
+                       val |= KVM_XICS_PENDING;
+               ret = 0;
+       }
+       mutex_unlock(&ics->lock);
+
+       if (!ret && put_user(val, ubufp))
+               ret = -EFAULT;
+
+       return ret;
+}
+
+static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
+{
+       struct kvmppc_ics *ics;
+       struct ics_irq_state *irqp;
+       u64 __user *ubufp = (u64 __user *) addr;
+       u16 idx;
+       u64 val;
+       u8 prio;
+       u32 server;
+
+       if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
+               return -ENOENT;
+
+       ics = kvmppc_xics_find_ics(xics, irq, &idx);
+       if (!ics) {
+               ics = kvmppc_xics_create_ics(xics->kvm, xics, irq);
+               if (!ics)
+                       return -ENOMEM;
+       }
+       irqp = &ics->irq_state[idx];
+       if (get_user(val, ubufp))
+               return -EFAULT;
+
+       server = val & KVM_XICS_DESTINATION_MASK;
+       prio = val >> KVM_XICS_PRIORITY_SHIFT;
+       if (prio != MASKED &&
+           kvmppc_xics_find_server(xics->kvm, server) == NULL)
+               return -EINVAL;
+
+       mutex_lock(&ics->lock);
+       irqp->server = server;
+       irqp->saved_priority = prio;
+       if (val & KVM_XICS_MASKED)
+               prio = MASKED;
+       irqp->priority = prio;
+       irqp->resend = 0;
+       irqp->masked_pending = 0;
+       irqp->asserted = 0;
+       if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE))
+               irqp->asserted = 1;
+       irqp->exists = 1;
+       mutex_unlock(&ics->lock);
+
+       if (val & KVM_XICS_PENDING)
+               icp_deliver_irq(xics, NULL, irqp->number);
+
+       return 0;
+}
+
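Userspace reaches xics_set_source() through the device-control API; a minimal sketch, assuming a device fd obtained as in the example after kvm_xics_ops below and the KVM_DEV_XICS_* / KVM_XICS_* constants from the new uapi header:

    /* illustration: configure one source from userspace */
    static int xics_set_one_source(int xics_fd, __u32 irq,
                                   __u32 server, __u32 prio)
    {
            __u64 state = server | ((__u64)prio << KVM_XICS_PRIORITY_SHIFT);
            struct kvm_device_attr attr = {
                    .group = KVM_DEV_XICS_GRP_SOURCES,
                    .attr  = irq,
                    .addr  = (__u64)(unsigned long)&state,
            };

            return ioctl(xics_fd, KVM_SET_DEVICE_ATTR, &attr);
    }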
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
+               bool line_status)
+{
+       struct kvmppc_xics *xics = kvm->arch.xics;
+
+       return ics_deliver_irq(xics, irq, level, line_status);
+}
+
+static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+       struct kvmppc_xics *xics = dev->private;
+
+       switch (attr->group) {
+       case KVM_DEV_XICS_GRP_SOURCES:
+               return xics_set_source(xics, attr->attr, attr->addr);
+       }
+       return -ENXIO;
+}
+
+static int xics_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+       struct kvmppc_xics *xics = dev->private;
+
+       switch (attr->group) {
+       case KVM_DEV_XICS_GRP_SOURCES:
+               return xics_get_source(xics, attr->attr, attr->addr);
+       }
+       return -ENXIO;
+}
+
+static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+       switch (attr->group) {
+       case KVM_DEV_XICS_GRP_SOURCES:
+               if (attr->attr >= KVMPPC_XICS_FIRST_IRQ &&
+                   attr->attr < KVMPPC_XICS_NR_IRQS)
+                       return 0;
+               break;
+       }
+       return -ENXIO;
+}
+
+static void kvmppc_xics_free(struct kvm_device *dev)
+{
+       struct kvmppc_xics *xics = dev->private;
+       int i;
+       struct kvm *kvm = xics->kvm;
+
+       debugfs_remove(xics->dentry);
+
+       if (kvm)
+               kvm->arch.xics = NULL;
+
+       for (i = 0; i <= xics->max_icsid; i++)
+               kfree(xics->ics[i]);
+       kfree(xics);
+       kfree(dev);
+}
+
+static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
+{
+       struct kvmppc_xics *xics;
+       struct kvm *kvm = dev->kvm;
+       int ret = 0;
+
+       xics = kzalloc(sizeof(*xics), GFP_KERNEL);
+       if (!xics)
+               return -ENOMEM;
+
+       dev->private = xics;
+       xics->dev = dev;
+       xics->kvm = kvm;
+
+       /* Already there ? */
+       mutex_lock(&kvm->lock);
+       if (kvm->arch.xics)
+               ret = -EEXIST;
+       else
+               kvm->arch.xics = xics;
+       mutex_unlock(&kvm->lock);
+
+       if (ret)
+               return ret;
+
+       xics_debugfs_init(xics);
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+       if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+               /* Enable real mode support */
+               xics->real_mode = ENABLE_REALMODE;
+               xics->real_mode_dbg = DEBUG_REALMODE;
+       }
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
+       return 0;
+}
+
+struct kvm_device_ops kvm_xics_ops = {
+       .name = "kvm-xics",
+       .create = kvmppc_xics_create,
+       .destroy = kvmppc_xics_free,
+       .set_attr = xics_set_attr,
+       .get_attr = xics_get_attr,
+       .has_attr = xics_has_attr,
+};
+
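For completeness, a rough userspace sequence that instantiates this device and attaches a vcpu to it. The series routes the KVM_CAP_IRQ_XICS vcpu capability to kvmppc_xics_connect_vcpu() below; vm_fd, vcpu_fd and server_num are assumed to exist in the caller.

    struct kvm_create_device cd = { .type = KVM_DEV_TYPE_XICS };

    if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) == 0) {
            struct kvm_enable_cap cap = {
                    .cap  = KVM_CAP_IRQ_XICS,
                    .args = { cd.fd, server_num },
            };

            /* gives this vcpu an ICP served by the new device */
            ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
    }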
+int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
+                            u32 xcpu)
+{
+       struct kvmppc_xics *xics = dev->private;
+       int r = -EBUSY;
+
+       if (dev->ops != &kvm_xics_ops)
+               return -EPERM;
+       if (xics->kvm != vcpu->kvm)
+               return -EPERM;
+       if (vcpu->arch.irq_type)
+               return -EBUSY;
+
+       r = kvmppc_xics_create_icp(vcpu, xcpu);
+       if (!r)
+               vcpu->arch.irq_type = KVMPPC_IRQ_XICS;
+
+       return r;
+}
+
+void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu)
+{
+       if (!vcpu->arch.icp)
+               return;
+       kfree(vcpu->arch.icp);
+       vcpu->arch.icp = NULL;
+       vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
+}
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
new file mode 100644 (file)
index 0000000..dd9326c
--- /dev/null
@@ -0,0 +1,130 @@
+/*
+ * Copyright 2012 Michael Ellerman, IBM Corporation.
+ * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _KVM_PPC_BOOK3S_XICS_H
+#define _KVM_PPC_BOOK3S_XICS_H
+
+/*
+ * We use a two-level tree to store interrupt source information.
+ * There are up to 1024 ICS nodes, each of which can represent
+ * 1024 sources.
+ */
+#define KVMPPC_XICS_MAX_ICS_ID 1023
+#define KVMPPC_XICS_ICS_SHIFT  10
+#define KVMPPC_XICS_IRQ_PER_ICS        (1 << KVMPPC_XICS_ICS_SHIFT)
+#define KVMPPC_XICS_SRC_MASK   (KVMPPC_XICS_IRQ_PER_ICS - 1)
+
+/*
+ * Interrupt source numbers below this are reserved, for example
+ * 0 is "no interrupt", and 2 is used for IPIs.
+ */
+#define KVMPPC_XICS_FIRST_IRQ  16
+#define KVMPPC_XICS_NR_IRQS    ((KVMPPC_XICS_MAX_ICS_ID + 1) * \
+                                KVMPPC_XICS_IRQ_PER_ICS)
+
+/* Priority value to use for disabling an interrupt */
+#define MASKED 0xff
+
+/* State for one irq source */
+struct ics_irq_state {
+       u32 number;
+       u32 server;
+       u8  priority;
+       u8  saved_priority;
+       u8  resend;
+       u8  masked_pending;
+       u8  asserted; /* Only for LSI */
+       u8  exists;
+};
+
+/* Atomic ICP state, updated with a single compare & swap */
+union kvmppc_icp_state {
+       unsigned long raw;
+       struct {
+               u8 out_ee:1;
+               u8 need_resend:1;
+               u8 cppr;
+               u8 mfrr;
+               u8 pending_pri;
+               u32 xisr;
+       };
+};
+
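This union is what makes the lock-free ICP updates in book3s_xics.c possible: all of the presentation-controller state fits in one machine word, so a whole new state can be published with a single compare-and-swap. The real icp_try_update() (earlier in the file) also queues or dequeues the EE interrupt and handles real-mode kicks; a stripped-down sketch of just the CAS step it is built around:

    /* simplified sketch of the atomic update the comments refer to */
    static bool icp_cas(struct kvmppc_icp *icp,
                        union kvmppc_icp_state old,
                        union kvmppc_icp_state new)
    {
            return cmpxchg(&icp->state.raw, old.raw, new.raw) == old.raw;
    }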
+/* One bit per ICS */
+#define ICP_RESEND_MAP_SIZE    (KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1)
+
+struct kvmppc_icp {
+       struct kvm_vcpu *vcpu;
+       unsigned long server_num;
+       union kvmppc_icp_state state;
+       unsigned long resend_map[ICP_RESEND_MAP_SIZE];
+
+       /* Real mode might find something too hard; here's the action
+        * it might request from virtual mode
+        */
+#define XICS_RM_KICK_VCPU      0x1
+#define XICS_RM_CHECK_RESEND   0x2
+#define XICS_RM_REJECT         0x4
+       u32 rm_action;
+       struct kvm_vcpu *rm_kick_target;
+       u32  rm_reject;
+
+       /* Debug stuff for real mode */
+       union kvmppc_icp_state rm_dbgstate;
+       struct kvm_vcpu *rm_dbgtgt;
+};
+
+struct kvmppc_ics {
+       struct mutex lock;
+       u16 icsid;
+       struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
+};
+
+struct kvmppc_xics {
+       struct kvm *kvm;
+       struct kvm_device *dev;
+       struct dentry *dentry;
+       u32 max_icsid;
+       bool real_mode;
+       bool real_mode_dbg;
+       struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1];
+};
+
+static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm,
+                                                        u32 nr)
+{
+       struct kvm_vcpu *vcpu = NULL;
+       int i;
+
+       kvm_for_each_vcpu(i, vcpu, kvm) {
+               if (vcpu->arch.icp && nr == vcpu->arch.icp->server_num)
+                       return vcpu->arch.icp;
+       }
+       return NULL;
+}
+
+static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
+                                                     u32 irq, u16 *source)
+{
+       u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT;
+       u16 src = irq & KVMPPC_XICS_SRC_MASK;
+       struct kvmppc_ics *ics;
+
+       if (source)
+               *source = src;
+       if (icsid > KVMPPC_XICS_MAX_ICS_ID)
+               return NULL;
+       ics = xics->ics[icsid];
+       if (!ics)
+               return NULL;
+       return ics;
+}
+
+
+#endif /* _KVM_PPC_BOOK3S_XICS_H */
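A quick worked example of the two-level numbering handled by kvmppc_xics_find_ics(): with KVMPPC_XICS_ICS_SHIFT = 10, a global source number splits into an ICS index (upper bits) and a source index within that ICS (low 10 bits).

    irq   = 0x1005
    icsid = irq >> KVMPPC_XICS_ICS_SHIFT  =  0x1005 >> 10    = 4
    src   = irq &  KVMPPC_XICS_SRC_MASK   =  0x1005 & 0x3ff  = 5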
index 020923e4313490445130517f220325f7485a4fa2..1020119226dbefe5589726758bbff2f03c70ec53 100644 (file)
@@ -222,8 +222,7 @@ void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
        kvmppc_booke_queue_irqprio(vcpu, prio);
 }
 
-void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
-                                  struct kvm_interrupt *irq)
+void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
 {
        clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
        clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
@@ -347,7 +346,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
                keep_irq = true;
        }
 
-       if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_enabled)
+       if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_flags)
                update_epr = true;
 
        switch (priority) {
@@ -428,8 +427,14 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
                        set_guest_esr(vcpu, vcpu->arch.queued_esr);
                if (update_dear == true)
                        set_guest_dear(vcpu, vcpu->arch.queued_dear);
-               if (update_epr == true)
-                       kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
+               if (update_epr == true) {
+                       if (vcpu->arch.epr_flags & KVMPPC_EPR_USER)
+                               kvm_make_request(KVM_REQ_EPR_EXIT, vcpu);
+                       else if (vcpu->arch.epr_flags & KVMPPC_EPR_KERNEL) {
+                               BUG_ON(vcpu->arch.irq_type != KVMPPC_IRQ_MPIC);
+                               kvmppc_mpic_set_epr(vcpu);
+                       }
+               }
 
                new_msr &= msr_mask;
 #if defined(CONFIG_64BIT)
@@ -746,6 +751,9 @@ static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
                kvmppc_core_queue_program(vcpu, ESR_PIL);
                return RESUME_HOST;
 
+       case EMULATE_EXIT_USER:
+               return RESUME_HOST;
+
        default:
                BUG();
        }
@@ -1148,6 +1156,18 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
        return r;
 }
 
+static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr)
+{
+       u32 old_tsr = vcpu->arch.tsr;
+
+       vcpu->arch.tsr = new_tsr;
+
+       if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
+               arm_next_watchdog(vcpu);
+
+       update_timer_ints(vcpu);
+}
+
 /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
@@ -1287,16 +1307,8 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
                kvmppc_emulate_dec(vcpu);
        }
 
-       if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
-               u32 old_tsr = vcpu->arch.tsr;
-
-               vcpu->arch.tsr = sregs->u.e.tsr;
-
-               if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
-                       arm_next_watchdog(vcpu);
-
-               update_timer_ints(vcpu);
-       }
+       if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR)
+               kvmppc_set_tsr(vcpu, sregs->u.e.tsr);
 
        return 0;
 }
@@ -1409,84 +1421,134 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
 int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 {
-       int r = -EINVAL;
+       int r = 0;
+       union kvmppc_one_reg val;
+       int size;
+       long int i;
+
+       size = one_reg_size(reg->id);
+       if (size > sizeof(val))
+               return -EINVAL;
 
        switch (reg->id) {
        case KVM_REG_PPC_IAC1:
        case KVM_REG_PPC_IAC2:
        case KVM_REG_PPC_IAC3:
-       case KVM_REG_PPC_IAC4: {
-               int iac = reg->id - KVM_REG_PPC_IAC1;
-               r = copy_to_user((u64 __user *)(long)reg->addr,
-                                &vcpu->arch.dbg_reg.iac[iac], sizeof(u64));
+       case KVM_REG_PPC_IAC4:
+               i = reg->id - KVM_REG_PPC_IAC1;
+               val = get_reg_val(reg->id, vcpu->arch.dbg_reg.iac[i]);
                break;
-       }
        case KVM_REG_PPC_DAC1:
-       case KVM_REG_PPC_DAC2: {
-               int dac = reg->id - KVM_REG_PPC_DAC1;
-               r = copy_to_user((u64 __user *)(long)reg->addr,
-                                &vcpu->arch.dbg_reg.dac[dac], sizeof(u64));
+       case KVM_REG_PPC_DAC2:
+               i = reg->id - KVM_REG_PPC_DAC1;
+               val = get_reg_val(reg->id, vcpu->arch.dbg_reg.dac[i]);
                break;
-       }
        case KVM_REG_PPC_EPR: {
                u32 epr = get_guest_epr(vcpu);
-               r = put_user(epr, (u32 __user *)(long)reg->addr);
+               val = get_reg_val(reg->id, epr);
                break;
        }
 #if defined(CONFIG_64BIT)
        case KVM_REG_PPC_EPCR:
-               r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr);
+               val = get_reg_val(reg->id, vcpu->arch.epcr);
                break;
 #endif
+       case KVM_REG_PPC_TCR:
+               val = get_reg_val(reg->id, vcpu->arch.tcr);
+               break;
+       case KVM_REG_PPC_TSR:
+               val = get_reg_val(reg->id, vcpu->arch.tsr);
+               break;
+       case KVM_REG_PPC_DEBUG_INST:
+               val = get_reg_val(reg->id, KVMPPC_INST_EHPRIV);
+               break;
        default:
+               r = kvmppc_get_one_reg(vcpu, reg->id, &val);
                break;
        }
+
+       if (r)
+               return r;
+
+       if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size))
+               r = -EFAULT;
+
        return r;
 }
 
 int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 {
-       int r = -EINVAL;
+       int r = 0;
+       union kvmppc_one_reg val;
+       int size;
+       long int i;
+
+       size = one_reg_size(reg->id);
+       if (size > sizeof(val))
+               return -EINVAL;
+
+       if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size))
+               return -EFAULT;
 
        switch (reg->id) {
        case KVM_REG_PPC_IAC1:
        case KVM_REG_PPC_IAC2:
        case KVM_REG_PPC_IAC3:
-       case KVM_REG_PPC_IAC4: {
-               int iac = reg->id - KVM_REG_PPC_IAC1;
-               r = copy_from_user(&vcpu->arch.dbg_reg.iac[iac],
-                            (u64 __user *)(long)reg->addr, sizeof(u64));
+       case KVM_REG_PPC_IAC4:
+               i = reg->id - KVM_REG_PPC_IAC1;
+               vcpu->arch.dbg_reg.iac[i] = set_reg_val(reg->id, val);
                break;
-       }
        case KVM_REG_PPC_DAC1:
-       case KVM_REG_PPC_DAC2: {
-               int dac = reg->id - KVM_REG_PPC_DAC1;
-               r = copy_from_user(&vcpu->arch.dbg_reg.dac[dac],
-                            (u64 __user *)(long)reg->addr, sizeof(u64));
+       case KVM_REG_PPC_DAC2:
+               i = reg->id - KVM_REG_PPC_DAC1;
+               vcpu->arch.dbg_reg.dac[i] = set_reg_val(reg->id, val);
                break;
-       }
        case KVM_REG_PPC_EPR: {
-               u32 new_epr;
-               r = get_user(new_epr, (u32 __user *)(long)reg->addr);
-               if (!r)
-                       kvmppc_set_epr(vcpu, new_epr);
+               u32 new_epr = set_reg_val(reg->id, val);
+               kvmppc_set_epr(vcpu, new_epr);
                break;
        }
 #if defined(CONFIG_64BIT)
        case KVM_REG_PPC_EPCR: {
-               u32 new_epcr;
-               r = get_user(new_epcr, (u32 __user *)(long)reg->addr);
-               if (r == 0)
-                       kvmppc_set_epcr(vcpu, new_epcr);
+               u32 new_epcr = set_reg_val(reg->id, val);
+               kvmppc_set_epcr(vcpu, new_epcr);
                break;
        }
 #endif
+       case KVM_REG_PPC_OR_TSR: {
+               u32 tsr_bits = set_reg_val(reg->id, val);
+               kvmppc_set_tsr_bits(vcpu, tsr_bits);
+               break;
+       }
+       case KVM_REG_PPC_CLEAR_TSR: {
+               u32 tsr_bits = set_reg_val(reg->id, val);
+               kvmppc_clr_tsr_bits(vcpu, tsr_bits);
+               break;
+       }
+       case KVM_REG_PPC_TSR: {
+               u32 tsr = set_reg_val(reg->id, val);
+               kvmppc_set_tsr(vcpu, tsr);
+               break;
+       }
+       case KVM_REG_PPC_TCR: {
+               u32 tcr = set_reg_val(reg->id, val);
+               kvmppc_set_tcr(vcpu, tcr);
+               break;
+       }
        default:
+               r = kvmppc_set_one_reg(vcpu, reg->id, &val);
                break;
        }
+
        return r;
 }
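With the switch to union kvmppc_one_reg, the transfer size now comes from the size field encoded in the register id (via one_reg_size()) rather than being hard-coded per register. From userspace the registers are still accessed through the generic ONE_REG ioctls; a hedged sketch setting the new KVM_REG_PPC_TSR register, with vcpu_fd assumed:

    __u32 tsr = 0;
    struct kvm_one_reg reg = {
            .id   = KVM_REG_PPC_TSR,
            .addr = (__u64)(unsigned long)&tsr,
    };

    ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);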
 
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+                                        struct kvm_guest_debug *dbg)
+{
+       return -EINVAL;
+}
+
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
        return -ENOTSUPP;
@@ -1531,7 +1593,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
                                struct kvm_userspace_memory_region *mem,
-                               struct kvm_memory_slot old)
+                               const struct kvm_memory_slot *old)
 {
 }
 
index f4bb55c96517493e8af058fa68bea2a082a98dfc..2c6deb5ef2fe89237ae9fdcadaa62e786085d842 100644 (file)
@@ -54,8 +54,7 @@
                        (1<<BOOKE_INTERRUPT_DTLB_MISS) | \
                        (1<<BOOKE_INTERRUPT_ALIGNMENT))
 
-.macro KVM_HANDLER ivor_nr scratch srr0
-_GLOBAL(kvmppc_handler_\ivor_nr)
+.macro __KVM_HANDLER ivor_nr scratch srr0
        /* Get pointer to vcpu and record exit number. */
        mtspr   \scratch , r4
        mfspr   r4, SPRN_SPRG_THREAD
@@ -76,6 +75,43 @@ _GLOBAL(kvmppc_handler_\ivor_nr)
        bctr
 .endm
 
+.macro KVM_HANDLER ivor_nr scratch srr0
+_GLOBAL(kvmppc_handler_\ivor_nr)
+       __KVM_HANDLER \ivor_nr \scratch \srr0
+.endm
+
+.macro KVM_DBG_HANDLER ivor_nr scratch srr0
+_GLOBAL(kvmppc_handler_\ivor_nr)
+       mtspr   \scratch, r4
+       mfspr   r4, SPRN_SPRG_THREAD
+       lwz     r4, THREAD_KVM_VCPU(r4)
+       stw     r3, VCPU_CRIT_SAVE(r4)
+       mfcr    r3
+       mfspr   r4, SPRN_CSRR1
+       andi.   r4, r4, MSR_PR
+       bne     1f
+       /* debug interrupt happened in enter/exit path */
+       mfspr   r4, SPRN_CSRR1
+       rlwinm  r4, r4, 0, ~MSR_DE
+       mtspr   SPRN_CSRR1, r4
+       lis     r4, 0xffff
+       ori     r4, r4, 0xffff
+       mtspr   SPRN_DBSR, r4
+       mfspr   r4, SPRN_SPRG_THREAD
+       lwz     r4, THREAD_KVM_VCPU(r4)
+       mtcr    r3
+       lwz     r3, VCPU_CRIT_SAVE(r4)
+       mfspr   r4, \scratch
+       rfci
+1:     /* debug interrupt happened in guest */
+       mtcr    r3
+       mfspr   r4, SPRN_SPRG_THREAD
+       lwz     r4, THREAD_KVM_VCPU(r4)
+       lwz     r3, VCPU_CRIT_SAVE(r4)
+       mfspr   r4, \scratch
+       __KVM_HANDLER \ivor_nr \scratch \srr0
+.endm
+
 .macro KVM_HANDLER_ADDR ivor_nr
        .long   kvmppc_handler_\ivor_nr
 .endm
@@ -100,7 +136,7 @@ KVM_HANDLER BOOKE_INTERRUPT_FIT SPRN_SPRG_RSCRATCH0 SPRN_SRR0
 KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
 KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
 KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0
-KVM_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
+KVM_DBG_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0
 KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0
 KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0
 KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0
index 6dd4de7802bfea4e591070968ba0169116682137..ce6b73c29612532a9f8e32282b1347761a2aeb22 100644 (file)
@@ -425,6 +425,20 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        return kvmppc_set_sregs_ivor(vcpu, sregs);
 }
 
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
+                       union kvmppc_one_reg *val)
+{
+       int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+       return r;
+}
+
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
+                      union kvmppc_one_reg *val)
+{
+       int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+       return r;
+}
+
 struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500;
index 33db48a8ce241e1ee9ee0bce50bf991bb550b332..c2e5e98453a67e4609cf17007a74d20ed11fee2c 100644 (file)
 #include <asm/mmu-book3e.h>
 #include <asm/tlb.h>
 
+enum vcpu_ftr {
+       VCPU_FTR_MMU_V2
+};
+
 #define E500_PID_NUM   3
 #define E500_TLB_NUM   2
 
@@ -131,6 +135,10 @@ void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
 void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
+int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+                               union kvmppc_one_reg *val);
+int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+                              union kvmppc_one_reg *val);
 
 #ifdef CONFIG_KVM_E500V2
 unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
@@ -295,4 +303,18 @@ static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu)
 #define get_tlb_sts(gtlbe)              (MAS1_TS)
 #endif /* !BOOKE_HV */
 
+static inline bool has_feature(const struct kvm_vcpu *vcpu,
+                              enum vcpu_ftr ftr)
+{
+       bool has_ftr;
+       switch (ftr) {
+       case VCPU_FTR_MMU_V2:
+               has_ftr = ((vcpu->arch.mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2);
+               break;
+       default:
+               return false;
+       }
+       return has_ftr;
+}
+
 #endif /* KVM_E500_H */
index e78f353a836a0731d96217546598a93b2ae22dfd..b10a01243abdeb788a7d3513b25a266cfc09d35c 100644 (file)
@@ -284,6 +284,16 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
        case SPRN_TLB1CFG:
                *spr_val = vcpu->arch.tlbcfg[1];
                break;
+       case SPRN_TLB0PS:
+               if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
+                       return EMULATE_FAIL;
+               *spr_val = vcpu->arch.tlbps[0];
+               break;
+       case SPRN_TLB1PS:
+               if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
+                       return EMULATE_FAIL;
+               *spr_val = vcpu->arch.tlbps[1];
+               break;
        case SPRN_L1CSR0:
                *spr_val = vcpu_e500->l1csr0;
                break;
@@ -307,6 +317,15 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
        case SPRN_MMUCFG:
                *spr_val = vcpu->arch.mmucfg;
                break;
+       case SPRN_EPTCFG:
+               if (!has_feature(vcpu, VCPU_FTR_MMU_V2))
+                       return EMULATE_FAIL;
+               /*
+                * Legacy Linux guests access the EPTCFG register even if the
+                * E.PT category is disabled in the VM. Give them a chance to
+                * live.
+                */
+               *spr_val = vcpu->arch.eptcfg;
+               break;
 
        /* extra exceptions */
        case SPRN_IVOR32:
index 5c4475983f7843c0f2b6c80a488c9e8f074ca5eb..c41a5a96b558bd8d17ccaf3adc8b49e95c59e817 100644 (file)
@@ -596,6 +596,140 @@ int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        return 0;
 }
 
+int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+                               union kvmppc_one_reg *val)
+{
+       int r = 0;
+       long int i;
+
+       switch (id) {
+       case KVM_REG_PPC_MAS0:
+               *val = get_reg_val(id, vcpu->arch.shared->mas0);
+               break;
+       case KVM_REG_PPC_MAS1:
+               *val = get_reg_val(id, vcpu->arch.shared->mas1);
+               break;
+       case KVM_REG_PPC_MAS2:
+               *val = get_reg_val(id, vcpu->arch.shared->mas2);
+               break;
+       case KVM_REG_PPC_MAS7_3:
+               *val = get_reg_val(id, vcpu->arch.shared->mas7_3);
+               break;
+       case KVM_REG_PPC_MAS4:
+               *val = get_reg_val(id, vcpu->arch.shared->mas4);
+               break;
+       case KVM_REG_PPC_MAS6:
+               *val = get_reg_val(id, vcpu->arch.shared->mas6);
+               break;
+       case KVM_REG_PPC_MMUCFG:
+               *val = get_reg_val(id, vcpu->arch.mmucfg);
+               break;
+       case KVM_REG_PPC_EPTCFG:
+               *val = get_reg_val(id, vcpu->arch.eptcfg);
+               break;
+       case KVM_REG_PPC_TLB0CFG:
+       case KVM_REG_PPC_TLB1CFG:
+       case KVM_REG_PPC_TLB2CFG:
+       case KVM_REG_PPC_TLB3CFG:
+               i = id - KVM_REG_PPC_TLB0CFG;
+               *val = get_reg_val(id, vcpu->arch.tlbcfg[i]);
+               break;
+       case KVM_REG_PPC_TLB0PS:
+       case KVM_REG_PPC_TLB1PS:
+       case KVM_REG_PPC_TLB2PS:
+       case KVM_REG_PPC_TLB3PS:
+               i = id - KVM_REG_PPC_TLB0PS;
+               *val = get_reg_val(id, vcpu->arch.tlbps[i]);
+               break;
+       default:
+               r = -EINVAL;
+               break;
+       }
+
+       return r;
+}
+
+int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id,
+                              union kvmppc_one_reg *val)
+{
+       int r = 0;
+       long int i;
+
+       switch (id) {
+       case KVM_REG_PPC_MAS0:
+               vcpu->arch.shared->mas0 = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_MAS1:
+               vcpu->arch.shared->mas1 = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_MAS2:
+               vcpu->arch.shared->mas2 = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_MAS7_3:
+               vcpu->arch.shared->mas7_3 = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_MAS4:
+               vcpu->arch.shared->mas4 = set_reg_val(id, *val);
+               break;
+       case KVM_REG_PPC_MAS6:
+               vcpu->arch.shared->mas6 = set_reg_val(id, *val);
+               break;
+       /* Only allow MMU registers to be set to the config supported by KVM */
+       case KVM_REG_PPC_MMUCFG: {
+               u32 reg = set_reg_val(id, *val);
+               if (reg != vcpu->arch.mmucfg)
+                       r = -EINVAL;
+               break;
+       }
+       case KVM_REG_PPC_EPTCFG: {
+               u32 reg = set_reg_val(id, *val);
+               if (reg != vcpu->arch.eptcfg)
+                       r = -EINVAL;
+               break;
+       }
+       case KVM_REG_PPC_TLB0CFG:
+       case KVM_REG_PPC_TLB1CFG:
+       case KVM_REG_PPC_TLB2CFG:
+       case KVM_REG_PPC_TLB3CFG: {
+               /* MMU geometry (N_ENTRY/ASSOC) can be set only using SW_TLB */
+               u32 reg = set_reg_val(id, *val);
+               i = id - KVM_REG_PPC_TLB0CFG;
+               if (reg != vcpu->arch.tlbcfg[i])
+                       r = -EINVAL;
+               break;
+       }
+       case KVM_REG_PPC_TLB0PS:
+       case KVM_REG_PPC_TLB1PS:
+       case KVM_REG_PPC_TLB2PS:
+       case KVM_REG_PPC_TLB3PS: {
+               u32 reg = set_reg_val(id, *val);
+               i = id - KVM_REG_PPC_TLB0PS;
+               if (reg != vcpu->arch.tlbps[i])
+                       r = -EINVAL;
+               break;
+       }
+       default:
+               r = -EINVAL;
+               break;
+       }
+
+       return r;
+}
+
+static int vcpu_mmu_geometry_update(struct kvm_vcpu *vcpu,
+               struct kvm_book3e_206_tlb_params *params)
+{
+       vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+       if (params->tlb_sizes[0] <= 2048)
+               vcpu->arch.tlbcfg[0] |= params->tlb_sizes[0];
+       vcpu->arch.tlbcfg[0] |= params->tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
+
+       vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+       vcpu->arch.tlbcfg[1] |= params->tlb_sizes[1];
+       vcpu->arch.tlbcfg[1] |= params->tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
+       return 0;
+}
+
 int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
                              struct kvm_config_tlb *cfg)
 {
@@ -692,16 +826,8 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
        vcpu_e500->gtlb_offset[0] = 0;
        vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0];
 
-       vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE;
-
-       vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
-       if (params.tlb_sizes[0] <= 2048)
-               vcpu->arch.tlbcfg[0] |= params.tlb_sizes[0];
-       vcpu->arch.tlbcfg[0] |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
-
-       vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
-       vcpu->arch.tlbcfg[1] |= params.tlb_sizes[1];
-       vcpu->arch.tlbcfg[1] |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
+       /* Update vcpu's MMU geometry based on SW_TLB input */
+       vcpu_mmu_geometry_update(vcpu, &params);
 
        vcpu_e500->shared_tlb_pages = pages;
        vcpu_e500->num_shared_tlb_pages = num_pages;
@@ -737,6 +863,39 @@ int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
        return 0;
 }
 
+/* Vcpu's MMU default configuration */
+static int vcpu_mmu_init(struct kvm_vcpu *vcpu,
+                      struct kvmppc_e500_tlb_params *params)
+{
+       /* Initialize RASIZE, PIDSIZE, NTLBS and MAVN fields with host values */
+       vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE;
+
+       /* Initialize TLBnCFG fields with host values and SW_TLB geometry */
+       vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
+                            ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+       vcpu->arch.tlbcfg[0] |= params[0].entries;
+       vcpu->arch.tlbcfg[0] |= params[0].ways << TLBnCFG_ASSOC_SHIFT;
+
+       vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) &
+                            ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+       vcpu->arch.tlbcfg[1] |= params[1].entries;
+       vcpu->arch.tlbcfg[1] |= params[1].ways << TLBnCFG_ASSOC_SHIFT;
+
+       if (has_feature(vcpu, VCPU_FTR_MMU_V2)) {
+               vcpu->arch.tlbps[0] = mfspr(SPRN_TLB0PS);
+               vcpu->arch.tlbps[1] = mfspr(SPRN_TLB1PS);
+
+               vcpu->arch.mmucfg &= ~MMUCFG_LRAT;
+
+               /* Guest mmu emulation currently doesn't handle E.PT */
+               vcpu->arch.eptcfg = 0;
+               vcpu->arch.tlbcfg[0] &= ~TLBnCFG_PT;
+               vcpu->arch.tlbcfg[1] &= ~TLBnCFG_IND;
+       }
+
+       return 0;
+}
+
 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
        struct kvm_vcpu *vcpu = &vcpu_e500->vcpu;
@@ -781,18 +940,7 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
        if (!vcpu_e500->g2h_tlb1_map)
                goto err;
 
-       /* Init TLB configuration register */
-       vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
-                            ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
-       vcpu->arch.tlbcfg[0] |= vcpu_e500->gtlb_params[0].entries;
-       vcpu->arch.tlbcfg[0] |=
-               vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT;
-
-       vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) &
-                            ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
-       vcpu->arch.tlbcfg[1] |= vcpu_e500->gtlb_params[1].entries;
-       vcpu->arch.tlbcfg[1] |=
-               vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT;
+       vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params);
 
        kvmppc_recalc_tlb1map_range(vcpu_e500);
        return 0;
index 2f4baa074b2ebf0bf9cfe497055afbe9460ec4a5..753cc99eff2be8dab0f55c7d34409dc33d30a7fa 100644 (file)
@@ -177,6 +177,8 @@ int kvmppc_core_check_processor_compat(void)
                r = 0;
        else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0)
                r = 0;
+       else if (strcmp(cur_cpu_spec->cpu_name, "e6500") == 0)
+               r = 0;
        else
                r = -ENOTSUPP;
 
@@ -260,6 +262,20 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
        return kvmppc_set_sregs_ivor(vcpu, sregs);
 }
 
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
+                       union kvmppc_one_reg *val)
+{
+       int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+       return r;
+}
+
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
+                      union kvmppc_one_reg *val)
+{
+       int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val);
+       return r;
+}
+
 struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 {
        struct kvmppc_vcpu_e500 *vcpu_e500;
index 7a73b6f72a8ba4a9031d3426c93b32abc03e0951..631a2650e4e42c0038524b0940a9ae91bcdcd3a4 100644 (file)
@@ -38,6 +38,7 @@
 
 #define OP_31_XOP_TRAP      4
 #define OP_31_XOP_LWZX      23
+#define OP_31_XOP_DCBST     54
 #define OP_31_XOP_TRAP_64   68
 #define OP_31_XOP_DCBF      86
 #define OP_31_XOP_LBZX      87
@@ -370,6 +371,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);
                        break;
 
+               case OP_31_XOP_DCBST:
                case OP_31_XOP_DCBF:
                case OP_31_XOP_DCBI:
                        /* Do nothing. The guest is performing dcbi because
diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
new file mode 100644 (file)
index 0000000..5a9a10b
--- /dev/null
@@ -0,0 +1,20 @@
+#ifndef __IRQ_H
+#define __IRQ_H
+
+#include <linux/kvm_host.h>
+
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+       int ret = 0;
+
+#ifdef CONFIG_KVM_MPIC
+       ret = ret || (kvm->arch.mpic != NULL);
+#endif
+#ifdef CONFIG_KVM_XICS
+       ret = ret || (kvm->arch.xics != NULL);
+#endif
+       smp_rmb();
+       return ret;
+}
+
+#endif
diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
new file mode 100644 (file)
index 0000000..2861ae9
--- /dev/null
@@ -0,0 +1,1853 @@
+/*
+ * OpenPIC emulation
+ *
+ * Copyright (c) 2004 Jocelyn Mayer
+ *               2011 Alexander Graf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <asm/uaccess.h>
+#include <asm/mpic.h>
+#include <asm/kvm_para.h>
+#include <asm/kvm_host.h>
+#include <asm/kvm_ppc.h>
+#include "iodev.h"
+
+#define MAX_CPU     32
+#define MAX_SRC     256
+#define MAX_TMR     4
+#define MAX_IPI     4
+#define MAX_MSI     8
+#define MAX_IRQ     (MAX_SRC + MAX_IPI + MAX_TMR)
+#define VID         0x03       /* MPIC version ID */
+
+/* OpenPIC capability flags */
+#define OPENPIC_FLAG_IDR_CRIT     (1 << 0)
+#define OPENPIC_FLAG_ILR          (2 << 0)
+
+/* OpenPIC address map */
+#define OPENPIC_REG_SIZE             0x40000
+#define OPENPIC_GLB_REG_START        0x0
+#define OPENPIC_GLB_REG_SIZE         0x10F0
+#define OPENPIC_TMR_REG_START        0x10F0
+#define OPENPIC_TMR_REG_SIZE         0x220
+#define OPENPIC_MSI_REG_START        0x1600
+#define OPENPIC_MSI_REG_SIZE         0x200
+#define OPENPIC_SUMMARY_REG_START    0x3800
+#define OPENPIC_SUMMARY_REG_SIZE     0x800
+#define OPENPIC_SRC_REG_START        0x10000
+#define OPENPIC_SRC_REG_SIZE         (MAX_SRC * 0x20)
+#define OPENPIC_CPU_REG_START        0x20000
+#define OPENPIC_CPU_REG_SIZE         (0x100 + ((MAX_CPU - 1) * 0x1000))
+
+struct fsl_mpic_info {
+       int max_ext;
+};
+
+static struct fsl_mpic_info fsl_mpic_20 = {
+       .max_ext = 12,
+};
+
+static struct fsl_mpic_info fsl_mpic_42 = {
+       .max_ext = 12,
+};
+
+#define FRR_NIRQ_SHIFT    16
+#define FRR_NCPU_SHIFT     8
+#define FRR_VID_SHIFT      0
+
+#define VID_REVISION_1_2   2
+#define VID_REVISION_1_3   3
+
+#define VIR_GENERIC      0x00000000    /* Generic Vendor ID */
+
+#define GCR_RESET        0x80000000
+#define GCR_MODE_PASS    0x00000000
+#define GCR_MODE_MIXED   0x20000000
+#define GCR_MODE_PROXY   0x60000000
+
+#define TBCR_CI           0x80000000   /* count inhibit */
+#define TCCR_TOG          0x80000000   /* toggles when decrement to zero */
+
+#define IDR_EP_SHIFT      31
+#define IDR_EP_MASK       (1 << IDR_EP_SHIFT)
+#define IDR_CI0_SHIFT     30
+#define IDR_CI1_SHIFT     29
+#define IDR_P1_SHIFT      1
+#define IDR_P0_SHIFT      0
+
+#define ILR_INTTGT_MASK   0x000000ff
+#define ILR_INTTGT_INT    0x00
+#define ILR_INTTGT_CINT   0x01 /* critical */
+#define ILR_INTTGT_MCP    0x02 /* machine check */
+#define NUM_OUTPUTS       3
+
+#define MSIIR_OFFSET       0x140
+#define MSIIR_SRS_SHIFT    29
+#define MSIIR_SRS_MASK     (0x7 << MSIIR_SRS_SHIFT)
+#define MSIIR_IBS_SHIFT    24
+#define MSIIR_IBS_MASK     (0x1f << MSIIR_IBS_SHIFT)
+
+static int get_current_cpu(void)
+{
+#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
+       struct kvm_vcpu *vcpu = current->thread.kvm_vcpu;
+       return vcpu ? vcpu->arch.irq_cpu_id : -1;
+#else
+       /* XXX */
+       return -1;
+#endif
+}
+
+static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
+                                     u32 val, int idx);
+static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
+                                    u32 *ptr, int idx);
+
+enum irq_type {
+       IRQ_TYPE_NORMAL = 0,
+       IRQ_TYPE_FSLINT,        /* FSL internal interrupt -- level only */
+       IRQ_TYPE_FSLSPECIAL,    /* FSL timer/IPI interrupt, edge, no polarity */
+};
+
+struct irq_queue {
+       /* Round up to the nearest 64 IRQs so that the queue length
+        * won't change when moving between 32 and 64 bit hosts.
+        */
+       unsigned long queue[BITS_TO_LONGS((MAX_IRQ + 63) & ~63)];
+       int next;
+       int priority;
+};
+
+struct irq_source {
+       uint32_t ivpr;          /* IRQ vector/priority register */
+       uint32_t idr;           /* IRQ destination register */
+       uint32_t destmask;      /* bitmap of CPU destinations */
+       int last_cpu;
+       int output;             /* IRQ level, e.g. ILR_INTTGT_INT */
+       int pending;            /* TRUE if IRQ is pending */
+       enum irq_type type;
+       bool level:1;           /* level-triggered */
+       bool nomask:1;  /* critical interrupts ignore mask on some FSL MPICs */
+};
+
+#define IVPR_MASK_SHIFT       31
+#define IVPR_MASK_MASK        (1 << IVPR_MASK_SHIFT)
+#define IVPR_ACTIVITY_SHIFT   30
+#define IVPR_ACTIVITY_MASK    (1 << IVPR_ACTIVITY_SHIFT)
+#define IVPR_MODE_SHIFT       29
+#define IVPR_MODE_MASK        (1 << IVPR_MODE_SHIFT)
+#define IVPR_POLARITY_SHIFT   23
+#define IVPR_POLARITY_MASK    (1 << IVPR_POLARITY_SHIFT)
+#define IVPR_SENSE_SHIFT      22
+#define IVPR_SENSE_MASK       (1 << IVPR_SENSE_SHIFT)
+
+#define IVPR_PRIORITY_MASK     (0xF << 16)
+#define IVPR_PRIORITY(_ivprr_) ((int)(((_ivprr_) & IVPR_PRIORITY_MASK) >> 16))
+#define IVPR_VECTOR(opp, _ivprr_) ((_ivprr_) & (opp)->vector_mask)
+
+/* IDR[EP/CI] are only for FSL MPIC prior to v4.0 */
+#define IDR_EP      0x80000000 /* external pin */
+#define IDR_CI      0x40000000 /* critical interrupt */
+
+struct irq_dest {
+       struct kvm_vcpu *vcpu;
+
+       int32_t ctpr;           /* CPU current task priority */
+       struct irq_queue raised;
+       struct irq_queue servicing;
+
+       /* Count of IRQ sources asserting on non-INT outputs */
+       uint32_t outputs_active[NUM_OUTPUTS];
+};
+
+#define MAX_MMIO_REGIONS 10
+
+struct openpic {
+       struct kvm *kvm;
+       struct kvm_device *dev;
+       struct kvm_io_device mmio;
+       const struct mem_reg *mmio_regions[MAX_MMIO_REGIONS];
+       int num_mmio_regions;
+
+       gpa_t reg_base;
+       spinlock_t lock;
+
+       /* Behavior control */
+       struct fsl_mpic_info *fsl;
+       uint32_t model;
+       uint32_t flags;
+       uint32_t nb_irqs;
+       uint32_t vid;
+       uint32_t vir;           /* Vendor identification register */
+       uint32_t vector_mask;
+       uint32_t tfrr_reset;
+       uint32_t ivpr_reset;
+       uint32_t idr_reset;
+       uint32_t brr1;
+       uint32_t mpic_mode_mask;
+
+       /* Global registers */
+       uint32_t frr;           /* Feature reporting register */
+       uint32_t gcr;           /* Global configuration register  */
+       uint32_t pir;           /* Processor initialization register */
+       uint32_t spve;          /* Spurious vector register */
+       uint32_t tfrr;          /* Timer frequency reporting register */
+       /* Source registers */
+       struct irq_source src[MAX_IRQ];
+       /* Local registers per output pin */
+       struct irq_dest dst[MAX_CPU];
+       uint32_t nb_cpus;
+       /* Timer registers */
+       struct {
+               uint32_t tccr;  /* Global timer current count register */
+               uint32_t tbcr;  /* Global timer base count register */
+       } timers[MAX_TMR];
+       /* Shared MSI registers */
+       struct {
+               uint32_t msir;  /* Shared Message Signaled Interrupt Register */
+       } msi[MAX_MSI];
+       uint32_t max_irq;
+       uint32_t irq_ipi0;
+       uint32_t irq_tim0;
+       uint32_t irq_msi;
+};
+
+
+static void mpic_irq_raise(struct openpic *opp, struct irq_dest *dst,
+                          int output)
+{
+       struct kvm_interrupt irq = {
+               .irq = KVM_INTERRUPT_SET_LEVEL,
+       };
+
+       if (!dst->vcpu) {
+               pr_debug("%s: destination cpu %d does not exist\n",
+                        __func__, (int)(dst - &opp->dst[0]));
+               return;
+       }
+
+       pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id,
+               output);
+
+       if (output != ILR_INTTGT_INT)   /* TODO */
+               return;
+
+       kvm_vcpu_ioctl_interrupt(dst->vcpu, &irq);
+}
+
+static void mpic_irq_lower(struct openpic *opp, struct irq_dest *dst,
+                          int output)
+{
+       if (!dst->vcpu) {
+               pr_debug("%s: destination cpu %d does not exist\n",
+                        __func__, (int)(dst - &opp->dst[0]));
+               return;
+       }
+
+       pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id,
+               output);
+
+       if (output != ILR_INTTGT_INT)   /* TODO */
+               return;
+
+       kvmppc_core_dequeue_external(dst->vcpu);
+}
+
+static inline void IRQ_setbit(struct irq_queue *q, int n_IRQ)
+{
+       set_bit(n_IRQ, q->queue);
+}
+
+static inline void IRQ_resetbit(struct irq_queue *q, int n_IRQ)
+{
+       clear_bit(n_IRQ, q->queue);
+}
+
+static inline int IRQ_testbit(struct irq_queue *q, int n_IRQ)
+{
+       return test_bit(n_IRQ, q->queue);
+}
+
+static void IRQ_check(struct openpic *opp, struct irq_queue *q)
+{
+       int irq = -1;
+       int next = -1;
+       int priority = -1;
+
+       for (;;) {
+               irq = find_next_bit(q->queue, opp->max_irq, irq + 1);
+               if (irq == opp->max_irq)
+                       break;
+
+               pr_debug("IRQ_check: irq %d set ivpr_pr=%d pr=%d\n",
+                       irq, IVPR_PRIORITY(opp->src[irq].ivpr), priority);
+
+               if (IVPR_PRIORITY(opp->src[irq].ivpr) > priority) {
+                       next = irq;
+                       priority = IVPR_PRIORITY(opp->src[irq].ivpr);
+               }
+       }
+
+       q->next = next;
+       q->priority = priority;
+}
+
+static int IRQ_get_next(struct openpic *opp, struct irq_queue *q)
+{
+       /* XXX: optimize */
+       IRQ_check(opp, q);
+
+       return q->next;
+}
+
+static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ,
+                          bool active, bool was_active)
+{
+       struct irq_dest *dst;
+       struct irq_source *src;
+       int priority;
+
+       dst = &opp->dst[n_CPU];
+       src = &opp->src[n_IRQ];
+
+       pr_debug("%s: IRQ %d active %d was %d\n",
+               __func__, n_IRQ, active, was_active);
+
+       if (src->output != ILR_INTTGT_INT) {
+               pr_debug("%s: output %d irq %d active %d was %d count %d\n",
+                       __func__, src->output, n_IRQ, active, was_active,
+                       dst->outputs_active[src->output]);
+
+               /* On Freescale MPIC, critical interrupts ignore priority,
+                * IACK, EOI, etc.  Before MPIC v4.1 they also ignore
+                * masking.
+                */
+               if (active) {
+                       if (!was_active &&
+                           dst->outputs_active[src->output]++ == 0) {
+                               pr_debug("%s: Raise OpenPIC output %d cpu %d irq %d\n",
+                                       __func__, src->output, n_CPU, n_IRQ);
+                               mpic_irq_raise(opp, dst, src->output);
+                       }
+               } else {
+                       if (was_active &&
+                           --dst->outputs_active[src->output] == 0) {
+                               pr_debug("%s: Lower OpenPIC output %d cpu %d irq %d\n",
+                                       __func__, src->output, n_CPU, n_IRQ);
+                               mpic_irq_lower(opp, dst, src->output);
+                       }
+               }
+
+               return;
+       }
+
+       priority = IVPR_PRIORITY(src->ivpr);
+
+       /* Even if the interrupt doesn't have enough priority,
+        * it is still raised, in case ctpr is lowered later.
+        */
+       if (active)
+               IRQ_setbit(&dst->raised, n_IRQ);
+       else
+               IRQ_resetbit(&dst->raised, n_IRQ);
+
+       IRQ_check(opp, &dst->raised);
+
+       if (active && priority <= dst->ctpr) {
+               pr_debug("%s: IRQ %d priority %d too low for ctpr %d on CPU %d\n",
+                       __func__, n_IRQ, priority, dst->ctpr, n_CPU);
+               active = 0;
+       }
+
+       if (active) {
+               if (IRQ_get_next(opp, &dst->servicing) >= 0 &&
+                   priority <= dst->servicing.priority) {
+                       pr_debug("%s: IRQ %d is hidden by servicing IRQ %d on CPU %d\n",
+                               __func__, n_IRQ, dst->servicing.next, n_CPU);
+               } else {
+                       pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n",
+                               __func__, n_CPU, n_IRQ, dst->raised.next);
+                       mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+               }
+       } else {
+               IRQ_get_next(opp, &dst->servicing);
+               if (dst->raised.priority > dst->ctpr &&
+                   dst->raised.priority > dst->servicing.priority) {
+                       pr_debug("%s: IRQ %d inactive, IRQ %d prio %d above %d/%d, CPU %d\n",
+                               __func__, n_IRQ, dst->raised.next,
+                               dst->raised.priority, dst->ctpr,
+                               dst->servicing.priority, n_CPU);
+                       /* IRQ line stays asserted */
+               } else {
+                       pr_debug("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n",
+                               __func__, n_IRQ, dst->ctpr,
+                               dst->servicing.priority, n_CPU);
+                       mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+               }
+       }
+}
+
+/* update pic state because registers for n_IRQ have changed value */
+static void openpic_update_irq(struct openpic *opp, int n_IRQ)
+{
+       struct irq_source *src;
+       bool active, was_active;
+       int i;
+
+       src = &opp->src[n_IRQ];
+       active = src->pending;
+
+       if ((src->ivpr & IVPR_MASK_MASK) && !src->nomask) {
+               /* Interrupt source is disabled */
+               pr_debug("%s: IRQ %d is disabled\n", __func__, n_IRQ);
+               active = false;
+       }
+
+       was_active = !!(src->ivpr & IVPR_ACTIVITY_MASK);
+
+       /*
+        * We don't have a similar check for already-active because
+        * ctpr may have changed and we need to withdraw the interrupt.
+        */
+       if (!active && !was_active) {
+               pr_debug("%s: IRQ %d is already inactive\n", __func__, n_IRQ);
+               return;
+       }
+
+       if (active)
+               src->ivpr |= IVPR_ACTIVITY_MASK;
+       else
+               src->ivpr &= ~IVPR_ACTIVITY_MASK;
+
+       if (src->destmask == 0) {
+               /* No target */
+               pr_debug("%s: IRQ %d has no target\n", __func__, n_IRQ);
+               return;
+       }
+
+       if (src->destmask == (1 << src->last_cpu)) {
+               /* Only one CPU is allowed to receive this IRQ */
+               IRQ_local_pipe(opp, src->last_cpu, n_IRQ, active, was_active);
+       } else if (!(src->ivpr & IVPR_MODE_MASK)) {
+               /* Directed delivery mode */
+               for (i = 0; i < opp->nb_cpus; i++) {
+                       if (src->destmask & (1 << i)) {
+                               IRQ_local_pipe(opp, i, n_IRQ, active,
+                                              was_active);
+                       }
+               }
+       } else {
+               /* Distributed delivery mode */
+               for (i = src->last_cpu + 1; i != src->last_cpu; i++) {
+                       if (i == opp->nb_cpus)
+                               i = 0;
+
+                       if (src->destmask & (1 << i)) {
+                               IRQ_local_pipe(opp, i, n_IRQ, active,
+                                              was_active);
+                               src->last_cpu = i;
+                               break;
+                       }
+               }
+       }
+}
+
+static void openpic_set_irq(void *opaque, int n_IRQ, int level)
+{
+       struct openpic *opp = opaque;
+       struct irq_source *src;
+
+       if (n_IRQ >= MAX_IRQ) {
+               WARN_ONCE(1, "%s: IRQ %d out of range\n", __func__, n_IRQ);
+               return;
+       }
+
+       src = &opp->src[n_IRQ];
+       pr_debug("openpic: set irq %d = %d ivpr=0x%08x\n",
+               n_IRQ, level, src->ivpr);
+       if (src->level) {
+               /* level-sensitive irq */
+               src->pending = level;
+               openpic_update_irq(opp, n_IRQ);
+       } else {
+               /* edge-sensitive irq */
+               if (level) {
+                       src->pending = 1;
+                       openpic_update_irq(opp, n_IRQ);
+               }
+
+               if (src->output != ILR_INTTGT_INT) {
+                       /* Edge-triggered interrupts shouldn't be used
+                        * with non-INT delivery, but just in case,
+                        * try to make it do something sane rather than
+                        * cause an interrupt storm.  This is close to
+                        * what you'd probably see happen in real hardware.
+                        */
+                       src->pending = 0;
+                       openpic_update_irq(opp, n_IRQ);
+               }
+       }
+}
+
+static void openpic_reset(struct openpic *opp)
+{
+       int i;
+
+       opp->gcr = GCR_RESET;
+       /* Initialise controller registers */
+       opp->frr = ((opp->nb_irqs - 1) << FRR_NIRQ_SHIFT) |
+           (opp->vid << FRR_VID_SHIFT);
+
+       opp->pir = 0;
+       opp->spve = -1 & opp->vector_mask;
+       opp->tfrr = opp->tfrr_reset;
+       /* Initialise IRQ sources */
+       for (i = 0; i < opp->max_irq; i++) {
+               opp->src[i].ivpr = opp->ivpr_reset;
+               opp->src[i].idr = opp->idr_reset;
+
+               switch (opp->src[i].type) {
+               case IRQ_TYPE_NORMAL:
+                       opp->src[i].level =
+                           !!(opp->ivpr_reset & IVPR_SENSE_MASK);
+                       break;
+
+               case IRQ_TYPE_FSLINT:
+                       opp->src[i].ivpr |= IVPR_POLARITY_MASK;
+                       break;
+
+               case IRQ_TYPE_FSLSPECIAL:
+                       break;
+               }
+       }
+       /* Initialise IRQ destinations */
+       for (i = 0; i < MAX_CPU; i++) {
+               opp->dst[i].ctpr = 15;
+               memset(&opp->dst[i].raised, 0, sizeof(struct irq_queue));
+               opp->dst[i].raised.next = -1;
+               memset(&opp->dst[i].servicing, 0, sizeof(struct irq_queue));
+               opp->dst[i].servicing.next = -1;
+       }
+       /* Initialise timers */
+       for (i = 0; i < MAX_TMR; i++) {
+               opp->timers[i].tccr = 0;
+               opp->timers[i].tbcr = TBCR_CI;
+       }
+       /* Go out of RESET state */
+       opp->gcr = 0;
+}
+
+static inline uint32_t read_IRQreg_idr(struct openpic *opp, int n_IRQ)
+{
+       return opp->src[n_IRQ].idr;
+}
+
+static inline uint32_t read_IRQreg_ilr(struct openpic *opp, int n_IRQ)
+{
+       if (opp->flags & OPENPIC_FLAG_ILR)
+               return opp->src[n_IRQ].output;
+
+       return 0xffffffff;
+}
+
+static inline uint32_t read_IRQreg_ivpr(struct openpic *opp, int n_IRQ)
+{
+       return opp->src[n_IRQ].ivpr;
+}
+
+static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ,
+                                   uint32_t val)
+{
+       struct irq_source *src = &opp->src[n_IRQ];
+       uint32_t normal_mask = (1UL << opp->nb_cpus) - 1;
+       uint32_t crit_mask = 0;
+       uint32_t mask = normal_mask;
+       int crit_shift = IDR_EP_SHIFT - opp->nb_cpus;
+       int i;
+
+       if (opp->flags & OPENPIC_FLAG_IDR_CRIT) {
+               crit_mask = mask << crit_shift;
+               mask |= crit_mask | IDR_EP;
+       }
+
+       src->idr = val & mask;
+       pr_debug("Set IDR %d to 0x%08x\n", n_IRQ, src->idr);
+
+       if (opp->flags & OPENPIC_FLAG_IDR_CRIT) {
+               if (src->idr & crit_mask) {
+                       if (src->idr & normal_mask) {
+                               pr_debug("%s: IRQ configured for multiple output types, using critical\n",
+                                       __func__);
+                       }
+
+                       src->output = ILR_INTTGT_CINT;
+                       src->nomask = true;
+                       src->destmask = 0;
+
+                       for (i = 0; i < opp->nb_cpus; i++) {
+                               int n_ci = IDR_CI0_SHIFT - i;
+
+                               if (src->idr & (1UL << n_ci))
+                                       src->destmask |= 1UL << i;
+                       }
+               } else {
+                       src->output = ILR_INTTGT_INT;
+                       src->nomask = false;
+                       src->destmask = src->idr & normal_mask;
+               }
+       } else {
+               src->destmask = src->idr;
+       }
+}
+
+static inline void write_IRQreg_ilr(struct openpic *opp, int n_IRQ,
+                                   uint32_t val)
+{
+       if (opp->flags & OPENPIC_FLAG_ILR) {
+               struct irq_source *src = &opp->src[n_IRQ];
+
+               src->output = val & ILR_INTTGT_MASK;
+               pr_debug("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr,
+                       src->output);
+
+               /* TODO: on MPIC v4.0 only, set nomask for non-INT */
+       }
+}
+
+static inline void write_IRQreg_ivpr(struct openpic *opp, int n_IRQ,
+                                    uint32_t val)
+{
+       uint32_t mask;
+
+       /* NOTE when implementing newer FSL MPIC models: starting with v4.0,
+        * the polarity bit is read-only on internal interrupts.
+        */
+       mask = IVPR_MASK_MASK | IVPR_PRIORITY_MASK | IVPR_SENSE_MASK |
+           IVPR_POLARITY_MASK | opp->vector_mask;
+
+       /* ACTIVITY bit is read-only */
+       opp->src[n_IRQ].ivpr =
+           (opp->src[n_IRQ].ivpr & IVPR_ACTIVITY_MASK) | (val & mask);
+
+       /* For FSL internal interrupts, the sense bit is reserved and zero,
+        * and the interrupt is always level-triggered.  Timers and IPIs
+        * have no sense or polarity bits, and are edge-triggered.
+        */
+       switch (opp->src[n_IRQ].type) {
+       case IRQ_TYPE_NORMAL:
+               opp->src[n_IRQ].level =
+                   !!(opp->src[n_IRQ].ivpr & IVPR_SENSE_MASK);
+               break;
+
+       case IRQ_TYPE_FSLINT:
+               opp->src[n_IRQ].ivpr &= ~IVPR_SENSE_MASK;
+               break;
+
+       case IRQ_TYPE_FSLSPECIAL:
+               opp->src[n_IRQ].ivpr &= ~(IVPR_POLARITY_MASK | IVPR_SENSE_MASK);
+               break;
+       }
+
+       openpic_update_irq(opp, n_IRQ);
+       pr_debug("Set IVPR %d to 0x%08x -> 0x%08x\n", n_IRQ, val,
+               opp->src[n_IRQ].ivpr);
+}
+
+static void openpic_gcr_write(struct openpic *opp, uint64_t val)
+{
+       if (val & GCR_RESET) {
+               openpic_reset(opp);
+               return;
+       }
+
+       opp->gcr &= ~opp->mpic_mode_mask;
+       opp->gcr |= val & opp->mpic_mode_mask;
+}
+
+static int openpic_gbl_write(void *opaque, gpa_t addr, u32 val)
+{
+       struct openpic *opp = opaque;
+       int err = 0;
+
+       pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+       if (addr & 0xF)
+               return 0;
+
+       switch (addr) {
+       case 0x00:      /* Block Revision Register1 (BRR1) is Readonly */
+               break;
+       case 0x40:
+       case 0x50:
+       case 0x60:
+       case 0x70:
+       case 0x80:
+       case 0x90:
+       case 0xA0:
+       case 0xB0:
+               err = openpic_cpu_write_internal(opp, addr, val,
+                                                get_current_cpu());
+               break;
+       case 0x1000:            /* FRR */
+               break;
+       case 0x1020:            /* GCR */
+               openpic_gcr_write(opp, val);
+               break;
+       case 0x1080:            /* VIR */
+               break;
+       case 0x1090:            /* PIR */
+               /*
+                * This register is used to reset a CPU core --
+                * let userspace handle it.
+                */
+               err = -ENXIO;
+               break;
+       case 0x10A0:            /* IPI_IVPR */
+       case 0x10B0:
+       case 0x10C0:
+       case 0x10D0: {
+               int idx;
+               idx = (addr - 0x10A0) >> 4;
+               write_IRQreg_ivpr(opp, opp->irq_ipi0 + idx, val);
+               break;
+       }
+       case 0x10E0:            /* SPVE */
+               opp->spve = val & opp->vector_mask;
+               break;
+       default:
+               break;
+       }
+
+       return err;
+}
+
+static int openpic_gbl_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+       struct openpic *opp = opaque;
+       u32 retval;
+       int err = 0;
+
+       pr_debug("%s: addr %#llx\n", __func__, addr);
+       retval = 0xFFFFFFFF;
+       if (addr & 0xF)
+               goto out;
+
+       switch (addr) {
+       case 0x1000:            /* FRR */
+               retval = opp->frr;
+               retval |= (opp->nb_cpus - 1) << FRR_NCPU_SHIFT;
+               break;
+       case 0x1020:            /* GCR */
+               retval = opp->gcr;
+               break;
+       case 0x1080:            /* VIR */
+               retval = opp->vir;
+               break;
+       case 0x1090:            /* PIR */
+               retval = 0x00000000;
+               break;
+       case 0x00:              /* Block Revision Register1 (BRR1) */
+               retval = opp->brr1;
+               break;
+       case 0x40:
+       case 0x50:
+       case 0x60:
+       case 0x70:
+       case 0x80:
+       case 0x90:
+       case 0xA0:
+       case 0xB0:
+               err = openpic_cpu_read_internal(opp, addr,
+                       &retval, get_current_cpu());
+               break;
+       case 0x10A0:            /* IPI_IVPR */
+       case 0x10B0:
+       case 0x10C0:
+       case 0x10D0:
+               {
+                       int idx;
+                       idx = (addr - 0x10A0) >> 4;
+                       retval = read_IRQreg_ivpr(opp, opp->irq_ipi0 + idx);
+               }
+               break;
+       case 0x10E0:            /* SPVE */
+               retval = opp->spve;
+               break;
+       default:
+               break;
+       }
+
+out:
+       pr_debug("%s: => 0x%08x\n", __func__, retval);
+       *ptr = retval;
+       return err;
+}
+
+static int openpic_tmr_write(void *opaque, gpa_t addr, u32 val)
+{
+       struct openpic *opp = opaque;
+       int idx;
+
+       addr += 0x10f0;
+
+       pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+       if (addr & 0xF)
+               return 0;
+
+       if (addr == 0x10f0) {
+               /* TFRR */
+               opp->tfrr = val;
+               return 0;
+       }
+
+       idx = (addr >> 6) & 0x3;
+       addr = addr & 0x30;
+
+       switch (addr & 0x30) {
+       case 0x00:              /* TCCR */
+               break;
+       case 0x10:              /* TBCR */
+               if ((opp->timers[idx].tccr & TCCR_TOG) != 0 &&
+                   (val & TBCR_CI) == 0 &&
+                   (opp->timers[idx].tbcr & TBCR_CI) != 0)
+                       opp->timers[idx].tccr &= ~TCCR_TOG;
+
+               opp->timers[idx].tbcr = val;
+               break;
+       case 0x20:              /* TVPR */
+               write_IRQreg_ivpr(opp, opp->irq_tim0 + idx, val);
+               break;
+       case 0x30:              /* TDR */
+               write_IRQreg_idr(opp, opp->irq_tim0 + idx, val);
+               break;
+       }
+
+       return 0;
+}
+
+static int openpic_tmr_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+       struct openpic *opp = opaque;
+       uint32_t retval = -1;
+       int idx;
+
+       pr_debug("%s: addr %#llx\n", __func__, addr);
+       if (addr & 0xF)
+               goto out;
+
+       idx = (addr >> 6) & 0x3;
+       if (addr == 0x0) {
+               /* TFRR */
+               retval = opp->tfrr;
+               goto out;
+       }
+
+       switch (addr & 0x30) {
+       case 0x00:              /* TCCR */
+               retval = opp->timers[idx].tccr;
+               break;
+       case 0x10:              /* TBCR */
+               retval = opp->timers[idx].tbcr;
+               break;
+       case 0x20:              /* TIPV */
+               retval = read_IRQreg_ivpr(opp, opp->irq_tim0 + idx);
+               break;
+       case 0x30:              /* TIDE (TIDR) */
+               retval = read_IRQreg_idr(opp, opp->irq_tim0 + idx);
+               break;
+       }
+
+out:
+       pr_debug("%s: => 0x%08x\n", __func__, retval);
+       *ptr = retval;
+       return 0;
+}
+
+static int openpic_src_write(void *opaque, gpa_t addr, u32 val)
+{
+       struct openpic *opp = opaque;
+       int idx;
+
+       pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+
+       addr = addr & 0xffff;
+       idx = addr >> 5;
+
+       switch (addr & 0x1f) {
+       case 0x00:
+               write_IRQreg_ivpr(opp, idx, val);
+               break;
+       case 0x10:
+               write_IRQreg_idr(opp, idx, val);
+               break;
+       case 0x18:
+               write_IRQreg_ilr(opp, idx, val);
+               break;
+       }
+
+       return 0;
+}
+
+static int openpic_src_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+       struct openpic *opp = opaque;
+       uint32_t retval;
+       int idx;
+
+       pr_debug("%s: addr %#llx\n", __func__, addr);
+       retval = 0xFFFFFFFF;
+
+       addr = addr & 0xffff;
+       idx = addr >> 5;
+
+       switch (addr & 0x1f) {
+       case 0x00:
+               retval = read_IRQreg_ivpr(opp, idx);
+               break;
+       case 0x10:
+               retval = read_IRQreg_idr(opp, idx);
+               break;
+       case 0x18:
+               retval = read_IRQreg_ilr(opp, idx);
+               break;
+       }
+
+       pr_debug("%s: => 0x%08x\n", __func__, retval);
+       *ptr = retval;
+       return 0;
+}
+
+static int openpic_msi_write(void *opaque, gpa_t addr, u32 val)
+{
+       struct openpic *opp = opaque;
+       int idx = opp->irq_msi;
+       int srs, ibs;
+
+       pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
+       if (addr & 0xF)
+               return 0;
+
+       switch (addr) {
+       case MSIIR_OFFSET:
+               srs = val >> MSIIR_SRS_SHIFT;
+               idx += srs;
+               ibs = (val & MSIIR_IBS_MASK) >> MSIIR_IBS_SHIFT;
+               opp->msi[srs].msir |= 1 << ibs;
+               openpic_set_irq(opp, idx, 1);
+               break;
+       default:
+               /* most registers are read-only, thus ignored */
+               break;
+       }
+
+       return 0;
+}
+
+static int openpic_msi_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+       struct openpic *opp = opaque;
+       uint32_t r = 0;
+       int i, srs;
+
+       pr_debug("%s: addr %#llx\n", __func__, addr);
+       if (addr & 0xF)
+               return -ENXIO;
+
+       srs = addr >> 4;
+
+       switch (addr) {
+       case 0x00:
+       case 0x10:
+       case 0x20:
+       case 0x30:
+       case 0x40:
+       case 0x50:
+       case 0x60:
+       case 0x70:              /* MSIRs */
+               r = opp->msi[srs].msir;
+               /* Clear on read */
+               opp->msi[srs].msir = 0;
+               openpic_set_irq(opp, opp->irq_msi + srs, 0);
+               break;
+       case 0x120:             /* MSISR */
+               for (i = 0; i < MAX_MSI; i++)
+                       r |= (opp->msi[i].msir ? 1 : 0) << i;
+               break;
+       }
+
+       pr_debug("%s: => 0x%08x\n", __func__, r);
+       *ptr = r;
+       return 0;
+}
+
+static int openpic_summary_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+       uint32_t r = 0;
+
+       pr_debug("%s: addr %#llx\n", __func__, addr);
+
+       /* TODO: EISR/EIMR */
+
+       *ptr = r;
+       return 0;
+}
+
+static int openpic_summary_write(void *opaque, gpa_t addr, u32 val)
+{
+       pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
+
+       /* TODO: EISR/EIMR */
+       return 0;
+}
+
+static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
+                                     u32 val, int idx)
+{
+       struct openpic *opp = opaque;
+       struct irq_source *src;
+       struct irq_dest *dst;
+       int s_IRQ, n_IRQ;
+
+       pr_debug("%s: cpu %d addr %#llx <= 0x%08x\n", __func__, idx,
+               addr, val);
+
+       if (idx < 0)
+               return 0;
+
+       if (addr & 0xF)
+               return 0;
+
+       dst = &opp->dst[idx];
+       addr &= 0xFF0;
+       switch (addr) {
+       case 0x40:              /* IPIDR */
+       case 0x50:
+       case 0x60:
+       case 0x70:
+               idx = (addr - 0x40) >> 4;
+               /* we still use IDE as a mask of which CPUs to deliver the IPI to. */
+               opp->src[opp->irq_ipi0 + idx].destmask |= val;
+               openpic_set_irq(opp, opp->irq_ipi0 + idx, 1);
+               openpic_set_irq(opp, opp->irq_ipi0 + idx, 0);
+               break;
+       case 0x80:              /* CTPR */
+               dst->ctpr = val & 0x0000000F;
+
+               pr_debug("%s: set CPU %d ctpr to %d, raised %d servicing %d\n",
+                       __func__, idx, dst->ctpr, dst->raised.priority,
+                       dst->servicing.priority);
+
+               if (dst->raised.priority <= dst->ctpr) {
+                       pr_debug("%s: Lower OpenPIC INT output cpu %d due to ctpr\n",
+                               __func__, idx);
+                       mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+               } else if (dst->raised.priority > dst->servicing.priority) {
+                       pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d\n",
+                               __func__, idx, dst->raised.next);
+                       mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+               }
+
+               break;
+       case 0x90:              /* WHOAMI */
+               /* Read-only register */
+               break;
+       case 0xA0:              /* IACK */
+               /* Read-only register */
+               break;
+       case 0xB0: {            /* EOI */
+               int notify_eoi;
+
+               pr_debug("EOI\n");
+               s_IRQ = IRQ_get_next(opp, &dst->servicing);
+
+               if (s_IRQ < 0) {
+                       pr_debug("%s: EOI with no interrupt in service\n",
+                               __func__);
+                       break;
+               }
+
+               IRQ_resetbit(&dst->servicing, s_IRQ);
+               /* Notify listeners that the IRQ is over */
+               notify_eoi = s_IRQ;
+               /* Set up next servicing IRQ */
+               s_IRQ = IRQ_get_next(opp, &dst->servicing);
+               /* Check queued interrupts. */
+               n_IRQ = IRQ_get_next(opp, &dst->raised);
+               src = &opp->src[n_IRQ];
+               if (n_IRQ != -1 &&
+                   (s_IRQ == -1 ||
+                    IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) {
+                       pr_debug("Raise OpenPIC INT output cpu %d irq %d\n",
+                               idx, n_IRQ);
+                       mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+               }
+
+               spin_unlock(&opp->lock);
+               kvm_notify_acked_irq(opp->kvm, 0, notify_eoi);
+               spin_lock(&opp->lock);
+
+               break;
+       }
+       default:
+               break;
+       }
+
+       return 0;
+}
+
+static int openpic_cpu_write(void *opaque, gpa_t addr, u32 val)
+{
+       struct openpic *opp = opaque;
+
+       return openpic_cpu_write_internal(opp, addr, val,
+                                        (addr & 0x1f000) >> 12);
+}
+
+static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst,
+                            int cpu)
+{
+       struct irq_source *src;
+       int retval, irq;
+
+       pr_debug("Lower OpenPIC INT output\n");
+       mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+
+       irq = IRQ_get_next(opp, &dst->raised);
+       pr_debug("IACK: irq=%d\n", irq);
+
+       if (irq == -1)
+               /* No more interrupt pending */
+               return opp->spve;
+
+       src = &opp->src[irq];
+       if (!(src->ivpr & IVPR_ACTIVITY_MASK) ||
+           !(IVPR_PRIORITY(src->ivpr) > dst->ctpr)) {
+               pr_err("%s: bad raised IRQ %d ctpr %d ivpr 0x%08x\n",
+                       __func__, irq, dst->ctpr, src->ivpr);
+               openpic_update_irq(opp, irq);
+               retval = opp->spve;
+       } else {
+               /* IRQ enter servicing state */
+               IRQ_setbit(&dst->servicing, irq);
+               retval = IVPR_VECTOR(opp, src->ivpr);
+       }
+
+       if (!src->level) {
+               /* edge-sensitive IRQ */
+               src->ivpr &= ~IVPR_ACTIVITY_MASK;
+               src->pending = 0;
+               IRQ_resetbit(&dst->raised, irq);
+       }
+
+       if ((irq >= opp->irq_ipi0) && (irq < (opp->irq_ipi0 + MAX_IPI))) {
+               src->destmask &= ~(1 << cpu);
+               if (src->destmask && !src->level) {
+                       /* trigger on CPUs that didn't know about it yet */
+                       openpic_set_irq(opp, irq, 1);
+                       openpic_set_irq(opp, irq, 0);
+                       /* if all CPUs knew about it, set active bit again */
+                       src->ivpr |= IVPR_ACTIVITY_MASK;
+               }
+       }
+
+       return retval;
+}
+
+void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
+{
+       struct openpic *opp = vcpu->arch.mpic;
+       int cpu = vcpu->arch.irq_cpu_id;
+       unsigned long flags;
+
+       spin_lock_irqsave(&opp->lock, flags);
+
+       if ((opp->gcr & opp->mpic_mode_mask) == GCR_MODE_PROXY)
+               kvmppc_set_epr(vcpu, openpic_iack(opp, &opp->dst[cpu], cpu));
+
+       spin_unlock_irqrestore(&opp->lock, flags);
+}
+
+static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
+                                    u32 *ptr, int idx)
+{
+       struct openpic *opp = opaque;
+       struct irq_dest *dst;
+       uint32_t retval;
+
+       pr_debug("%s: cpu %d addr %#llx\n", __func__, idx, addr);
+       retval = 0xFFFFFFFF;
+
+       if (idx < 0)
+               goto out;
+
+       if (addr & 0xF)
+               goto out;
+
+       dst = &opp->dst[idx];
+       addr &= 0xFF0;
+       switch (addr) {
+       case 0x80:              /* CTPR */
+               retval = dst->ctpr;
+               break;
+       case 0x90:              /* WHOAMI */
+               retval = idx;
+               break;
+       case 0xA0:              /* IACK */
+               retval = openpic_iack(opp, dst, idx);
+               break;
+       case 0xB0:              /* EOI */
+               retval = 0;
+               break;
+       default:
+               break;
+       }
+       pr_debug("%s: => 0x%08x\n", __func__, retval);
+
+out:
+       *ptr = retval;
+       return 0;
+}
+
+static int openpic_cpu_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+       struct openpic *opp = opaque;
+
+       return openpic_cpu_read_internal(opp, addr, ptr,
+                                        (addr & 0x1f000) >> 12);
+}
+
+struct mem_reg {
+       int (*read)(void *opaque, gpa_t addr, u32 *ptr);
+       int (*write)(void *opaque, gpa_t addr, u32 val);
+       gpa_t start_addr;
+       int size;
+};
+
+static const struct mem_reg openpic_gbl_mmio = {
+       .write = openpic_gbl_write,
+       .read = openpic_gbl_read,
+       .start_addr = OPENPIC_GLB_REG_START,
+       .size = OPENPIC_GLB_REG_SIZE,
+};
+
+static const struct mem_reg openpic_tmr_mmio = {
+       .write = openpic_tmr_write,
+       .read = openpic_tmr_read,
+       .start_addr = OPENPIC_TMR_REG_START,
+       .size = OPENPIC_TMR_REG_SIZE,
+};
+
+static const struct mem_reg openpic_cpu_mmio = {
+       .write = openpic_cpu_write,
+       .read = openpic_cpu_read,
+       .start_addr = OPENPIC_CPU_REG_START,
+       .size = OPENPIC_CPU_REG_SIZE,
+};
+
+static const struct mem_reg openpic_src_mmio = {
+       .write = openpic_src_write,
+       .read = openpic_src_read,
+       .start_addr = OPENPIC_SRC_REG_START,
+       .size = OPENPIC_SRC_REG_SIZE,
+};
+
+static const struct mem_reg openpic_msi_mmio = {
+       .read = openpic_msi_read,
+       .write = openpic_msi_write,
+       .start_addr = OPENPIC_MSI_REG_START,
+       .size = OPENPIC_MSI_REG_SIZE,
+};
+
+static const struct mem_reg openpic_summary_mmio = {
+       .read = openpic_summary_read,
+       .write = openpic_summary_write,
+       .start_addr = OPENPIC_SUMMARY_REG_START,
+       .size = OPENPIC_SUMMARY_REG_SIZE,
+};
+
+static void add_mmio_region(struct openpic *opp, const struct mem_reg *mr)
+{
+       if (opp->num_mmio_regions >= MAX_MMIO_REGIONS) {
+               WARN(1, "kvm mpic: too many mmio regions\n");
+               return;
+       }
+
+       opp->mmio_regions[opp->num_mmio_regions++] = mr;
+}
+
+static void fsl_common_init(struct openpic *opp)
+{
+       int i;
+       int virq = MAX_SRC;
+
+       add_mmio_region(opp, &openpic_msi_mmio);
+       add_mmio_region(opp, &openpic_summary_mmio);
+
+       opp->vid = VID_REVISION_1_2;
+       opp->vir = VIR_GENERIC;
+       opp->vector_mask = 0xFFFF;
+       opp->tfrr_reset = 0;
+       opp->ivpr_reset = IVPR_MASK_MASK;
+       opp->idr_reset = 1 << 0;
+       opp->max_irq = MAX_IRQ;
+
+       opp->irq_ipi0 = virq;
+       virq += MAX_IPI;
+       opp->irq_tim0 = virq;
+       virq += MAX_TMR;
+
+       BUG_ON(virq > MAX_IRQ);
+
+       opp->irq_msi = 224;
+
+       for (i = 0; i < opp->fsl->max_ext; i++)
+               opp->src[i].level = false;
+
+       /* Internal interrupts, including message and MSI */
+       for (i = 16; i < MAX_SRC; i++) {
+               opp->src[i].type = IRQ_TYPE_FSLINT;
+               opp->src[i].level = true;
+       }
+
+       /* timers and IPIs */
+       for (i = MAX_SRC; i < virq; i++) {
+               opp->src[i].type = IRQ_TYPE_FSLSPECIAL;
+               opp->src[i].level = false;
+       }
+}
+
+static int kvm_mpic_read_internal(struct openpic *opp, gpa_t addr, u32 *ptr)
+{
+       int i;
+
+       for (i = 0; i < opp->num_mmio_regions; i++) {
+               const struct mem_reg *mr = opp->mmio_regions[i];
+
+               if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
+                       continue;
+
+               return mr->read(opp, addr - mr->start_addr, ptr);
+       }
+
+       return -ENXIO;
+}
+
+static int kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 val)
+{
+       int i;
+
+       for (i = 0; i < opp->num_mmio_regions; i++) {
+               const struct mem_reg *mr = opp->mmio_regions[i];
+
+               if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
+                       continue;
+
+               return mr->write(opp, addr - mr->start_addr, val);
+       }
+
+       return -ENXIO;
+}
+
+static int kvm_mpic_read(struct kvm_io_device *this, gpa_t addr,
+                        int len, void *ptr)
+{
+       struct openpic *opp = container_of(this, struct openpic, mmio);
+       int ret;
+       union {
+               u32 val;
+               u8 bytes[4];
+       } u;
+
+       if (addr & (len - 1)) {
+               pr_debug("%s: bad alignment %llx/%d\n",
+                        __func__, addr, len);
+               return -EINVAL;
+       }
+
+       spin_lock_irq(&opp->lock);
+       ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val);
+       spin_unlock_irq(&opp->lock);
+
+       /*
+        * Technically only 32-bit accesses are allowed, but be nice to
+        * people dumping registers a byte at a time -- it works in real
+        * hardware (reads only, not writes).
+        */
+       if (len == 4) {
+               *(u32 *)ptr = u.val;
+               pr_debug("%s: addr %llx ret %d len 4 val %x\n",
+                        __func__, addr, ret, u.val);
+       } else if (len == 1) {
+               *(u8 *)ptr = u.bytes[addr & 3];
+               pr_debug("%s: addr %llx ret %d len 1 val %x\n",
+                        __func__, addr, ret, u.bytes[addr & 3]);
+       } else {
+               pr_debug("%s: bad length %d\n", __func__, len);
+               return -EINVAL;
+       }
+
+       return ret;
+}
+
+static int kvm_mpic_write(struct kvm_io_device *this, gpa_t addr,
+                         int len, const void *ptr)
+{
+       struct openpic *opp = container_of(this, struct openpic, mmio);
+       int ret;
+
+       if (len != 4) {
+               pr_debug("%s: bad length %d\n", __func__, len);
+               return -EOPNOTSUPP;
+       }
+       if (addr & 3) {
+               pr_debug("%s: bad alignment %llx/%d\n", __func__, addr, len);
+               return -EOPNOTSUPP;
+       }
+
+       spin_lock_irq(&opp->lock);
+       ret = kvm_mpic_write_internal(opp, addr - opp->reg_base,
+                                     *(const u32 *)ptr);
+       spin_unlock_irq(&opp->lock);
+
+       pr_debug("%s: addr %llx ret %d val %x\n",
+                __func__, addr, ret, *(const u32 *)ptr);
+
+       return ret;
+}
+
+static const struct kvm_io_device_ops mpic_mmio_ops = {
+       .read = kvm_mpic_read,
+       .write = kvm_mpic_write,
+};
+
+static void map_mmio(struct openpic *opp)
+{
+       kvm_iodevice_init(&opp->mmio, &mpic_mmio_ops);
+
+       kvm_io_bus_register_dev(opp->kvm, KVM_MMIO_BUS,
+                               opp->reg_base, OPENPIC_REG_SIZE,
+                               &opp->mmio);
+}
+
+static void unmap_mmio(struct openpic *opp)
+{
+       kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio);
+}
+
+static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr)
+{
+       u64 base;
+
+       if (copy_from_user(&base, (u64 __user *)(long)attr->addr, sizeof(u64)))
+               return -EFAULT;
+
+       if (base & 0x3ffff) {
+               pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx not aligned\n",
+                        __func__, base);
+               return -EINVAL;
+       }
+
+       if (base == opp->reg_base)
+               return 0;
+
+       mutex_lock(&opp->kvm->slots_lock);
+
+       unmap_mmio(opp);
+       opp->reg_base = base;
+
+       pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx\n",
+                __func__, base);
+
+       if (base == 0)
+               goto out;
+
+       map_mmio(opp);
+
+out:
+       mutex_unlock(&opp->kvm->slots_lock);
+       return 0;
+}
+
+#define ATTR_SET               0
+#define ATTR_GET               1
+
+static int access_reg(struct openpic *opp, gpa_t addr, u32 *val, int type)
+{
+       int ret;
+
+       if (addr & 3)
+               return -ENXIO;
+
+       spin_lock_irq(&opp->lock);
+
+       if (type == ATTR_SET)
+               ret = kvm_mpic_write_internal(opp, addr, *val);
+       else
+               ret = kvm_mpic_read_internal(opp, addr, val);
+
+       spin_unlock_irq(&opp->lock);
+
+       pr_debug("%s: type %d addr %llx val %x\n", __func__, type, addr, *val);
+
+       return ret;
+}
+
+static int mpic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+       struct openpic *opp = dev->private;
+       u32 attr32;
+
+       switch (attr->group) {
+       case KVM_DEV_MPIC_GRP_MISC:
+               switch (attr->attr) {
+               case KVM_DEV_MPIC_BASE_ADDR:
+                       return set_base_addr(opp, attr);
+               }
+
+               break;
+
+       case KVM_DEV_MPIC_GRP_REGISTER:
+               if (get_user(attr32, (u32 __user *)(long)attr->addr))
+                       return -EFAULT;
+
+               return access_reg(opp, attr->attr, &attr32, ATTR_SET);
+
+       case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+               if (attr->attr > MAX_SRC)
+                       return -EINVAL;
+
+               if (get_user(attr32, (u32 __user *)(long)attr->addr))
+                       return -EFAULT;
+
+               if (attr32 != 0 && attr32 != 1)
+                       return -EINVAL;
+
+               spin_lock_irq(&opp->lock);
+               openpic_set_irq(opp, attr->attr, attr32);
+               spin_unlock_irq(&opp->lock);
+               return 0;
+       }
+
+       return -ENXIO;
+}
+
+static int mpic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+       struct openpic *opp = dev->private;
+       u64 attr64;
+       u32 attr32;
+       int ret;
+
+       switch (attr->group) {
+       case KVM_DEV_MPIC_GRP_MISC:
+               switch (attr->attr) {
+               case KVM_DEV_MPIC_BASE_ADDR:
+                       mutex_lock(&opp->kvm->slots_lock);
+                       attr64 = opp->reg_base;
+                       mutex_unlock(&opp->kvm->slots_lock);
+
+                       if (copy_to_user((u64 __user *)(long)attr->addr,
+                                        &attr64, sizeof(u64)))
+                               return -EFAULT;
+
+                       return 0;
+               }
+
+               break;
+
+       case KVM_DEV_MPIC_GRP_REGISTER:
+               ret = access_reg(opp, attr->attr, &attr32, ATTR_GET);
+               if (ret)
+                       return ret;
+
+               if (put_user(attr32, (u32 __user *)(long)attr->addr))
+                       return -EFAULT;
+
+               return 0;
+
+       case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+               if (attr->attr > MAX_SRC)
+                       return -EINVAL;
+
+               spin_lock_irq(&opp->lock);
+               attr32 = opp->src[attr->attr].pending;
+               spin_unlock_irq(&opp->lock);
+
+               if (put_user(attr32, (u32 __user *)(long)attr->addr))
+                       return -EFAULT;
+
+               return 0;
+       }
+
+       return -ENXIO;
+}
+
+static int mpic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+       switch (attr->group) {
+       case KVM_DEV_MPIC_GRP_MISC:
+               switch (attr->attr) {
+               case KVM_DEV_MPIC_BASE_ADDR:
+                       return 0;
+               }
+
+               break;
+
+       case KVM_DEV_MPIC_GRP_REGISTER:
+               return 0;
+
+       case KVM_DEV_MPIC_GRP_IRQ_ACTIVE:
+               if (attr->attr > MAX_SRC)
+                       break;
+
+               return 0;
+       }
+
+       return -ENXIO;
+}
+
+static void mpic_destroy(struct kvm_device *dev)
+{
+       struct openpic *opp = dev->private;
+
+       dev->kvm->arch.mpic = NULL;
+       kfree(opp);
+}
+
+static int mpic_set_default_irq_routing(struct openpic *opp)
+{
+       struct kvm_irq_routing_entry *routing;
+
+       /* Create a nop default map, so that dereferencing it still works */
+       routing = kzalloc((sizeof(*routing)), GFP_KERNEL);
+       if (!routing)
+               return -ENOMEM;
+
+       kvm_set_irq_routing(opp->kvm, routing, 0, 0);
+
+       kfree(routing);
+       return 0;
+}
+
+static int mpic_create(struct kvm_device *dev, u32 type)
+{
+       struct openpic *opp;
+       int ret;
+
+       /* We only support one MPIC at a time for now */
+       if (dev->kvm->arch.mpic)
+               return -EINVAL;
+
+       opp = kzalloc(sizeof(struct openpic), GFP_KERNEL);
+       if (!opp)
+               return -ENOMEM;
+
+       dev->private = opp;
+       opp->kvm = dev->kvm;
+       opp->dev = dev;
+       opp->model = type;
+       spin_lock_init(&opp->lock);
+
+       add_mmio_region(opp, &openpic_gbl_mmio);
+       add_mmio_region(opp, &openpic_tmr_mmio);
+       add_mmio_region(opp, &openpic_src_mmio);
+       add_mmio_region(opp, &openpic_cpu_mmio);
+
+       switch (opp->model) {
+       case KVM_DEV_TYPE_FSL_MPIC_20:
+               opp->fsl = &fsl_mpic_20;
+               opp->brr1 = 0x00400200;
+               opp->flags |= OPENPIC_FLAG_IDR_CRIT;
+               opp->nb_irqs = 80;
+               opp->mpic_mode_mask = GCR_MODE_MIXED;
+
+               fsl_common_init(opp);
+
+               break;
+
+       case KVM_DEV_TYPE_FSL_MPIC_42:
+               opp->fsl = &fsl_mpic_42;
+               opp->brr1 = 0x00400402;
+               opp->flags |= OPENPIC_FLAG_ILR;
+               opp->nb_irqs = 196;
+               opp->mpic_mode_mask = GCR_MODE_PROXY;
+
+               fsl_common_init(opp);
+
+               break;
+
+       default:
+               ret = -ENODEV;
+               goto err;
+       }
+
+       ret = mpic_set_default_irq_routing(opp);
+       if (ret)
+               goto err;
+
+       openpic_reset(opp);
+
+       smp_wmb();
+       dev->kvm->arch.mpic = opp;
+
+       return 0;
+
+err:
+       kfree(opp);
+       return ret;
+}
+
+struct kvm_device_ops kvm_mpic_ops = {
+       .name = "kvm-mpic",
+       .create = mpic_create,
+       .destroy = mpic_destroy,
+       .set_attr = mpic_set_attr,
+       .get_attr = mpic_get_attr,
+       .has_attr = mpic_has_attr,
+};
+
+int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu,
+                            u32 cpu)
+{
+       struct openpic *opp = dev->private;
+       int ret = 0;
+
+       if (dev->ops != &kvm_mpic_ops)
+               return -EPERM;
+       if (opp->kvm != vcpu->kvm)
+               return -EPERM;
+       if (cpu < 0 || cpu >= MAX_CPU)
+               return -EPERM;
+
+       spin_lock_irq(&opp->lock);
+
+       if (opp->dst[cpu].vcpu) {
+               ret = -EEXIST;
+               goto out;
+       }
+       if (vcpu->arch.irq_type) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       opp->dst[cpu].vcpu = vcpu;
+       opp->nb_cpus = max(opp->nb_cpus, cpu + 1);
+
+       vcpu->arch.mpic = opp;
+       vcpu->arch.irq_cpu_id = cpu;
+       vcpu->arch.irq_type = KVMPPC_IRQ_MPIC;
+
+       /* This might need to be changed if GCR gets extended */
+       if (opp->mpic_mode_mask == GCR_MODE_PROXY)
+               vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL;
+
+out:
+       spin_unlock_irq(&opp->lock);
+       return ret;
+}
+
+/*
+ * This should only happen immediately before the mpic is destroyed,
+ * so we shouldn't need to worry about anything still trying to
+ * access the vcpu pointer.
+ */
+void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu)
+{
+       BUG_ON(!opp->dst[vcpu->arch.irq_cpu_id].vcpu);
+
+       opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL;
+}
+
+/*
+ * Return value:
+ *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
+ *  = 0   Interrupt was coalesced (previous irq is still pending)
+ *  > 0   Number of CPUs interrupt was delivered to
+ */
+static int mpic_set_irq(struct kvm_kernel_irq_routing_entry *e,
+                       struct kvm *kvm, int irq_source_id, int level,
+                       bool line_status)
+{
+       u32 irq = e->irqchip.pin;
+       struct openpic *opp = kvm->arch.mpic;
+       unsigned long flags;
+
+       spin_lock_irqsave(&opp->lock, flags);
+       openpic_set_irq(opp, irq, level);
+       spin_unlock_irqrestore(&opp->lock, flags);
+
+       /* All code paths we care about don't check for the return value */
+       return 0;
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+               struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+       struct openpic *opp = kvm->arch.mpic;
+       unsigned long flags;
+
+       spin_lock_irqsave(&opp->lock, flags);
+
+       /*
+        * XXX We ignore the target address for now, as we only support
+        *     a single MSI bank.
+        */
+       openpic_msi_write(kvm->arch.mpic, MSIIR_OFFSET, e->msi.data);
+       spin_unlock_irqrestore(&opp->lock, flags);
+
+       /* All code paths we care about don't check for the return value */
+       return 0;
+}
+
+int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
+                         struct kvm_kernel_irq_routing_entry *e,
+                         const struct kvm_irq_routing_entry *ue)
+{
+       int r = -EINVAL;
+
+       switch (ue->type) {
+       case KVM_IRQ_ROUTING_IRQCHIP:
+               e->set = mpic_set_irq;
+               e->irqchip.irqchip = ue->u.irqchip.irqchip;
+               e->irqchip.pin = ue->u.irqchip.pin;
+               if (e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS)
+                       goto out;
+               rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi;
+               break;
+       case KVM_IRQ_ROUTING_MSI:
+               e->set = kvm_set_msi;
+               e->msi.address_lo = ue->u.msi.address_lo;
+               e->msi.address_hi = ue->u.msi.address_hi;
+               e->msi.data = ue->u.msi.data;
+               break;
+       default:
+               goto out;
+       }
+
+       r = 0;
+out:
+       return r;
+}
index 934413cd3a1bf1178e59a82046a366c4a718325d..6316ee336e888e22636f557d1623c54b30d7a207 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/hrtimer.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/file.h>
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
@@ -32,6 +33,7 @@
 #include <asm/cputhreads.h>
 #include <asm/irqflags.h>
 #include "timing.h"
+#include "irq.h"
 #include "../mm/mmu_decl.h"
 
 #define CREATE_TRACE_POINTS
@@ -317,6 +319,7 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_ENABLE_CAP:
        case KVM_CAP_ONE_REG:
        case KVM_CAP_IOEVENTFD:
+       case KVM_CAP_DEVICE_CTRL:
                r = 1;
                break;
 #ifndef CONFIG_KVM_BOOK3S_64_HV
@@ -325,6 +328,9 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_PPC_GET_PVINFO:
 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
        case KVM_CAP_SW_TLB:
+#endif
+#ifdef CONFIG_KVM_MPIC
+       case KVM_CAP_IRQ_MPIC:
 #endif
                r = 1;
                break;
@@ -335,6 +341,10 @@ int kvm_dev_ioctl_check_extension(long ext)
 #ifdef CONFIG_PPC_BOOK3S_64
        case KVM_CAP_SPAPR_TCE:
        case KVM_CAP_PPC_ALLOC_HTAB:
+       case KVM_CAP_PPC_RTAS:
+#ifdef CONFIG_KVM_XICS
+       case KVM_CAP_IRQ_XICS:
+#endif
                r = 1;
                break;
 #endif /* CONFIG_PPC_BOOK3S_64 */
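
For reference, a minimal userspace probe for the capabilities added in this hunk. It only assumes /dev/kvm is present; the function name is illustrative.

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: KVM_CHECK_EXTENSION on the system fd reports the new
     * capabilities; a positive return value means "available". */
    static void probe_ppc_irqchip_caps(void)
    {
            int kvm_fd = open("/dev/kvm", O_RDWR);

            printf("device API: %d  in-kernel MPIC: %d  in-kernel XICS: %d\n",
                   ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL),
                   ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_IRQ_MPIC),
                   ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_IRQ_XICS));
            close(kvm_fd);
    }
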
@@ -411,18 +421,17 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 }
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
-                                   struct kvm_memory_slot *memslot,
-                                   struct kvm_memory_slot old,
-                                   struct kvm_userspace_memory_region *mem,
-                                   bool user_alloc)
+                                  struct kvm_memory_slot *memslot,
+                                  struct kvm_userspace_memory_region *mem,
+                                  enum kvm_mr_change change)
 {
        return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-               struct kvm_userspace_memory_region *mem,
-               struct kvm_memory_slot old,
-               bool user_alloc)
+                                  struct kvm_userspace_memory_region *mem,
+                                  const struct kvm_memory_slot *old,
+                                  enum kvm_mr_change change)
 {
        kvmppc_core_commit_memory_region(kvm, mem, old);
 }
@@ -460,6 +469,16 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
        tasklet_kill(&vcpu->arch.tasklet);
 
        kvmppc_remove_vcpu_debugfs(vcpu);
+
+       switch (vcpu->arch.irq_type) {
+       case KVMPPC_IRQ_MPIC:
+               kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu);
+               break;
+       case KVMPPC_IRQ_XICS:
+               kvmppc_xics_free_icp(vcpu);
+               break;
+       }
+
        kvmppc_core_vcpu_free(vcpu);
 }
 
@@ -532,12 +551,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 #endif
 }
 
-int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
-                                        struct kvm_guest_debug *dbg)
-{
-       return -EINVAL;
-}
-
 static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
                                      struct kvm_run *run)
 {
@@ -612,6 +625,8 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
 int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
                        unsigned int rt, unsigned int bytes, int is_bigendian)
 {
+       int idx, ret;
+
        if (bytes > sizeof(run->mmio.data)) {
                printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__,
                       run->mmio.len);
@@ -627,8 +642,14 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
        vcpu->mmio_is_write = 0;
        vcpu->arch.mmio_sign_extend = 0;
 
-       if (!kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
-                            bytes, &run->mmio.data)) {
+       idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+       ret = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
+                             bytes, &run->mmio.data);
+
+       srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+       if (!ret) {
                kvmppc_complete_mmio_load(vcpu, run);
                vcpu->mmio_needed = 0;
                return EMULATE_DONE;
@@ -653,6 +674,7 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
                         u64 val, unsigned int bytes, int is_bigendian)
 {
        void *data = run->mmio.data;
+       int idx, ret;
 
        if (bytes > sizeof(run->mmio.data)) {
                printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__,
@@ -682,9 +704,14 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
                }
        }
 
-       if (!kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
-                             bytes, &run->mmio.data)) {
-               kvmppc_complete_mmio_load(vcpu, run);
+       idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+       ret = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
+                              bytes, &run->mmio.data);
+
+       srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+       if (!ret) {
                vcpu->mmio_needed = 0;
                return EMULATE_DONE;
        }
@@ -740,7 +767,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 {
        if (irq->irq == KVM_INTERRUPT_UNSET) {
-               kvmppc_core_dequeue_external(vcpu, irq);
+               kvmppc_core_dequeue_external(vcpu);
                return 0;
        }
 
@@ -770,7 +797,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
                break;
        case KVM_CAP_PPC_EPR:
                r = 0;
-               vcpu->arch.epr_enabled = cap->args[0];
+               if (cap->args[0])
+                       vcpu->arch.epr_flags |= KVMPPC_EPR_USER;
+               else
+                       vcpu->arch.epr_flags &= ~KVMPPC_EPR_USER;
                break;
 #ifdef CONFIG_BOOKE
        case KVM_CAP_PPC_BOOKE_WATCHDOG:
@@ -791,6 +821,44 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
                break;
        }
 #endif
+#ifdef CONFIG_KVM_MPIC
+       case KVM_CAP_IRQ_MPIC: {
+               struct file *filp;
+               struct kvm_device *dev;
+
+               r = -EBADF;
+               filp = fget(cap->args[0]);
+               if (!filp)
+                       break;
+
+               r = -EPERM;
+               dev = kvm_device_from_filp(filp);
+               if (dev)
+                       r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]);
+
+               fput(filp);
+               break;
+       }
+#endif
+#ifdef CONFIG_KVM_XICS
+       case KVM_CAP_IRQ_XICS: {
+               struct file *filp;
+               struct kvm_device *dev;
+
+               r = -EBADF;
+               filp = fget(cap->args[0]);
+               if (!filp)
+                       break;
+
+               r = -EPERM;
+               dev = kvm_device_from_filp(filp);
+               if (dev)
+                       r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
+
+               fput(filp);
+               break;
+       }
+#endif /* CONFIG_KVM_XICS */
        default:
                r = -EINVAL;
                break;
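
The fget()/kvm_device_from_filp() lookup above expects userspace to pass the fd returned by KVM_CREATE_DEVICE. A hedged sketch of that flow; the device type and CPU number are chosen purely for illustration.

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: create an in-kernel MPIC, then connect a vcpu to it so the
     * KVM_CAP_IRQ_MPIC case above ends up in kvmppc_mpic_connect_vcpu(). */
    static int connect_vcpu_to_mpic(int vm_fd, int vcpu_fd, unsigned long cpu_nr)
    {
            struct kvm_create_device cd = {
                    .type = KVM_DEV_TYPE_FSL_MPIC_42,  /* illustrative MPIC flavour */
            };
            struct kvm_enable_cap cap = { .cap = KVM_CAP_IRQ_MPIC };

            if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
                    return -1;

            cap.args[0] = cd.fd;    /* the fd looked up via fget() above */
            cap.args[1] = cpu_nr;   /* which MPIC CPU this vcpu becomes */
            return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
    }
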
@@ -913,9 +981,22 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
        return 0;
 }
 
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+                         bool line_status)
+{
+       if (!irqchip_in_kernel(kvm))
+               return -ENXIO;
+
+       irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+                                       irq_event->irq, irq_event->level,
+                                       line_status);
+       return 0;
+}
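
kvm_vm_ioctl_irq_line() is reached through the generic KVM_IRQ_LINE ioctl. A minimal sketch, assuming an in-kernel MPIC or XICS has already been created; the GSI number is a placeholder.

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: pulse a GSI through the in-kernel irqchip.  With
     * KVM_IRQ_LINE_STATUS the kernel also copies the kvm_set_irq()
     * result back into irq.status. */
    static int pulse_gsi(int vm_fd, unsigned int gsi)
    {
            struct kvm_irq_level irq = { .irq = gsi, .level = 1 };

            if (ioctl(vm_fd, KVM_IRQ_LINE, &irq) < 0)
                    return -1;
            irq.level = 0;
            return ioctl(vm_fd, KVM_IRQ_LINE, &irq);
    }
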
+
 long kvm_arch_vm_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg)
 {
+       struct kvm *kvm __maybe_unused = filp->private_data;
        void __user *argp = (void __user *)arg;
        long r;
 
@@ -934,7 +1015,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 #ifdef CONFIG_PPC_BOOK3S_64
        case KVM_CREATE_SPAPR_TCE: {
                struct kvm_create_spapr_tce create_tce;
-               struct kvm *kvm = filp->private_data;
 
                r = -EFAULT;
                if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
@@ -946,8 +1026,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 #ifdef CONFIG_KVM_BOOK3S_64_HV
        case KVM_ALLOCATE_RMA: {
-               struct kvm *kvm = filp->private_data;
                struct kvm_allocate_rma rma;
+               struct kvm *kvm = filp->private_data;
 
                r = kvm_vm_ioctl_allocate_rma(kvm, &rma);
                if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma)))
@@ -956,7 +1036,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
        }
 
        case KVM_PPC_ALLOCATE_HTAB: {
-               struct kvm *kvm = filp->private_data;
                u32 htab_order;
 
                r = -EFAULT;
@@ -973,7 +1052,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
        }
 
        case KVM_PPC_GET_HTAB_FD: {
-               struct kvm *kvm = filp->private_data;
                struct kvm_get_htab_fd ghf;
 
                r = -EFAULT;
@@ -986,7 +1064,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 #ifdef CONFIG_PPC_BOOK3S_64
        case KVM_PPC_GET_SMMU_INFO: {
-               struct kvm *kvm = filp->private_data;
                struct kvm_ppc_smmu_info info;
 
                memset(&info, 0, sizeof(info));
@@ -995,6 +1072,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
                        r = -EFAULT;
                break;
        }
+       case KVM_PPC_RTAS_DEFINE_TOKEN: {
+               struct kvm *kvm = filp->private_data;
+
+               r = kvm_vm_ioctl_rtas_define_token(kvm, argp);
+               break;
+       }
 #endif /* CONFIG_PPC_BOOK3S_64 */
        default:
                r = -ENOTTY;
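
KVM_PPC_RTAS_DEFINE_TOKEN, dispatched above, tells the kernel which RTAS service the guest will call under which token. A hedged userspace sketch; the service name and token value are examples only.

    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: bind the "ibm,set-xive" RTAS service to token 1 so the
     * in-kernel XICS can service it directly. */
    static int define_rtas_token(int vm_fd)
    {
            struct kvm_rtas_token_args args;

            memset(&args, 0, sizeof(args));
            strcpy(args.name, "ibm,set-xive");
            args.token = 1;
            return ioctl(vm_fd, KVM_PPC_RTAS_DEFINE_TOKEN, &args);
    }
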
index 89db29d17c25fcea905814879b9f75b16cda58ed..7cd728b3b5e44129a7b49c0438454ed23e4f8aee 100644 (file)
@@ -51,6 +51,12 @@ static struct icp_ipl __iomem *icp_native_regs[NR_CPUS];
 static inline unsigned int icp_native_get_xirr(void)
 {
        int cpu = smp_processor_id();
+       unsigned int xirr;
+
+       /* Handle an interrupt latched by KVM */

+       xirr = kvmppc_get_xics_latch();
+       if (xirr)
+               return xirr;
 
        return in_be32(&icp_native_regs[cpu]->xirr.word);
 }
@@ -138,6 +144,7 @@ static unsigned int icp_native_get_irq(void)
 
 static void icp_native_cause_ipi(int cpu, unsigned long data)
 {
+       kvmppc_set_host_ipi(cpu, 1);
        icp_native_set_qirr(cpu, IPI_PRIORITY);
 }
 
@@ -151,6 +158,7 @@ static irqreturn_t icp_native_ipi_action(int irq, void *dev_id)
 {
        int cpu = smp_processor_id();
 
+       kvmppc_set_host_ipi(cpu, 0);
        icp_native_set_qirr(cpu, 0xff);
 
        return smp_ipi_demux();
index 7bf68fff7c5d28cd2255d8d86189eb55368ddd71..9ccd1905bdad45da5d0f1b18ffae312f59957a7b 100644 (file)
@@ -44,5 +44,6 @@ header-y += termios.h
 header-y += types.h
 header-y += ucontext.h
 header-y += unistd.h
+header-y += virtio-ccw.h
 header-y += vtoc.h
 header-y += zcrypt.h
diff --git a/arch/s390/include/uapi/asm/virtio-ccw.h b/arch/s390/include/uapi/asm/virtio-ccw.h
new file mode 100644 (file)
index 0000000..a9a4ebf
--- /dev/null
@@ -0,0 +1,21 @@
+/*
+ * Definitions for virtio-ccw devices.
+ *
+ * Copyright IBM Corp. 2013
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *  Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
+ */
+#ifndef __KVM_VIRTIO_CCW_H
+#define __KVM_VIRTIO_CCW_H
+
+/* Alignment of vring buffers. */
+#define KVM_VIRTIO_CCW_RING_ALIGN 4096
+
+/* Subcode for diagnose 500 (virtio hypercall). */
+#define KVM_S390_VIRTIO_CCW_NOTIFY 3
+
+#endif
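
For context, a rough guest-side sketch of the diagnose 0x500 hypercall these constants describe. The real virtio-ccw driver open-codes the diagnose; the generic kvm_hypercall2() helper is used here only for brevity.

    #include <asm/kvm_para.h>
    #include <asm/virtio-ccw.h>

    /* Sketch: gpr 1 carries the subcode, gpr 2 the subchannel id and
     * gpr 3 the virtqueue index, matching the host-side handler below. */
    static long ccw_notify_host(unsigned long schid, unsigned long vq_index)
    {
            return kvm_hypercall2(KVM_S390_VIRTIO_CCW_NOTIFY, schid, vq_index);
    }
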
index 60f9f8ae0fc8b61f977501ad0e46968b77cbed8e..70b46eacf8e163fbd6973428ed9ed5a4b07aefb0 100644 (file)
@@ -22,6 +22,7 @@ config KVM
        select PREEMPT_NOTIFIERS
        select ANON_INODES
        select HAVE_KVM_CPU_RELAX_INTERCEPT
+       select HAVE_KVM_EVENTFD
        ---help---
          Support hosting paravirtualized guest machines using the SIE
          virtualization capability on the mainframe. This should work
index 3975722bb19d87b81381d25570804bb66685a485..8fe9d65a4585b0670c5a1c622d774632b3f39b18 100644 (file)
@@ -6,7 +6,7 @@
 # it under the terms of the GNU General Public License (version 2 only)
 # as published by the Free Software Foundation.
 
-common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o)
+common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o eventfd.o)
 
 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
 
index a390687feb1359d6b579024d51787e4eeff0207e..1c01a99129896b42868c3dcb67cc2ceda66be97d 100644 (file)
@@ -13,6 +13,7 @@
 
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
+#include <asm/virtio-ccw.h>
 #include "kvm-s390.h"
 #include "trace.h"
 #include "trace-s390.h"
@@ -104,6 +105,29 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu)
        return -EREMOTE;
 }
 
+static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
+{
+       int ret, idx;
+
+       /* No virtio-ccw notification? Get out quickly. */
+       if (!vcpu->kvm->arch.css_support ||
+           (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY))
+               return -EOPNOTSUPP;
+
+       idx = srcu_read_lock(&vcpu->kvm->srcu);
+       /*
+        * The layout is as follows:
+        * - gpr 2 contains the subchannel id (passed as addr)
+        * - gpr 3 contains the virtqueue index (passed as datamatch)
+        */
+       ret = kvm_io_bus_write(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS,
+                               vcpu->run->s.regs.gprs[2],
+                               8, &vcpu->run->s.regs.gprs[3]);
+       srcu_read_unlock(&vcpu->kvm->srcu, idx);
+       /* kvm_io_bus_write returns -EOPNOTSUPP if it found no match. */
+       return ret < 0 ? ret : 0;
+}
+
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
 {
        int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16;
@@ -118,6 +142,8 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu)
                return __diag_time_slice_end_directed(vcpu);
        case 0x308:
                return __diag_ipl_functions(vcpu);
+       case 0x500:
+               return __diag_virtio_hypercall(vcpu);
        default:
                return -EOPNOTSUPP;
        }
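
The kvm_io_bus_write() above only finds a match if userspace has registered something on the virtio-ccw notify bus. A hedged sketch of that registration; the field values are placeholders.

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: attach an eventfd to the notify bus so the diagnose in
     * __diag_virtio_hypercall() signals it; addr is matched against
     * gpr 2 and datamatch against gpr 3. */
    static int wire_ccw_notify(int vm_fd, int efd, __u64 schid, __u64 vq)
    {
            struct kvm_ioeventfd ioevent = {
                    .datamatch = vq,
                    .addr      = schid,
                    .len       = 8,
                    .fd        = efd,
                    .flags     = KVM_IOEVENTFD_FLAG_DATAMATCH |
                                 KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY,
            };

            return ioctl(vm_fd, KVM_IOEVENTFD, &ioevent);
    }
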
index 4703f129e95e1153570129a0a0f0fc79c34b0f1e..302e0e52b0097e7f21a30bfb3fd9a1866e8c4274 100644 (file)
 #include <asm/uaccess.h>
 #include "kvm-s390.h"
 
-static inline void __user *__guestaddr_to_user(struct kvm_vcpu *vcpu,
-                                              unsigned long guestaddr)
+static inline void __user *__gptr_to_uptr(struct kvm_vcpu *vcpu,
+                                         void __user *gptr,
+                                         int prefixing)
 {
        unsigned long prefix  = vcpu->arch.sie_block->prefix;
-
-       if (guestaddr < 2 * PAGE_SIZE)
-               guestaddr += prefix;
-       else if ((guestaddr >= prefix) && (guestaddr < prefix + 2 * PAGE_SIZE))
-               guestaddr -= prefix;
-
-       return (void __user *) gmap_fault(guestaddr, vcpu->arch.gmap);
-}
-
-static inline int get_guest_u64(struct kvm_vcpu *vcpu, unsigned long guestaddr,
-                               u64 *result)
-{
-       void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
-
-       BUG_ON(guestaddr & 7);
-
-       if (IS_ERR((void __force *) uptr))
-               return PTR_ERR((void __force *) uptr);
-
-       return get_user(*result, (unsigned long __user *) uptr);
-}
-
-static inline int get_guest_u32(struct kvm_vcpu *vcpu, unsigned long guestaddr,
-                               u32 *result)
-{
-       void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
-
-       BUG_ON(guestaddr & 3);
-
-       if (IS_ERR((void __force *) uptr))
-               return PTR_ERR((void __force *) uptr);
-
-       return get_user(*result, (u32 __user *) uptr);
-}
-
-static inline int get_guest_u16(struct kvm_vcpu *vcpu, unsigned long guestaddr,
-                               u16 *result)
-{
-       void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
-
-       BUG_ON(guestaddr & 1);
-
-       if (IS_ERR(uptr))
-               return PTR_ERR(uptr);
-
-       return get_user(*result, (u16 __user *) uptr);
-}
-
-static inline int get_guest_u8(struct kvm_vcpu *vcpu, unsigned long guestaddr,
-                              u8 *result)
-{
-       void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
-
-       if (IS_ERR((void __force *) uptr))
-               return PTR_ERR((void __force *) uptr);
-
-       return get_user(*result, (u8 __user *) uptr);
-}
-
-static inline int put_guest_u64(struct kvm_vcpu *vcpu, unsigned long guestaddr,
-                               u64 value)
-{
-       void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
-
-       BUG_ON(guestaddr & 7);
-
-       if (IS_ERR((void __force *) uptr))
-               return PTR_ERR((void __force *) uptr);
-
-       return put_user(value, (u64 __user *) uptr);
-}
-
-static inline int put_guest_u32(struct kvm_vcpu *vcpu, unsigned long guestaddr,
-                               u32 value)
-{
-       void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
-
-       BUG_ON(guestaddr & 3);
-
-       if (IS_ERR((void __force *) uptr))
-               return PTR_ERR((void __force *) uptr);
-
-       return put_user(value, (u32 __user *) uptr);
-}
-
-static inline int put_guest_u16(struct kvm_vcpu *vcpu, unsigned long guestaddr,
-                               u16 value)
-{
-       void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
-
-       BUG_ON(guestaddr & 1);
-
-       if (IS_ERR((void __force *) uptr))
-               return PTR_ERR((void __force *) uptr);
-
-       return put_user(value, (u16 __user *) uptr);
-}
-
-static inline int put_guest_u8(struct kvm_vcpu *vcpu, unsigned long guestaddr,
-                              u8 value)
-{
-       void __user *uptr = __guestaddr_to_user(vcpu, guestaddr);
-
-       if (IS_ERR((void __force *) uptr))
-               return PTR_ERR((void __force *) uptr);
-
-       return put_user(value, (u8 __user *) uptr);
-}
-
-
-static inline int __copy_to_guest_slow(struct kvm_vcpu *vcpu,
-                                      unsigned long guestdest,
-                                      void *from, unsigned long n)
-{
-       int rc;
-       unsigned long i;
-       u8 *data = from;
-
-       for (i = 0; i < n; i++) {
-               rc = put_guest_u8(vcpu, guestdest++, *(data++));
-               if (rc < 0)
-                       return rc;
+       unsigned long gaddr = (unsigned long) gptr;
+       unsigned long uaddr;
+
+       if (prefixing) {
+               if (gaddr < 2 * PAGE_SIZE)
+                       gaddr += prefix;
+               else if ((gaddr >= prefix) && (gaddr < prefix + 2 * PAGE_SIZE))
+                       gaddr -= prefix;
        }
-       return 0;
-}
-
-static inline int __copy_to_guest_fast(struct kvm_vcpu *vcpu,
-                                      unsigned long guestdest,
-                                      void *from, unsigned long n)
-{
-       int r;
+       uaddr = gmap_fault(gaddr, vcpu->arch.gmap);
+       if (IS_ERR_VALUE(uaddr))
+               uaddr = -EFAULT;
+       return (void __user *)uaddr;
+}
+
+#define get_guest(vcpu, x, gptr)                               \
+({                                                             \
+       __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr, 1);\
+       int __mask = sizeof(__typeof__(*(gptr))) - 1;           \
+       int __ret = PTR_RET((void __force *)__uptr);            \
+                                                               \
+       if (!__ret) {                                           \
+               BUG_ON((unsigned long)__uptr & __mask);         \
+               __ret = get_user(x, __uptr);                    \
+       }                                                       \
+       __ret;                                                  \
+})
+
+#define put_guest(vcpu, x, gptr)                               \
+({                                                             \
+       __typeof__(gptr) __uptr = __gptr_to_uptr(vcpu, gptr, 1);\
+       int __mask = sizeof(__typeof__(*(gptr))) - 1;           \
+       int __ret = PTR_RET((void __force *)__uptr);            \
+                                                               \
+       if (!__ret) {                                           \
+               BUG_ON((unsigned long)__uptr & __mask);         \
+               __ret = put_user(x, __uptr);                    \
+       }                                                       \
+       __ret;                                                  \
+})
+
+static inline int __copy_guest(struct kvm_vcpu *vcpu, unsigned long to,
+                              unsigned long from, unsigned long len,
+                              int to_guest, int prefixing)
+{
+       unsigned long _len, rc;
        void __user *uptr;
-       unsigned long size;
-
-       if (guestdest + n < guestdest)
-               return -EFAULT;
-
-       /* simple case: all within one segment table entry? */
-       if ((guestdest & PMD_MASK) == ((guestdest+n) & PMD_MASK)) {
-               uptr = (void __user *) gmap_fault(guestdest, vcpu->arch.gmap);
-
-               if (IS_ERR((void __force *) uptr))
-                       return PTR_ERR((void __force *) uptr);
-
-               r = copy_to_user(uptr, from, n);
-
-               if (r)
-                       r = -EFAULT;
-
-               goto out;
-       }
-
-       /* copy first segment */
-       uptr = (void __user *)gmap_fault(guestdest, vcpu->arch.gmap);
-
-       if (IS_ERR((void __force *) uptr))
-               return PTR_ERR((void __force *) uptr);
 
-       size = PMD_SIZE - (guestdest & ~PMD_MASK);
-
-       r = copy_to_user(uptr, from, size);
-
-       if (r) {
-               r = -EFAULT;
-               goto out;
-       }
-       from += size;
-       n -= size;
-       guestdest += size;
-
-       /* copy full segments */
-       while (n >= PMD_SIZE) {
-               uptr = (void __user *)gmap_fault(guestdest, vcpu->arch.gmap);
-
-               if (IS_ERR((void __force *) uptr))
-                       return PTR_ERR((void __force *) uptr);
-
-               r = copy_to_user(uptr, from, PMD_SIZE);
-
-               if (r) {
-                       r = -EFAULT;
-                       goto out;
-               }
-               from += PMD_SIZE;
-               n -= PMD_SIZE;
-               guestdest += PMD_SIZE;
-       }
-
-       /* copy the tail segment */
-       if (n) {
-               uptr = (void __user *)gmap_fault(guestdest, vcpu->arch.gmap);
-
-               if (IS_ERR((void __force *) uptr))
-                       return PTR_ERR((void __force *) uptr);
-
-               r = copy_to_user(uptr, from, n);
-
-               if (r)
-                       r = -EFAULT;
-       }
-out:
-       return r;
-}
-
-static inline int copy_to_guest_absolute(struct kvm_vcpu *vcpu,
-                                        unsigned long guestdest,
-                                        void *from, unsigned long n)
-{
-       return __copy_to_guest_fast(vcpu, guestdest, from, n);
-}
-
-static inline int copy_to_guest(struct kvm_vcpu *vcpu, unsigned long guestdest,
-                               void *from, unsigned long n)
-{
-       unsigned long prefix  = vcpu->arch.sie_block->prefix;
-
-       if ((guestdest < 2 * PAGE_SIZE) && (guestdest + n > 2 * PAGE_SIZE))
-               goto slowpath;
-
-       if ((guestdest < prefix) && (guestdest + n > prefix))
-               goto slowpath;
-
-       if ((guestdest < prefix + 2 * PAGE_SIZE)
-           && (guestdest + n > prefix + 2 * PAGE_SIZE))
-               goto slowpath;
-
-       if (guestdest < 2 * PAGE_SIZE)
-               guestdest += prefix;
-       else if ((guestdest >= prefix) && (guestdest < prefix + 2 * PAGE_SIZE))
-               guestdest -= prefix;
-
-       return __copy_to_guest_fast(vcpu, guestdest, from, n);
-slowpath:
-       return __copy_to_guest_slow(vcpu, guestdest, from, n);
-}
-
-static inline int __copy_from_guest_slow(struct kvm_vcpu *vcpu, void *to,
-                                        unsigned long guestsrc,
-                                        unsigned long n)
-{
-       int rc;
-       unsigned long i;
-       u8 *data = to;
-
-       for (i = 0; i < n; i++) {
-               rc = get_guest_u8(vcpu, guestsrc++, data++);
-               if (rc < 0)
-                       return rc;
+       while (len) {
+               uptr = to_guest ? (void __user *)to : (void __user *)from;
+               uptr = __gptr_to_uptr(vcpu, uptr, prefixing);
+               if (IS_ERR((void __force *)uptr))
+                       return -EFAULT;
+               _len = PAGE_SIZE - ((unsigned long)uptr & (PAGE_SIZE - 1));
+               _len = min(_len, len);
+               if (to_guest)
+                       rc = copy_to_user((void __user *) uptr, (void *)from, _len);
+               else
+                       rc = copy_from_user((void *)to, (void __user *)uptr, _len);
+               if (rc)
+                       return -EFAULT;
+               len -= _len;
+               from += _len;
+               to += _len;
        }
        return 0;
 }
 
-static inline int __copy_from_guest_fast(struct kvm_vcpu *vcpu, void *to,
-                                        unsigned long guestsrc,
-                                        unsigned long n)
-{
-       int r;
-       void __user *uptr;
-       unsigned long size;
-
-       if (guestsrc + n < guestsrc)
-               return -EFAULT;
-
-       /* simple case: all within one segment table entry? */
-       if ((guestsrc & PMD_MASK) == ((guestsrc+n) & PMD_MASK)) {
-               uptr = (void __user *) gmap_fault(guestsrc, vcpu->arch.gmap);
-
-               if (IS_ERR((void __force *) uptr))
-                       return PTR_ERR((void __force *) uptr);
-
-               r = copy_from_user(to, uptr, n);
-
-               if (r)
-                       r = -EFAULT;
-
-               goto out;
-       }
-
-       /* copy first segment */
-       uptr = (void __user *)gmap_fault(guestsrc, vcpu->arch.gmap);
-
-       if (IS_ERR((void __force *) uptr))
-               return PTR_ERR((void __force *) uptr);
-
-       size = PMD_SIZE - (guestsrc & ~PMD_MASK);
-
-       r = copy_from_user(to, uptr, size);
-
-       if (r) {
-               r = -EFAULT;
-               goto out;
-       }
-       to += size;
-       n -= size;
-       guestsrc += size;
-
-       /* copy full segments */
-       while (n >= PMD_SIZE) {
-               uptr = (void __user *)gmap_fault(guestsrc, vcpu->arch.gmap);
-
-               if (IS_ERR((void __force *) uptr))
-                       return PTR_ERR((void __force *) uptr);
-
-               r = copy_from_user(to, uptr, PMD_SIZE);
-
-               if (r) {
-                       r = -EFAULT;
-                       goto out;
-               }
-               to += PMD_SIZE;
-               n -= PMD_SIZE;
-               guestsrc += PMD_SIZE;
-       }
-
-       /* copy the tail segment */
-       if (n) {
-               uptr = (void __user *)gmap_fault(guestsrc, vcpu->arch.gmap);
-
-               if (IS_ERR((void __force *) uptr))
-                       return PTR_ERR((void __force *) uptr);
-
-               r = copy_from_user(to, uptr, n);
-
-               if (r)
-                       r = -EFAULT;
-       }
-out:
-       return r;
-}
-
-static inline int copy_from_guest_absolute(struct kvm_vcpu *vcpu, void *to,
-                                          unsigned long guestsrc,
-                                          unsigned long n)
-{
-       return __copy_from_guest_fast(vcpu, to, guestsrc, n);
-}
-
-static inline int copy_from_guest(struct kvm_vcpu *vcpu, void *to,
-                                 unsigned long guestsrc, unsigned long n)
-{
-       unsigned long prefix  = vcpu->arch.sie_block->prefix;
-
-       if ((guestsrc < 2 * PAGE_SIZE) && (guestsrc + n > 2 * PAGE_SIZE))
-               goto slowpath;
+#define copy_to_guest(vcpu, to, from, size) \
+       __copy_guest(vcpu, to, (unsigned long)from, size, 1, 1)
+#define copy_from_guest(vcpu, to, from, size) \
+       __copy_guest(vcpu, (unsigned long)to, from, size, 0, 1)
+#define copy_to_guest_absolute(vcpu, to, from, size) \
+       __copy_guest(vcpu, to, (unsigned long)from, size, 1, 0)
+#define copy_from_guest_absolute(vcpu, to, from, size) \
+       __copy_guest(vcpu, (unsigned long)to, from, size, 0, 0)
 
-       if ((guestsrc < prefix) && (guestsrc + n > prefix))
-               goto slowpath;
-
-       if ((guestsrc < prefix + 2 * PAGE_SIZE)
-           && (guestsrc + n > prefix + 2 * PAGE_SIZE))
-               goto slowpath;
-
-       if (guestsrc < 2 * PAGE_SIZE)
-               guestsrc += prefix;
-       else if ((guestsrc >= prefix) && (guestsrc < prefix + 2 * PAGE_SIZE))
-               guestsrc -= prefix;
-
-       return __copy_from_guest_fast(vcpu, to, guestsrc, n);
-slowpath:
-       return __copy_from_guest_slow(vcpu, to, guestsrc, n);
-}
-#endif
+#endif /* __KVM_S390_GACCESS_H */
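
A short usage sketch of the new get_guest()/put_guest() macros, mirroring the converted call sites in priv.c further down; the function name is made up, the pattern is the one this rework enables.

    #include "kvm-s390.h"
    #include "gaccess.h"

    /* Sketch: a fault is now a plain non-zero return value, so call sites
     * can inject the addressing exception and return in one statement. */
    static int store_vcpu_id_example(struct kvm_vcpu *vcpu, u64 useraddr)
    {
            if (put_guest(vcpu, vcpu->vcpu_id, (u16 __user *) useraddr))
                    return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
            return 0;
    }
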
index f26ff1e31bdb6415479444a4436328608b8bded0..b7d1b2edeeb35720402d9bf79055252f2d38ecd1 100644 (file)
@@ -43,12 +43,10 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
        trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
 
        do {
-               rc = get_guest_u64(vcpu, useraddr,
-                                  &vcpu->arch.sie_block->gcr[reg]);
-               if (rc == -EFAULT) {
-                       kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-                       break;
-               }
+               rc = get_guest(vcpu, vcpu->arch.sie_block->gcr[reg],
+                              (u64 __user *) useraddr);
+               if (rc)
+                       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
                useraddr += 8;
                if (reg == reg3)
                        break;
@@ -78,11 +76,9 @@ static int handle_lctl(struct kvm_vcpu *vcpu)
 
        reg = reg1;
        do {
-               rc = get_guest_u32(vcpu, useraddr, &val);
-               if (rc == -EFAULT) {
-                       kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-                       break;
-               }
+               rc = get_guest(vcpu, val, (u32 __user *) useraddr);
+               if (rc)
+                       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
                vcpu->arch.sie_block->gcr[reg] &= 0xffffffff00000000ul;
                vcpu->arch.sie_block->gcr[reg] |= val;
                useraddr += 4;
index 37116a77cb4b8b16bc8dbb698e34ced1510b03f5..5c948177529e281ca7138a1b9bd7ef0186f4cec9 100644 (file)
@@ -180,7 +180,7 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                                   struct kvm_s390_interrupt_info *inti)
 {
        const unsigned short table[] = { 2, 4, 4, 6 };
-       int rc, exception = 0;
+       int rc = 0;
 
        switch (inti->type) {
        case KVM_S390_INT_EMERGENCY:
@@ -188,74 +188,41 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                vcpu->stat.deliver_emergency_signal++;
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
                                                 inti->emerg.code, 0);
-               rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1201);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, inti->emerg.code);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
-                        &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                       __LC_EXT_NEW_PSW, sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
+               rc  = put_guest(vcpu, 0x1201, (u16 __user *)__LC_EXT_INT_CODE);
+               rc |= put_guest(vcpu, inti->emerg.code,
+                               (u16 __user *)__LC_EXT_CPU_ADDR);
+               rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                     __LC_EXT_NEW_PSW, sizeof(psw_t));
                break;
-
        case KVM_S390_INT_EXTERNAL_CALL:
                VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call");
                vcpu->stat.deliver_external_call++;
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
                                                 inti->extcall.code, 0);
-               rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, inti->extcall.code);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
-                        &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                       __LC_EXT_NEW_PSW, sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
+               rc  = put_guest(vcpu, 0x1202, (u16 __user *)__LC_EXT_INT_CODE);
+               rc |= put_guest(vcpu, inti->extcall.code,
+                               (u16 __user *)__LC_EXT_CPU_ADDR);
+               rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                     __LC_EXT_NEW_PSW, sizeof(psw_t));
                break;
-
        case KVM_S390_INT_SERVICE:
                VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
                           inti->ext.ext_params);
                vcpu->stat.deliver_service_signal++;
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
                                                 inti->ext.ext_params, 0);
-               rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
-                        &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                       __LC_EXT_NEW_PSW, sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = put_guest_u32(vcpu, __LC_EXT_PARAMS, inti->ext.ext_params);
-               if (rc == -EFAULT)
-                       exception = 1;
+               rc  = put_guest(vcpu, 0x2401, (u16 __user *)__LC_EXT_INT_CODE);
+               rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                     __LC_EXT_NEW_PSW, sizeof(psw_t));
+               rc |= put_guest(vcpu, inti->ext.ext_params,
+                               (u32 __user *)__LC_EXT_PARAMS);
                break;
-
        case KVM_S390_INT_VIRTIO:
                VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx",
                           inti->ext.ext_params, inti->ext.ext_params2);
@@ -263,34 +230,17 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
                                                 inti->ext.ext_params,
                                                 inti->ext.ext_params2);
-               rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = put_guest_u16(vcpu, __LC_EXT_CPU_ADDR, 0x0d00);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
-                        &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                       __LC_EXT_NEW_PSW, sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = put_guest_u32(vcpu, __LC_EXT_PARAMS, inti->ext.ext_params);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = put_guest_u64(vcpu, __LC_EXT_PARAMS2,
-                                  inti->ext.ext_params2);
-               if (rc == -EFAULT)
-                       exception = 1;
+               rc  = put_guest(vcpu, 0x2603, (u16 __user *)__LC_EXT_INT_CODE);
+               rc |= put_guest(vcpu, 0x0d00, (u16 __user *)__LC_EXT_CPU_ADDR);
+               rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                     __LC_EXT_NEW_PSW, sizeof(psw_t));
+               rc |= put_guest(vcpu, inti->ext.ext_params,
+                               (u32 __user *)__LC_EXT_PARAMS);
+               rc |= put_guest(vcpu, inti->ext.ext_params2,
+                               (u64 __user *)__LC_EXT_PARAMS2);
                break;
-
        case KVM_S390_SIGP_STOP:
                VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop");
                vcpu->stat.deliver_stop_signal++;
@@ -313,18 +263,14 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                vcpu->stat.deliver_restart_signal++;
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
                                                 0, 0);
-               rc = copy_to_guest(vcpu, offsetof(struct _lowcore,
-                 restart_old_psw), &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                       offsetof(struct _lowcore, restart_psw), sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
+               rc  = copy_to_guest(vcpu,
+                                   offsetof(struct _lowcore, restart_old_psw),
+                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                     offsetof(struct _lowcore, restart_psw),
+                                     sizeof(psw_t));
                atomic_clear_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
                break;
-
        case KVM_S390_PROGRAM_INT:
                VCPU_EVENT(vcpu, 4, "interrupt: pgm check code:%x, ilc:%x",
                           inti->pgm.code,
@@ -332,24 +278,13 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                vcpu->stat.deliver_program_int++;
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
                                                 inti->pgm.code, 0);
-               rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = put_guest_u16(vcpu, __LC_PGM_ILC,
-                       table[vcpu->arch.sie_block->ipa >> 14]);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = copy_to_guest(vcpu, __LC_PGM_OLD_PSW,
-                        &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                       __LC_PGM_NEW_PSW, sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
+               rc  = put_guest(vcpu, inti->pgm.code, (u16 __user *)__LC_PGM_INT_CODE);
+               rc |= put_guest(vcpu, table[vcpu->arch.sie_block->ipa >> 14],
+                               (u16 __user *)__LC_PGM_ILC);
+               rc |= copy_to_guest(vcpu, __LC_PGM_OLD_PSW,
+                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                     __LC_PGM_NEW_PSW, sizeof(psw_t));
                break;
 
        case KVM_S390_MCHK:
@@ -358,24 +293,13 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
                                                 inti->mchk.cr14,
                                                 inti->mchk.mcic);
-               rc = kvm_s390_vcpu_store_status(vcpu,
-                                               KVM_S390_STORE_STATUS_PREFIXED);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = put_guest_u64(vcpu, __LC_MCCK_CODE, inti->mchk.mcic);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = copy_to_guest(vcpu, __LC_MCK_OLD_PSW,
-                                  &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                                    __LC_MCK_NEW_PSW, sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
+               rc  = kvm_s390_vcpu_store_status(vcpu,
+                                                KVM_S390_STORE_STATUS_PREFIXED);
+               rc |= put_guest(vcpu, inti->mchk.mcic, (u64 __user *) __LC_MCCK_CODE);
+               rc |= copy_to_guest(vcpu, __LC_MCK_OLD_PSW,
+                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                     __LC_MCK_NEW_PSW, sizeof(psw_t));
                break;
 
        case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
@@ -388,67 +312,44 @@ static void __do_deliver_interrupt(struct kvm_vcpu *vcpu,
                vcpu->stat.deliver_io_int++;
                trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
                                                 param0, param1);
-               rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_ID,
-                                  inti->io.subchannel_id);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = put_guest_u16(vcpu, __LC_SUBCHANNEL_NR,
-                                  inti->io.subchannel_nr);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = put_guest_u32(vcpu, __LC_IO_INT_PARM,
-                                  inti->io.io_int_parm);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = put_guest_u32(vcpu, __LC_IO_INT_WORD,
-                                  inti->io.io_int_word);
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = copy_to_guest(vcpu, __LC_IO_OLD_PSW,
-                                  &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
-
-               rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-                                    __LC_IO_NEW_PSW, sizeof(psw_t));
-               if (rc == -EFAULT)
-                       exception = 1;
+               rc  = put_guest(vcpu, inti->io.subchannel_id,
+                               (u16 __user *) __LC_SUBCHANNEL_ID);
+               rc |= put_guest(vcpu, inti->io.subchannel_nr,
+                               (u16 __user *) __LC_SUBCHANNEL_NR);
+               rc |= put_guest(vcpu, inti->io.io_int_parm,
+                               (u32 __user *) __LC_IO_INT_PARM);
+               rc |= put_guest(vcpu, inti->io.io_int_word,
+                               (u32 __user *) __LC_IO_INT_WORD);
+               rc |= copy_to_guest(vcpu, __LC_IO_OLD_PSW,
+                                   &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+               rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                                     __LC_IO_NEW_PSW, sizeof(psw_t));
                break;
        }
        default:
                BUG();
        }
-       if (exception) {
+       if (rc) {
                printk("kvm: The guest lowcore is not mapped during interrupt "
-                       "delivery, killing userspace\n");
+                      "delivery, killing userspace\n");
                do_exit(SIGKILL);
        }
 }
 
 static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu)
 {
-       int rc, exception = 0;
+       int rc;
 
        if (psw_extint_disabled(vcpu))
                return 0;
        if (!(vcpu->arch.sie_block->gcr[0] & 0x800ul))
                return 0;
-       rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1004);
-       if (rc == -EFAULT)
-               exception = 1;
-       rc = copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
-                &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
-       if (rc == -EFAULT)
-               exception = 1;
-       rc = copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
-               __LC_EXT_NEW_PSW, sizeof(psw_t));
-       if (rc == -EFAULT)
-               exception = 1;
-       if (exception) {
+       rc  = put_guest(vcpu, 0x1004, (u16 __user *)__LC_EXT_INT_CODE);
+       rc |= copy_to_guest(vcpu, __LC_EXT_OLD_PSW,
+                           &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
+       rc |= copy_from_guest(vcpu, &vcpu->arch.sie_block->gpsw,
+                             __LC_EXT_NEW_PSW, sizeof(psw_t));
+       if (rc) {
                printk("kvm: The guest lowcore is not mapped during interrupt "
                        "delivery, killing userspace\n");
                do_exit(SIGKILL);
index 4cf35a0a79e7734b6cd99803f3f54bd2df5c6396..c1c7c683fa26d591af0eeac242ee898f88cb9dc2 100644 (file)
@@ -142,12 +142,16 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_ONE_REG:
        case KVM_CAP_ENABLE_CAP:
        case KVM_CAP_S390_CSS_SUPPORT:
+       case KVM_CAP_IOEVENTFD:
                r = 1;
                break;
        case KVM_CAP_NR_VCPUS:
        case KVM_CAP_MAX_VCPUS:
                r = KVM_MAX_VCPUS;
                break;
+       case KVM_CAP_NR_MEMSLOTS:
+               r = KVM_USER_MEM_SLOTS;
+               break;
        case KVM_CAP_S390_COW:
                r = MACHINE_HAS_ESOP;
                break;
@@ -632,8 +636,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                } else {
                        VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
                        trace_kvm_s390_sie_fault(vcpu);
-                       kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-                       rc = 0;
+                       rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
                }
        }
        VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
@@ -974,22 +977,13 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
 /* Section: memory related */
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot,
-                                  struct kvm_memory_slot old,
                                   struct kvm_userspace_memory_region *mem,
-                                  bool user_alloc)
+                                  enum kvm_mr_change change)
 {
-       /* A few sanity checks. We can have exactly one memory slot which has
-          to start at guest virtual zero and which has to be located at a
-          page boundary in userland and which has to end at a page boundary.
-          The memory in userland is ok to be fragmented into various different
-          vmas. It is okay to mmap() and munmap() stuff in this slot after
-          doing this call at any time */
-
-       if (mem->slot)
-               return -EINVAL;
-
-       if (mem->guest_phys_addr)
-               return -EINVAL;
+       /* A few sanity checks. Memory slots have to start and end at a
+          segment boundary (1MB). The memory in userland may be fragmented
+          into various different vmas. It is okay to mmap() and munmap()
+          stuff in this slot after doing this call at any time */
 
        if (mem->userspace_addr & 0xffffful)
                return -EINVAL;
@@ -997,19 +991,26 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
        if (mem->memory_size & 0xffffful)
                return -EINVAL;
 
-       if (!user_alloc)
-               return -EINVAL;
-
        return 0;
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
                                struct kvm_userspace_memory_region *mem,
-                               struct kvm_memory_slot old,
-                               bool user_alloc)
+                               const struct kvm_memory_slot *old,
+                               enum kvm_mr_change change)
 {
        int rc;
 
+       /* If the basics of the memslot do not change, we do not want
+        * to update the gmap. Every update causes several unnecessary
+        * segment translation exceptions. This is usually handled just
+        * fine by the normal fault handler + gmap, but it will also
+        * cause faults on the prefix page of running guest CPUs.
+        */
+       if (old->userspace_addr == mem->userspace_addr &&
+           old->base_gfn * PAGE_SIZE == mem->guest_phys_addr &&
+           old->npages * PAGE_SIZE == mem->memory_size)
+               return;
 
        rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr,
                mem->guest_phys_addr, mem->memory_size);
index 4d89d64a81612f939e32da29fd5a6c9728379053..efc14f687265d5e0c5ad2d74c4104943187ca72b 100644 (file)
@@ -110,12 +110,12 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
 void kvm_s390_tasklet(unsigned long parm);
 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu);
 void kvm_s390_deliver_pending_machine_checks(struct kvm_vcpu *vcpu);
-int kvm_s390_inject_vm(struct kvm *kvm,
-               struct kvm_s390_interrupt *s390int);
-int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
-               struct kvm_s390_interrupt *s390int);
-int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
-int kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action);
+int __must_check kvm_s390_inject_vm(struct kvm *kvm,
+                                   struct kvm_s390_interrupt *s390int);
+int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
+                                     struct kvm_s390_interrupt *s390int);
+int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
+int __must_check kvm_s390_inject_sigp_stop(struct kvm_vcpu *vcpu, int action);
 struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
                                                    u64 cr6, u64 schid);
 
index 0ef9894606e519874fb531bb890b4d36badb75aa..6bbd7b5a0bbee36d483e7deba484ce96e3176dfb 100644 (file)
@@ -14,6 +14,8 @@
 #include <linux/kvm.h>
 #include <linux/gfp.h>
 #include <linux/errno.h>
+#include <linux/compat.h>
+#include <asm/asm-offsets.h>
 #include <asm/current.h>
 #include <asm/debug.h>
 #include <asm/ebcdic.h>
@@ -35,31 +37,24 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
        operand2 = kvm_s390_get_base_disp_s(vcpu);
 
        /* must be word boundary */
-       if (operand2 & 3) {
-               kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-               goto out;
-       }
+       if (operand2 & 3)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
        /* get the value */
-       if (get_guest_u32(vcpu, operand2, &address)) {
-               kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-               goto out;
-       }
+       if (get_guest(vcpu, address, (u32 __user *) operand2))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 
        address = address & 0x7fffe000u;
 
        /* make sure that the new value is valid memory */
        if (copy_from_guest_absolute(vcpu, &tmp, address, 1) ||
-          (copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1))) {
-               kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-               goto out;
-       }
+          (copy_from_guest_absolute(vcpu, &tmp, address + PAGE_SIZE, 1)))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 
        kvm_s390_set_prefix(vcpu, address);
 
        VCPU_EVENT(vcpu, 5, "setting prefix to %x", address);
        trace_kvm_s390_handle_prefix(vcpu, 1, address);
-out:
        return 0;
 }
 
@@ -73,49 +68,37 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
        operand2 = kvm_s390_get_base_disp_s(vcpu);
 
        /* must be word boundary */
-       if (operand2 & 3) {
-               kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-               goto out;
-       }
+       if (operand2 & 3)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
        address = vcpu->arch.sie_block->prefix;
        address = address & 0x7fffe000u;
 
        /* get the value */
-       if (put_guest_u32(vcpu, operand2, address)) {
-               kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-               goto out;
-       }
+       if (put_guest(vcpu, address, (u32 __user *)operand2))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 
        VCPU_EVENT(vcpu, 5, "storing prefix to %x", address);
        trace_kvm_s390_handle_prefix(vcpu, 0, address);
-out:
        return 0;
 }
 
 static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
 {
        u64 useraddr;
-       int rc;
 
        vcpu->stat.instruction_stap++;
 
        useraddr = kvm_s390_get_base_disp_s(vcpu);
 
-       if (useraddr & 1) {
-               kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-               goto out;
-       }
+       if (useraddr & 1)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-       rc = put_guest_u16(vcpu, useraddr, vcpu->vcpu_id);
-       if (rc == -EFAULT) {
-               kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-               goto out;
-       }
+       if (put_guest(vcpu, vcpu->vcpu_id, (u16 __user *)useraddr))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 
        VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr);
        trace_kvm_s390_handle_stap(vcpu, useraddr);
-out:
        return 0;
 }
 
@@ -129,36 +112,38 @@ static int handle_skey(struct kvm_vcpu *vcpu)
 
 static int handle_tpi(struct kvm_vcpu *vcpu)
 {
-       u64 addr;
        struct kvm_s390_interrupt_info *inti;
+       u64 addr;
        int cc;
 
        addr = kvm_s390_get_base_disp_s(vcpu);
-
+       if (addr & 3)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+       cc = 0;
        inti = kvm_s390_get_io_int(vcpu->kvm, vcpu->run->s.regs.crs[6], 0);
-       if (inti) {
-               if (addr) {
-                       /*
-                        * Store the two-word I/O interruption code into the
-                        * provided area.
-                        */
-                       put_guest_u16(vcpu, addr, inti->io.subchannel_id);
-                       put_guest_u16(vcpu, addr + 2, inti->io.subchannel_nr);
-                       put_guest_u32(vcpu, addr + 4, inti->io.io_int_parm);
-               } else {
-                       /*
-                        * Store the three-word I/O interruption code into
-                        * the appropriate lowcore area.
-                        */
-                       put_guest_u16(vcpu, 184, inti->io.subchannel_id);
-                       put_guest_u16(vcpu, 186, inti->io.subchannel_nr);
-                       put_guest_u32(vcpu, 188, inti->io.io_int_parm);
-                       put_guest_u32(vcpu, 192, inti->io.io_int_word);
-               }
-               cc = 1;
-       } else
-               cc = 0;
+       if (!inti)
+               goto no_interrupt;
+       cc = 1;
+       if (addr) {
+               /*
+                * Store the two-word I/O interruption code into the
+                * provided area.
+                */
+               put_guest(vcpu, inti->io.subchannel_id, (u16 __user *) addr);
+               put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *) (addr + 2));
+               put_guest(vcpu, inti->io.io_int_parm, (u32 __user *) (addr + 4));
+       } else {
+               /*
+                * Store the three-word I/O interruption code into
+                * the appropriate lowcore area.
+                */
+               put_guest(vcpu, inti->io.subchannel_id, (u16 __user *) __LC_SUBCHANNEL_ID);
+               put_guest(vcpu, inti->io.subchannel_nr, (u16 __user *) __LC_SUBCHANNEL_NR);
+               put_guest(vcpu, inti->io.io_int_parm, (u32 __user *) __LC_IO_INT_PARM);
+               put_guest(vcpu, inti->io.io_int_word, (u32 __user *) __LC_IO_INT_WORD);
+       }
        kfree(inti);
+no_interrupt:
        /* Set condition code and we're done. */
        vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
        vcpu->arch.sie_block->gpsw.mask |= (cc & 3ul) << 44;
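In the lowcore branch above, the literal offsets 184, 186, 188 and 192 from the removed lines are replaced by symbolic lowcore constants. For reference only (the authoritative definitions live in arch/s390/include/asm/lowcore.h; the hex values below are quoted from memory and should be checked there):

	#define __LC_SUBCHANNEL_ID	0x00b8	/* 184 */
	#define __LC_SUBCHANNEL_NR	0x00ba	/* 186 */
	#define __LC_IO_INT_PARM	0x00bc	/* 188 */
	#define __LC_IO_INT_WORD	0x00c0	/* 192 */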
@@ -230,13 +215,10 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
 
        rc = copy_to_guest(vcpu, offsetof(struct _lowcore, stfl_fac_list),
                           &facility_list, sizeof(facility_list));
-       if (rc == -EFAULT)
-               kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-       else {
-               VCPU_EVENT(vcpu, 5, "store facility list value %x",
-                          facility_list);
-               trace_kvm_s390_handle_stfl(vcpu, facility_list);
-       }
+       if (rc)
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       VCPU_EVENT(vcpu, 5, "store facility list value %x", facility_list);
+       trace_kvm_s390_handle_stfl(vcpu, facility_list);
        return 0;
 }
 
@@ -249,112 +231,80 @@ static void handle_new_psw(struct kvm_vcpu *vcpu)
 
 #define PSW_MASK_ADDR_MODE (PSW_MASK_EA | PSW_MASK_BA)
 #define PSW_MASK_UNASSIGNED 0xb80800fe7fffffffUL
-#define PSW_ADDR_24 0x00000000000fffffUL
+#define PSW_ADDR_24 0x0000000000ffffffUL
 #define PSW_ADDR_31 0x000000007fffffffUL
 
+static int is_valid_psw(psw_t *psw) {
+       if (psw->mask & PSW_MASK_UNASSIGNED)
+               return 0;
+       if ((psw->mask & PSW_MASK_ADDR_MODE) == PSW_MASK_BA) {
+               if (psw->addr & ~PSW_ADDR_31)
+                       return 0;
+       }
+       if (!(psw->mask & PSW_MASK_ADDR_MODE) && (psw->addr & ~PSW_ADDR_24))
+               return 0;
+       if ((psw->mask & PSW_MASK_ADDR_MODE) ==  PSW_MASK_EA)
+               return 0;
+       return 1;
+}
+
 int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
 {
-       u64 addr;
+       psw_t *gpsw = &vcpu->arch.sie_block->gpsw;
        psw_compat_t new_psw;
+       u64 addr;
 
-       if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+       if (gpsw->mask & PSW_MASK_PSTATE)
                return kvm_s390_inject_program_int(vcpu,
                                                   PGM_PRIVILEGED_OPERATION);
-
        addr = kvm_s390_get_base_disp_s(vcpu);
-
-       if (addr & 7) {
-               kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-               goto out;
-       }
-
-       if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) {
-               kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-               goto out;
-       }
-
-       if (!(new_psw.mask & PSW32_MASK_BASE)) {
-               kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-               goto out;
-       }
-
-       vcpu->arch.sie_block->gpsw.mask =
-               (new_psw.mask & ~PSW32_MASK_BASE) << 32;
-       vcpu->arch.sie_block->gpsw.addr = new_psw.addr;
-
-       if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) ||
-           (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) &&
-            (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) ||
-           ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
-            PSW_MASK_EA)) {
-               kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-               goto out;
-       }
-
+       if (addr & 7)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+       if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw)))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       if (!(new_psw.mask & PSW32_MASK_BASE))
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+       gpsw->mask = (new_psw.mask & ~PSW32_MASK_BASE) << 32;
+       gpsw->mask |= new_psw.addr & PSW32_ADDR_AMODE;
+       gpsw->addr = new_psw.addr & ~PSW32_ADDR_AMODE;
+       if (!is_valid_psw(gpsw))
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
        handle_new_psw(vcpu);
-out:
        return 0;
 }
 
 static int handle_lpswe(struct kvm_vcpu *vcpu)
 {
-       u64 addr;
        psw_t new_psw;
+       u64 addr;
 
        addr = kvm_s390_get_base_disp_s(vcpu);
-
-       if (addr & 7) {
-               kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-               goto out;
-       }
-
-       if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw))) {
-               kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-               goto out;
-       }
-
-       vcpu->arch.sie_block->gpsw.mask = new_psw.mask;
-       vcpu->arch.sie_block->gpsw.addr = new_psw.addr;
-
-       if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_UNASSIGNED) ||
-           (((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
-             PSW_MASK_BA) &&
-            (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_31)) ||
-           (!(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) &&
-            (vcpu->arch.sie_block->gpsw.addr & ~PSW_ADDR_24)) ||
-           ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_ADDR_MODE) ==
-            PSW_MASK_EA)) {
-               kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-               goto out;
-       }
-
+       if (addr & 7)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+       if (copy_from_guest(vcpu, &new_psw, addr, sizeof(new_psw)))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+       vcpu->arch.sie_block->gpsw = new_psw;
+       if (!is_valid_psw(&vcpu->arch.sie_block->gpsw))
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
        handle_new_psw(vcpu);
-out:
        return 0;
 }
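The LPSW/LPSWE rework funnels all validity checks through the new is_valid_psw() helper: reserved mask bits, an address that does not fit the selected addressing mode, and the EA-without-BA combination now raise a specification exception in one place. Note also that PSW_ADDR_24 is corrected from 0x000fffff (20 bits) to 0x00ffffff, so the full 24-bit address range is accepted. A rough illustration (not part of the patch) of inputs the helper rejects:

	psw_t bad;

	bad.mask = PSW_MASK_BA;		/* 31-bit mode ... */
	bad.addr = 0x80000000UL;	/* ... but the address needs bit 31 */
	/* rejected: bad.addr & ~PSW_ADDR_31 is non-zero */

	bad.mask = 0;			/* 24-bit mode ... */
	bad.addr = 0x01000000UL;	/* ... but the address needs 25 bits */
	/* rejected: bad.addr & ~PSW_ADDR_24 is non-zero */

	bad.mask = PSW_MASK_EA;		/* EA set without BA */
	/* rejected: EA alone is not a valid addressing mode */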
 
 static int handle_stidp(struct kvm_vcpu *vcpu)
 {
        u64 operand2;
-       int rc;
 
        vcpu->stat.instruction_stidp++;
 
        operand2 = kvm_s390_get_base_disp_s(vcpu);
 
-       if (operand2 & 7) {
-               kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-               goto out;
-       }
+       if (operand2 & 7)
+               return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-       rc = put_guest_u64(vcpu, operand2, vcpu->arch.stidp_data);
-       if (rc == -EFAULT) {
-               kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-               goto out;
-       }
+       if (put_guest(vcpu, vcpu->arch.stidp_data, (u64 __user *)operand2))
+               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 
        VCPU_EVENT(vcpu, 5, "%s", "store cpu id");
-out:
        return 0;
 }
 
@@ -394,8 +344,9 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
        int fc = (vcpu->run->s.regs.gprs[0] & 0xf0000000) >> 28;
        int sel1 = vcpu->run->s.regs.gprs[0] & 0xff;
        int sel2 = vcpu->run->s.regs.gprs[1] & 0xffff;
+       unsigned long mem = 0;
        u64 operand2;
-       unsigned long mem;
+       int rc = 0;
 
        vcpu->stat.instruction_stsi++;
        VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2);
@@ -414,37 +365,37 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
        case 2:
                mem = get_zeroed_page(GFP_KERNEL);
                if (!mem)
-                       goto out_fail;
+                       goto out_no_data;
                if (stsi((void *) mem, fc, sel1, sel2))
-                       goto out_mem;
+                       goto out_no_data;
                break;
        case 3:
                if (sel1 != 2 || sel2 != 2)
-                       goto out_fail;
+                       goto out_no_data;
                mem = get_zeroed_page(GFP_KERNEL);
                if (!mem)
-                       goto out_fail;
+                       goto out_no_data;
                handle_stsi_3_2_2(vcpu, (void *) mem);
                break;
        default:
-               goto out_fail;
+               goto out_no_data;
        }
 
        if (copy_to_guest_absolute(vcpu, operand2, (void *) mem, PAGE_SIZE)) {
-               kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-               goto out_mem;
+               rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+               goto out_exception;
        }
        trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
        free_page(mem);
        vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
        vcpu->run->s.regs.gprs[0] = 0;
        return 0;
-out_mem:
-       free_page(mem);
-out_fail:
+out_no_data:
        /* condition code 3 */
        vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;
-       return 0;
+out_exception:
+       free_page(mem);
+       return rc;
 }
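The reworked error handling in handle_stsi() relies on mem being pre-initialised to 0 and rc defaulting to 0: the out_no_data path sets condition code 3 and then falls through to out_exception, where free_page(mem) is harmless even if no page was ever allocated, since free_page(0) is a no-op. A compressed view of the flow (sketch only):

	unsigned long mem = 0;
	int rc = 0;
	...
out_no_data:
	vcpu->arch.sie_block->gpsw.mask |= 3ul << 44;	/* condition code 3 */
out_exception:
	free_page(mem);		/* mem may still be 0 here; free_page(0) does nothing */
	return rc;		/* 0, or the result of the injected exception */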
 
 static const intercept_handler_t b2_handlers[256] = {
@@ -575,20 +526,13 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
        if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
                return -EOPNOTSUPP;
 
-
-       /* we must resolve the address without holding the mmap semaphore.
-        * This is ok since the userspace hypervisor is not supposed to change
-        * the mapping while the guest queries the memory. Otherwise the guest
-        * might crash or get wrong info anyway. */
-       user_address = (unsigned long) __guestaddr_to_user(vcpu, address1);
-
        down_read(&current->mm->mmap_sem);
+       user_address = __gmap_translate(address1, vcpu->arch.gmap);
+       if (IS_ERR_VALUE(user_address))
+               goto out_inject;
        vma = find_vma(current->mm, user_address);
-       if (!vma) {
-               up_read(&current->mm->mmap_sem);
-               return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
-       }
-
+       if (!vma)
+               goto out_inject;
        vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
        if (!(vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_READ))
                vcpu->arch.sie_block->gpsw.mask |= (1ul << 44);
@@ -597,6 +541,10 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
 
        up_read(&current->mm->mmap_sem);
        return 0;
+
+out_inject:
+       up_read(&current->mm->mmap_sem);
+       return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 }
 
 int kvm_s390_handle_e5(struct kvm_vcpu *vcpu)
index 40afa0005c6905ae1e2826f0c7151b7740f73b80..9bd4ecac72be33f4366e08d5a98ac687cf4a6fe7 100644 (file)
@@ -19,6 +19,10 @@ BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
 
 BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
 
+#ifdef CONFIG_HAVE_KVM
+BUILD_INTERRUPT(kvm_posted_intr_ipi, POSTED_INTR_VECTOR)
+#endif
+
 /*
  * every pentium local APIC has two 'local interrupts', with a
  * soft-definable vector attached to both interrupts, one of
index 81f04cee5f741226b6e23e45bcab7874bff29e62..ab0ae1aa6d0af1c5d64edcd313e5283aa099c04f 100644 (file)
@@ -11,6 +11,9 @@ typedef struct {
        unsigned int apic_timer_irqs;   /* arch dependent */
        unsigned int irq_spurious_count;
        unsigned int icr_read_retry_count;
+#endif
+#ifdef CONFIG_HAVE_KVM
+       unsigned int kvm_posted_intr_ipis;
 #endif
        unsigned int x86_platform_ipis; /* arch dependent */
        unsigned int apic_perf_irqs;
index 10a78c3d3d5a771d8581a13e402eb69e462e1dae..1da97efad08ae3c79f73827be832ed5618dd0372 100644 (file)
@@ -28,6 +28,7 @@
 /* Interrupt handlers registered during init_IRQ */
 extern void apic_timer_interrupt(void);
 extern void x86_platform_ipi(void);
+extern void kvm_posted_intr_ipi(void);
 extern void error_interrupt(void);
 extern void irq_work_interrupt(void);
 
index aac5fa62a86caf91c82873712543017f89df1756..5702d7e3111db94facc17423b882d46617bd64fe 100644 (file)
  */
 #define X86_PLATFORM_IPI_VECTOR                0xf7
 
+/* Vector for KVM to deliver posted interrupt IPI */
+#ifdef CONFIG_HAVE_KVM
+#define POSTED_INTR_VECTOR             0xf2
+#endif
+
 /*
  * IRQ work vector:
  */
index 4979778cc7fb51a14f5fa5960eb02b36d5c235f9..3741c653767ca958d0cc56f91d4807a4db7440b7 100644 (file)
@@ -31,7 +31,7 @@
 #include <asm/msr-index.h>
 #include <asm/asm.h>
 
-#define KVM_MAX_VCPUS 254
+#define KVM_MAX_VCPUS 255
 #define KVM_SOFT_MAX_VCPUS 160
 #define KVM_USER_MEM_SLOTS 125
 /* memory slots that are not exposed to userspace */
@@ -43,6 +43,8 @@
 #define KVM_PIO_PAGE_OFFSET 1
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2
 
+#define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS
+
 #define CR0_RESERVED_BITS                                               \
        (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
                          | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
@@ -94,9 +96,6 @@
 
 #define ASYNC_PF_PER_VCPU 64
 
-extern raw_spinlock_t kvm_lock;
-extern struct list_head vm_list;
-
 struct kvm_vcpu;
 struct kvm;
 struct kvm_async_pf;
@@ -230,6 +229,7 @@ struct kvm_mmu_page {
 #endif
 
        int write_flooding_count;
+       bool mmio_cached;
 };
 
 struct kvm_pio_request {
@@ -345,7 +345,6 @@ struct kvm_vcpu_arch {
        unsigned long apic_attention;
        int32_t apic_arb_prio;
        int mp_state;
-       int sipi_vector;
        u64 ia32_misc_enable_msr;
        bool tpr_access_reporting;
 
@@ -643,7 +642,7 @@ struct kvm_x86_ops {
        /* Create, but do not attach this VCPU */
        struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
        void (*vcpu_free)(struct kvm_vcpu *vcpu);
-       int (*vcpu_reset)(struct kvm_vcpu *vcpu);
+       void (*vcpu_reset)(struct kvm_vcpu *vcpu);
 
        void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
        void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
@@ -696,14 +695,16 @@ struct kvm_x86_ops {
        int (*nmi_allowed)(struct kvm_vcpu *vcpu);
        bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
        void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
-       void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
-       void (*enable_irq_window)(struct kvm_vcpu *vcpu);
+       int (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+       int (*enable_irq_window)(struct kvm_vcpu *vcpu);
        void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
        int (*vm_has_apicv)(struct kvm *kvm);
        void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
        void (*hwapic_isr_update)(struct kvm *kvm, int isr);
        void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
        void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
+       void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
+       void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
        int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
        int (*get_tdp_level)(void);
        u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
@@ -730,6 +731,7 @@ struct kvm_x86_ops {
        int (*check_intercept)(struct kvm_vcpu *vcpu,
                               struct x86_instruction_info *info,
                               enum x86_intercept_stage stage);
+       void (*handle_external_intr)(struct kvm_vcpu *vcpu);
 };
 
 struct kvm_arch_async_pf {
@@ -767,6 +769,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
                                     struct kvm_memory_slot *slot,
                                     gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
+void kvm_mmu_zap_mmio_sptes(struct kvm *kvm);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
@@ -797,6 +800,7 @@ enum emulation_result {
 #define EMULTYPE_TRAP_UD           (1 << 1)
 #define EMULTYPE_SKIP              (1 << 2)
 #define EMULTYPE_RETRY             (1 << 3)
+#define EMULTYPE_NO_REEXECUTE      (1 << 4)
 int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
                            int emulation_type, void *insn, int insn_len);
 
@@ -807,6 +811,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
 }
 
 void kvm_enable_efer_bits(u64);
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
 int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
@@ -819,6 +824,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
 
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector);
 
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
                    int reason, bool has_error_code, u32 error_code);
@@ -973,7 +979,6 @@ enum {
  * Trap the fault and ignore the instruction if that happens.
  */
 asmlinkage void kvm_spurious_fault(void);
-extern bool kvm_rebooting;
 
 #define ____kvm_handle_fault_on_reboot(insn, cleanup_insn)     \
        "666: " insn "\n\t" \
@@ -1002,6 +1007,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
 
 void kvm_define_shared_msr(unsigned index, u32 msr);
 void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
@@ -1027,7 +1033,7 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu);
 void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
 bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
 int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
-int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
 int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
 void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
 void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
index b6fbf860e398ed940bfb1113acd769a0afc827da..f3e01a2cbaa1b965f0adf89aa1c00b7a3619df51 100644 (file)
 #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY    0x00000200
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING      0x00000400
 #define SECONDARY_EXEC_ENABLE_INVPCID          0x00001000
+#define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
 
 
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
 #define PIN_BASED_NMI_EXITING                   0x00000008
 #define PIN_BASED_VIRTUAL_NMIS                  0x00000020
+#define PIN_BASED_VMX_PREEMPTION_TIMER          0x00000040
+#define PIN_BASED_POSTED_INTR                   0x00000080
+
+#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR    0x00000016
 
 #define VM_EXIT_SAVE_DEBUG_CONTROLS             0x00000002
 #define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
@@ -81,6 +86,8 @@
 #define VM_EXIT_LOAD_IA32_EFER                  0x00200000
 #define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER       0x00400000
 
+#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR      0x00036dff
+
 #define VM_ENTRY_LOAD_DEBUG_CONTROLS            0x00000002
 #define VM_ENTRY_IA32E_MODE                     0x00000200
 #define VM_ENTRY_SMM                            0x00000400
 #define VM_ENTRY_LOAD_IA32_PAT                 0x00004000
 #define VM_ENTRY_LOAD_IA32_EFER                 0x00008000
 
+#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR     0x000011ff
+
+#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK    0x0000001f
+#define VMX_MISC_SAVE_EFER_LMA                 0x00000020
+
 /* VMCS Encodings */
 enum vmcs_field {
        VIRTUAL_PROCESSOR_ID            = 0x00000000,
+       POSTED_INTR_NV                  = 0x00000002,
        GUEST_ES_SELECTOR               = 0x00000800,
        GUEST_CS_SELECTOR               = 0x00000802,
        GUEST_SS_SELECTOR               = 0x00000804,
@@ -126,6 +139,8 @@ enum vmcs_field {
        VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
        APIC_ACCESS_ADDR                = 0x00002014,
        APIC_ACCESS_ADDR_HIGH           = 0x00002015,
+       POSTED_INTR_DESC_ADDR           = 0x00002016,
+       POSTED_INTR_DESC_ADDR_HIGH      = 0x00002017,
        EPT_POINTER                     = 0x0000201a,
        EPT_POINTER_HIGH                = 0x0000201b,
        EOI_EXIT_BITMAP0                = 0x0000201c,
@@ -136,6 +151,8 @@ enum vmcs_field {
        EOI_EXIT_BITMAP2_HIGH           = 0x00002021,
        EOI_EXIT_BITMAP3                = 0x00002022,
        EOI_EXIT_BITMAP3_HIGH           = 0x00002023,
+       VMREAD_BITMAP                   = 0x00002026,
+       VMWRITE_BITMAP                  = 0x00002028,
        GUEST_PHYSICAL_ADDRESS          = 0x00002400,
        GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
        VMCS_LINK_POINTER               = 0x00002800,
@@ -209,6 +226,7 @@ enum vmcs_field {
        GUEST_INTERRUPTIBILITY_INFO     = 0x00004824,
        GUEST_ACTIVITY_STATE            = 0X00004826,
        GUEST_SYSENTER_CS               = 0x0000482A,
+       VMX_PREEMPTION_TIMER_VALUE      = 0x0000482E,
        HOST_IA32_SYSENTER_CS           = 0x00004c00,
        CR0_GUEST_HOST_MASK             = 0x00006000,
        CR4_GUEST_HOST_MASK             = 0x00006002,
index a65ec29e6ffb0e4cbc69bdbaf353f207f0495c26..5d9a3033b3d76dcf9d0d0566c11cfc0853d90c6f 100644 (file)
@@ -29,7 +29,6 @@
 #define __KVM_HAVE_PIT
 #define __KVM_HAVE_IOAPIC
 #define __KVM_HAVE_IRQ_LINE
-#define __KVM_HAVE_DEVICE_ASSIGNMENT
 #define __KVM_HAVE_MSI
 #define __KVM_HAVE_USER_NMI
 #define __KVM_HAVE_GUEST_DEBUG
index b5757885d7a4790c78b9544e4a2d62842bcde7ca..b3a4866661c5c426efb0504294a3c7228ccd107a 100644 (file)
 #define VMX_BASIC_MEM_TYPE_WB  6LLU
 #define VMX_BASIC_INOUT                0x0040000000000000LLU
 
+/* MSR_IA32_VMX_MISC bits */
+#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
 /* AMD-V MSRs */
 
 #define MSR_VM_CR                       0xc0010114
index 2871fccfee68619896f03d50ab4b93f75eaad8e3..d651082c7cf720a805f653b162343f6b9a984329 100644 (file)
@@ -65,6 +65,7 @@
 #define EXIT_REASON_EOI_INDUCED         45
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
+#define EXIT_REASON_PREEMPTION_TIMER    52
 #define EXIT_REASON_WBINVD              54
 #define EXIT_REASON_XSETBV              55
 #define EXIT_REASON_APIC_WRITE          56
        { EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
        { EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
        { EXIT_REASON_INVD,                  "INVD" }, \
-       { EXIT_REASON_INVPCID,               "INVPCID" }
-
+       { EXIT_REASON_INVPCID,               "INVPCID" }, \
+       { EXIT_REASON_PREEMPTION_TIMER,      "PREEMPTION_TIMER" }
 
 #endif /* _UAPIVMX_H */
index c1d01e6ca790299611a2ada8eb0e163e4fd8336b..727208941030943370fde4160b6f9ad54e62b68b 100644 (file)
@@ -1166,6 +1166,11 @@ apicinterrupt LOCAL_TIMER_VECTOR \
 apicinterrupt X86_PLATFORM_IPI_VECTOR \
        x86_platform_ipi smp_x86_platform_ipi
 
+#ifdef CONFIG_HAVE_KVM
+apicinterrupt POSTED_INTR_VECTOR \
+       kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
+#endif
+
 apicinterrupt THRESHOLD_APIC_VECTOR \
        threshold_interrupt smp_threshold_interrupt
 apicinterrupt THERMAL_APIC_VECTOR \
index 84b778962c661ce3a00f6c333038d4fa328628a2..ac0631d8996ffe2085d5d57de3b5bde718c3359c 100644 (file)
@@ -224,6 +224,28 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
        set_irq_regs(old_regs);
 }
 
+#ifdef CONFIG_HAVE_KVM
+/*
+ * Handler for POSTED_INTERRUPT_VECTOR.
+ */
+void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
+{
+       struct pt_regs *old_regs = set_irq_regs(regs);
+
+       ack_APIC_irq();
+
+       irq_enter();
+
+       exit_idle();
+
+       inc_irq_stat(kvm_posted_intr_ipis);
+
+       irq_exit();
+
+       set_irq_regs(old_regs);
+}
+#endif
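POSTED_INTR_VECTOR is the notification vector used by VMX APIC virtualization: when the target vCPU is running in guest (non-root) mode, the processor consumes the IPI itself and moves the posted interrupt from the PIR into the virtual APIC without a VM exit; only when the vCPU happens to be in host context does the stub above run, and it merely acks the APIC and counts the event, leaving the pending bits to be folded in later through the new sync_pir_to_irr hook. A rough sketch of the sending side (helper names are illustrative, not verbatim from vmx.c):

static void deliver_posted_interrupt_sketch(struct kvm_vcpu *vcpu, int vector)
{
	/* pi_desc is the per-vCPU posted-interrupt descriptor kept in vmx.c;
	 * vcpu_to_pi_desc() is a hypothetical accessor for it */
	struct pi_desc *pi = vcpu_to_pi_desc(vcpu);

	/* mark the vector pending in the 256-bit PIR */
	if (test_and_set_bit(vector, (unsigned long *)pi->pir))
		return;				/* already pending */

	/* notify the CPU currently running the vCPU */
	apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR);
}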
+
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
 
 #ifdef CONFIG_HOTPLUG_CPU
index 7dc4e459c2b389c2126d679d1692856ea4caddb0..a2a1fbc594ff906d0727e9f44e7db6f02c876570 100644 (file)
@@ -172,6 +172,10 @@ static void __init apic_intr_init(void)
 
        /* IPI for X86 platform specific use */
        alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
+#ifdef CONFIG_HAVE_KVM
+       /* IPI for KVM to deliver posted interrupt */
+       alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
+#endif
 
        /* IPI vectors for APIC spurious and error interrupts */
        alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
index 0732f0089a3df2d0bcbde6b397fc8c3e1e76844c..d2c381280e3cfd1a3b462ab0d43320f424e1e7bc 100644 (file)
@@ -160,8 +160,12 @@ int kvm_register_clock(char *txt)
 {
        int cpu = smp_processor_id();
        int low, high, ret;
-       struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
+       struct pvclock_vcpu_time_info *src;
+
+       if (!hv_clock)
+               return 0;
 
+       src = &hv_clock[cpu].pvti;
        low = (int)slow_virt_to_phys(src) | 1;
        high = ((u64)slow_virt_to_phys(src) >> 32);
        ret = native_write_msr_safe(msr_kvm_system_time, low, high);
@@ -276,6 +280,9 @@ int __init kvm_setup_vsyscall_timeinfo(void)
        struct pvclock_vcpu_time_info *vcpu_time;
        unsigned int size;
 
+       if (!hv_clock)
+               return 0;
+
        size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
 
        preempt_disable();
index 586f0005980510e9683ea6c1dbf436cec6e4d52f..a47a3e54b964b5bd486e2d199816fb12acdc9170 100644 (file)
@@ -21,14 +21,13 @@ config KVM
        tristate "Kernel-based Virtual Machine (KVM) support"
        depends on HAVE_KVM
        depends on HIGH_RES_TIMERS
-       # for device assignment:
-       depends on PCI
        # for TASKSTATS/TASK_DELAY_ACCT:
        depends on NET
        select PREEMPT_NOTIFIERS
        select MMU_NOTIFIER
        select ANON_INODES
        select HAVE_KVM_IRQCHIP
+       select HAVE_KVM_IRQ_ROUTING
        select HAVE_KVM_EVENTFD
        select KVM_APIC_ARCHITECTURE
        select KVM_ASYNC_PF
@@ -82,6 +81,17 @@ config KVM_MMU_AUDIT
         This option adds a R/W kVM module parameter 'mmu_audit', which allows
         audit  KVM MMU at runtime.
 
+config KVM_DEVICE_ASSIGNMENT
+       bool "KVM legacy PCI device assignment support"
+       depends on KVM && PCI && IOMMU_API
+       default y
+       ---help---
+         Provide support for legacy PCI device assignment through KVM.  The
+         kernel now also supports a full featured userspace device driver
+         framework through VFIO, which supersedes much of this support.
+
+         If unsure, say Y.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
index 04d30401c5cb26aa2b491ad3b0ceeb8d9fa7e8ff..d609e1d8404852d02b9fdfeed0aa2103580b7cda 100644 (file)
@@ -7,8 +7,9 @@ CFLAGS_vmx.o := -I.
 
 kvm-y                  += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
                                coalesced_mmio.o irq_comm.o eventfd.o \
-                               assigned-dev.o)
-kvm-$(CONFIG_IOMMU_API)        += $(addprefix ../../../virt/kvm/, iommu.o)
+                               irqchip.o)
+kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)    += $(addprefix ../../../virt/kvm/, \
+                               assigned-dev.o iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)     += $(addprefix ../../../virt/kvm/, async_pf.o)
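Taken together with the Kconfig hunk above, this makes legacy device assignment a plain compile-time option; a hypothetical configuration and its effect, for illustration only:

	CONFIG_KVM=m
	# CONFIG_KVM_DEVICE_ASSIGNMENT is not set
	#  -> kvm.ko links irqchip.o but neither assigned-dev.o nor iommu.o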
 
 kvm-y                  += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
index a335cc6cde72aa099cbd718bfc02a71c3d70feb2..8e517bba6a7c9434099139ecbb82a5017ca49cbf 100644 (file)
 #define Priv        (1<<27) /* instruction generates #GP if current CPL != 0 */
 #define No64       (1<<28)
 #define PageTable   (1 << 29)   /* instruction used to write page table */
+#define NotImpl     (1 << 30)   /* instruction is not implemented */
 /* Source 2 operand type */
-#define Src2Shift   (30)
+#define Src2Shift   (31)
 #define Src2None    (OpNone << Src2Shift)
 #define Src2CL      (OpCL << Src2Shift)
 #define Src2ImmByte (OpImmByte << Src2Shift)
@@ -1578,12 +1579,21 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
        memset(&seg_desc, 0, sizeof seg_desc);
 
-       if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
-           || ctxt->mode == X86EMUL_MODE_REAL) {
-               /* set real mode segment descriptor */
+       if (ctxt->mode == X86EMUL_MODE_REAL) {
+               /* set real mode segment descriptor (keep limit etc. for
+                * unreal mode) */
                ctxt->ops->get_segment(ctxt, &dummy, &seg_desc, NULL, seg);
                set_desc_base(&seg_desc, selector << 4);
                goto load;
+       } else if (seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) {
+               /* VM86 needs a clean new segment descriptor */
+               set_desc_base(&seg_desc, selector << 4);
+               set_desc_limit(&seg_desc, 0xffff);
+               seg_desc.type = 3;
+               seg_desc.p = 1;
+               seg_desc.s = 1;
+               seg_desc.dpl = 3;
+               goto load;
        }
 
        rpl = selector & 3;
@@ -3615,7 +3625,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 #define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
 #define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
                      .check_perm = (_p) }
-#define N    D(0)
+#define N    D(NotImpl)
 #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
 #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
 #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
@@ -3713,7 +3723,7 @@ static const struct opcode group5[] = {
        I(SrcMemFAddr | ImplicitOps | Stack,    em_call_far),
        I(SrcMem | Stack,                       em_grp45),
        I(SrcMemFAddr | ImplicitOps,            em_grp45),
-       I(SrcMem | Stack,                       em_grp45), N,
+       I(SrcMem | Stack,                       em_grp45), D(Undefined),
 };
 
 static const struct opcode group6[] = {
@@ -4162,6 +4172,10 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
                break;
        case OpMem8:
                ctxt->memop.bytes = 1;
+               if (ctxt->memop.type == OP_REG) {
+                       ctxt->memop.addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1);
+                       fetch_register_operand(&ctxt->memop);
+               }
                goto mem_common;
        case OpMem16:
                ctxt->memop.bytes = 2;
@@ -4373,7 +4387,7 @@ done_prefixes:
        ctxt->intercept = opcode.intercept;
 
        /* Unrecognised? */
-       if (ctxt->d == 0 || (ctxt->d & Undefined))
+       if (ctxt->d == 0 || (ctxt->d & NotImpl))
                return EMULATION_FAILED;
 
        if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
@@ -4511,7 +4525,8 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 
        ctxt->mem_read.pos = 0;
 
-       if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) {
+       if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
+                       (ctxt->d & Undefined)) {
                rc = emulate_ud(ctxt);
                goto done;
        }
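The emulator now distinguishes two kinds of unhandled opcode: NotImpl marks encodings the emulator simply cannot handle, so decode bails out with EMULATION_FAILED, while Undefined marks encodings that are architecturally invalid, so the guest receives #UD (see the group5 slot above that changes from N to D(Undefined)). Condensed from the hunks above:

	/* decode time: not implemented by the emulator */
	if (ctxt->d == 0 || (ctxt->d & NotImpl))
		return EMULATION_FAILED;

	/* execute time: undefined on real hardware (or invalid in 64-bit mode) */
	if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) ||
	    (ctxt->d & Undefined)) {
		rc = emulate_ud(ctxt);		/* inject #UD into the guest */
		goto done;
	}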
index c1d30b2fc9bb166e42f02557fa5b4c79f79786e9..412a5aa0ef94858ade3a32de9e81dfdbdf44d925 100644 (file)
@@ -290,8 +290,8 @@ static void pit_do_work(struct kthread_work *work)
        }
        spin_unlock(&ps->inject_lock);
        if (inject) {
-               kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
-               kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
+               kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false);
+               kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false);
 
                /*
                 * Provides NMI watchdog support via Virtual Wire mode.
index f77df1c5de6eeb4ac341b9a9ac7f6afdb054b658..e1adbb4aca753657bfa246ecf56e3b52b130a9ab 100644 (file)
@@ -94,6 +94,14 @@ static inline int apic_test_vector(int vec, void *bitmap)
        return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
 }
 
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       return apic_test_vector(vector, apic->regs + APIC_ISR) ||
+               apic_test_vector(vector, apic->regs + APIC_IRR);
+}
+
 static inline void apic_set_vector(int vec, void *bitmap)
 {
        set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec));
@@ -145,53 +153,6 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
        return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
 }
 
-void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
-                               struct kvm_lapic_irq *irq,
-                               u64 *eoi_exit_bitmap)
-{
-       struct kvm_lapic **dst;
-       struct kvm_apic_map *map;
-       unsigned long bitmap = 1;
-       int i;
-
-       rcu_read_lock();
-       map = rcu_dereference(vcpu->kvm->arch.apic_map);
-
-       if (unlikely(!map)) {
-               __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap);
-               goto out;
-       }
-
-       if (irq->dest_mode == 0) { /* physical mode */
-               if (irq->delivery_mode == APIC_DM_LOWEST ||
-                               irq->dest_id == 0xff) {
-                       __set_bit(irq->vector,
-                                 (unsigned long *)eoi_exit_bitmap);
-                       goto out;
-               }
-               dst = &map->phys_map[irq->dest_id & 0xff];
-       } else {
-               u32 mda = irq->dest_id << (32 - map->ldr_bits);
-
-               dst = map->logical_map[apic_cluster_id(map, mda)];
-
-               bitmap = apic_logical_id(map, mda);
-       }
-
-       for_each_set_bit(i, &bitmap, 16) {
-               if (!dst[i])
-                       continue;
-               if (dst[i]->vcpu == vcpu) {
-                       __set_bit(irq->vector,
-                                 (unsigned long *)eoi_exit_bitmap);
-                       break;
-               }
-       }
-
-out:
-       rcu_read_unlock();
-}
-
 static void recalculate_apic_map(struct kvm *kvm)
 {
        struct kvm_apic_map *new, *old = NULL;
@@ -256,7 +217,7 @@ out:
        if (old)
                kfree_rcu(old, rcu);
 
-       kvm_ioapic_make_eoibitmap_request(kvm);
+       kvm_vcpu_request_scan_ioapic(kvm);
 }
 
 static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
@@ -357,6 +318,19 @@ static u8 count_vectors(void *bitmap)
        return count;
 }
 
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+{
+       u32 i, pir_val;
+       struct kvm_lapic *apic = vcpu->arch.apic;
+
+       for (i = 0; i <= 7; i++) {
+               pir_val = xchg(&pir[i], 0);
+               if (pir_val)
+                       *((u32 *)(apic->regs + APIC_IRR + i * 0x10)) |= pir_val;
+       }
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
+
 static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic)
 {
        apic->irr_pending = true;
@@ -379,6 +353,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
        if (!apic->irr_pending)
                return -1;
 
+       kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
        result = apic_search_irr(apic);
        ASSERT(result == -1 || result >= 16);
 
@@ -431,14 +406,16 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 }
 
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
-                            int vector, int level, int trig_mode);
+                            int vector, int level, int trig_mode,
+                            unsigned long *dest_map);
 
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+               unsigned long *dest_map)
 {
        struct kvm_lapic *apic = vcpu->arch.apic;
 
        return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
-                       irq->level, irq->trig_mode);
+                       irq->level, irq->trig_mode, dest_map);
 }
 
 static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
@@ -505,6 +482,15 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
        return result;
 }
 
+void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       int i;
+
+       for (i = 0; i < 8; i++)
+               apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]);
+}
+
 static void apic_update_ppr(struct kvm_lapic *apic)
 {
        u32 tpr, isrv, ppr, old_ppr;
@@ -611,7 +597,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 }
 
 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq, int *r)
+               struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map)
 {
        struct kvm_apic_map *map;
        unsigned long bitmap = 1;
@@ -622,7 +608,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
        *r = -1;
 
        if (irq->shorthand == APIC_DEST_SELF) {
-               *r = kvm_apic_set_irq(src->vcpu, irq);
+               *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
                return true;
        }
 
@@ -667,7 +653,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
                        continue;
                if (*r < 0)
                        *r = 0;
-               *r += kvm_apic_set_irq(dst[i]->vcpu, irq);
+               *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
        }
 
        ret = true;
@@ -681,7 +667,8 @@ out:
  * Return 1 if successfully added and 0 if discarded.
  */
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
-                            int vector, int level, int trig_mode)
+                            int vector, int level, int trig_mode,
+                            unsigned long *dest_map)
 {
        int result = 0;
        struct kvm_vcpu *vcpu = apic->vcpu;
@@ -694,24 +681,28 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
                if (unlikely(!apic_enabled(apic)))
                        break;
 
-               if (trig_mode) {
-                       apic_debug("level trig mode for vector %d", vector);
-                       apic_set_vector(vector, apic->regs + APIC_TMR);
-               } else
-                       apic_clear_vector(vector, apic->regs + APIC_TMR);
+               if (dest_map)
+                       __set_bit(vcpu->vcpu_id, dest_map);
 
-               result = !apic_test_and_set_irr(vector, apic);
-               trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
-                                         trig_mode, vector, !result);
-               if (!result) {
-                       if (trig_mode)
-                               apic_debug("level trig mode repeatedly for "
-                                               "vector %d", vector);
-                       break;
-               }
+               if (kvm_x86_ops->deliver_posted_interrupt) {
+                       result = 1;
+                       kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
+               } else {
+                       result = !apic_test_and_set_irr(vector, apic);
 
-               kvm_make_request(KVM_REQ_EVENT, vcpu);
-               kvm_vcpu_kick(vcpu);
+                       if (!result) {
+                               if (trig_mode)
+                                       apic_debug("level trig mode repeatedly "
+                                               "for vector %d", vector);
+                               goto out;
+                       }
+
+                       kvm_make_request(KVM_REQ_EVENT, vcpu);
+                       kvm_vcpu_kick(vcpu);
+               }
+out:
+               trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
+                               trig_mode, vector, !result);
                break;
 
        case APIC_DM_REMRD:
@@ -731,7 +722,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
        case APIC_DM_INIT:
                if (!trig_mode || level) {
                        result = 1;
-                       vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+                       /* assumes that there are only KVM_APIC_INIT/SIPI */
+                       apic->pending_events = (1UL << KVM_APIC_INIT);
+                       /* make sure pending_events is visible before sending
+                        * the request */
+                       smp_wmb();
                        kvm_make_request(KVM_REQ_EVENT, vcpu);
                        kvm_vcpu_kick(vcpu);
                } else {
@@ -743,13 +738,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
        case APIC_DM_STARTUP:
                apic_debug("SIPI to vcpu %d vector 0x%02x\n",
                           vcpu->vcpu_id, vector);
-               if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
-                       result = 1;
-                       vcpu->arch.sipi_vector = vector;
-                       vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
-                       kvm_make_request(KVM_REQ_EVENT, vcpu);
-                       kvm_vcpu_kick(vcpu);
-               }
+               result = 1;
+               apic->sipi_vector = vector;
+               /* make sure sipi_vector is visible for the receiver */
+               smp_wmb();
+               set_bit(KVM_APIC_SIPI, &apic->pending_events);
+               kvm_make_request(KVM_REQ_EVENT, vcpu);
+               kvm_vcpu_kick(vcpu);
                break;
 
        case APIC_DM_EXTINT:
@@ -782,7 +777,7 @@ static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
                        trigger_mode = IOAPIC_LEVEL_TRIG;
                else
                        trigger_mode = IOAPIC_EDGE_TRIG;
-               kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
+               kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
        }
 }
 
@@ -848,7 +843,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
                   irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
                   irq.vector);
 
-       kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
+       kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
 }
 
 static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -1484,7 +1479,8 @@ int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
                vector = reg & APIC_VECTOR_MASK;
                mode = reg & APIC_MODE_MASK;
                trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
-               return __apic_accept_irq(apic, mode, vector, 1, trig_mode);
+               return __apic_accept_irq(apic, mode, vector, 1, trig_mode,
+                                       NULL);
        }
        return 0;
 }
@@ -1654,6 +1650,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
        apic->highest_isr_cache = -1;
        kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
        kvm_make_request(KVM_REQ_EVENT, vcpu);
+       kvm_rtc_eoi_tracking_restore_one(vcpu);
 }
 
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1860,6 +1857,34 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
                                         addr, sizeof(u8));
 }
 
+void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
+{
+       struct kvm_lapic *apic = vcpu->arch.apic;
+       unsigned int sipi_vector;
+
+       if (!kvm_vcpu_has_lapic(vcpu))
+               return;
+
+       if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
+               kvm_lapic_reset(vcpu);
+               kvm_vcpu_reset(vcpu);
+               if (kvm_vcpu_is_bsp(apic->vcpu))
+                       vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+               else
+                       vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+       }
+       if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events) &&
+           vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+               /* evaluate pending_events before reading the vector */
+               smp_rmb();
+               sipi_vector = apic->sipi_vector;
+               pr_debug("vcpu %d received sipi with vector # %x\n",
+                        vcpu->vcpu_id, sipi_vector);
+               kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
+               vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+       }
+}
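The INIT/SIPI hand-off uses a plain store/load barrier pairing: the sender publishes the payload (the SIPI vector, or the implied INIT state) before setting the pending_events bit, and the receiver clears the bit before reading the payload. Distilled from __apic_accept_irq() and kvm_apic_accept_events() above:

	/* sender: another vCPU delivering SIPI */
	apic->sipi_vector = vector;
	smp_wmb();				/* publish the vector first ... */
	set_bit(KVM_APIC_SIPI, &apic->pending_events);	/* ... then the event */

	/* receiver: target vCPU, before entering the guest */
	if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) {
		smp_rmb();			/* pairs with the smp_wmb() above */
		sipi_vector = apic->sipi_vector;
	}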
+
 void kvm_lapic_init(void)
 {
        /* do not patch jump label more than once per second */
index 1676d34ddb4e26803570b16e655fd81450b315d7..c730ac9fe80188d15957bf9b556918036db892ab 100644 (file)
@@ -5,6 +5,9 @@
 
 #include <linux/kvm_host.h>
 
+#define KVM_APIC_INIT          0
+#define KVM_APIC_SIPI          1
+
 struct kvm_timer {
        struct hrtimer timer;
        s64 period;                             /* unit: ns */
@@ -32,6 +35,8 @@ struct kvm_lapic {
        void *regs;
        gpa_t vapic_addr;
        struct page *vapic_page;
+       unsigned long pending_events;
+       unsigned int sipi_vector;
 };
 int kvm_create_lapic(struct kvm_vcpu *vcpu);
 void kvm_free_lapic(struct kvm_vcpu *vcpu);
@@ -39,6 +44,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu);
 int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
 int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
 void kvm_lapic_reset(struct kvm_vcpu *vcpu);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
@@ -47,13 +53,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
 u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 void kvm_apic_set_version(struct kvm_vcpu *vcpu);
 
+void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
+               unsigned long *dest_map);
 int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
 
 bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq, int *r);
+               struct kvm_lapic_irq *irq, int *r, unsigned long *dest_map);
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
@@ -154,8 +163,11 @@ static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
        return ldr & map->lid_mask;
 }
 
-void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
-                               struct kvm_lapic_irq *irq,
-                               u64 *eoi_bitmap);
+static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.apic->pending_events;
+}
+
+bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
 #endif
index 956ca358108a30e1b2048a42d31742cf3e13a147..004cc87b781c2694a0f6428ef8b6614ff60711b7 100644 (file)
@@ -199,8 +199,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
 static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
 {
+       struct kvm_mmu_page *sp =  page_header(__pa(sptep));
+
        access &= ACC_WRITE_MASK | ACC_USER_MASK;
 
+       sp->mmio_cached = true;
        trace_mark_mmio_spte(sptep, gfn, access);
        mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
 }
@@ -1502,6 +1505,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
                                               u64 *parent_pte, int direct)
 {
        struct kvm_mmu_page *sp;
+
        sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
        sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
        if (!direct)
@@ -1644,16 +1648,14 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                                    struct list_head *invalid_list);
 
-#define for_each_gfn_sp(kvm, sp, gfn)                                  \
-  hlist_for_each_entry(sp,                                             \
-   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)  \
-       if ((sp)->gfn != (gfn)) {} else
+#define for_each_gfn_sp(_kvm, _sp, _gfn)                               \
+       hlist_for_each_entry(_sp,                                       \
+         &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
+               if ((_sp)->gfn != (_gfn)) {} else
 
-#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn)                   \
-  hlist_for_each_entry(sp,                                             \
-   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)  \
-               if ((sp)->gfn != (gfn) || (sp)->role.direct ||          \
-                       (sp)->role.invalid) {} else
+#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)                        \
+       for_each_gfn_sp(_kvm, _sp, _gfn)                                \
+               if ((_sp)->role.direct || (_sp)->role.invalid) {} else
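The rewritten iterators keep the long-standing trick of ending the macro body with "if (condition) {} else": the caller's block becomes the else branch, so entries whose gfn does not match (or which are direct/invalid) are skipped, while the whole construct still expands to a single statement that can be followed by an ordinary brace block. A hypothetical usage sketch:

	struct kvm_mmu_page *sp;
	LIST_HEAD(invalid_list);

	for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
		/* only reached for valid, indirect shadow pages mapping gfn */
		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
	}
	kvm_mmu_commit_zap_page(kvm, &invalid_list);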
 
 /* @sp->gfn should be write-protected at the call site */
 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -2089,7 +2091,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                                    struct list_head *invalid_list)
 {
-       struct kvm_mmu_page *sp;
+       struct kvm_mmu_page *sp, *nsp;
 
        if (list_empty(invalid_list))
                return;
@@ -2106,11 +2108,25 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
         */
        kvm_flush_remote_tlbs(kvm);
 
-       do {
-               sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+       list_for_each_entry_safe(sp, nsp, invalid_list, link) {
                WARN_ON(!sp->role.invalid || sp->root_count);
                kvm_mmu_free_page(sp);
-       } while (!list_empty(invalid_list));
+       }
+}
+
+static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
+                                       struct list_head *invalid_list)
+{
+       struct kvm_mmu_page *sp;
+
+       if (list_empty(&kvm->arch.active_mmu_pages))
+               return false;
+
+       sp = list_entry(kvm->arch.active_mmu_pages.prev,
+                       struct kvm_mmu_page, link);
+       kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
+
+       return true;
 }
 
 /*
@@ -2120,23 +2136,15 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 {
        LIST_HEAD(invalid_list);
-       /*
-        * If we set the number of mmu pages to be smaller be than the
-        * number of actived pages , we must to free some mmu pages before we
-        * change the value
-        */
 
        spin_lock(&kvm->mmu_lock);
 
        if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
-               while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
-                       !list_empty(&kvm->arch.active_mmu_pages)) {
-                       struct kvm_mmu_page *page;
+               /* Need to free some mmu pages to achieve the goal. */
+               while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
+                       if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
+                               break;
 
-                       page = container_of(kvm->arch.active_mmu_pages.prev,
-                                           struct kvm_mmu_page, link);
-                       kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
-               }
                kvm_mmu_commit_zap_page(kvm, &invalid_list);
                goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
        }
@@ -2794,6 +2802,7 @@ exit:
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                         gva_t gva, pfn_t *pfn, bool write, bool *writable);
+static void make_mmu_pages_available(struct kvm_vcpu *vcpu);
 
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
                         gfn_t gfn, bool prefault)
@@ -2835,7 +2844,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                goto out_unlock;
-       kvm_mmu_free_some_pages(vcpu);
+       make_mmu_pages_available(vcpu);
        if (likely(!force_pt_level))
                transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
        r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
@@ -2913,7 +2922,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 
        if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
                spin_lock(&vcpu->kvm->mmu_lock);
-               kvm_mmu_free_some_pages(vcpu);
+               make_mmu_pages_available(vcpu);
                sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
                                      1, ACC_ALL, NULL);
                ++sp->root_count;
@@ -2925,7 +2934,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
 
                        ASSERT(!VALID_PAGE(root));
                        spin_lock(&vcpu->kvm->mmu_lock);
-                       kvm_mmu_free_some_pages(vcpu);
+                       make_mmu_pages_available(vcpu);
                        sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
                                              i << 30,
                                              PT32_ROOT_LEVEL, 1, ACC_ALL,
@@ -2964,7 +2973,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                ASSERT(!VALID_PAGE(root));
 
                spin_lock(&vcpu->kvm->mmu_lock);
-               kvm_mmu_free_some_pages(vcpu);
+               make_mmu_pages_available(vcpu);
                sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
                                      0, ACC_ALL, NULL);
                root = __pa(sp->spt);
@@ -2998,7 +3007,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                                return 1;
                }
                spin_lock(&vcpu->kvm->mmu_lock);
-               kvm_mmu_free_some_pages(vcpu);
+               make_mmu_pages_available(vcpu);
                sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
                                      PT32_ROOT_LEVEL, 0,
                                      ACC_ALL, NULL);
@@ -3304,7 +3313,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
                goto out_unlock;
-       kvm_mmu_free_some_pages(vcpu);
+       make_mmu_pages_available(vcpu);
        if (likely(!force_pt_level))
                transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
        r = __direct_map(vcpu, gpa, write, map_writable,
@@ -4006,17 +4015,17 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 
-void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
 {
        LIST_HEAD(invalid_list);
 
-       while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
-              !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
-               struct kvm_mmu_page *sp;
+       if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
+               return;
+
+       while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
+               if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
+                       break;
 
-               sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
-                                 struct kvm_mmu_page, link);
-               kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
                ++vcpu->kvm->stat.mmu_recycled;
        }
        kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -4185,17 +4194,22 @@ restart:
        spin_unlock(&kvm->mmu_lock);
 }
 
-static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
-                                               struct list_head *invalid_list)
+void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
 {
-       struct kvm_mmu_page *page;
+       struct kvm_mmu_page *sp, *node;
+       LIST_HEAD(invalid_list);
 
-       if (list_empty(&kvm->arch.active_mmu_pages))
-               return;
+       spin_lock(&kvm->mmu_lock);
+restart:
+       list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
+               if (!sp->mmio_cached)
+                       continue;
+               if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+                       goto restart;
+       }
 
-       page = container_of(kvm->arch.active_mmu_pages.prev,
-                           struct kvm_mmu_page, link);
-       kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
+       kvm_mmu_commit_zap_page(kvm, &invalid_list);
+       spin_unlock(&kvm->mmu_lock);
 }
 
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
@@ -4232,7 +4246,7 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
                idx = srcu_read_lock(&kvm->srcu);
                spin_lock(&kvm->mmu_lock);
 
-               kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
+               prepare_zap_oldest_mmu_page(kvm, &invalid_list);
                kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
                spin_unlock(&kvm->mmu_lock);
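
For reference, the new kvm_mmu_zap_mmio_sptes() above uses a restart-on-zap walk: whenever kvm_mmu_prepare_zap_page() reports that it zapped something, the list may have been reshuffled, so the iteration starts over from the head. A minimal stand-alone C sketch of that pattern over a plain singly linked list (illustrative only, not kernel code):

#include <stdio.h>
#include <stdlib.h>

struct node {
        int mmio_cached;                /* stands in for sp->mmio_cached */
        struct node *next;
};

/* Unlink and free one node; return nonzero to signal that the walk may
 * now be stale (kvm_mmu_prepare_zap_page() reports zapped pages similarly). */
static int zap(struct node **head, struct node *victim)
{
        struct node **pp = head;

        while (*pp && *pp != victim)
                pp = &(*pp)->next;
        if (*pp) {
                *pp = victim->next;
                free(victim);
        }
        return 1;
}

static void zap_mmio_cached(struct node **head)
{
        struct node *n;
restart:
        for (n = *head; n; n = n->next) {
                if (!n->mmio_cached)
                        continue;
                if (zap(head, n))
                        goto restart;   /* iterator may be invalid: start over */
        }
}

int main(void)
{
        struct node *head = NULL, *n;
        int i;

        for (i = 0; i < 5; i++) {
                n = malloc(sizeof(*n));
                n->mmio_cached = i & 1;
                n->next = head;
                head = n;
        }
        zap_mmio_cached(&head);
        for (n = head; n; n = n->next)
                printf("kept node, mmio_cached=%d\n", n->mmio_cached);
        return 0;
}
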
index 69871080e8663c76fed7b94783f9b6869e552ac8..2adcbc2cac6db49ad68ae47132e41fa4f7e6101b 100644 (file)
@@ -57,14 +57,11 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
-       return kvm->arch.n_max_mmu_pages -
-               kvm->arch.n_used_mmu_pages;
-}
+       if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
+               return kvm->arch.n_max_mmu_pages -
+                       kvm->arch.n_used_mmu_pages;
 
-static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
-{
-       if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES))
-               __kvm_mmu_free_some_pages(vcpu);
+       return 0;
 }
 
 static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
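
The kvm_mmu_available_pages() change above is an underflow guard: both counters are unsigned, so if n_used_mmu_pages ever exceeds n_max_mmu_pages (for example after the page limit is lowered below current usage), the old unconditional subtraction wraps to a huge value and the refill logic believes plenty of pages are free. A quick stand-alone illustration with made-up numbers:

#include <stdio.h>

static unsigned int available_old(unsigned int max, unsigned int used)
{
        return max - used;                      /* wraps around if used > max */
}

static unsigned int available_new(unsigned int max, unsigned int used)
{
        if (max > used)
                return max - used;
        return 0;                               /* clamped, as in the new code */
}

int main(void)
{
        unsigned int max = 64, used = 70;       /* limit lowered below usage */

        printf("old: %u\n", available_old(max, used));  /* 4294967290 */
        printf("new: %u\n", available_new(max, used));  /* 0 */
        return 0;
}
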
index 105dd5bd550e5995d36b36df6c8739ccaedba033..da20860b457a4c33bc7c17d6cece102b8b8f7216 100644 (file)
@@ -627,7 +627,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
                goto out_unlock;
 
        kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
-       kvm_mmu_free_some_pages(vcpu);
+       make_mmu_pages_available(vcpu);
        if (!force_pt_level)
                transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
        r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
index cfc258a6bf97a1efda8b97bb0ae4fd4bd21a9f23..c53e797e7369ad4a1086899ee65eb1e71d048d84 100644 (file)
@@ -360,10 +360,12 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
        return 1;
 }
 
-int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
        struct kvm_pmu *pmu = &vcpu->arch.pmu;
        struct kvm_pmc *pmc;
+       u32 index = msr_info->index;
+       u64 data = msr_info->data;
 
        switch (index) {
        case MSR_CORE_PERF_FIXED_CTR_CTRL:
@@ -375,6 +377,10 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
                }
                break;
        case MSR_CORE_PERF_GLOBAL_STATUS:
+               if (msr_info->host_initiated) {
+                       pmu->global_status = data;
+                       return 0;
+               }
                break; /* RO MSR */
        case MSR_CORE_PERF_GLOBAL_CTRL:
                if (pmu->global_ctrl == data)
@@ -386,7 +392,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
                break;
        case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
                if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
-                       pmu->global_status &= ~data;
+                       if (!msr_info->host_initiated)
+                               pmu->global_status &= ~data;
                        pmu->global_ovf_ctrl = data;
                        return 0;
                }
@@ -394,7 +401,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
        default:
                if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
                                (pmc = get_fixed_pmc(pmu, index))) {
-                       data = (s64)(s32)data;
+                       if (!msr_info->host_initiated)
+                               data = (s64)(s32)data;
                        pmc->counter += data - read_pmc(pmc);
                        return 0;
                } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
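
The pmu.c hunks above make kvm_pmu_set_msr() aware of host-initiated writes (e.g. userspace restoring counter state): a guest WRMSR to a counter is still sign-extended from 32 bits before being folded into the running count, while a host write is applied verbatim, and host writes may also set otherwise read-only status bits. A small stand-alone sketch of the sign-extension difference only:

#include <stdio.h>
#include <stdint.h>

/* Value that actually reaches the counter arithmetic for a write of
 * 'data', mirroring the (s64)(s32) cast in the hunk above: guest writes
 * are sign-extended from 32 bits, host-initiated writes are kept as-is. */
static int64_t effective_write(uint64_t data, int host_initiated)
{
        if (!host_initiated)
                return (int64_t)(int32_t)data;
        return (int64_t)data;
}

int main(void)
{
        uint64_t data = 0xffffffffull;          /* "-1" written as 32 bits */

        printf("guest write: %lld\n", (long long)effective_write(data, 0));
        printf("host  write: %lld\n", (long long)effective_write(data, 1));
        return 0;       /* prints -1 and 4294967295 respectively */
}
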
index 7d39d70647e3139732e092680c7696b9edbb0ece..a14a6eaf871d9ea312d6dbcdd1bccb5a3c9846ac 100644 (file)
@@ -1131,17 +1131,11 @@ static void init_vmcb(struct vcpu_svm *svm)
        init_seg(&save->gs);
 
        save->cs.selector = 0xf000;
+       save->cs.base = 0xffff0000;
        /* Executable/Readable Code Segment */
        save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
                SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
        save->cs.limit = 0xffff;
-       /*
-        * cs.base should really be 0xffff0000, but vmx can't handle that, so
-        * be consistent with it.
-        *
-        * Replace when we have real mode working for vmx.
-        */
-       save->cs.base = 0xf0000;
 
        save->gdtr.limit = 0xffff;
        save->idtr.limit = 0xffff;
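
Setting cs.base to 0xffff0000 (instead of the old 0xf0000 workaround) means that, together with the reset RIP of 0xfff0 written elsewhere in this series, the first instruction is fetched from the architectural reset vector just below 4G. The arithmetic, as a one-liner:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t cs_base = 0xffff0000;  /* new reset value of CS.base */
        uint16_t rip     = 0xfff0;      /* architectural reset IP */

        /* Real-mode style linear address = segment base + offset. */
        printf("reset vector: 0x%08x\n", cs_base + rip);        /* 0xfffffff0 */
        return 0;
}
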
@@ -1191,7 +1185,7 @@ static void init_vmcb(struct vcpu_svm *svm)
        enable_gif(svm);
 }
 
-static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
+static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        u32 dummy;
@@ -1199,16 +1193,8 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
 
        init_vmcb(svm);
 
-       if (!kvm_vcpu_is_bsp(vcpu)) {
-               kvm_rip_write(vcpu, 0);
-               svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
-               svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
-       }
-
        kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
        kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
-
-       return 0;
 }
 
 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
@@ -3487,7 +3473,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
            exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
            exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
            exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
-               printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
+               printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
                       "exit_code 0x%x\n",
                       __func__, svm->vmcb->control.exit_int_info,
                       exit_code);
@@ -3591,6 +3577,11 @@ static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
        return;
 }
 
+static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+       return;
+}
+
 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
@@ -3641,7 +3632,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
        return ret;
 }
 
-static void enable_irq_window(struct kvm_vcpu *vcpu)
+static int enable_irq_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -3655,15 +3646,16 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
                svm_set_vintr(svm);
                svm_inject_irq(svm, 0x0);
        }
+       return 0;
 }
 
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
+static int enable_nmi_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
 
        if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
            == HF_NMI_MASK)
-               return; /* IRET will cause a vm exit */
+               return 0; /* IRET will cause a vm exit */
 
        /*
         * Something prevents NMI from been injected. Single step over possible
@@ -3672,6 +3664,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
        svm->nmi_singlestep = true;
        svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
        update_db_bp_intercept(vcpu);
+       return 0;
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -4247,6 +4240,11 @@ out:
        return ret;
 }
 
+static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
+{
+       local_irq_enable();
+}
+
 static struct kvm_x86_ops svm_x86_ops = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
@@ -4314,6 +4312,7 @@ static struct kvm_x86_ops svm_x86_ops = {
        .vm_has_apicv = svm_vm_has_apicv,
        .load_eoi_exitmap = svm_load_eoi_exitmap,
        .hwapic_isr_update = svm_hwapic_isr_update,
+       .sync_pir_to_irr = svm_sync_pir_to_irr,
 
        .set_tss_addr = svm_set_tss_addr,
        .get_tdp_level = get_npt_level,
@@ -4342,6 +4341,7 @@ static struct kvm_x86_ops svm_x86_ops = {
        .set_tdp_cr3 = set_tdp_cr3,
 
        .check_intercept = svm_check_intercept,
+       .handle_external_intr = svm_handle_external_intr,
 };
 
 static int __init svm_init(void)
index 867b81037f9680d09f745884f7df03e710b45042..25a791ed21c88057697eb06339f7066d8cfa63e2 100644 (file)
@@ -84,8 +84,11 @@ module_param(vmm_exclusive, bool, S_IRUGO);
 static bool __read_mostly fasteoi = 1;
 module_param(fasteoi, bool, S_IRUGO);
 
-static bool __read_mostly enable_apicv_reg_vid;
+static bool __read_mostly enable_apicv = 1;
+module_param(enable_apicv, bool, S_IRUGO);
 
+static bool __read_mostly enable_shadow_vmcs = 1;
+module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -298,7 +301,8 @@ struct __packed vmcs12 {
        u32 guest_activity_state;
        u32 guest_sysenter_cs;
        u32 host_ia32_sysenter_cs;
-       u32 padding32[8]; /* room for future expansion */
+       u32 vmx_preemption_timer_value;
+       u32 padding32[7]; /* room for future expansion */
        u16 virtual_processor_id;
        u16 guest_es_selector;
        u16 guest_cs_selector;
@@ -351,6 +355,12 @@ struct nested_vmx {
        /* The host-usable pointer to the above */
        struct page *current_vmcs12_page;
        struct vmcs12 *current_vmcs12;
+       struct vmcs *current_shadow_vmcs;
+       /*
+        * Indicates whether the shadow vmcs must be updated with the
+        * data held by vmcs12
+        */
+       bool sync_shadow_vmcs;
 
        /* vmcs02_list cache of VMCSs recently used to run L2 guests */
        struct list_head vmcs02_pool;
@@ -365,6 +375,31 @@ struct nested_vmx {
        struct page *apic_access_page;
 };
 
+#define POSTED_INTR_ON  0
+/* Posted-Interrupt Descriptor */
+struct pi_desc {
+       u32 pir[8];     /* Posted interrupt requested */
+       u32 control;    /* bit 0 of control is outstanding notification bit */
+       u32 rsvd[7];
+} __aligned(64);
+
+static bool pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+       return test_and_set_bit(POSTED_INTR_ON,
+                       (unsigned long *)&pi_desc->control);
+}
+
+static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
+{
+       return test_and_clear_bit(POSTED_INTR_ON,
+                       (unsigned long *)&pi_desc->control);
+}
+
+static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+{
+       return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
 struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        unsigned long         host_rsp;
@@ -377,6 +412,7 @@ struct vcpu_vmx {
        struct shared_msr_entry *guest_msrs;
        int                   nmsrs;
        int                   save_nmsrs;
+       unsigned long         host_idt_base;
 #ifdef CONFIG_X86_64
        u64                   msr_host_kernel_gs_base;
        u64                   msr_guest_kernel_gs_base;
@@ -428,6 +464,9 @@ struct vcpu_vmx {
 
        bool rdtscp_enabled;
 
+       /* Posted interrupt descriptor */
+       struct pi_desc pi_desc;
+
        /* Support for a guest hypervisor (nested VMX) */
        struct nested_vmx nested;
 };
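
The pi_desc structure and its helpers above encode the posted-interrupt handshake: a sender first sets the vector's bit in PIR, then test-and-sets the outstanding-notification bit (ON, bit 0 of control); only the 0 to 1 transition of ON calls for a notification IPI. A simplified stand-alone model of that bookkeeping using GCC atomic builtins in place of the kernel's test_and_set_bit (it ignores the in-guest-mode check and the fallback vcpu kick in the real delivery path):

#include <stdio.h>
#include <stdint.h>

struct pi_desc_model {
        uint32_t pir[8];        /* 256 posted-interrupt request bits */
        uint32_t control;       /* bit 0: outstanding notification (ON) */
};

/* Returns 1 if this posting would require a notification IPI. */
static int post_interrupt(struct pi_desc_model *pi, int vector)
{
        uint32_t word = vector / 32, bit = vector % 32;
        uint32_t old;

        old = __atomic_fetch_or(&pi->pir[word], 1u << bit, __ATOMIC_SEQ_CST);
        if (old & (1u << bit))
                return 0;       /* vector was already pending */

        old = __atomic_fetch_or(&pi->control, 1u, __ATOMIC_SEQ_CST);
        return !(old & 1u);     /* notify only when ON goes 0 -> 1 */
}

int main(void)
{
        struct pi_desc_model pi = { { 0 }, 0 };

        printf("post 0x31: notify=%d\n", post_interrupt(&pi, 0x31)); /* 1 */
        printf("post 0x32: notify=%d\n", post_interrupt(&pi, 0x32)); /* 0 */
        return 0;
}
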
@@ -451,6 +490,64 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 #define FIELD64(number, name)  [number] = VMCS12_OFFSET(name), \
                                [number##_HIGH] = VMCS12_OFFSET(name)+4
 
+
+static const unsigned long shadow_read_only_fields[] = {
+       /*
+        * We do NOT shadow fields that are modified when L0
+        * traps and emulates any vmx instruction (e.g. VMPTRLD,
+        * VMXON...) executed by L1.
+        * For example, VM_INSTRUCTION_ERROR is read
+        * by L1 if a vmx instruction fails (part of the error path).
+        * Note the code assumes this logic. If for some reason
+        * we start shadowing these fields then we need to
+        * force a shadow sync when L0 emulates vmx instructions
+        * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
+        * by nested_vmx_failValid)
+        */
+       VM_EXIT_REASON,
+       VM_EXIT_INTR_INFO,
+       VM_EXIT_INSTRUCTION_LEN,
+       IDT_VECTORING_INFO_FIELD,
+       IDT_VECTORING_ERROR_CODE,
+       VM_EXIT_INTR_ERROR_CODE,
+       EXIT_QUALIFICATION,
+       GUEST_LINEAR_ADDRESS,
+       GUEST_PHYSICAL_ADDRESS
+};
+static const int max_shadow_read_only_fields =
+       ARRAY_SIZE(shadow_read_only_fields);
+
+static const unsigned long shadow_read_write_fields[] = {
+       GUEST_RIP,
+       GUEST_RSP,
+       GUEST_CR0,
+       GUEST_CR3,
+       GUEST_CR4,
+       GUEST_INTERRUPTIBILITY_INFO,
+       GUEST_RFLAGS,
+       GUEST_CS_SELECTOR,
+       GUEST_CS_AR_BYTES,
+       GUEST_CS_LIMIT,
+       GUEST_CS_BASE,
+       GUEST_ES_BASE,
+       CR0_GUEST_HOST_MASK,
+       CR0_READ_SHADOW,
+       CR4_READ_SHADOW,
+       TSC_OFFSET,
+       EXCEPTION_BITMAP,
+       CPU_BASED_VM_EXEC_CONTROL,
+       VM_ENTRY_EXCEPTION_ERROR_CODE,
+       VM_ENTRY_INTR_INFO_FIELD,
+       VM_ENTRY_INSTRUCTION_LEN,
+       VM_ENTRY_EXCEPTION_ERROR_CODE,
+       HOST_FS_BASE,
+       HOST_GS_BASE,
+       HOST_FS_SELECTOR,
+       HOST_GS_SELECTOR
+};
+static const int max_shadow_read_write_fields =
+       ARRAY_SIZE(shadow_read_write_fields);
+
 static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
        FIELD(GUEST_ES_SELECTOR, guest_es_selector),
@@ -537,6 +634,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
        FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
        FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
+       FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
        FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
        FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
        FIELD(CR0_READ_SHADOW, cr0_read_shadow),
@@ -624,6 +722,9 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
 static bool guest_state_valid(struct kvm_vcpu *vcpu);
 static u32 vmx_segment_access_rights(struct kvm_segment *var);
+static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
+static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
+static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -640,6 +741,8 @@ static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_longmode;
 static unsigned long *vmx_msr_bitmap_legacy_x2apic;
 static unsigned long *vmx_msr_bitmap_longmode_x2apic;
+static unsigned long *vmx_vmread_bitmap;
+static unsigned long *vmx_vmwrite_bitmap;
 
 static bool cpu_has_load_ia32_efer;
 static bool cpu_has_load_perf_global_ctrl;
@@ -782,6 +885,18 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 }
 
+static inline bool cpu_has_vmx_posted_intr(void)
+{
+       return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+}
+
+static inline bool cpu_has_vmx_apicv(void)
+{
+       return cpu_has_vmx_apic_register_virt() &&
+               cpu_has_vmx_virtual_intr_delivery() &&
+               cpu_has_vmx_posted_intr();
+}
+
 static inline bool cpu_has_vmx_flexpriority(void)
 {
        return cpu_has_vmx_tpr_shadow() &&
@@ -895,6 +1010,18 @@ static inline bool cpu_has_vmx_wbinvd_exit(void)
                SECONDARY_EXEC_WBINVD_EXITING;
 }
 
+static inline bool cpu_has_vmx_shadow_vmcs(void)
+{
+       u64 vmx_msr;
+       rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+       /* check if the cpu supports writing r/o exit information fields */
+       if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
+               return false;
+
+       return vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_SHADOW_VMCS;
+}
+
 static inline bool report_flexpriority(void)
 {
        return flexpriority_enabled;
@@ -1790,7 +1917,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
        u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
        if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
-               nested_pf_handled(vcpu))
+           !vmx->nested.nested_run_pending && nested_pf_handled(vcpu))
                return;
 
        if (has_error_code) {
@@ -2022,6 +2149,7 @@ static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
 static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static u32 nested_vmx_misc_low, nested_vmx_misc_high;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
        /*
@@ -2040,30 +2168,40 @@ static __init void nested_vmx_setup_ctls_msrs(void)
         */
 
        /* pin-based controls */
+       rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
+             nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high);
        /*
         * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
         * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
         */
-       nested_vmx_pinbased_ctls_low = 0x16 ;
-       nested_vmx_pinbased_ctls_high = 0x16 |
-               PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
-               PIN_BASED_VIRTUAL_NMIS;
+       nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+       nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK |
+               PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS |
+               PIN_BASED_VMX_PREEMPTION_TIMER;
+       nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
 
-       /* exit controls */
-       nested_vmx_exit_ctls_low = 0;
+       /*
+        * Exit controls
+        * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
+        * 17 must be 1.
+        */
+       nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
        /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
 #ifdef CONFIG_X86_64
        nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #else
        nested_vmx_exit_ctls_high = 0;
 #endif
+       nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
 
        /* entry controls */
        rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
                nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
-       nested_vmx_entry_ctls_low = 0;
+       /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
+       nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
        nested_vmx_entry_ctls_high &=
                VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
+       nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
 
        /* cpu-based controls */
        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2080,6 +2218,7 @@ static __init void nested_vmx_setup_ctls_msrs(void)
                CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
                CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
                CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
+               CPU_BASED_PAUSE_EXITING |
                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
        /*
         * We can allow some features even when not supported by the
@@ -2094,7 +2233,14 @@ static __init void nested_vmx_setup_ctls_msrs(void)
                nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
        nested_vmx_secondary_ctls_low = 0;
        nested_vmx_secondary_ctls_high &=
-               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+               SECONDARY_EXEC_WBINVD_EXITING;
+
+       /* miscellaneous data */
+       rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
+       nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
+               VMX_MISC_SAVE_EFER_LMA;
+       nested_vmx_misc_high = 0;
 }
 
 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
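
In nested_vmx_setup_ctls_msrs() above, every control family is kept as a (low, high) pair: low carries the bits that must be 1 for L1 (allowed-0 settings), high the bits that may be 1 (allowed-1 settings). vmx_get_vmx_msr() below reports the pair to L1 packed into one 64-bit capability MSR via vmx_control_msr(), and vmx_control_verify() checks a value L1 chose against the pair. A stand-alone sketch of both, assuming the straightforward implementations (the helpers' bodies are not shown in this excerpt):

#include <stdio.h>
#include <stdint.h>

/* Assumed shape of vmx_control_msr(): must-be-1 bits in the low dword,
 * may-be-1 bits in the high dword of the reported capability MSR. */
static uint64_t control_msr(uint32_t low, uint32_t high)
{
        return (uint64_t)low | ((uint64_t)high << 32);
}

/* Assumed shape of vmx_control_verify(): every must-be-1 bit is set and
 * no bit outside the may-be-1 mask is set. */
static int control_ok(uint32_t control, uint32_t low, uint32_t high)
{
        return (control & low) == low && (control & ~high) == 0;
}

int main(void)
{
        uint32_t low = 0x16, high = 0x7f;       /* made-up example values */

        printf("reported MSR: 0x%016llx\n",
               (unsigned long long)control_msr(low, high));
        printf("0x16 ok: %d, 0x80 ok: %d\n",
               control_ok(0x16, low, high), control_ok(0x80, low, high));
        return 0;       /* 0x0000007f00000016, then 1 and 0 */
}
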
@@ -2165,7 +2311,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
                                        nested_vmx_entry_ctls_high);
                break;
        case MSR_IA32_VMX_MISC:
-               *pdata = 0;
+               *pdata = vmx_control_msr(nested_vmx_misc_low,
+                                        nested_vmx_misc_high);
                break;
        /*
         * These MSRs specify bits which the guest must keep fixed (on or off)
@@ -2529,12 +2676,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
        u32 _vmexit_control = 0;
        u32 _vmentry_control = 0;
 
-       min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-       opt = PIN_BASED_VIRTUAL_NMIS;
-       if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
-                               &_pin_based_exec_control) < 0)
-               return -EIO;
-
        min = CPU_BASED_HLT_EXITING |
 #ifdef CONFIG_X86_64
              CPU_BASED_CR8_LOAD_EXITING |
@@ -2573,7 +2714,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                        SECONDARY_EXEC_RDTSCP |
                        SECONDARY_EXEC_ENABLE_INVPCID |
                        SECONDARY_EXEC_APIC_REGISTER_VIRT |
-                       SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
+                       SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
+                       SECONDARY_EXEC_SHADOW_VMCS;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control) < 0)
@@ -2605,11 +2747,23 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 #ifdef CONFIG_X86_64
        min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
-       opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
+       opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
+               VM_EXIT_ACK_INTR_ON_EXIT;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
                                &_vmexit_control) < 0)
                return -EIO;
 
+       min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+       opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+       if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
+                               &_pin_based_exec_control) < 0)
+               return -EIO;
+
+       if (!(_cpu_based_2nd_exec_control &
+               SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
+               !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
+               _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
+
        min = 0;
        opt = VM_ENTRY_LOAD_IA32_PAT;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
@@ -2762,6 +2916,8 @@ static __init int hardware_setup(void)
 
        if (!cpu_has_vmx_vpid())
                enable_vpid = 0;
+       if (!cpu_has_vmx_shadow_vmcs())
+               enable_shadow_vmcs = 0;
 
        if (!cpu_has_vmx_ept() ||
            !cpu_has_vmx_ept_4levels()) {
@@ -2788,14 +2944,16 @@ static __init int hardware_setup(void)
        if (!cpu_has_vmx_ple())
                ple_gap = 0;
 
-       if (!cpu_has_vmx_apic_register_virt() ||
-                               !cpu_has_vmx_virtual_intr_delivery())
-               enable_apicv_reg_vid = 0;
+       if (!cpu_has_vmx_apicv())
+               enable_apicv = 0;
 
-       if (enable_apicv_reg_vid)
+       if (enable_apicv)
                kvm_x86_ops->update_cr8_intercept = NULL;
-       else
+       else {
                kvm_x86_ops->hwapic_irr_update = NULL;
+               kvm_x86_ops->deliver_posted_interrupt = NULL;
+               kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
+       }
 
        if (nested)
                nested_vmx_setup_ctls_msrs();
@@ -2876,22 +3034,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
        vmx->cpl = 0;
 }
 
-static gva_t rmode_tss_base(struct kvm *kvm)
-{
-       if (!kvm->arch.tss_addr) {
-               struct kvm_memslots *slots;
-               struct kvm_memory_slot *slot;
-               gfn_t base_gfn;
-
-               slots = kvm_memslots(kvm);
-               slot = id_to_memslot(slots, 0);
-               base_gfn = slot->base_gfn + slot->npages - 3;
-
-               return base_gfn << PAGE_SHIFT;
-       }
-       return kvm->arch.tss_addr;
-}
-
 static void fix_rmode_seg(int seg, struct kvm_segment *save)
 {
        const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -2942,19 +3084,15 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 
        /*
         * Very old userspace does not call KVM_SET_TSS_ADDR before entering
-        * vcpu. Call it here with phys address pointing 16M below 4G.
+        * vcpu. Warn the user that an update is overdue.
         */
-       if (!vcpu->kvm->arch.tss_addr) {
+       if (!vcpu->kvm->arch.tss_addr)
                printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
                             "called before entering vcpu\n");
-               srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-               vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
-               vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-       }
 
        vmx_segment_cache_clear(vmx);
 
-       vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
+       vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
        vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
        vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
 
@@ -3214,7 +3352,9 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                 */
                if (!nested_vmx_allowed(vcpu))
                        return 1;
-       } else if (to_vmx(vcpu)->nested.vmxon)
+       }
+       if (to_vmx(vcpu)->nested.vmxon &&
+           ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON))
                return 1;
 
        vcpu->arch.cr4 = cr4;
@@ -3550,7 +3690,7 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
                return true;
 
        /* real mode guest state checks */
-       if (!is_protmode(vcpu)) {
+       if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
                if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
                        return false;
                if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
@@ -3599,7 +3739,7 @@ static int init_rmode_tss(struct kvm *kvm)
        int r, idx, ret = 0;
 
        idx = srcu_read_lock(&kvm->srcu);
-       fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
+       fn = kvm->arch.tss_addr >> PAGE_SHIFT;
        r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
        if (r < 0)
                goto out;
@@ -3692,7 +3832,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
        kvm_userspace_mem.flags = 0;
        kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
        kvm_userspace_mem.memory_size = PAGE_SIZE;
-       r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
+       r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
        if (r)
                goto out;
 
@@ -3722,7 +3862,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
        kvm_userspace_mem.guest_phys_addr =
                kvm->arch.ept_identity_map_addr;
        kvm_userspace_mem.memory_size = PAGE_SIZE;
-       r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false);
+       r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
        if (r)
                goto out;
 
@@ -3869,13 +4009,59 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
                        msr, MSR_TYPE_W);
 }
 
+static int vmx_vm_has_apicv(struct kvm *kvm)
+{
+       return enable_apicv && irqchip_in_kernel(kvm);
+}
+
+/*
+ * Send an interrupt to a vcpu via the posted-interrupt mechanism.
+ * 1. If the target vcpu is running (non-root mode), send a posted-interrupt
+ * notification to the vcpu and the hardware will sync PIR to vIRR atomically.
+ * 2. If the target vcpu isn't running (root mode), kick it to pick up the
+ * interrupt from PIR on the next vmentry.
+ */
+static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       int r;
+
+       if (pi_test_and_set_pir(vector, &vmx->pi_desc))
+               return;
+
+       r = pi_test_and_set_on(&vmx->pi_desc);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
+#ifdef CONFIG_SMP
+       if (!r && (vcpu->mode == IN_GUEST_MODE))
+               apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
+                               POSTED_INTR_VECTOR);
+       else
+#endif
+               kvm_vcpu_kick(vcpu);
+}
+
+static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+       if (!pi_test_and_clear_on(&vmx->pi_desc))
+               return;
+
+       kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
+}
+
+static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu)
+{
+       return;
+}
+
 /*
  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
  * will not change in the lifetime of the guest.
  * Note that host-state that does change is set elsewhere. E.g., host-state
  * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
  */
-static void vmx_set_constant_host_state(void)
+static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 {
        u32 low32, high32;
        unsigned long tmpl;
@@ -3903,6 +4089,7 @@ static void vmx_set_constant_host_state(void)
 
        native_store_idt(&dt);
        vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
+       vmx->host_idt_base = dt.address;
 
        vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
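
The vmx_sync_pir_to_irr() helper added above clears the ON bit and hands the PIR array to kvm_apic_update_irr(). Conceptually each of the eight 32-bit PIR words is drained atomically into the matching vIRR word; the sketch below assumes an exchange-then-OR behaviour for kvm_apic_update_irr(), whose body is outside this excerpt:

#include <stdio.h>
#include <stdint.h>

/* Drain posted requests into the virtual IRR.  Each PIR word is swapped
 * with 0 atomically so a concurrent sender is never lost, then OR-ed in. */
static void sync_pir_to_irr(uint32_t irr[8], uint32_t pir[8])
{
        int i;

        for (i = 0; i < 8; i++) {
                uint32_t val = __atomic_exchange_n(&pir[i], 0, __ATOMIC_SEQ_CST);
                irr[i] |= val;
        }
}

int main(void)
{
        uint32_t irr[8] = { 0 }, pir[8] = { 0 };

        pir[0x31 / 32] |= 1u << (0x31 % 32);    /* vector 0x31 was posted */
        sync_pir_to_irr(irr, pir);
        printf("irr[1]=0x%08x pir[1]=0x%08x\n", irr[1], pir[1]);
        return 0;       /* prints irr[1]=0x00020000 pir[1]=0x00000000 */
}
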
 
@@ -3928,6 +4115,15 @@ static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
        vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
 }
 
+static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
+{
+       u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
+
+       if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+               pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+       return pin_based_exec_ctrl;
+}
+
 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 {
        u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
@@ -3945,11 +4141,6 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
        return exec_control;
 }
 
-static int vmx_vm_has_apicv(struct kvm *kvm)
-{
-       return enable_apicv_reg_vid && irqchip_in_kernel(kvm);
-}
-
 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 {
        u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
@@ -3971,6 +4162,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
                exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
        exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
+       /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
+          (handle_vmptrld).
+          We can NOT enable shadow_vmcs here because we do not yet have
+          a current VMCS12.
+       */
+       exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
        return exec_control;
 }
 
@@ -3999,14 +4196,17 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
        vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
 
+       if (enable_shadow_vmcs) {
+               vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
+               vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
+       }
        if (cpu_has_vmx_msr_bitmap())
                vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
 
        vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 
        /* Control */
-       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
-               vmcs_config.pin_based_exec_ctrl);
+       vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
 
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
@@ -4015,13 +4215,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
                                vmx_secondary_exec_control(vmx));
        }
 
-       if (enable_apicv_reg_vid) {
+       if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
                vmcs_write64(EOI_EXIT_BITMAP0, 0);
                vmcs_write64(EOI_EXIT_BITMAP1, 0);
                vmcs_write64(EOI_EXIT_BITMAP2, 0);
                vmcs_write64(EOI_EXIT_BITMAP3, 0);
 
                vmcs_write16(GUEST_INTR_STATUS, 0);
+
+               vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+               vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
        }
 
        if (ple_gap) {
@@ -4035,7 +4238,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
        vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
        vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
-       vmx_set_constant_host_state();
+       vmx_set_constant_host_state(vmx);
 #ifdef CONFIG_X86_64
        rdmsrl(MSR_FS_BASE, a);
        vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
@@ -4089,11 +4292,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        return 0;
 }
 
-static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u64 msr;
-       int ret;
 
        vmx->rmode.vm86_active = 0;
 
@@ -4109,12 +4311,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        vmx_segment_cache_clear(vmx);
 
        seg_setup(VCPU_SREG_CS);
-       if (kvm_vcpu_is_bsp(&vmx->vcpu))
-               vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
-       else {
-               vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
-               vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
-       }
+       vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
+       vmcs_write32(GUEST_CS_BASE, 0xffff0000);
 
        seg_setup(VCPU_SREG_DS);
        seg_setup(VCPU_SREG_ES);
@@ -4137,10 +4335,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        vmcs_writel(GUEST_SYSENTER_EIP, 0);
 
        vmcs_writel(GUEST_RFLAGS, 0x02);
-       if (kvm_vcpu_is_bsp(&vmx->vcpu))
-               kvm_rip_write(vcpu, 0xfff0);
-       else
-               kvm_rip_write(vcpu, 0);
+       kvm_rip_write(vcpu, 0xfff0);
 
        vmcs_writel(GUEST_GDTR_BASE, 0);
        vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4171,23 +4366,20 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
                vmcs_write64(APIC_ACCESS_ADDR,
                             page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
 
+       if (vmx_vm_has_apicv(vcpu->kvm))
+               memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
+
        if (vmx->vpid != 0)
                vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
        vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
-       vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
        vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
-       srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
        vmx_set_cr4(&vmx->vcpu, 0);
        vmx_set_efer(&vmx->vcpu, 0);
        vmx_fpu_activate(&vmx->vcpu);
        update_exception_bitmap(&vmx->vcpu);
 
        vpid_sync_context(vmx);
-
-       ret = 0;
-
-       return ret;
 }
 
 /*
@@ -4200,40 +4392,45 @@ static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
                PIN_BASED_EXT_INTR_MASK;
 }
 
-static void enable_irq_window(struct kvm_vcpu *vcpu)
+static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
+{
+       return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+               PIN_BASED_NMI_EXITING;
+}
+
+static int enable_irq_window(struct kvm_vcpu *vcpu)
 {
        u32 cpu_based_vm_exec_control;
-       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+
+       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
                /*
                 * We get here if vmx_interrupt_allowed() said we can't
-                * inject to L1 now because L2 must run. Ask L2 to exit
-                * right after entry, so we can inject to L1 more promptly.
+                * inject to L1 now because L2 must run. The caller will have
+                * to make L2 exit right after entry, so we can inject to L1
+                * more promptly.
                 */
-               kvm_make_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
-               return;
-       }
+               return -EBUSY;
 
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+       return 0;
 }
 
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
+static int enable_nmi_window(struct kvm_vcpu *vcpu)
 {
        u32 cpu_based_vm_exec_control;
 
-       if (!cpu_has_virtual_nmis()) {
-               enable_irq_window(vcpu);
-               return;
-       }
+       if (!cpu_has_virtual_nmis())
+               return enable_irq_window(vcpu);
+
+       if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI)
+               return enable_irq_window(vcpu);
 
-       if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
-               enable_irq_window(vcpu);
-               return;
-       }
        cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
        cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+       return 0;
 }
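
enable_irq_window()/enable_nmi_window() now return an int on both SVM and VMX instead of requesting an immediate exit themselves; the nested case above returns -EBUSY. The common x86 injection path is presumably what reacts to that value, roughly as in this stand-alone toy (the caller logic here is an assumption, not taken from this diff):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for the vendor hook: the window cannot be opened while
 * L2 must keep running, so report -EBUSY instead of opening it. */
static int enable_irq_window_model(bool l2_must_run)
{
        return l2_must_run ? -EBUSY : 0;
}

int main(void)
{
        bool req_immediate_exit = false;

        /* Assumed caller behaviour: a nonzero return means "force an
         * immediate exit after vmentry and retry the injection from L1". */
        if (enable_irq_window_model(true))
                req_immediate_exit = true;

        printf("request immediate exit: %d\n", req_immediate_exit);
        return 0;
}
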
 
 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -4294,16 +4491,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
                        INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 }
 
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
-{
-       if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
-               return 0;
-
-       return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-                 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
-                  | GUEST_INTR_STATE_NMI));
-}
-
 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
        if (!cpu_has_virtual_nmis())
@@ -4333,18 +4520,52 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
        }
 }
 
+static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
+{
+       if (is_guest_mode(vcpu)) {
+               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+               if (to_vmx(vcpu)->nested.nested_run_pending)
+                       return 0;
+               if (nested_exit_on_nmi(vcpu)) {
+                       nested_vmx_vmexit(vcpu);
+                       vmcs12->vm_exit_reason = EXIT_REASON_EXCEPTION_NMI;
+                       vmcs12->vm_exit_intr_info = NMI_VECTOR |
+                               INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK;
+                       /*
+                        * The NMI-triggered VM exit counts as injection:
+                        * clear this one and block further NMIs.
+                        */
+                       vcpu->arch.nmi_pending = 0;
+                       vmx_set_nmi_mask(vcpu, true);
+                       return 0;
+               }
+       }
+
+       if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
+               return 0;
+
+       return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+                 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
+                  | GUEST_INTR_STATE_NMI));
+}
+
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+       if (is_guest_mode(vcpu)) {
                struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-               if (to_vmx(vcpu)->nested.nested_run_pending ||
-                   (vmcs12->idt_vectoring_info_field &
-                    VECTORING_INFO_VALID_MASK))
+
+               if (to_vmx(vcpu)->nested.nested_run_pending)
                        return 0;
-               nested_vmx_vmexit(vcpu);
-               vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
-               vmcs12->vm_exit_intr_info = 0;
-               /* fall through to normal code, but now in L1, not L2 */
+               if (nested_exit_on_intr(vcpu)) {
+                       nested_vmx_vmexit(vcpu);
+                       vmcs12->vm_exit_reason =
+                               EXIT_REASON_EXTERNAL_INTERRUPT;
+                       vmcs12->vm_exit_intr_info = 0;
+                       /*
+                        * fall through to normal code, but now in L1, not L2
+                        */
+               }
        }
 
        return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -4362,7 +4583,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
                .flags = 0,
        };
 
-       ret = kvm_set_memory_region(kvm, &tss_mem, false);
+       ret = kvm_set_memory_region(kvm, &tss_mem);
        if (ret)
                return ret;
        kvm->arch.tss_addr = addr;
@@ -4603,34 +4824,50 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 /* called to set cr0 as appropriate for a mov-to-cr0 exit. */
 static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
 {
-       if (to_vmx(vcpu)->nested.vmxon &&
-           ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
-               return 1;
-
        if (is_guest_mode(vcpu)) {
+               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+               unsigned long orig_val = val;
+
                /*
                 * We get here when L2 changed cr0 in a way that did not change
                 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
-                * but did change L0 shadowed bits. This can currently happen
-                * with the TS bit: L0 may want to leave TS on (for lazy fpu
-                * loading) while pretending to allow the guest to change it.
+                * but did change L0 shadowed bits. So we first calculate the
+                * effective cr0 value that L1 would like to write into the
+                * hardware. It consists of the L2-owned bits from the new
+                * value combined with the L1-owned bits from L1's guest_cr0.
                 */
-               if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) |
-                        (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits)))
+               val = (val & ~vmcs12->cr0_guest_host_mask) |
+                       (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
+
+               /* TODO: will have to take unrestricted guest mode into
+                * account */
+               if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)
                        return 1;
-               vmcs_writel(CR0_READ_SHADOW, val);
+
+               if (kvm_set_cr0(vcpu, val))
+                       return 1;
+               vmcs_writel(CR0_READ_SHADOW, orig_val);
                return 0;
-       } else
+       } else {
+               if (to_vmx(vcpu)->nested.vmxon &&
+                   ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
+                       return 1;
                return kvm_set_cr0(vcpu, val);
+       }
 }
 
 static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
 {
        if (is_guest_mode(vcpu)) {
-               if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) |
-                        (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits)))
+               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+               unsigned long orig_val = val;
+
+               /* analogously to handle_set_cr0 */
+               val = (val & ~vmcs12->cr4_guest_host_mask) |
+                       (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
+               if (kvm_set_cr4(vcpu, val))
                        return 1;
-               vmcs_writel(CR4_READ_SHADOW, val);
+               vmcs_writel(CR4_READ_SHADOW, orig_val);
                return 0;
        } else
                return kvm_set_cr4(vcpu, val);
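
Both handlers above merge the value L2 tried to write with L1's shadowed bits: a bit set in cr0/cr4_guest_host_mask is owned by L1, so it is taken from vmcs12->guest_cr0/guest_cr4, while clear bits come from the new value. A small worked example of that merge for CR0 (the mask and register values are made up):

#include <stdio.h>
#include <stdint.h>

/* Bits set in 'mask' are owned by L1 (taken from its guest_cr0); clear
 * bits are owned by L2 (taken from the value L2 just wrote). */
static uint64_t effective_cr0(uint64_t new_val, uint64_t guest_cr0, uint64_t mask)
{
        return (new_val & ~mask) | (guest_cr0 & mask);
}

int main(void)
{
        uint64_t mask      = 0x8;               /* L1 shadows only CR0.TS */
        uint64_t guest_cr0 = 0x80000031 | 0x8;  /* L1 keeps TS set */
        uint64_t new_val   = 0x80000031;        /* L2 tries to clear TS */

        printf("effective cr0 = 0x%llx\n",
               (unsigned long long)effective_cr0(new_val, guest_cr0, mask));
        return 0;       /* 0x80000039: TS stays set because L1 owns it */
}
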
@@ -5183,7 +5420,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
                        return 1;
 
-               err = emulate_instruction(vcpu, 0);
+               err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
 
                if (err == EMULATE_DO_MMIO) {
                        ret = 0;
@@ -5259,8 +5496,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
        }
 
        /* Create a new VMCS */
-       item = (struct vmcs02_list *)
-               kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+       item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
        if (!item)
                return NULL;
        item->vmcs02.vmcs = alloc_vmcs();
@@ -5309,6 +5545,9 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
                free_loaded_vmcs(&vmx->vmcs01);
 }
 
+static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
+                                u32 vm_instruction_error);
+
 /*
  * Emulate the VMXON instruction.
  * Currently, we just remember that VMX is active, and do not save or even
@@ -5321,6 +5560,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 {
        struct kvm_segment cs;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
+       struct vmcs *shadow_vmcs;
 
        /* The Intel VMX Instruction Reference lists a bunch of bits that
         * are prerequisite to running VMXON, most notably cr4.VMXE must be
@@ -5344,6 +5584,21 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
                kvm_inject_gp(vcpu, 0);
                return 1;
        }
+       if (vmx->nested.vmxon) {
+               nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
+               skip_emulated_instruction(vcpu);
+               return 1;
+       }
+       if (enable_shadow_vmcs) {
+               shadow_vmcs = alloc_vmcs();
+               if (!shadow_vmcs)
+                       return -ENOMEM;
+               /* mark vmcs as shadow */
+               shadow_vmcs->revision_id |= (1u << 31);
+               /* init shadow vmcs */
+               vmcs_clear(shadow_vmcs);
+               vmx->nested.current_shadow_vmcs = shadow_vmcs;
+       }
 
        INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
        vmx->nested.vmcs02_num = 0;
@@ -5384,6 +5639,25 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
+{
+       u32 exec_control;
+       if (enable_shadow_vmcs) {
+               if (vmx->nested.current_vmcs12 != NULL) {
+                       /* copy to memory all shadowed fields in case
+                          they were modified */
+                       copy_shadow_to_vmcs12(vmx);
+                       vmx->nested.sync_shadow_vmcs = false;
+                       exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+                       exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+                       vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+                       vmcs_write64(VMCS_LINK_POINTER, -1ull);
+               }
+       }
+       kunmap(vmx->nested.current_vmcs12_page);
+       nested_release_page(vmx->nested.current_vmcs12_page);
+}
+
 /*
  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
  * just stops using VMX.
@@ -5394,11 +5668,12 @@ static void free_nested(struct vcpu_vmx *vmx)
                return;
        vmx->nested.vmxon = false;
        if (vmx->nested.current_vmptr != -1ull) {
-               kunmap(vmx->nested.current_vmcs12_page);
-               nested_release_page(vmx->nested.current_vmcs12_page);
+               nested_release_vmcs12(vmx);
                vmx->nested.current_vmptr = -1ull;
                vmx->nested.current_vmcs12 = NULL;
        }
+       if (enable_shadow_vmcs)
+               free_vmcs(vmx->nested.current_shadow_vmcs);
        /* Unpin physical memory we referred to in current vmcs02 */
        if (vmx->nested.apic_access_page) {
                nested_release_page(vmx->nested.apic_access_page);
@@ -5507,6 +5782,10 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
                            X86_EFLAGS_SF | X86_EFLAGS_OF))
                        | X86_EFLAGS_ZF);
        get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+       /*
+        * We don't need to force a shadow sync because
+        * VM_INSTRUCTION_ERROR is not shadowed
+        */
 }
 
 /* Emulate the VMCLEAR instruction */
@@ -5539,8 +5818,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
        }
 
        if (vmptr == vmx->nested.current_vmptr) {
-               kunmap(vmx->nested.current_vmcs12_page);
-               nested_release_page(vmx->nested.current_vmcs12_page);
+               nested_release_vmcs12(vmx);
                vmx->nested.current_vmptr = -1ull;
                vmx->nested.current_vmcs12 = NULL;
        }
@@ -5639,6 +5917,111 @@ static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
        }
 }
 
+
+static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu,
+                                   unsigned long field, u64 field_value){
+       short offset = vmcs_field_to_offset(field);
+       char *p = ((char *) get_vmcs12(vcpu)) + offset;
+       if (offset < 0)
+               return false;
+
+       switch (vmcs_field_type(field)) {
+       case VMCS_FIELD_TYPE_U16:
+               *(u16 *)p = field_value;
+               return true;
+       case VMCS_FIELD_TYPE_U32:
+               *(u32 *)p = field_value;
+               return true;
+       case VMCS_FIELD_TYPE_U64:
+               *(u64 *)p = field_value;
+               return true;
+       case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+               *(natural_width *)p = field_value;
+               return true;
+       default:
+               return false; /* can never happen. */
+       }
+
+}
+
+static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
+{
+       int i;
+       unsigned long field;
+       u64 field_value;
+       struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
+       unsigned long *fields = (unsigned long *)shadow_read_write_fields;
+       int num_fields = max_shadow_read_write_fields;
+
+       vmcs_load(shadow_vmcs);
+
+       for (i = 0; i < num_fields; i++) {
+               field = fields[i];
+               switch (vmcs_field_type(field)) {
+               case VMCS_FIELD_TYPE_U16:
+                       field_value = vmcs_read16(field);
+                       break;
+               case VMCS_FIELD_TYPE_U32:
+                       field_value = vmcs_read32(field);
+                       break;
+               case VMCS_FIELD_TYPE_U64:
+                       field_value = vmcs_read64(field);
+                       break;
+               case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+                       field_value = vmcs_readl(field);
+                       break;
+               }
+               vmcs12_write_any(&vmx->vcpu, field, field_value);
+       }
+
+       vmcs_clear(shadow_vmcs);
+       vmcs_load(vmx->loaded_vmcs->vmcs);
+}
+
+static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
+{
+       unsigned long *fields[] = {
+               (unsigned long *)shadow_read_write_fields,
+               (unsigned long *)shadow_read_only_fields
+       };
+       int num_lists =  ARRAY_SIZE(fields);
+       int max_fields[] = {
+               max_shadow_read_write_fields,
+               max_shadow_read_only_fields
+       };
+       int i, q;
+       unsigned long field;
+       u64 field_value = 0;
+       struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
+
+       vmcs_load(shadow_vmcs);
+
+       for (q = 0; q < num_lists; q++) {
+               for (i = 0; i < max_fields[q]; i++) {
+                       field = fields[q][i];
+                       vmcs12_read_any(&vmx->vcpu, field, &field_value);
+
+                       switch (vmcs_field_type(field)) {
+                       case VMCS_FIELD_TYPE_U16:
+                               vmcs_write16(field, (u16)field_value);
+                               break;
+                       case VMCS_FIELD_TYPE_U32:
+                               vmcs_write32(field, (u32)field_value);
+                               break;
+                       case VMCS_FIELD_TYPE_U64:
+                               vmcs_write64(field, (u64)field_value);
+                               break;
+                       case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+                               vmcs_writel(field, (long)field_value);
+                               break;
+                       }
+               }
+       }
+
+       vmcs_clear(shadow_vmcs);
+       vmcs_load(vmx->loaded_vmcs->vmcs);
+}
+
 /*
  * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
  * used before) all generate the same failure when it is missing.
@@ -5703,8 +6086,6 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
        gva_t gva;
        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
        u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-       char *p;
-       short offset;
        /* The value to write might be 32 or 64 bits, depending on L1's long
         * mode, and eventually we need to write that into a field of several
         * possible lengths. The code below first zero-extends the value to 64
@@ -5741,28 +6122,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                return 1;
        }
 
-       offset = vmcs_field_to_offset(field);
-       if (offset < 0) {
-               nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
-               skip_emulated_instruction(vcpu);
-               return 1;
-       }
-       p = ((char *) get_vmcs12(vcpu)) + offset;
-
-       switch (vmcs_field_type(field)) {
-       case VMCS_FIELD_TYPE_U16:
-               *(u16 *)p = field_value;
-               break;
-       case VMCS_FIELD_TYPE_U32:
-               *(u32 *)p = field_value;
-               break;
-       case VMCS_FIELD_TYPE_U64:
-               *(u64 *)p = field_value;
-               break;
-       case VMCS_FIELD_TYPE_NATURAL_WIDTH:
-               *(natural_width *)p = field_value;
-               break;
-       default:
+       if (!vmcs12_write_any(vcpu, field, field_value)) {
                nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
                skip_emulated_instruction(vcpu);
                return 1;
@@ -5780,6 +6140,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
        gva_t gva;
        gpa_t vmptr;
        struct x86_exception e;
+       u32 exec_control;
 
        if (!nested_vmx_check_permission(vcpu))
                return 1;
@@ -5818,14 +6179,20 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
                        skip_emulated_instruction(vcpu);
                        return 1;
                }
-               if (vmx->nested.current_vmptr != -1ull) {
-                       kunmap(vmx->nested.current_vmcs12_page);
-                       nested_release_page(vmx->nested.current_vmcs12_page);
-               }
+               if (vmx->nested.current_vmptr != -1ull)
+                       nested_release_vmcs12(vmx);
 
                vmx->nested.current_vmptr = vmptr;
                vmx->nested.current_vmcs12 = new_vmcs12;
                vmx->nested.current_vmcs12_page = page;
+               if (enable_shadow_vmcs) {
+                       exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+                       exec_control |= SECONDARY_EXEC_SHADOW_VMCS;
+                       vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+                       vmcs_write64(VMCS_LINK_POINTER,
+                                    __pa(vmx->nested.current_shadow_vmcs));
+                       vmx->nested.sync_shadow_vmcs = true;
+               }
        }
 
        nested_vmx_succeed(vcpu);
@@ -5908,6 +6275,52 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 static const int kvm_vmx_max_exit_handlers =
        ARRAY_SIZE(kvm_vmx_exit_handlers);
 
+static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
+                                      struct vmcs12 *vmcs12)
+{
+       unsigned long exit_qualification;
+       gpa_t bitmap, last_bitmap;
+       unsigned int port;
+       int size;
+       u8 b;
+
+       if (nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING))
+               return 1;
+
+       if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+               return 0;
+
+       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+       port = exit_qualification >> 16;
+       size = (exit_qualification & 7) + 1;
+
+       last_bitmap = (gpa_t)-1;
+       b = -1;
+
+       while (size > 0) {
+               if (port < 0x8000)
+                       bitmap = vmcs12->io_bitmap_a;
+               else if (port < 0x10000)
+                       bitmap = vmcs12->io_bitmap_b;
+               else
+                       return 1;
+               bitmap += (port & 0x7fff) / 8;
+
+               if (last_bitmap != bitmap)
+                       if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
+                               return 1;
+               if (b & (1 << (port & 7)))
+                       return 1;
+
+               port++;
+               size--;
+               last_bitmap = bitmap;
+       }
+
+       return 0;
+}
+
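As a rough standalone sketch of the lookup above (helper name and arrays are illustrative, not the kernel's), the per-port test reduces to the fixed VMX I/O-bitmap layout: bitmap A covers ports 0x0000-0x7fff, bitmap B covers 0x8000-0xffff, one bit per port, and a set bit means the access must exit to L1.

#include <stdbool.h>
#include <stdint.h>

/* The two arrays stand in for the guest pages fetched with
 * kvm_read_guest() in the real code. */
static bool io_port_causes_exit(const uint8_t *io_bitmap_a,
                                const uint8_t *io_bitmap_b,
                                unsigned int port)
{
        const uint8_t *bitmap = (port < 0x8000) ? io_bitmap_a : io_bitmap_b;

        return bitmap[(port & 0x7fff) / 8] & (1u << (port & 7));
}

Multi-byte accesses are covered in the loop above by testing each port of the range in turn, with an unconditional exit for ports beyond 0xffff or when the bitmap page cannot be read.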
 /*
  * Return 1 if we should exit from L2 to L1 to handle an MSR access,
  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
@@ -5939,7 +6352,8 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
        /* Then read the msr_index'th bit from this bitmap: */
        if (msr_index < 1024*8) {
                unsigned char b;
-               kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
+               if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
+                       return 1;
                return 1 & (b >> (msr_index & 7));
        } else
                return 1; /* let L1 handle the wrong parameter */
@@ -6033,10 +6447,10 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
  */
 static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 {
-       u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
        u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+       u32 exit_reason = vmx->exit_reason;
 
        if (vmx->nested.nested_run_pending)
                return 0;
@@ -6060,14 +6474,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
        case EXIT_REASON_TRIPLE_FAULT:
                return 1;
        case EXIT_REASON_PENDING_INTERRUPT:
+               return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
        case EXIT_REASON_NMI_WINDOW:
-               /*
-                * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
-                * (aka Interrupt Window Exiting) only when L1 turned it on,
-                * so if we got a PENDING_INTERRUPT exit, this must be for L1.
-                * Same for NMI Window Exiting.
-                */
-               return 1;
+               return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
        case EXIT_REASON_TASK_SWITCH:
                return 1;
        case EXIT_REASON_CPUID:
@@ -6097,8 +6506,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
        case EXIT_REASON_DR_ACCESS:
                return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
        case EXIT_REASON_IO_INSTRUCTION:
-               /* TODO: support IO bitmaps */
-               return 1;
+               return nested_vmx_exit_handled_io(vcpu, vmcs12);
        case EXIT_REASON_MSR_READ:
        case EXIT_REASON_MSR_WRITE:
                return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
@@ -6122,6 +6530,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
        case EXIT_REASON_EPT_VIOLATION:
        case EXIT_REASON_EPT_MISCONFIG:
                return 0;
+       case EXIT_REASON_PREEMPTION_TIMER:
+               return vmcs12->pin_based_vm_exec_control &
+                       PIN_BASED_VMX_PREEMPTION_TIMER;
        case EXIT_REASON_WBINVD:
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
        case EXIT_REASON_XSETBV:
@@ -6316,6 +6727,9 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
 
 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 {
+       if (!vmx_vm_has_apicv(vcpu->kvm))
+               return;
+
        vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
        vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
        vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
@@ -6346,6 +6760,52 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
        }
 }
 
+static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
+{
+       u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+       /*
+        * If an external interrupt exists, the IF bit is set in RFLAGS on the
+        * interrupt stack frame, so interrupts will be re-enabled on return
+        * from the interrupt handler.
+        */
+       if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
+                       == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
+               unsigned int vector;
+               unsigned long entry;
+               gate_desc *desc;
+               struct vcpu_vmx *vmx = to_vmx(vcpu);
+#ifdef CONFIG_X86_64
+               unsigned long tmp;
+#endif
+
+               vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
+               desc = (gate_desc *)vmx->host_idt_base + vector;
+               entry = gate_offset(*desc);
+               asm volatile(
+#ifdef CONFIG_X86_64
+                       "mov %%" _ASM_SP ", %[sp]\n\t"
+                       "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
+                       "push $%c[ss]\n\t"
+                       "push %[sp]\n\t"
+#endif
+                       "pushf\n\t"
+                       "orl $0x200, (%%" _ASM_SP ")\n\t"
+                       __ASM_SIZE(push) " $%c[cs]\n\t"
+                       "call *%[entry]\n\t"
+                       :
+#ifdef CONFIG_X86_64
+                       [sp]"=&r"(tmp)
+#endif
+                       :
+                       [entry]"r"(entry),
+                       [ss]"i"(__KERNEL_DS),
+                       [cs]"i"(__KERNEL_CS)
+                       );
+       } else
+               local_irq_enable();
+}
+
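The inline assembly above hand-builds, on a 16-byte-aligned stack, the frame an external interrupt would have pushed and then calls the host IDT entry directly. A hypothetical view of that frame on x86-64 (struct and field names are mine, for illustration only):

#include <stdint.h>

/* Layout as seen by the called handler, lowest address first, matching the
 * hardware interrupt frame that iretq expects. */
struct synthesized_intr_frame {
        uint64_t rip;           /* pushed implicitly by "call *entry"     */
        uint64_t cs;            /* __KERNEL_CS                            */
        uint64_t rflags;        /* caller's RFLAGS with IF (0x200) forced */
        uint64_t rsp;           /* stack pointer saved before alignment   */
        uint64_t ss;            /* __KERNEL_DS                            */
};

Because IF is set in the saved RFLAGS, interrupts are logically re-enabled when the host handler returns, which is why the else branch only needs local_irq_enable().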
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
        u32 exit_intr_info;
@@ -6388,7 +6848,7 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
                        ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
 }
 
-static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
+static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
                                      u32 idt_vectoring_info,
                                      int instr_len_field,
                                      int error_code_field)
@@ -6399,46 +6859,43 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
 
        idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
-       vmx->vcpu.arch.nmi_injected = false;
-       kvm_clear_exception_queue(&vmx->vcpu);
-       kvm_clear_interrupt_queue(&vmx->vcpu);
+       vcpu->arch.nmi_injected = false;
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
 
        if (!idtv_info_valid)
                return;
 
-       kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
+       kvm_make_request(KVM_REQ_EVENT, vcpu);
 
        vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
        type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
 
        switch (type) {
        case INTR_TYPE_NMI_INTR:
-               vmx->vcpu.arch.nmi_injected = true;
+               vcpu->arch.nmi_injected = true;
                /*
                 * SDM 3: 27.7.1.2 (September 2008)
                 * Clear bit "block by NMI" before VM entry if an NMI
                 * delivery faulted.
                 */
-               vmx_set_nmi_mask(&vmx->vcpu, false);
+               vmx_set_nmi_mask(vcpu, false);
                break;
        case INTR_TYPE_SOFT_EXCEPTION:
-               vmx->vcpu.arch.event_exit_inst_len =
-                       vmcs_read32(instr_len_field);
+               vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
                /* fall through */
        case INTR_TYPE_HARD_EXCEPTION:
                if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
                        u32 err = vmcs_read32(error_code_field);
-                       kvm_queue_exception_e(&vmx->vcpu, vector, err);
+                       kvm_queue_exception_e(vcpu, vector, err);
                } else
-                       kvm_queue_exception(&vmx->vcpu, vector);
+                       kvm_queue_exception(vcpu, vector);
                break;
        case INTR_TYPE_SOFT_INTR:
-               vmx->vcpu.arch.event_exit_inst_len =
-                       vmcs_read32(instr_len_field);
+               vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
                /* fall through */
        case INTR_TYPE_EXT_INTR:
-               kvm_queue_interrupt(&vmx->vcpu, vector,
-                       type == INTR_TYPE_SOFT_INTR);
+               kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
                break;
        default:
                break;
@@ -6447,18 +6904,14 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
 
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
-       if (is_guest_mode(&vmx->vcpu))
-               return;
-       __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
+       __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
                                  VM_EXIT_INSTRUCTION_LEN,
                                  IDT_VECTORING_ERROR_CODE);
 }
 
 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 {
-       if (is_guest_mode(vcpu))
-               return;
-       __vmx_complete_interrupts(to_vmx(vcpu),
+       __vmx_complete_interrupts(vcpu,
                                  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
                                  VM_ENTRY_INSTRUCTION_LEN,
                                  VM_ENTRY_EXCEPTION_ERROR_CODE);
@@ -6489,21 +6942,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long debugctlmsr;
 
-       if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
-               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-               if (vmcs12->idt_vectoring_info_field &
-                               VECTORING_INFO_VALID_MASK) {
-                       vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-                               vmcs12->idt_vectoring_info_field);
-                       vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
-                               vmcs12->vm_exit_instruction_len);
-                       if (vmcs12->idt_vectoring_info_field &
-                                       VECTORING_INFO_DELIVER_CODE_MASK)
-                               vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
-                                       vmcs12->idt_vectoring_error_code);
-               }
-       }
-
        /* Record the guest's net vcpu time for enforced NMI injections. */
        if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
                vmx->entry_time = ktime_get();
@@ -6513,6 +6951,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        if (vmx->emulation_required)
                return;
 
+       if (vmx->nested.sync_shadow_vmcs) {
+               copy_vmcs12_to_shadow(vmx);
+               vmx->nested.sync_shadow_vmcs = false;
+       }
+
        if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
                vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
        if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
@@ -6662,17 +7105,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
        vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
-       if (is_guest_mode(vcpu)) {
-               struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-               vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
-               if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
-                       vmcs12->idt_vectoring_error_code =
-                               vmcs_read32(IDT_VECTORING_ERROR_CODE);
-                       vmcs12->vm_exit_instruction_len =
-                               vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-               }
-       }
-
        vmx->loaded_vmcs->launched = 1;
 
        vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
@@ -6734,10 +7166,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        put_cpu();
        if (err)
                goto free_vmcs;
-       if (vm_need_virtualize_apic_accesses(kvm))
+       if (vm_need_virtualize_apic_accesses(kvm)) {
                err = alloc_apic_access_page(kvm);
                if (err)
                        goto free_vmcs;
+       }
 
        if (enable_ept) {
                if (!kvm->arch.ept_identity_map_addr)
@@ -6931,9 +7364,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                vmcs12->vm_entry_instruction_len);
        vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
                vmcs12->guest_interruptibility_info);
-       vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
-       vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
+       kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
        vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
        vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
                vmcs12->guest_pending_dbg_exceptions);
@@ -6946,6 +7378,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                (vmcs_config.pin_based_exec_ctrl |
                 vmcs12->pin_based_vm_exec_control));
 
+       if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
+               vmcs_write32(VMX_PREEMPTION_TIMER_VALUE,
+                            vmcs12->vmx_preemption_timer_value);
+
        /*
         * Whether page-faults are trapped is determined by a combination of
         * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
@@ -7016,7 +7452,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
         * Other fields are different per CPU, and will be set later when
         * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
         */
-       vmx_set_constant_host_state();
+       vmx_set_constant_host_state(vmx);
 
        /*
         * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
@@ -7082,7 +7518,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
                vcpu->arch.efer = vmcs12->guest_ia32_efer;
-       if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+       else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
                vcpu->arch.efer |= (EFER_LMA | EFER_LME);
        else
                vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
@@ -7121,6 +7557,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        int cpu;
        struct loaded_vmcs *vmcs02;
+       bool ia32e;
 
        if (!nested_vmx_check_permission(vcpu) ||
            !nested_vmx_check_vmcs12(vcpu))
@@ -7129,6 +7566,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        skip_emulated_instruction(vcpu);
        vmcs12 = get_vmcs12(vcpu);
 
+       if (enable_shadow_vmcs)
+               copy_shadow_to_vmcs12(vmx);
+
        /*
         * The nested entry process starts with enforcing various prerequisites
         * on vmcs12 as required by the Intel SDM, and act appropriately when
@@ -7146,6 +7586,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
                return 1;
        }
 
+       if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE) {
+               nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+               return 1;
+       }
+
        if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
                        !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
                /*TODO: Also verify bits beyond physical address width are 0*/
@@ -7203,6 +7648,45 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
                return 1;
        }
 
+       /*
+        * If the load IA32_EFER VM-entry control is 1, the following checks
+        * are performed on the field for the IA32_EFER MSR:
+        * - Bits reserved in the IA32_EFER MSR must be 0.
+        * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
+        *   the IA-32e mode guest VM-entry control. It must also be identical
+        *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
+        *   CR0.PG) is 1.
+        */
+       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) {
+               ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
+               if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
+                   ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
+                   ((vmcs12->guest_cr0 & X86_CR0_PG) &&
+                    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
+                       nested_vmx_entry_failure(vcpu, vmcs12,
+                               EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+                       return 1;
+               }
+       }
+
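The guest-EFER rule checked above can be restated as a small predicate; this is a sketch under the usual architectural bit positions, not the kernel's helpers, and it leaves out the reserved-bit validation done by kvm_valid_efer():

#include <stdbool.h>
#include <stdint.h>

#define EFER_LME (1ULL << 8)    /* long mode enable */
#define EFER_LMA (1ULL << 10)   /* long mode active */
#define CR0_PG   (1ULL << 31)   /* paging enabled   */

/* "ia32e" is the IA-32e-mode-guest VM-entry control bit from vmcs12. */
static bool guest_efer_consistent(uint64_t efer, uint64_t cr0, bool ia32e)
{
        if (ia32e != !!(efer & EFER_LMA))
                return false;
        if ((cr0 & CR0_PG) && ia32e != !!(efer & EFER_LME))
                return false;
        return true;
}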
+       /*
+        * If the load IA32_EFER VM-exit control is 1, bits reserved in the
+        * IA32_EFER MSR must be 0 in the field for that register. In addition,
+        * the values of the LMA and LME bits in the field must each be that of
+        * the host address-space size VM-exit control.
+        */
+       if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
+               ia32e = (vmcs12->vm_exit_controls &
+                        VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
+               if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
+                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
+                   ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
+                       nested_vmx_entry_failure(vcpu, vmcs12,
+                               EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+                       return 1;
+               }
+       }
+
        /*
         * We're finally done with prerequisite checking, and can start with
         * the nested entry.
@@ -7223,6 +7707,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        vcpu->cpu = cpu;
        put_cpu();
 
+       vmx_segment_cache_clear(vmx);
+
        vmcs12->launch_state = 1;
 
        prepare_vmcs02(vcpu, vmcs12);
@@ -7273,6 +7759,48 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                        vcpu->arch.cr4_guest_owned_bits));
 }
 
+static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
+                                      struct vmcs12 *vmcs12)
+{
+       u32 idt_vectoring;
+       unsigned int nr;
+
+       if (vcpu->arch.exception.pending) {
+               nr = vcpu->arch.exception.nr;
+               idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
+
+               if (kvm_exception_is_soft(nr)) {
+                       vmcs12->vm_exit_instruction_len =
+                               vcpu->arch.event_exit_inst_len;
+                       idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
+               } else
+                       idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
+
+               if (vcpu->arch.exception.has_error_code) {
+                       idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
+                       vmcs12->idt_vectoring_error_code =
+                               vcpu->arch.exception.error_code;
+               }
+
+               vmcs12->idt_vectoring_info_field = idt_vectoring;
+       } else if (vcpu->arch.nmi_pending) {
+               vmcs12->idt_vectoring_info_field =
+                       INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
+       } else if (vcpu->arch.interrupt.pending) {
+               nr = vcpu->arch.interrupt.nr;
+               idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
+
+               if (vcpu->arch.interrupt.soft) {
+                       idt_vectoring |= INTR_TYPE_SOFT_INTR;
+                       vmcs12->vm_entry_instruction_len =
+                               vcpu->arch.event_exit_inst_len;
+               } else
+                       idt_vectoring |= INTR_TYPE_EXT_INTR;
+
+               vmcs12->idt_vectoring_info_field = idt_vectoring;
+       }
+}
+
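What the function above rebuilds is the standard IDT-vectoring-information encoding; a rough standalone encoder (field layout per the SDM VM-exit information fields, helper name illustrative):

#include <stdint.h>

/* Bits 7:0 vector, bits 10:8 event type (0 = external interrupt, 2 = NMI,
 * 3 = hardware exception, 4 = software interrupt, 6 = software exception),
 * bit 11 = error code delivered, bit 31 = valid. */
static uint32_t encode_idt_vectoring_info(uint8_t vector, uint8_t type,
                                          int has_error_code)
{
        uint32_t info = vector | ((uint32_t)type << 8) | (1u << 31);

        if (has_error_code)
                info |= 1u << 11;
        return info;
}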
 /*
  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
@@ -7284,7 +7812,7 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
  * exit-information fields only. Other fields are modified by L1 with VMWRITE,
  * which already writes to vmcs12 directly.
  */
-void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 {
        /* update guest state fields: */
        vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
@@ -7332,16 +7860,19 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
        vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
 
-       vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
        vmcs12->guest_interruptibility_info =
                vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
        vmcs12->guest_pending_dbg_exceptions =
                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+       vmcs12->vm_entry_controls =
+               (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
+               (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
+
        /* TODO: These cannot have changed unless we have MSR bitmaps and
         * the relevant bit asks not to trap the change */
        vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
-       if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT)
+       if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
                vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
        vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
        vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
@@ -7349,21 +7880,38 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        /* update exit information fields: */
 
-       vmcs12->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+       vmcs12->vm_exit_reason  = to_vmx(vcpu)->exit_reason;
        vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
        vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-       vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
-       vmcs12->idt_vectoring_info_field =
-               vmcs_read32(IDT_VECTORING_INFO_FIELD);
-       vmcs12->idt_vectoring_error_code =
-               vmcs_read32(IDT_VECTORING_ERROR_CODE);
+       if ((vmcs12->vm_exit_intr_info &
+            (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+           (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
+               vmcs12->vm_exit_intr_error_code =
+                       vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+       vmcs12->idt_vectoring_info_field = 0;
        vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
        vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 
-       /* clear vm-entry fields which are to be cleared on exit */
-       if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+       if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
+               /* vm_entry_intr_info_field is cleared on exit. Emulate this
+                * instead of reading the real value. */
                vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
+
+               /*
+        * Transfer the event that L0 or L1 may have wanted to inject into
+                * L2 to IDT_VECTORING_INFO_FIELD.
+                */
+               vmcs12_save_pending_event(vcpu, vmcs12);
+       }
+
+       /*
+        * Drop what we picked up for L2 via vmx_complete_interrupts. It is
+        * preserved above and would only end up incorrectly in L1.
+        */
+       vcpu->arch.nmi_injected = false;
+       kvm_clear_exception_queue(vcpu);
+       kvm_clear_interrupt_queue(vcpu);
 }
 
 /*
@@ -7375,11 +7923,12 @@ void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
  * Failures During or After Loading Guest State").
  * This function should be called when the active VMCS is L1's (vmcs01).
  */
-void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
+                                  struct vmcs12 *vmcs12)
 {
        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
                vcpu->arch.efer = vmcs12->host_ia32_efer;
-       if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+       else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
                vcpu->arch.efer |= (EFER_LMA | EFER_LME);
        else
                vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
@@ -7387,6 +7936,7 @@ void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
        kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
+       vmx_set_rflags(vcpu, X86_EFLAGS_BIT1);
        /*
         * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
         * actually changed, because it depends on the current state of
@@ -7445,6 +7995,9 @@ void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
                vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
                        vmcs12->host_ia32_perf_global_ctrl);
+
+       kvm_set_dr(vcpu, 7, 0x400);
+       vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 }
 
 /*
@@ -7458,6 +8011,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
        int cpu;
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 
+       /* trying to cancel vmlaunch/vmresume is a bug */
+       WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
        leave_guest_mode(vcpu);
        prepare_vmcs12(vcpu, vmcs12);
 
@@ -7468,6 +8024,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
        vcpu->cpu = cpu;
        put_cpu();
 
+       vmx_segment_cache_clear(vmx);
+
        /* if no vmcs02 cache requested, remove the one we used */
        if (VMCS02_POOL_SIZE == 0)
                nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
@@ -7496,6 +8054,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
                nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
        } else
                nested_vmx_succeed(vcpu);
+       if (enable_shadow_vmcs)
+               vmx->nested.sync_shadow_vmcs = true;
 }
 
 /*
@@ -7513,6 +8073,8 @@ static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
        vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
        vmcs12->exit_qualification = qualification;
        nested_vmx_succeed(vcpu);
+       if (enable_shadow_vmcs)
+               to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
 }
 
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -7590,6 +8152,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .load_eoi_exitmap = vmx_load_eoi_exitmap,
        .hwapic_irr_update = vmx_hwapic_irr_update,
        .hwapic_isr_update = vmx_hwapic_isr_update,
+       .sync_pir_to_irr = vmx_sync_pir_to_irr,
+       .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
 
        .set_tss_addr = vmx_set_tss_addr,
        .get_tdp_level = get_ept_level,
@@ -7618,6 +8182,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .set_tdp_cr3 = vmx_set_cr3,
 
        .check_intercept = vmx_check_intercept,
+       .handle_external_intr = vmx_handle_external_intr,
 };
 
 static int __init vmx_init(void)
@@ -7656,6 +8221,24 @@ static int __init vmx_init(void)
                                (unsigned long *)__get_free_page(GFP_KERNEL);
        if (!vmx_msr_bitmap_longmode_x2apic)
                goto out4;
+       vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_vmread_bitmap)
+               goto out5;
+
+       vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_vmwrite_bitmap)
+               goto out6;
+
+       memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
+       memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
+       /* shadowed read/write fields */
+       for (i = 0; i < max_shadow_read_write_fields; i++) {
+               clear_bit(shadow_read_write_fields[i], vmx_vmwrite_bitmap);
+               clear_bit(shadow_read_write_fields[i], vmx_vmread_bitmap);
+       }
+       /* shadowed read only fields */
+       for (i = 0; i < max_shadow_read_only_fields; i++)
+               clear_bit(shadow_read_only_fields[i], vmx_vmread_bitmap);
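The convention above is "1 = intercept": both bitmaps start all-ones and only the encodings of shadow-able fields are cleared, so VMREAD/VMWRITE of those fields is satisfied from the shadow VMCS without a VM exit. A minimal sketch of the same idea in plain C (array and parameter names illustrative):

#include <stdint.h>
#include <string.h>

static void build_vmcs_access_bitmap(uint8_t bitmap[4096],
                                     const unsigned long *fields, int n)
{
        memset(bitmap, 0xff, 4096);     /* default: every access exits    */
        for (int i = 0; i < n; i++)     /* pass through shadowed fields   */
                bitmap[fields[i] / 8] &= (uint8_t)~(1u << (fields[i] % 8));
}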
 
        /*
         * Allow direct access to the PC debug port (it is often used for I/O
@@ -7674,7 +8257,7 @@ static int __init vmx_init(void)
        r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
                     __alignof__(struct vcpu_vmx), THIS_MODULE);
        if (r)
-               goto out3;
+               goto out7;
 
 #ifdef CONFIG_KEXEC
        rcu_assign_pointer(crash_vmclear_loaded_vmcss,
@@ -7692,7 +8275,7 @@ static int __init vmx_init(void)
        memcpy(vmx_msr_bitmap_longmode_x2apic,
                        vmx_msr_bitmap_longmode, PAGE_SIZE);
 
-       if (enable_apicv_reg_vid) {
+       if (enable_apicv) {
                for (msr = 0x800; msr <= 0x8ff; msr++)
                        vmx_disable_intercept_msr_read_x2apic(msr);
 
@@ -7722,6 +8305,12 @@ static int __init vmx_init(void)
 
        return 0;
 
+out7:
+       free_page((unsigned long)vmx_vmwrite_bitmap);
+out6:
+       free_page((unsigned long)vmx_vmread_bitmap);
+out5:
+       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
 out4:
        free_page((unsigned long)vmx_msr_bitmap_longmode);
 out3:
@@ -7743,6 +8332,8 @@ static void __exit vmx_exit(void)
        free_page((unsigned long)vmx_msr_bitmap_longmode);
        free_page((unsigned long)vmx_io_bitmap_b);
        free_page((unsigned long)vmx_io_bitmap_a);
+       free_page((unsigned long)vmx_vmwrite_bitmap);
+       free_page((unsigned long)vmx_vmread_bitmap);
 
 #ifdef CONFIG_KEXEC
        rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
index e1721324c271e18430f43ee226a367c4495d18ab..05a8b1a2300df0997d116e753506bf6e6ffa1fe2 100644 (file)
@@ -162,8 +162,6 @@ u64 __read_mostly host_xcr0;
 
 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 
-static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
-
 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
        int i;
@@ -263,6 +261,13 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
 }
 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
 
+asmlinkage void kvm_spurious_fault(void)
+{
+       /* Fault while not rebooting.  We want the trace. */
+       BUG();
+}
+EXPORT_SYMBOL_GPL(kvm_spurious_fault);
+
 #define EXCPT_BENIGN           0
 #define EXCPT_CONTRIBUTORY     1
 #define EXCPT_PF               2
@@ -840,23 +845,17 @@ static const u32 emulated_msrs[] = {
        MSR_IA32_MCG_CTL,
 };
 
-static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
-       u64 old_efer = vcpu->arch.efer;
-
        if (efer & efer_reserved_bits)
-               return 1;
-
-       if (is_paging(vcpu)
-           && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
-               return 1;
+               return false;
 
        if (efer & EFER_FFXSR) {
                struct kvm_cpuid_entry2 *feat;
 
                feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
                if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
-                       return 1;
+                       return false;
        }
 
        if (efer & EFER_SVME) {
@@ -864,9 +863,24 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
                feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
                if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
-                       return 1;
+                       return false;
        }
 
+       return true;
+}
+EXPORT_SYMBOL_GPL(kvm_valid_efer);
+
+static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
+{
+       u64 old_efer = vcpu->arch.efer;
+
+       if (!kvm_valid_efer(vcpu, efer))
+               return 1;
+
+       if (is_paging(vcpu)
+           && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+               return 1;
+
        efer &= ~EFER_LMA;
        efer |= vcpu->arch.efer & EFER_LMA;
 
@@ -1079,6 +1093,10 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
        u32 thresh_lo, thresh_hi;
        int use_scaling = 0;
 
+       /* tsc_khz can be zero if TSC calibration fails */
+       if (this_tsc_khz == 0)
+               return;
+
        /* Compute a scale to convert nanoseconds in TSC cycles */
        kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
                           &vcpu->arch.virtual_tsc_shift,
@@ -1156,20 +1174,23 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
        ns = get_kernel_ns();
        elapsed = ns - kvm->arch.last_tsc_nsec;
 
-       /* n.b - signed multiplication and division required */
-       usdiff = data - kvm->arch.last_tsc_write;
+       if (vcpu->arch.virtual_tsc_khz) {
+               /* n.b - signed multiplication and division required */
+               usdiff = data - kvm->arch.last_tsc_write;
 #ifdef CONFIG_X86_64
-       usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
+               usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
 #else
-       /* do_div() only does unsigned */
-       asm("idivl %2; xor %%edx, %%edx"
-           : "=A"(usdiff)
-           : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
+               /* do_div() only does unsigned */
+               asm("idivl %2; xor %%edx, %%edx"
+               : "=A"(usdiff)
+               : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz));
 #endif
-       do_div(elapsed, 1000);
-       usdiff -= elapsed;
-       if (usdiff < 0)
-               usdiff = -usdiff;
+               do_div(elapsed, 1000);
+               usdiff -= elapsed;
+               if (usdiff < 0)
+                       usdiff = -usdiff;
+       } else
+               usdiff = USEC_PER_SEC; /* disable TSC match window below */
 
        /*
         * Special case: TSC write with a small delta (1 second) of virtual
@@ -2034,7 +2055,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_P6_EVNTSEL0:
        case MSR_P6_EVNTSEL1:
                if (kvm_pmu_msr(vcpu, msr))
-                       return kvm_pmu_set_msr(vcpu, msr, data);
+                       return kvm_pmu_set_msr(vcpu, msr_info);
 
                if (pr || data != 0)
                        vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
@@ -2080,7 +2101,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
                        return xen_hvm_config(vcpu, data);
                if (kvm_pmu_msr(vcpu, msr))
-                       return kvm_pmu_set_msr(vcpu, msr, data);
+                       return kvm_pmu_set_msr(vcpu, msr_info);
                if (!ignore_msrs) {
                        vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
                                    msr, data);
@@ -2479,7 +2500,6 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_USER_NMI:
        case KVM_CAP_REINJECT_CONTROL:
        case KVM_CAP_IRQ_INJECT_STATUS:
-       case KVM_CAP_ASSIGN_DEV_IRQ:
        case KVM_CAP_IRQFD:
        case KVM_CAP_IOEVENTFD:
        case KVM_CAP_PIT2:
@@ -2497,10 +2517,12 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_XSAVE:
        case KVM_CAP_ASYNC_PF:
        case KVM_CAP_GET_TSC_KHZ:
-       case KVM_CAP_PCI_2_3:
        case KVM_CAP_KVMCLOCK_CTRL:
        case KVM_CAP_READONLY_MEM:
-       case KVM_CAP_IRQFD_RESAMPLE:
+#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
+       case KVM_CAP_ASSIGN_DEV_IRQ:
+       case KVM_CAP_PCI_2_3:
+#endif
                r = 1;
                break;
        case KVM_CAP_COALESCED_MMIO:
@@ -2521,9 +2543,11 @@ int kvm_dev_ioctl_check_extension(long ext)
        case KVM_CAP_PV_MMU:    /* obsolete */
                r = 0;
                break;
+#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
        case KVM_CAP_IOMMU:
                r = iommu_present(&pci_bus_type);
                break;
+#endif
        case KVM_CAP_MCE:
                r = KVM_MAX_MCE_BANKS;
                break;
@@ -2679,6 +2703,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
 {
+       kvm_x86_ops->sync_pir_to_irr(vcpu);
        memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
 
        return 0;
@@ -2696,7 +2721,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
                                    struct kvm_interrupt *irq)
 {
-       if (irq->irq < 0 || irq->irq >= KVM_NR_INTERRUPTS)
+       if (irq->irq >= KVM_NR_INTERRUPTS)
                return -EINVAL;
        if (irqchip_in_kernel(vcpu->kvm))
                return -ENXIO;
@@ -2819,10 +2844,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
        events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
        events->nmi.pad = 0;
 
-       events->sipi_vector = vcpu->arch.sipi_vector;
+       events->sipi_vector = 0; /* never valid when reporting to user space */
 
        events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
-                        | KVM_VCPUEVENT_VALID_SIPI_VECTOR
                         | KVM_VCPUEVENT_VALID_SHADOW);
        memset(&events->reserved, 0, sizeof(events->reserved));
 }
@@ -2853,8 +2877,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                vcpu->arch.nmi_pending = events->nmi.pending;
        kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
 
-       if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
-               vcpu->arch.sipi_vector = events->sipi_vector;
+       if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
+           kvm_vcpu_has_lapic(vcpu))
+               vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
        kvm_make_request(KVM_REQ_EVENT, vcpu);
 
@@ -3478,13 +3503,15 @@ out:
        return r;
 }
 
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
+                       bool line_status)
 {
        if (!irqchip_in_kernel(kvm))
                return -ENXIO;
 
        irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
-                                       irq_event->irq, irq_event->level);
+                                       irq_event->irq, irq_event->level,
+                                       line_status);
        return 0;
 }
 
@@ -4752,11 +4779,15 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 }
 
 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
-                                 bool write_fault_to_shadow_pgtable)
+                                 bool write_fault_to_shadow_pgtable,
+                                 int emulation_type)
 {
        gpa_t gpa = cr2;
        pfn_t pfn;
 
+       if (emulation_type & EMULTYPE_NO_REEXECUTE)
+               return false;
+
        if (!vcpu->arch.mmu.direct_map) {
                /*
                 * Write permission should be allowed since only
@@ -4899,8 +4930,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
                if (r != EMULATION_OK)  {
                        if (emulation_type & EMULTYPE_TRAP_UD)
                                return EMULATE_FAIL;
-                       if (reexecute_instruction(vcpu, cr2,
-                                                 write_fault_to_spt))
+                       if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+                                               emulation_type))
                                return EMULATE_DONE;
                        if (emulation_type & EMULTYPE_SKIP)
                                return EMULATE_FAIL;
@@ -4930,7 +4961,8 @@ restart:
                return EMULATE_DONE;
 
        if (r == EMULATION_FAILED) {
-               if (reexecute_instruction(vcpu, cr2, write_fault_to_spt))
+               if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+                                       emulation_type))
                        return EMULATE_DONE;
 
                return handle_emulation_failure(vcpu);
@@ -5641,14 +5673,20 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
-static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
+static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 {
        u64 eoi_exit_bitmap[4];
+       u32 tmr[8];
+
+       if (!kvm_apic_hw_enabled(vcpu->arch.apic))
+               return;
 
        memset(eoi_exit_bitmap, 0, 32);
+       memset(tmr, 0, 32);
 
-       kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap);
+       kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr);
        kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+       kvm_apic_update_tmr(vcpu, tmr);
 }
 
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
@@ -5656,7 +5694,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
        int r;
        bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
                vcpu->run->request_interrupt_window;
-       bool req_immediate_exit = 0;
+       bool req_immediate_exit = false;
 
        if (vcpu->requests) {
                if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
@@ -5698,24 +5736,30 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
                        record_steal_time(vcpu);
                if (kvm_check_request(KVM_REQ_NMI, vcpu))
                        process_nmi(vcpu);
-               req_immediate_exit =
-                       kvm_check_request(KVM_REQ_IMMEDIATE_EXIT, vcpu);
                if (kvm_check_request(KVM_REQ_PMU, vcpu))
                        kvm_handle_pmu_event(vcpu);
                if (kvm_check_request(KVM_REQ_PMI, vcpu))
                        kvm_deliver_pmi(vcpu);
-               if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
-                       update_eoi_exitmap(vcpu);
+               if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
+                       vcpu_scan_ioapic(vcpu);
        }
 
        if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+               kvm_apic_accept_events(vcpu);
+               if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+                       r = 1;
+                       goto out;
+               }
+
                inject_pending_event(vcpu);
 
                /* enable NMI/IRQ window open exits if needed */
                if (vcpu->arch.nmi_pending)
-                       kvm_x86_ops->enable_nmi_window(vcpu);
+                       req_immediate_exit =
+                               kvm_x86_ops->enable_nmi_window(vcpu) != 0;
                else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
-                       kvm_x86_ops->enable_irq_window(vcpu);
+                       req_immediate_exit =
+                               kvm_x86_ops->enable_irq_window(vcpu) != 0;
 
                if (kvm_lapic_enabled(vcpu)) {
                        /*
@@ -5794,7 +5838,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
        vcpu->mode = OUTSIDE_GUEST_MODE;
        smp_wmb();
-       local_irq_enable();
+
+       /* Interrupts are enabled by handle_external_intr() */
+       kvm_x86_ops->handle_external_intr(vcpu);
 
        ++vcpu->stat.exits;
 
@@ -5843,16 +5889,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
        int r;
        struct kvm *kvm = vcpu->kvm;
 
-       if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
-               pr_debug("vcpu %d received sipi with vector # %x\n",
-                        vcpu->vcpu_id, vcpu->arch.sipi_vector);
-               kvm_lapic_reset(vcpu);
-               r = kvm_vcpu_reset(vcpu);
-               if (r)
-                       return r;
-               vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
-       }
-
        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
        r = vapic_enter(vcpu);
        if (r) {
@@ -5869,8 +5905,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
                        kvm_vcpu_block(vcpu);
                        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-                       if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
-                       {
+                       if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
+                               kvm_apic_accept_events(vcpu);
                                switch(vcpu->arch.mp_state) {
                                case KVM_MP_STATE_HALTED:
                                        vcpu->arch.mp_state =
@@ -5878,7 +5914,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
                                case KVM_MP_STATE_RUNNABLE:
                                        vcpu->arch.apf.halted = false;
                                        break;
-                               case KVM_MP_STATE_SIPI_RECEIVED:
+                               case KVM_MP_STATE_INIT_RECEIVED:
+                                       break;
                                default:
                                        r = -EINTR;
                                        break;
@@ -6013,6 +6050,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
                kvm_vcpu_block(vcpu);
+               kvm_apic_accept_events(vcpu);
                clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
                r = -EAGAIN;
                goto out;
@@ -6169,6 +6207,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
+       kvm_apic_accept_events(vcpu);
        mp_state->mp_state = vcpu->arch.mp_state;
        return 0;
 }
@@ -6176,7 +6215,15 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
 {
-       vcpu->arch.mp_state = mp_state->mp_state;
+       if (!kvm_vcpu_has_lapic(vcpu) &&
+           mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
+               return -EINVAL;
+
+       if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
+               vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
+               set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
+       } else
+               vcpu->arch.mp_state = mp_state->mp_state;
        kvm_make_request(KVM_REQ_EVENT, vcpu);
        return 0;
 }
@@ -6475,9 +6522,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
        r = vcpu_load(vcpu);
        if (r)
                return r;
-       r = kvm_vcpu_reset(vcpu);
-       if (r == 0)
-               r = kvm_mmu_setup(vcpu);
+       kvm_vcpu_reset(vcpu);
+       r = kvm_mmu_setup(vcpu);
        vcpu_put(vcpu);
 
        return r;
@@ -6514,7 +6560,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
        kvm_x86_ops->vcpu_free(vcpu);
 }
 
-static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
        atomic_set(&vcpu->arch.nmi_queued, 0);
        vcpu->arch.nmi_pending = 0;
@@ -6541,7 +6587,18 @@ static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
        vcpu->arch.regs_avail = ~0;
        vcpu->arch.regs_dirty = ~0;
 
-       return kvm_x86_ops->vcpu_reset(vcpu);
+       kvm_x86_ops->vcpu_reset(vcpu);
+}
+
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector)
+{
+       struct kvm_segment cs;
+
+       kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
+       cs.selector = vector << 8;
+       cs.base = vector << 12;
+       kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+       kvm_rip_write(vcpu, 0);
 }
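A quick worked example of the real-mode start state set up above: for SIPI vector V the vCPU gets CS.selector = V << 8, CS.base = V << 12 and RIP = 0, i.e. it starts executing at physical address V * 4 KiB. A tiny standalone check of the arithmetic (values are illustrative):

#include <stdio.h>

int main(void)
{
        unsigned int vector = 0x9a;                           /* example SIPI vector */
        unsigned int cs_selector = vector << 8;               /* 0x9a00              */
        unsigned long cs_base = (unsigned long)vector << 12;  /* 0x9a000             */

        printf("CS=%04x base=%05lx entry=%05lx\n", cs_selector, cs_base, cs_base);
        return 0;
}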
 
 int kvm_arch_hardware_enable(void *garbage)
@@ -6706,8 +6763,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
        }
        vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
-       if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
+       if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
+               r = -ENOMEM;
                goto fail_free_mce_banks;
+       }
 
        r = fx_init(vcpu);
        if (r)
@@ -6811,6 +6870,23 @@ void kvm_arch_sync_events(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
+       if (current->mm == kvm->mm) {
+               /*
+                * Free memory regions allocated on behalf of userspace,
+                * unless the memory map has changed due to process exit
+                * or fd copying.
+                */
+               struct kvm_userspace_memory_region mem;
+               memset(&mem, 0, sizeof(mem));
+               mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
+               kvm_set_memory_region(kvm, &mem);
+
+               mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
+               kvm_set_memory_region(kvm, &mem);
+
+               mem.slot = TSS_PRIVATE_MEMSLOT;
+               kvm_set_memory_region(kvm, &mem);
+       }
        kvm_iommu_unmap_guest(kvm);
        kfree(kvm->arch.vpic);
        kfree(kvm->arch.vioapic);
@@ -6903,24 +6979,21 @@ out_free:
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
-                               struct kvm_memory_slot old,
                                struct kvm_userspace_memory_region *mem,
-                               bool user_alloc)
+                               enum kvm_mr_change change)
 {
-       int npages = memslot->npages;
-
        /*
         * Only private memory slots need to be mapped here since
         * KVM_SET_MEMORY_REGION ioctl is no longer supported.
         */
-       if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) {
+       if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) {
                unsigned long userspace_addr;
 
                /*
                 * MAP_SHARED to prevent internal slot pages from being moved
                 * by fork()/COW.
                 */
-               userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE,
+               userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE,
                                         PROT_READ | PROT_WRITE,
                                         MAP_SHARED | MAP_ANONYMOUS, 0);
 
@@ -6935,17 +7008,17 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
                                struct kvm_userspace_memory_region *mem,
-                               struct kvm_memory_slot old,
-                               bool user_alloc)
+                               const struct kvm_memory_slot *old,
+                               enum kvm_mr_change change)
 {
 
-       int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
+       int nr_mmu_pages = 0;
 
-       if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) {
+       if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
                int ret;
 
-               ret = vm_munmap(old.userspace_addr,
-                               old.npages * PAGE_SIZE);
+               ret = vm_munmap(old->userspace_addr,
+                               old->npages * PAGE_SIZE);
                if (ret < 0)
                        printk(KERN_WARNING
                               "kvm_vm_ioctl_set_memory_region: "
@@ -6962,14 +7035,14 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
         * Existing largepage mappings are destroyed here and new ones will
         * not be created until the end of the logging.
         */
-       if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
+       if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
                kvm_mmu_slot_remove_write_access(kvm, mem->slot);
        /*
         * If memory slot is created, or moved, we need to clear all
         * mmio sptes.
         */
-       if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) {
-               kvm_mmu_zap_all(kvm);
+       if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
+               kvm_mmu_zap_mmio_sptes(kvm);
                kvm_reload_remote_mmus(kvm);
        }
 }
@@ -6991,7 +7064,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
        return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
                !vcpu->arch.apf.halted)
                || !list_empty_careful(&vcpu->async_pf.done)
-               || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+               || kvm_apic_has_events(vcpu)
                || atomic_read(&vcpu->arch.nmi_queued) ||
                (kvm_arch_interrupt_allowed(vcpu) &&
                 kvm_cpu_has_interrupt(vcpu));
index 6711e65764b5e7a4482564014ac6aa0e78189242..2ea6165366b6f4d8da48f74859aeae3e705f5b7d 100644 (file)
@@ -443,29 +443,30 @@ static int __init test_devices_support(unsigned long addr)
 }
 /*
  * Init function for virtio
- * devices are in a single page above top of "normal" mem
+ * devices are in a single page above top of "normal" + standby mem
  */
 static int __init kvm_devices_init(void)
 {
        int rc;
+       unsigned long total_memory_size = sclp_get_rzm() * sclp_get_rnmax();
 
        if (!MACHINE_IS_KVM)
                return -ENODEV;
 
-       if (test_devices_support(real_memory_size) < 0)
+       if (test_devices_support(total_memory_size) < 0)
                return -ENODEV;
 
-       rc = vmem_add_mapping(real_memory_size, PAGE_SIZE);
+       rc = vmem_add_mapping(total_memory_size, PAGE_SIZE);
        if (rc)
                return rc;
 
-       kvm_devices = (void *) real_memory_size;
+       kvm_devices = (void *) total_memory_size;
 
        kvm_root = root_device_register("kvm_s390");
        if (IS_ERR(kvm_root)) {
                rc = PTR_ERR(kvm_root);
                printk(KERN_ERR "Could not register kvm_s390 root device");
-               vmem_remove_mapping(real_memory_size, PAGE_SIZE);
+               vmem_remove_mapping(total_memory_size, PAGE_SIZE);
                return rc;
        }
 
index fb877b59ec57a472e74e314a7c8afb98c245b64a..779dc5136291610f28361b7386151cc1bb4b9372 100644 (file)
@@ -31,6 +31,7 @@
 #include <asm/irq.h>
 #include <asm/cio.h>
 #include <asm/ccwdev.h>
+#include <asm/virtio-ccw.h>
 
 /*
  * virtio related functions
@@ -77,12 +78,9 @@ struct virtio_ccw_vq_info {
        void *queue;
        struct vq_info_block *info_block;
        struct list_head node;
+       long cookie;
 };
 
-#define KVM_VIRTIO_CCW_RING_ALIGN 4096
-
-#define KVM_S390_VIRTIO_CCW_NOTIFY 3
-
 #define CCW_CMD_SET_VQ 0x13
 #define CCW_CMD_VDEV_RESET 0x33
 #define CCW_CMD_SET_IND 0x43
@@ -135,8 +133,11 @@ static int ccw_io_helper(struct virtio_ccw_device *vcdev,
        do {
                spin_lock_irqsave(get_ccwdev_lock(vcdev->cdev), flags);
                ret = ccw_device_start(vcdev->cdev, ccw, intparm, 0, 0);
-               if (!ret)
+               if (!ret) {
+                       if (!vcdev->curr_io)
+                               vcdev->err = 0;
                        vcdev->curr_io |= flag;
+               }
                spin_unlock_irqrestore(get_ccwdev_lock(vcdev->cdev), flags);
                cpu_relax();
        } while (ret == -EBUSY);
@@ -145,15 +146,18 @@ static int ccw_io_helper(struct virtio_ccw_device *vcdev,
 }
 
 static inline long do_kvm_notify(struct subchannel_id schid,
-                                unsigned long queue_index)
+                                unsigned long queue_index,
+                                long cookie)
 {
        register unsigned long __nr asm("1") = KVM_S390_VIRTIO_CCW_NOTIFY;
        register struct subchannel_id __schid asm("2") = schid;
        register unsigned long __index asm("3") = queue_index;
        register long __rc asm("2");
+       register long __cookie asm("4") = cookie;
 
        asm volatile ("diag 2,4,0x500\n"
-                     : "=d" (__rc) : "d" (__nr), "d" (__schid), "d" (__index)
+                     : "=d" (__rc) : "d" (__nr), "d" (__schid), "d" (__index),
+                     "d"(__cookie)
                      : "memory", "cc");
        return __rc;
 }
@@ -166,7 +170,7 @@ static void virtio_ccw_kvm_notify(struct virtqueue *vq)
 
        vcdev = to_vc_device(info->vq->vdev);
        ccw_device_get_schid(vcdev->cdev, &schid);
-       do_kvm_notify(schid, vq->index);
+       info->cookie = do_kvm_notify(schid, vq->index, info->cookie);
 }
 
 static int virtio_ccw_read_vq_conf(struct virtio_ccw_device *vcdev,
index c13958251927d6c122c32795c712e8f994c29109..f0eea07d2c2bb5168820443639af6e85a9d47850 100644 (file)
@@ -117,14 +117,13 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_APF_HALT          12
 #define KVM_REQ_STEAL_UPDATE      13
 #define KVM_REQ_NMI               14
-#define KVM_REQ_IMMEDIATE_EXIT    15
-#define KVM_REQ_PMU               16
-#define KVM_REQ_PMI               17
-#define KVM_REQ_WATCHDOG          18
-#define KVM_REQ_MASTERCLOCK_UPDATE 19
-#define KVM_REQ_MCLOCK_INPROGRESS 20
-#define KVM_REQ_EPR_EXIT          21
-#define KVM_REQ_EOIBITMAP         22
+#define KVM_REQ_PMU               15
+#define KVM_REQ_PMI               16
+#define KVM_REQ_WATCHDOG          17
+#define KVM_REQ_MASTERCLOCK_UPDATE 18
+#define KVM_REQ_MCLOCK_INPROGRESS 19
+#define KVM_REQ_EPR_EXIT          20
+#define KVM_REQ_SCAN_IOAPIC       21
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID            0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID       1
@@ -133,6 +132,9 @@ struct kvm;
 struct kvm_vcpu;
 extern struct kmem_cache *kvm_vcpu_cache;
 
+extern raw_spinlock_t kvm_lock;
+extern struct list_head vm_list;
+
 struct kvm_io_range {
        gpa_t addr;
        int len;
@@ -149,6 +151,7 @@ struct kvm_io_bus {
 enum kvm_bus {
        KVM_MMIO_BUS,
        KVM_PIO_BUS,
+       KVM_VIRTIO_CCW_NOTIFY_BUS,
        KVM_NR_BUSES
 };
 
@@ -252,6 +255,7 @@ struct kvm_vcpu {
                bool dy_eligible;
        } spin_loop;
 #endif
+       bool preempted;
        struct kvm_vcpu_arch arch;
 };
 
@@ -285,7 +289,8 @@ struct kvm_kernel_irq_routing_entry {
        u32 gsi;
        u32 type;
        int (*set)(struct kvm_kernel_irq_routing_entry *e,
-                  struct kvm *kvm, int irq_source_id, int level);
+                  struct kvm *kvm, int irq_source_id, int level,
+                  bool line_status);
        union {
                struct {
                        unsigned irqchip;
@@ -296,10 +301,10 @@ struct kvm_kernel_irq_routing_entry {
        struct hlist_node link;
 };
 
-#ifdef __KVM_HAVE_IOAPIC
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
 
 struct kvm_irq_routing_table {
-       int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS];
+       int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
        struct kvm_kernel_irq_routing_entry *rt_entries;
        u32 nr_rt_entries;
        /*
@@ -385,6 +390,7 @@ struct kvm {
        long mmu_notifier_count;
 #endif
        long tlbs_dirty;
+       struct list_head devices;
 };
 
 #define kvm_err(fmt, ...) \
@@ -424,6 +430,19 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
 int __must_check vcpu_load(struct kvm_vcpu *vcpu);
 void vcpu_put(struct kvm_vcpu *vcpu);
 
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
+int kvm_irqfd_init(void);
+void kvm_irqfd_exit(void);
+#else
+static inline int kvm_irqfd_init(void)
+{
+       return 0;
+}
+
+static inline void kvm_irqfd_exit(void)
+{
+}
+#endif
 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
                  struct module *module);
 void kvm_exit(void);
@@ -452,24 +471,39 @@ id_to_memslot(struct kvm_memslots *slots, int id)
        return slot;
 }
 
+/*
+ * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations:
+ * - create a new memory slot
+ * - delete an existing memory slot
+ * - modify an existing memory slot
+ *   -- move it in the guest physical memory space
+ *   -- just change its flags
+ *
+ * Since flags can be changed by some of these operations, the following
+ * differentiation is the best we can do for __kvm_set_memory_region():
+ */
+enum kvm_mr_change {
+       KVM_MR_CREATE,
+       KVM_MR_DELETE,
+       KVM_MR_MOVE,
+       KVM_MR_FLAGS_ONLY,
+};
+
 int kvm_set_memory_region(struct kvm *kvm,
-                         struct kvm_userspace_memory_region *mem,
-                         bool user_alloc);
+                         struct kvm_userspace_memory_region *mem);
 int __kvm_set_memory_region(struct kvm *kvm,
-                           struct kvm_userspace_memory_region *mem,
-                           bool user_alloc);
+                           struct kvm_userspace_memory_region *mem);
 void kvm_arch_free_memslot(struct kvm_memory_slot *free,
                           struct kvm_memory_slot *dont);
 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *memslot,
-                               struct kvm_memory_slot old,
                                struct kvm_userspace_memory_region *mem,
-                               bool user_alloc);
+                               enum kvm_mr_change change);
 void kvm_arch_commit_memory_region(struct kvm *kvm,
                                struct kvm_userspace_memory_region *mem,
-                               struct kvm_memory_slot old,
-                               bool user_alloc);
+                               const struct kvm_memory_slot *old,
+                               enum kvm_mr_change change);
 bool kvm_largepages_enabled(void);
 void kvm_disable_largepages(void);
 /* flush all memory translations */
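
As a reference for the reworked slot API, here is a minimal userspace sketch of KVM_SET_USER_MEMORY_REGION; the kernel now classifies each call into one of the kvm_mr_change cases instead of passing the old slot and a user_alloc flag around. The vm_fd, slot number, addresses and sizes are illustrative assumptions.

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hedged sketch: the kernel derives KVM_MR_CREATE/MOVE/DELETE/FLAGS_ONLY by
 * comparing the old and new slot contents; userspace only fills this struct. */
static int set_guest_ram(int vm_fd, void *host_mem, __u64 gpa, __u64 size)
{
        struct kvm_userspace_memory_region region = {
                .slot            = 0,
                .flags           = 0,     /* or KVM_MEM_LOG_DIRTY_PAGES          */
                .guest_phys_addr = gpa,
                .memory_size     = size,  /* 0 deletes the slot (KVM_MR_DELETE)  */
                .userspace_addr  = (__u64)(unsigned long)host_mem,
        };

        /* CREATE if the slot was empty, MOVE if guest_phys_addr changed,
         * FLAGS_ONLY if only the flags differ. */
        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}
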
@@ -539,7 +573,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_reload_remote_mmus(struct kvm *kvm);
 void kvm_make_mclock_inprogress_request(struct kvm *kvm);
-void kvm_make_update_eoibitmap_request(struct kvm *kvm);
+void kvm_make_scan_ioapic_request(struct kvm *kvm);
 
 long kvm_arch_dev_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg);
@@ -555,10 +589,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
                                struct kvm_dirty_log *log);
 
 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
-                                  struct
-                                  kvm_userspace_memory_region *mem,
-                                  bool user_alloc);
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level);
+                                  struct kvm_userspace_memory_region *mem);
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
+                       bool line_status);
 long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg);
 
@@ -632,7 +665,6 @@ static inline wait_queue_head_t *kvm_arch_vcpu_wq(struct kvm_vcpu *vcpu)
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type);
 void kvm_arch_destroy_vm(struct kvm *kvm);
-void kvm_free_all_assigned_devices(struct kvm *kvm);
 void kvm_arch_sync_events(struct kvm *kvm);
 
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
@@ -684,15 +716,11 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
                             bool mask);
 
-#ifdef __KVM_HAVE_IOAPIC
-void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
-                                  union kvm_ioapic_redirect_entry *entry,
-                                  unsigned long *deliver_bitmask);
-#endif
-int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level);
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
+               bool line_status);
 int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level);
 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
-               int irq_source_id, int level);
+               int irq_source_id, int level, bool line_status);
 bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
@@ -705,7 +733,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 /* For vcpu->arch.iommu_flags */
 #define KVM_IOMMU_CACHE_COHERENCY      0x1
 
-#ifdef CONFIG_IOMMU_API
+#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
 void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
 int kvm_iommu_map_guest(struct kvm *kvm);
@@ -714,7 +742,7 @@ int kvm_assign_device(struct kvm *kvm,
                      struct kvm_assigned_dev_kernel *assigned_dev);
 int kvm_deassign_device(struct kvm *kvm,
                        struct kvm_assigned_dev_kernel *assigned_dev);
-#else /* CONFIG_IOMMU_API */
+#else
 static inline int kvm_iommu_map_pages(struct kvm *kvm,
                                      struct kvm_memory_slot *slot)
 {
@@ -726,28 +754,11 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
 {
 }
 
-static inline int kvm_iommu_map_guest(struct kvm *kvm)
-{
-       return -ENODEV;
-}
-
 static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
 {
        return 0;
 }
-
-static inline int kvm_assign_device(struct kvm *kvm,
-               struct kvm_assigned_dev_kernel *assigned_dev)
-{
-       return 0;
-}
-
-static inline int kvm_deassign_device(struct kvm *kvm,
-               struct kvm_assigned_dev_kernel *assigned_dev)
-{
-       return 0;
-}
-#endif /* CONFIG_IOMMU_API */
+#endif
 
 static inline void __guest_enter(void)
 {
@@ -921,7 +932,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
 }
 #endif
 
-#ifdef KVM_CAP_IRQ_ROUTING
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
 
 #define KVM_MAX_IRQ_ROUTES 1024
 
@@ -930,6 +941,9 @@ int kvm_set_irq_routing(struct kvm *kvm,
                        const struct kvm_irq_routing_entry *entries,
                        unsigned nr,
                        unsigned flags);
+int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
+                         struct kvm_kernel_irq_routing_entry *e,
+                         const struct kvm_irq_routing_entry *ue);
 void kvm_free_irq_routing(struct kvm *kvm);
 
 int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
@@ -998,11 +1012,13 @@ static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; }
 
 #endif
 
-#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
+#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 
 long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
                                  unsigned long arg);
 
+void kvm_free_all_assigned_devices(struct kvm *kvm);
+
 #else
 
 static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
@@ -1011,6 +1027,8 @@ static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
        return -ENOTTY;
 }
 
+static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {}
+
 #endif
 
 static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
@@ -1028,6 +1046,46 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
        }
 }
 
+extern bool kvm_rebooting;
+
+struct kvm_device_ops;
+
+struct kvm_device {
+       struct kvm_device_ops *ops;
+       struct kvm *kvm;
+       void *private;
+       struct list_head vm_node;
+};
+
+/* create, destroy, and name are mandatory */
+struct kvm_device_ops {
+       const char *name;
+       int (*create)(struct kvm_device *dev, u32 type);
+
+       /*
+        * Destroy is responsible for freeing dev.
+        *
+        * Destroy may be called before or after destructors are called
+        * on emulated I/O regions, depending on whether a reference is
+        * held by a vcpu or other kvm component that gets destroyed
+        * after the emulated I/O.
+        */
+       void (*destroy)(struct kvm_device *dev);
+
+       int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
+       int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
+       int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
+       long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
+                     unsigned long arg);
+};
+
+void kvm_device_get(struct kvm_device *dev);
+void kvm_device_put(struct kvm_device *dev);
+struct kvm_device *kvm_device_from_filp(struct file *filp);
+
+extern struct kvm_device_ops kvm_mpic_ops;
+extern struct kvm_device_ops kvm_xics_ops;
+
 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
 
 static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
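
A minimal in-kernel sketch of a backend for the new device API follows. The "demo" names and the empty attribute handler are hypothetical; per the comment above, only create, destroy and name are mandatory.

#include <linux/kvm_host.h>
#include <linux/slab.h>

/* Hedged sketch of a kvm_device_ops backend using the new device API. */
struct demo_state {
        u32 dummy;
};

static int demo_dev_create(struct kvm_device *dev, u32 type)
{
        struct demo_state *s = kzalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                return -ENOMEM;
        dev->private = s;               /* per-device state */
        return 0;
}

static void demo_dev_destroy(struct kvm_device *dev)
{
        kfree(dev->private);
        kfree(dev);                     /* destroy() must free dev itself */
}

static int demo_dev_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
{
        return -ENXIO;                  /* no attribute groups in this sketch */
}

struct kvm_device_ops kvm_demo_ops = {
        .name     = "demo",
        .create   = demo_dev_create,
        .destroy  = demo_dev_destroy,
        .set_attr = demo_dev_set_attr,
};
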
index 19911dddaeb775540362aee763de33dfb78dbead..7005d1109ec94c8c2839bbff6984395e20e8a873 100644 (file)
@@ -37,7 +37,7 @@ TRACE_EVENT(kvm_userspace_exit,
                  __entry->errno < 0 ? -__entry->errno : __entry->reason)
 );
 
-#if defined(__KVM_HAVE_IRQ_LINE)
+#if defined(CONFIG_HAVE_KVM_IRQCHIP)
 TRACE_EVENT(kvm_set_irq,
        TP_PROTO(unsigned int gsi, int level, int irq_source_id),
        TP_ARGS(gsi, level, irq_source_id),
@@ -122,6 +122,10 @@ TRACE_EVENT(kvm_msi_set_irq,
        {KVM_IRQCHIP_PIC_SLAVE,         "PIC slave"},           \
        {KVM_IRQCHIP_IOAPIC,            "IOAPIC"}
 
+#endif /* defined(__KVM_HAVE_IOAPIC) */
+
+#if defined(CONFIG_HAVE_KVM_IRQCHIP)
+
 TRACE_EVENT(kvm_ack_irq,
        TP_PROTO(unsigned int irqchip, unsigned int pin),
        TP_ARGS(irqchip, pin),
@@ -136,14 +140,18 @@ TRACE_EVENT(kvm_ack_irq,
                __entry->pin            = pin;
        ),
 
+#ifdef kvm_irqchips
        TP_printk("irqchip %s pin %u",
                  __print_symbolic(__entry->irqchip, kvm_irqchips),
                 __entry->pin)
+#else
+       TP_printk("irqchip %d pin %u", __entry->irqchip, __entry->pin)
+#endif
 );
 
+#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */
 
 
-#endif /* defined(__KVM_HAVE_IOAPIC) */
 
 #define KVM_TRACE_MMIO_READ_UNSATISFIED 0
 #define KVM_TRACE_MMIO_READ 1
index 3c56ba3d80c16007f9eda468f96bf337f998c1a9..a5c86fc34a370f8480b7c1773b18d7b5bee6b940 100644 (file)
@@ -449,12 +449,15 @@ enum {
        kvm_ioeventfd_flag_nr_datamatch,
        kvm_ioeventfd_flag_nr_pio,
        kvm_ioeventfd_flag_nr_deassign,
+       kvm_ioeventfd_flag_nr_virtio_ccw_notify,
        kvm_ioeventfd_flag_nr_max,
 };
 
 #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
 #define KVM_IOEVENTFD_FLAG_PIO       (1 << kvm_ioeventfd_flag_nr_pio)
 #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
+#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
+       (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
 
 #define KVM_IOEVENTFD_VALID_FLAG_MASK  ((1 << kvm_ioeventfd_flag_nr_max) - 1)
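
A hedged userspace sketch of registering an ioeventfd on the new virtio-ccw notify bus; the encoding of addr as the subchannel id and datamatch as the virtqueue index is an assumption here, as is the vm_fd handling.

#include <linux/kvm.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>

/* Hedged sketch: the new flag routes the registration to
 * KVM_VIRTIO_CCW_NOTIFY_BUS instead of the MMIO or PIO bus. */
static int assign_ccw_notify_eventfd(int vm_fd, __u64 schid, __u64 vq_index)
{
        struct kvm_ioeventfd ioevent;
        int efd = eventfd(0, 0);

        if (efd < 0)
                return -1;

        ioevent = (struct kvm_ioeventfd) {
                .datamatch = vq_index,   /* assumed: virtqueue index     */
                .addr      = schid,      /* assumed: subchannel id       */
                .len       = 8,
                .fd        = efd,
                .flags     = KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY |
                             KVM_IOEVENTFD_FLAG_DATAMATCH,
        };

        if (ioctl(vm_fd, KVM_IOEVENTFD, &ioevent) < 0)
                return -1;

        return efd;     /* kernel signals this eventfd on matching notifications */
}
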
 
@@ -558,9 +561,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_MP_STATE 14
 #define KVM_CAP_COALESCED_MMIO 15
 #define KVM_CAP_SYNC_MMU 16  /* Changes to host mmap are reflected in guest */
-#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
 #define KVM_CAP_DEVICE_ASSIGNMENT 17
-#endif
 #define KVM_CAP_IOMMU 18
 #ifdef __KVM_HAVE_MSI
 #define KVM_CAP_DEVICE_MSI 20
@@ -576,13 +577,9 @@ struct kvm_ppc_smmu_info {
 #ifdef __KVM_HAVE_PIT
 #define KVM_CAP_REINJECT_CONTROL 24
 #endif
-#ifdef __KVM_HAVE_IOAPIC
 #define KVM_CAP_IRQ_ROUTING 25
-#endif
 #define KVM_CAP_IRQ_INJECT_STATUS 26
-#ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
 #define KVM_CAP_DEVICE_DEASSIGNMENT 27
-#endif
 #ifdef __KVM_HAVE_MSIX
 #define KVM_CAP_DEVICE_MSIX 28
 #endif
@@ -665,6 +662,10 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_EPR 86
 #define KVM_CAP_ARM_PSCI 87
 #define KVM_CAP_ARM_SET_DEVICE_ADDR 88
+#define KVM_CAP_DEVICE_CTRL 89
+#define KVM_CAP_IRQ_MPIC 90
+#define KVM_CAP_PPC_RTAS 91
+#define KVM_CAP_IRQ_XICS 92
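
Userspace is expected to probe the new capability before relying on the device control API; a minimal sketch, where kvm_fd stands for the /dev/kvm file descriptor and its handling is illustrative:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hedged sketch: check KVM_CAP_DEVICE_CTRL before using KVM_CREATE_DEVICE. */
static int have_device_ctrl(int kvm_fd)
{
        return ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) > 0;
}
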
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -817,6 +818,28 @@ struct kvm_arm_device_addr {
        __u64 addr;
 };
 
+/*
+ * Device control API, available with KVM_CAP_DEVICE_CTRL
+ */
+#define KVM_CREATE_DEVICE_TEST         1
+
+struct kvm_create_device {
+       __u32   type;   /* in: KVM_DEV_TYPE_xxx */
+       __u32   fd;     /* out: device handle */
+       __u32   flags;  /* in: KVM_CREATE_DEVICE_xxx */
+};
+
+struct kvm_device_attr {
+       __u32   flags;          /* no flags currently defined */
+       __u32   group;          /* device-defined */
+       __u64   attr;           /* group-defined */
+       __u64   addr;           /* userspace address of attr data */
+};
+
+#define KVM_DEV_TYPE_FSL_MPIC_20       1
+#define KVM_DEV_TYPE_FSL_MPIC_42       2
+#define KVM_DEV_TYPE_XICS              3
+
 /*
  * ioctls for VM fds
  */
@@ -904,6 +927,16 @@ struct kvm_s390_ucas_mapping {
 #define KVM_PPC_GET_HTAB_FD      _IOW(KVMIO,  0xaa, struct kvm_get_htab_fd)
 /* Available with KVM_CAP_ARM_SET_DEVICE_ADDR */
 #define KVM_ARM_SET_DEVICE_ADDR          _IOW(KVMIO,  0xab, struct kvm_arm_device_addr)
+/* Available with KVM_CAP_PPC_RTAS */
+#define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO,  0xac, struct kvm_rtas_token_args)
+
+/* ioctl for vm fd */
+#define KVM_CREATE_DEVICE        _IOWR(KVMIO,  0xe0, struct kvm_create_device)
+
+/* ioctls for fds returned by KVM_CREATE_DEVICE */
+#define KVM_SET_DEVICE_ATTR      _IOW(KVMIO,  0xe1, struct kvm_device_attr)
+#define KVM_GET_DEVICE_ATTR      _IOW(KVMIO,  0xe2, struct kvm_device_attr)
+#define KVM_HAS_DEVICE_ATTR      _IOW(KVMIO,  0xe3, struct kvm_device_attr)
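
Putting the new structs and ioctls together, a hedged userspace sketch of creating an in-kernel device and setting one attribute; the device type, the attribute group/number placeholders and the vm_fd are illustrative assumptions.

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hedged sketch of the device control API from the userspace side. */
static int create_in_kernel_mpic(int vm_fd, __u64 base_addr)
{
        struct kvm_create_device cd = {
                .type  = KVM_DEV_TYPE_FSL_MPIC_20,
                .flags = 0,             /* or KVM_CREATE_DEVICE_TEST to probe */
        };
        struct kvm_device_attr attr;

        if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
                return -1;

        /* Group/attr numbers are device defined; 0/0 is only a placeholder. */
        attr.flags = 0;
        attr.group = 0;
        attr.attr  = 0;
        attr.addr  = (__u64)(unsigned long)&base_addr;
        if (ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr) < 0)
                return -1;

        return cd.fd;                   /* device fd for further ioctls */
}
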
 
 /*
  * ioctls for vcpu fds
index d01b24b72c61e75f3225776541c2f99824df443e..779262f59e252b458a67335635b59d0d1ca1932e 100644 (file)
@@ -6,6 +6,9 @@ config HAVE_KVM
 config HAVE_KVM_IRQCHIP
        bool
 
+config HAVE_KVM_IRQ_ROUTING
+       bool
+
 config HAVE_KVM_EVENTFD
        bool
        select EVENTFD
index 3642239252b0015593fe690ec0fa83ae5b841145..8db43701016f30cebeb37ab7e4d581166606b86f 100644 (file)
@@ -80,11 +80,12 @@ kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
                spin_lock(&assigned_dev->intx_mask_lock);
                if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
                        kvm_set_irq(assigned_dev->kvm,
-                                   assigned_dev->irq_source_id, vector, 1);
+                                   assigned_dev->irq_source_id, vector, 1,
+                                   false);
                spin_unlock(&assigned_dev->intx_mask_lock);
        } else
                kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-                           vector, 1);
+                           vector, 1, false);
 }
 
 static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
@@ -165,7 +166,7 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
                container_of(kian, struct kvm_assigned_dev_kernel,
                             ack_notifier);
 
-       kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
+       kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false);
 
        spin_lock(&dev->intx_mask_lock);
 
@@ -188,7 +189,7 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 
                if (reassert)
                        kvm_set_irq(dev->kvm, dev->irq_source_id,
-                                   dev->guest_irq, 1);
+                                   dev->guest_irq, 1, false);
        }
 
        spin_unlock(&dev->intx_mask_lock);
@@ -202,7 +203,7 @@ static void deassign_guest_irq(struct kvm *kvm,
                                                &assigned_dev->ack_notifier);
 
        kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-                   assigned_dev->guest_irq, 0);
+                   assigned_dev->guest_irq, 0, false);
 
        if (assigned_dev->irq_source_id != -1)
                kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
@@ -901,7 +902,7 @@ static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
        if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
                if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
                        kvm_set_irq(match->kvm, match->irq_source_id,
-                                   match->guest_irq, 0);
+                                   match->guest_irq, 0, false);
                        /*
                         * Masking at hardware-level is performed on demand,
                         * i.e. when an IRQ actually arrives at the host.
@@ -982,36 +983,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
                        goto out;
                break;
        }
-#ifdef KVM_CAP_IRQ_ROUTING
-       case KVM_SET_GSI_ROUTING: {
-               struct kvm_irq_routing routing;
-               struct kvm_irq_routing __user *urouting;
-               struct kvm_irq_routing_entry *entries;
-
-               r = -EFAULT;
-               if (copy_from_user(&routing, argp, sizeof(routing)))
-                       goto out;
-               r = -EINVAL;
-               if (routing.nr >= KVM_MAX_IRQ_ROUTES)
-                       goto out;
-               if (routing.flags)
-                       goto out;
-               r = -ENOMEM;
-               entries = vmalloc(routing.nr * sizeof(*entries));
-               if (!entries)
-                       goto out;
-               r = -EFAULT;
-               urouting = argp;
-               if (copy_from_user(entries, urouting->entries,
-                                  routing.nr * sizeof(*entries)))
-                       goto out_free_irq_routing;
-               r = kvm_set_irq_routing(kvm, entries, routing.nr,
-                                       routing.flags);
-       out_free_irq_routing:
-               vfree(entries);
-               break;
-       }
-#endif /* KVM_CAP_IRQ_ROUTING */
 #ifdef __KVM_HAVE_MSIX
        case KVM_ASSIGN_SET_MSIX_NR: {
                struct kvm_assigned_msix_nr entry_nr;
index adb17f266b28ce899857538fa85e621d85ad12c4..64ee720b75c7ac4a80c4e1c06cd5cacf0a3fa961 100644 (file)
@@ -35,7 +35,7 @@
 
 #include "iodev.h"
 
-#ifdef __KVM_HAVE_IOAPIC
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
 /*
  * --------------------------------------------------------------------
  * irqfd: Allows an fd to be used to inject an interrupt to the guest
@@ -100,11 +100,13 @@ irqfd_inject(struct work_struct *work)
        struct kvm *kvm = irqfd->kvm;
 
        if (!irqfd->resampler) {
-               kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
-               kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
+               kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1,
+                               false);
+               kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0,
+                               false);
        } else
                kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
-                           irqfd->gsi, 1);
+                           irqfd->gsi, 1, false);
 }
 
 /*
@@ -121,7 +123,7 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
        resampler = container_of(kian, struct _irqfd_resampler, notifier);
 
        kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
-                   resampler->notifier.gsi, 0);
+                   resampler->notifier.gsi, 0, false);
 
        rcu_read_lock();
 
@@ -146,7 +148,7 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd)
                list_del(&resampler->link);
                kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
                kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
-                           resampler->notifier.gsi, 0);
+                           resampler->notifier.gsi, 0, false);
                kfree(resampler);
        }
 
@@ -225,7 +227,8 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
                irq = rcu_dereference(irqfd->irq_entry);
                /* An event has been signaled, inject an interrupt */
                if (irq)
-                       kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
+                       kvm_set_msi(irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1,
+                                       false);
                else
                        schedule_work(&irqfd->inject);
                rcu_read_unlock();
@@ -430,7 +433,7 @@ fail:
 void
 kvm_eventfd_init(struct kvm *kvm)
 {
-#ifdef __KVM_HAVE_IOAPIC
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
        spin_lock_init(&kvm->irqfds.lock);
        INIT_LIST_HEAD(&kvm->irqfds.items);
        INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
@@ -439,7 +442,7 @@ kvm_eventfd_init(struct kvm *kvm)
        INIT_LIST_HEAD(&kvm->ioeventfds);
 }
 
-#ifdef __KVM_HAVE_IOAPIC
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
 /*
  * shutdown any irqfd's that match fd+gsi
  */
@@ -543,7 +546,7 @@ void kvm_irq_routing_update(struct kvm *kvm,
  * aggregated from all vm* instances. We need our own isolated single-thread
  * queue to prevent deadlock against flushing the normal work-queue.
  */
-static int __init irqfd_module_init(void)
+int kvm_irqfd_init(void)
 {
        irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
        if (!irqfd_cleanup_wq)
@@ -552,13 +555,10 @@ static int __init irqfd_module_init(void)
        return 0;
 }
 
-static void __exit irqfd_module_exit(void)
+void kvm_irqfd_exit(void)
 {
        destroy_workqueue(irqfd_cleanup_wq);
 }
-
-module_init(irqfd_module_init);
-module_exit(irqfd_module_exit);
 #endif
 
 /*
@@ -577,6 +577,7 @@ struct _ioeventfd {
        struct eventfd_ctx  *eventfd;
        u64                  datamatch;
        struct kvm_io_device dev;
+       u8                   bus_idx;
        bool                 wildcard;
 };
 
@@ -669,7 +670,8 @@ ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
        struct _ioeventfd *_p;
 
        list_for_each_entry(_p, &kvm->ioeventfds, list)
-               if (_p->addr == p->addr && _p->length == p->length &&
+               if (_p->bus_idx == p->bus_idx &&
+                   _p->addr == p->addr && _p->length == p->length &&
                    (_p->wildcard || p->wildcard ||
                     _p->datamatch == p->datamatch))
                        return true;
@@ -677,15 +679,24 @@ ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
        return false;
 }
 
+static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags)
+{
+       if (flags & KVM_IOEVENTFD_FLAG_PIO)
+               return KVM_PIO_BUS;
+       if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
+               return KVM_VIRTIO_CCW_NOTIFY_BUS;
+       return KVM_MMIO_BUS;
+}
+
 static int
 kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 {
-       int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
-       enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
+       enum kvm_bus              bus_idx;
        struct _ioeventfd        *p;
        struct eventfd_ctx       *eventfd;
        int                       ret;
 
+       bus_idx = ioeventfd_bus_from_flags(args->flags);
        /* must be natural-word sized */
        switch (args->len) {
        case 1:
@@ -717,6 +728,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 
        INIT_LIST_HEAD(&p->list);
        p->addr    = args->addr;
+       p->bus_idx = bus_idx;
        p->length  = args->len;
        p->eventfd = eventfd;
 
@@ -760,12 +772,12 @@ fail:
 static int
 kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 {
-       int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
-       enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
+       enum kvm_bus              bus_idx;
        struct _ioeventfd        *p, *tmp;
        struct eventfd_ctx       *eventfd;
        int                       ret = -ENOENT;
 
+       bus_idx = ioeventfd_bus_from_flags(args->flags);
        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);
@@ -775,7 +787,8 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
        list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
                bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
 
-               if (p->eventfd != eventfd  ||
+               if (p->bus_idx != bus_idx ||
+                   p->eventfd != eventfd  ||
                    p->addr != args->addr  ||
                    p->length != args->len ||
                    p->wildcard != wildcard)
index 5ba005c00e2f76998694d202eca68452600ee5bf..2d682977ce82656bd29a9a915fced16174163db3 100644 (file)
@@ -50,7 +50,8 @@
 #else
 #define ioapic_debug(fmt, arg...)
 #endif
-static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq);
+static int ioapic_deliver(struct kvm_ioapic *vioapic, int irq,
+               bool line_status);
 
 static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
                                          unsigned long addr,
@@ -90,7 +91,80 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
        return result;
 }
 
-static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
+static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
+{
+       ioapic->rtc_status.pending_eoi = 0;
+       bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS);
+}
+
+static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+       bool new_val, old_val;
+       struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+       union kvm_ioapic_redirect_entry *e;
+
+       e = &ioapic->redirtbl[RTC_GSI];
+       if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id,
+                               e->fields.dest_mode))
+               return;
+
+       new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
+       old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+
+       if (new_val == old_val)
+               return;
+
+       if (new_val) {
+               __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+               ioapic->rtc_status.pending_eoi++;
+       } else {
+               __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+               ioapic->rtc_status.pending_eoi--;
+       }
+
+       WARN_ON(ioapic->rtc_status.pending_eoi < 0);
+}
+
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+       struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+
+       spin_lock(&ioapic->lock);
+       __rtc_irq_eoi_tracking_restore_one(vcpu);
+       spin_unlock(&ioapic->lock);
+}
+
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
+{
+       struct kvm_vcpu *vcpu;
+       int i;
+
+       if (RTC_GSI >= IOAPIC_NUM_PINS)
+               return;
+
+       rtc_irq_eoi_tracking_reset(ioapic);
+       kvm_for_each_vcpu(i, vcpu, ioapic->kvm)
+           __rtc_irq_eoi_tracking_restore_one(vcpu);
+}
+
+static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu)
+{
+       if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map))
+               --ioapic->rtc_status.pending_eoi;
+
+       WARN_ON(ioapic->rtc_status.pending_eoi < 0);
+}
+
+static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
+{
+       if (ioapic->rtc_status.pending_eoi > 0)
+               return true; /* coalesced */
+
+       return false;
+}
+
+static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx,
+               bool line_status)
 {
        union kvm_ioapic_redirect_entry *pent;
        int injected = -1;
@@ -98,7 +172,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
        pent = &ioapic->redirtbl[idx];
 
        if (!pent->fields.mask) {
-               injected = ioapic_deliver(ioapic, idx);
+               injected = ioapic_deliver(ioapic, idx, line_status);
                if (injected && pent->fields.trig_mode == IOAPIC_LEVEL_TRIG)
                        pent->fields.remote_irr = 1;
        }
@@ -119,41 +193,48 @@ static void update_handled_vectors(struct kvm_ioapic *ioapic)
        smp_wmb();
 }
 
-void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
-                                       u64 *eoi_exit_bitmap)
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
+                       u32 *tmr)
 {
        struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
        union kvm_ioapic_redirect_entry *e;
-       struct kvm_lapic_irq irqe;
        int index;
 
        spin_lock(&ioapic->lock);
-       /* traverse ioapic entry to set eoi exit bitmap*/
        for (index = 0; index < IOAPIC_NUM_PINS; index++) {
                e = &ioapic->redirtbl[index];
                if (!e->fields.mask &&
                        (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
                         kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC,
-                                index))) {
-                       irqe.dest_id = e->fields.dest_id;
-                       irqe.vector = e->fields.vector;
-                       irqe.dest_mode = e->fields.dest_mode;
-                       irqe.delivery_mode = e->fields.delivery_mode << 8;
-                       kvm_calculate_eoi_exitmap(vcpu, &irqe, eoi_exit_bitmap);
+                                index) || index == RTC_GSI)) {
+                       if (kvm_apic_match_dest(vcpu, NULL, 0,
+                               e->fields.dest_id, e->fields.dest_mode)) {
+                               __set_bit(e->fields.vector,
+                                       (unsigned long *)eoi_exit_bitmap);
+                               if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG)
+                                       __set_bit(e->fields.vector,
+                                               (unsigned long *)tmr);
+                       }
                }
        }
        spin_unlock(&ioapic->lock);
 }
-EXPORT_SYMBOL_GPL(kvm_ioapic_calculate_eoi_exitmap);
 
-void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm)
+#ifdef CONFIG_X86
+void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
 {
        struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 
-       if (!kvm_apic_vid_enabled(kvm) || !ioapic)
+       if (!ioapic)
                return;
-       kvm_make_update_eoibitmap_request(kvm);
+       kvm_make_scan_ioapic_request(kvm);
 }
+#else
+void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
+{
+       return;
+}
+#endif
 
 static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 {
@@ -195,16 +276,17 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
                        kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
                if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
                    && ioapic->irr & (1 << index))
-                       ioapic_service(ioapic, index);
-               kvm_ioapic_make_eoibitmap_request(ioapic->kvm);
+                       ioapic_service(ioapic, index, false);
+               kvm_vcpu_request_scan_ioapic(ioapic->kvm);
                break;
        }
 }
 
-static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
+static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq, bool line_status)
 {
        union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
        struct kvm_lapic_irq irqe;
+       int ret;
 
        ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
                     "vector=%x trig_mode=%x\n",
@@ -220,11 +302,19 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
        irqe.level = 1;
        irqe.shorthand = 0;
 
-       return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
+       if (irq == RTC_GSI && line_status) {
+               BUG_ON(ioapic->rtc_status.pending_eoi != 0);
+               ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
+                               ioapic->rtc_status.dest_map);
+               ioapic->rtc_status.pending_eoi = ret;
+       } else
+               ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
+
+       return ret;
 }
 
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
-                      int level)
+                      int level, bool line_status)
 {
        u32 old_irr;
        u32 mask = 1 << irq;
@@ -244,13 +334,20 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
                ret = 1;
        } else {
                int edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+
+               if (irq == RTC_GSI && line_status &&
+                       rtc_irq_check_coalesced(ioapic)) {
+                       ret = 0; /* coalesced */
+                       goto out;
+               }
                ioapic->irr |= mask;
                if ((edge && old_irr != ioapic->irr) ||
                    (!edge && !entry.fields.remote_irr))
-                       ret = ioapic_service(ioapic, irq);
+                       ret = ioapic_service(ioapic, irq, line_status);
                else
                        ret = 0; /* report coalesced interrupt */
        }
+out:
        trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
        spin_unlock(&ioapic->lock);
 
@@ -267,8 +364,8 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
        spin_unlock(&ioapic->lock);
 }
 
-static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
-                                    int trigger_mode)
+static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
+                       struct kvm_ioapic *ioapic, int vector, int trigger_mode)
 {
        int i;
 
@@ -278,6 +375,8 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
                if (ent->fields.vector != vector)
                        continue;
 
+               if (i == RTC_GSI)
+                       rtc_irq_eoi(ioapic, vcpu);
                /*
                 * We are dropping lock while calling ack notifiers because ack
                 * notifier callbacks for assigned devices call into IOAPIC
@@ -296,7 +395,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int vector,
                ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
                ent->fields.remote_irr = 0;
                if (!ent->fields.mask && (ioapic->irr & (1 << i)))
-                       ioapic_service(ioapic, i);
+                       ioapic_service(ioapic, i, false);
        }
 }
 
@@ -307,12 +406,12 @@ bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
        return test_bit(vector, ioapic->handled_vectors);
 }
 
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode)
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
 {
-       struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+       struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
 
        spin_lock(&ioapic->lock);
-       __kvm_ioapic_update_eoi(ioapic, vector, trigger_mode);
+       __kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode);
        spin_unlock(&ioapic->lock);
 }
 
@@ -410,7 +509,7 @@ static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
                break;
 #ifdef CONFIG_IA64
        case IOAPIC_REG_EOI:
-               __kvm_ioapic_update_eoi(ioapic, data, IOAPIC_LEVEL_TRIG);
+               __kvm_ioapic_update_eoi(NULL, ioapic, data, IOAPIC_LEVEL_TRIG);
                break;
 #endif
 
@@ -431,6 +530,7 @@ void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
        ioapic->ioregsel = 0;
        ioapic->irr = 0;
        ioapic->id = 0;
+       rtc_irq_eoi_tracking_reset(ioapic);
        update_handled_vectors(ioapic);
 }
 
@@ -496,7 +596,8 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
        spin_lock(&ioapic->lock);
        memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
        update_handled_vectors(ioapic);
-       kvm_ioapic_make_eoibitmap_request(kvm);
+       kvm_vcpu_request_scan_ioapic(kvm);
+       kvm_rtc_eoi_tracking_restore_all(ioapic);
        spin_unlock(&ioapic->lock);
        return 0;
 }
index 0400a466c50c55211323b67a5a4109784c3fcd74..615d8c995c3c1fec2bdfc4f875bcad477bee9e53 100644 (file)
@@ -34,6 +34,17 @@ struct kvm_vcpu;
 #define        IOAPIC_INIT                     0x5
 #define        IOAPIC_EXTINT                   0x7
 
+#ifdef CONFIG_X86
+#define RTC_GSI 8
+#else
+#define RTC_GSI -1U
+#endif
+
+struct rtc_status {
+       int pending_eoi;
+       DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS);
+};
+
 struct kvm_ioapic {
        u64 base_address;
        u32 ioregsel;
@@ -47,6 +58,7 @@ struct kvm_ioapic {
        void (*ack_notifier)(void *opaque, int irq);
        spinlock_t lock;
        DECLARE_BITMAP(handled_vectors, 256);
+       struct rtc_status rtc_status;
 };
 
 #ifdef DEBUG
@@ -67,24 +79,25 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
        return kvm->arch.vioapic;
 }
 
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
 int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
                int short_hand, int dest, int dest_mode);
 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
-void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
+                       int trigger_mode);
 bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
 int kvm_ioapic_init(struct kvm *kvm);
 void kvm_ioapic_destroy(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
-                      int level);
+                      int level, bool line_status);
 void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq);
+               struct kvm_lapic_irq *irq, unsigned long *dest_map);
 int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
 int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
-void kvm_ioapic_make_eoibitmap_request(struct kvm *kvm);
-void kvm_ioapic_calculate_eoi_exitmap(struct kvm_vcpu *vcpu,
-                                       u64 *eoi_exit_bitmap);
-
+void kvm_vcpu_request_scan_ioapic(struct kvm *kvm);
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
+                       u32 *tmr);
 
 #endif
index e9073cf4d0406c0faaec2faf0d87015f962494ad..e2e6b4473a96fafc98dc85aa6c2e8fa49fcd1c09 100644 (file)
@@ -35,7 +35,8 @@
 #include "ioapic.h"
 
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
-                          struct kvm *kvm, int irq_source_id, int level)
+                          struct kvm *kvm, int irq_source_id, int level,
+                          bool line_status)
 {
 #ifdef CONFIG_X86
        struct kvm_pic *pic = pic_irqchip(kvm);
@@ -46,10 +47,12 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
 }
 
 static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
-                             struct kvm *kvm, int irq_source_id, int level)
+                             struct kvm *kvm, int irq_source_id, int level,
+                             bool line_status)
 {
        struct kvm_ioapic *ioapic = kvm->arch.vioapic;
-       return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level);
+       return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
+                               line_status);
 }
 
 inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
@@ -63,7 +66,7 @@ inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
 }
 
 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
-               struct kvm_lapic_irq *irq)
+               struct kvm_lapic_irq *irq, unsigned long *dest_map)
 {
        int i, r = -1;
        struct kvm_vcpu *vcpu, *lowest = NULL;
@@ -74,7 +77,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
                irq->delivery_mode = APIC_DM_FIXED;
        }
 
-       if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r))
+       if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
                return r;
 
        kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -88,7 +91,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
                if (!kvm_is_dm_lowest_prio(irq)) {
                        if (r < 0)
                                r = 0;
-                       r += kvm_apic_set_irq(vcpu, irq);
+                       r += kvm_apic_set_irq(vcpu, irq, dest_map);
                } else if (kvm_lapic_enabled(vcpu)) {
                        if (!lowest)
                                lowest = vcpu;
@@ -98,7 +101,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
        }
 
        if (lowest)
-               r = kvm_apic_set_irq(lowest, irq);
+               r = kvm_apic_set_irq(lowest, irq, dest_map);
 
        return r;
 }
@@ -121,7 +124,7 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
 }
 
 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
-               struct kvm *kvm, int irq_source_id, int level)
+               struct kvm *kvm, int irq_source_id, int level, bool line_status)
 {
        struct kvm_lapic_irq irq;
 
@@ -130,7 +133,7 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 
        kvm_set_msi_irq(e, &irq);
 
-       return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
+       return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
 }
 
 
@@ -142,63 +145,12 @@ static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
 
        kvm_set_msi_irq(e, &irq);
 
-       if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r))
+       if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
                return r;
        else
                return -EWOULDBLOCK;
 }
 
-int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
-{
-       struct kvm_kernel_irq_routing_entry route;
-
-       if (!irqchip_in_kernel(kvm) || msi->flags != 0)
-               return -EINVAL;
-
-       route.msi.address_lo = msi->address_lo;
-       route.msi.address_hi = msi->address_hi;
-       route.msi.data = msi->data;
-
-       return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
-}
-
-/*
- * Return value:
- *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
- *  = 0   Interrupt was coalesced (previous irq is still pending)
- *  > 0   Number of CPUs interrupt was delivered to
- */
-int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
-{
-       struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
-       int ret = -1, i = 0;
-       struct kvm_irq_routing_table *irq_rt;
-
-       trace_kvm_set_irq(irq, level, irq_source_id);
-
-       /* Not possible to detect if the guest uses the PIC or the
-        * IOAPIC.  So set the bit in both. The guest will ignore
-        * writes to the unused one.
-        */
-       rcu_read_lock();
-       irq_rt = rcu_dereference(kvm->irq_routing);
-       if (irq < irq_rt->nr_rt_entries)
-               hlist_for_each_entry(e, &irq_rt->map[irq], link)
-                       irq_set[i++] = *e;
-       rcu_read_unlock();
-
-       while(i--) {
-               int r;
-               r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level);
-               if (r < 0)
-                       continue;
-
-               ret = r + ((ret < 0) ? 0 : ret);
-       }
-
-       return ret;
-}
-
 /*
  * Deliver an IRQ in an atomic context if we can, or return a failure,
  * user can retry in a process context.
@@ -236,63 +188,6 @@ int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
        return ret;
 }
 
-bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
-{
-       struct kvm_irq_ack_notifier *kian;
-       int gsi;
-
-       rcu_read_lock();
-       gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
-       if (gsi != -1)
-               hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
-                                        link)
-                       if (kian->gsi == gsi) {
-                               rcu_read_unlock();
-                               return true;
-                       }
-
-       rcu_read_unlock();
-
-       return false;
-}
-EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
-
-void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
-{
-       struct kvm_irq_ack_notifier *kian;
-       int gsi;
-
-       trace_kvm_ack_irq(irqchip, pin);
-
-       rcu_read_lock();
-       gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
-       if (gsi != -1)
-               hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
-                                        link)
-                       if (kian->gsi == gsi)
-                               kian->irq_acked(kian);
-       rcu_read_unlock();
-}
-
-void kvm_register_irq_ack_notifier(struct kvm *kvm,
-                                  struct kvm_irq_ack_notifier *kian)
-{
-       mutex_lock(&kvm->irq_lock);
-       hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
-       mutex_unlock(&kvm->irq_lock);
-       kvm_ioapic_make_eoibitmap_request(kvm);
-}
-
-void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
-                                   struct kvm_irq_ack_notifier *kian)
-{
-       mutex_lock(&kvm->irq_lock);
-       hlist_del_init_rcu(&kian->link);
-       mutex_unlock(&kvm->irq_lock);
-       synchronize_rcu();
-       kvm_ioapic_make_eoibitmap_request(kvm);
-}
-
 int kvm_request_irq_source_id(struct kvm *kvm)
 {
        unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
@@ -376,34 +271,14 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
        rcu_read_unlock();
 }
 
-void kvm_free_irq_routing(struct kvm *kvm)
-{
-       /* Called only during vm destruction. Nobody can use the pointer
-          at this stage */
-       kfree(kvm->irq_routing);
-}
-
-static int setup_routing_entry(struct kvm_irq_routing_table *rt,
-                              struct kvm_kernel_irq_routing_entry *e,
-                              const struct kvm_irq_routing_entry *ue)
+int kvm_set_routing_entry(struct kvm_irq_routing_table *rt,
+                         struct kvm_kernel_irq_routing_entry *e,
+                         const struct kvm_irq_routing_entry *ue)
 {
        int r = -EINVAL;
        int delta;
        unsigned max_pin;
-       struct kvm_kernel_irq_routing_entry *ei;
 
-       /*
-        * Do not allow GSI to be mapped to the same irqchip more than once.
-        * Allow only one to one mapping between GSI and MSI.
-        */
-       hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
-               if (ei->type == KVM_IRQ_ROUTING_MSI ||
-                   ue->type == KVM_IRQ_ROUTING_MSI ||
-                   ue->u.irqchip.irqchip == ei->irqchip.irqchip)
-                       return r;
-
-       e->gsi = ue->gsi;
-       e->type = ue->type;
        switch (ue->type) {
        case KVM_IRQ_ROUTING_IRQCHIP:
                delta = 0;
@@ -440,69 +315,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
                goto out;
        }
 
-       hlist_add_head(&e->link, &rt->map[e->gsi]);
        r = 0;
 out:
        return r;
 }
 
-
-int kvm_set_irq_routing(struct kvm *kvm,
-                       const struct kvm_irq_routing_entry *ue,
-                       unsigned nr,
-                       unsigned flags)
-{
-       struct kvm_irq_routing_table *new, *old;
-       u32 i, j, nr_rt_entries = 0;
-       int r;
-
-       for (i = 0; i < nr; ++i) {
-               if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES)
-                       return -EINVAL;
-               nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
-       }
-
-       nr_rt_entries += 1;
-
-       new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head))
-                     + (nr * sizeof(struct kvm_kernel_irq_routing_entry)),
-                     GFP_KERNEL);
-
-       if (!new)
-               return -ENOMEM;
-
-       new->rt_entries = (void *)&new->map[nr_rt_entries];
-
-       new->nr_rt_entries = nr_rt_entries;
-       for (i = 0; i < 3; i++)
-               for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++)
-                       new->chip[i][j] = -1;
-
-       for (i = 0; i < nr; ++i) {
-               r = -EINVAL;
-               if (ue->flags)
-                       goto out;
-               r = setup_routing_entry(new, &new->rt_entries[i], ue);
-               if (r)
-                       goto out;
-               ++ue;
-       }
-
-       mutex_lock(&kvm->irq_lock);
-       old = kvm->irq_routing;
-       kvm_irq_routing_update(kvm, new);
-       mutex_unlock(&kvm->irq_lock);
-
-       synchronize_rcu();
-
-       new = old;
-       r = 0;
-
-out:
-       kfree(new);
-       return r;
-}
-
 #define IOAPIC_ROUTING_ENTRY(irq) \
        { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP,  \
          .u.irqchip.irqchip = KVM_IRQCHIP_IOAPIC, .u.irqchip.pin = (irq) }
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
new file mode 100644 (file)
index 0000000..20dc9e4
--- /dev/null
+++ b/virt/kvm/irqchip.c
@@ -0,0 +1,237 @@
+/*
+ * irqchip.c: Common API for in kernel interrupt controllers
+ * Copyright (c) 2007, Intel Corporation.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright (c) 2013, Alexander Graf <agraf@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * This file is derived from virt/kvm/irq_comm.c.
+ *
+ * Authors:
+ *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *   Alexander Graf <agraf@suse.de>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <trace/events/kvm.h>
+#include "irq.h"
+
+bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
+{
+       struct kvm_irq_ack_notifier *kian;
+       int gsi;
+
+       rcu_read_lock();
+       gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+       if (gsi != -1)
+               hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
+                                        link)
+                       if (kian->gsi == gsi) {
+                               rcu_read_unlock();
+                               return true;
+                       }
+
+       rcu_read_unlock();
+
+       return false;
+}
+EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
+
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
+{
+       struct kvm_irq_ack_notifier *kian;
+       int gsi;
+
+       trace_kvm_ack_irq(irqchip, pin);
+
+       rcu_read_lock();
+       gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+       if (gsi != -1)
+               hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
+                                        link)
+                       if (kian->gsi == gsi)
+                               kian->irq_acked(kian);
+       rcu_read_unlock();
+}
+
+void kvm_register_irq_ack_notifier(struct kvm *kvm,
+                                  struct kvm_irq_ack_notifier *kian)
+{
+       mutex_lock(&kvm->irq_lock);
+       hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
+       mutex_unlock(&kvm->irq_lock);
+#ifdef __KVM_HAVE_IOAPIC
+       kvm_vcpu_request_scan_ioapic(kvm);
+#endif
+}
+
+void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
+                                   struct kvm_irq_ack_notifier *kian)
+{
+       mutex_lock(&kvm->irq_lock);
+       hlist_del_init_rcu(&kian->link);
+       mutex_unlock(&kvm->irq_lock);
+       synchronize_rcu();
+#ifdef __KVM_HAVE_IOAPIC
+       kvm_vcpu_request_scan_ioapic(kvm);
+#endif
+}
+
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+       struct kvm_kernel_irq_routing_entry route;
+
+       if (!irqchip_in_kernel(kvm) || msi->flags != 0)
+               return -EINVAL;
+
+       route.msi.address_lo = msi->address_lo;
+       route.msi.address_hi = msi->address_hi;
+       route.msi.data = msi->data;
+
+       return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false);
+}
+
+/*
+ * Return value:
+ *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
+ *  = 0   Interrupt was coalesced (previous irq is still pending)
+ *  > 0   Number of CPUs interrupt was delivered to
+ */
+int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
+               bool line_status)
+{
+       struct kvm_kernel_irq_routing_entry *e, irq_set[KVM_NR_IRQCHIPS];
+       int ret = -1, i = 0;
+       struct kvm_irq_routing_table *irq_rt;
+
+       trace_kvm_set_irq(irq, level, irq_source_id);
+
+       /* Not possible to detect if the guest uses the PIC or the
+        * IOAPIC.  So set the bit in both. The guest will ignore
+        * writes to the unused one.
+        */
+       rcu_read_lock();
+       irq_rt = rcu_dereference(kvm->irq_routing);
+       if (irq < irq_rt->nr_rt_entries)
+               hlist_for_each_entry(e, &irq_rt->map[irq], link)
+                       irq_set[i++] = *e;
+       rcu_read_unlock();
+
+       while(i--) {
+               int r;
+               r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level,
+                                  line_status);
+               if (r < 0)
+                       continue;
+
+               ret = r + ((ret < 0) ? 0 : ret);
+       }
+
+       return ret;
+}
+
+void kvm_free_irq_routing(struct kvm *kvm)
+{
+       /* Called only during vm destruction. Nobody can use the pointer
+          at this stage */
+       kfree(kvm->irq_routing);
+}
+
+static int setup_routing_entry(struct kvm_irq_routing_table *rt,
+                              struct kvm_kernel_irq_routing_entry *e,
+                              const struct kvm_irq_routing_entry *ue)
+{
+       int r = -EINVAL;
+       struct kvm_kernel_irq_routing_entry *ei;
+
+       /*
+        * Do not allow GSI to be mapped to the same irqchip more than once.
+        * Allow only one to one mapping between GSI and MSI.
+        */
+       hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
+               if (ei->type == KVM_IRQ_ROUTING_MSI ||
+                   ue->type == KVM_IRQ_ROUTING_MSI ||
+                   ue->u.irqchip.irqchip == ei->irqchip.irqchip)
+                       return r;
+
+       e->gsi = ue->gsi;
+       e->type = ue->type;
+       r = kvm_set_routing_entry(rt, e, ue);
+       if (r)
+               goto out;
+
+       hlist_add_head(&e->link, &rt->map[e->gsi]);
+       r = 0;
+out:
+       return r;
+}
+
+int kvm_set_irq_routing(struct kvm *kvm,
+                       const struct kvm_irq_routing_entry *ue,
+                       unsigned nr,
+                       unsigned flags)
+{
+       struct kvm_irq_routing_table *new, *old;
+       u32 i, j, nr_rt_entries = 0;
+       int r;
+
+       for (i = 0; i < nr; ++i) {
+               if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES)
+                       return -EINVAL;
+               nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
+       }
+
+       nr_rt_entries += 1;
+
+       new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head))
+                     + (nr * sizeof(struct kvm_kernel_irq_routing_entry)),
+                     GFP_KERNEL);
+
+       if (!new)
+               return -ENOMEM;
+
+       new->rt_entries = (void *)&new->map[nr_rt_entries];
+
+       new->nr_rt_entries = nr_rt_entries;
+       for (i = 0; i < KVM_NR_IRQCHIPS; i++)
+               for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++)
+                       new->chip[i][j] = -1;
+
+       for (i = 0; i < nr; ++i) {
+               r = -EINVAL;
+               if (ue->flags)
+                       goto out;
+               r = setup_routing_entry(new, &new->rt_entries[i], ue);
+               if (r)
+                       goto out;
+               ++ue;
+       }
+
+       mutex_lock(&kvm->irq_lock);
+       old = kvm->irq_routing;
+       kvm_irq_routing_update(kvm, new);
+       mutex_unlock(&kvm->irq_lock);
+
+       synchronize_rcu();
+
+       new = old;
+       r = 0;
+
+out:
+       kfree(new);
+       return r;
+}
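
The kvm_set_irq_routing() path above is reached from userspace through the KVM_SET_GSI_ROUTING vm ioctl, whose handler is added to kvm_vm_ioctl() in kvm_main.c below. As an illustrative sketch only (not part of this commit): the caller allocates a struct kvm_irq_routing followed by its entries and hands it to the vm fd; both the top-level flags and each entry's flags must be zero, which calloc() takes care of here. vm_fd, the GSI number and the MSI address/data are placeholder values.

/*
 * Minimal userspace-side use of KVM_SET_GSI_ROUTING (illustrative only).
 */
#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int set_one_msi_route(int vm_fd)
{
	struct kvm_irq_routing *r;
	int ret;

	/* Header plus a single routing entry; calloc() zeroes all flags. */
	r = calloc(1, sizeof(*r) + sizeof(struct kvm_irq_routing_entry));
	if (!r)
		return -1;

	r->nr = 1;
	r->entries[0].gsi = 5;				/* placeholder GSI */
	r->entries[0].type = KVM_IRQ_ROUTING_MSI;
	r->entries[0].u.msi.address_lo = 0xfee00000;	/* placeholder */
	r->entries[0].u.msi.address_hi = 0;
	r->entries[0].u.msi.data = 0x4041;		/* placeholder */

	ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, r);
	free(r);
	return ret;
}
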
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f18013f09e68423d68d4033d1e22fe1f8701886b..45f09362ee7be02df67171efa3508fa225129499 100644 (file)
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -217,9 +217,9 @@ void kvm_make_mclock_inprogress_request(struct kvm *kvm)
        make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
 }
 
-void kvm_make_update_eoibitmap_request(struct kvm *kvm)
+void kvm_make_scan_ioapic_request(struct kvm *kvm)
 {
-       make_all_cpus_request(kvm, KVM_REQ_EOIBITMAP);
+       make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
 }
 
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
@@ -244,6 +244,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 
        kvm_vcpu_set_in_spin_loop(vcpu, false);
        kvm_vcpu_set_dy_eligible(vcpu, false);
+       vcpu->preempted = false;
 
        r = kvm_arch_vcpu_init(vcpu);
        if (r < 0)
@@ -503,6 +504,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
        mutex_init(&kvm->irq_lock);
        mutex_init(&kvm->slots_lock);
        atomic_set(&kvm->users_count, 1);
+       INIT_LIST_HEAD(&kvm->devices);
 
        r = kvm_init_mmu_notifier(kvm);
        if (r)
@@ -580,6 +582,19 @@ void kvm_free_physmem(struct kvm *kvm)
        kfree(kvm->memslots);
 }
 
+static void kvm_destroy_devices(struct kvm *kvm)
+{
+       struct list_head *node, *tmp;
+
+       list_for_each_safe(node, tmp, &kvm->devices) {
+               struct kvm_device *dev =
+                       list_entry(node, struct kvm_device, vm_node);
+
+               list_del(node);
+               dev->ops->destroy(dev);
+       }
+}
+
 static void kvm_destroy_vm(struct kvm *kvm)
 {
        int i;
@@ -599,6 +614,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
        kvm_arch_flush_shadow_all(kvm);
 #endif
        kvm_arch_destroy_vm(kvm);
+       kvm_destroy_devices(kvm);
        kvm_free_physmem(kvm);
        cleanup_srcu_struct(&kvm->srcu);
        kvm_arch_free_vm(kvm);
@@ -718,24 +734,6 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
        return old_memslots; 
 }
 
-/*
- * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations:
- * - create a new memory slot
- * - delete an existing memory slot
- * - modify an existing memory slot
- *   -- move it in the guest physical memory space
- *   -- just change its flags
- *
- * Since flags can be changed by some of these operations, the following
- * differentiation is the best we can do for __kvm_set_memory_region():
- */
-enum kvm_mr_change {
-       KVM_MR_CREATE,
-       KVM_MR_DELETE,
-       KVM_MR_MOVE,
-       KVM_MR_FLAGS_ONLY,
-};
-
 /*
  * Allocate some memory and give it an address in the guest physical address
  * space.
@@ -745,8 +743,7 @@ enum kvm_mr_change {
  * Must be called holding mmap_sem for write.
  */
 int __kvm_set_memory_region(struct kvm *kvm,
-                           struct kvm_userspace_memory_region *mem,
-                           bool user_alloc)
+                           struct kvm_userspace_memory_region *mem)
 {
        int r;
        gfn_t base_gfn;
@@ -767,7 +764,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
        if (mem->guest_phys_addr & (PAGE_SIZE - 1))
                goto out;
        /* We can read the guest memory with __xxx_user() later on. */
-       if (user_alloc &&
+       if ((mem->slot < KVM_USER_MEM_SLOTS) &&
            ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
             !access_ok(VERIFY_WRITE,
                        (void __user *)(unsigned long)mem->userspace_addr,
@@ -875,7 +872,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
                slots = old_memslots;
        }
 
-       r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
+       r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
        if (r)
                goto out_slots;
 
@@ -915,7 +912,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
        old_memslots = install_new_memslots(kvm, slots, &new);
 
-       kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);
+       kvm_arch_commit_memory_region(kvm, mem, &old, change);
 
        kvm_free_physmem_slot(&old, &new);
        kfree(old_memslots);
@@ -932,26 +929,23 @@ out:
 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
 
 int kvm_set_memory_region(struct kvm *kvm,
-                         struct kvm_userspace_memory_region *mem,
-                         bool user_alloc)
+                         struct kvm_userspace_memory_region *mem)
 {
        int r;
 
        mutex_lock(&kvm->slots_lock);
-       r = __kvm_set_memory_region(kvm, mem, user_alloc);
+       r = __kvm_set_memory_region(kvm, mem);
        mutex_unlock(&kvm->slots_lock);
        return r;
 }
 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
 
 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
-                                  struct
-                                  kvm_userspace_memory_region *mem,
-                                  bool user_alloc)
+                                  struct kvm_userspace_memory_region *mem)
 {
        if (mem->slot >= KVM_USER_MEM_SLOTS)
                return -EINVAL;
-       return kvm_set_memory_region(kvm, mem, user_alloc);
+       return kvm_set_memory_region(kvm, mem);
 }
 
 int kvm_get_dirty_log(struct kvm *kvm,
@@ -1099,7 +1093,7 @@ static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
        return __copy_from_user_inatomic(data, hva, len);
 }
 
-int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
+static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
        unsigned long start, int write, struct page **page)
 {
        int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;
@@ -1719,6 +1713,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
                        smp_send_reschedule(cpu);
        put_cpu();
 }
+EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
 #endif /* !CONFIG_S390 */
 
 void kvm_resched(struct kvm_vcpu *vcpu)
@@ -1816,6 +1811,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
                                continue;
                        } else if (pass && i > last_boosted_vcpu)
                                break;
+                       if (!ACCESS_ONCE(vcpu->preempted))
+                               continue;
                        if (vcpu == me)
                                continue;
                        if (waitqueue_active(&vcpu->wq))
@@ -2204,6 +2201,119 @@ out:
 }
 #endif
 
+static int kvm_device_ioctl_attr(struct kvm_device *dev,
+                                int (*accessor)(struct kvm_device *dev,
+                                                struct kvm_device_attr *attr),
+                                unsigned long arg)
+{
+       struct kvm_device_attr attr;
+
+       if (!accessor)
+               return -EPERM;
+
+       if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+               return -EFAULT;
+
+       return accessor(dev, &attr);
+}
+
+static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
+                            unsigned long arg)
+{
+       struct kvm_device *dev = filp->private_data;
+
+       switch (ioctl) {
+       case KVM_SET_DEVICE_ATTR:
+               return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);
+       case KVM_GET_DEVICE_ATTR:
+               return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);
+       case KVM_HAS_DEVICE_ATTR:
+               return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);
+       default:
+               if (dev->ops->ioctl)
+                       return dev->ops->ioctl(dev, ioctl, arg);
+
+               return -ENOTTY;
+       }
+}
+
+static int kvm_device_release(struct inode *inode, struct file *filp)
+{
+       struct kvm_device *dev = filp->private_data;
+       struct kvm *kvm = dev->kvm;
+
+       kvm_put_kvm(kvm);
+       return 0;
+}
+
+static const struct file_operations kvm_device_fops = {
+       .unlocked_ioctl = kvm_device_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl = kvm_device_ioctl,
+#endif
+       .release = kvm_device_release,
+};
+
+struct kvm_device *kvm_device_from_filp(struct file *filp)
+{
+       if (filp->f_op != &kvm_device_fops)
+               return NULL;
+
+       return filp->private_data;
+}
+
+static int kvm_ioctl_create_device(struct kvm *kvm,
+                                  struct kvm_create_device *cd)
+{
+       struct kvm_device_ops *ops = NULL;
+       struct kvm_device *dev;
+       bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
+       int ret;
+
+       switch (cd->type) {
+#ifdef CONFIG_KVM_MPIC
+       case KVM_DEV_TYPE_FSL_MPIC_20:
+       case KVM_DEV_TYPE_FSL_MPIC_42:
+               ops = &kvm_mpic_ops;
+               break;
+#endif
+#ifdef CONFIG_KVM_XICS
+       case KVM_DEV_TYPE_XICS:
+               ops = &kvm_xics_ops;
+               break;
+#endif
+       default:
+               return -ENODEV;
+       }
+
+       if (test)
+               return 0;
+
+       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+       if (!dev)
+               return -ENOMEM;
+
+       dev->ops = ops;
+       dev->kvm = kvm;
+
+       ret = ops->create(dev, cd->type);
+       if (ret < 0) {
+               kfree(dev);
+               return ret;
+       }
+
+       ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR);
+       if (ret < 0) {
+               ops->destroy(dev);
+               return ret;
+       }
+
+       list_add(&dev->vm_node, &kvm->devices);
+       kvm_get_kvm(kvm);
+       cd->fd = ret;
+       return 0;
+}
+
 static long kvm_vm_ioctl(struct file *filp,
                           unsigned int ioctl, unsigned long arg)
 {
@@ -2225,7 +2335,7 @@ static long kvm_vm_ioctl(struct file *filp,
                                                sizeof kvm_userspace_mem))
                        goto out;
 
-               r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, true);
+               r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem);
                break;
        }
        case KVM_GET_DIRTY_LOG: {
@@ -2304,7 +2414,8 @@ static long kvm_vm_ioctl(struct file *filp,
                if (copy_from_user(&irq_event, argp, sizeof irq_event))
                        goto out;
 
-               r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
+               r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
+                                       ioctl == KVM_IRQ_LINE_STATUS);
                if (r)
                        goto out;
 
@@ -2318,6 +2429,54 @@ static long kvm_vm_ioctl(struct file *filp,
                break;
        }
 #endif
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
+       case KVM_SET_GSI_ROUTING: {
+               struct kvm_irq_routing routing;
+               struct kvm_irq_routing __user *urouting;
+               struct kvm_irq_routing_entry *entries;
+
+               r = -EFAULT;
+               if (copy_from_user(&routing, argp, sizeof(routing)))
+                       goto out;
+               r = -EINVAL;
+               if (routing.nr >= KVM_MAX_IRQ_ROUTES)
+                       goto out;
+               if (routing.flags)
+                       goto out;
+               r = -ENOMEM;
+               entries = vmalloc(routing.nr * sizeof(*entries));
+               if (!entries)
+                       goto out;
+               r = -EFAULT;
+               urouting = argp;
+               if (copy_from_user(entries, urouting->entries,
+                                  routing.nr * sizeof(*entries)))
+                       goto out_free_irq_routing;
+               r = kvm_set_irq_routing(kvm, entries, routing.nr,
+                                       routing.flags);
+       out_free_irq_routing:
+               vfree(entries);
+               break;
+       }
+#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
+       case KVM_CREATE_DEVICE: {
+               struct kvm_create_device cd;
+
+               r = -EFAULT;
+               if (copy_from_user(&cd, argp, sizeof(cd)))
+                       goto out;
+
+               r = kvm_ioctl_create_device(kvm, &cd);
+               if (r)
+                       goto out;
+
+               r = -EFAULT;
+               if (copy_to_user(argp, &cd, sizeof(cd)))
+                       goto out;
+
+               r = 0;
+               break;
+       }
        default:
                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
                if (r == -ENOTTY)
@@ -2446,9 +2605,12 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
        case KVM_CAP_INTERNAL_ERROR_DATA:
 #ifdef CONFIG_HAVE_KVM_MSI
        case KVM_CAP_SIGNAL_MSI:
+#endif
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
+       case KVM_CAP_IRQFD_RESAMPLE:
 #endif
                return 1;
-#ifdef KVM_CAP_IRQ_ROUTING
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
        case KVM_CAP_IRQ_ROUTING:
                return KVM_MAX_IRQ_ROUTES;
 #endif
@@ -2618,14 +2780,6 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
        return NOTIFY_OK;
 }
 
-
-asmlinkage void kvm_spurious_fault(void)
-{
-       /* Fault while not rebooting.  We want the trace. */
-       BUG();
-}
-EXPORT_SYMBOL_GPL(kvm_spurious_fault);
-
 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
                      void *v)
 {
@@ -2658,7 +2812,7 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
        kfree(bus);
 }
 
-int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
+static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
 {
        const struct kvm_io_range *r1 = p1;
        const struct kvm_io_range *r2 = p2;
@@ -2670,7 +2824,7 @@ int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
        return 0;
 }
 
-int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
+static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
                          gpa_t addr, int len)
 {
        bus->range[bus->dev_count++] = (struct kvm_io_range) {
@@ -2685,7 +2839,7 @@ int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
        return 0;
 }
 
-int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
+static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
                             gpa_t addr, int len)
 {
        struct kvm_io_range *range, key;
@@ -2929,6 +3083,8 @@ struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
 {
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
+       if (vcpu->preempted)
+               vcpu->preempted = false;
 
        kvm_arch_vcpu_load(vcpu, cpu);
 }
@@ -2938,6 +3094,8 @@ static void kvm_sched_out(struct preempt_notifier *pn,
 {
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
 
+       if (current->state == TASK_RUNNING)
+               vcpu->preempted = true;
        kvm_arch_vcpu_put(vcpu);
 }
 
@@ -2947,6 +3105,9 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
        int r;
        int cpu;
 
+       r = kvm_irqfd_init();
+       if (r)
+               goto out_irqfd;
        r = kvm_arch_init(opaque);
        if (r)
                goto out_fail;
@@ -3027,6 +3188,8 @@ out_free_0a:
 out_free_0:
        kvm_arch_exit();
 out_fail:
+       kvm_irqfd_exit();
+out_irqfd:
        return r;
 }
 EXPORT_SYMBOL_GPL(kvm_init);
@@ -3043,6 +3206,7 @@ void kvm_exit(void)
        on_each_cpu(hardware_disable_nolock, NULL, 1);
        kvm_arch_hardware_unsetup();
        kvm_arch_exit();
+       kvm_irqfd_exit();
        free_cpumask_var(cpus_hardware_enabled);
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
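
The device control API added above (kvm_ioctl_create_device() and kvm_device_fops) is driven from userspace with KVM_CREATE_DEVICE on the vm fd, followed by KVM_SET/GET/HAS_DEVICE_ATTR on the returned device fd. A rough sketch, assuming an in-kernel MPIC (KVM_DEV_TYPE_FSL_MPIC_20) and the KVM_DEV_MPIC_GRP_MISC / KVM_DEV_MPIC_BASE_ADDR attribute names from the accompanying device documentation; vm_fd and mpic_base are caller-supplied placeholders, not values taken from this commit.

/*
 * Create an in-kernel MPIC with the new device control API and program
 * its base address (illustrative only).
 */
#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

static int create_mpic(int vm_fd, uint64_t mpic_base)
{
	struct kvm_create_device cd = {
		.type	= KVM_DEV_TYPE_FSL_MPIC_20,
		.flags	= 0,	/* KVM_CREATE_DEVICE_TEST would only probe */
	};
	struct kvm_device_attr attr = {
		.flags	= 0,
		.group	= KVM_DEV_MPIC_GRP_MISC,
		.attr	= KVM_DEV_MPIC_BASE_ADDR,
		.addr	= (uint64_t)(unsigned long)&mpic_base,
	};
	int ret;

	ret = ioctl(vm_fd, KVM_CREATE_DEVICE, &cd);
	if (ret < 0)
		return ret;

	ret = ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
	if (ret < 0)
		return ret;

	return cd.fd;	/* device fd; kept open for the VM's lifetime */
}

Note that, as implemented in kvm_ioctl_create_device() above, the created device takes a reference on the VM (kvm_get_kvm()) and is only destroyed from kvm_destroy_devices() when the VM itself goes away.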