return 0;
}
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level);
+ return 0;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
goto out;
}
break;
- case KVM_IRQ_LINE_STATUS:
- case KVM_IRQ_LINE: {
- struct kvm_irq_level irq_event;
-
- r = -EFAULT;
- if (copy_from_user(&irq_event, argp, sizeof irq_event))
- goto out;
- r = -ENXIO;
- if (irqchip_in_kernel(kvm)) {
- __s32 status;
- status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event.irq, irq_event.level);
- if (ioctl == KVM_IRQ_LINE_STATUS) {
- r = -EFAULT;
- irq_event.status = status;
- if (copy_to_user(argp, &irq_event,
- sizeof irq_event))
- goto out;
- }
- r = 0;
- }
- break;
- }
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
struct kvm_irqchip chip;
struct kvm;
extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+extern int kvm_unmap_hva_range(struct kvm *kvm,
+ unsigned long start, unsigned long end);
extern int kvm_age_hva(struct kvm *kvm, unsigned long hva);
extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
goto out_put;
}
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
- unsigned long gfn))
+static int kvm_handle_hva_range(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ int (*handler)(struct kvm *kvm,
+ unsigned long *rmapp,
+ unsigned long gfn))
{
int ret;
int retval = 0;
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
- unsigned long start = memslot->userspace_addr;
- unsigned long end;
+ unsigned long hva_start, hva_end;
+ gfn_t gfn, gfn_end;
- end = start + (memslot->npages << PAGE_SHIFT);
- if (hva >= start && hva < end) {
- gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ hva_start = max(start, memslot->userspace_addr);
+ hva_end = min(end, memslot->userspace_addr +
+ (memslot->npages << PAGE_SHIFT));
+ if (hva_start >= hva_end)
+ continue;
+ /*
+ * {gfn(page) | page intersects with [hva_start, hva_end)} =
+ * {gfn, gfn+1, ..., gfn_end-1}.
+ */
+ gfn = hva_to_gfn_memslot(hva_start, memslot);
+ gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+
+ for (; gfn < gfn_end; ++gfn) {
+ gfn_t gfn_offset = gfn - memslot->base_gfn;
- ret = handler(kvm, &memslot->rmap[gfn_offset],
- memslot->base_gfn + gfn_offset);
+ ret = handler(kvm, &memslot->rmap[gfn_offset], gfn);
retval |= ret;
}
}
return retval;
}
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long gfn))
+{
+ return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
+}
+
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long gfn)
{
return 0;
}
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+ if (kvm->arch.using_mmu_notifiers)
+ kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
+ return 0;
+}
+
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long gfn)
{
if (likely(!pfnmap)) {
unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
- pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn);
+ pfn = gfn_to_pfn_memslot(slot, gfn);
if (is_error_pfn(pfn)) {
printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
(long)gfn);
extern unsigned long thread_saved_pc(struct task_struct *t);
extern void show_code(struct pt_regs *regs);
+extern int insn_to_mnemonic(unsigned char *instruction, char buf[8]);
unsigned long get_wchan(struct task_struct *p);
#define task_pt_regs(tsk) ((struct pt_regs *) \
return NULL;
}
+/**
+ * insn_to_mnemonic - decode an s390 instruction
+ * @instruction: instruction to decode
+ * @buf: buffer to fill with mnemonic
+ *
+ * Decode the instruction at @instruction and store the corresponding
+ * mnemonic into @buf.
+ * @buf is left unchanged if the instruction could not be decoded.
+ * Returns:
+ * %0 on success, %-ENOENT if the instruction was not found.
+ */
+int insn_to_mnemonic(unsigned char *instruction, char buf[8])
+{
+ struct insn *insn;
+
+ insn = find_insn(instruction);
+ if (!insn)
+ return -ENOENT;
+ if (insn->name[0] == '\0')
+ snprintf(buf, sizeof(buf), "%s",
+ long_insn_name[(int) insn->name[1]]);
+ else
+ snprintf(buf, sizeof(buf), "%.5s", insn->name);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(insn_to_mnemonic);
+
static int print_insn(char *buffer, unsigned char *code, unsigned long addr)
{
struct insn *insn;
depends on HAVE_KVM && EXPERIMENTAL
select PREEMPT_NOTIFIERS
select ANON_INODES
+ select HAVE_KVM_CPU_RELAX_INTERCEPT
---help---
Support hosting paravirtualized guest machines using the SIE
virtualization capability on the mainframe. This should work
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include "kvm-s390.h"
+#include "trace.h"
+#include "trace-s390.h"
static int diag_release_pages(struct kvm_vcpu *vcpu)
{
vcpu->run->exit_reason = KVM_EXIT_S390_RESET;
VCPU_EVENT(vcpu, 3, "requesting userspace resets %llx",
vcpu->run->s390_reset_flags);
+ trace_kvm_s390_request_resets(vcpu->run->s390_reset_flags);
return -EREMOTE;
}
{
int code = (vcpu->arch.sie_block->ipb & 0xfff0000) >> 16;
+ trace_kvm_s390_handle_diag(vcpu, code);
switch (code) {
case 0x10:
return diag_release_pages(vcpu);
#include "kvm-s390.h"
#include "gaccess.h"
+#include "trace.h"
+#include "trace-s390.h"
static int handle_lctlg(struct kvm_vcpu *vcpu)
{
VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2,
disp2);
+ trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, useraddr);
do {
rc = get_guest_u64(vcpu, useraddr,
VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x,b2:%x,d2:%x", reg1, reg3, base2,
disp2);
+ trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, useraddr);
reg = reg1;
do {
vcpu->stat.exit_stop_request++;
spin_lock_bh(&vcpu->arch.local_int.lock);
+ trace_kvm_s390_stop_request(vcpu->arch.local_int.action_bits);
+
if (vcpu->arch.local_int.action_bits & ACTION_RELOADVCPU_ON_STOP) {
vcpu->arch.local_int.action_bits &= ~ACTION_RELOADVCPU_ON_STOP;
rc = SIE_INTERCEPT_RERUNVCPU;
int rc;
vcpu->stat.exit_validity++;
+ trace_kvm_s390_intercept_validity(vcpu, viwhy);
if (viwhy == 0x37) {
vmaddr = gmap_fault(vcpu->arch.sie_block->prefix,
vcpu->arch.gmap);
intercept_handler_t handler;
vcpu->stat.exit_instruction++;
+ trace_kvm_s390_intercept_instruction(vcpu,
+ vcpu->arch.sie_block->ipa,
+ vcpu->arch.sie_block->ipb);
handler = instruction_handlers[vcpu->arch.sie_block->ipa >> 8];
if (handler)
return handler(vcpu);
static int handle_prog(struct kvm_vcpu *vcpu)
{
vcpu->stat.exit_program_interruption++;
+ trace_kvm_s390_intercept_prog(vcpu, vcpu->arch.sie_block->iprcc);
return kvm_s390_inject_program_int(vcpu, vcpu->arch.sie_block->iprcc);
}
#include <asm/uaccess.h>
#include "kvm-s390.h"
#include "gaccess.h"
+#include "trace-s390.h"
static int psw_extint_disabled(struct kvm_vcpu *vcpu)
{
case KVM_S390_INT_EMERGENCY:
VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp emerg");
vcpu->stat.deliver_emergency_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->emerg.code, 0);
rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1201);
if (rc == -EFAULT)
exception = 1;
case KVM_S390_INT_EXTERNAL_CALL:
VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call");
vcpu->stat.deliver_external_call++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->extcall.code, 0);
rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x1202);
if (rc == -EFAULT)
exception = 1;
VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x",
inti->ext.ext_params);
vcpu->stat.deliver_service_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->ext.ext_params, 0);
rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2401);
if (rc == -EFAULT)
exception = 1;
VCPU_EVENT(vcpu, 4, "interrupt: virtio parm:%x,parm64:%llx",
inti->ext.ext_params, inti->ext.ext_params2);
vcpu->stat.deliver_virtio_interrupt++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->ext.ext_params,
+ inti->ext.ext_params2);
rc = put_guest_u16(vcpu, __LC_EXT_INT_CODE, 0x2603);
if (rc == -EFAULT)
exception = 1;
case KVM_S390_SIGP_STOP:
VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu stop");
vcpu->stat.deliver_stop_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ 0, 0);
__set_intercept_indicator(vcpu, inti);
break;
VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x",
inti->prefix.address);
vcpu->stat.deliver_prefix_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->prefix.address, 0);
kvm_s390_set_prefix(vcpu, inti->prefix.address);
break;
case KVM_S390_RESTART:
VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu restart");
vcpu->stat.deliver_restart_signal++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ 0, 0);
rc = copy_to_guest(vcpu, offsetof(struct _lowcore,
restart_old_psw), &vcpu->arch.sie_block->gpsw, sizeof(psw_t));
if (rc == -EFAULT)
inti->pgm.code,
table[vcpu->arch.sie_block->ipa >> 14]);
vcpu->stat.deliver_program_int++;
+ trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type,
+ inti->pgm.code, 0);
rc = put_guest_u16(vcpu, __LC_PGM_INT_CODE, inti->pgm.code);
if (rc == -EFAULT)
exception = 1;
inti->pgm.code = code;
VCPU_EVENT(vcpu, 3, "inject: program check %d (from kernel)", code);
+ trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, inti->type, code, 0, 1);
spin_lock_bh(&li->lock);
list_add(&inti->list, &li->list);
atomic_set(&li->active, 1);
kfree(inti);
return -EINVAL;
}
+ trace_kvm_s390_inject_vm(s390int->type, s390int->parm, s390int->parm64,
+ 2);
mutex_lock(&kvm->lock);
fi = &kvm->arch.float_int;
kfree(inti);
return -EINVAL;
}
+ trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, s390int->type, s390int->parm,
+ s390int->parm64, 2);
mutex_lock(&vcpu->kvm->lock);
li = &vcpu->arch.local_int;
#include "kvm-s390.h"
#include "gaccess.h"
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+#include "trace-s390.h"
+
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
struct kvm_stats_debugfs_item debugfs_entries[] = {
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
VCPU_EVENT(vcpu, 3, "%s", "free cpu");
+ trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id);
if (!kvm_is_ucontrol(vcpu->kvm)) {
clear_bit(63 - vcpu->vcpu_id,
(unsigned long *) &vcpu->kvm->arch.sca->mcn);
goto out_free_sie_block;
VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu,
vcpu->arch.sie_block);
+ trace_kvm_s390_create_vcpu(id, vcpu, vcpu->arch.sie_block);
return vcpu;
out_free_sie_block:
local_irq_enable();
VCPU_EVENT(vcpu, 6, "entering sie flags %x",
atomic_read(&vcpu->arch.sie_block->cpuflags));
+ trace_kvm_s390_sie_enter(vcpu,
+ atomic_read(&vcpu->arch.sie_block->cpuflags));
rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
if (rc) {
if (kvm_is_ucontrol(vcpu->kvm)) {
rc = SIE_INTERCEPT_UCONTROL;
} else {
VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
+ trace_kvm_s390_sie_fault(vcpu);
kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
rc = 0;
}
}
VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
vcpu->arch.sie_block->icptcode);
+ trace_kvm_s390_sie_exit(vcpu, vcpu->arch.sie_block->icptcode);
local_irq_disable();
kvm_guest_exit();
local_irq_enable();
#include <asm/sysinfo.h>
#include "gaccess.h"
#include "kvm-s390.h"
+#include "trace.h"
static int handle_set_prefix(struct kvm_vcpu *vcpu)
{
kvm_s390_set_prefix(vcpu, address);
VCPU_EVENT(vcpu, 5, "setting prefix to %x", address);
+ trace_kvm_s390_handle_prefix(vcpu, 1, address);
out:
return 0;
}
}
VCPU_EVENT(vcpu, 5, "storing prefix to %x", address);
+ trace_kvm_s390_handle_prefix(vcpu, 0, address);
out:
return 0;
}
}
VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", useraddr);
+ trace_kvm_s390_handle_stap(vcpu, useraddr);
out:
return 0;
}
&facility_list, sizeof(facility_list));
if (rc == -EFAULT)
kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
- else
+ else {
VCPU_EVENT(vcpu, 5, "store facility list value %x",
facility_list);
+ trace_kvm_s390_handle_stfl(vcpu, facility_list);
+ }
return 0;
}
kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
goto out_mem;
}
+ trace_kvm_s390_handle_stsi(vcpu, fc, sel1, sel2, operand2);
free_page(mem);
vcpu->arch.sie_block->gpsw.mask &= ~(3ul << 44);
vcpu->run->s.regs.gprs[0] = 0;
#include <asm/sigp.h>
#include "gaccess.h"
#include "kvm-s390.h"
+#include "trace.h"
static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
u64 *reg)
else
parameter = vcpu->run->s.regs.gprs[r1 + 1];
+ trace_kvm_s390_handle_sigp(vcpu, order_code, cpu_addr, parameter);
switch (order_code) {
case SIGP_SENSE:
vcpu->stat.instruction_sigp_sense++;
--- /dev/null
+#if !defined(_TRACE_KVMS390_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVMS390_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm-s390
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace-s390
+
+/*
+ * Trace point for the creation of the kvm instance.
+ */
+TRACE_EVENT(kvm_s390_create_vm,
+ TP_PROTO(unsigned long type),
+ TP_ARGS(type),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, type)
+ ),
+
+ TP_fast_assign(
+ __entry->type = type;
+ ),
+
+ TP_printk("create vm%s",
+ __entry->type & KVM_VM_S390_UCONTROL ? " (UCONTROL)" : "")
+ );
+
+/*
+ * Trace points for creation and destruction of vpcus.
+ */
+TRACE_EVENT(kvm_s390_create_vcpu,
+ TP_PROTO(unsigned int id, struct kvm_vcpu *vcpu,
+ struct kvm_s390_sie_block *sie_block),
+ TP_ARGS(id, vcpu, sie_block),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, id)
+ __field(struct kvm_vcpu *, vcpu)
+ __field(struct kvm_s390_sie_block *, sie_block)
+ ),
+
+ TP_fast_assign(
+ __entry->id = id;
+ __entry->vcpu = vcpu;
+ __entry->sie_block = sie_block;
+ ),
+
+ TP_printk("create cpu %d at %p, sie block at %p", __entry->id,
+ __entry->vcpu, __entry->sie_block)
+ );
+
+TRACE_EVENT(kvm_s390_destroy_vcpu,
+ TP_PROTO(unsigned int id),
+ TP_ARGS(id),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, id)
+ ),
+
+ TP_fast_assign(
+ __entry->id = id;
+ ),
+
+ TP_printk("destroy cpu %d", __entry->id)
+ );
+
+/*
+ * Trace points for injection of interrupts, either per machine or
+ * per vcpu.
+ */
+
+#define kvm_s390_int_type \
+ {KVM_S390_SIGP_STOP, "sigp stop"}, \
+ {KVM_S390_PROGRAM_INT, "program interrupt"}, \
+ {KVM_S390_SIGP_SET_PREFIX, "sigp set prefix"}, \
+ {KVM_S390_RESTART, "sigp restart"}, \
+ {KVM_S390_INT_VIRTIO, "virtio interrupt"}, \
+ {KVM_S390_INT_SERVICE, "sclp interrupt"}, \
+ {KVM_S390_INT_EMERGENCY, "sigp emergency"}, \
+ {KVM_S390_INT_EXTERNAL_CALL, "sigp ext call"}
+
+TRACE_EVENT(kvm_s390_inject_vm,
+ TP_PROTO(__u64 type, __u32 parm, __u64 parm64, int who),
+ TP_ARGS(type, parm, parm64, who),
+
+ TP_STRUCT__entry(
+ __field(__u32, inttype)
+ __field(__u32, parm)
+ __field(__u64, parm64)
+ __field(int, who)
+ ),
+
+ TP_fast_assign(
+ __entry->inttype = type & 0x00000000ffffffff;
+ __entry->parm = parm;
+ __entry->parm64 = parm64;
+ __entry->who = who;
+ ),
+
+ TP_printk("inject%s: type:%x (%s) parm:%x parm64:%llx",
+ (__entry->who == 1) ? " (from kernel)" :
+ (__entry->who == 2) ? " (from user)" : "",
+ __entry->inttype,
+ __print_symbolic(__entry->inttype, kvm_s390_int_type),
+ __entry->parm, __entry->parm64)
+ );
+
+TRACE_EVENT(kvm_s390_inject_vcpu,
+ TP_PROTO(unsigned int id, __u64 type, __u32 parm, __u64 parm64, \
+ int who),
+ TP_ARGS(id, type, parm, parm64, who),
+
+ TP_STRUCT__entry(
+ __field(int, id)
+ __field(__u32, inttype)
+ __field(__u32, parm)
+ __field(__u64, parm64)
+ __field(int, who)
+ ),
+
+ TP_fast_assign(
+ __entry->id = id;
+ __entry->inttype = type & 0x00000000ffffffff;
+ __entry->parm = parm;
+ __entry->parm64 = parm64;
+ __entry->who = who;
+ ),
+
+ TP_printk("inject%s (vcpu %d): type:%x (%s) parm:%x parm64:%llx",
+ (__entry->who == 1) ? " (from kernel)" :
+ (__entry->who == 2) ? " (from user)" : "",
+ __entry->id, __entry->inttype,
+ __print_symbolic(__entry->inttype, kvm_s390_int_type),
+ __entry->parm, __entry->parm64)
+ );
+
+/*
+ * Trace point for the actual delivery of interrupts.
+ */
+TRACE_EVENT(kvm_s390_deliver_interrupt,
+ TP_PROTO(unsigned int id, __u64 type, __u32 data0, __u64 data1),
+ TP_ARGS(id, type, data0, data1),
+
+ TP_STRUCT__entry(
+ __field(int, id)
+ __field(__u32, inttype)
+ __field(__u32, data0)
+ __field(__u64, data1)
+ ),
+
+ TP_fast_assign(
+ __entry->id = id;
+ __entry->inttype = type & 0x00000000ffffffff;
+ __entry->data0 = data0;
+ __entry->data1 = data1;
+ ),
+
+ TP_printk("deliver interrupt (vcpu %d): type:%x (%s) " \
+ "data:%08x %016llx",
+ __entry->id, __entry->inttype,
+ __print_symbolic(__entry->inttype, kvm_s390_int_type),
+ __entry->data0, __entry->data1)
+ );
+
+/*
+ * Trace point for resets that may be requested from userspace.
+ */
+TRACE_EVENT(kvm_s390_request_resets,
+ TP_PROTO(__u64 resets),
+ TP_ARGS(resets),
+
+ TP_STRUCT__entry(
+ __field(__u64, resets)
+ ),
+
+ TP_fast_assign(
+ __entry->resets = resets;
+ ),
+
+ TP_printk("requesting userspace resets %llx",
+ __entry->resets)
+ );
+
+/*
+ * Trace point for a vcpu's stop requests.
+ */
+TRACE_EVENT(kvm_s390_stop_request,
+ TP_PROTO(unsigned int action_bits),
+ TP_ARGS(action_bits),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, action_bits)
+ ),
+
+ TP_fast_assign(
+ __entry->action_bits = action_bits;
+ ),
+
+ TP_printk("stop request, action_bits = %08x",
+ __entry->action_bits)
+ );
+
+
+#endif /* _TRACE_KVMS390_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
--- /dev/null
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+#include <asm/sigp.h>
+#include <asm/debug.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/*
+ * Helpers for vcpu-specific tracepoints containing the same information
+ * as s390dbf VCPU_EVENTs.
+ */
+#define VCPU_PROTO_COMMON struct kvm_vcpu *vcpu
+#define VCPU_ARGS_COMMON vcpu
+#define VCPU_FIELD_COMMON __field(int, id) \
+ __field(unsigned long, pswmask) \
+ __field(unsigned long, pswaddr)
+#define VCPU_ASSIGN_COMMON do { \
+ __entry->id = vcpu->vcpu_id; \
+ __entry->pswmask = vcpu->arch.sie_block->gpsw.mask; \
+ __entry->pswaddr = vcpu->arch.sie_block->gpsw.addr; \
+ } while (0);
+#define VCPU_TP_PRINTK(p_str, p_args...) \
+ TP_printk("%02d[%016lx-%016lx]: " p_str, __entry->id, \
+ __entry->pswmask, __entry->pswaddr, p_args)
+
+/*
+ * Tracepoints for SIE entry and exit.
+ */
+TRACE_EVENT(kvm_s390_sie_enter,
+ TP_PROTO(VCPU_PROTO_COMMON, int cpuflags),
+ TP_ARGS(VCPU_ARGS_COMMON, cpuflags),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(int, cpuflags)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->cpuflags = cpuflags;
+ ),
+
+ VCPU_TP_PRINTK("entering sie flags %x", __entry->cpuflags)
+ );
+
+TRACE_EVENT(kvm_s390_sie_fault,
+ TP_PROTO(VCPU_PROTO_COMMON),
+ TP_ARGS(VCPU_ARGS_COMMON),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ ),
+
+ VCPU_TP_PRINTK("%s", "fault in sie instruction")
+ );
+
+#define sie_intercept_code \
+ {0x04, "Instruction"}, \
+ {0x08, "Program interruption"}, \
+ {0x0C, "Instruction and program interuption"}, \
+ {0x10, "External request"}, \
+ {0x14, "External interruption"}, \
+ {0x18, "I/O request"}, \
+ {0x1C, "Wait state"}, \
+ {0x20, "Validity"}, \
+ {0x28, "Stop request"}
+
+TRACE_EVENT(kvm_s390_sie_exit,
+ TP_PROTO(VCPU_PROTO_COMMON, u8 icptcode),
+ TP_ARGS(VCPU_ARGS_COMMON, icptcode),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(u8, icptcode)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->icptcode = icptcode;
+ ),
+
+ VCPU_TP_PRINTK("exit sie icptcode %d (%s)", __entry->icptcode,
+ __print_symbolic(__entry->icptcode,
+ sie_intercept_code))
+ );
+
+/*
+ * Trace point for intercepted instructions.
+ */
+TRACE_EVENT(kvm_s390_intercept_instruction,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb),
+ TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u64, instruction)
+ __field(char, insn[8])
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->instruction = ((__u64)ipa << 48) |
+ ((__u64)ipb << 16);
+ ),
+
+ VCPU_TP_PRINTK("intercepted instruction %016llx (%s)",
+ __entry->instruction,
+ insn_to_mnemonic((unsigned char *)
+ &__entry->instruction,
+ __entry->insn) ?
+ "unknown" : __entry->insn)
+ );
+
+/*
+ * Trace point for intercepted program interruptions.
+ */
+TRACE_EVENT(kvm_s390_intercept_prog,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 code),
+ TP_ARGS(VCPU_ARGS_COMMON, code),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u16, code)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->code = code;
+ ),
+
+ VCPU_TP_PRINTK("intercepted program interruption %04x",
+ __entry->code)
+ );
+
+/*
+ * Trace point for validity intercepts.
+ */
+TRACE_EVENT(kvm_s390_intercept_validity,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 viwhy),
+ TP_ARGS(VCPU_ARGS_COMMON, viwhy),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u16, viwhy)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->viwhy = viwhy;
+ ),
+
+ VCPU_TP_PRINTK("got validity intercept %04x", __entry->viwhy)
+ );
+
+/*
+ * Trace points for instructions that are of special interest.
+ */
+
+#define sigp_order_codes \
+ {SIGP_SENSE, "sense"}, \
+ {SIGP_EXTERNAL_CALL, "external call"}, \
+ {SIGP_EMERGENCY_SIGNAL, "emergency signal"}, \
+ {SIGP_STOP, "stop"}, \
+ {SIGP_STOP_AND_STORE_STATUS, "stop and store status"}, \
+ {SIGP_SET_ARCHITECTURE, "set architecture"}, \
+ {SIGP_SET_PREFIX, "set prefix"}, \
+ {SIGP_SENSE_RUNNING, "sense running"}, \
+ {SIGP_RESTART, "restart"}
+
+TRACE_EVENT(kvm_s390_handle_sigp,
+ TP_PROTO(VCPU_PROTO_COMMON, __u8 order_code, __u16 cpu_addr, \
+ __u32 parameter),
+ TP_ARGS(VCPU_ARGS_COMMON, order_code, cpu_addr, parameter),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u8, order_code)
+ __field(__u16, cpu_addr)
+ __field(__u32, parameter)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->order_code = order_code;
+ __entry->cpu_addr = cpu_addr;
+ __entry->parameter = parameter;
+ ),
+
+ VCPU_TP_PRINTK("handle sigp order %02x (%s), cpu address %04x, " \
+ "parameter %08x", __entry->order_code,
+ __print_symbolic(__entry->order_code,
+ sigp_order_codes),
+ __entry->cpu_addr, __entry->parameter)
+ );
+
+#define diagnose_codes \
+ {0x10, "release pages"}, \
+ {0x44, "time slice end"}, \
+ {0x308, "ipl functions"}, \
+ {0x500, "kvm hypercall"}, \
+ {0x501, "kvm breakpoint"}
+
+TRACE_EVENT(kvm_s390_handle_diag,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 code),
+ TP_ARGS(VCPU_ARGS_COMMON, code),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u16, code)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->code = code;
+ ),
+
+ VCPU_TP_PRINTK("handle diagnose call %04x (%s)", __entry->code,
+ __print_symbolic(__entry->code, diagnose_codes))
+ );
+
+TRACE_EVENT(kvm_s390_handle_lctl,
+ TP_PROTO(VCPU_PROTO_COMMON, int g, int reg1, int reg3, u64 addr),
+ TP_ARGS(VCPU_ARGS_COMMON, g, reg1, reg3, addr),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(int, g)
+ __field(int, reg1)
+ __field(int, reg3)
+ __field(u64, addr)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->g = g;
+ __entry->reg1 = reg1;
+ __entry->reg3 = reg3;
+ __entry->addr = addr;
+ ),
+
+ VCPU_TP_PRINTK("%s: loading cr %x-%x from %016llx",
+ __entry->g ? "lctlg" : "lctl",
+ __entry->reg1, __entry->reg3, __entry->addr)
+ );
+
+TRACE_EVENT(kvm_s390_handle_prefix,
+ TP_PROTO(VCPU_PROTO_COMMON, int set, u32 address),
+ TP_ARGS(VCPU_ARGS_COMMON, set, address),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(int, set)
+ __field(u32, address)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->set = set;
+ __entry->address = address;
+ ),
+
+ VCPU_TP_PRINTK("%s prefix to %08x",
+ __entry->set ? "setting" : "storing",
+ __entry->address)
+ );
+
+TRACE_EVENT(kvm_s390_handle_stap,
+ TP_PROTO(VCPU_PROTO_COMMON, u64 address),
+ TP_ARGS(VCPU_ARGS_COMMON, address),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(u64, address)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->address = address;
+ ),
+
+ VCPU_TP_PRINTK("storing cpu address to %016llx",
+ __entry->address)
+ );
+
+TRACE_EVENT(kvm_s390_handle_stfl,
+ TP_PROTO(VCPU_PROTO_COMMON, unsigned int facility_list),
+ TP_ARGS(VCPU_ARGS_COMMON, facility_list),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(unsigned int, facility_list)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->facility_list = facility_list;
+ ),
+
+ VCPU_TP_PRINTK("store facility list value %08x",
+ __entry->facility_list)
+ );
+
+TRACE_EVENT(kvm_s390_handle_stsi,
+ TP_PROTO(VCPU_PROTO_COMMON, int fc, int sel1, int sel2, u64 addr),
+ TP_ARGS(VCPU_ARGS_COMMON, fc, sel1, sel2, addr),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(int, fc)
+ __field(int, sel1)
+ __field(int, sel2)
+ __field(u64, addr)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->fc = fc;
+ __entry->sel1 = sel1;
+ __entry->sel2 = sel2;
+ __entry->addr = addr;
+ ),
+
+ VCPU_TP_PRINTK("STSI %d.%d.%d information stored to %016llx",
+ __entry->fc, __entry->sel1, __entry->sel2,
+ __entry->addr)
+ );
+
+#endif /* _TRACE_KVM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
};
struct kvm_lpage_info {
- unsigned long rmap_pde;
int write_count;
};
struct kvm_arch_memory_slot {
+ unsigned long *rmap_pde[KVM_NR_PAGE_SIZES - 1];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
};
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
select TASK_DELAY_ACCT
select PERF_EVENTS
select HAVE_KVM_MSI
+ select HAVE_KVM_CPU_RELAX_INTERCEPT
---help---
Support hosting fully virtualized guest machines using hardware
virtualization extensions. You will need a fairly recent
kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o timer.o cpuid.o pmu.o
+ i8254.o cpuid.o pmu.o
kvm-intel-y += vmx.o
kvm-amd-y += svm.o
}
case 7: {
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
- /* Mask ebx against host capbability word 9 */
+ /* Mask ebx against host capability word 9 */
if (index == 0) {
entry->ebx &= kvm_supported_word9_x86_features;
cpuid_mask(&entry->ebx, 9);
if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
goto bad;
} else {
- /* exapand-down segment */
+ /* expand-down segment */
if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
goto bad;
lim = desc.d ? 0xffffffff : 0xffff;
int rc;
struct read_cache *mc = &ctxt->mem_read;
- while (size) {
- int n = min(size, 8u);
- size -= n;
- if (mc->pos < mc->end)
- goto read_cached;
+ if (mc->pos < mc->end)
+ goto read_cached;
- rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
- &ctxt->exception);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- mc->end += n;
+ WARN_ON((mc->end + size) >= sizeof(mc->data));
- read_cached:
- memcpy(dest, mc->data + mc->pos, n);
- mc->pos += n;
- dest += n;
- addr += n;
- }
+ rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, size,
+ &ctxt->exception);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ mc->end += size;
+
+read_cached:
+ memcpy(dest, mc->data + mc->pos, size);
+ mc->pos += size;
return X86EMUL_CONTINUE;
}
err_code = selector & 0xfffc;
err_vec = GP_VECTOR;
- /* can't load system descriptor into segment selecor */
+ /* can't load system descriptor into segment selector */
if (seg <= VCPU_SREG_GS && !seg_desc.s)
goto exception;
setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
struct desc_struct *cs, struct desc_struct *ss)
{
- u16 selector;
-
- memset(cs, 0, sizeof(struct desc_struct));
- ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
- memset(ss, 0, sizeof(struct desc_struct));
-
cs->l = 0; /* will be adjusted later */
set_desc_base(cs, 0); /* flat segment */
cs->g = 1; /* 4kb granularity */
cs->dpl = 0; /* will be adjusted later */
cs->p = 1;
cs->d = 1;
+ cs->avl = 0;
set_desc_base(ss, 0); /* flat segment */
set_desc_limit(ss, 0xfffff); /* 4GB limit */
ss->d = 1; /* 32bit stack segment */
ss->dpl = 0;
ss->p = 1;
+ ss->l = 0;
+ ss->avl = 0;
}
static bool vendor_intel(struct x86_emulate_ctxt *ctxt)
set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
/*
- * Now load segment descriptors. If fault happenes at this stage
+ * Now load segment descriptors. If fault happens at this stage
* it is handled in a context of new task
*/
ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
*
* 1. jmp/call/int to task gate: Check against DPL of the task gate
* 2. Exception/IRQ/iret: No check is performed
- * 3. jmp/call to TSS: Check agains DPL of the TSS
+ * 3. jmp/call to TSS: Check against DPL of the TSS
*/
if (reason == TASK_SWITCH_GATE) {
if (idt_index != -1) {
ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
/* set back link to prev task only if NT bit is set in eflags
- note that old_tss_sel is not used afetr this point */
+ note that old_tss_sel is not used after this point */
if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
old_tss_sel = 0xffff;
ktime_t remaining;
struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
- if (!ps->pit_timer.period)
+ if (!ps->period)
return 0;
/*
* itself with the initial count and continues counting
* from there.
*/
- remaining = hrtimer_get_remaining(&ps->pit_timer.timer);
- elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
- elapsed = mod_64(elapsed, ps->pit_timer.period);
+ remaining = hrtimer_get_remaining(&ps->timer);
+ elapsed = ps->period - ktime_to_ns(remaining);
+ elapsed = mod_64(elapsed, ps->period);
return elapsed;
}
int value;
spin_lock(&ps->inject_lock);
- value = atomic_dec_return(&ps->pit_timer.pending);
+ value = atomic_dec_return(&ps->pending);
if (value < 0)
/* spurious acks can be generated if, for example, the
* PIC is being reset. Handle it gracefully here
*/
- atomic_inc(&ps->pit_timer.pending);
+ atomic_inc(&ps->pending);
else if (value > 0)
/* in this case, we had multiple outstanding pit interrupts
* that we needed to inject. Reinject
if (!kvm_vcpu_is_bsp(vcpu) || !pit)
return;
- timer = &pit->pit_state.pit_timer.timer;
+ timer = &pit->pit_state.timer;
if (hrtimer_cancel(timer))
hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
static void destroy_pit_timer(struct kvm_pit *pit)
{
- hrtimer_cancel(&pit->pit_state.pit_timer.timer);
+ hrtimer_cancel(&pit->pit_state.timer);
flush_kthread_work(&pit->expired);
}
-static bool kpit_is_periodic(struct kvm_timer *ktimer)
-{
- struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
- pit_timer);
- return ps->is_periodic;
-}
-
-static struct kvm_timer_ops kpit_ops = {
- .is_periodic = kpit_is_periodic,
-};
-
static void pit_do_work(struct kthread_work *work)
{
struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
{
- struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
- struct kvm_pit *pt = ktimer->kvm->arch.vpit;
+ struct kvm_kpit_state *ps = container_of(data, struct kvm_kpit_state, timer);
+ struct kvm_pit *pt = ps->kvm->arch.vpit;
- if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
- atomic_inc(&ktimer->pending);
+ if (ps->reinject || !atomic_read(&ps->pending)) {
+ atomic_inc(&ps->pending);
queue_kthread_work(&pt->worker, &pt->expired);
}
- if (ktimer->t_ops->is_periodic(ktimer)) {
- hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ if (ps->is_periodic) {
+ hrtimer_add_expires_ns(&ps->timer, ps->period);
return HRTIMER_RESTART;
} else
return HRTIMER_NORESTART;
static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
{
struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
- struct kvm_timer *pt = &ps->pit_timer;
s64 interval;
if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
pr_debug("create pit timer, interval is %llu nsec\n", interval);
/* TODO The new value only affected after the retriggered */
- hrtimer_cancel(&pt->timer);
+ hrtimer_cancel(&ps->timer);
flush_kthread_work(&ps->pit->expired);
- pt->period = interval;
+ ps->period = interval;
ps->is_periodic = is_period;
- pt->timer.function = pit_timer_fn;
- pt->t_ops = &kpit_ops;
- pt->kvm = ps->pit->kvm;
+ ps->timer.function = pit_timer_fn;
+ ps->kvm = ps->pit->kvm;
- atomic_set(&pt->pending, 0);
+ atomic_set(&ps->pending, 0);
ps->irq_ack = 1;
- hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
+ hrtimer_start(&ps->timer, ktime_add_ns(ktime_get(), interval),
HRTIMER_MODE_ABS);
}
}
mutex_unlock(&pit->pit_state.lock);
- atomic_set(&pit->pit_state.pit_timer.pending, 0);
+ atomic_set(&pit->pit_state.pending, 0);
pit->pit_state.irq_ack = 1;
}
struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
if (!mask) {
- atomic_set(&pit->pit_state.pit_timer.pending, 0);
+ atomic_set(&pit->pit_state.pending, 0);
pit->pit_state.irq_ack = 1;
}
}
pit_state = &pit->pit_state;
pit_state->pit = pit;
- hrtimer_init(&pit_state->pit_timer.timer,
- CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init(&pit_state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
pit_state->irq_ack_notifier.gsi = 0;
pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
- pit_state->pit_timer.reinject = true;
+ pit_state->reinject = true;
mutex_unlock(&pit->pit_state.lock);
kvm_pit_reset(pit);
kvm_unregister_irq_ack_notifier(kvm,
&kvm->arch.vpit->pit_state.irq_ack_notifier);
mutex_lock(&kvm->arch.vpit->pit_state.lock);
- timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
+ timer = &kvm->arch.vpit->pit_state.timer;
hrtimer_cancel(timer);
flush_kthread_work(&kvm->arch.vpit->expired);
kthread_stop(kvm->arch.vpit->worker_task);
struct kvm_kpit_state {
struct kvm_kpit_channel_state channels[3];
u32 flags;
- struct kvm_timer pit_timer;
bool is_periodic;
+ s64 period; /* unit: ns */
+ struct hrtimer timer;
+ atomic_t pending; /* accumulated triggered timers */
+ bool reinject;
+ struct kvm *kvm;
u32 speaker_data_on;
struct mutex lock;
struct kvm_pit *pit;
struct kvm_io_device dev_slave;
struct kvm_io_device dev_eclr;
void (*ack_notifier)(void *opaque, int irq);
- unsigned long irq_states[16];
+ unsigned long irq_states[PIC_NUM_PINS];
};
struct kvm_pic *kvm_create_pic(struct kvm *kvm);
+++ /dev/null
-
-struct kvm_timer {
- struct hrtimer timer;
- s64 period; /* unit: ns */
- u32 timer_mode_mask;
- u64 tscdeadline;
- atomic_t pending; /* accumulated triggered timers */
- bool reinject;
- struct kvm_timer_ops *t_ops;
- struct kvm *kvm;
- struct kvm_vcpu *vcpu;
-};
-
-struct kvm_timer_ops {
- bool (*is_periodic)(struct kvm_timer *);
-};
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
val = apic_get_tmcct(apic);
break;
-
+ case APIC_PROCPRI:
+ apic_update_ppr(apic);
+ val = apic_get_reg(apic, offset);
+ break;
case APIC_TASKPRI:
report_tpr_access(apic, false);
/* fall thru */
default:
- apic_update_ppr(apic);
val = apic_get_reg(apic, offset);
break;
}
{
unsigned char alignment = offset & 0xf;
u32 result;
- /* this bitmask has a bit cleared for each reserver register */
+ /* this bitmask has a bit cleared for each reserved register */
static const u64 rmask = 0x43ff01ffffffe70cULL;
if ((alignment + len) > 4) {
atomic_set(&apic->lapic_timer.pending, 0);
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
- /* lapic timer in oneshot or peroidic mode */
+ /* lapic timer in oneshot or periodic mode */
now = apic->lapic_timer.timer.base->get_time();
apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
* APIC_BUS_CYCLE_NS * apic->divide_count;
*----------------------------------------------------------------------
*/
-static bool lapic_is_periodic(struct kvm_timer *ktimer)
+static bool lapic_is_periodic(struct kvm_lapic *apic)
{
- struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
- lapic_timer);
return apic_lvtt_period(apic);
}
kvm_apic_local_deliver(apic, APIC_LVT0);
}
-static struct kvm_timer_ops lapic_timer_ops = {
- .is_periodic = lapic_is_periodic,
-};
-
static const struct kvm_io_device_ops apic_mmio_ops = {
.read = apic_mmio_read,
.write = apic_mmio_write,
};
+static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
+{
+ struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+ struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ wait_queue_head_t *q = &vcpu->wq;
+
+ /*
+ * There is a race window between reading and incrementing, but we do
+ * not care about potentially losing timer events in the !reinject
+ * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
+ * in vcpu_enter_guest.
+ */
+ if (!atomic_read(&ktimer->pending)) {
+ atomic_inc(&ktimer->pending);
+ /* FIXME: this code should not know anything about vcpus */
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ }
+
+ if (waitqueue_active(q))
+ wake_up_interruptible(q);
+
+ if (lapic_is_periodic(apic)) {
+ hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+ return HRTIMER_RESTART;
+ } else
+ return HRTIMER_NORESTART;
+}
+
int kvm_create_lapic(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic;
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
HRTIMER_MODE_ABS);
- apic->lapic_timer.timer.function = kvm_timer_fn;
- apic->lapic_timer.t_ops = &lapic_timer_ops;
- apic->lapic_timer.kvm = vcpu->kvm;
- apic->lapic_timer.vcpu = vcpu;
+ apic->lapic_timer.timer.function = apic_timer_fn;
apic->base_address = APIC_DEFAULT_PHYS_BASE;
vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
#define __KVM_X86_LAPIC_H
#include "iodev.h"
-#include "kvm_timer.h"
#include <linux/kvm_host.h>
+struct kvm_timer {
+ struct hrtimer timer;
+ s64 period; /* unit: ns */
+ u32 timer_mode_mask;
+ u64 tscdeadline;
+ atomic_t pending; /* accumulated triggered timers */
+};
+
struct kvm_lapic {
unsigned long base_address;
struct kvm_io_device dev;
return 0;
pfn = spte_to_pfn(old_spte);
+
+ /*
+ * KVM does not hold the refcount of the page used by
+ * kvm mmu, before reclaiming the page, we should
+ * unmap it from mmu first.
+ */
+ WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+
if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
kvm_set_pfn_accessed(pfn);
if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
struct kvm_memory_slot *slot)
{
- struct kvm_lpage_info *linfo;
+ unsigned long idx;
if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn];
- linfo = lpage_info_slot(gfn, slot, level);
- return &linfo->rmap_pde;
+ idx = gfn_to_index(gfn, slot->base_gfn, level);
+ return &slot->arch.rmap_pde[level - PT_DIRECTORY_LEVEL][idx];
}
/*
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
}
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
return 0;
}
-static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
- unsigned long data,
- int (*handler)(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data))
+static int kvm_handle_hva_range(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm,
+ unsigned long *rmapp,
+ struct kvm_memory_slot *slot,
+ unsigned long data))
{
int j;
- int ret;
- int retval = 0;
+ int ret = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
- unsigned long start = memslot->userspace_addr;
- unsigned long end;
+ unsigned long hva_start, hva_end;
+ gfn_t gfn_start, gfn_end;
- end = start + (memslot->npages << PAGE_SHIFT);
- if (hva >= start && hva < end) {
- gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
- gfn_t gfn = memslot->base_gfn + gfn_offset;
+ hva_start = max(start, memslot->userspace_addr);
+ hva_end = min(end, memslot->userspace_addr +
+ (memslot->npages << PAGE_SHIFT));
+ if (hva_start >= hva_end)
+ continue;
+ /*
+ * {gfn(page) | page intersects with [hva_start, hva_end)} =
+ * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+ */
+ gfn_start = hva_to_gfn_memslot(hva_start, memslot);
+ gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
- ret = handler(kvm, &memslot->rmap[gfn_offset], data);
+ for (j = PT_PAGE_TABLE_LEVEL;
+ j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
+ unsigned long idx, idx_end;
+ unsigned long *rmapp;
- for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
- struct kvm_lpage_info *linfo;
+ /*
+ * {idx(page_j) | page_j intersects with
+ * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
+ */
+ idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
+ idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
- linfo = lpage_info_slot(gfn, memslot,
- PT_DIRECTORY_LEVEL + j);
- ret |= handler(kvm, &linfo->rmap_pde, data);
- }
- trace_kvm_age_page(hva, memslot, ret);
- retval |= ret;
+ rmapp = __gfn_to_rmap(gfn_start, j, memslot);
+
+ for (; idx <= idx_end; ++idx)
+ ret |= handler(kvm, rmapp++, memslot, data);
}
}
- return retval;
+ return ret;
+}
+
+static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
+ unsigned long data,
+ int (*handler)(struct kvm *kvm, unsigned long *rmapp,
+ struct kvm_memory_slot *slot,
+ unsigned long data))
+{
+ return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
}
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp);
}
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+ return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
+}
+
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
}
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator uninitialized_var(iter);
* This has some overhead, but not as much as the cost of swapping
* out actively used pages or breaking up actively used hugepages.
*/
- if (!shadow_accessed_mask)
- return kvm_unmap_rmapp(kvm, rmapp, data);
+ if (!shadow_accessed_mask) {
+ young = kvm_unmap_rmapp(kvm, rmapp, slot, data);
+ goto out;
+ }
for (sptep = rmap_get_first(*rmapp, &iter); sptep;
sptep = rmap_get_next(&iter)) {
(unsigned long *)sptep);
}
}
-
+out:
+ /* @data has hva passed to kvm_age_hva(). */
+ trace_kvm_age_page(data, slot, young);
return young;
}
static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
- unsigned long data)
+ struct kvm_memory_slot *slot, unsigned long data)
{
u64 *sptep;
struct rmap_iterator iter;
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
- kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
+ kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
}
int kvm_age_hva(struct kvm *kvm, unsigned long hva)
{
- return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
+ return kvm_handle_hva(kvm, hva, hva, kvm_age_rmapp);
}
int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
unsigned long hva;
slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
- if (!slot) {
- get_page(fault_page);
- return page_to_pfn(fault_page);
- }
+ if (!slot)
+ return get_fault_pfn();
hva = gfn_to_hva_memslot(slot, gfn);
- return hva_to_pfn_atomic(vcpu->kvm, hva);
+ return hva_to_pfn_atomic(hva);
}
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
if (!async)
return false; /* *pfn has correct page already */
- put_page(pfn_to_page(*pfn));
+ kvm_release_pfn_clean(*pfn);
if (!prefault && can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(gva, gfn);
/*
- * Kernel-based Virtual Machine -- Performane Monitoring Unit support
+ * Kernel-based Virtual Machine -- Performance Monitoring Unit support
*
* Copyright 2011 Red Hat, Inc. and/or its affiliates.
*
if (svm->nested.intercept & 1ULL) {
/*
* The #vmexit can't be emulated here directly because this
- * code path runs with irqs and preemtion disabled. A
+ * code path runs with irqs and preemption disabled. A
* #vmexit emulation might sleep. Only signal request for
* the #vmexit here.
*/
{
/*
* This function merges the msr permission bitmaps of kvm and the
- * nested vmcb. It is omptimized in that it only merges the parts where
+ * nested vmcb. It is optimized in that it only merges the parts where
* the kvm msr permission bitmap may contain zero bits
*/
int i;
+++ /dev/null
-/*
- * Kernel-based Virtual Machine driver for Linux
- *
- * This module enables machines with Intel VT-x extensions to run virtual
- * machines without emulation or binary translation.
- *
- * timer support
- *
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- */
-
-#include <linux/kvm_host.h>
-#include <linux/kvm.h>
-#include <linux/hrtimer.h>
-#include <linux/atomic.h>
-#include "kvm_timer.h"
-
-enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
-{
- struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
- struct kvm_vcpu *vcpu = ktimer->vcpu;
- wait_queue_head_t *q = &vcpu->wq;
-
- /*
- * There is a race window between reading and incrementing, but we do
- * not care about potentially losing timer events in the !reinject
- * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
- * in vcpu_enter_guest.
- */
- if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
- atomic_inc(&ktimer->pending);
- /* FIXME: this code should not know anything about vcpus */
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
- }
-
- if (waitqueue_active(q))
- wake_up_interruptible(q);
-
- if (ktimer->t_ops->is_periodic(ktimer)) {
- hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
- return HRTIMER_RESTART;
- } else
- return HRTIMER_NORESTART;
-}
guest_efer = vmx->vcpu.arch.efer;
/*
- * NX is emulated; LMA and LME handled by hardware; SCE meaninless
+ * NX is emulated; LMA and LME handled by hardware; SCE meaningless
* outside long mode
*/
ignore_bits = EFER_NX | EFER_SCE;
* qemu binaries.
* IA32 arch specifies that at the time of processor reset the
* "Accessed" bit in the AR field of segment registers is 1. And qemu
- * is setting it to 0 in the usedland code. This causes invalid guest
+ * is setting it to 0 in the userland code. This causes invalid guest
* state vmexit when "unrestricted guest" mode is turned on.
* Fix for this setup issue in cpu_reset is being pushed in the qemu
* tree. Newer qemu binaries with that qemu fix would not need this
hypercall[2] = 0xc1;
}
-/* called to set cr0 as approriate for a mov-to-cr0 exit. */
+/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
{
if (to_vmx(vcpu)->nested.vmxon &&
* kvm-specific. Those are put in the beginning of the list.
*/
-#define KVM_SAVE_MSRS_BEGIN 9
+#define KVM_SAVE_MSRS_BEGIN 10
static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
* For each generation, we track the original measured
* nanosecond time, offset, and write, so if TSCs are in
* sync, we can match exact offset, and if not, we can match
- * exact software computaion in compute_guest_tsc()
+ * exact software computation in compute_guest_tsc()
*
* These values are tracked in kvm->arch.cur_xxx variables.
*/
{
gpa_t gpa = data & ~0x3f;
- /* Bits 2:5 are resrved, Should be zero */
+ /* Bits 2:5 are reserved, Should be zero */
if (data & 0x3c)
return 1;
* Ignore all writes to this no longer documented MSR.
* Writes are only relevant for old K7 processors,
* all pre-dating SVM, but a recommended workaround from
- * AMD for these chips. It is possible to speicify the
+ * AMD for these chips. It is possible to specify the
* affected processor models on the command line, hence
* the need to ignore the workaround.
*/
if (!vcpu->arch.time_page)
return -EINVAL;
src->flags |= PVCLOCK_GUEST_STOPPED;
- mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT);
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
return 0;
}
if (!kvm->arch.vpit)
return -ENXIO;
mutex_lock(&kvm->arch.vpit->pit_state.lock);
- kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
+ kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
mutex_unlock(&kvm->arch.vpit->pit_state.lock);
return 0;
}
return r;
}
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event)
+{
+ if (!irqchip_in_kernel(kvm))
+ return -ENXIO;
+
+ irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
+ irq_event->irq, irq_event->level);
+ return 0;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
create_pit_unlock:
mutex_unlock(&kvm->slots_lock);
break;
- case KVM_IRQ_LINE_STATUS:
- case KVM_IRQ_LINE: {
- struct kvm_irq_level irq_event;
-
- r = -EFAULT;
- if (copy_from_user(&irq_event, argp, sizeof irq_event))
- goto out;
- r = -ENXIO;
- if (irqchip_in_kernel(kvm)) {
- __s32 status;
- status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
- irq_event.irq, irq_event.level);
- if (ioctl == KVM_IRQ_LINE_STATUS) {
- r = -EFAULT;
- irq_event.status = status;
- if (copy_to_user(argp, &irq_event,
- sizeof irq_event))
- goto out;
- }
- r = 0;
- }
- break;
- }
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
struct kvm_irqchip *chip;
/*
* if emulation was due to access to shadowed page table
- * and it failed try to unshadow page and re-entetr the
+ * and it failed try to unshadow page and re-enter the
* guest to let CPU execute the instruction.
*/
if (kvm_mmu_unprotect_page_virt(vcpu, gva))
/*
* We are here if userspace calls get_regs() in the middle of
* instruction emulation. Registers state needs to be copied
- * back from emulation context to vcpu. Usrapace shouldn't do
+ * back from emulation context to vcpu. Userspace shouldn't do
* that usually, but some bad designed PV devices (vmware
* backdoor interface) need this to work
*/
* as we reset last_host_tsc on all VCPUs to stop this from being
* called multiple times (one for each physical CPU bringup).
*
- * Platforms with unnreliable TSCs don't have to deal with this, they
+ * Platforms with unreliable TSCs don't have to deal with this, they
* will be compensated by the logic in vcpu_load, which sets the TSC to
* catchup mode. This will catchup all VCPUs to real time, but cannot
* guarantee that they stay in perfect synchronization.
int i;
for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+ if (!dont || free->arch.rmap_pde[i] != dont->arch.rmap_pde[i]) {
+ kvm_kvfree(free->arch.rmap_pde[i]);
+ free->arch.rmap_pde[i] = NULL;
+ }
if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) {
kvm_kvfree(free->arch.lpage_info[i]);
free->arch.lpage_info[i] = NULL;
lpages = gfn_to_index(slot->base_gfn + npages - 1,
slot->base_gfn, level) + 1;
+ slot->arch.rmap_pde[i] =
+ kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap_pde[i]));
+ if (!slot->arch.rmap_pde[i])
+ goto out_free;
+
slot->arch.lpage_info[i] =
kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i]));
if (!slot->arch.lpage_info[i])
out_free:
for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
+ kvm_kvfree(slot->arch.rmap_pde[i]);
kvm_kvfree(slot->arch.lpage_info[i]);
+ slot->arch.rmap_pde[i] = NULL;
slot->arch.lpage_info[i] = NULL;
}
return -ENOMEM;
map_flags = MAP_SHARED | MAP_ANONYMOUS;
/*To keep backward compatibility with older userspace,
- *x86 needs to hanlde !user_alloc case.
+ *x86 needs to handle !user_alloc case.
*/
if (!user_alloc) {
if (npages && !old.rmap) {
} async_pf;
#endif
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+ /*
+ * Cpu relax intercept or pause loop exit optimization
+ * in_spin_loop: set when a vcpu does a pause loop exit
+ * or cpu relax intercepted.
+ * dy_eligible: indicates whether vcpu is eligible for directed yield.
+ */
+ struct {
+ bool in_spin_loop;
+ bool dy_eligible;
+ } spin_loop;
+#endif
struct kvm_vcpu_arch arch;
};
return slot;
}
-#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
-#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
-static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
-
extern struct page *bad_page;
-extern struct page *fault_page;
-
-extern pfn_t bad_pfn;
-extern pfn_t fault_pfn;
int is_error_page(struct page *page);
int is_error_pfn(pfn_t pfn);
int is_hwpoison_pfn(pfn_t pfn);
-int is_fault_pfn(pfn_t pfn);
int is_noslot_pfn(pfn_t pfn);
int is_invalid_pfn(pfn_t pfn);
int kvm_is_error_hva(unsigned long addr);
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
int nr_pages);
+struct page *get_bad_page(void);
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
void kvm_release_page_clean(struct page *page);
void kvm_set_page_dirty(struct page *page);
void kvm_set_page_accessed(struct page *page);
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
+pfn_t hva_to_pfn_atomic(unsigned long addr);
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
bool write_fault, bool *writable);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
bool *writable);
-pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn);
+pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
void kvm_release_pfn_dirty(pfn_t);
void kvm_release_pfn_clean(pfn_t pfn);
void kvm_set_pfn_dirty(pfn_t pfn);
void kvm_set_pfn_accessed(pfn_t pfn);
void kvm_get_pfn(pfn_t pfn);
+pfn_t get_fault_pfn(void);
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
int len);
struct
kvm_userspace_memory_region *mem,
int user_alloc);
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level);
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
-int kvm_is_mmio_pfn(pfn_t pfn);
+bool kvm_is_mmio_pfn(pfn_t pfn);
struct kvm_irq_ack_notifier {
struct hlist_node link;
(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
}
+static inline gfn_t
+hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
+{
+ gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT;
+
+ return slot->base_gfn + gfn_offset;
+}
+
static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
gfn_t gfn)
{
}
}
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+
+static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+{
+ vcpu->spin_loop.in_spin_loop = val;
+}
+static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+{
+ vcpu->spin_loop.dy_eligible = val;
+}
+
+#else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
+
+static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
+{
+}
+
+static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
+{
+}
+
+static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+ return true;
+}
+
+#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
#endif
config HAVE_KVM_MSI
bool
+
+config HAVE_KVM_CPU_RELAX_INTERCEPT
+ bool
typeof(*work), link);
list_del(&work->link);
if (work->page)
- put_page(work->page);
+ kvm_release_page_clean(work->page);
kmem_cache_free(async_pf_cache, work);
}
spin_unlock(&vcpu->async_pf.lock);
list_del(&work->queue);
vcpu->async_pf.queued--;
if (work->page)
- put_page(work->page);
+ kvm_release_page_clean(work->page);
kmem_cache_free(async_pf_cache, work);
}
}
if (!work)
return -ENOMEM;
- work->page = bad_page;
- get_page(bad_page);
+ work->page = get_bad_page();
INIT_LIST_HEAD(&work->queue); /* for list_del to work */
spin_lock(&vcpu->async_pf.lock);
static void kvm_iommu_put_pages(struct kvm *kvm,
gfn_t base_gfn, unsigned long npages);
-static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot,
- gfn_t gfn, unsigned long size)
+static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+ unsigned long size)
{
gfn_t end_gfn;
pfn_t pfn;
- pfn = gfn_to_pfn_memslot(kvm, slot, gfn);
+ pfn = gfn_to_pfn_memslot(slot, gfn);
end_gfn = gfn + (size >> PAGE_SHIFT);
gfn += 1;
return pfn;
while (gfn < end_gfn)
- gfn_to_pfn_memslot(kvm, slot, gfn++);
+ gfn_to_pfn_memslot(slot, gfn++);
return pfn;
}
* Pin all pages we are about to map in memory. This is
* important because we unmap and unpin in 4kb steps later.
*/
- pfn = kvm_pin_pages(kvm, slot, gfn, page_size);
+ pfn = kvm_pin_pages(slot, gfn, page_size);
if (is_error_pfn(pfn)) {
gfn += 1;
continue;
switch (ue->u.irqchip.irqchip) {
case KVM_IRQCHIP_PIC_MASTER:
e->set = kvm_set_pic_irq;
- max_pin = 16;
+ max_pin = PIC_NUM_PINS;
break;
case KVM_IRQCHIP_PIC_SLAVE:
e->set = kvm_set_pic_irq;
- max_pin = 16;
+ max_pin = PIC_NUM_PINS;
delta = 8;
break;
case KVM_IRQCHIP_IOAPIC:
static bool largepages_enabled = true;
-static struct page *hwpoison_page;
-static pfn_t hwpoison_pfn;
-
-struct page *fault_page;
-pfn_t fault_pfn;
-
-inline int kvm_is_mmio_pfn(pfn_t pfn)
+bool kvm_is_mmio_pfn(pfn_t pfn)
{
+ if (is_error_pfn(pfn))
+ return false;
+
if (pfn_valid(pfn)) {
int reserved;
struct page *tail = pfn_to_page(pfn);
}
vcpu->run = page_address(page);
+ kvm_vcpu_set_in_spin_loop(vcpu, false);
+ kvm_vcpu_set_dy_eligible(vcpu, false);
+
r = kvm_arch_vcpu_init(vcpu);
if (r < 0)
goto fail_free_run;
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_notifier_count++;
- for (; start < end; start += PAGE_SIZE)
- need_tlb_flush |= kvm_unmap_hva(kvm, start);
+ need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
need_tlb_flush |= kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */
if (need_tlb_flush)
int is_error_page(struct page *page)
{
- return page == bad_page || page == hwpoison_page || page == fault_page;
+ return IS_ERR(page);
}
EXPORT_SYMBOL_GPL(is_error_page);
int is_error_pfn(pfn_t pfn)
{
- return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
+ return IS_ERR_VALUE(pfn);
}
EXPORT_SYMBOL_GPL(is_error_pfn);
-int is_hwpoison_pfn(pfn_t pfn)
+static pfn_t get_bad_pfn(void)
{
- return pfn == hwpoison_pfn;
+ return -ENOENT;
}
-EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
-int is_fault_pfn(pfn_t pfn)
+pfn_t get_fault_pfn(void)
{
- return pfn == fault_pfn;
+ return -EFAULT;
}
-EXPORT_SYMBOL_GPL(is_fault_pfn);
+EXPORT_SYMBOL_GPL(get_fault_pfn);
+
+static pfn_t get_hwpoison_pfn(void)
+{
+ return -EHWPOISON;
+}
+
+int is_hwpoison_pfn(pfn_t pfn)
+{
+ return pfn == -EHWPOISON;
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
int is_noslot_pfn(pfn_t pfn)
{
- return pfn == bad_pfn;
+ return pfn == -ENOENT;
}
EXPORT_SYMBOL_GPL(is_noslot_pfn);
int is_invalid_pfn(pfn_t pfn)
{
- return pfn == hwpoison_pfn || pfn == fault_pfn;
+ return !is_noslot_pfn(pfn) && is_error_pfn(pfn);
}
EXPORT_SYMBOL_GPL(is_invalid_pfn);
+struct page *get_bad_page(void)
+{
+ return ERR_PTR(-ENOENT);
+}
+
static inline unsigned long bad_hva(void)
{
return PAGE_OFFSET;
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
-static pfn_t get_fault_pfn(void)
-{
- get_page(fault_page);
- return fault_pfn;
-}
-
int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int write, struct page **page)
{
return rc == -EHWPOISON;
}
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
- bool *async, bool write_fault, bool *writable)
+static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
+ bool write_fault, bool *writable)
{
struct page *page[1];
int npages = 0;
if (npages == -EHWPOISON ||
(!async && check_user_page_hwpoison(addr))) {
up_read(¤t->mm->mmap_sem);
- get_page(hwpoison_page);
- return page_to_pfn(hwpoison_page);
+ return get_hwpoison_pfn();
}
vma = find_vma_intersection(current->mm, addr, addr+1);
return pfn;
}
-pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+pfn_t hva_to_pfn_atomic(unsigned long addr)
{
- return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
+ return hva_to_pfn(addr, true, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
*async = false;
addr = gfn_to_hva(kvm, gfn);
- if (kvm_is_error_hva(addr)) {
- get_page(bad_page);
- return page_to_pfn(bad_page);
- }
+ if (kvm_is_error_hva(addr))
+ return get_bad_pfn();
- return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
+ return hva_to_pfn(addr, atomic, async, write_fault, writable);
}
pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
-pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot, gfn_t gfn)
+pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
unsigned long addr = gfn_to_hva_memslot(slot, gfn);
- return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
+ return hva_to_pfn(addr, false, NULL, true, NULL);
}
int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
}
EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
+static struct page *kvm_pfn_to_page(pfn_t pfn)
+{
+ WARN_ON(kvm_is_mmio_pfn(pfn));
+
+ if (is_error_pfn(pfn) || kvm_is_mmio_pfn(pfn))
+ return get_bad_page();
+
+ return pfn_to_page(pfn);
+}
+
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
pfn_t pfn;
pfn = gfn_to_pfn(kvm, gfn);
- if (!kvm_is_mmio_pfn(pfn))
- return pfn_to_page(pfn);
-
- WARN_ON(kvm_is_mmio_pfn(pfn));
- get_page(bad_page);
- return bad_page;
+ return kvm_pfn_to_page(pfn);
}
EXPORT_SYMBOL_GPL(gfn_to_page);
void kvm_release_page_clean(struct page *page)
{
- kvm_release_pfn_clean(page_to_pfn(page));
+ if (!is_error_page(page))
+ kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);
void kvm_release_pfn_clean(pfn_t pfn)
{
- if (!kvm_is_mmio_pfn(pfn))
+ if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
void kvm_release_page_dirty(struct page *page)
{
+ WARN_ON(is_error_page(page));
+
kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
+#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
+/*
+ * Helper that checks whether a VCPU is eligible for directed yield.
+ * Most eligible candidate to yield is decided by following heuristics:
+ *
+ * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
+ * (preempted lock holder), indicated by @in_spin_loop.
+ * Set at the beiginning and cleared at the end of interception/PLE handler.
+ *
+ * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
+ * chance last time (mostly it has become eligible now since we have probably
+ * yielded to lockholder in last iteration. This is done by toggling
+ * @dy_eligible each time a VCPU checked for eligibility.)
+ *
+ * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
+ * to preempted lock-holder could result in wrong VCPU selection and CPU
+ * burning. Giving priority for a potential lock-holder increases lock
+ * progress.
+ *
+ * Since algorithm is based on heuristics, accessing another VCPU data without
+ * locking does not harm. It may result in trying to yield to same VCPU, fail
+ * and continue with next VCPU and so on.
+ */
+bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
+{
+ bool eligible;
+
+ eligible = !vcpu->spin_loop.in_spin_loop ||
+ (vcpu->spin_loop.in_spin_loop &&
+ vcpu->spin_loop.dy_eligible);
+
+ if (vcpu->spin_loop.in_spin_loop)
+ kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);
+
+ return eligible;
+}
+#endif
void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
struct kvm *kvm = me->kvm;
int pass;
int i;
+ kvm_vcpu_set_in_spin_loop(me, true);
/*
* We boost the priority of a VCPU that is runnable but not
* currently running, because it got preempted by something
continue;
if (waitqueue_active(&vcpu->wq))
continue;
+ if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
+ continue;
if (kvm_vcpu_yield_to(vcpu)) {
kvm->last_boosted_vcpu = i;
yielded = 1;
}
}
}
+ kvm_vcpu_set_in_spin_loop(me, false);
+
+ /* Ensure vcpu is not eligible during next spinloop */
+ kvm_vcpu_set_dy_eligible(me, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
r = kvm_send_userspace_msi(kvm, &msi);
break;
}
+#endif
+#ifdef __KVM_HAVE_IRQ_LINE
+ case KVM_IRQ_LINE_STATUS:
+ case KVM_IRQ_LINE: {
+ struct kvm_irq_level irq_event;
+
+ r = -EFAULT;
+ if (copy_from_user(&irq_event, argp, sizeof irq_event))
+ goto out;
+
+ r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
+ if (r)
+ goto out;
+
+ r = -EFAULT;
+ if (ioctl == KVM_IRQ_LINE_STATUS) {
+ if (copy_to_user(argp, &irq_event, sizeof irq_event))
+ goto out;
+ }
+
+ r = 0;
+ break;
+ }
#endif
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
.resume = kvm_resume,
};
-struct page *bad_page;
-pfn_t bad_pfn;
-
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
if (r)
goto out_fail;
- bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (bad_page == NULL) {
- r = -ENOMEM;
- goto out;
- }
-
- bad_pfn = page_to_pfn(bad_page);
-
- hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (hwpoison_page == NULL) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- hwpoison_pfn = page_to_pfn(hwpoison_page);
-
- fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-
- if (fault_page == NULL) {
- r = -ENOMEM;
- goto out_free_0;
- }
-
- fault_pfn = page_to_pfn(fault_page);
-
if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
r = -ENOMEM;
goto out_free_0;
out_free_0a:
free_cpumask_var(cpus_hardware_enabled);
out_free_0:
- if (fault_page)
- __free_page(fault_page);
- if (hwpoison_page)
- __free_page(hwpoison_page);
- __free_page(bad_page);
-out:
kvm_arch_exit();
out_fail:
return r;
kvm_arch_hardware_unsetup();
kvm_arch_exit();
free_cpumask_var(cpus_hardware_enabled);
- __free_page(fault_page);
- __free_page(hwpoison_page);
- __free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);