X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=kvm-all.c;h=961e1dbb710cb1e9934a588e6b9d87b8529446d6;hb=a22260ae380fa6abb546479cfc2962ba4c40382d;hp=9b73ccfbecb12d6fd08e1bc9ccc54868cfb4271d;hpb=7a85d1cf1cf9b2b9379c35f1b672d1661e2d8132;p=qemu.git

diff --git a/kvm-all.c b/kvm-all.c
index 9b73ccfbe..961e1dbb7 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -22,19 +22,27 @@
 
 #include "qemu-common.h"
 #include "qemu-barrier.h"
+#include "qemu-option.h"
+#include "qemu-config.h"
 #include "sysemu.h"
 #include "hw/hw.h"
+#include "hw/msi.h"
 #include "gdbstub.h"
 #include "kvm.h"
 #include "bswap.h"
 #include "memory.h"
 #include "exec-memory.h"
+#include "event_notifier.h"
 
 /* This check must be after config-host.h is included */
 #ifdef CONFIG_EVENTFD
 #include <sys/eventfd.h>
 #endif
 
+#ifdef CONFIG_VALGRIND_H
+#include <valgrind/memcheck.h>
+#endif
+
 /* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
 #define PAGE_SIZE TARGET_PAGE_SIZE
 
@@ -48,9 +56,11 @@
     do { } while (0)
 #endif
 
+#define KVM_MSI_HASHTAB_SIZE    256
+
 typedef struct KVMSlot
 {
-    target_phys_addr_t start_addr;
+    hwaddr start_addr;
     ram_addr_t memory_size;
     void *ram;
     int slot;
@@ -78,20 +88,27 @@ struct KVMState
     int pit_state2;
     int xsave, xcrs;
     int many_ioeventfds;
+    int intx_set_mask;
     /* The man page (and posix) say ioctl numbers are signed int, but
      * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
      * unsigned, and treating them as signed here can break things */
-    unsigned irqchip_inject_ioctl;
+    unsigned irq_set_ioctl;
 #ifdef KVM_CAP_IRQ_ROUTING
     struct kvm_irq_routing *irq_routes;
     int nr_allocated_irq_routes;
     uint32_t *used_gsi_bitmap;
-    unsigned int max_gsi;
+    unsigned int gsi_count;
+    QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
+    bool direct_msi;
 #endif
 };
 
 KVMState *kvm_state;
 bool kvm_kernel_irqchip;
+bool kvm_async_interrupts_allowed;
+bool kvm_irqfds_allowed;
+bool kvm_msi_via_irqfd_allowed;
+bool kvm_gsi_routing_allowed;
 
 static const KVMCapabilityInfo kvm_required_capabilites[] = {
     KVM_CAP_INFO(USER_MEMORY),
@@ -114,8 +131,8 @@ static KVMSlot *kvm_alloc_slot(KVMState *s)
 }
 
 static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
-                                         target_phys_addr_t start_addr,
-                                         target_phys_addr_t end_addr)
+                                         hwaddr start_addr,
+                                         hwaddr end_addr)
 {
     int i;
 
@@ -135,8 +152,8 @@ static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
  * Find overlapping slot with lowest start address
  */
 static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
-                                            target_phys_addr_t start_addr,
-                                            target_phys_addr_t end_addr)
+                                            hwaddr start_addr,
+                                            hwaddr end_addr)
 {
     KVMSlot *found = NULL;
     int i;
@@ -159,7 +176,7 @@ static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
 }
 
 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
-                                       target_phys_addr_t *phys_addr)
+                                       hwaddr *phys_addr)
 {
     int i;
 
@@ -276,7 +293,7 @@ static int kvm_slot_dirty_pages_log_change(KVMSlot *mem, bool log_dirty)
     return kvm_set_user_memory_region(s, mem);
 }
 
-static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
+static int kvm_dirty_pages_log_change(hwaddr phys_addr,
                                       ram_addr_t size, bool log_dirty)
 {
     KVMState *s = kvm_state;
@@ -285,7 +302,7 @@ static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
     if (mem == NULL)  {
         fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
                 TARGET_FMT_plx "\n", __func__, phys_addr,
-                (target_phys_addr_t)(phys_addr + size - 1));
+                (hwaddr)(phys_addr + size - 1));
         return -EINVAL;
     }
     return kvm_slot_dirty_pages_log_change(mem, log_dirty);
@@ -346,7 +363,7 @@ static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
 {
     unsigned int i, j;
     unsigned long page_number, c;
-    target_phys_addr_t addr, addr1;
+    hwaddr addr, addr1;
     unsigned int len = ((section->size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) / HOST_LONG_BITS;
     unsigned long hpratio = getpagesize() / TARGET_PAGE_SIZE;
 
@@ -389,8 +406,8 @@ static int kvm_physical_sync_dirty_bitmap(MemoryRegionSection *section)
     KVMDirtyLog d;
     KVMSlot *mem;
     int ret = 0;
-    target_phys_addr_t start_addr = section->offset_within_address_space;
-    target_phys_addr_t end_addr = start_addr + section->size;
+    hwaddr start_addr = section->offset_within_address_space;
+    hwaddr end_addr = start_addr + section->size;
 
     d.dirty_bitmap = NULL;
     while (start_addr < end_addr) {
@@ -437,9 +454,10 @@ static int kvm_physical_sync_dirty_bitmap(MemoryRegionSection *section)
     return ret;
 }
 
-int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
+static void kvm_coalesce_mmio_region(MemoryListener *listener,
+                                     MemoryRegionSection *secion,
+                                     hwaddr start, hwaddr size)
 {
-    int ret = -ENOSYS;
     KVMState *s = kvm_state;
 
     if (s->coalesced_mmio) {
@@ -449,15 +467,14 @@ int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
         zone.size = size;
         zone.pad = 0;
 
-        ret = kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
+        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
     }
-
-    return ret;
 }
 
-int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
+static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
+                                       MemoryRegionSection *secion,
+                                       hwaddr start, hwaddr size)
 {
-    int ret = -ENOSYS;
     KVMState *s = kvm_state;
 
     if (s->coalesced_mmio) {
@@ -467,10 +484,8 @@ int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
         zone.size = size;
         zone.pad = 0;
 
-        ret = kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
+        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
     }
-
-    return ret;
 }
 
 int kvm_check_extension(KVMState *s, unsigned int extension)
@@ -541,7 +556,7 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add)
     int err;
     MemoryRegion *mr = section->mr;
     bool log_dirty = memory_region_is_logging(mr);
-    target_phys_addr_t start_addr = section->offset_within_address_space;
+    hwaddr start_addr = section->offset_within_address_space;
     ram_addr_t size = section->size;
     void *ram = NULL;
     unsigned delta;
@@ -686,14 +701,6 @@ static void kvm_set_phys_mem(MemoryRegionSection *section, bool add)
     }
 }
 
-static void kvm_begin(MemoryListener *listener)
-{
-}
-
-static void kvm_commit(MemoryListener *listener)
-{
-}
-
 static void kvm_region_add(MemoryListener *listener,
                            MemoryRegionSection *section)
 {
@@ -706,11 +713,6 @@ static void kvm_region_del(MemoryListener *listener,
     kvm_set_phys_mem(section, false);
 }
 
-static void kvm_region_nop(MemoryListener *listener,
-                           MemoryRegionSection *section)
-{
-}
-
 static void kvm_log_sync(MemoryListener *listener,
                          MemoryRegionSection *section)
 {
@@ -738,9 +740,12 @@ static void kvm_log_global_stop(struct MemoryListener *listener)
     assert(r >= 0);
 }
 
-static void kvm_mem_ioeventfd_add(MemoryRegionSection *section,
-                                  bool match_data, uint64_t data, int fd)
+static void kvm_mem_ioeventfd_add(MemoryListener *listener,
+                                  MemoryRegionSection *section,
+                                  bool match_data, uint64_t data,
+                                  EventNotifier *e)
 {
+    int fd = event_notifier_get_fd(e);
     int r;
 
     assert(match_data && section->size <= 8);
@@ -752,9 +757,12 @@ static void kvm_mem_ioeventfd_add(MemoryRegionSection *section,
     }
 }
 
-static void kvm_mem_ioeventfd_del(MemoryRegionSection *section,
-                                  bool match_data, uint64_t data, int fd)
+static void kvm_mem_ioeventfd_del(MemoryListener *listener,
+                                  MemoryRegionSection *section,
+                                  bool match_data, uint64_t data,
+                                  EventNotifier *e)
 {
+    int fd = event_notifier_get_fd(e);
     int r;
 
     r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
@@ -764,9 +772,12 @@ static void kvm_mem_ioeventfd_del(MemoryRegionSection *section,
     }
 }
 
-static void kvm_io_ioeventfd_add(MemoryRegionSection *section,
-                                 bool match_data, uint64_t data, int fd)
+static void kvm_io_ioeventfd_add(MemoryListener *listener,
+                                 MemoryRegionSection *section,
+                                 bool match_data, uint64_t data,
+                                 EventNotifier *e)
 {
+    int fd = event_notifier_get_fd(e);
     int r;
 
     assert(match_data && section->size == 2);
@@ -778,10 +789,13 @@ static void kvm_io_ioeventfd_add(MemoryRegionSection *section,
     }
 }
 
-static void kvm_io_ioeventfd_del(MemoryRegionSection *section,
-                                 bool match_data, uint64_t data, int fd)
+static void kvm_io_ioeventfd_del(MemoryListener *listener,
+                                 MemoryRegionSection *section,
+                                 bool match_data, uint64_t data,
+                                 EventNotifier *e)
 
 {
+    int fd = event_notifier_get_fd(e);
     int r;
 
     r = kvm_set_ioeventfd_pio_word(fd, section->offset_within_address_space,
@@ -791,41 +805,24 @@ static void kvm_io_ioeventfd_del(MemoryRegionSection *section,
     }
 }
 
-static void kvm_eventfd_add(MemoryListener *listener,
-                            MemoryRegionSection *section,
-                            bool match_data, uint64_t data, int fd)
-{
-    if (section->address_space == get_system_memory()) {
-        kvm_mem_ioeventfd_add(section, match_data, data, fd);
-    } else {
-        kvm_io_ioeventfd_add(section, match_data, data, fd);
-    }
-}
-
-static void kvm_eventfd_del(MemoryListener *listener,
-                            MemoryRegionSection *section,
-                            bool match_data, uint64_t data, int fd)
-{
-    if (section->address_space == get_system_memory()) {
-        kvm_mem_ioeventfd_del(section, match_data, data, fd);
-    } else {
-        kvm_io_ioeventfd_del(section, match_data, data, fd);
-    }
-}
-
 static MemoryListener kvm_memory_listener = {
-    .begin = kvm_begin,
-    .commit = kvm_commit,
     .region_add = kvm_region_add,
     .region_del = kvm_region_del,
-    .region_nop = kvm_region_nop,
     .log_start = kvm_log_start,
     .log_stop = kvm_log_stop,
     .log_sync = kvm_log_sync,
     .log_global_start = kvm_log_global_start,
     .log_global_stop = kvm_log_global_stop,
-    .eventfd_add = kvm_eventfd_add,
-    .eventfd_del = kvm_eventfd_del,
+    .eventfd_add = kvm_mem_ioeventfd_add,
+    .eventfd_del = kvm_mem_ioeventfd_del,
+    .coalesced_mmio_add = kvm_coalesce_mmio_region,
+    .coalesced_mmio_del = kvm_uncoalesce_mmio_region,
+    .priority = 10,
+};
+
+static MemoryListener kvm_io_listener = {
+    .eventfd_add = kvm_io_ioeventfd_add,
+    .eventfd_del = kvm_io_ioeventfd_del,
     .priority = 10,
 };
 
@@ -838,35 +835,43 @@ static void kvm_handle_interrupt(CPUArchState *env, int mask)
     }
 }
 
-int kvm_irqchip_set_irq(KVMState *s, int irq, int level)
+int kvm_set_irq(KVMState *s, int irq, int level)
 {
     struct kvm_irq_level event;
     int ret;
 
-    assert(kvm_irqchip_in_kernel());
+    assert(kvm_async_interrupts_enabled());
 
     event.level = level;
     event.irq = irq;
-    ret = kvm_vm_ioctl(s, s->irqchip_inject_ioctl, &event);
+    ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
     if (ret < 0) {
-        perror("kvm_set_irqchip_line");
+        perror("kvm_set_irq");
         abort();
     }
 
-    return (s->irqchip_inject_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
+    return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
 }
 
 #ifdef KVM_CAP_IRQ_ROUTING
+typedef struct KVMMSIRoute {
+    struct kvm_irq_routing_entry kroute;
+    QTAILQ_ENTRY(KVMMSIRoute) entry;
+} KVMMSIRoute;
+
 static void set_gsi(KVMState *s, unsigned int gsi)
 {
-    assert(gsi < s->max_gsi);
-
     s->used_gsi_bitmap[gsi / 32] |= 1U << (gsi % 32);
 }
 
+static void clear_gsi(KVMState *s, unsigned int gsi)
+{
+    s->used_gsi_bitmap[gsi / 32] &= ~(1U << (gsi % 32));
+}
+
 static void kvm_init_irq_routing(KVMState *s)
 {
-    int gsi_count;
+    int gsi_count, i;
 
     gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING);
     if (gsi_count > 0) {
@@ -875,7 +880,7 @@ static void kvm_init_irq_routing(KVMState *s)
         /* Round up so we can search ints using ffs */
         gsi_bits = ALIGN(gsi_count, 32);
         s->used_gsi_bitmap = g_malloc0(gsi_bits / 8);
-        s->max_gsi = gsi_bits;
+        s->gsi_count = gsi_count;
 
         /* Mark any over-allocated bits as already in use */
         for (i = gsi_count; i < gsi_bits; i++) {
@@ -886,9 +891,24 @@ static void kvm_init_irq_routing(KVMState *s)
     s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
     s->nr_allocated_irq_routes = 0;
 
+    if (!s->direct_msi) {
+        for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
+            QTAILQ_INIT(&s->msi_hashtab[i]);
+        }
+    }
+
     kvm_arch_init_irq_routing(s);
 }
 
+static void kvm_irqchip_commit_routes(KVMState *s)
+{
+    int ret;
+
+    s->irq_routes->flags = 0;
+    ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
+    assert(ret == 0);
+}
+
 static void kvm_add_routing_entry(KVMState *s,
                                   struct kvm_irq_routing_entry *entry)
 {
@@ -914,12 +934,40 @@ static void kvm_add_routing_entry(KVMState *s,
     new->u = entry->u;
 
     set_gsi(s, entry->gsi);
+
+    kvm_irqchip_commit_routes(s);
 }
 
-void kvm_irqchip_add_route(KVMState *s, int irq, int irqchip, int pin)
+static int kvm_update_routing_entry(KVMState *s,
+                                    struct kvm_irq_routing_entry *new_entry)
+{
+    struct kvm_irq_routing_entry *entry;
+    int n;
+
+    for (n = 0; n < s->irq_routes->nr; n++) {
+        entry = &s->irq_routes->entries[n];
+        if (entry->gsi != new_entry->gsi) {
+            continue;
+        }
+
+        entry->type = new_entry->type;
+        entry->flags = new_entry->flags;
+        entry->u = new_entry->u;
+
+        kvm_irqchip_commit_routes(s);
+
+        return 0;
+    }
+
+    return -ESRCH;
+}
+
+void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
 {
     struct kvm_irq_routing_entry e;
 
+    assert(pin < s->gsi_count);
+
     e.gsi = irq;
     e.type = KVM_IRQ_ROUTING_IRQCHIP;
     e.flags = 0;
@@ -928,10 +976,185 @@ void kvm_irqchip_add_route(KVMState *s, int irq, int irqchip, int pin)
     kvm_add_routing_entry(s, &e);
 }
 
-int kvm_irqchip_commit_routes(KVMState *s)
+void kvm_irqchip_release_virq(KVMState *s, int virq)
 {
-    s->irq_routes->flags = 0;
-    return kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
+    struct kvm_irq_routing_entry *e;
+    int i;
+
+    for (i = 0; i < s->irq_routes->nr; i++) {
+        e = &s->irq_routes->entries[i];
+        if (e->gsi == virq) {
+            s->irq_routes->nr--;
+            *e = s->irq_routes->entries[s->irq_routes->nr];
+        }
+    }
+    clear_gsi(s, virq);
+
+    kvm_irqchip_commit_routes(s);
+}
+
+static unsigned int kvm_hash_msi(uint32_t data)
+{
+    /* This is optimized for IA32 MSI layout. However, no other arch shall
+     * repeat the mistake of not providing a direct MSI injection API. */
+    return data & 0xff;
+}
+
+static void kvm_flush_dynamic_msi_routes(KVMState *s)
+{
+    KVMMSIRoute *route, *next;
+    unsigned int hash;
+
+    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
+        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
+            kvm_irqchip_release_virq(s, route->kroute.gsi);
+            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
+            g_free(route);
+        }
+    }
+}
+
+static int kvm_irqchip_get_virq(KVMState *s)
+{
+    uint32_t *word = s->used_gsi_bitmap;
+    int max_words = ALIGN(s->gsi_count, 32) / 32;
+    int i, bit;
+    bool retry = true;
+
+again:
+    /* Return the lowest unused GSI in the bitmap */
+    for (i = 0; i < max_words; i++) {
+        bit = ffs(~word[i]);
+        if (!bit) {
+            continue;
+        }
+
+        return bit - 1 + i * 32;
+    }
+    if (!s->direct_msi && retry) {
+        retry = false;
+        kvm_flush_dynamic_msi_routes(s);
+        goto again;
+    }
+    return -ENOSPC;
+
+}
+
+static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
+{
+    unsigned int hash = kvm_hash_msi(msg.data);
+    KVMMSIRoute *route;
+
+    QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
+        if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
+            route->kroute.u.msi.address_hi == (msg.address >> 32) &&
+            route->kroute.u.msi.data == msg.data) {
+            return route;
+        }
+    }
+    return NULL;
+}
+
+int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
+{
+    struct kvm_msi msi;
+    KVMMSIRoute *route;
+
+    if (s->direct_msi) {
+        msi.address_lo = (uint32_t)msg.address;
+        msi.address_hi = msg.address >> 32;
+        msi.data = msg.data;
+        msi.flags = 0;
+        memset(msi.pad, 0, sizeof(msi.pad));
+
+        return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
+    }
+
+    route = kvm_lookup_msi_route(s, msg);
+    if (!route) {
+        int virq;
+
+        virq = kvm_irqchip_get_virq(s);
+        if (virq < 0) {
+            return virq;
+        }
+
+        route = g_malloc(sizeof(KVMMSIRoute));
+        route->kroute.gsi = virq;
+        route->kroute.type = KVM_IRQ_ROUTING_MSI;
+        route->kroute.flags = 0;
+        route->kroute.u.msi.address_lo = (uint32_t)msg.address;
+        route->kroute.u.msi.address_hi = msg.address >> 32;
+        route->kroute.u.msi.data = msg.data;
+
+        kvm_add_routing_entry(s, &route->kroute);
+
+        QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
+                           entry);
+    }
+
+    assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);
+
+    return kvm_set_irq(s, route->kroute.gsi, 1);
+}
+
+int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
+{
+    struct kvm_irq_routing_entry kroute;
+    int virq;
+
+    if (!kvm_gsi_routing_enabled()) {
+        return -ENOSYS;
+    }
+
+    virq = kvm_irqchip_get_virq(s);
+    if (virq < 0) {
+        return virq;
+    }
+
+    kroute.gsi = virq;
+    kroute.type = KVM_IRQ_ROUTING_MSI;
+    kroute.flags = 0;
+    kroute.u.msi.address_lo = (uint32_t)msg.address;
+    kroute.u.msi.address_hi = msg.address >> 32;
+    kroute.u.msi.data = msg.data;
+
+    kvm_add_routing_entry(s, &kroute);
+
+    return virq;
+}
+
+int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
+{
+    struct kvm_irq_routing_entry kroute;
+
+    if (!kvm_irqchip_in_kernel()) {
+        return -ENOSYS;
+    }
+
+    kroute.gsi = virq;
+    kroute.type = KVM_IRQ_ROUTING_MSI;
+    kroute.flags = 0;
+    kroute.u.msi.address_lo = (uint32_t)msg.address;
+    kroute.u.msi.address_hi = msg.address >> 32;
+    kroute.u.msi.data = msg.data;
+
+    return kvm_update_routing_entry(s, &kroute);
+}
+
+static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
+{
+    struct kvm_irqfd irqfd = {
+        .fd = fd,
+        .gsi = virq,
+        .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
+    };
+
+    if (!kvm_irqfds_enabled()) {
+        return -ENOSYS;
+    }
+
+    return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
 }
 
 #else /* !KVM_CAP_IRQ_ROUTING */
@@ -939,8 +1162,37 @@ int kvm_irqchip_commit_routes(KVMState *s)
 static void kvm_init_irq_routing(KVMState *s)
 {
 }
+
+void kvm_irqchip_release_virq(KVMState *s, int virq)
+{
+}
+
+int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
+{
+    abort();
+}
+
+int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
+{
+    return -ENOSYS;
+}
+
+static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
+{
+    abort();
+}
 #endif /* !KVM_CAP_IRQ_ROUTING */
 
+int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, int virq)
+{
+    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), virq, true);
+}
+
+int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq)
+{
+    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), virq, false);
+}
+
 static int kvm_irqchip_create(KVMState *s)
 {
     QemuOptsList *list = qemu_find_opts("machine");
@@ -948,7 +1200,7 @@ static int kvm_irqchip_create(KVMState *s)
 
     if (QTAILQ_EMPTY(&list->head) ||
         !qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
-                           "kernel_irqchip", false) ||
+                           "kernel_irqchip", true) ||
         !kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
         return 0;
     }
@@ -959,17 +1211,37 @@ static int kvm_irqchip_create(KVMState *s)
         return ret;
     }
 
-    s->irqchip_inject_ioctl = KVM_IRQ_LINE;
-    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
-        s->irqchip_inject_ioctl = KVM_IRQ_LINE_STATUS;
-    }
     kvm_kernel_irqchip = true;
+    /* If we have an in-kernel IRQ chip then we must have asynchronous
+     * interrupt delivery (though the reverse is not necessarily true)
+     */
+    kvm_async_interrupts_allowed = true;
 
     kvm_init_irq_routing(s);
 
     return 0;
 }
 
+static int kvm_max_vcpus(KVMState *s)
+{
+    int ret;
+
+    /* Find number of supported CPUs using the recommended
+     * procedure from the kernel API documentation to cope with
+     * older kernels that may be missing capabilities.
+     */
+    ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
+    if (ret) {
+        return ret;
+    }
+    ret = kvm_check_extension(s, KVM_CAP_NR_VCPUS);
+    if (ret) {
+        return ret;
+    }
+
+    return 4;
+}
+
 int kvm_init(void)
 {
     static const char upgrade_note[] =
@@ -979,6 +1251,7 @@ int kvm_init(void)
     const KVMCapabilityInfo *missing_cap;
     int ret;
     int i;
+    int max_vcpus;
 
     s = g_malloc0(sizeof(KVMState));
 
@@ -1019,6 +1292,14 @@ int kvm_init(void)
         goto err;
     }
 
+    max_vcpus = kvm_max_vcpus(s);
+    if (smp_cpus > max_vcpus) {
+        ret = -EINVAL;
+        fprintf(stderr, "Number of SMP cpus requested (%d) exceeds max cpus "
+                "supported by KVM (%d)\n", smp_cpus, max_vcpus);
+        goto err;
+    }
+
     s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
     if (s->vmfd < 0) {
 #ifdef TARGET_S390X
@@ -1072,6 +1353,17 @@ int kvm_init(void)
     s->pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2);
 #endif
 
+#ifdef KVM_CAP_IRQ_ROUTING
+    s->direct_msi = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
+#endif
+
+    s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);
+
+    s->irq_set_ioctl = KVM_IRQ_LINE;
+    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
+        s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
+    }
+
     ret = kvm_arch_init(s);
     if (ret < 0) {
         goto err;
@@ -1083,7 +1375,8 @@ int kvm_init(void)
     }
 
     kvm_state = s;
-    memory_listener_register(&kvm_memory_listener, NULL);
+    memory_listener_register(&kvm_memory_listener, &address_space_memory);
+    memory_listener_register(&kvm_io_listener, &address_space_io);
 
     s->many_ioeventfds = kvm_check_many_ioeventfds();
 
@@ -1092,13 +1385,11 @@ int kvm_init(void)
     return 0;
 
 err:
-    if (s) {
-        if (s->vmfd >= 0) {
-            close(s->vmfd);
-        }
-        if (s->fd != -1) {
-            close(s->fd);
-        }
+    if (s->vmfd >= 0) {
+        close(s->vmfd);
+    }
+    if (s->fd != -1) {
+        close(s->fd);
     }
     g_free(s);
 
@@ -1259,8 +1550,6 @@ int kvm_cpu_exec(CPUArchState *env)
         qemu_mutex_lock_iothread();
         kvm_arch_post_run(env, run);
 
-        kvm_flush_coalesced_mmio_buffer();
-
         if (run_ret < 0) {
             if (run_ret == -EINTR || run_ret == -EAGAIN) {
                 DPRINTF("io window exit\n");
@@ -1426,13 +1715,29 @@ int kvm_has_gsi_routing(void)
 #endif
 }
 
-int kvm_allows_irq0_override(void)
+int kvm_has_intx_set_mask(void)
 {
-    return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
+    return kvm_state->intx_set_mask;
+}
+
+void *kvm_vmalloc(ram_addr_t size)
+{
+#ifdef TARGET_S390X
+    void *mem;
+
+    mem = kvm_arch_vmalloc(size);
+    if (mem) {
+        return mem;
+    }
+#endif
+    return qemu_vmalloc(size);
 }
 
 void kvm_setup_guest_memory(void *start, size_t size)
 {
+#ifdef CONFIG_VALGRIND_H
+    VALGRIND_MAKE_MEM_DEFINED(start, size);
+#endif
     if (!kvm_has_sync_mmu()) {
         int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);