1 /*
2 * PowerPC implementation of KVM hooks
3 *
4 * Copyright IBM Corp. 2007
5 * Copyright (C) 2011 Freescale Semiconductor, Inc.
6 *
7 * Authors:
8 * Jerone Young <jyoung5@us.ibm.com>
9 * Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
10 * Hollis Blanchard <hollisb@us.ibm.com>
11 *
12 * This work is licensed under the terms of the GNU GPL, version 2 or later.
13 * See the COPYING file in the top-level directory.
14 *
15 */
16
17 #include "qemu/osdep.h"
18 #include <dirent.h>
19 #include <sys/ioctl.h>
20 #include <sys/vfs.h>
21
22 #include <linux/kvm.h>
23
24 #include "qemu-common.h"
25 #include "qemu/error-report.h"
26 #include "cpu.h"
27 #include "qemu/timer.h"
28 #include "sysemu/sysemu.h"
29 #include "sysemu/kvm.h"
30 #include "kvm_ppc.h"
31 #include "sysemu/cpus.h"
32 #include "sysemu/device_tree.h"
33 #include "mmu-hash64.h"
34
35 #include "hw/sysbus.h"
36 #include "hw/ppc/spapr.h"
37 #include "hw/ppc/spapr_vio.h"
38 #include "hw/ppc/ppc.h"
39 #include "sysemu/watchdog.h"
40 #include "trace.h"
41 #include "exec/gdbstub.h"
42 #include "exec/memattrs.h"
43 #include "sysemu/hostmem.h"
44 #include "qemu/cutils.h"
45
46 //#define DEBUG_KVM
47
48 #ifdef DEBUG_KVM
49 #define DPRINTF(fmt, ...) \
50 do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
51 #else
52 #define DPRINTF(fmt, ...) \
53 do { } while (0)
54 #endif
55
56 #define PROC_DEVTREE_CPU "/proc/device-tree/cpus/"
57
58 const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
59 KVM_CAP_LAST_INFO
60 };
61
62 static int cap_interrupt_unset = false;
63 static int cap_interrupt_level = false;
64 static int cap_segstate;
65 static int cap_booke_sregs;
66 static int cap_ppc_smt;
67 static int cap_ppc_rma;
68 static int cap_spapr_tce;
69 static int cap_spapr_multitce;
70 static int cap_spapr_vfio;
71 static int cap_hior;
72 static int cap_one_reg;
73 static int cap_epr;
74 static int cap_ppc_watchdog;
75 static int cap_papr;
76 static int cap_htab_fd;
77 static int cap_fixup_hcalls;
78
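/* Software breakpoint instruction provided by the kernel; it is fetched via
 * the KVM_REG_PPC_DEBUG_INST one-reg in kvm_arch_init_vcpu() and patched
 * into guest memory by kvm_arch_insert_sw_breakpoint() below. */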
79 static uint32_t debug_inst_opcode;
80
81 /* XXX We have a race condition where we actually have a level triggered
82 * interrupt, but the infrastructure can't expose that yet, so the guest
83 * takes but ignores it, goes to sleep and never gets notified that there's
84 * still an interrupt pending.
85 *
86 * As a quick workaround, let's just wake up again 20 ms after we injected
88 * an interrupt. That way we can ensure that we're always reinjecting
88 * interrupts in case the guest swallowed them.
89 */
90 static QEMUTimer *idle_timer;
91
92 static void kvm_kick_cpu(void *opaque)
93 {
94 PowerPCCPU *cpu = opaque;
95
96 qemu_cpu_kick(CPU(cpu));
97 }
98
99 static int kvm_ppc_register_host_cpu_type(void);
100
101 int kvm_arch_init(MachineState *ms, KVMState *s)
102 {
103 cap_interrupt_unset = kvm_check_extension(s, KVM_CAP_PPC_UNSET_IRQ);
104 cap_interrupt_level = kvm_check_extension(s, KVM_CAP_PPC_IRQ_LEVEL);
105 cap_segstate = kvm_check_extension(s, KVM_CAP_PPC_SEGSTATE);
106 cap_booke_sregs = kvm_check_extension(s, KVM_CAP_PPC_BOOKE_SREGS);
107 cap_ppc_smt = kvm_check_extension(s, KVM_CAP_PPC_SMT);
108 cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA);
109 cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE);
110 cap_spapr_multitce = kvm_check_extension(s, KVM_CAP_SPAPR_MULTITCE);
111 cap_spapr_vfio = false;
112 cap_one_reg = kvm_check_extension(s, KVM_CAP_ONE_REG);
113 cap_hior = kvm_check_extension(s, KVM_CAP_PPC_HIOR);
114 cap_epr = kvm_check_extension(s, KVM_CAP_PPC_EPR);
115 cap_ppc_watchdog = kvm_check_extension(s, KVM_CAP_PPC_BOOKE_WATCHDOG);
116     /* Note: we don't set cap_papr here, because this capability is
117      * only activated later, by kvmppc_set_papr() */
118 cap_htab_fd = kvm_check_extension(s, KVM_CAP_PPC_HTAB_FD);
119 cap_fixup_hcalls = kvm_check_extension(s, KVM_CAP_PPC_FIXUP_HCALL);
120
121 if (!cap_interrupt_level) {
122 fprintf(stderr, "KVM: Couldn't find level irq capability. Expect the "
123 "VM to stall at times!\n");
124 }
125
126 kvm_ppc_register_host_cpu_type();
127
128 return 0;
129 }
130
131 static int kvm_arch_sync_sregs(PowerPCCPU *cpu)
132 {
133 CPUPPCState *cenv = &cpu->env;
134 CPUState *cs = CPU(cpu);
135 struct kvm_sregs sregs;
136 int ret;
137
138 if (cenv->excp_model == POWERPC_EXCP_BOOKE) {
139 /* What we're really trying to say is "if we're on BookE, we use
140 the native PVR for now". This is the only sane way to check
141            it though, so we potentially mislead users into thinking they can run
142            BookE guests on BookS. Let's hope nobody dares to try :) */
143 return 0;
144 } else {
145 if (!cap_segstate) {
146 fprintf(stderr, "kvm error: missing PVR setting capability\n");
147 return -ENOSYS;
148 }
149 }
150
151 ret = kvm_vcpu_ioctl(cs, KVM_GET_SREGS, &sregs);
152 if (ret) {
153 return ret;
154 }
155
156 sregs.pvr = cenv->spr[SPR_PVR];
157 return kvm_vcpu_ioctl(cs, KVM_SET_SREGS, &sregs);
158 }
159
160 /* Set up a shared TLB array with KVM */
161 static int kvm_booke206_tlb_init(PowerPCCPU *cpu)
162 {
163 CPUPPCState *env = &cpu->env;
164 CPUState *cs = CPU(cpu);
165 struct kvm_book3e_206_tlb_params params = {};
166 struct kvm_config_tlb cfg = {};
167 unsigned int entries = 0;
168 int ret, i;
169
170 if (!kvm_enabled() ||
171 !kvm_check_extension(cs->kvm_state, KVM_CAP_SW_TLB)) {
172 return 0;
173 }
174
175 assert(ARRAY_SIZE(params.tlb_sizes) == BOOKE206_MAX_TLBN);
176
177 for (i = 0; i < BOOKE206_MAX_TLBN; i++) {
178 params.tlb_sizes[i] = booke206_tlb_size(env, i);
179 params.tlb_ways[i] = booke206_tlb_ways(env, i);
180 entries += params.tlb_sizes[i];
181 }
182
183 assert(entries == env->nb_tlb);
184 assert(sizeof(struct kvm_book3e_206_tlb_entry) == sizeof(ppcmas_tlb_t));
185
186 env->tlb_dirty = true;
187
188 cfg.array = (uintptr_t)env->tlb.tlbm;
189 cfg.array_len = sizeof(ppcmas_tlb_t) * entries;
190 cfg.params = (uintptr_t)&params;
191 cfg.mmu_type = KVM_MMU_FSL_BOOKE_NOHV;
192
193 ret = kvm_vcpu_enable_cap(cs, KVM_CAP_SW_TLB, 0, (uintptr_t)&cfg);
194 if (ret < 0) {
195 fprintf(stderr, "%s: couldn't enable KVM_CAP_SW_TLB: %s\n",
196 __func__, strerror(-ret));
197 return ret;
198 }
199
200 env->kvm_sw_tlb = true;
201 return 0;
202 }
203
204
205 #if defined(TARGET_PPC64)
206 static void kvm_get_fallback_smmu_info(PowerPCCPU *cpu,
207 struct kvm_ppc_smmu_info *info)
208 {
209 CPUPPCState *env = &cpu->env;
210 CPUState *cs = CPU(cpu);
211
212 memset(info, 0, sizeof(*info));
213
214     /* We don't have the new KVM_PPC_GET_SMMU_INFO ioctl, so we
215 * need to "guess" what the supported page sizes are.
216 *
217 * For that to work we make a few assumptions:
218 *
219 * - If KVM_CAP_PPC_GET_PVINFO is supported we are running "PR"
220 * KVM which only supports 4K and 16M pages, but supports them
221      *   regardless of the backing store characteristics. We also don't
222 * support 1T segments.
223 *
224 * This is safe as if HV KVM ever supports that capability or PR
225      *   KVM grows support for more page/segment sizes, those versions
226      *   will have implemented KVM_CAP_PPC_GET_SMMU_INFO and thus we
227      *   will not hit this fallback.
228 *
229 * - Else we are running HV KVM. This means we only support page
230 * sizes that fit in the backing store. Additionally we only
231      *   advertise 64K pages if the processor is ARCH 2.06 and we assume
232 * P7 encodings for the SLB and hash table. Here too, we assume
233 * support for any newer processor will mean a kernel that
234 * implements KVM_CAP_PPC_GET_SMMU_INFO and thus doesn't hit
235 * this fallback.
236 */
237 if (kvm_check_extension(cs->kvm_state, KVM_CAP_PPC_GET_PVINFO)) {
238 /* No flags */
239 info->flags = 0;
240 info->slb_size = 64;
241
242 /* Standard 4k base page size segment */
243 info->sps[0].page_shift = 12;
244 info->sps[0].slb_enc = 0;
245 info->sps[0].enc[0].page_shift = 12;
246 info->sps[0].enc[0].pte_enc = 0;
247
248 /* Standard 16M large page size segment */
249 info->sps[1].page_shift = 24;
250 info->sps[1].slb_enc = SLB_VSID_L;
251 info->sps[1].enc[0].page_shift = 24;
252 info->sps[1].enc[0].pte_enc = 0;
253 } else {
254 int i = 0;
255
256 /* HV KVM has backing store size restrictions */
257 info->flags = KVM_PPC_PAGE_SIZES_REAL;
258
259 if (env->mmu_model & POWERPC_MMU_1TSEG) {
260 info->flags |= KVM_PPC_1T_SEGMENTS;
261 }
262
263 if (env->mmu_model == POWERPC_MMU_2_06 ||
264 env->mmu_model == POWERPC_MMU_2_07) {
265 info->slb_size = 32;
266 } else {
267 info->slb_size = 64;
268 }
269
270 /* Standard 4k base page size segment */
271 info->sps[i].page_shift = 12;
272 info->sps[i].slb_enc = 0;
273 info->sps[i].enc[0].page_shift = 12;
274 info->sps[i].enc[0].pte_enc = 0;
275 i++;
276
277 /* 64K on MMU 2.06 and later */
278 if (env->mmu_model == POWERPC_MMU_2_06 ||
279 env->mmu_model == POWERPC_MMU_2_07) {
280 info->sps[i].page_shift = 16;
281 info->sps[i].slb_enc = 0x110;
282 info->sps[i].enc[0].page_shift = 16;
283 info->sps[i].enc[0].pte_enc = 1;
284 i++;
285 }
286
287 /* Standard 16M large page size segment */
288 info->sps[i].page_shift = 24;
289 info->sps[i].slb_enc = SLB_VSID_L;
290 info->sps[i].enc[0].page_shift = 24;
291 info->sps[i].enc[0].pte_enc = 0;
292 }
293 }
294
295 static void kvm_get_smmu_info(PowerPCCPU *cpu, struct kvm_ppc_smmu_info *info)
296 {
297 CPUState *cs = CPU(cpu);
298 int ret;
299
300 if (kvm_check_extension(cs->kvm_state, KVM_CAP_PPC_GET_SMMU_INFO)) {
301 ret = kvm_vm_ioctl(cs->kvm_state, KVM_PPC_GET_SMMU_INFO, info);
302 if (ret == 0) {
303 return;
304 }
305 }
306
307 kvm_get_fallback_smmu_info(cpu, info);
308 }
309
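/* Return the page size backing mem_path: the hugetlbfs block size when the
 * path lives on hugetlbfs, otherwise the normal host page size. */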
310 static long gethugepagesize(const char *mem_path)
311 {
312 struct statfs fs;
313 int ret;
314
315 do {
316 ret = statfs(mem_path, &fs);
317 } while (ret != 0 && errno == EINTR);
318
319 if (ret != 0) {
320 fprintf(stderr, "Couldn't statfs() memory path: %s\n",
321 strerror(errno));
322 exit(1);
323 }
324
325 #define HUGETLBFS_MAGIC 0x958458f6
326
327 if (fs.f_type != HUGETLBFS_MAGIC) {
328 /* Explicit mempath, but it's ordinary pages */
329 return getpagesize();
330 }
331
332     /* It's hugetlbfs, so return the huge page size */
333 return fs.f_bsize;
334 }
335
336 /*
337 * FIXME TOCTTOU: this iterates over memory backends' mem-path, which
338 * may or may not name the same files / on the same filesystem now as
339 * when we actually open and map them. Iterate over the file
340 * descriptors instead, and use qemu_fd_getpagesize().
341 */
342 static int find_max_supported_pagesize(Object *obj, void *opaque)
343 {
344 char *mem_path;
345 long *hpsize_min = opaque;
346
347 if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
348 mem_path = object_property_get_str(obj, "mem-path", NULL);
349 if (mem_path) {
350 long hpsize = gethugepagesize(mem_path);
351 if (hpsize < *hpsize_min) {
352 *hpsize_min = hpsize;
353 }
354 } else {
355 *hpsize_min = getpagesize();
356 }
357 }
358
359 return 0;
360 }
361
362 static long getrampagesize(void)
363 {
364 long hpsize = LONG_MAX;
365 Object *memdev_root;
366
367 if (mem_path) {
368 return gethugepagesize(mem_path);
369 }
370
371     /* It's possible we have memory-backend objects with
372      * hugepage-backed RAM. These may get mapped into system
373      * address space via -numa parameters or memory hotplug
374      * hooks. We want to take these into account, but we
375 * also want to make sure these supported hugepage
376 * sizes are applicable across the entire range of memory
377 * we may boot from, so we take the min across all
378 * backends, and assume normal pages in cases where a
379 * backend isn't backed by hugepages.
380 */
381 memdev_root = object_resolve_path("/objects", NULL);
382 if (!memdev_root) {
383 return getpagesize();
384 }
385
386 object_child_foreach(memdev_root, find_max_supported_pagesize, &hpsize);
387
388 return (hpsize == LONG_MAX) ? getpagesize() : hpsize;
389 }
390
391 static bool kvm_valid_page_size(uint32_t flags, long rampgsize, uint32_t shift)
392 {
393 if (!(flags & KVM_PPC_PAGE_SIZES_REAL)) {
394 return true;
395 }
396
397 return (1ul << shift) <= rampgsize;
398 }
399
400 static void kvm_fixup_page_sizes(PowerPCCPU *cpu)
401 {
402 static struct kvm_ppc_smmu_info smmu_info;
403 static bool has_smmu_info;
404 CPUPPCState *env = &cpu->env;
405 long rampagesize;
406 int iq, ik, jq, jk;
407
408 /* We only handle page sizes for 64-bit server guests for now */
409 if (!(env->mmu_model & POWERPC_MMU_64)) {
410 return;
411 }
412
413 /* Collect MMU info from kernel if not already */
414 if (!has_smmu_info) {
415 kvm_get_smmu_info(cpu, &smmu_info);
416 has_smmu_info = true;
417 }
418
419 rampagesize = getrampagesize();
420
421 /* Convert to QEMU form */
422 memset(&env->sps, 0, sizeof(env->sps));
423
424 /* If we have HV KVM, we need to forbid CI large pages if our
425 * host page size is smaller than 64K.
426 */
427 if (smmu_info.flags & KVM_PPC_PAGE_SIZES_REAL) {
428 env->ci_large_pages = getpagesize() >= 0x10000;
429 }
430
431 /*
432 * XXX This loop should be an entry wide AND of the capabilities that
433 * the selected CPU has with the capabilities that KVM supports.
434 */
435 for (ik = iq = 0; ik < KVM_PPC_PAGE_SIZES_MAX_SZ; ik++) {
436 struct ppc_one_seg_page_size *qsps = &env->sps.sps[iq];
437 struct kvm_ppc_one_seg_page_size *ksps = &smmu_info.sps[ik];
438
439 if (!kvm_valid_page_size(smmu_info.flags, rampagesize,
440 ksps->page_shift)) {
441 continue;
442 }
443 qsps->page_shift = ksps->page_shift;
444 qsps->slb_enc = ksps->slb_enc;
445 for (jk = jq = 0; jk < KVM_PPC_PAGE_SIZES_MAX_SZ; jk++) {
446 if (!kvm_valid_page_size(smmu_info.flags, rampagesize,
447 ksps->enc[jk].page_shift)) {
448 continue;
449 }
450 qsps->enc[jq].page_shift = ksps->enc[jk].page_shift;
451 qsps->enc[jq].pte_enc = ksps->enc[jk].pte_enc;
452 if (++jq >= PPC_PAGE_SIZES_MAX_SZ) {
453 break;
454 }
455 }
456 if (++iq >= PPC_PAGE_SIZES_MAX_SZ) {
457 break;
458 }
459 }
460 env->slb_nr = smmu_info.slb_size;
461 if (!(smmu_info.flags & KVM_PPC_1T_SEGMENTS)) {
462 env->mmu_model &= ~POWERPC_MMU_1TSEG;
463 }
464 }
465 #else /* defined (TARGET_PPC64) */
466
467 static inline void kvm_fixup_page_sizes(PowerPCCPU *cpu)
468 {
469 }
470
471 #endif /* !defined (TARGET_PPC64) */
472
473 unsigned long kvm_arch_vcpu_id(CPUState *cpu)
474 {
475 return ppc_get_vcpu_dt_id(POWERPC_CPU(cpu));
476 }
477
478 /* e500 supports 2 h/w breakpoints and 2 watchpoints.
479 * book3s supports only 1 watchpoint, so array size
480 * of 4 is sufficient for now.
481 */
482 #define MAX_HW_BKPTS 4
483
484 static struct HWBreakpoint {
485 target_ulong addr;
486 int type;
487 } hw_debug_points[MAX_HW_BKPTS];
488
489 static CPUWatchpoint hw_watchpoint;
490
491 /* By default, no hardware breakpoints or watchpoints are supported */
492 static int max_hw_breakpoint;
493 static int max_hw_watchpoint;
494 static int nb_hw_breakpoint;
495 static int nb_hw_watchpoint;
496
497 static void kvmppc_hw_debug_points_init(CPUPPCState *cenv)
498 {
499 if (cenv->excp_model == POWERPC_EXCP_BOOKE) {
500 max_hw_breakpoint = 2;
501 max_hw_watchpoint = 2;
502 }
503
504 if ((max_hw_breakpoint + max_hw_watchpoint) > MAX_HW_BKPTS) {
505 fprintf(stderr, "Error initializing h/w breakpoints\n");
506 return;
507 }
508 }
509
510 int kvm_arch_init_vcpu(CPUState *cs)
511 {
512 PowerPCCPU *cpu = POWERPC_CPU(cs);
513 CPUPPCState *cenv = &cpu->env;
514 int ret;
515
516 /* Gather server mmu info from KVM and update the CPU state */
517 kvm_fixup_page_sizes(cpu);
518
519 /* Synchronize sregs with kvm */
520 ret = kvm_arch_sync_sregs(cpu);
521 if (ret) {
522 if (ret == -EINVAL) {
523 error_report("Register sync failed... If you're using kvm-hv.ko,"
524 " only \"-cpu host\" is possible");
525 }
526 return ret;
527 }
528
529 idle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, kvm_kick_cpu, cpu);
530
531 /* Some targets support access to KVM's guest TLB. */
532 switch (cenv->mmu_model) {
533 case POWERPC_MMU_BOOKE206:
534 ret = kvm_booke206_tlb_init(cpu);
535 break;
536 default:
537 break;
538 }
539
540 kvm_get_one_reg(cs, KVM_REG_PPC_DEBUG_INST, &debug_inst_opcode);
541 kvmppc_hw_debug_points_init(cenv);
542
543 return ret;
544 }
545
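/* Write QEMU's software TLB image back to the kernel: every entry is marked
 * dirty (all-ones bitmap) and the whole array is handed to KVM_DIRTY_TLB. */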
546 static void kvm_sw_tlb_put(PowerPCCPU *cpu)
547 {
548 CPUPPCState *env = &cpu->env;
549 CPUState *cs = CPU(cpu);
550 struct kvm_dirty_tlb dirty_tlb;
551 unsigned char *bitmap;
552 int ret;
553
554 if (!env->kvm_sw_tlb) {
555 return;
556 }
557
558 bitmap = g_malloc((env->nb_tlb + 7) / 8);
559 memset(bitmap, 0xFF, (env->nb_tlb + 7) / 8);
560
561 dirty_tlb.bitmap = (uintptr_t)bitmap;
562 dirty_tlb.num_dirty = env->nb_tlb;
563
564 ret = kvm_vcpu_ioctl(cs, KVM_DIRTY_TLB, &dirty_tlb);
565 if (ret) {
566 fprintf(stderr, "%s: KVM_DIRTY_TLB: %s\n",
567 __func__, strerror(-ret));
568 }
569
570 g_free(bitmap);
571 }
572
573 static void kvm_get_one_spr(CPUState *cs, uint64_t id, int spr)
574 {
575 PowerPCCPU *cpu = POWERPC_CPU(cs);
576 CPUPPCState *env = &cpu->env;
577 union {
578 uint32_t u32;
579 uint64_t u64;
580 } val;
581 struct kvm_one_reg reg = {
582 .id = id,
583 .addr = (uintptr_t) &val,
584 };
585 int ret;
586
587 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
588 if (ret != 0) {
589 trace_kvm_failed_spr_get(spr, strerror(errno));
590 } else {
591 switch (id & KVM_REG_SIZE_MASK) {
592 case KVM_REG_SIZE_U32:
593 env->spr[spr] = val.u32;
594 break;
595
596 case KVM_REG_SIZE_U64:
597 env->spr[spr] = val.u64;
598 break;
599
600 default:
601 /* Don't handle this size yet */
602 abort();
603 }
604 }
605 }
606
607 static void kvm_put_one_spr(CPUState *cs, uint64_t id, int spr)
608 {
609 PowerPCCPU *cpu = POWERPC_CPU(cs);
610 CPUPPCState *env = &cpu->env;
611 union {
612 uint32_t u32;
613 uint64_t u64;
614 } val;
615 struct kvm_one_reg reg = {
616 .id = id,
617 .addr = (uintptr_t) &val,
618 };
619 int ret;
620
621 switch (id & KVM_REG_SIZE_MASK) {
622 case KVM_REG_SIZE_U32:
623 val.u32 = env->spr[spr];
624 break;
625
626 case KVM_REG_SIZE_U64:
627 val.u64 = env->spr[spr];
628 break;
629
630 default:
631 /* Don't handle this size yet */
632 abort();
633 }
634
635 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
636 if (ret != 0) {
637 trace_kvm_failed_spr_set(spr, strerror(errno));
638 }
639 }
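
/* Illustrative use of the helpers above: the register id encodes the
 * transfer size along with the ONE_REG identifier, so for example
 *
 *     kvm_put_one_spr(cs, KVM_REG_PPC_HIOR, SPR_HIOR);
 *
 * (as done in kvm_arch_put_registers() when cap_hior is set) pushes
 * env->spr[SPR_HIOR] into the kernel. */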
640
641 static int kvm_put_fp(CPUState *cs)
642 {
643 PowerPCCPU *cpu = POWERPC_CPU(cs);
644 CPUPPCState *env = &cpu->env;
645 struct kvm_one_reg reg;
646 int i;
647 int ret;
648
649 if (env->insns_flags & PPC_FLOAT) {
650 uint64_t fpscr = env->fpscr;
651 bool vsx = !!(env->insns_flags2 & PPC2_VSX);
652
653 reg.id = KVM_REG_PPC_FPSCR;
654 reg.addr = (uintptr_t)&fpscr;
655 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
656 if (ret < 0) {
657 DPRINTF("Unable to set FPSCR to KVM: %s\n", strerror(errno));
658 return ret;
659 }
660
661 for (i = 0; i < 32; i++) {
662 uint64_t vsr[2];
663
664 #ifdef HOST_WORDS_BIGENDIAN
665 vsr[0] = float64_val(env->fpr[i]);
666 vsr[1] = env->vsr[i];
667 #else
668 vsr[0] = env->vsr[i];
669 vsr[1] = float64_val(env->fpr[i]);
670 #endif
671 reg.addr = (uintptr_t) &vsr;
672 reg.id = vsx ? KVM_REG_PPC_VSR(i) : KVM_REG_PPC_FPR(i);
673
674 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
675 if (ret < 0) {
676 DPRINTF("Unable to set %s%d to KVM: %s\n", vsx ? "VSR" : "FPR",
677 i, strerror(errno));
678 return ret;
679 }
680 }
681 }
682
683 if (env->insns_flags & PPC_ALTIVEC) {
684 reg.id = KVM_REG_PPC_VSCR;
685 reg.addr = (uintptr_t)&env->vscr;
686 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
687 if (ret < 0) {
688 DPRINTF("Unable to set VSCR to KVM: %s\n", strerror(errno));
689 return ret;
690 }
691
692 for (i = 0; i < 32; i++) {
693 reg.id = KVM_REG_PPC_VR(i);
694 reg.addr = (uintptr_t)&env->avr[i];
695 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
696 if (ret < 0) {
697 DPRINTF("Unable to set VR%d to KVM: %s\n", i, strerror(errno));
698 return ret;
699 }
700 }
701 }
702
703 return 0;
704 }
705
706 static int kvm_get_fp(CPUState *cs)
707 {
708 PowerPCCPU *cpu = POWERPC_CPU(cs);
709 CPUPPCState *env = &cpu->env;
710 struct kvm_one_reg reg;
711 int i;
712 int ret;
713
714 if (env->insns_flags & PPC_FLOAT) {
715 uint64_t fpscr;
716 bool vsx = !!(env->insns_flags2 & PPC2_VSX);
717
718 reg.id = KVM_REG_PPC_FPSCR;
719 reg.addr = (uintptr_t)&fpscr;
720 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
721 if (ret < 0) {
722 DPRINTF("Unable to get FPSCR from KVM: %s\n", strerror(errno));
723 return ret;
724 } else {
725 env->fpscr = fpscr;
726 }
727
728 for (i = 0; i < 32; i++) {
729 uint64_t vsr[2];
730
731 reg.addr = (uintptr_t) &vsr;
732 reg.id = vsx ? KVM_REG_PPC_VSR(i) : KVM_REG_PPC_FPR(i);
733
734 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
735 if (ret < 0) {
736 DPRINTF("Unable to get %s%d from KVM: %s\n",
737 vsx ? "VSR" : "FPR", i, strerror(errno));
738 return ret;
739 } else {
740 #ifdef HOST_WORDS_BIGENDIAN
741 env->fpr[i] = vsr[0];
742 if (vsx) {
743 env->vsr[i] = vsr[1];
744 }
745 #else
746 env->fpr[i] = vsr[1];
747 if (vsx) {
748 env->vsr[i] = vsr[0];
749 }
750 #endif
751 }
752 }
753 }
754
755 if (env->insns_flags & PPC_ALTIVEC) {
756 reg.id = KVM_REG_PPC_VSCR;
757 reg.addr = (uintptr_t)&env->vscr;
758 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
759 if (ret < 0) {
760 DPRINTF("Unable to get VSCR from KVM: %s\n", strerror(errno));
761 return ret;
762 }
763
764 for (i = 0; i < 32; i++) {
765 reg.id = KVM_REG_PPC_VR(i);
766 reg.addr = (uintptr_t)&env->avr[i];
767 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
768 if (ret < 0) {
769 DPRINTF("Unable to get VR%d from KVM: %s\n",
770 i, strerror(errno));
771 return ret;
772 }
773 }
774 }
775
776 return 0;
777 }
778
779 #if defined(TARGET_PPC64)
780 static int kvm_get_vpa(CPUState *cs)
781 {
782 PowerPCCPU *cpu = POWERPC_CPU(cs);
783 CPUPPCState *env = &cpu->env;
784 struct kvm_one_reg reg;
785 int ret;
786
787 reg.id = KVM_REG_PPC_VPA_ADDR;
788 reg.addr = (uintptr_t)&env->vpa_addr;
789 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
790 if (ret < 0) {
791 DPRINTF("Unable to get VPA address from KVM: %s\n", strerror(errno));
792 return ret;
793 }
794
795 assert((uintptr_t)&env->slb_shadow_size
796 == ((uintptr_t)&env->slb_shadow_addr + 8));
797 reg.id = KVM_REG_PPC_VPA_SLB;
798 reg.addr = (uintptr_t)&env->slb_shadow_addr;
799 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
800 if (ret < 0) {
801 DPRINTF("Unable to get SLB shadow state from KVM: %s\n",
802 strerror(errno));
803 return ret;
804 }
805
806 assert((uintptr_t)&env->dtl_size == ((uintptr_t)&env->dtl_addr + 8));
807 reg.id = KVM_REG_PPC_VPA_DTL;
808 reg.addr = (uintptr_t)&env->dtl_addr;
809 ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
810 if (ret < 0) {
811 DPRINTF("Unable to get dispatch trace log state from KVM: %s\n",
812 strerror(errno));
813 return ret;
814 }
815
816 return 0;
817 }
818
819 static int kvm_put_vpa(CPUState *cs)
820 {
821 PowerPCCPU *cpu = POWERPC_CPU(cs);
822 CPUPPCState *env = &cpu->env;
823 struct kvm_one_reg reg;
824 int ret;
825
826 /* SLB shadow or DTL can't be registered unless a master VPA is
827 * registered. That means when restoring state, if a VPA *is*
828 * registered, we need to set that up first. If not, we need to
829 * deregister the others before deregistering the master VPA */
830 assert(env->vpa_addr || !(env->slb_shadow_addr || env->dtl_addr));
831
832 if (env->vpa_addr) {
833 reg.id = KVM_REG_PPC_VPA_ADDR;
834 reg.addr = (uintptr_t)&env->vpa_addr;
835 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
836 if (ret < 0) {
837 DPRINTF("Unable to set VPA address to KVM: %s\n", strerror(errno));
838 return ret;
839 }
840 }
841
842 assert((uintptr_t)&env->slb_shadow_size
843 == ((uintptr_t)&env->slb_shadow_addr + 8));
844 reg.id = KVM_REG_PPC_VPA_SLB;
845 reg.addr = (uintptr_t)&env->slb_shadow_addr;
846 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
847 if (ret < 0) {
848 DPRINTF("Unable to set SLB shadow state to KVM: %s\n", strerror(errno));
849 return ret;
850 }
851
852 assert((uintptr_t)&env->dtl_size == ((uintptr_t)&env->dtl_addr + 8));
853 reg.id = KVM_REG_PPC_VPA_DTL;
854 reg.addr = (uintptr_t)&env->dtl_addr;
855 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
856 if (ret < 0) {
857 DPRINTF("Unable to set dispatch trace log state to KVM: %s\n",
858 strerror(errno));
859 return ret;
860 }
861
862 if (!env->vpa_addr) {
863 reg.id = KVM_REG_PPC_VPA_ADDR;
864 reg.addr = (uintptr_t)&env->vpa_addr;
865 ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
866 if (ret < 0) {
867 DPRINTF("Unable to set VPA address to KVM: %s\n", strerror(errno));
868 return ret;
869 }
870 }
871
872 return 0;
873 }
874 #endif /* TARGET_PPC64 */
875
876 int kvmppc_put_books_sregs(PowerPCCPU *cpu)
877 {
878 CPUPPCState *env = &cpu->env;
879 struct kvm_sregs sregs;
880 int i;
881
882 sregs.pvr = env->spr[SPR_PVR];
883
884 sregs.u.s.sdr1 = env->spr[SPR_SDR1];
885
886 /* Sync SLB */
887 #ifdef TARGET_PPC64
888 for (i = 0; i < ARRAY_SIZE(env->slb); i++) {
889 sregs.u.s.ppc64.slb[i].slbe = env->slb[i].esid;
890 if (env->slb[i].esid & SLB_ESID_V) {
891 sregs.u.s.ppc64.slb[i].slbe |= i;
892 }
893 sregs.u.s.ppc64.slb[i].slbv = env->slb[i].vsid;
894 }
895 #endif
896
897 /* Sync SRs */
898 for (i = 0; i < 16; i++) {
899 sregs.u.s.ppc32.sr[i] = env->sr[i];
900 }
901
902 /* Sync BATs */
903 for (i = 0; i < 8; i++) {
904         /* Beware. We have to swap the upper and lower 32-bit halves here */
905 sregs.u.s.ppc32.dbat[i] = ((uint64_t)env->DBAT[0][i] << 32)
906 | env->DBAT[1][i];
907 sregs.u.s.ppc32.ibat[i] = ((uint64_t)env->IBAT[0][i] << 32)
908 | env->IBAT[1][i];
909 }
910
911 return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_SREGS, &sregs);
912 }
913
914 int kvm_arch_put_registers(CPUState *cs, int level)
915 {
916 PowerPCCPU *cpu = POWERPC_CPU(cs);
917 CPUPPCState *env = &cpu->env;
918 struct kvm_regs regs;
919 int ret;
920 int i;
921
922 ret = kvm_vcpu_ioctl(cs, KVM_GET_REGS, &regs);
923 if (ret < 0) {
924 return ret;
925 }
926
927 regs.ctr = env->ctr;
928 regs.lr = env->lr;
929 regs.xer = cpu_read_xer(env);
930 regs.msr = env->msr;
931 regs.pc = env->nip;
932
933 regs.srr0 = env->spr[SPR_SRR0];
934 regs.srr1 = env->spr[SPR_SRR1];
935
936 regs.sprg0 = env->spr[SPR_SPRG0];
937 regs.sprg1 = env->spr[SPR_SPRG1];
938 regs.sprg2 = env->spr[SPR_SPRG2];
939 regs.sprg3 = env->spr[SPR_SPRG3];
940 regs.sprg4 = env->spr[SPR_SPRG4];
941 regs.sprg5 = env->spr[SPR_SPRG5];
942 regs.sprg6 = env->spr[SPR_SPRG6];
943 regs.sprg7 = env->spr[SPR_SPRG7];
944
945 regs.pid = env->spr[SPR_BOOKE_PID];
946
947     for (i = 0; i < 32; i++)
948 regs.gpr[i] = env->gpr[i];
949
950 regs.cr = 0;
951 for (i = 0; i < 8; i++) {
952 regs.cr |= (env->crf[i] & 15) << (4 * (7 - i));
953 }
954
955 ret = kvm_vcpu_ioctl(cs, KVM_SET_REGS, &regs);
956 if (ret < 0)
957 return ret;
958
959 kvm_put_fp(cs);
960
961 if (env->tlb_dirty) {
962 kvm_sw_tlb_put(cpu);
963 env->tlb_dirty = false;
964 }
965
966 if (cap_segstate && (level >= KVM_PUT_RESET_STATE)) {
967 ret = kvmppc_put_books_sregs(cpu);
968 if (ret < 0) {
969 return ret;
970 }
971 }
972
973 if (cap_hior && (level >= KVM_PUT_RESET_STATE)) {
974 kvm_put_one_spr(cs, KVM_REG_PPC_HIOR, SPR_HIOR);
975 }
976
977 if (cap_one_reg) {
978 int i;
979
980         /* We deliberately ignore errors here: for kernels which have
981          * the ONE_REG calls but don't support the specific
982          * registers, there's a reasonable chance things will still
983 * work, at least until we try to migrate. */
984 for (i = 0; i < 1024; i++) {
985 uint64_t id = env->spr_cb[i].one_reg_id;
986
987 if (id != 0) {
988 kvm_put_one_spr(cs, id, i);
989 }
990 }
991
992 #ifdef TARGET_PPC64
993 if (msr_ts) {
994 for (i = 0; i < ARRAY_SIZE(env->tm_gpr); i++) {
995 kvm_set_one_reg(cs, KVM_REG_PPC_TM_GPR(i), &env->tm_gpr[i]);
996 }
997 for (i = 0; i < ARRAY_SIZE(env->tm_vsr); i++) {
998 kvm_set_one_reg(cs, KVM_REG_PPC_TM_VSR(i), &env->tm_vsr[i]);
999 }
1000 kvm_set_one_reg(cs, KVM_REG_PPC_TM_CR, &env->tm_cr);
1001 kvm_set_one_reg(cs, KVM_REG_PPC_TM_LR, &env->tm_lr);
1002 kvm_set_one_reg(cs, KVM_REG_PPC_TM_CTR, &env->tm_ctr);
1003 kvm_set_one_reg(cs, KVM_REG_PPC_TM_FPSCR, &env->tm_fpscr);
1004 kvm_set_one_reg(cs, KVM_REG_PPC_TM_AMR, &env->tm_amr);
1005 kvm_set_one_reg(cs, KVM_REG_PPC_TM_PPR, &env->tm_ppr);
1006 kvm_set_one_reg(cs, KVM_REG_PPC_TM_VRSAVE, &env->tm_vrsave);
1007 kvm_set_one_reg(cs, KVM_REG_PPC_TM_VSCR, &env->tm_vscr);
1008 kvm_set_one_reg(cs, KVM_REG_PPC_TM_DSCR, &env->tm_dscr);
1009 kvm_set_one_reg(cs, KVM_REG_PPC_TM_TAR, &env->tm_tar);
1010 }
1011
1012 if (cap_papr) {
1013 if (kvm_put_vpa(cs) < 0) {
1014 DPRINTF("Warning: Unable to set VPA information to KVM\n");
1015 }
1016 }
1017
1018 kvm_set_one_reg(cs, KVM_REG_PPC_TB_OFFSET, &env->tb_env->tb_offset);
1019 #endif /* TARGET_PPC64 */
1020 }
1021
1022 return ret;
1023 }
1024
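/* Recompute QEMU's cached exception vector: on BookE the effective vector
 * is the IVOR offset for the exception added to the IVPR base. */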
1025 static void kvm_sync_excp(CPUPPCState *env, int vector, int ivor)
1026 {
1027 env->excp_vectors[vector] = env->spr[ivor] + env->spr[SPR_BOOKE_IVPR];
1028 }
1029
1030 static int kvmppc_get_booke_sregs(PowerPCCPU *cpu)
1031 {
1032 CPUPPCState *env = &cpu->env;
1033 struct kvm_sregs sregs;
1034 int ret;
1035
1036 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
1037 if (ret < 0) {
1038 return ret;
1039 }
1040
1041 if (sregs.u.e.features & KVM_SREGS_E_BASE) {
1042 env->spr[SPR_BOOKE_CSRR0] = sregs.u.e.csrr0;
1043 env->spr[SPR_BOOKE_CSRR1] = sregs.u.e.csrr1;
1044 env->spr[SPR_BOOKE_ESR] = sregs.u.e.esr;
1045 env->spr[SPR_BOOKE_DEAR] = sregs.u.e.dear;
1046 env->spr[SPR_BOOKE_MCSR] = sregs.u.e.mcsr;
1047 env->spr[SPR_BOOKE_TSR] = sregs.u.e.tsr;
1048 env->spr[SPR_BOOKE_TCR] = sregs.u.e.tcr;
1049 env->spr[SPR_DECR] = sregs.u.e.dec;
1050 env->spr[SPR_TBL] = sregs.u.e.tb & 0xffffffff;
1051 env->spr[SPR_TBU] = sregs.u.e.tb >> 32;
1052 env->spr[SPR_VRSAVE] = sregs.u.e.vrsave;
1053 }
1054
1055 if (sregs.u.e.features & KVM_SREGS_E_ARCH206) {
1056 env->spr[SPR_BOOKE_PIR] = sregs.u.e.pir;
1057 env->spr[SPR_BOOKE_MCSRR0] = sregs.u.e.mcsrr0;
1058 env->spr[SPR_BOOKE_MCSRR1] = sregs.u.e.mcsrr1;
1059 env->spr[SPR_BOOKE_DECAR] = sregs.u.e.decar;
1060 env->spr[SPR_BOOKE_IVPR] = sregs.u.e.ivpr;
1061 }
1062
1063 if (sregs.u.e.features & KVM_SREGS_E_64) {
1064 env->spr[SPR_BOOKE_EPCR] = sregs.u.e.epcr;
1065 }
1066
1067 if (sregs.u.e.features & KVM_SREGS_E_SPRG8) {
1068 env->spr[SPR_BOOKE_SPRG8] = sregs.u.e.sprg8;
1069 }
1070
1071 if (sregs.u.e.features & KVM_SREGS_E_IVOR) {
1072 env->spr[SPR_BOOKE_IVOR0] = sregs.u.e.ivor_low[0];
1073 kvm_sync_excp(env, POWERPC_EXCP_CRITICAL, SPR_BOOKE_IVOR0);
1074 env->spr[SPR_BOOKE_IVOR1] = sregs.u.e.ivor_low[1];
1075 kvm_sync_excp(env, POWERPC_EXCP_MCHECK, SPR_BOOKE_IVOR1);
1076 env->spr[SPR_BOOKE_IVOR2] = sregs.u.e.ivor_low[2];
1077 kvm_sync_excp(env, POWERPC_EXCP_DSI, SPR_BOOKE_IVOR2);
1078 env->spr[SPR_BOOKE_IVOR3] = sregs.u.e.ivor_low[3];
1079 kvm_sync_excp(env, POWERPC_EXCP_ISI, SPR_BOOKE_IVOR3);
1080 env->spr[SPR_BOOKE_IVOR4] = sregs.u.e.ivor_low[4];
1081 kvm_sync_excp(env, POWERPC_EXCP_EXTERNAL, SPR_BOOKE_IVOR4);
1082 env->spr[SPR_BOOKE_IVOR5] = sregs.u.e.ivor_low[5];
1083 kvm_sync_excp(env, POWERPC_EXCP_ALIGN, SPR_BOOKE_IVOR5);
1084 env->spr[SPR_BOOKE_IVOR6] = sregs.u.e.ivor_low[6];
1085 kvm_sync_excp(env, POWERPC_EXCP_PROGRAM, SPR_BOOKE_IVOR6);
1086 env->spr[SPR_BOOKE_IVOR7] = sregs.u.e.ivor_low[7];
1087 kvm_sync_excp(env, POWERPC_EXCP_FPU, SPR_BOOKE_IVOR7);
1088 env->spr[SPR_BOOKE_IVOR8] = sregs.u.e.ivor_low[8];
1089 kvm_sync_excp(env, POWERPC_EXCP_SYSCALL, SPR_BOOKE_IVOR8);
1090 env->spr[SPR_BOOKE_IVOR9] = sregs.u.e.ivor_low[9];
1091 kvm_sync_excp(env, POWERPC_EXCP_APU, SPR_BOOKE_IVOR9);
1092 env->spr[SPR_BOOKE_IVOR10] = sregs.u.e.ivor_low[10];
1093 kvm_sync_excp(env, POWERPC_EXCP_DECR, SPR_BOOKE_IVOR10);
1094 env->spr[SPR_BOOKE_IVOR11] = sregs.u.e.ivor_low[11];
1095 kvm_sync_excp(env, POWERPC_EXCP_FIT, SPR_BOOKE_IVOR11);
1096 env->spr[SPR_BOOKE_IVOR12] = sregs.u.e.ivor_low[12];
1097 kvm_sync_excp(env, POWERPC_EXCP_WDT, SPR_BOOKE_IVOR12);
1098 env->spr[SPR_BOOKE_IVOR13] = sregs.u.e.ivor_low[13];
1099 kvm_sync_excp(env, POWERPC_EXCP_DTLB, SPR_BOOKE_IVOR13);
1100 env->spr[SPR_BOOKE_IVOR14] = sregs.u.e.ivor_low[14];
1101 kvm_sync_excp(env, POWERPC_EXCP_ITLB, SPR_BOOKE_IVOR14);
1102 env->spr[SPR_BOOKE_IVOR15] = sregs.u.e.ivor_low[15];
1103 kvm_sync_excp(env, POWERPC_EXCP_DEBUG, SPR_BOOKE_IVOR15);
1104
1105 if (sregs.u.e.features & KVM_SREGS_E_SPE) {
1106 env->spr[SPR_BOOKE_IVOR32] = sregs.u.e.ivor_high[0];
1107 kvm_sync_excp(env, POWERPC_EXCP_SPEU, SPR_BOOKE_IVOR32);
1108 env->spr[SPR_BOOKE_IVOR33] = sregs.u.e.ivor_high[1];
1109 kvm_sync_excp(env, POWERPC_EXCP_EFPDI, SPR_BOOKE_IVOR33);
1110 env->spr[SPR_BOOKE_IVOR34] = sregs.u.e.ivor_high[2];
1111 kvm_sync_excp(env, POWERPC_EXCP_EFPRI, SPR_BOOKE_IVOR34);
1112 }
1113
1114 if (sregs.u.e.features & KVM_SREGS_E_PM) {
1115 env->spr[SPR_BOOKE_IVOR35] = sregs.u.e.ivor_high[3];
1116 kvm_sync_excp(env, POWERPC_EXCP_EPERFM, SPR_BOOKE_IVOR35);
1117 }
1118
1119 if (sregs.u.e.features & KVM_SREGS_E_PC) {
1120 env->spr[SPR_BOOKE_IVOR36] = sregs.u.e.ivor_high[4];
1121 kvm_sync_excp(env, POWERPC_EXCP_DOORI, SPR_BOOKE_IVOR36);
1122 env->spr[SPR_BOOKE_IVOR37] = sregs.u.e.ivor_high[5];
1123 kvm_sync_excp(env, POWERPC_EXCP_DOORCI, SPR_BOOKE_IVOR37);
1124 }
1125 }
1126
1127 if (sregs.u.e.features & KVM_SREGS_E_ARCH206_MMU) {
1128 env->spr[SPR_BOOKE_MAS0] = sregs.u.e.mas0;
1129 env->spr[SPR_BOOKE_MAS1] = sregs.u.e.mas1;
1130 env->spr[SPR_BOOKE_MAS2] = sregs.u.e.mas2;
1131 env->spr[SPR_BOOKE_MAS3] = sregs.u.e.mas7_3 & 0xffffffff;
1132 env->spr[SPR_BOOKE_MAS4] = sregs.u.e.mas4;
1133 env->spr[SPR_BOOKE_MAS6] = sregs.u.e.mas6;
1134 env->spr[SPR_BOOKE_MAS7] = sregs.u.e.mas7_3 >> 32;
1135 env->spr[SPR_MMUCFG] = sregs.u.e.mmucfg;
1136 env->spr[SPR_BOOKE_TLB0CFG] = sregs.u.e.tlbcfg[0];
1137 env->spr[SPR_BOOKE_TLB1CFG] = sregs.u.e.tlbcfg[1];
1138 }
1139
1140 if (sregs.u.e.features & KVM_SREGS_EXP) {
1141 env->spr[SPR_BOOKE_EPR] = sregs.u.e.epr;
1142 }
1143
1144 if (sregs.u.e.features & KVM_SREGS_E_PD) {
1145 env->spr[SPR_BOOKE_EPLC] = sregs.u.e.eplc;
1146 env->spr[SPR_BOOKE_EPSC] = sregs.u.e.epsc;
1147 }
1148
1149 if (sregs.u.e.impl_id == KVM_SREGS_E_IMPL_FSL) {
1150 env->spr[SPR_E500_SVR] = sregs.u.e.impl.fsl.svr;
1151 env->spr[SPR_Exxx_MCAR] = sregs.u.e.impl.fsl.mcar;
1152 env->spr[SPR_HID0] = sregs.u.e.impl.fsl.hid0;
1153
1154 if (sregs.u.e.impl.fsl.features & KVM_SREGS_E_FSL_PIDn) {
1155 env->spr[SPR_BOOKE_PID1] = sregs.u.e.impl.fsl.pid1;
1156 env->spr[SPR_BOOKE_PID2] = sregs.u.e.impl.fsl.pid2;
1157 }
1158 }
1159
1160 return 0;
1161 }
1162
1163 static int kvmppc_get_books_sregs(PowerPCCPU *cpu)
1164 {
1165 CPUPPCState *env = &cpu->env;
1166 struct kvm_sregs sregs;
1167 int ret;
1168 int i;
1169
1170 ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_SREGS, &sregs);
1171 if (ret < 0) {
1172 return ret;
1173 }
1174
1175 if (!env->external_htab) {
1176 ppc_store_sdr1(env, sregs.u.s.sdr1);
1177 }
1178
1179 /* Sync SLB */
1180 #ifdef TARGET_PPC64
1181 /*
1182 * The packed SLB array we get from KVM_GET_SREGS only contains
1183 * information about valid entries. So we flush our internal copy
1184 * to get rid of stale ones, then put all valid SLB entries back
1185 * in.
1186 */
1187 memset(env->slb, 0, sizeof(env->slb));
1188 for (i = 0; i < ARRAY_SIZE(env->slb); i++) {
1189 target_ulong rb = sregs.u.s.ppc64.slb[i].slbe;
1190 target_ulong rs = sregs.u.s.ppc64.slb[i].slbv;
1191 /*
1192 * Only restore valid entries
1193 */
1194 if (rb & SLB_ESID_V) {
1195 ppc_store_slb(cpu, rb & 0xfff, rb & ~0xfffULL, rs);
1196 }
1197 }
1198 #endif
1199
1200 /* Sync SRs */
1201 for (i = 0; i < 16; i++) {
1202 env->sr[i] = sregs.u.s.ppc32.sr[i];
1203 }
1204
1205 /* Sync BATs */
1206 for (i = 0; i < 8; i++) {
1207 env->DBAT[0][i] = sregs.u.s.ppc32.dbat[i] & 0xffffffff;
1208 env->DBAT[1][i] = sregs.u.s.ppc32.dbat[i] >> 32;
1209 env->IBAT[0][i] = sregs.u.s.ppc32.ibat[i] & 0xffffffff;
1210 env->IBAT[1][i] = sregs.u.s.ppc32.ibat[i] >> 32;
1211 }
1212
1213 return 0;
1214 }
1215
1216 int kvm_arch_get_registers(CPUState *cs)
1217 {
1218 PowerPCCPU *cpu = POWERPC_CPU(cs);
1219 CPUPPCState *env = &cpu->env;
1220 struct kvm_regs regs;
1221 uint32_t cr;
1222 int i, ret;
1223
1224 ret = kvm_vcpu_ioctl(cs, KVM_GET_REGS, &regs);
1225 if (ret < 0)
1226 return ret;
1227
1228 cr = regs.cr;
1229 for (i = 7; i >= 0; i--) {
1230 env->crf[i] = cr & 15;
1231 cr >>= 4;
1232 }
1233
1234 env->ctr = regs.ctr;
1235 env->lr = regs.lr;
1236 cpu_write_xer(env, regs.xer);
1237 env->msr = regs.msr;
1238 env->nip = regs.pc;
1239
1240 env->spr[SPR_SRR0] = regs.srr0;
1241 env->spr[SPR_SRR1] = regs.srr1;
1242
1243 env->spr[SPR_SPRG0] = regs.sprg0;
1244 env->spr[SPR_SPRG1] = regs.sprg1;
1245 env->spr[SPR_SPRG2] = regs.sprg2;
1246 env->spr[SPR_SPRG3] = regs.sprg3;
1247 env->spr[SPR_SPRG4] = regs.sprg4;
1248 env->spr[SPR_SPRG5] = regs.sprg5;
1249 env->spr[SPR_SPRG6] = regs.sprg6;
1250 env->spr[SPR_SPRG7] = regs.sprg7;
1251
1252 env->spr[SPR_BOOKE_PID] = regs.pid;
1253
1254     for (i = 0; i < 32; i++)
1255 env->gpr[i] = regs.gpr[i];
1256
1257 kvm_get_fp(cs);
1258
1259 if (cap_booke_sregs) {
1260 ret = kvmppc_get_booke_sregs(cpu);
1261 if (ret < 0) {
1262 return ret;
1263 }
1264 }
1265
1266 if (cap_segstate) {
1267 ret = kvmppc_get_books_sregs(cpu);
1268 if (ret < 0) {
1269 return ret;
1270 }
1271 }
1272
1273 if (cap_hior) {
1274 kvm_get_one_spr(cs, KVM_REG_PPC_HIOR, SPR_HIOR);
1275 }
1276
1277 if (cap_one_reg) {
1278 int i;
1279
1280         /* We deliberately ignore errors here: for kernels which have
1281          * the ONE_REG calls but don't support the specific
1282          * registers, there's a reasonable chance things will still
1283 * work, at least until we try to migrate. */
1284 for (i = 0; i < 1024; i++) {
1285 uint64_t id = env->spr_cb[i].one_reg_id;
1286
1287 if (id != 0) {
1288 kvm_get_one_spr(cs, id, i);
1289 }
1290 }
1291
1292 #ifdef TARGET_PPC64
1293 if (msr_ts) {
1294 for (i = 0; i < ARRAY_SIZE(env->tm_gpr); i++) {
1295 kvm_get_one_reg(cs, KVM_REG_PPC_TM_GPR(i), &env->tm_gpr[i]);
1296 }
1297 for (i = 0; i < ARRAY_SIZE(env->tm_vsr); i++) {
1298 kvm_get_one_reg(cs, KVM_REG_PPC_TM_VSR(i), &env->tm_vsr[i]);
1299 }
1300 kvm_get_one_reg(cs, KVM_REG_PPC_TM_CR, &env->tm_cr);
1301 kvm_get_one_reg(cs, KVM_REG_PPC_TM_LR, &env->tm_lr);
1302 kvm_get_one_reg(cs, KVM_REG_PPC_TM_CTR, &env->tm_ctr);
1303 kvm_get_one_reg(cs, KVM_REG_PPC_TM_FPSCR, &env->tm_fpscr);
1304 kvm_get_one_reg(cs, KVM_REG_PPC_TM_AMR, &env->tm_amr);
1305 kvm_get_one_reg(cs, KVM_REG_PPC_TM_PPR, &env->tm_ppr);
1306 kvm_get_one_reg(cs, KVM_REG_PPC_TM_VRSAVE, &env->tm_vrsave);
1307 kvm_get_one_reg(cs, KVM_REG_PPC_TM_VSCR, &env->tm_vscr);
1308 kvm_get_one_reg(cs, KVM_REG_PPC_TM_DSCR, &env->tm_dscr);
1309 kvm_get_one_reg(cs, KVM_REG_PPC_TM_TAR, &env->tm_tar);
1310 }
1311
1312 if (cap_papr) {
1313 if (kvm_get_vpa(cs) < 0) {
1314 DPRINTF("Warning: Unable to get VPA information from KVM\n");
1315 }
1316 }
1317
1318 kvm_get_one_reg(cs, KVM_REG_PPC_TB_OFFSET, &env->tb_env->tb_offset);
1319 #endif
1320 }
1321
1322 return 0;
1323 }
1324
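/* Forward a change on the external interrupt line to the kernel: interrupts
 * other than PPC_INTERRUPT_EXT are ignored, and nothing is done unless KVM
 * advertises both the level-irq and unset-irq capabilities. When it does,
 * the new state is pushed with a KVM_INTERRUPT ioctl. */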
1325 int kvmppc_set_interrupt(PowerPCCPU *cpu, int irq, int level)
1326 {
1327 unsigned virq = level ? KVM_INTERRUPT_SET_LEVEL : KVM_INTERRUPT_UNSET;
1328
1329 if (irq != PPC_INTERRUPT_EXT) {
1330 return 0;
1331 }
1332
1333 if (!kvm_enabled() || !cap_interrupt_unset || !cap_interrupt_level) {
1334 return 0;
1335 }
1336
1337 kvm_vcpu_ioctl(CPU(cpu), KVM_INTERRUPT, &virq);
1338
1339 return 0;
1340 }
1341
1342 #if defined(TARGET_PPCEMB)
1343 #define PPC_INPUT_INT PPC40x_INPUT_INT
1344 #elif defined(TARGET_PPC64)
1345 #define PPC_INPUT_INT PPC970_INPUT_INT
1346 #else
1347 #define PPC_INPUT_INT PPC6xx_INPUT_INT
1348 #endif
1349
1350 void kvm_arch_pre_run(CPUState *cs, struct kvm_run *run)
1351 {
1352 PowerPCCPU *cpu = POWERPC_CPU(cs);
1353 CPUPPCState *env = &cpu->env;
1354 int r;
1355 unsigned irq;
1356
1357 qemu_mutex_lock_iothread();
1358
1359 /* PowerPC QEMU tracks the various core input pins (interrupt, critical
1360 * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
1361 if (!cap_interrupt_level &&
1362 run->ready_for_interrupt_injection &&
1363 (cs->interrupt_request & CPU_INTERRUPT_HARD) &&
1364 (env->irq_input_state & (1<<PPC_INPUT_INT)))
1365 {
1366 /* For now KVM disregards the 'irq' argument. However, in the
1367 * future KVM could cache it in-kernel to avoid a heavyweight exit
1368 * when reading the UIC.
1369 */
1370 irq = KVM_INTERRUPT_SET;
1371
1372 DPRINTF("injected interrupt %d\n", irq);
1373 r = kvm_vcpu_ioctl(cs, KVM_INTERRUPT, &irq);
1374 if (r < 0) {
1375 printf("cpu %d fail inject %x\n", cs->cpu_index, irq);
1376 }
1377
1378 /* Always wake up soon in case the interrupt was level based */
1379 timer_mod(idle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
1380 (NANOSECONDS_PER_SECOND / 50));
1381 }
1382
1383 /* We don't know if there are more interrupts pending after this. However,
1384 * the guest will return to userspace in the course of handling this one
1385      * anyway, so we will get a chance to deliver the rest. */
1386
1387 qemu_mutex_unlock_iothread();
1388 }
1389
1390 MemTxAttrs kvm_arch_post_run(CPUState *cs, struct kvm_run *run)
1391 {
1392 return MEMTXATTRS_UNSPECIFIED;
1393 }
1394
1395 int kvm_arch_process_async_events(CPUState *cs)
1396 {
1397 return cs->halted;
1398 }
1399
1400 static int kvmppc_handle_halt(PowerPCCPU *cpu)
1401 {
1402 CPUState *cs = CPU(cpu);
1403 CPUPPCState *env = &cpu->env;
1404
1405 if (!(cs->interrupt_request & CPU_INTERRUPT_HARD) && (msr_ee)) {
1406 cs->halted = 1;
1407 cs->exception_index = EXCP_HLT;
1408 }
1409
1410 return 0;
1411 }
1412
1413 /* map dcr access to existing qemu dcr emulation */
1414 static int kvmppc_handle_dcr_read(CPUPPCState *env, uint32_t dcrn, uint32_t *data)
1415 {
1416 if (ppc_dcr_read(env->dcr_env, dcrn, data) < 0)
1417 fprintf(stderr, "Read to unhandled DCR (0x%x)\n", dcrn);
1418
1419 return 0;
1420 }
1421
1422 static int kvmppc_handle_dcr_write(CPUPPCState *env, uint32_t dcrn, uint32_t data)
1423 {
1424 if (ppc_dcr_write(env->dcr_env, dcrn, data) < 0)
1425 fprintf(stderr, "Write to unhandled DCR (0x%x)\n", dcrn);
1426
1427 return 0;
1428 }
1429
1430 int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
1431 {
1432 /* Mixed endian case is not handled */
1433 uint32_t sc = debug_inst_opcode;
1434
1435 if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn,
1436 sizeof(sc), 0) ||
1437 cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&sc, sizeof(sc), 1)) {
1438 return -EINVAL;
1439 }
1440
1441 return 0;
1442 }
1443
1444 int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
1445 {
1446 uint32_t sc;
1447
1448 if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&sc, sizeof(sc), 0) ||
1449 sc != debug_inst_opcode ||
1450 cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn,
1451 sizeof(sc), 1)) {
1452 return -EINVAL;
1453 }
1454
1455 return 0;
1456 }
1457
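/* Look up a hardware debug point by address and GDB type; returns its index
 * in hw_debug_points[], or -1 if there is no match. */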
1458 static int find_hw_breakpoint(target_ulong addr, int type)
1459 {
1460 int n;
1461
1462 assert((nb_hw_breakpoint + nb_hw_watchpoint)
1463 <= ARRAY_SIZE(hw_debug_points));
1464
1465 for (n = 0; n < nb_hw_breakpoint + nb_hw_watchpoint; n++) {
1466 if (hw_debug_points[n].addr == addr &&
1467 hw_debug_points[n].type == type) {
1468 return n;
1469 }
1470 }
1471
1472 return -1;
1473 }
1474
1475 static int find_hw_watchpoint(target_ulong addr, int *flag)
1476 {
1477 int n;
1478
1479 n = find_hw_breakpoint(addr, GDB_WATCHPOINT_ACCESS);
1480 if (n >= 0) {
1481 *flag = BP_MEM_ACCESS;
1482 return n;
1483 }
1484
1485 n = find_hw_breakpoint(addr, GDB_WATCHPOINT_WRITE);
1486 if (n >= 0) {
1487 *flag = BP_MEM_WRITE;
1488 return n;
1489 }
1490
1491 n = find_hw_breakpoint(addr, GDB_WATCHPOINT_READ);
1492 if (n >= 0) {
1493 *flag = BP_MEM_READ;
1494 return n;
1495 }
1496
1497 return -1;
1498 }
1499
1500 int kvm_arch_insert_hw_breakpoint(target_ulong addr,
1501 target_ulong len, int type)
1502 {
1503 if ((nb_hw_breakpoint + nb_hw_watchpoint) >= ARRAY_SIZE(hw_debug_points)) {
1504 return -ENOBUFS;
1505 }
1506
1507 hw_debug_points[nb_hw_breakpoint + nb_hw_watchpoint].addr = addr;
1508 hw_debug_points[nb_hw_breakpoint + nb_hw_watchpoint].type = type;
1509
1510 switch (type) {
1511 case GDB_BREAKPOINT_HW:
1512 if (nb_hw_breakpoint >= max_hw_breakpoint) {
1513 return -ENOBUFS;
1514 }
1515
1516 if (find_hw_breakpoint(addr, type) >= 0) {
1517 return -EEXIST;
1518 }
1519
1520 nb_hw_breakpoint++;
1521 break;
1522
1523 case GDB_WATCHPOINT_WRITE:
1524 case GDB_WATCHPOINT_READ:
1525 case GDB_WATCHPOINT_ACCESS:
1526 if (nb_hw_watchpoint >= max_hw_watchpoint) {
1527 return -ENOBUFS;
1528 }
1529
1530 if (find_hw_breakpoint(addr, type) >= 0) {
1531 return -EEXIST;
1532 }
1533
1534 nb_hw_watchpoint++;
1535 break;
1536
1537 default:
1538 return -ENOSYS;
1539 }
1540
1541 return 0;
1542 }
1543
1544 int kvm_arch_remove_hw_breakpoint(target_ulong addr,
1545 target_ulong len, int type)
1546 {
1547 int n;
1548
1549 n = find_hw_breakpoint(addr, type);
1550 if (n < 0) {
1551 return -ENOENT;
1552 }
1553
1554 switch (type) {
1555 case GDB_BREAKPOINT_HW:
1556 nb_hw_breakpoint--;
1557 break;
1558
1559 case GDB_WATCHPOINT_WRITE:
1560 case GDB_WATCHPOINT_READ:
1561 case GDB_WATCHPOINT_ACCESS:
1562 nb_hw_watchpoint--;
1563 break;
1564
1565 default:
1566 return -ENOSYS;
1567 }
1568 hw_debug_points[n] = hw_debug_points[nb_hw_breakpoint + nb_hw_watchpoint];
1569
1570 return 0;
1571 }
1572
1573 void kvm_arch_remove_all_hw_breakpoints(void)
1574 {
1575 nb_hw_breakpoint = nb_hw_watchpoint = 0;
1576 }
1577
1578 void kvm_arch_update_guest_debug(CPUState *cs, struct kvm_guest_debug *dbg)
1579 {
1580 int n;
1581
1582 /* Software Breakpoint updates */
1583 if (kvm_sw_breakpoints_active(cs)) {
1584 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
1585 }
1586
1587 assert((nb_hw_breakpoint + nb_hw_watchpoint)
1588 <= ARRAY_SIZE(hw_debug_points));
1589 assert((nb_hw_breakpoint + nb_hw_watchpoint) <= ARRAY_SIZE(dbg->arch.bp));
1590
1591 if (nb_hw_breakpoint + nb_hw_watchpoint > 0) {
1592 dbg->control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
1593 memset(dbg->arch.bp, 0, sizeof(dbg->arch.bp));
1594 for (n = 0; n < nb_hw_breakpoint + nb_hw_watchpoint; n++) {
1595 switch (hw_debug_points[n].type) {
1596 case GDB_BREAKPOINT_HW:
1597 dbg->arch.bp[n].type = KVMPPC_DEBUG_BREAKPOINT;
1598 break;
1599 case GDB_WATCHPOINT_WRITE:
1600 dbg->arch.bp[n].type = KVMPPC_DEBUG_WATCH_WRITE;
1601 break;
1602 case GDB_WATCHPOINT_READ:
1603 dbg->arch.bp[n].type = KVMPPC_DEBUG_WATCH_READ;
1604 break;
1605 case GDB_WATCHPOINT_ACCESS:
1606 dbg->arch.bp[n].type = KVMPPC_DEBUG_WATCH_WRITE |
1607 KVMPPC_DEBUG_WATCH_READ;
1608 break;
1609 default:
1610 cpu_abort(cs, "Unsupported breakpoint type\n");
1611 }
1612 dbg->arch.bp[n].addr = hw_debug_points[n].addr;
1613 }
1614 }
1615 }
1616
1617 static int kvm_handle_debug(PowerPCCPU *cpu, struct kvm_run *run)
1618 {
1619 CPUState *cs = CPU(cpu);
1620 CPUPPCState *env = &cpu->env;
1621 struct kvm_debug_exit_arch *arch_info = &run->debug.arch;
1622 int handle = 0;
1623 int n;
1624 int flag = 0;
1625
1626 if (cs->singlestep_enabled) {
1627 handle = 1;
1628 } else if (arch_info->status) {
1629 if (nb_hw_breakpoint + nb_hw_watchpoint > 0) {
1630 if (arch_info->status & KVMPPC_DEBUG_BREAKPOINT) {
1631 n = find_hw_breakpoint(arch_info->address, GDB_BREAKPOINT_HW);
1632 if (n >= 0) {
1633 handle = 1;
1634 }
1635 } else if (arch_info->status & (KVMPPC_DEBUG_WATCH_READ |
1636 KVMPPC_DEBUG_WATCH_WRITE)) {
1637 n = find_hw_watchpoint(arch_info->address, &flag);
1638 if (n >= 0) {
1639 handle = 1;
1640 cs->watchpoint_hit = &hw_watchpoint;
1641 hw_watchpoint.vaddr = hw_debug_points[n].addr;
1642 hw_watchpoint.flags = flag;
1643 }
1644 }
1645 }
1646 } else if (kvm_find_sw_breakpoint(cs, arch_info->address)) {
1647 handle = 1;
1648 } else {
1649         /* QEMU is not able to handle this debug exception, so inject a
1650          * program exception into the guest;
1651          * yes, a program exception, NOT a debug exception!
1652          * While QEMU is using the debug resources, the debug exception must
1653          * always be set. To achieve this we set MSR_DE and also set
1654          * MSRP_DEP so the guest cannot change MSR_DE.
1655          * When emulating debug resources for the guest we want the guest
1656          * to control MSR_DE (enable/disable the debug interrupt on demand).
1657          * Supporting both configurations at once is NOT possible,
1658          * so the result is that we cannot share debug resources
1659          * between QEMU and the guest on the BookE architecture.
1660          * In the current design QEMU gets priority over the guest:
1661          * if QEMU is using the debug resources then the guest
1662          * cannot use them;
1663          * for software breakpoints QEMU uses a privileged instruction,
1664          * so there is no way we got here because the guest set up a debug
1665          * exception; the only possibility is that the guest executed a
1666          * privileged / illegal instruction, and that's why we are
1667          * injecting a program interrupt.
1668 */
1669
1670 cpu_synchronize_state(cs);
1671 /* env->nip is PC, so increment this by 4 to use
1672          * ppc_cpu_do_interrupt(), which sets srr0 = env->nip - 4.
1673 */
1674 env->nip += 4;
1675 cs->exception_index = POWERPC_EXCP_PROGRAM;
1676 env->error_code = POWERPC_EXCP_INVAL;
1677 ppc_cpu_do_interrupt(cs);
1678 }
1679
1680 return handle;
1681 }
1682
1683 int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run)
1684 {
1685 PowerPCCPU *cpu = POWERPC_CPU(cs);
1686 CPUPPCState *env = &cpu->env;
1687 int ret;
1688
1689 qemu_mutex_lock_iothread();
1690
1691 switch (run->exit_reason) {
1692 case KVM_EXIT_DCR:
1693 if (run->dcr.is_write) {
1694 DPRINTF("handle dcr write\n");
1695 ret = kvmppc_handle_dcr_write(env, run->dcr.dcrn, run->dcr.data);
1696 } else {
1697 DPRINTF("handle dcr read\n");
1698 ret = kvmppc_handle_dcr_read(env, run->dcr.dcrn, &run->dcr.data);
1699 }
1700 break;
1701 case KVM_EXIT_HLT:
1702 DPRINTF("handle halt\n");
1703 ret = kvmppc_handle_halt(cpu);
1704 break;
1705 #if defined(TARGET_PPC64)
1706 case KVM_EXIT_PAPR_HCALL:
1707 DPRINTF("handle PAPR hypercall\n");
1708 run->papr_hcall.ret = spapr_hypercall(cpu,
1709 run->papr_hcall.nr,
1710 run->papr_hcall.args);
1711 ret = 0;
1712 break;
1713 #endif
1714 case KVM_EXIT_EPR:
1715 DPRINTF("handle epr\n");
1716 run->epr.epr = ldl_phys(cs->as, env->mpic_iack);
1717 ret = 0;
1718 break;
1719 case KVM_EXIT_WATCHDOG:
1720 DPRINTF("handle watchdog expiry\n");
1721 watchdog_perform_action();
1722 ret = 0;
1723 break;
1724
1725 case KVM_EXIT_DEBUG:
1726 DPRINTF("handle debug exception\n");
1727 if (kvm_handle_debug(cpu, run)) {
1728 ret = EXCP_DEBUG;
1729 break;
1730 }
1731 /* re-enter, this exception was guest-internal */
1732 ret = 0;
1733 break;
1734
1735 default:
1736 fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
1737 ret = -1;
1738 break;
1739 }
1740
1741 qemu_mutex_unlock_iothread();
1742 return ret;
1743 }
1744
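/* BookE timer/watchdog helpers: TSR bits are set and cleared through the
 * dedicated OR/CLEAR one-reg interfaces rather than a read-modify-write,
 * and the guest's TCR is pushed separately. The three functions below wrap
 * KVM_REG_PPC_OR_TSR, KVM_REG_PPC_CLEAR_TSR and KVM_REG_PPC_TCR. */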
1745 int kvmppc_or_tsr_bits(PowerPCCPU *cpu, uint32_t tsr_bits)
1746 {
1747 CPUState *cs = CPU(cpu);
1748 uint32_t bits = tsr_bits;
1749 struct kvm_one_reg reg = {
1750 .id = KVM_REG_PPC_OR_TSR,
1751 .addr = (uintptr_t) &bits,
1752 };
1753
1754 return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1755 }
1756
1757 int kvmppc_clear_tsr_bits(PowerPCCPU *cpu, uint32_t tsr_bits)
1758 {
1759
1760 CPUState *cs = CPU(cpu);
1761 uint32_t bits = tsr_bits;
1762 struct kvm_one_reg reg = {
1763 .id = KVM_REG_PPC_CLEAR_TSR,
1764 .addr = (uintptr_t) &bits,
1765 };
1766
1767 return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1768 }
1769
1770 int kvmppc_set_tcr(PowerPCCPU *cpu)
1771 {
1772 CPUState *cs = CPU(cpu);
1773 CPUPPCState *env = &cpu->env;
1774 uint32_t tcr = env->spr[SPR_BOOKE_TCR];
1775
1776 struct kvm_one_reg reg = {
1777 .id = KVM_REG_PPC_TCR,
1778 .addr = (uintptr_t) &tcr,
1779 };
1780
1781 return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1782 }
1783
1784 int kvmppc_booke_watchdog_enable(PowerPCCPU *cpu)
1785 {
1786 CPUState *cs = CPU(cpu);
1787 int ret;
1788
1789 if (!kvm_enabled()) {
1790 return -1;
1791 }
1792
1793 if (!cap_ppc_watchdog) {
1794 printf("warning: KVM does not support watchdog");
1795 return -1;
1796 }
1797
1798 ret = kvm_vcpu_enable_cap(cs, KVM_CAP_PPC_BOOKE_WATCHDOG, 0);
1799 if (ret < 0) {
1800 fprintf(stderr, "%s: couldn't enable KVM_CAP_PPC_BOOKE_WATCHDOG: %s\n",
1801 __func__, strerror(-ret));
1802 return ret;
1803 }
1804
1805 return ret;
1806 }
1807
1808 static int read_cpuinfo(const char *field, char *value, int len)
1809 {
1810 FILE *f;
1811 int ret = -1;
1812 int field_len = strlen(field);
1813 char line[512];
1814
1815 f = fopen("/proc/cpuinfo", "r");
1816 if (!f) {
1817 return -1;
1818 }
1819
1820 do {
1821 if (!fgets(line, sizeof(line), f)) {
1822 break;
1823 }
1824 if (!strncmp(line, field, field_len)) {
1825 pstrcpy(value, len, line);
1826 ret = 0;
1827 break;
1828 }
1829     } while (*line);
1830
1831 fclose(f);
1832
1833 return ret;
1834 }
1835
1836 uint32_t kvmppc_get_tbfreq(void)
1837 {
1838 char line[512];
1839 char *ns;
1840 uint32_t retval = NANOSECONDS_PER_SECOND;
1841
1842 if (read_cpuinfo("timebase", line, sizeof(line))) {
1843 return retval;
1844 }
1845
1846 if (!(ns = strchr(line, ':'))) {
1847 return retval;
1848 }
1849
1850 ns++;
1851
1852 return atoi(ns);
1853 }
1854
1855 bool kvmppc_get_host_serial(char **value)
1856 {
1857 return g_file_get_contents("/proc/device-tree/system-id", value, NULL,
1858 NULL);
1859 }
1860
1861 bool kvmppc_get_host_model(char **value)
1862 {
1863 return g_file_get_contents("/proc/device-tree/model", value, NULL, NULL);
1864 }
1865
1866 /* Try to find a device tree node for a CPU with clock-frequency property */
1867 static int kvmppc_find_cpu_dt(char *buf, int buf_len)
1868 {
1869 struct dirent *dirp;
1870 DIR *dp;
1871
1872 if ((dp = opendir(PROC_DEVTREE_CPU)) == NULL) {
1873 printf("Can't open directory " PROC_DEVTREE_CPU "\n");
1874 return -1;
1875 }
1876
1877 buf[0] = '\0';
1878 while ((dirp = readdir(dp)) != NULL) {
1879 FILE *f;
1880 snprintf(buf, buf_len, "%s%s/clock-frequency", PROC_DEVTREE_CPU,
1881 dirp->d_name);
1882 f = fopen(buf, "r");
1883 if (f) {
1884 snprintf(buf, buf_len, "%s%s", PROC_DEVTREE_CPU, dirp->d_name);
1885 fclose(f);
1886 break;
1887 }
1888 buf[0] = '\0';
1889 }
1890 closedir(dp);
1891 if (buf[0] == '\0') {
1892 printf("Unknown host!\n");
1893 return -1;
1894 }
1895
1896 return 0;
1897 }
1898
1899 static uint64_t kvmppc_read_int_dt(const char *filename)
1900 {
1901 union {
1902 uint32_t v32;
1903 uint64_t v64;
1904 } u;
1905 FILE *f;
1906 int len;
1907
1908 f = fopen(filename, "rb");
1909 if (!f) {
1910 return -1;
1911 }
1912
1913 len = fread(&u, 1, sizeof(u), f);
1914 fclose(f);
1915 switch (len) {
1916 case 4:
1917 /* property is a 32-bit quantity */
1918 return be32_to_cpu(u.v32);
1919 case 8:
1920 return be64_to_cpu(u.v64);
1921 }
1922
1923 return 0;
1924 }
1925
1926 /* Read a CPU node property from the host device tree that's a single
1927  * integer (32-bit or 64-bit). Returns 0 or -1 if anything goes wrong
1928 * (can't find or open the property, or doesn't understand the
1929 * format) */
1930 static uint64_t kvmppc_read_int_cpu_dt(const char *propname)
1931 {
1932 char buf[PATH_MAX], *tmp;
1933 uint64_t val;
1934
1935 if (kvmppc_find_cpu_dt(buf, sizeof(buf))) {
1936 return -1;
1937 }
1938
1939 tmp = g_strdup_printf("%s/%s", buf, propname);
1940 val = kvmppc_read_int_dt(tmp);
1941 g_free(tmp);
1942
1943 return val;
1944 }
1945
1946 uint64_t kvmppc_get_clockfreq(void)
1947 {
1948 return kvmppc_read_int_cpu_dt("clock-frequency");
1949 }
1950
1951 uint32_t kvmppc_get_vmx(void)
1952 {
1953 return kvmppc_read_int_cpu_dt("ibm,vmx");
1954 }
1955
1956 uint32_t kvmppc_get_dfp(void)
1957 {
1958 return kvmppc_read_int_cpu_dt("ibm,dfp");
1959 }
1960
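/* Fetch the paravirtualization info block from the kernel. Note the
 * inverted return convention: 0 means *pvinfo was filled in, 1 means the
 * KVM_PPC_GET_PVINFO ioctl is unavailable or failed. */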
1961 static int kvmppc_get_pvinfo(CPUPPCState *env, struct kvm_ppc_pvinfo *pvinfo)
1962 {
1963 PowerPCCPU *cpu = ppc_env_get_cpu(env);
1964 CPUState *cs = CPU(cpu);
1965
1966 if (kvm_vm_check_extension(cs->kvm_state, KVM_CAP_PPC_GET_PVINFO) &&
1967 !kvm_vm_ioctl(cs->kvm_state, KVM_PPC_GET_PVINFO, pvinfo)) {
1968 return 0;
1969 }
1970
1971 return 1;
1972 }
1973
1974 int kvmppc_get_hasidle(CPUPPCState *env)
1975 {
1976 struct kvm_ppc_pvinfo pvinfo;
1977
1978 if (!kvmppc_get_pvinfo(env, &pvinfo) &&
1979 (pvinfo.flags & KVM_PPC_PVINFO_FLAGS_EV_IDLE)) {
1980 return 1;
1981 }
1982
1983 return 0;
1984 }
1985
1986 int kvmppc_get_hypercall(CPUPPCState *env, uint8_t *buf, int buf_len)
1987 {
1988 uint32_t *hc = (uint32_t*)buf;
1989 struct kvm_ppc_pvinfo pvinfo;
1990
1991 if (!kvmppc_get_pvinfo(env, &pvinfo)) {
1992 memcpy(buf, pvinfo.hcall, buf_len);
1993 return 0;
1994 }
1995
1996 /*
1997 * Fallback to always fail hypercalls regardless of endianness:
1998 *
1999 * tdi 0,r0,72 (becomes b .+8 in wrong endian, nop in good endian)
2000 * li r3, -1
2001 * b .+8 (becomes nop in wrong endian)
2002 * bswap32(li r3, -1)
2003 */
2004
2005 hc[0] = cpu_to_be32(0x08000048);
2006 hc[1] = cpu_to_be32(0x3860ffff);
2007 hc[2] = cpu_to_be32(0x48000008);
2008 hc[3] = cpu_to_be32(bswap32(0x3860ffff));
2009
2010 return 1;
2011 }
2012
2013 static inline int kvmppc_enable_hcall(KVMState *s, target_ulong hcall)
2014 {
2015 return kvm_vm_enable_cap(s, KVM_CAP_PPC_ENABLE_HCALL, 0, hcall, 1);
2016 }
2017
2018 void kvmppc_enable_logical_ci_hcalls(void)
2019 {
2020 /*
2021 * FIXME: it would be nice to detect the case where we're using a
2022 * device that requires the in-kernel implementation of these
2023 * hcalls but the kernel lacks it, and to produce a warning when
2024 * that happens.
2025 */
2026 kvmppc_enable_hcall(kvm_state, H_LOGICAL_CI_LOAD);
2027 kvmppc_enable_hcall(kvm_state, H_LOGICAL_CI_STORE);
2028 }
2029
2030 void kvmppc_enable_set_mode_hcall(void)
2031 {
2032 kvmppc_enable_hcall(kvm_state, H_SET_MODE);
2033 }
2034
2035 void kvmppc_set_papr(PowerPCCPU *cpu)
2036 {
2037 CPUState *cs = CPU(cpu);
2038 int ret;
2039
2040 ret = kvm_vcpu_enable_cap(cs, KVM_CAP_PPC_PAPR, 0);
2041 if (ret) {
2042 error_report("This vCPU type or KVM version does not support PAPR");
2043 exit(1);
2044 }
2045
2046 /* Update the capability flag so we sync the right information
2047 * with kvm */
2048 cap_papr = 1;
2049 }
2050
2051 int kvmppc_set_compat(PowerPCCPU *cpu, uint32_t cpu_version)
2052 {
2053 return kvm_set_one_reg(CPU(cpu), KVM_REG_PPC_ARCH_COMPAT, &cpu_version);
2054 }
2055
2056 void kvmppc_set_mpic_proxy(PowerPCCPU *cpu, int mpic_proxy)
2057 {
2058 CPUState *cs = CPU(cpu);
2059 int ret;
2060
2061 ret = kvm_vcpu_enable_cap(cs, KVM_CAP_PPC_EPR, 0, mpic_proxy);
2062 if (ret && mpic_proxy) {
2063 error_report("This KVM version does not support EPR");
2064 exit(1);
2065 }
2066 }
2067
2068 int kvmppc_smt_threads(void)
2069 {
2070 return cap_ppc_smt ? cap_ppc_smt : 1;
2071 }
2072
2073 #ifdef TARGET_PPC64
2074 off_t kvmppc_alloc_rma(void **rma)
2075 {
2076 off_t size;
2077 int fd;
2078 struct kvm_allocate_rma ret;
2079
2080 /* If cap_ppc_rma == 0, contiguous RMA allocation is not supported
2081 * if cap_ppc_rma == 1, contiguous RMA allocation is supported, but
2082 * not necessary on this hardware
2083 * if cap_ppc_rma == 2, contiguous RMA allocation is needed on this hardware
2084 *
2085 * FIXME: We should allow the user to force contiguous RMA
2086 * allocation in the cap_ppc_rma==1 case.
2087 */
2088 if (cap_ppc_rma < 2) {
2089 return 0;
2090 }
2091
2092 fd = kvm_vm_ioctl(kvm_state, KVM_ALLOCATE_RMA, &ret);
2093 if (fd < 0) {
2094 fprintf(stderr, "KVM: Error on KVM_ALLOCATE_RMA: %s\n",
2095 strerror(errno));
2096 return -1;
2097 }
2098
2099 size = MIN(ret.rma_size, 256ul << 20);
2100
2101 *rma = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
2102 if (*rma == MAP_FAILED) {
2103 fprintf(stderr, "KVM: Error mapping RMA: %s\n", strerror(errno));
2104 return -1;
2105 }
2106
2107 return size;
2108 }
2109
2110 uint64_t kvmppc_rma_size(uint64_t current_size, unsigned int hash_shift)
2111 {
2112 struct kvm_ppc_smmu_info info;
2113 long rampagesize, best_page_shift;
2114 int i;
2115
2116 if (cap_ppc_rma >= 2) {
2117 return current_size;
2118 }
2119
2120 /* Find the largest hardware supported page size that's less than
2121 * or equal to the (logical) backing page size of guest RAM */
2122 kvm_get_smmu_info(POWERPC_CPU(first_cpu), &info);
2123 rampagesize = getrampagesize();
2124 best_page_shift = 0;
2125
2126 for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
2127 struct kvm_ppc_one_seg_page_size *sps = &info.sps[i];
2128
2129 if (!sps->page_shift) {
2130 continue;
2131 }
2132
2133 if ((sps->page_shift > best_page_shift)
2134 && ((1UL << sps->page_shift) <= rampagesize)) {
2135 best_page_shift = sps->page_shift;
2136 }
2137 }
2138
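    /* The RMA is limited by the hash table size: presumably one guest
     * page can be mapped per 128-byte HPTE group, so the ceiling is
     * (1 << hash_shift) / 128 groups times the backing page size, i.e.
     * 1 << (best_page_shift + hash_shift - 7). */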
2139 return MIN(current_size,
2140 1ULL << (best_page_shift + hash_shift - 7));
2141 }
2142 #endif
2143
2144 bool kvmppc_spapr_use_multitce(void)
2145 {
2146 return cap_spapr_multitce;
2147 }
2148
2149 void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd,
2150 bool need_vfio)
2151 {
2152 struct kvm_create_spapr_tce args = {
2153 .liobn = liobn,
2154 .window_size = window_size,
2155 };
2156 long len;
2157 int fd;
2158 void *table;
2159
2160 /* Must set fd to -1 so we don't try to munmap when called for
2161 * destroying the table, which the upper layers -will- do
2162 */
2163 *pfd = -1;
2164 if (!cap_spapr_tce || (need_vfio && !cap_spapr_vfio)) {
2165 return NULL;
2166 }
2167
2168 fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE, &args);
2169 if (fd < 0) {
2170 fprintf(stderr, "KVM: Failed to create TCE table for liobn 0x%x\n",
2171 liobn);
2172 return NULL;
2173 }
2174
2175 len = (window_size / SPAPR_TCE_PAGE_SIZE) * sizeof(uint64_t);
2176 /* FIXME: round this up to page size */
2177
2178 table = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
2179 if (table == MAP_FAILED) {
2180 fprintf(stderr, "KVM: Failed to map TCE table for liobn 0x%x\n",
2181 liobn);
2182 close(fd);
2183 return NULL;
2184 }
2185
2186 *pfd = fd;
2187 return table;
2188 }
2189
2190 int kvmppc_remove_spapr_tce(void *table, int fd, uint32_t nb_table)
2191 {
2192 long len;
2193
2194 if (fd < 0) {
2195 return -1;
2196 }
2197
2198 len = nb_table * sizeof(uint64_t);
2199 if ((munmap(table, len) < 0) ||
2200 (close(fd) < 0)) {
2201 fprintf(stderr, "KVM: Unexpected error removing TCE table: %s\n",
2202 strerror(errno));
2203 /* Leak the table */
2204 }
2205
2206 return 0;
2207 }
2208
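/* (Re)allocate the guest hash page table. Returns the log2 size (shift)
 * of a kernel-allocated HPT, 0 if the caller (qemu) should allocate the
 * htab itself, or a negative errno on failure. */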
2209 int kvmppc_reset_htab(int shift_hint)
2210 {
2211 uint32_t shift = shift_hint;
2212
2213 if (!kvm_enabled()) {
2214 /* Full emulation, tell caller to allocate htab itself */
2215 return 0;
2216 }
2217 if (kvm_check_extension(kvm_state, KVM_CAP_PPC_ALLOC_HTAB)) {
2218 int ret;
2219 ret = kvm_vm_ioctl(kvm_state, KVM_PPC_ALLOCATE_HTAB, &shift);
2220 if (ret == -ENOTTY) {
2221 /* At least some versions of PR KVM advertise the
2222 * capability, but don't implement the ioctl(). Oops.
2223 * Return 0 so that we allocate the htab in qemu, as is
2224 * correct for PR. */
2225 return 0;
2226 } else if (ret < 0) {
2227 return ret;
2228 }
2229 return shift;
2230 }
2231
2232 /* We have a kernel that predates the htab reset calls. For PR
2233 * KVM we need to allocate the htab ourselves; an HV KVM of this
2234 * era has already allocated a fixed 16MB hash table. Kernels of
2235 * this era expose the GET_PVINFO capability only on PR, so we
2236 * use that as a hack to determine the right answer
2237 */
2238 if (kvm_check_extension(kvm_state, KVM_CAP_PPC_GET_PVINFO)) {
2239 /* PR - tell caller to allocate htab */
2240 return 0;
2241 } else {
2242 /* HV - assume 16MB kernel allocated htab */
2243 return 24;
2244 }
2245 }
2246
2247 static inline uint32_t mfpvr(void)
2248 {
2249 uint32_t pvr;
2250
2251 asm ("mfpvr %0"
2252 : "=r"(pvr));
2253 return pvr;
2254 }
2255
2256 static void alter_insns(uint64_t *word, uint64_t flags, bool on)
2257 {
2258 if (on) {
2259 *word |= flags;
2260 } else {
2261 *word &= ~flags;
2262 }
2263 }
2264
2265 static void kvmppc_host_cpu_initfn(Object *obj)
2266 {
2267 assert(kvm_enabled());
2268 }
2269
2270 static void kvmppc_host_cpu_class_init(ObjectClass *oc, void *data)
2271 {
2272 DeviceClass *dc = DEVICE_CLASS(oc);
2273 PowerPCCPUClass *pcc = POWERPC_CPU_CLASS(oc);
2274 uint32_t vmx = kvmppc_get_vmx();
2275 uint32_t dfp = kvmppc_get_dfp();
2276 uint32_t dcache_size = kvmppc_read_int_cpu_dt("d-cache-size");
2277 uint32_t icache_size = kvmppc_read_int_cpu_dt("i-cache-size");
2278
2279 /* Now fix up the class with information we can query from the host */
2280 pcc->pvr = mfpvr();
2281
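    /* As the checks below assume, ibm,vmx appears to be 2 for VSX, 1 for
     * Altivec only and 0 for neither, while a non-zero ibm,dfp indicates
     * decimal floating point support. */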
2282 if (vmx != -1) {
2283 /* Only override when we know what the host supports */
2284 alter_insns(&pcc->insns_flags, PPC_ALTIVEC, vmx > 0);
2285 alter_insns(&pcc->insns_flags2, PPC2_VSX, vmx > 1);
2286 }
2287 if (dfp != -1) {
2288 /* Only override when we know what the host supports */
2289 alter_insns(&pcc->insns_flags2, PPC2_DFP, dfp);
2290 }
2291
2292 if (dcache_size != -1) {
2293 pcc->l1_dcache_size = dcache_size;
2294 }
2295
2296 if (icache_size != -1) {
2297 pcc->l1_icache_size = icache_size;
2298 }
2299
2300 /* Reason: kvmppc_host_cpu_initfn() dies when !kvm_enabled() */
2301 dc->cannot_destroy_with_object_finalize_yet = true;
2302 }
2303
2304 bool kvmppc_has_cap_epr(void)
2305 {
2306 return cap_epr;
2307 }
2308
2309 bool kvmppc_has_cap_htab_fd(void)
2310 {
2311 return cap_htab_fd;
2312 }
2313
2314 bool kvmppc_has_cap_fixup_hcalls(void)
2315 {
2316 return cap_fixup_hcalls;
2317 }
2318
2319 static PowerPCCPUClass *ppc_cpu_get_family_class(PowerPCCPUClass *pcc)
2320 {
2321 ObjectClass *oc = OBJECT_CLASS(pcc);
2322
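    /* Walk up the QOM class hierarchy until we reach the first abstract
     * ancestor, which is the generic family class for this CPU model. */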
2323 while (oc && !object_class_is_abstract(oc)) {
2324 oc = object_class_get_parent(oc);
2325 }
2326 assert(oc);
2327
2328 return POWERPC_CPU_CLASS(oc);
2329 }
2330
2331 PowerPCCPUClass *kvm_ppc_get_host_cpu_class(void)
2332 {
2333 uint32_t host_pvr = mfpvr();
2334 PowerPCCPUClass *pvr_pcc;
2335
2336 pvr_pcc = ppc_cpu_class_by_pvr(host_pvr);
2337 if (pvr_pcc == NULL) {
2338 pvr_pcc = ppc_cpu_class_by_pvr_mask(host_pvr);
2339 }
2340
2341 return pvr_pcc;
2342 }
2343
2344 static int kvm_ppc_register_host_cpu_type(void)
2345 {
2346 TypeInfo type_info = {
2347 .name = TYPE_HOST_POWERPC_CPU,
2348 .instance_init = kvmppc_host_cpu_initfn,
2349 .class_init = kvmppc_host_cpu_class_init,
2350 };
2351 PowerPCCPUClass *pvr_pcc;
2352 DeviceClass *dc;
2353
2354 pvr_pcc = kvm_ppc_get_host_cpu_class();
2355 if (pvr_pcc == NULL) {
2356 return -1;
2357 }
2358 type_info.parent = object_class_get_name(OBJECT_CLASS(pvr_pcc));
2359 type_register(&type_info);
2360
2361 /* Also register the generic family CPU class for the host CPU's family */
2362 pvr_pcc = ppc_cpu_get_family_class(pvr_pcc);
2363 dc = DEVICE_CLASS(pvr_pcc);
2364 type_info.parent = object_class_get_name(OBJECT_CLASS(pvr_pcc));
2365 type_info.name = g_strdup_printf("%s-"TYPE_POWERPC_CPU, dc->desc);
2366 type_register(&type_info);
2367
2368 return 0;
2369 }
2370
2371 int kvmppc_define_rtas_kernel_token(uint32_t token, const char *function)
2372 {
2373 struct kvm_rtas_token_args args = {
2374 .token = token,
2375 };
2376
2377 if (!kvm_check_extension(kvm_state, KVM_CAP_PPC_RTAS)) {
2378 return -ENOENT;
2379 }
2380
2381 strncpy(args.name, function, sizeof(args.name));
2382
2383 return kvm_vm_ioctl(kvm_state, KVM_PPC_RTAS_DEFINE_TOKEN, &args);
2384 }
2385
2386 int kvmppc_get_htab_fd(bool write)
2387 {
2388 struct kvm_get_htab_fd s = {
2389 .flags = write ? KVM_GET_HTAB_WRITE : 0,
2390 .start_index = 0,
2391 };
2392
2393 if (!cap_htab_fd) {
2394 fprintf(stderr, "KVM version doesn't support saving the hash table\n");
2395 return -1;
2396 }
2397
2398 return kvm_vm_ioctl(kvm_state, KVM_PPC_GET_HTAB_FD, &s);
2399 }
2400
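/* Stream the hash table from the KVM htab fd into the migration stream.
 * Each chunk is written as: be32 index, be16 n_valid, be16 n_invalid,
 * followed by n_valid HPTEs of HASH_PTE_SIZE_64 bytes each. Returns 1
 * when the fd reports end of data, 0 if we stopped because max_ns
 * elapsed, or a negative value on read error. */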
2401 int kvmppc_save_htab(QEMUFile *f, int fd, size_t bufsize, int64_t max_ns)
2402 {
2403 int64_t starttime = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2404 uint8_t buf[bufsize];
2405 ssize_t rc;
2406
2407 do {
2408 rc = read(fd, buf, bufsize);
2409 if (rc < 0) {
2410 fprintf(stderr, "Error reading data from KVM HTAB fd: %s\n",
2411 strerror(errno));
2412 return rc;
2413 } else if (rc) {
2414 uint8_t *buffer = buf;
2415 ssize_t n = rc;
2416 while (n) {
2417 struct kvm_get_htab_header *head =
2418 (struct kvm_get_htab_header *) buffer;
2419 size_t chunksize = sizeof(*head) +
2420 HASH_PTE_SIZE_64 * head->n_valid;
2421
2422 qemu_put_be32(f, head->index);
2423 qemu_put_be16(f, head->n_valid);
2424 qemu_put_be16(f, head->n_invalid);
2425 qemu_put_buffer(f, (void *)(head + 1),
2426 HASH_PTE_SIZE_64 * head->n_valid);
2427
2428 buffer += chunksize;
2429 n -= chunksize;
2430 }
2431 }
2432 } while ((rc != 0)
2433 && ((max_ns < 0)
2434 || ((qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - starttime) < max_ns)));
2435
2436 return (rc == 0) ? 1 : 0;
2437 }
2438
2439 int kvmppc_load_htab_chunk(QEMUFile *f, int fd, uint32_t index,
2440 uint16_t n_valid, uint16_t n_invalid)
2441 {
2442 struct kvm_get_htab_header *buf;
2443 size_t chunksize = sizeof(*buf) + n_valid*HASH_PTE_SIZE_64;
2444 ssize_t rc;
2445
2446 buf = alloca(chunksize);
2447 buf->index = index;
2448 buf->n_valid = n_valid;
2449 buf->n_invalid = n_invalid;
2450
2451 qemu_get_buffer(f, (void *)(buf + 1), HASH_PTE_SIZE_64*n_valid);
2452
2453 rc = write(fd, buf, chunksize);
2454 if (rc < 0) {
2455 fprintf(stderr, "Error writing KVM hash table: %s\n",
2456 strerror(errno));
2457 return rc;
2458 }
2459 if (rc != chunksize) {
2460 /* We should never get a short write on a single chunk */
2461 fprintf(stderr, "Short write, restoring KVM hash table\n");
2462 return -1;
2463 }
2464 return 0;
2465 }
2466
2467 bool kvm_arch_stop_on_emulation_error(CPUState *cpu)
2468 {
2469 return true;
2470 }
2471
2472 int kvm_arch_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
2473 {
2474 return 1;
2475 }
2476
2477 int kvm_arch_on_sigbus(int code, void *addr)
2478 {
2479 return 1;
2480 }
2481
2482 void kvm_arch_init_irq_routing(KVMState *s)
2483 {
2484 }
2485
2486 struct kvm_get_htab_buf {
2487 struct kvm_get_htab_header header;
2488 /*
2489 * We require one extra entry for the read
2490 */
2491 target_ulong hpte[(HPTES_PER_GROUP * 2) + 1];
2492 };
2493
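/* Read one HPTE group starting at pte_index. The returned token is
 * really a pointer to a heap-allocated buffer of HPTEs; the caller must
 * release it with kvmppc_hash64_free_pteg(). Returns 0 on failure. */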
2494 uint64_t kvmppc_hash64_read_pteg(PowerPCCPU *cpu, target_ulong pte_index)
2495 {
2496 int htab_fd;
2497 struct kvm_get_htab_fd ghf;
2498 struct kvm_get_htab_buf *hpte_buf;
2499
2500 ghf.flags = 0;
2501 ghf.start_index = pte_index;
2502 htab_fd = kvm_vm_ioctl(kvm_state, KVM_PPC_GET_HTAB_FD, &ghf);
2503 if (htab_fd < 0) {
2504 goto error_out;
2505 }
2506
2507 hpte_buf = g_malloc0(sizeof(*hpte_buf));
2508 /*
2509 * Read the hpte group
2510 */
2511 if (read(htab_fd, hpte_buf, sizeof(*hpte_buf)) < 0) {
2512 goto out_close;
2513 }
2514
2515 close(htab_fd);
2516 return (uint64_t)(uintptr_t) hpte_buf->hpte;
2517
2518 out_close:
2519 g_free(hpte_buf);
2520 close(htab_fd);
2521 error_out:
2522 return 0;
2523 }
2524
2525 void kvmppc_hash64_free_pteg(uint64_t token)
2526 {
2527 struct kvm_get_htab_buf *htab_buf;
2528
2529 htab_buf = container_of((void *)(uintptr_t) token, struct kvm_get_htab_buf,
2530 hpte);
2531 g_free(htab_buf);
2532 return;
2533 }
2534
2535 void kvmppc_hash64_write_pte(CPUPPCState *env, target_ulong pte_index,
2536 target_ulong pte0, target_ulong pte1)
2537 {
2538 int htab_fd;
2539 struct kvm_get_htab_fd ghf;
2540 struct kvm_get_htab_buf hpte_buf;
2541
2542 ghf.flags = 0;
2543 ghf.start_index = 0; /* Ignored */
2544 htab_fd = kvm_vm_ioctl(kvm_state, KVM_PPC_GET_HTAB_FD, &ghf);
2545 if (htab_fd < 0) {
2546 goto error_out;
2547 }
2548
2549 hpte_buf.header.n_valid = 1;
2550 hpte_buf.header.n_invalid = 0;
2551 hpte_buf.header.index = pte_index;
2552 hpte_buf.hpte[0] = pte0;
2553 hpte_buf.hpte[1] = pte1;
2554 /*
2555 * Write the hpte entry.
2556 * CAUTION: write() is declared with the warn_unused_result attribute,
2557 * so we must check the return value even though we do nothing with it.
2558 */
2559 if (write(htab_fd, &hpte_buf, sizeof(hpte_buf)) < 0) {
2560 goto out_close;
2561 }
2562
2563 out_close:
2564 close(htab_fd);
2565 return;
2566
2567 error_out:
2568 return;
2569 }
2570
2571 int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route,
2572 uint64_t address, uint32_t data, PCIDevice *dev)
2573 {
2574 return 0;
2575 }
2576
2577 int kvm_arch_msi_data_to_gsi(uint32_t data)
2578 {
2579 return data & 0xffff;
2580 }
2581
2582 int kvmppc_enable_hwrng(void)
2583 {
2584 if (!kvm_enabled() || !kvm_check_extension(kvm_state, KVM_CAP_PPC_HWRNG)) {
2585 return -1;
2586 }
2587
2588 return kvmppc_enable_hcall(kvm_state, H_RANDOM);
2589 }