target-ppc/kvm.c

   1 /*
   2  * PowerPC implementation of KVM hooks
   3  *
   4  * Copyright IBM Corp. 2007
   5  * Copyright (C) 2011 Freescale Semiconductor, Inc.
   6  *
   7  * Authors:
   8  *  Jerone Young <jyoung5@us.ibm.com>
   9  *  Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
  10  *  Hollis Blanchard <hollisb@us.ibm.com>
  11  *
  12  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  13  * See the COPYING file in the top-level directory.
  14  *
  15  */
  16
  17 #include <dirent.h>
  18 #include <sys/types.h>
  19 #include <sys/ioctl.h>
  20 #include <sys/mman.h>
  21 #include <sys/vfs.h>
  22
  23 #include <linux/kvm.h>
  24
  25 #include "qemu-common.h"
  26 #include "qemu-timer.h"
  27 #include "sysemu.h"
  28 #include "kvm.h"
  29 #include "kvm_ppc.h"
  30 #include "cpu.h"
  31 #include "cpus.h"
  32 #include "device_tree.h"
  33 #include "hw/sysbus.h"
  34 #include "hw/spapr.h"
  35
  36 #include "hw/sysbus.h"
  37 #include "hw/spapr.h"
  38 #include "hw/spapr_vio.h"
  39
/* Uncomment the following define to enable debug tracing via dprintf(). */
//#define DEBUG_KVM

/* dprintf(): printf-style debug output on stderr when DEBUG_KVM is defined;
 * expands to an empty statement otherwise. */
#ifdef DEBUG_KVM
#define dprintf(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define dprintf(fmt, ...) \
    do { } while (0)
#endif

/* Host device-tree directory scanned by kvmppc_find_cpu_dt() for CPU nodes. */
#define PROC_DEVTREE_CPU      "/proc/device-tree/cpus/"
  51
/* Table of required KVM capabilities for this target; it contains only the
 * KVM_CAP_LAST_INFO entry. */
const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
    KVM_CAP_LAST_INFO
};

/* Cached results of kvm_check_extension() for optional capabilities.
 * All of them are (re)assigned in kvm_arch_init(). */
static int cap_interrupt_unset = false;
static int cap_interrupt_level = false;
static int cap_segstate;
static int cap_booke_sregs;
static int cap_ppc_smt;
static int cap_ppc_rma;
static int cap_spapr_tce;
  63
/* XXX We have a race condition where we actually have a level triggered
 *     interrupt, but the infrastructure can't expose that yet, so the guest
 *     takes but ignores it, goes to sleep and never gets notified that there's
 *     still an interrupt pending.
 *
 *     As a quick workaround, let's just wake up again 20 ms after we injected
 *     an interrupt. That way we can assure that we're always reinjecting
 *     interrupts in case the guest swallowed them.
 *
 *     The timer is created in kvm_arch_init_vcpu() with kvm_kick_env() as its
 *     callback and is re-armed in kvm_arch_pre_run() after each injection.
 */
static QEMUTimer *idle_timer;
  74
  75 static void kvm_kick_env(void *env)
  76 {
  77     qemu_cpu_kick(env);
  78 }
  79
  80 int kvm_arch_init(KVMState *s)
  81 {
  82     cap_interrupt_unset = kvm_check_extension(s, KVM_CAP_PPC_UNSET_IRQ);
  83     cap_interrupt_level = kvm_check_extension(s, KVM_CAP_PPC_IRQ_LEVEL);
  84     cap_segstate = kvm_check_extension(s, KVM_CAP_PPC_SEGSTATE);
  85     cap_booke_sregs = kvm_check_extension(s, KVM_CAP_PPC_BOOKE_SREGS);
  86     cap_ppc_smt = kvm_check_extension(s, KVM_CAP_PPC_SMT);
  87     cap_ppc_rma = kvm_check_extension(s, KVM_CAP_PPC_RMA);
  88     cap_spapr_tce = kvm_check_extension(s, KVM_CAP_SPAPR_TCE);
  89
  90     if (!cap_interrupt_level) {
  91         fprintf(stderr, "KVM: Couldn't find level irq capability. Expect the "
  92                         "VM to stall at times!\n");
  93     }
  94
  95     return 0;
  96 }
  97
  98 static int kvm_arch_sync_sregs(CPUPPCState *cenv)
  99 {
 100     struct kvm_sregs sregs;
 101     int ret;
 102
 103     if (cenv->excp_model == POWERPC_EXCP_BOOKE) {
 104         /* What we're really trying to say is "if we're on BookE, we use
 105            the native PVR for now". This is the only sane way to check
 106            it though, so we potentially confuse users that they can run
 107            BookE guests on BookS. Let's hope nobody dares enough :) */
 108         return 0;
 109     } else {
 110         if (!cap_segstate) {
 111             fprintf(stderr, "kvm error: missing PVR setting capability\n");
 112             return -ENOSYS;
 113         }
 114     }
 115
 116     ret = kvm_vcpu_ioctl(cenv, KVM_GET_SREGS, &sregs);
 117     if (ret) {
 118         return ret;
 119     }
 120
 121     sregs.pvr = cenv->spr[SPR_PVR];
 122     return kvm_vcpu_ioctl(cenv, KVM_SET_SREGS, &sregs);
 123 }
 124
 125 /* Set up a shared TLB array with KVM */
 126 static int kvm_booke206_tlb_init(CPUPPCState *env)
 127 {
 128     struct kvm_book3e_206_tlb_params params = {};
 129     struct kvm_config_tlb cfg = {};
 130     struct kvm_enable_cap encap = {};
 131     unsigned int entries = 0;
 132     int ret, i;
 133
 134     if (!kvm_enabled() ||
 135         !kvm_check_extension(env->kvm_state, KVM_CAP_SW_TLB)) {
 136         return 0;
 137     }
 138
 139     assert(ARRAY_SIZE(params.tlb_sizes) == BOOKE206_MAX_TLBN);
 140
 141     for (i = 0; i < BOOKE206_MAX_TLBN; i++) {
 142         params.tlb_sizes[i] = booke206_tlb_size(env, i);
 143         params.tlb_ways[i] = booke206_tlb_ways(env, i);
 144         entries += params.tlb_sizes[i];
 145     }
 146
 147     assert(entries == env->nb_tlb);
 148     assert(sizeof(struct kvm_book3e_206_tlb_entry) == sizeof(ppcmas_tlb_t));
 149
 150     env->tlb_dirty = true;
 151
 152     cfg.array = (uintptr_t)env->tlb.tlbm;
 153     cfg.array_len = sizeof(ppcmas_tlb_t) * entries;
 154     cfg.params = (uintptr_t)&params;
 155     cfg.mmu_type = KVM_MMU_FSL_BOOKE_NOHV;
 156
 157     encap.cap = KVM_CAP_SW_TLB;
 158     encap.args[0] = (uintptr_t)&cfg;
 159
 160     ret = kvm_vcpu_ioctl(env, KVM_ENABLE_CAP, &encap);
 161     if (ret < 0) {
 162         fprintf(stderr, "%s: couldn't enable KVM_CAP_SW_TLB: %s\n",
 163                 __func__, strerror(-ret));
 164         return ret;
 165     }
 166
 167     env->kvm_sw_tlb = true;
 168     return 0;
 169 }
 170
 171
 172 #if defined(TARGET_PPC64)
 173 static void kvm_get_fallback_smmu_info(CPUPPCState *env,
 174                                        struct kvm_ppc_smmu_info *info)
 175 {
 176     memset(info, 0, sizeof(*info));
 177
 178     /* We don't have the new KVM_PPC_GET_SMMU_INFO ioctl, so
 179      * need to "guess" what the supported page sizes are.
 180      *
 181      * For that to work we make a few assumptions:
 182      *
 183      * - If KVM_CAP_PPC_GET_PVINFO is supported we are running "PR"
 184      *   KVM which only supports 4K and 16M pages, but supports them
 185      *   regardless of the backing store characteritics. We also don't
 186      *   support 1T segments.
 187      *
 188      *   This is safe as if HV KVM ever supports that capability or PR
 189      *   KVM grows supports for more page/segment sizes, those versions
 190      *   will have implemented KVM_CAP_PPC_GET_SMMU_INFO and thus we
 191      *   will not hit this fallback
 192      *
 193      * - Else we are running HV KVM. This means we only support page
 194      *   sizes that fit in the backing store. Additionally we only
 195      *   advertize 64K pages if the processor is ARCH 2.06 and we assume
 196      *   P7 encodings for the SLB and hash table. Here too, we assume
 197      *   support for any newer processor will mean a kernel that
 198      *   implements KVM_CAP_PPC_GET_SMMU_INFO and thus doesn't hit
 199      *   this fallback.
 200      */
 201     if (kvm_check_extension(env->kvm_state, KVM_CAP_PPC_GET_PVINFO)) {
 202         /* No flags */
 203         info->flags = 0;
 204         info->slb_size = 64;
 205
 206         /* Standard 4k base page size segment */
 207         info->sps[0].page_shift = 12;
 208         info->sps[0].slb_enc = 0;
 209         info->sps[0].enc[0].page_shift = 12;
 210         info->sps[0].enc[0].pte_enc = 0;
 211
 212         /* Standard 16M large page size segment */
 213         info->sps[1].page_shift = 24;
 214         info->sps[1].slb_enc = SLB_VSID_L;
 215         info->sps[1].enc[0].page_shift = 24;
 216         info->sps[1].enc[0].pte_enc = 0;
 217     } else {
 218         int i = 0;
 219
 220         /* HV KVM has backing store size restrictions */
 221         info->flags = KVM_PPC_PAGE_SIZES_REAL;
 222
 223         if (env->mmu_model & POWERPC_MMU_1TSEG) {
 224             info->flags |= KVM_PPC_1T_SEGMENTS;
 225         }
 226
 227         if (env->mmu_model == POWERPC_MMU_2_06) {
 228             info->slb_size = 32;
 229         } else {
 230             info->slb_size = 64;
 231         }
 232
 233         /* Standard 4k base page size segment */
 234         info->sps[i].page_shift = 12;
 235         info->sps[i].slb_enc = 0;
 236         info->sps[i].enc[0].page_shift = 12;
 237         info->sps[i].enc[0].pte_enc = 0;
 238         i++;
 239
 240         /* 64K on MMU 2.06 */
 241         if (env->mmu_model == POWERPC_MMU_2_06) {
 242             info->sps[i].page_shift = 16;
 243             info->sps[i].slb_enc = 0x110;
 244             info->sps[i].enc[0].page_shift = 16;
 245             info->sps[i].enc[0].pte_enc = 1;
 246             i++;
 247         }
 248
 249         /* Standard 16M large page size segment */
 250         info->sps[i].page_shift = 24;
 251         info->sps[i].slb_enc = SLB_VSID_L;
 252         info->sps[i].enc[0].page_shift = 24;
 253         info->sps[i].enc[0].pte_enc = 0;
 254     }
 255 }
 256
 257 static void kvm_get_smmu_info(CPUPPCState *env, struct kvm_ppc_smmu_info *info)
 258 {
 259     int ret;
 260
 261     if (kvm_check_extension(env->kvm_state, KVM_CAP_PPC_GET_SMMU_INFO)) {
 262         ret = kvm_vm_ioctl(env->kvm_state, KVM_PPC_GET_SMMU_INFO, info);
 263         if (ret == 0) {
 264             return;
 265         }
 266     }
 267
 268     kvm_get_fallback_smmu_info(env, info);
 269 }
 270
 271 static long getrampagesize(void)
 272 {
 273     struct statfs fs;
 274     int ret;
 275
 276     if (!mem_path) {
 277         /* guest RAM is backed by normal anonymous pages */
 278         return getpagesize();
 279     }
 280
 281     do {
 282         ret = statfs(mem_path, &fs);
 283     } while (ret != 0 && errno == EINTR);
 284
 285     if (ret != 0) {
 286         fprintf(stderr, "Couldn't statfs() memory path: %s\n",
 287                 strerror(errno));
 288         exit(1);
 289     }
 290
 291 #define HUGETLBFS_MAGIC       0x958458f6
 292
 293     if (fs.f_type != HUGETLBFS_MAGIC) {
 294         /* Explicit mempath, but it's ordinary pages */
 295         return getpagesize();
 296     }
 297
 298     /* It's hugepage, return the huge page size */
 299     return fs.f_bsize;
 300 }
 301
 302 static bool kvm_valid_page_size(uint32_t flags, long rampgsize, uint32_t shift)
 303 {
 304     if (!(flags & KVM_PPC_PAGE_SIZES_REAL)) {
 305         return true;
 306     }
 307
 308     return (1ul << shift) <= rampgsize;
 309 }
 310
 311 static void kvm_fixup_page_sizes(CPUPPCState *env)
 312 {
 313     static struct kvm_ppc_smmu_info smmu_info;
 314     static bool has_smmu_info;
 315     long rampagesize;
 316     int iq, ik, jq, jk;
 317
 318     /* We only handle page sizes for 64-bit server guests for now */
 319     if (!(env->mmu_model & POWERPC_MMU_64)) {
 320         return;
 321     }
 322
 323     /* Collect MMU info from kernel if not already */
 324     if (!has_smmu_info) {
 325         kvm_get_smmu_info(env, &smmu_info);
 326         has_smmu_info = true;
 327     }
 328
 329     rampagesize = getrampagesize();
 330
 331     /* Convert to QEMU form */
 332     memset(&env->sps, 0, sizeof(env->sps));
 333
 334     for (ik = iq = 0; ik < KVM_PPC_PAGE_SIZES_MAX_SZ; ik++) {
 335         struct ppc_one_seg_page_size *qsps = &env->sps.sps[iq];
 336         struct kvm_ppc_one_seg_page_size *ksps = &smmu_info.sps[ik];
 337
 338         if (!kvm_valid_page_size(smmu_info.flags, rampagesize,
 339                                  ksps->page_shift)) {
 340             continue;
 341         }
 342         qsps->page_shift = ksps->page_shift;
 343         qsps->slb_enc = ksps->slb_enc;
 344         for (jk = jq = 0; jk < KVM_PPC_PAGE_SIZES_MAX_SZ; jk++) {
 345             if (!kvm_valid_page_size(smmu_info.flags, rampagesize,
 346                                      ksps->enc[jk].page_shift)) {
 347                 continue;
 348             }
 349             qsps->enc[jq].page_shift = ksps->enc[jk].page_shift;
 350             qsps->enc[jq].pte_enc = ksps->enc[jk].pte_enc;
 351             if (++jq >= PPC_PAGE_SIZES_MAX_SZ) {
 352                 break;
 353             }
 354         }
 355         if (++iq >= PPC_PAGE_SIZES_MAX_SZ) {
 356             break;
 357         }
 358     }
 359     env->slb_nr = smmu_info.slb_size;
 360     if (smmu_info.flags & KVM_PPC_1T_SEGMENTS) {
 361         env->mmu_model |= POWERPC_MMU_1TSEG;
 362     } else {
 363         env->mmu_model &= ~POWERPC_MMU_1TSEG;
 364     }
 365 }
 366 #else /* defined (TARGET_PPC64) */
 367
 368 static inline void kvm_fixup_page_sizes(CPUPPCState *env)
 369 {
 370 }
 371
 372 #endif /* !defined (TARGET_PPC64) */
 373
 374 int kvm_arch_init_vcpu(CPUPPCState *cenv)
 375 {
 376     int ret;
 377
 378     /* Gather server mmu info from KVM and update the CPU state */
 379     kvm_fixup_page_sizes(cenv);
 380
 381     /* Synchronize sregs with kvm */
 382     ret = kvm_arch_sync_sregs(cenv);
 383     if (ret) {
 384         return ret;
 385     }
 386
 387     idle_timer = qemu_new_timer_ns(vm_clock, kvm_kick_env, cenv);
 388
 389     /* Some targets support access to KVM's guest TLB. */
 390     switch (cenv->mmu_model) {
 391     case POWERPC_MMU_BOOKE206:
 392         ret = kvm_booke206_tlb_init(cenv);
 393         break;
 394     default:
 395         break;
 396     }
 397
 398     return ret;
 399 }
 400
 401 void kvm_arch_reset_vcpu(CPUPPCState *env)
 402 {
 403 }
 404
 405 static void kvm_sw_tlb_put(CPUPPCState *env)
 406 {
 407     struct kvm_dirty_tlb dirty_tlb;
 408     unsigned char *bitmap;
 409     int ret;
 410
 411     if (!env->kvm_sw_tlb) {
 412         return;
 413     }
 414
 415     bitmap = g_malloc((env->nb_tlb + 7) / 8);
 416     memset(bitmap, 0xFF, (env->nb_tlb + 7) / 8);
 417
 418     dirty_tlb.bitmap = (uintptr_t)bitmap;
 419     dirty_tlb.num_dirty = env->nb_tlb;
 420
 421     ret = kvm_vcpu_ioctl(env, KVM_DIRTY_TLB, &dirty_tlb);
 422     if (ret) {
 423         fprintf(stderr, "%s: KVM_DIRTY_TLB: %s\n",
 424                 __func__, strerror(-ret));
 425     }
 426
 427     g_free(bitmap);
 428 }
 429
 430 int kvm_arch_put_registers(CPUPPCState *env, int level)
 431 {
 432     struct kvm_regs regs;
 433     int ret;
 434     int i;
 435
 436     ret = kvm_vcpu_ioctl(env, KVM_GET_REGS, &regs);
 437     if (ret < 0)
 438         return ret;
 439
 440     regs.ctr = env->ctr;
 441     regs.lr  = env->lr;
 442     regs.xer = env->xer;
 443     regs.msr = env->msr;
 444     regs.pc = env->nip;
 445
 446     regs.srr0 = env->spr[SPR_SRR0];
 447     regs.srr1 = env->spr[SPR_SRR1];
 448
 449     regs.sprg0 = env->spr[SPR_SPRG0];
 450     regs.sprg1 = env->spr[SPR_SPRG1];
 451     regs.sprg2 = env->spr[SPR_SPRG2];
 452     regs.sprg3 = env->spr[SPR_SPRG3];
 453     regs.sprg4 = env->spr[SPR_SPRG4];
 454     regs.sprg5 = env->spr[SPR_SPRG5];
 455     regs.sprg6 = env->spr[SPR_SPRG6];
 456     regs.sprg7 = env->spr[SPR_SPRG7];
 457
 458     regs.pid = env->spr[SPR_BOOKE_PID];
 459
 460     for (i = 0;i < 32; i++)
 461         regs.gpr[i] = env->gpr[i];
 462
 463     ret = kvm_vcpu_ioctl(env, KVM_SET_REGS, &regs);
 464     if (ret < 0)
 465         return ret;
 466
 467     if (env->tlb_dirty) {
 468         kvm_sw_tlb_put(env);
 469         env->tlb_dirty = false;
 470     }
 471
 472     return ret;
 473 }
 474
 475 int kvm_arch_get_registers(CPUPPCState *env)
 476 {
 477     struct kvm_regs regs;
 478     struct kvm_sregs sregs;
 479     uint32_t cr;
 480     int i, ret;
 481
 482     ret = kvm_vcpu_ioctl(env, KVM_GET_REGS, &regs);
 483     if (ret < 0)
 484         return ret;
 485
 486     cr = regs.cr;
 487     for (i = 7; i >= 0; i--) {
 488         env->crf[i] = cr & 15;
 489         cr >>= 4;
 490     }
 491
 492     env->ctr = regs.ctr;
 493     env->lr = regs.lr;
 494     env->xer = regs.xer;
 495     env->msr = regs.msr;
 496     env->nip = regs.pc;
 497
 498     env->spr[SPR_SRR0] = regs.srr0;
 499     env->spr[SPR_SRR1] = regs.srr1;
 500
 501     env->spr[SPR_SPRG0] = regs.sprg0;
 502     env->spr[SPR_SPRG1] = regs.sprg1;
 503     env->spr[SPR_SPRG2] = regs.sprg2;
 504     env->spr[SPR_SPRG3] = regs.sprg3;
 505     env->spr[SPR_SPRG4] = regs.sprg4;
 506     env->spr[SPR_SPRG5] = regs.sprg5;
 507     env->spr[SPR_SPRG6] = regs.sprg6;
 508     env->spr[SPR_SPRG7] = regs.sprg7;
 509
 510     env->spr[SPR_BOOKE_PID] = regs.pid;
 511
 512     for (i = 0;i < 32; i++)
 513         env->gpr[i] = regs.gpr[i];
 514
 515     if (cap_booke_sregs) {
 516         ret = kvm_vcpu_ioctl(env, KVM_GET_SREGS, &sregs);
 517         if (ret < 0) {
 518             return ret;
 519         }
 520
 521         if (sregs.u.e.features & KVM_SREGS_E_BASE) {
 522             env->spr[SPR_BOOKE_CSRR0] = sregs.u.e.csrr0;
 523             env->spr[SPR_BOOKE_CSRR1] = sregs.u.e.csrr1;
 524             env->spr[SPR_BOOKE_ESR] = sregs.u.e.esr;
 525             env->spr[SPR_BOOKE_DEAR] = sregs.u.e.dear;
 526             env->spr[SPR_BOOKE_MCSR] = sregs.u.e.mcsr;
 527             env->spr[SPR_BOOKE_TSR] = sregs.u.e.tsr;
 528             env->spr[SPR_BOOKE_TCR] = sregs.u.e.tcr;
 529             env->spr[SPR_DECR] = sregs.u.e.dec;
 530             env->spr[SPR_TBL] = sregs.u.e.tb & 0xffffffff;
 531             env->spr[SPR_TBU] = sregs.u.e.tb >> 32;
 532             env->spr[SPR_VRSAVE] = sregs.u.e.vrsave;
 533         }
 534
 535         if (sregs.u.e.features & KVM_SREGS_E_ARCH206) {
 536             env->spr[SPR_BOOKE_PIR] = sregs.u.e.pir;
 537             env->spr[SPR_BOOKE_MCSRR0] = sregs.u.e.mcsrr0;
 538             env->spr[SPR_BOOKE_MCSRR1] = sregs.u.e.mcsrr1;
 539             env->spr[SPR_BOOKE_DECAR] = sregs.u.e.decar;
 540             env->spr[SPR_BOOKE_IVPR] = sregs.u.e.ivpr;
 541         }
 542
 543         if (sregs.u.e.features & KVM_SREGS_E_64) {
 544             env->spr[SPR_BOOKE_EPCR] = sregs.u.e.epcr;
 545         }
 546
 547         if (sregs.u.e.features & KVM_SREGS_E_SPRG8) {
 548             env->spr[SPR_BOOKE_SPRG8] = sregs.u.e.sprg8;
 549         }
 550
 551         if (sregs.u.e.features & KVM_SREGS_E_IVOR) {
 552             env->spr[SPR_BOOKE_IVOR0] = sregs.u.e.ivor_low[0];
 553             env->spr[SPR_BOOKE_IVOR1] = sregs.u.e.ivor_low[1];
 554             env->spr[SPR_BOOKE_IVOR2] = sregs.u.e.ivor_low[2];
 555             env->spr[SPR_BOOKE_IVOR3] = sregs.u.e.ivor_low[3];
 556             env->spr[SPR_BOOKE_IVOR4] = sregs.u.e.ivor_low[4];
 557             env->spr[SPR_BOOKE_IVOR5] = sregs.u.e.ivor_low[5];
 558             env->spr[SPR_BOOKE_IVOR6] = sregs.u.e.ivor_low[6];
 559             env->spr[SPR_BOOKE_IVOR7] = sregs.u.e.ivor_low[7];
 560             env->spr[SPR_BOOKE_IVOR8] = sregs.u.e.ivor_low[8];
 561             env->spr[SPR_BOOKE_IVOR9] = sregs.u.e.ivor_low[9];
 562             env->spr[SPR_BOOKE_IVOR10] = sregs.u.e.ivor_low[10];
 563             env->spr[SPR_BOOKE_IVOR11] = sregs.u.e.ivor_low[11];
 564             env->spr[SPR_BOOKE_IVOR12] = sregs.u.e.ivor_low[12];
 565             env->spr[SPR_BOOKE_IVOR13] = sregs.u.e.ivor_low[13];
 566             env->spr[SPR_BOOKE_IVOR14] = sregs.u.e.ivor_low[14];
 567             env->spr[SPR_BOOKE_IVOR15] = sregs.u.e.ivor_low[15];
 568
 569             if (sregs.u.e.features & KVM_SREGS_E_SPE) {
 570                 env->spr[SPR_BOOKE_IVOR32] = sregs.u.e.ivor_high[0];
 571                 env->spr[SPR_BOOKE_IVOR33] = sregs.u.e.ivor_high[1];
 572                 env->spr[SPR_BOOKE_IVOR34] = sregs.u.e.ivor_high[2];
 573             }
 574
 575             if (sregs.u.e.features & KVM_SREGS_E_PM) {
 576                 env->spr[SPR_BOOKE_IVOR35] = sregs.u.e.ivor_high[3];
 577             }
 578
 579             if (sregs.u.e.features & KVM_SREGS_E_PC) {
 580                 env->spr[SPR_BOOKE_IVOR36] = sregs.u.e.ivor_high[4];
 581                 env->spr[SPR_BOOKE_IVOR37] = sregs.u.e.ivor_high[5];
 582             }
 583         }
 584
 585         if (sregs.u.e.features & KVM_SREGS_E_ARCH206_MMU) {
 586             env->spr[SPR_BOOKE_MAS0] = sregs.u.e.mas0;
 587             env->spr[SPR_BOOKE_MAS1] = sregs.u.e.mas1;
 588             env->spr[SPR_BOOKE_MAS2] = sregs.u.e.mas2;
 589             env->spr[SPR_BOOKE_MAS3] = sregs.u.e.mas7_3 & 0xffffffff;
 590             env->spr[SPR_BOOKE_MAS4] = sregs.u.e.mas4;
 591             env->spr[SPR_BOOKE_MAS6] = sregs.u.e.mas6;
 592             env->spr[SPR_BOOKE_MAS7] = sregs.u.e.mas7_3 >> 32;
 593             env->spr[SPR_MMUCFG] = sregs.u.e.mmucfg;
 594             env->spr[SPR_BOOKE_TLB0CFG] = sregs.u.e.tlbcfg[0];
 595             env->spr[SPR_BOOKE_TLB1CFG] = sregs.u.e.tlbcfg[1];
 596         }
 597
 598         if (sregs.u.e.features & KVM_SREGS_EXP) {
 599             env->spr[SPR_BOOKE_EPR] = sregs.u.e.epr;
 600         }
 601
 602         if (sregs.u.e.features & KVM_SREGS_E_PD) {
 603             env->spr[SPR_BOOKE_EPLC] = sregs.u.e.eplc;
 604             env->spr[SPR_BOOKE_EPSC] = sregs.u.e.epsc;
 605         }
 606
 607         if (sregs.u.e.impl_id == KVM_SREGS_E_IMPL_FSL) {
 608             env->spr[SPR_E500_SVR] = sregs.u.e.impl.fsl.svr;
 609             env->spr[SPR_Exxx_MCAR] = sregs.u.e.impl.fsl.mcar;
 610             env->spr[SPR_HID0] = sregs.u.e.impl.fsl.hid0;
 611
 612             if (sregs.u.e.impl.fsl.features & KVM_SREGS_E_FSL_PIDn) {
 613                 env->spr[SPR_BOOKE_PID1] = sregs.u.e.impl.fsl.pid1;
 614                 env->spr[SPR_BOOKE_PID2] = sregs.u.e.impl.fsl.pid2;
 615             }
 616         }
 617     }
 618
 619     if (cap_segstate) {
 620         ret = kvm_vcpu_ioctl(env, KVM_GET_SREGS, &sregs);
 621         if (ret < 0) {
 622             return ret;
 623         }
 624
 625         ppc_store_sdr1(env, sregs.u.s.sdr1);
 626
 627         /* Sync SLB */
 628 #ifdef TARGET_PPC64
 629         for (i = 0; i < 64; i++) {
 630             ppc_store_slb(env, sregs.u.s.ppc64.slb[i].slbe,
 631                                sregs.u.s.ppc64.slb[i].slbv);
 632         }
 633 #endif
 634
 635         /* Sync SRs */
 636         for (i = 0; i < 16; i++) {
 637             env->sr[i] = sregs.u.s.ppc32.sr[i];
 638         }
 639
 640         /* Sync BATs */
 641         for (i = 0; i < 8; i++) {
 642             env->DBAT[0][i] = sregs.u.s.ppc32.dbat[i] & 0xffffffff;
 643             env->DBAT[1][i] = sregs.u.s.ppc32.dbat[i] >> 32;
 644             env->IBAT[0][i] = sregs.u.s.ppc32.ibat[i] & 0xffffffff;
 645             env->IBAT[1][i] = sregs.u.s.ppc32.ibat[i] >> 32;
 646         }
 647     }
 648
 649     return 0;
 650 }
 651
 652 int kvmppc_set_interrupt(CPUPPCState *env, int irq, int level)
 653 {
 654     unsigned virq = level ? KVM_INTERRUPT_SET_LEVEL : KVM_INTERRUPT_UNSET;
 655
 656     if (irq != PPC_INTERRUPT_EXT) {
 657         return 0;
 658     }
 659
 660     if (!kvm_enabled() || !cap_interrupt_unset || !cap_interrupt_level) {
 661         return 0;
 662     }
 663
 664     kvm_vcpu_ioctl(env, KVM_INTERRUPT, &virq);
 665
 666     return 0;
 667 }
 668
/* PPC_INPUT_INT: bit number of the external interrupt input pin within
 * env->irq_input_state (tested in kvm_arch_pre_run()), selected according
 * to the target being built. */
#if defined(TARGET_PPCEMB)
#define PPC_INPUT_INT PPC40x_INPUT_INT
#elif defined(TARGET_PPC64)
#define PPC_INPUT_INT PPC970_INPUT_INT
#else
#define PPC_INPUT_INT PPC6xx_INPUT_INT
#endif
 676
 677 void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
 678 {
 679     int r;
 680     unsigned irq;
 681
 682     /* PowerPC QEMU tracks the various core input pins (interrupt, critical
 683      * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
 684     if (!cap_interrupt_level &&
 685         run->ready_for_interrupt_injection &&
 686         (env->interrupt_request & CPU_INTERRUPT_HARD) &&
 687         (env->irq_input_state & (1<<PPC_INPUT_INT)))
 688     {
 689         /* For now KVM disregards the 'irq' argument. However, in the
 690          * future KVM could cache it in-kernel to avoid a heavyweight exit
 691          * when reading the UIC.
 692          */
 693         irq = KVM_INTERRUPT_SET;
 694
 695         dprintf("injected interrupt %d\n", irq);
 696         r = kvm_vcpu_ioctl(env, KVM_INTERRUPT, &irq);
 697         if (r < 0)
 698             printf("cpu %d fail inject %x\n", env->cpu_index, irq);
 699
 700         /* Always wake up soon in case the interrupt was level based */
 701         qemu_mod_timer(idle_timer, qemu_get_clock_ns(vm_clock) +
 702                        (get_ticks_per_sec() / 50));
 703     }
 704
 705     /* We don't know if there are more interrupts pending after this. However,
 706      * the guest will return to userspace in the course of handling this one
 707      * anyways, so we will get a chance to deliver the rest. */
 708 }
 709
 710 void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run)
 711 {
 712 }
 713
 714 int kvm_arch_process_async_events(CPUPPCState *env)
 715 {
 716     return env->halted;
 717 }
 718
 719 static int kvmppc_handle_halt(CPUPPCState *env)
 720 {
 721     if (!(env->interrupt_request & CPU_INTERRUPT_HARD) && (msr_ee)) {
 722         env->halted = 1;
 723         env->exception_index = EXCP_HLT;
 724     }
 725
 726     return 0;
 727 }
 728
 729 /* map dcr access to existing qemu dcr emulation */
 730 static int kvmppc_handle_dcr_read(CPUPPCState *env, uint32_t dcrn, uint32_t *data)
 731 {
 732     if (ppc_dcr_read(env->dcr_env, dcrn, data) < 0)
 733         fprintf(stderr, "Read to unhandled DCR (0x%x)\n", dcrn);
 734
 735     return 0;
 736 }
 737
 738 static int kvmppc_handle_dcr_write(CPUPPCState *env, uint32_t dcrn, uint32_t data)
 739 {
 740     if (ppc_dcr_write(env->dcr_env, dcrn, data) < 0)
 741         fprintf(stderr, "Write to unhandled DCR (0x%x)\n", dcrn);
 742
 743     return 0;
 744 }
 745
 746 int kvm_arch_handle_exit(CPUPPCState *env, struct kvm_run *run)
 747 {
 748     int ret;
 749
 750     switch (run->exit_reason) {
 751     case KVM_EXIT_DCR:
 752         if (run->dcr.is_write) {
 753             dprintf("handle dcr write\n");
 754             ret = kvmppc_handle_dcr_write(env, run->dcr.dcrn, run->dcr.data);
 755         } else {
 756             dprintf("handle dcr read\n");
 757             ret = kvmppc_handle_dcr_read(env, run->dcr.dcrn, &run->dcr.data);
 758         }
 759         break;
 760     case KVM_EXIT_HLT:
 761         dprintf("handle halt\n");
 762         ret = kvmppc_handle_halt(env);
 763         break;
 764 #ifdef CONFIG_PSERIES
 765     case KVM_EXIT_PAPR_HCALL:
 766         dprintf("handle PAPR hypercall\n");
 767         run->papr_hcall.ret = spapr_hypercall(env, run->papr_hcall.nr,
 768                                               run->papr_hcall.args);
 769         ret = 1;
 770         break;
 771 #endif
 772     default:
 773         fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
 774         ret = -1;
 775         break;
 776     }
 777
 778     return ret;
 779 }
 780
 781 static int read_cpuinfo(const char *field, char *value, int len)
 782 {
 783     FILE *f;
 784     int ret = -1;
 785     int field_len = strlen(field);
 786     char line[512];
 787
 788     f = fopen("/proc/cpuinfo", "r");
 789     if (!f) {
 790         return -1;
 791     }
 792
 793     do {
 794         if(!fgets(line, sizeof(line), f)) {
 795             break;
 796         }
 797         if (!strncmp(line, field, field_len)) {
 798             strncpy(value, line, len);
 799             ret = 0;
 800             break;
 801         }
 802     } while(*line);
 803
 804     fclose(f);
 805
 806     return ret;
 807 }
 808
 809 uint32_t kvmppc_get_tbfreq(void)
 810 {
 811     char line[512];
 812     char *ns;
 813     uint32_t retval = get_ticks_per_sec();
 814
 815     if (read_cpuinfo("timebase", line, sizeof(line))) {
 816         return retval;
 817     }
 818
 819     if (!(ns = strchr(line, ':'))) {
 820         return retval;
 821     }
 822
 823     ns++;
 824
 825     retval = atoi(ns);
 826     return retval;
 827 }
 828
 829 /* Try to find a device tree node for a CPU with clock-frequency property */
 830 static int kvmppc_find_cpu_dt(char *buf, int buf_len)
 831 {
 832     struct dirent *dirp;
 833     DIR *dp;
 834
 835     if ((dp = opendir(PROC_DEVTREE_CPU)) == NULL) {
 836         printf("Can't open directory " PROC_DEVTREE_CPU "\n");
 837         return -1;
 838     }
 839
 840     buf[0] = '\0';
 841     while ((dirp = readdir(dp)) != NULL) {
 842         FILE *f;
 843         snprintf(buf, buf_len, "%s%s/clock-frequency", PROC_DEVTREE_CPU,
 844                  dirp->d_name);
 845         f = fopen(buf, "r");
 846         if (f) {
 847             snprintf(buf, buf_len, "%s%s", PROC_DEVTREE_CPU, dirp->d_name);
 848             fclose(f);
 849             break;
 850         }
 851         buf[0] = '\0';
 852     }
 853     closedir(dp);
 854     if (buf[0] == '\0') {
 855         printf("Unknown host!\n");
 856         return -1;
 857     }
 858
 859     return 0;
 860 }
 861
 862 /* Read a CPU node property from the host device tree that's a single
 863  * integer (32-bit or 64-bit).  Returns 0 if anything goes wrong
 864  * (can't find or open the property, or doesn't understand the
 865  * format) */
 866 static uint64_t kvmppc_read_int_cpu_dt(const char *propname)
 867 {
 868     char buf[PATH_MAX];
 869     union {
 870         uint32_t v32;
 871         uint64_t v64;
 872     } u;
 873     FILE *f;
 874     int len;
 875
 876     if (kvmppc_find_cpu_dt(buf, sizeof(buf))) {
 877         return -1;
 878     }
 879
 880     strncat(buf, "/", sizeof(buf) - strlen(buf));
 881     strncat(buf, propname, sizeof(buf) - strlen(buf));
 882
 883     f = fopen(buf, "rb");
 884     if (!f) {
 885         return -1;
 886     }
 887
 888     len = fread(&u, 1, sizeof(u), f);
 889     fclose(f);
 890     switch (len) {
 891     case 4:
 892         /* property is a 32-bit quantity */
 893         return be32_to_cpu(u.v32);
 894     case 8:
 895         return be64_to_cpu(u.v64);
 896     }
 897
 898     return 0;
 899 }
 900
 901 uint64_t kvmppc_get_clockfreq(void)
 902 {
 903     return kvmppc_read_int_cpu_dt("clock-frequency");
 904 }
 905
 906 uint32_t kvmppc_get_vmx(void)
 907 {
 908     return kvmppc_read_int_cpu_dt("ibm,vmx");
 909 }
 910
 911 uint32_t kvmppc_get_dfp(void)
 912 {
 913     return kvmppc_read_int_cpu_dt("ibm,dfp");
 914 }
 915
 916 int kvmppc_get_hypercall(CPUPPCState *env, uint8_t *buf, int buf_len)
 917 {
 918     uint32_t *hc = (uint32_t*)buf;
 919
 920     struct kvm_ppc_pvinfo pvinfo;
 921
 922     if (kvm_check_extension(env->kvm_state, KVM_CAP_PPC_GET_PVINFO) &&
 923         !kvm_vm_ioctl(env->kvm_state, KVM_PPC_GET_PVINFO, &pvinfo)) {
 924         memcpy(buf, pvinfo.hcall, buf_len);
 925
 926         return 0;
 927     }
 928
 929     /*
 930      * Fallback to always fail hypercalls:
 931      *
 932      *     li r3, -1
 933      *     nop
 934      *     nop
 935      *     nop
 936      */
 937
 938     hc[0] = 0x3860ffff;
 939     hc[1] = 0x60000000;
 940     hc[2] = 0x60000000;
 941     hc[3] = 0x60000000;
 942
 943     return 0;
 944 }
 945
 946 void kvmppc_set_papr(CPUPPCState *env)
 947 {
 948     struct kvm_enable_cap cap = {};
 949     struct kvm_one_reg reg = {};
 950     struct kvm_sregs sregs = {};
 951     int ret;
 952     uint64_t hior = env->spr[SPR_HIOR];
 953
 954     cap.cap = KVM_CAP_PPC_PAPR;
 955     ret = kvm_vcpu_ioctl(env, KVM_ENABLE_CAP, &cap);
 956
 957     if (ret) {
 958         goto fail;
 959     }
 960
 961     /*
 962      * XXX We set HIOR here. It really should be a qdev property of
 963      *     the CPU node, but we don't have CPUs converted to qdev yet.
 964      *
 965      *     Once we have qdev CPUs, move HIOR to a qdev property and
 966      *     remove this chunk.
 967      */
 968     reg.id = KVM_REG_PPC_HIOR;
 969     reg.addr = (uintptr_t)&hior;
 970     ret = kvm_vcpu_ioctl(env, KVM_SET_ONE_REG, &reg);
 971     if (ret) {
 972         fprintf(stderr, "Couldn't set HIOR. Maybe you're running an old \n"
 973                         "kernel with support for HV KVM but no PAPR PR \n"
 974                         "KVM in which case things will work. If they don't \n"
 975                         "please update your host kernel!\n");
 976     }
 977
 978     /* Set SDR1 so kernel space finds the HTAB */
 979     ret = kvm_vcpu_ioctl(env, KVM_GET_SREGS, &sregs);
 980     if (ret) {
 981         goto fail;
 982     }
 983
 984     sregs.u.s.sdr1 = env->spr[SPR_SDR1];
 985
 986     ret = kvm_vcpu_ioctl(env, KVM_SET_SREGS, &sregs);
 987     if (ret) {
 988         goto fail;
 989     }
 990
 991     return;
 992
 993 fail:
 994     cpu_abort(env, "This KVM version does not support PAPR\n");
 995 }
 996
 997 int kvmppc_smt_threads(void)
 998 {
 999     return cap_ppc_smt ? cap_ppc_smt : 1;
1000 }
1001
1002 off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem)
1003 {
1004     void *rma;
1005     off_t size;
1006     int fd;
1007     struct kvm_allocate_rma ret;
1008     MemoryRegion *rma_region;
1009
1010     /* If cap_ppc_rma == 0, contiguous RMA allocation is not supported
1011      * if cap_ppc_rma == 1, contiguous RMA allocation is supported, but
1012      *                      not necessary on this hardware
1013      * if cap_ppc_rma == 2, contiguous RMA allocation is needed on this hardware
1014      *
1015      * FIXME: We should allow the user to force contiguous RMA
1016      * allocation in the cap_ppc_rma==1 case.
1017      */
1018     if (cap_ppc_rma < 2) {
1019         return 0;
1020     }
1021
1022     fd = kvm_vm_ioctl(kvm_state, KVM_ALLOCATE_RMA, &ret);
1023     if (fd < 0) {
1024         fprintf(stderr, "KVM: Error on KVM_ALLOCATE_RMA: %s\n",
1025                 strerror(errno));
1026         return -1;
1027     }
1028
1029     size = MIN(ret.rma_size, 256ul << 20);
1030
1031     rma = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
1032     if (rma == MAP_FAILED) {
1033         fprintf(stderr, "KVM: Error mapping RMA: %s\n", strerror(errno));
1034         return -1;
1035     };
1036
1037     rma_region = g_new(MemoryRegion, 1);
1038     memory_region_init_ram_ptr(rma_region, name, size, rma);
1039     vmstate_register_ram_global(rma_region);
1040     memory_region_add_subregion(sysmem, 0, rma_region);
1041
1042     return size;
1043 }
1044
1045 void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd)
1046 {
1047     struct kvm_create_spapr_tce args = {
1048         .liobn = liobn,
1049         .window_size = window_size,
1050     };
1051     long len;
1052     int fd;
1053     void *table;
1054
1055     /* Must set fd to -1 so we don't try to munmap when called for
1056      * destroying the table, which the upper layers -will- do
1057      */
1058     *pfd = -1;
1059     if (!cap_spapr_tce) {
1060         return NULL;
1061     }
1062
1063     fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_SPAPR_TCE, &args);
1064     if (fd < 0) {
1065         fprintf(stderr, "KVM: Failed to create TCE table for liobn 0x%x\n",
1066                 liobn);
1067         return NULL;
1068     }
1069
1070     len = (window_size / SPAPR_VIO_TCE_PAGE_SIZE) * sizeof(VIOsPAPR_RTCE);
1071     /* FIXME: round this up to page size */
1072
1073     table = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
1074     if (table == MAP_FAILED) {
1075         fprintf(stderr, "KVM: Failed to map TCE table for liobn 0x%x\n",
1076                 liobn);
1077         close(fd);
1078         return NULL;
1079     }
1080
1081     *pfd = fd;
1082     return table;
1083 }
1084
1085 int kvmppc_remove_spapr_tce(void *table, int fd, uint32_t window_size)
1086 {
1087     long len;
1088
1089     if (fd < 0) {
1090         return -1;
1091     }
1092
1093     len = (window_size / SPAPR_VIO_TCE_PAGE_SIZE)*sizeof(VIOsPAPR_RTCE);
1094     if ((munmap(table, len) < 0) ||
1095         (close(fd) < 0)) {
1096         fprintf(stderr, "KVM: Unexpected error removing TCE table: %s",
1097                 strerror(errno));
1098         /* Leak the table */
1099     }
1100
1101     return 0;
1102 }
1103
1104 static inline uint32_t mfpvr(void)
1105 {
1106     uint32_t pvr;
1107
1108     asm ("mfpvr %0"
1109          : "=r"(pvr));
1110     return pvr;
1111 }
1112
/*
 * Set (@on == true) or clear (@on == false) the bits given in @flags
 * within *@word.  Bits outside @flags are left untouched.
 */
static void alter_insns(uint64_t *word, uint64_t flags, bool on)
{
    const uint64_t current = *word;

    *word = on ? (current | flags) : (current & ~flags);
}
1121
1122 const ppc_def_t *kvmppc_host_cpu_def(void)
1123 {
1124     uint32_t host_pvr = mfpvr();
1125     const ppc_def_t *base_spec;
1126     ppc_def_t *spec;
1127     uint32_t vmx = kvmppc_get_vmx();
1128     uint32_t dfp = kvmppc_get_dfp();
1129
1130     base_spec = ppc_find_by_pvr(host_pvr);
1131
1132     spec = g_malloc0(sizeof(*spec));
1133     memcpy(spec, base_spec, sizeof(*spec));
1134
1135     /* Now fix up the spec with information we can query from the host */
1136
1137     if (vmx != -1) {
1138         /* Only override when we know what the host supports */
1139         alter_insns(&spec->insns_flags, PPC_ALTIVEC, vmx > 0);
1140         alter_insns(&spec->insns_flags2, PPC2_VSX, vmx > 1);
1141     }
1142     if (dfp != -1) {
1143         /* Only override when we know what the host supports */
1144         alter_insns(&spec->insns_flags2, PPC2_DFP, dfp);
1145     }
1146
1147     return spec;
1148 }
1149
1150 int kvmppc_fixup_cpu(CPUPPCState *env)
1151 {
1152     int smt;
1153
1154     /* Adjust cpu index for SMT */
1155     smt = kvmppc_smt_threads();
1156     env->cpu_index = (env->cpu_index / smp_threads) * smt
1157         + (env->cpu_index % smp_threads);
1158
1159     return 0;
1160 }
1161
1162
1163 bool kvm_arch_stop_on_emulation_error(CPUPPCState *env)
1164 {
1165     return true;
1166 }
1167
1168 int kvm_arch_on_sigbus_vcpu(CPUPPCState *env, int code, void *addr)
1169 {
1170     return 1;
1171 }
1172
/*
 * Arch hook: unconditionally returns 1; none of the arguments is used.
 */
int kvm_arch_on_sigbus(int code, void *addr)
{
    (void)code;
    (void)addr;

    return 1;
}