cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu/config-file.h"
  27 #include "cpu.h"
  28 #include "monitor/monitor.h"
  29 #include "qapi/error.h"
  30 #include "qapi/qapi-commands-misc.h"
  31 #include "qapi/qapi-events-run-state.h"
  32 #include "qapi/qmp/qerror.h"
  33 #include "qemu/error-report.h"
  34 #include "sysemu/sysemu.h"
  35 #include "sysemu/block-backend.h"
  36 #include "exec/gdbstub.h"
  37 #include "sysemu/dma.h"
  38 #include "sysemu/hw_accel.h"
  39 #include "sysemu/kvm.h"
  40 #include "sysemu/hax.h"
  41 #include "sysemu/hvf.h"
  42 #include "sysemu/whpx.h"
  43 #include "exec/exec-all.h"
  44
  45 #include "qemu/thread.h"
  46 #include "sysemu/cpus.h"
  47 #include "sysemu/qtest.h"
  48 #include "qemu/main-loop.h"
  49 #include "qemu/option.h"
  50 #include "qemu/bitmap.h"
  51 #include "qemu/seqlock.h"
  52 #include "tcg.h"
  53 #include "hw/nmi.h"
  54 #include "sysemu/replay.h"
  55 #include "hw/boards.h"
  56
  57 #ifdef CONFIG_LINUX
  58
  59 #include <sys/prctl.h>
  60
  61 #ifndef PR_MCE_KILL
  62 #define PR_MCE_KILL 33
  63 #endif
  64
  65 #ifndef PR_MCE_KILL_SET
  66 #define PR_MCE_KILL_SET 1
  67 #endif
  68
  69 #ifndef PR_MCE_KILL_EARLY
  70 #define PR_MCE_KILL_EARLY 1
  71 #endif
  72
  73 #endif /* CONFIG_LINUX */
  74
  75 int64_t max_delay;
  76 int64_t max_advance;
  77
  78 /* vcpu throttling controls */
  79 static QEMUTimer *throttle_timer;
  80 static unsigned int throttle_percentage;
  81
  82 #define CPU_THROTTLE_PCT_MIN 1
  83 #define CPU_THROTTLE_PCT_MAX 99
  84 #define CPU_THROTTLE_TIMESLICE_NS 10000000
  85
  86 bool cpu_is_stopped(CPUState *cpu)
  87 {
  88     return cpu->stopped || !runstate_is_running();
  89 }
  90
  91 static bool cpu_thread_is_idle(CPUState *cpu)
  92 {
  93     if (cpu->stop || cpu->queued_work_first) {
  94         return false;
  95     }
  96     if (cpu_is_stopped(cpu)) {
  97         return true;
  98     }
  99     if (!cpu->halted || cpu_has_work(cpu) ||
 100         kvm_halt_in_kernel()) {
 101         return false;
 102     }
 103     return true;
 104 }
 105
 106 static bool all_cpu_threads_idle(void)
 107 {
 108     CPUState *cpu;
 109
 110     CPU_FOREACH(cpu) {
 111         if (!cpu_thread_is_idle(cpu)) {
 112             return false;
 113         }
 114     }
 115     return true;
 116 }
 117
 118 /***********************************************************/
 119 /* guest cycle counter */
 120
 121 /* Protected by TimersState seqlock */
 122
 123 static bool icount_sleep = true;
 124 /* Conversion factor from emulated instructions to virtual clock ticks.  */
 125 static int icount_time_shift;
 126 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
 127 #define MAX_ICOUNT_SHIFT 10
 128
 129 typedef struct TimersState {
 130     /* Protected by BQL.  */
 131     int64_t cpu_ticks_prev;
 132     int64_t cpu_ticks_offset;
 133
 134     /* cpu_clock_offset can be read out of BQL, so protect it with
 135      * this lock.
 136      */
 137     QemuSeqLock vm_clock_seqlock;
 138     int64_t cpu_clock_offset;
 139     int32_t cpu_ticks_enabled;
 140     int64_t dummy;
 141
 142     /* Compensate for varying guest execution speed.  */
 143     int64_t qemu_icount_bias;
 144     /* Only written by TCG thread */
 145     int64_t qemu_icount;
 146     /* for adjusting icount */
 147     int64_t vm_clock_warp_start;
 148     QEMUTimer *icount_rt_timer;
 149     QEMUTimer *icount_vm_timer;
 150     QEMUTimer *icount_warp_timer;
 151 } TimersState;
 152
 153 static TimersState timers_state;
 154 bool mttcg_enabled;
 155
 156 /*
 157  * We default to false if we know other options have been enabled
 158  * which are currently incompatible with MTTCG. Otherwise when each
 159  * guest (target) has been updated to support:
 160  *   - atomic instructions
 161  *   - memory ordering primitives (barriers)
 162  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 163  *
 164  * Once a guest architecture has been converted to the new primitives
 165  * there are two remaining limitations to check.
 166  *
 167  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 168  * - The host must have a stronger memory order than the guest
 169  *
 170  * It may be possible in future to support strong guests on weak hosts
 171  * but that will require tagging all load/stores in a guest with their
 172  * implicit memory order requirements which would likely slow things
 173  * down a lot.
 174  */
 175
 176 static bool check_tcg_memory_orders_compatible(void)
 177 {
 178 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 179     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 180 #else
 181     return false;
 182 #endif
 183 }
 184
 185 static bool default_mttcg_enabled(void)
 186 {
 187     if (use_icount || TCG_OVERSIZED_GUEST) {
 188         return false;
 189     } else {
 190 #ifdef TARGET_SUPPORTS_MTTCG
 191         return check_tcg_memory_orders_compatible();
 192 #else
 193         return false;
 194 #endif
 195     }
 196 }
 197
 198 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 199 {
 200     const char *t = qemu_opt_get(opts, "thread");
 201     if (t) {
 202         if (strcmp(t, "multi") == 0) {
 203             if (TCG_OVERSIZED_GUEST) {
 204                 error_setg(errp, "No MTTCG when guest word size > hosts");
 205             } else if (use_icount) {
 206                 error_setg(errp, "No MTTCG when icount is enabled");
 207             } else {
 208 #ifndef TARGET_SUPPORTS_MTTCG
 209                 error_report("Guest not yet converted to MTTCG - "
 210                              "you may get unexpected results");
 211 #endif
 212                 if (!check_tcg_memory_orders_compatible()) {
 213                     error_report("Guest expects a stronger memory ordering "
 214                                  "than the host provides");
 215                     error_printf("This may cause strange/hard to debug errors\n");
 216                 }
 217                 mttcg_enabled = true;
 218             }
 219         } else if (strcmp(t, "single") == 0) {
 220             mttcg_enabled = false;
 221         } else {
 222             error_setg(errp, "Invalid 'thread' setting %s", t);
 223         }
 224     } else {
 225         mttcg_enabled = default_mttcg_enabled();
 226     }
 227 }
 228
 229 /* The current number of executed instructions is based on what we
 230  * originally budgeted minus the current state of the decrementing
 231  * icount counters in extra/u16.low.
 232  */
 233 static int64_t cpu_get_icount_executed(CPUState *cpu)
 234 {
 235     return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
 236 }
 237
 238 /*
 239  * Update the global shared timer_state.qemu_icount to take into
 240  * account executed instructions. This is done by the TCG vCPU
 241  * thread so the main-loop can see time has moved forward.
 242  */
 243 void cpu_update_icount(CPUState *cpu)
 244 {
 245     int64_t executed = cpu_get_icount_executed(cpu);
 246     cpu->icount_budget -= executed;
 247
 248 #ifdef CONFIG_ATOMIC64
 249     atomic_set__nocheck(&timers_state.qemu_icount,
 250                         atomic_read__nocheck(&timers_state.qemu_icount) +
 251                         executed);
 252 #else /* FIXME: we need 64bit atomics to do this safely */
 253     timers_state.qemu_icount += executed;
 254 #endif
 255 }
 256
 257 int64_t cpu_get_icount_raw(void)
 258 {
 259     CPUState *cpu = current_cpu;
 260
 261     if (cpu && cpu->running) {
 262         if (!cpu->can_do_io) {
 263             error_report("Bad icount read");
 264             exit(1);
 265         }
 266         /* Take into account what has run */
 267         cpu_update_icount(cpu);
 268     }
 269 #ifdef CONFIG_ATOMIC64
 270     return atomic_read__nocheck(&timers_state.qemu_icount);
 271 #else /* FIXME: we need 64bit atomics to do this safely */
 272     return timers_state.qemu_icount;
 273 #endif
 274 }
 275
 276 /* Return the virtual CPU time, based on the instruction counter.  */
 277 static int64_t cpu_get_icount_locked(void)
 278 {
 279     int64_t icount = cpu_get_icount_raw();
 280     return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
 281 }
 282
 283 int64_t cpu_get_icount(void)
 284 {
 285     int64_t icount;
 286     unsigned start;
 287
 288     do {
 289         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 290         icount = cpu_get_icount_locked();
 291     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 292
 293     return icount;
 294 }
 295
 296 int64_t cpu_icount_to_ns(int64_t icount)
 297 {
 298     return icount << icount_time_shift;
 299 }
 300
 301 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 302  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 303  * counter.
 304  *
 305  * Caller must hold the BQL
 306  */
 307 int64_t cpu_get_ticks(void)
 308 {
 309     int64_t ticks;
 310
 311     if (use_icount) {
 312         return cpu_get_icount();
 313     }
 314
 315     ticks = timers_state.cpu_ticks_offset;
 316     if (timers_state.cpu_ticks_enabled) {
 317         ticks += cpu_get_host_ticks();
 318     }
 319
 320     if (timers_state.cpu_ticks_prev > ticks) {
 321         /* Note: non increasing ticks may happen if the host uses
 322            software suspend */
 323         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 324         ticks = timers_state.cpu_ticks_prev;
 325     }
 326
 327     timers_state.cpu_ticks_prev = ticks;
 328     return ticks;
 329 }
 330
 331 static int64_t cpu_get_clock_locked(void)
 332 {
 333     int64_t time;
 334
 335     time = timers_state.cpu_clock_offset;
 336     if (timers_state.cpu_ticks_enabled) {
 337         time += get_clock();
 338     }
 339
 340     return time;
 341 }
 342
 343 /* Return the monotonic time elapsed in VM, i.e.,
 344  * the time between vm_start and vm_stop
 345  */
 346 int64_t cpu_get_clock(void)
 347 {
 348     int64_t ti;
 349     unsigned start;
 350
 351     do {
 352         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 353         ti = cpu_get_clock_locked();
 354     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 355
 356     return ti;
 357 }
 358
 359 /* enable cpu_get_ticks()
 360  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 361  */
 362 void cpu_enable_ticks(void)
 363 {
 364     /* Here, the really thing protected by seqlock is cpu_clock_offset. */
 365     seqlock_write_begin(&timers_state.vm_clock_seqlock);
 366     if (!timers_state.cpu_ticks_enabled) {
 367         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 368         timers_state.cpu_clock_offset -= get_clock();
 369         timers_state.cpu_ticks_enabled = 1;
 370     }
 371     seqlock_write_end(&timers_state.vm_clock_seqlock);
 372 }
 373
 374 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 375  * cpu_get_ticks() after that.
 376  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 377  */
 378 void cpu_disable_ticks(void)
 379 {
 380     /* Here, the really thing protected by seqlock is cpu_clock_offset. */
 381     seqlock_write_begin(&timers_state.vm_clock_seqlock);
 382     if (timers_state.cpu_ticks_enabled) {
 383         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 384         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 385         timers_state.cpu_ticks_enabled = 0;
 386     }
 387     seqlock_write_end(&timers_state.vm_clock_seqlock);
 388 }
 389
 390 /* Correlation between real and virtual time is always going to be
 391    fairly approximate, so ignore small variation.
 392    When the guest is idle real and virtual time will be aligned in
 393    the IO wait loop.  */
 394 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 395
 396 static void icount_adjust(void)
 397 {
 398     int64_t cur_time;
 399     int64_t cur_icount;
 400     int64_t delta;
 401
 402     /* Protected by TimersState mutex.  */
 403     static int64_t last_delta;
 404
 405     /* If the VM is not running, then do nothing.  */
 406     if (!runstate_is_running()) {
 407         return;
 408     }
 409
 410     seqlock_write_begin(&timers_state.vm_clock_seqlock);
 411     cur_time = cpu_get_clock_locked();
 412     cur_icount = cpu_get_icount_locked();
 413
 414     delta = cur_icount - cur_time;
 415     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 416     if (delta > 0
 417         && last_delta + ICOUNT_WOBBLE < delta * 2
 418         && icount_time_shift > 0) {
 419         /* The guest is getting too far ahead.  Slow time down.  */
 420         icount_time_shift--;
 421     }
 422     if (delta < 0
 423         && last_delta - ICOUNT_WOBBLE > delta * 2
 424         && icount_time_shift < MAX_ICOUNT_SHIFT) {
 425         /* The guest is getting too far behind.  Speed time up.  */
 426         icount_time_shift++;
 427     }
 428     last_delta = delta;
 429     timers_state.qemu_icount_bias = cur_icount
 430                               - (timers_state.qemu_icount << icount_time_shift);
 431     seqlock_write_end(&timers_state.vm_clock_seqlock);
 432 }
 433
 434 static void icount_adjust_rt(void *opaque)
 435 {
 436     timer_mod(timers_state.icount_rt_timer,
 437               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 438     icount_adjust();
 439 }
 440
 441 static void icount_adjust_vm(void *opaque)
 442 {
 443     timer_mod(timers_state.icount_vm_timer,
 444                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 445                    NANOSECONDS_PER_SECOND / 10);
 446     icount_adjust();
 447 }
 448
 449 static int64_t qemu_icount_round(int64_t count)
 450 {
 451     return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
 452 }
 453
 454 static void icount_warp_rt(void)
 455 {
 456     unsigned seq;
 457     int64_t warp_start;
 458
 459     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 460      * changes from -1 to another value, so the race here is okay.
 461      */
 462     do {
 463         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 464         warp_start = timers_state.vm_clock_warp_start;
 465     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 466
 467     if (warp_start == -1) {
 468         return;
 469     }
 470
 471     seqlock_write_begin(&timers_state.vm_clock_seqlock);
 472     if (runstate_is_running()) {
 473         int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
 474                                      cpu_get_clock_locked());
 475         int64_t warp_delta;
 476
 477         warp_delta = clock - timers_state.vm_clock_warp_start;
 478         if (use_icount == 2) {
 479             /*
 480              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 481              * far ahead of real time.
 482              */
 483             int64_t cur_icount = cpu_get_icount_locked();
 484             int64_t delta = clock - cur_icount;
 485             warp_delta = MIN(warp_delta, delta);
 486         }
 487         timers_state.qemu_icount_bias += warp_delta;
 488     }
 489     timers_state.vm_clock_warp_start = -1;
 490     seqlock_write_end(&timers_state.vm_clock_seqlock);
 491
 492     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 493         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 494     }
 495 }
 496
 497 static void icount_timer_cb(void *opaque)
 498 {
 499     /* No need for a checkpoint because the timer already synchronizes
 500      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 501      */
 502     icount_warp_rt();
 503 }
 504
 505 void qtest_clock_warp(int64_t dest)
 506 {
 507     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 508     AioContext *aio_context;
 509     assert(qtest_enabled());
 510     aio_context = qemu_get_aio_context();
 511     while (clock < dest) {
 512         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 513         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 514
 515         seqlock_write_begin(&timers_state.vm_clock_seqlock);
 516         timers_state.qemu_icount_bias += warp;
 517         seqlock_write_end(&timers_state.vm_clock_seqlock);
 518
 519         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 520         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 521         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 522     }
 523     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 524 }
 525
 526 void qemu_start_warp_timer(void)
 527 {
 528     int64_t clock;
 529     int64_t deadline;
 530
 531     if (!use_icount) {
 532         return;
 533     }
 534
 535     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 536      * do not fire, so computing the deadline does not make sense.
 537      */
 538     if (!runstate_is_running()) {
 539         return;
 540     }
 541
 542     /* warp clock deterministically in record/replay mode */
 543     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 544         return;
 545     }
 546
 547     if (!all_cpu_threads_idle()) {
 548         return;
 549     }
 550
 551     if (qtest_enabled()) {
 552         /* When testing, qtest commands advance icount.  */
 553         return;
 554     }
 555
 556     /* We want to use the earliest deadline from ALL vm_clocks */
 557     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 558     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 559     if (deadline < 0) {
 560         static bool notified;
 561         if (!icount_sleep && !notified) {
 562             warn_report("icount sleep disabled and no active timers");
 563             notified = true;
 564         }
 565         return;
 566     }
 567
 568     if (deadline > 0) {
 569         /*
 570          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 571          * sleep.  Otherwise, the CPU might be waiting for a future timer
 572          * interrupt to wake it up, but the interrupt never comes because
 573          * the vCPU isn't running any insns and thus doesn't advance the
 574          * QEMU_CLOCK_VIRTUAL.
 575          */
 576         if (!icount_sleep) {
 577             /*
 578              * We never let VCPUs sleep in no sleep icount mode.
 579              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 580              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 581              * It is useful when we want a deterministic execution time,
 582              * isolated from host latencies.
 583              */
 584             seqlock_write_begin(&timers_state.vm_clock_seqlock);
 585             timers_state.qemu_icount_bias += deadline;
 586             seqlock_write_end(&timers_state.vm_clock_seqlock);
 587             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 588         } else {
 589             /*
 590              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 591              * "real" time, (related to the time left until the next event) has
 592              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 593              * This avoids that the warps are visible externally; for example,
 594              * you will not be sending network packets continuously instead of
 595              * every 100ms.
 596              */
 597             seqlock_write_begin(&timers_state.vm_clock_seqlock);
 598             if (timers_state.vm_clock_warp_start == -1
 599                 || timers_state.vm_clock_warp_start > clock) {
 600                 timers_state.vm_clock_warp_start = clock;
 601             }
 602             seqlock_write_end(&timers_state.vm_clock_seqlock);
 603             timer_mod_anticipate(timers_state.icount_warp_timer,
 604                                  clock + deadline);
 605         }
 606     } else if (deadline == 0) {
 607         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 608     }
 609 }
 610
 611 static void qemu_account_warp_timer(void)
 612 {
 613     if (!use_icount || !icount_sleep) {
 614         return;
 615     }
 616
 617     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 618      * do not fire, so computing the deadline does not make sense.
 619      */
 620     if (!runstate_is_running()) {
 621         return;
 622     }
 623
 624     /* warp clock deterministically in record/replay mode */
 625     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 626         return;
 627     }
 628
 629     timer_del(timers_state.icount_warp_timer);
 630     icount_warp_rt();
 631 }
 632
 633 static bool icount_state_needed(void *opaque)
 634 {
 635     return use_icount;
 636 }
 637
 638 static bool warp_timer_state_needed(void *opaque)
 639 {
 640     TimersState *s = opaque;
 641     return s->icount_warp_timer != NULL;
 642 }
 643
 644 static bool adjust_timers_state_needed(void *opaque)
 645 {
 646     TimersState *s = opaque;
 647     return s->icount_rt_timer != NULL;
 648 }
 649
 650 /*
 651  * Subsection for warp timer migration is optional, because may not be created
 652  */
 653 static const VMStateDescription icount_vmstate_warp_timer = {
 654     .name = "timer/icount/warp_timer",
 655     .version_id = 1,
 656     .minimum_version_id = 1,
 657     .needed = warp_timer_state_needed,
 658     .fields = (VMStateField[]) {
 659         VMSTATE_INT64(vm_clock_warp_start, TimersState),
 660         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
 661         VMSTATE_END_OF_LIST()
 662     }
 663 };
 664
 665 static const VMStateDescription icount_vmstate_adjust_timers = {
 666     .name = "timer/icount/timers",
 667     .version_id = 1,
 668     .minimum_version_id = 1,
 669     .needed = adjust_timers_state_needed,
 670     .fields = (VMStateField[]) {
 671         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
 672         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
 673         VMSTATE_END_OF_LIST()
 674     }
 675 };
 676
 677 /*
 678  * This is a subsection for icount migration.
 679  */
 680 static const VMStateDescription icount_vmstate_timers = {
 681     .name = "timer/icount",
 682     .version_id = 1,
 683     .minimum_version_id = 1,
 684     .needed = icount_state_needed,
 685     .fields = (VMStateField[]) {
 686         VMSTATE_INT64(qemu_icount_bias, TimersState),
 687         VMSTATE_INT64(qemu_icount, TimersState),
 688         VMSTATE_END_OF_LIST()
 689     },
 690     .subsections = (const VMStateDescription*[]) {
 691         &icount_vmstate_warp_timer,
 692         &icount_vmstate_adjust_timers,
 693         NULL
 694     }
 695 };
 696
 697 static const VMStateDescription vmstate_timers = {
 698     .name = "timer",
 699     .version_id = 2,
 700     .minimum_version_id = 1,
 701     .fields = (VMStateField[]) {
 702         VMSTATE_INT64(cpu_ticks_offset, TimersState),
 703         VMSTATE_INT64(dummy, TimersState),
 704         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
 705         VMSTATE_END_OF_LIST()
 706     },
 707     .subsections = (const VMStateDescription*[]) {
 708         &icount_vmstate_timers,
 709         NULL
 710     }
 711 };
 712
 713 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 714 {
 715     double pct;
 716     double throttle_ratio;
 717     long sleeptime_ns;
 718
 719     if (!cpu_throttle_get_percentage()) {
 720         return;
 721     }
 722
 723     pct = (double)cpu_throttle_get_percentage()/100;
 724     throttle_ratio = pct / (1 - pct);
 725     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
 726
 727     qemu_mutex_unlock_iothread();
 728     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
 729     qemu_mutex_lock_iothread();
 730     atomic_set(&cpu->throttle_thread_scheduled, 0);
 731 }
 732
 733 static void cpu_throttle_timer_tick(void *opaque)
 734 {
 735     CPUState *cpu;
 736     double pct;
 737
 738     /* Stop the timer if needed */
 739     if (!cpu_throttle_get_percentage()) {
 740         return;
 741     }
 742     CPU_FOREACH(cpu) {
 743         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 744             async_run_on_cpu(cpu, cpu_throttle_thread,
 745                              RUN_ON_CPU_NULL);
 746         }
 747     }
 748
 749     pct = (double)cpu_throttle_get_percentage()/100;
 750     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 751                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 752 }
 753
 754 void cpu_throttle_set(int new_throttle_pct)
 755 {
 756     /* Ensure throttle percentage is within valid range */
 757     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 758     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 759
 760     atomic_set(&throttle_percentage, new_throttle_pct);
 761
 762     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 763                                        CPU_THROTTLE_TIMESLICE_NS);
 764 }
 765
 766 void cpu_throttle_stop(void)
 767 {
 768     atomic_set(&throttle_percentage, 0);
 769 }
 770
 771 bool cpu_throttle_active(void)
 772 {
 773     return (cpu_throttle_get_percentage() != 0);
 774 }
 775
 776 int cpu_throttle_get_percentage(void)
 777 {
 778     return atomic_read(&throttle_percentage);
 779 }
 780
 781 void cpu_ticks_init(void)
 782 {
 783     seqlock_init(&timers_state.vm_clock_seqlock);
 784     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 785     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 786                                            cpu_throttle_timer_tick, NULL);
 787 }
 788
 789 void configure_icount(QemuOpts *opts, Error **errp)
 790 {
 791     const char *option;
 792     char *rem_str = NULL;
 793
 794     option = qemu_opt_get(opts, "shift");
 795     if (!option) {
 796         if (qemu_opt_get(opts, "align") != NULL) {
 797             error_setg(errp, "Please specify shift option when using align");
 798         }
 799         return;
 800     }
 801
 802     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 803     if (icount_sleep) {
 804         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 805                                          icount_timer_cb, NULL);
 806     }
 807
 808     icount_align_option = qemu_opt_get_bool(opts, "align", false);
 809
 810     if (icount_align_option && !icount_sleep) {
 811         error_setg(errp, "align=on and sleep=off are incompatible");
 812     }
 813     if (strcmp(option, "auto") != 0) {
 814         errno = 0;
 815         icount_time_shift = strtol(option, &rem_str, 0);
 816         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 817             error_setg(errp, "icount: Invalid shift value");
 818         }
 819         use_icount = 1;
 820         return;
 821     } else if (icount_align_option) {
 822         error_setg(errp, "shift=auto and align=on are incompatible");
 823     } else if (!icount_sleep) {
 824         error_setg(errp, "shift=auto and sleep=off are incompatible");
 825     }
 826
 827     use_icount = 2;
 828
 829     /* 125MIPS seems a reasonable initial guess at the guest speed.
 830        It will be corrected fairly quickly anyway.  */
 831     icount_time_shift = 3;
 832
 833     /* Have both realtime and virtual time triggers for speed adjustment.
 834        The realtime trigger catches emulated time passing too slowly,
 835        the virtual time trigger catches emulated time passing too fast.
 836        Realtime triggers occur even when idle, so use them less frequently
 837        than VM triggers.  */
 838     timers_state.vm_clock_warp_start = -1;
 839     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 840                                    icount_adjust_rt, NULL);
 841     timer_mod(timers_state.icount_rt_timer,
 842                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 843     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 844                                         icount_adjust_vm, NULL);
 845     timer_mod(timers_state.icount_vm_timer,
 846                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 847                    NANOSECONDS_PER_SECOND / 10);
 848 }
 849
 850 /***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */
 861
 862 static QEMUTimer *tcg_kick_vcpu_timer;
 863 static CPUState *tcg_current_rr_cpu;
 864
 865 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 866
 867 static inline int64_t qemu_tcg_next_kick(void)
 868 {
 869     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 870 }
 871
 872 /* Kick the currently round-robin scheduled vCPU */
 873 static void qemu_cpu_kick_rr_cpu(void)
 874 {
 875     CPUState *cpu;
 876     do {
 877         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 878         if (cpu) {
 879             cpu_exit(cpu);
 880         }
 881     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 882 }
 883
 884 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
 885 {
 886 }
 887
 888 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 889 {
 890     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 891         qemu_notify_event();
 892         return;
 893     }
 894
 895     if (qemu_in_vcpu_thread()) {
 896         /* A CPU is currently running; kick it back out to the
 897          * tcg_cpu_exec() loop so it will recalculate its
 898          * icount deadline immediately.
 899          */
 900         qemu_cpu_kick(current_cpu);
 901     } else if (first_cpu) {
 902         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 903          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 904          * causes cpu_thread_is_idle to return false.  This way,
 905          * handle_icount_deadline can run.
 906          * If we have no CPUs at all for some reason, we don't
 907          * need to do anything.
 908          */
 909         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 910     }
 911 }
 912
 913 static void kick_tcg_thread(void *opaque)
 914 {
 915     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 916     qemu_cpu_kick_rr_cpu();
 917 }
 918
 919 static void start_tcg_kick_timer(void)
 920 {
 921     assert(!mttcg_enabled);
 922     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 923         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 924                                            kick_tcg_thread, NULL);
 925         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 926     }
 927 }
 928
 929 static void stop_tcg_kick_timer(void)
 930 {
 931     assert(!mttcg_enabled);
 932     if (tcg_kick_vcpu_timer) {
 933         timer_del(tcg_kick_vcpu_timer);
 934         tcg_kick_vcpu_timer = NULL;
 935     }
 936 }
 937
 938 /***********************************************************/
 939 void hw_error(const char *fmt, ...)
 940 {
 941     va_list ap;
 942     CPUState *cpu;
 943
 944     va_start(ap, fmt);
 945     fprintf(stderr, "qemu: hardware error: ");
 946     vfprintf(stderr, fmt, ap);
 947     fprintf(stderr, "\n");
 948     CPU_FOREACH(cpu) {
 949         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
 950         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
 951     }
 952     va_end(ap);
 953     abort();
 954 }
 955
 956 void cpu_synchronize_all_states(void)
 957 {
 958     CPUState *cpu;
 959
 960     CPU_FOREACH(cpu) {
 961         cpu_synchronize_state(cpu);
 962         /* TODO: move to cpu_synchronize_state() */
 963         if (hvf_enabled()) {
 964             hvf_cpu_synchronize_state(cpu);
 965         }
 966     }
 967 }
 968
 969 void cpu_synchronize_all_post_reset(void)
 970 {
 971     CPUState *cpu;
 972
 973     CPU_FOREACH(cpu) {
 974         cpu_synchronize_post_reset(cpu);
 975         /* TODO: move to cpu_synchronize_post_reset() */
 976         if (hvf_enabled()) {
 977             hvf_cpu_synchronize_post_reset(cpu);
 978         }
 979     }
 980 }
 981
 982 void cpu_synchronize_all_post_init(void)
 983 {
 984     CPUState *cpu;
 985
 986     CPU_FOREACH(cpu) {
 987         cpu_synchronize_post_init(cpu);
 988         /* TODO: move to cpu_synchronize_post_init() */
 989         if (hvf_enabled()) {
 990             hvf_cpu_synchronize_post_init(cpu);
 991         }
 992     }
 993 }
 994
 995 void cpu_synchronize_all_pre_loadvm(void)
 996 {
 997     CPUState *cpu;
 998
 999     CPU_FOREACH(cpu) {
1000         cpu_synchronize_pre_loadvm(cpu);
1001     }
1002 }
1003
1004 static int do_vm_stop(RunState state, bool send_stop)
1005 {
1006     int ret = 0;
1007
1008     if (runstate_is_running()) {
1009         cpu_disable_ticks();
1010         pause_all_vcpus();
1011         runstate_set(state);
1012         vm_state_notify(0, state);
1013         if (send_stop) {
1014             qapi_event_send_stop(&error_abort);
1015         }
1016     }
1017
1018     bdrv_drain_all();
1019     replay_disable_events();
1020     ret = bdrv_flush_all();
1021
1022     return ret;
1023 }
1024
1025 /* Special vm_stop() variant for terminating the process.  Historically clients
1026  * did not expect a QMP STOP event and so we need to retain compatibility.
1027  */
1028 int vm_shutdown(void)
1029 {
1030     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1031 }
1032
1033 static bool cpu_can_run(CPUState *cpu)
1034 {
1035     if (cpu->stop) {
1036         return false;
1037     }
1038     if (cpu_is_stopped(cpu)) {
1039         return false;
1040     }
1041     return true;
1042 }
1043
/* Handle an EXCP_DEBUG exit from @cpu: tell the gdbstub which CPU
 * stopped, raise a system debug request, and mark the vCPU as stopped.
 */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1050
1051 #ifdef CONFIG_LINUX
1052 static void sigbus_reraise(void)
1053 {
1054     sigset_t set;
1055     struct sigaction action;
1056
1057     memset(&action, 0, sizeof(action));
1058     action.sa_handler = SIG_DFL;
1059     if (!sigaction(SIGBUS, &action, NULL)) {
1060         raise(SIGBUS);
1061         sigemptyset(&set);
1062         sigaddset(&set, SIGBUS);
1063         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1064     }
1065     perror("Failed to re-raise SIGBUS!\n");
1066     abort();
1067 }
1068
1069 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1070 {
1071     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1072         sigbus_reraise();
1073     }
1074
1075     if (current_cpu) {
1076         /* Called asynchronously in VCPU thread.  */
1077         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1078             sigbus_reraise();
1079         }
1080     } else {
1081         /* Called synchronously (via signalfd) in main thread.  */
1082         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1083             sigbus_reraise();
1084         }
1085     }
1086 }
1087
/* Install sigbus_handler() as the SA_SIGINFO handler for SIGBUS and set
 * the PR_MCE_KILL policy to PR_MCE_KILL_EARLY via prctl().
 * NOTE(review): the return values of sigaction() and prctl() are not
 * checked.
 */
static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}
1099 #else /* !CONFIG_LINUX */
/* No SIGBUS handling is set up when CONFIG_LINUX is not defined. */
static void qemu_init_sigbus(void)
{
}
1103 #endif /* !CONFIG_LINUX */
1104
/* The global ("iothread") mutex taken by qemu_mutex_lock_iothread(). */
static QemuMutex qemu_global_mutex;

/* Thread that called qemu_init_cpu_loop(). */
static QemuThread io_thread;

/* cpu creation: signalled when a vCPU's ->created flag changes */
static QemuCond qemu_cpu_cond;
/* system init; also broadcast by qemu_cpu_stop() and waited on by
 * pause_all_vcpus()
 */
static QemuCond qemu_pause_cond;
1113
/* One-time setup for the vCPU machinery: SIGBUS handling, the CPU and
 * pause condition variables, the global mutex, and a record of the
 * calling thread in io_thread.
 */
void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}
1123
/* Run @func(@cpu, @data) on @cpu's thread; thin wrapper around
 * do_run_on_cpu() that supplies the global mutex.
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1128
1129 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1130 {
1131     if (kvm_destroy_vcpu(cpu) < 0) {
1132         error_report("kvm_destroy_vcpu failed");
1133         exit(EXIT_FAILURE);
1134     }
1135 }
1136
/* TCG counterpart of qemu_kvm_destroy_vcpu(); currently does nothing. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1140
/* Mark @cpu as stopped and wake everyone waiting on qemu_pause_cond.
 *
 * @cpu: the vCPU to stop; must be the vCPU of the calling thread.
 * @exit: when true, also call cpu_exit() on @cpu.  Note that inside this
 *        function the parameter name shadows the C library's exit().
 *
 * Clears the pending stop request (->stop) and sets ->stopped.
 */
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}
1151
/* Common tail of the wait-for-event helpers: clear the thread_kicked
 * flag, honour a pending stop request, and run any work queued for @cpu.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
1160
/* Round-robin TCG: sleep on @cpu's halt condition for as long as all
 * vCPU threads are idle, with the kick timer stopped meanwhile.  Once
 * there is something to do, restart the kick timer and perform the
 * common post-wait processing for @cpu.
 */
static void qemu_tcg_rr_wait_io_event(CPUState *cpu)
{
    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    qemu_wait_io_event_common(cpu);
}
1172
/* Sleep on @cpu's halt condition while its thread is idle, then perform
 * the common post-wait processing.  The wait uses qemu_global_mutex, so
 * that mutex is released while sleeping.
 */
static void qemu_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

#ifdef _WIN32
    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
    if (!tcg_enabled()) {
        SleepEx(0, TRUE);
    }
#endif
    qemu_wait_io_event_common(cpu);
}
1187
/* Thread body of a KVM vCPU.
 *
 * @arg: the CPUState this thread drives.
 *
 * Initialises the vCPU (exiting the process on failure), announces its
 * creation on qemu_cpu_cond, then alternates between running guest code
 * and waiting for events until the vCPU has been unplugged and can no
 * longer run.  On the way out the vCPU is destroyed, ->created is
 * cleared and signalled, and the iothread mutex is released.
 *
 * Returns NULL.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1230
1231 static void *qemu_dummy_cpu_thread_fn(void *arg)
1232 {
1233 #ifdef _WIN32
1234     error_report("qtest is not supported under Windows");
1235     exit(1);
1236 #else
1237     CPUState *cpu = arg;
1238     sigset_t waitset;
1239     int r;
1240
1241     rcu_register_thread();
1242
1243     qemu_mutex_lock_iothread();
1244     qemu_thread_get_self(cpu->thread);
1245     cpu->thread_id = qemu_get_thread_id();
1246     cpu->can_do_io = 1;
1247     current_cpu = cpu;
1248
1249     sigemptyset(&waitset);
1250     sigaddset(&waitset, SIG_IPI);
1251
1252     /* signal CPU creation */
1253     cpu->created = true;
1254     qemu_cond_signal(&qemu_cpu_cond);
1255
1256     do {
1257         qemu_mutex_unlock_iothread();
1258         do {
1259             int sig;
1260             r = sigwait(&waitset, &sig);
1261         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1262         if (r == -1) {
1263             perror("sigwait");
1264             exit(1);
1265         }
1266         qemu_mutex_lock_iothread();
1267         qemu_wait_io_event(cpu);
1268     } while (!cpu->unplug);
1269
1270     rcu_unregister_thread();
1271     return NULL;
1272 #endif
1273 }
1274
1275 static int64_t tcg_get_icount_limit(void)
1276 {
1277     int64_t deadline;
1278
1279     if (replay_mode != REPLAY_MODE_PLAY) {
1280         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1281
1282         /* Maintain prior (possibly buggy) behaviour where if no deadline
1283          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1284          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1285          * nanoseconds.
1286          */
1287         if ((deadline < 0) || (deadline > INT32_MAX)) {
1288             deadline = INT32_MAX;
1289         }
1290
1291         return qemu_icount_round(deadline);
1292     } else {
1293         return replay_get_instructions();
1294     }
1295 }
1296
1297 static void handle_icount_deadline(void)
1298 {
1299     assert(qemu_in_vcpu_thread());
1300     if (use_icount) {
1301         int64_t deadline =
1302             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1303
1304         if (deadline == 0) {
1305             /* Wake up other AioContexts.  */
1306             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1307             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1308         }
1309     }
1310 }
1311
1312 static void prepare_icount_for_run(CPUState *cpu)
1313 {
1314     if (use_icount) {
1315         int insns_left;
1316
1317         /* These should always be cleared by process_icount_data after
1318          * each vCPU execution. However u16.high can be raised
1319          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1320          */
1321         g_assert(cpu->icount_decr.u16.low == 0);
1322         g_assert(cpu->icount_extra == 0);
1323
1324         cpu->icount_budget = tcg_get_icount_limit();
1325         insns_left = MIN(0xffff, cpu->icount_budget);
1326         cpu->icount_decr.u16.low = insns_left;
1327         cpu->icount_extra = cpu->icount_budget - insns_left;
1328
1329         replay_mutex_lock();
1330     }
1331 }
1332
1333 static void process_icount_data(CPUState *cpu)
1334 {
1335     if (use_icount) {
1336         /* Account for executed instructions */
1337         cpu_update_icount(cpu);
1338
1339         /* Reset the counters */
1340         cpu->icount_decr.u16.low = 0;
1341         cpu->icount_extra = 0;
1342         cpu->icount_budget = 0;
1343
1344         replay_account_executed_instructions();
1345
1346         replay_mutex_unlock();
1347     }
1348 }
1349
1350
/* Execute guest code on @cpu with TCG, bracketed by cpu_exec_start() and
 * cpu_exec_end().  With CONFIG_PROFILER the elapsed time is added to
 * tcg_time.
 *
 * Returns the value returned by cpu_exec().
 */
static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

    assert(tcg_enabled());
#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    tcg_time += profile_getclock() - ti;
#endif
    return ret;
}
1370
1371 /* Destroy any remaining vCPUs which have been unplugged and have
1372  * finished running
1373  */
1374 static void deal_with_unplugged_cpus(void)
1375 {
1376     CPUState *cpu;
1377
1378     CPU_FOREACH(cpu) {
1379         if (cpu->unplug && !cpu_can_run(cpu)) {
1380             qemu_tcg_destroy_vcpu(cpu);
1381             cpu->created = false;
1382             qemu_cond_signal(&qemu_cpu_cond);
1383             break;
1384         }
1385     }
1386 }
1387
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 */

/* Thread body shared by all vCPUs in round-robin TCG mode.
 *
 * @arg: the CPUState the thread was created for; after start-up the
 *       local variable is reused as the round-robin cursor.
 *
 * The outer loop never terminates, so the statements following it are
 * not reached.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /* The iothread mutex is dropped before the replay mutex is
         * taken and re-acquired afterwards.
         */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* The cursor ran off the end of the list: wrap around. */
        if (!cpu) {
            cpu = first_cpu;
        }

        /* Run vCPUs in turn until one has queued work or an exit
         * request pending, or the list is exhausted.
         */
        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code runs without the iothread mutex held. */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* Skip past a vCPU that is being unplugged so the
                 * cursor does not stay on it.
                 */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        qemu_tcg_rr_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
        deal_with_unplugged_cpus();
    }

    /* Not reached: the loop above has no exit. */
    rcu_unregister_thread();
    return NULL;
}
1501
1502 static void *qemu_hax_cpu_thread_fn(void *arg)
1503 {
1504     CPUState *cpu = arg;
1505     int r;
1506
1507     rcu_register_thread();
1508     qemu_mutex_lock_iothread();
1509     qemu_thread_get_self(cpu->thread);
1510
1511     cpu->thread_id = qemu_get_thread_id();
1512     cpu->created = true;
1513     cpu->halted = 0;
1514     current_cpu = cpu;
1515
1516     hax_init_vcpu(cpu);
1517     qemu_cond_signal(&qemu_cpu_cond);
1518
1519     do {
1520         if (cpu_can_run(cpu)) {
1521             r = hax_smp_cpu_exec(cpu);
1522             if (r == EXCP_DEBUG) {
1523                 cpu_handle_guest_debug(cpu);
1524             }
1525         }
1526
1527         qemu_wait_io_event(cpu);
1528     } while (!cpu->unplug || cpu_can_run(cpu));
1529     rcu_unregister_thread();
1530     return NULL;
1531 }
1532
/* The HVF-specific vCPU thread function. This one should only run when the host
 * CPU supports the VMX "unrestricted guest" feature.
 *
 * @arg: the CPUState this thread drives.
 *
 * Initialises the vCPU, announces its creation on qemu_cpu_cond, then
 * alternates between running guest code and waiting for events until
 * the vCPU has been unplugged and can no longer run.  On the way out the
 * vCPU is destroyed, ->created is cleared and signalled, and the
 * iothread mutex is released.
 *
 * Returns NULL.
 */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1575
/* Thread body of a WHPX vCPU.
 *
 * @arg: the CPUState this thread drives.
 *
 * Initialises the vCPU (exiting the process on failure), announces its
 * creation on qemu_cpu_cond, then alternates between running guest code
 * and waiting for events until the vCPU has been unplugged and can no
 * longer run.  The idle wait is open-coded here rather than going
 * through qemu_wait_io_event().  On the way out the vCPU is destroyed,
 * ->created is cleared and signalled, and the iothread mutex is
 * released.
 *
 * Returns NULL.
 */
static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1618
1619 #ifdef _WIN32
/* Empty APC routine; qemu_cpu_kick_thread() queues it with
 * QueueUserAPC() on the target vCPU thread.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1623 #endif
1624
/* Multi-threaded TCG
 *
 * In the multi-threaded case each vCPU has its own thread. The TLS
 * variable current_cpu can be used deep in the code to find the
 * current CPUState for a given thread.
 */

/* Thread body of one vCPU in multi-threaded TCG mode.
 *
 * @arg: the CPUState this thread drives.
 *
 * Must not be used together with icount.  Guest code is executed with
 * the iothread mutex released; the mutex is held for everything else.
 * The loop ends once the vCPU has been unplugged and can no longer run,
 * after which ->created is cleared and signalled.
 *
 * Returns NULL.
 */
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    g_assert(!use_icount);

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);

    /* process any pending work */
    cpu->exit_request = 1;

    do {
        if (cpu_can_run(cpu)) {
            int r;
            qemu_mutex_unlock_iothread();
            r = tcg_cpu_exec(cpu);
            qemu_mutex_lock_iothread();
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start-up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
                /* fall through */
            default:
                /* Ignore everything else? */
                break;
            }
        }

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    qemu_tcg_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1695
1696 static void qemu_cpu_kick_thread(CPUState *cpu)
1697 {
1698 #ifndef _WIN32
1699     int err;
1700
1701     if (cpu->thread_kicked) {
1702         return;
1703     }
1704     cpu->thread_kicked = true;
1705     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1706     if (err) {
1707         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1708         exit(1);
1709     }
1710 #else /* _WIN32 */
1711     if (!qemu_cpu_is_self(cpu)) {
1712         if (whpx_enabled()) {
1713             whpx_vcpu_kick(cpu);
1714         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1715             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1716                     __func__, GetLastError());
1717             exit(1);
1718         }
1719     }
1720 #endif
1721 }
1722
1723 void qemu_cpu_kick(CPUState *cpu)
1724 {
1725     qemu_cond_broadcast(cpu->halt_cond);
1726     if (tcg_enabled()) {
1727         cpu_exit(cpu);
1728         /* NOP unless doing single-thread RR */
1729         qemu_cpu_kick_rr_cpu();
1730     } else {
1731         if (hax_enabled()) {
1732             /*
1733              * FIXME: race condition with the exit_request check in
1734              * hax_vcpu_hax_exec
1735              */
1736             cpu->exit_request = 1;
1737         }
1738         qemu_cpu_kick_thread(cpu);
1739     }
1740 }
1741
/* Kick the host thread of the calling vCPU; current_cpu must be set. */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1747
/* Return true when the calling thread is the thread assigned to @cpu. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1752
1753 bool qemu_in_vcpu_thread(void)
1754 {
1755     return current_cpu && qemu_cpu_is_self(current_cpu);
1756 }
1757
/* Per-thread flag: true while this thread holds the iothread mutex
 * through qemu_mutex_lock_iothread_impl().
 */
static __thread bool iothread_locked = false;

/* Return whether the calling thread currently holds the iothread mutex
 * according to its per-thread flag.
 */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1764
/*
 * The BQL is taken from so many places that it is worth profiling the
 * callers directly, instead of funneling them all through a single function.
 *
 * @file, @line: call site, passed on to the lock function.
 *
 * Asserts that the calling thread does not already hold the mutex, takes
 * it with the currently installed lock function, and sets the
 * per-thread flag.
 */
void qemu_mutex_lock_iothread_impl(const char *file, int line)
{
    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);

    g_assert(!qemu_mutex_iothread_locked());
    bql_lock(&qemu_global_mutex, file, line);
    iothread_locked = true;
}
1777
/* Release the iothread mutex.  Asserts that the calling thread holds
 * it; the per-thread flag is cleared before the mutex is unlocked.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1784
1785 static bool all_vcpus_paused(void)
1786 {
1787     CPUState *cpu;
1788
1789     CPU_FOREACH(cpu) {
1790         if (!cpu->stopped) {
1791             return false;
1792         }
1793     }
1794
1795     return true;
1796 }
1797
/* Stop every vCPU and wait until all of them report ->stopped.
 *
 * QEMU_CLOCK_VIRTUAL is disabled first.  A vCPU whose thread is the
 * caller is stopped directly; every other vCPU gets a stop request and
 * a kick.  The wait uses qemu_pause_cond together with
 * qemu_global_mutex, and the vCPUs are kicked again after each wake-up.
 *
 * NOTE(review): the function unlocks and re-locks both the replay mutex
 * and the iothread mutex, so it presumably expects the caller to hold
 * both - confirm against callers.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /* Re-take the replay mutex with the iothread mutex dropped, then
     * take the iothread mutex again.
     */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1828
/* Let @cpu run again: clear both the stop request and the stopped
 * state, then kick it.
 */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1835
/* Re-enable QEMU_CLOCK_VIRTUAL and resume every vCPU. */
void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}
1845
/* Request removal of @cpu and wait for its thread to finish.
 *
 * Sets the stop and unplug flags and kicks the vCPU, then joins its
 * thread.  The iothread mutex is released around the join and re-taken
 * afterwards, so the caller must hold it on entry.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1855
1856 /* For temporary buffers for forming a name */
1857 #define VCPU_THREAD_NAME_SIZE 16
1858
/* Set up the host thread for a TCG vCPU.
 *
 * With MTTCG every vCPU gets its own thread and halt condition.  Without
 * it, the first vCPU creates the single round-robin thread and every
 * later vCPU shares that thread and its halt condition, which are
 * remembered in function-local statics.
 */
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *single_tcg_halt_cond;
    static QemuThread *single_tcg_cpu_thread;
    static int tcg_region_inited;

    assert(tcg_enabled());
    /*
     * Initialize TCG regions--once. Now is a good time, because:
     * (1) TCG's init context, prologue and target globals have been set up.
     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
     *     -accel flag is processed, so the check doesn't work then).
     */
    if (!tcg_region_inited) {
        tcg_region_inited = 1;
        tcg_region_init();
    }

    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);

        if (qemu_tcg_mttcg_enabled()) {
            /* create a thread per vCPU with TCG (MTTCG) */
            parallel_cpus = true;
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);

            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

        } else {
            /* share a single thread for all cpus with TCG */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
            qemu_thread_create(cpu->thread, thread_name,
                               qemu_tcg_rr_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

            /* Remember them so later vCPUs can share them. */
            single_tcg_halt_cond = cpu->halt_cond;
            single_tcg_cpu_thread = cpu->thread;
        }
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    } else {
        /* For non-MTTCG cases we share the thread */
        cpu->thread = single_tcg_cpu_thread;
        cpu->halt_cond = single_tcg_halt_cond;
        cpu->thread_id = first_cpu->thread_id;
        cpu->can_do_io = 1;
        /* No new thread will announce this vCPU, so mark it created
         * here.
         */
        cpu->created = true;
    }
}
1914
1915 static void qemu_hax_start_vcpu(CPUState *cpu)
1916 {
1917     char thread_name[VCPU_THREAD_NAME_SIZE];
1918
1919     cpu->thread = g_malloc0(sizeof(QemuThread));
1920     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1921     qemu_cond_init(cpu->halt_cond);
1922
1923     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1924              cpu->cpu_index);
1925     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1926                        cpu, QEMU_THREAD_JOINABLE);
1927 #ifdef _WIN32
1928     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1929 #endif
1930 }
1931
1932 static void qemu_kvm_start_vcpu(CPUState *cpu)
1933 {
1934     char thread_name[VCPU_THREAD_NAME_SIZE];
1935
1936     cpu->thread = g_malloc0(sizeof(QemuThread));
1937     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1938     qemu_cond_init(cpu->halt_cond);
1939     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1940              cpu->cpu_index);
1941     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1942                        cpu, QEMU_THREAD_JOINABLE);
1943 }
1944
1945 static void qemu_hvf_start_vcpu(CPUState *cpu)
1946 {
1947     char thread_name[VCPU_THREAD_NAME_SIZE];
1948
1949     /* HVF currently does not support TCG, and only runs in
1950      * unrestricted-guest mode. */
1951     assert(hvf_enabled());
1952
1953     cpu->thread = g_malloc0(sizeof(QemuThread));
1954     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1955     qemu_cond_init(cpu->halt_cond);
1956
1957     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
1958              cpu->cpu_index);
1959     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
1960                        cpu, QEMU_THREAD_JOINABLE);
1961 }
1962
1963 static void qemu_whpx_start_vcpu(CPUState *cpu)
1964 {
1965     char thread_name[VCPU_THREAD_NAME_SIZE];
1966
1967     cpu->thread = g_malloc0(sizeof(QemuThread));
1968     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1969     qemu_cond_init(cpu->halt_cond);
1970     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
1971              cpu->cpu_index);
1972     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
1973                        cpu, QEMU_THREAD_JOINABLE);
1974 #ifdef _WIN32
1975     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1976 #endif
1977 }
1978
1979 static void qemu_dummy_start_vcpu(CPUState *cpu)
1980 {
1981     char thread_name[VCPU_THREAD_NAME_SIZE];
1982
1983     cpu->thread = g_malloc0(sizeof(QemuThread));
1984     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1985     qemu_cond_init(cpu->halt_cond);
1986     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1987              cpu->cpu_index);
1988     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1989                        QEMU_THREAD_JOINABLE);
1990 }
1991
1992 void qemu_init_vcpu(CPUState *cpu)
1993 {
1994     cpu->nr_cores = smp_cores;
1995     cpu->nr_threads = smp_threads;
1996     cpu->stopped = true;
1997
1998     if (!cpu->as) {
1999         /* If the target cpu hasn't set up any address spaces itself,
2000          * give it the default one.
2001          */
2002         cpu->num_ases = 1;
2003         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2004     }
2005
2006     if (kvm_enabled()) {
2007         qemu_kvm_start_vcpu(cpu);
2008     } else if (hax_enabled()) {
2009         qemu_hax_start_vcpu(cpu);
2010     } else if (hvf_enabled()) {
2011         qemu_hvf_start_vcpu(cpu);
2012     } else if (tcg_enabled()) {
2013         qemu_tcg_init_vcpu(cpu);
2014     } else if (whpx_enabled()) {
2015         qemu_whpx_start_vcpu(cpu);
2016     } else {
2017         qemu_dummy_start_vcpu(cpu);
2018     }
2019
2020     while (!cpu->created) {
2021         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2022     }
2023 }
2024
2025 void cpu_stop_current(void)
2026 {
2027     if (current_cpu) {
2028         qemu_cpu_stop(current_cpu, true);
2029     }
2030 }
2031
2032 int vm_stop(RunState state)
2033 {
2034     if (qemu_in_vcpu_thread()) {
2035         qemu_system_vmstop_request_prepare();
2036         qemu_system_vmstop_request(state);
2037         /*
2038          * FIXME: should not return to device code in case
2039          * vm_stop() has been requested.
2040          */
2041         cpu_stop_current();
2042         return 0;
2043     }
2044
2045     return do_vm_stop(state, true);
2046 }
2047
2048 /**
2049  * Prepare for (re)starting the VM.
2050  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2051  * running or in case of an error condition), 0 otherwise.
2052  */
2053 int vm_prepare_start(void)
2054 {
2055     RunState requested;
2056
2057     qemu_vmstop_requested(&requested);
2058     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2059         return -1;
2060     }
2061
2062     /* Ensure that a STOP/RESUME pair of events is emitted if a
2063      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2064      * example, according to documentation is always followed by
2065      * the STOP event.
2066      */
2067     if (runstate_is_running()) {
2068         qapi_event_send_stop(&error_abort);
2069         qapi_event_send_resume(&error_abort);
2070         return -1;
2071     }
2072
2073     /* We are sending this now, but the CPUs will be resumed shortly later */
2074     qapi_event_send_resume(&error_abort);
2075
2076     replay_enable_events();
2077     cpu_enable_ticks();
2078     runstate_set(RUN_STATE_RUNNING);
2079     vm_state_notify(1, RUN_STATE_RUNNING);
2080     return 0;
2081 }
2082
/*
 * Start (or resume) the VM: the vCPUs are resumed only when
 * vm_prepare_start() returns 0.
 */
void vm_start(void)
{
    if (vm_prepare_start() != 0) {
        return;
    }

    resume_all_vcpus();
}
2089
2090 /* does a state transition even if the VM is already stopped,
2091    current state is forgotten forever */
2092 int vm_stop_force_state(RunState state)
2093 {
2094     if (runstate_is_running()) {
2095         return vm_stop(state);
2096     } else {
2097         runstate_set(state);
2098
2099         bdrv_drain_all();
2100         /* Make sure to return an error if the flush in a previous vm_stop()
2101          * failed. */
2102         return bdrv_flush_all();
2103     }
2104 }
2105
2106 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
2107 {
2108     /* XXX: implement xxx_cpu_list for targets that still miss it */
2109 #if defined(cpu_list)
2110     cpu_list(f, cpu_fprintf);
2111 #endif
2112 }
2113
2114 CpuInfoList *qmp_query_cpus(Error **errp)
2115 {
2116     MachineState *ms = MACHINE(qdev_get_machine());
2117     MachineClass *mc = MACHINE_GET_CLASS(ms);
2118     CpuInfoList *head = NULL, *cur_item = NULL;
2119     CPUState *cpu;
2120
2121     CPU_FOREACH(cpu) {
2122         CpuInfoList *info;
2123 #if defined(TARGET_I386)
2124         X86CPU *x86_cpu = X86_CPU(cpu);
2125         CPUX86State *env = &x86_cpu->env;
2126 #elif defined(TARGET_PPC)
2127         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2128         CPUPPCState *env = &ppc_cpu->env;
2129 #elif defined(TARGET_SPARC)
2130         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2131         CPUSPARCState *env = &sparc_cpu->env;
2132 #elif defined(TARGET_RISCV)
2133         RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2134         CPURISCVState *env = &riscv_cpu->env;
2135 #elif defined(TARGET_MIPS)
2136         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2137         CPUMIPSState *env = &mips_cpu->env;
2138 #elif defined(TARGET_TRICORE)
2139         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2140         CPUTriCoreState *env = &tricore_cpu->env;
2141 #elif defined(TARGET_S390X)
2142         S390CPU *s390_cpu = S390_CPU(cpu);
2143         CPUS390XState *env = &s390_cpu->env;
2144 #endif
2145
2146         cpu_synchronize_state(cpu);
2147
2148         info = g_malloc0(sizeof(*info));
2149         info->value = g_malloc0(sizeof(*info->value));
2150         info->value->CPU = cpu->cpu_index;
2151         info->value->current = (cpu == first_cpu);
2152         info->value->halted = cpu->halted;
2153         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2154         info->value->thread_id = cpu->thread_id;
2155 #if defined(TARGET_I386)
2156         info->value->arch = CPU_INFO_ARCH_X86;
2157         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2158 #elif defined(TARGET_PPC)
2159         info->value->arch = CPU_INFO_ARCH_PPC;
2160         info->value->u.ppc.nip = env->nip;
2161 #elif defined(TARGET_SPARC)
2162         info->value->arch = CPU_INFO_ARCH_SPARC;
2163         info->value->u.q_sparc.pc = env->pc;
2164         info->value->u.q_sparc.npc = env->npc;
2165 #elif defined(TARGET_MIPS)
2166         info->value->arch = CPU_INFO_ARCH_MIPS;
2167         info->value->u.q_mips.PC = env->active_tc.PC;
2168 #elif defined(TARGET_TRICORE)
2169         info->value->arch = CPU_INFO_ARCH_TRICORE;
2170         info->value->u.tricore.PC = env->PC;
2171 #elif defined(TARGET_S390X)
2172         info->value->arch = CPU_INFO_ARCH_S390;
2173         info->value->u.s390.cpu_state = env->cpu_state;
2174 #elif defined(TARGET_RISCV)
2175         info->value->arch = CPU_INFO_ARCH_RISCV;
2176         info->value->u.riscv.pc = env->pc;
2177 #else
2178         info->value->arch = CPU_INFO_ARCH_OTHER;
2179 #endif
2180         info->value->has_props = !!mc->cpu_index_to_instance_props;
2181         if (info->value->has_props) {
2182             CpuInstanceProperties *props;
2183             props = g_malloc0(sizeof(*props));
2184             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2185             info->value->props = props;
2186         }
2187
2188         /* XXX: waiting for the qapi to support GSList */
2189         if (!cur_item) {
2190             head = cur_item = info;
2191         } else {
2192             cur_item->next = info;
2193             cur_item = info;
2194         }
2195     }
2196
2197     return head;
2198 }
2199
2200 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2201 {
2202     /*
2203      * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2204      * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2205      */
2206     switch (target) {
2207     case SYS_EMU_TARGET_I386:
2208     case SYS_EMU_TARGET_X86_64:
2209         return CPU_INFO_ARCH_X86;
2210
2211     case SYS_EMU_TARGET_PPC:
2212     case SYS_EMU_TARGET_PPCEMB:
2213     case SYS_EMU_TARGET_PPC64:
2214         return CPU_INFO_ARCH_PPC;
2215
2216     case SYS_EMU_TARGET_SPARC:
2217     case SYS_EMU_TARGET_SPARC64:
2218         return CPU_INFO_ARCH_SPARC;
2219
2220     case SYS_EMU_TARGET_MIPS:
2221     case SYS_EMU_TARGET_MIPSEL:
2222     case SYS_EMU_TARGET_MIPS64:
2223     case SYS_EMU_TARGET_MIPS64EL:
2224         return CPU_INFO_ARCH_MIPS;
2225
2226     case SYS_EMU_TARGET_TRICORE:
2227         return CPU_INFO_ARCH_TRICORE;
2228
2229     case SYS_EMU_TARGET_S390X:
2230         return CPU_INFO_ARCH_S390;
2231
2232     case SYS_EMU_TARGET_RISCV32:
2233     case SYS_EMU_TARGET_RISCV64:
2234         return CPU_INFO_ARCH_RISCV;
2235
2236     default:
2237         return CPU_INFO_ARCH_OTHER;
2238     }
2239 }
2240
2241 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2242 {
2243 #ifdef TARGET_S390X
2244     S390CPU *s390_cpu = S390_CPU(cpu);
2245     CPUS390XState *env = &s390_cpu->env;
2246
2247     info->cpu_state = env->cpu_state;
2248 #else
2249     abort();
2250 #endif
2251 }
2252
2253 /*
2254  * fast means: we NEVER interrupt vCPU threads to retrieve
2255  * information from KVM.
2256  */
2257 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2258 {
2259     MachineState *ms = MACHINE(qdev_get_machine());
2260     MachineClass *mc = MACHINE_GET_CLASS(ms);
2261     CpuInfoFastList *head = NULL, *cur_item = NULL;
2262     SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2263                                           -1, &error_abort);
2264     CPUState *cpu;
2265
2266     CPU_FOREACH(cpu) {
2267         CpuInfoFastList *info = g_malloc0(sizeof(*info));
2268         info->value = g_malloc0(sizeof(*info->value));
2269
2270         info->value->cpu_index = cpu->cpu_index;
2271         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2272         info->value->thread_id = cpu->thread_id;
2273
2274         info->value->has_props = !!mc->cpu_index_to_instance_props;
2275         if (info->value->has_props) {
2276             CpuInstanceProperties *props;
2277             props = g_malloc0(sizeof(*props));
2278             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2279             info->value->props = props;
2280         }
2281
2282         info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2283         info->value->target = target;
2284         if (target == SYS_EMU_TARGET_S390X) {
2285             cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2286         }
2287
2288         if (!cur_item) {
2289             head = cur_item = info;
2290         } else {
2291             cur_item->next = info;
2292             cur_item = info;
2293         }
2294     }
2295
2296     return head;
2297 }
2298
2299 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2300                  bool has_cpu, int64_t cpu_index, Error **errp)
2301 {
2302     FILE *f;
2303     uint32_t l;
2304     CPUState *cpu;
2305     uint8_t buf[1024];
2306     int64_t orig_addr = addr, orig_size = size;
2307
2308     if (!has_cpu) {
2309         cpu_index = 0;
2310     }
2311
2312     cpu = qemu_get_cpu(cpu_index);
2313     if (cpu == NULL) {
2314         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2315                    "a CPU number");
2316         return;
2317     }
2318
2319     f = fopen(filename, "wb");
2320     if (!f) {
2321         error_setg_file_open(errp, errno, filename);
2322         return;
2323     }
2324
2325     while (size != 0) {
2326         l = sizeof(buf);
2327         if (l > size)
2328             l = size;
2329         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2330             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2331                              " specified", orig_addr, orig_size);
2332             goto exit;
2333         }
2334         if (fwrite(buf, 1, l, f) != l) {
2335             error_setg(errp, QERR_IO_ERROR);
2336             goto exit;
2337         }
2338         addr += l;
2339         size -= l;
2340     }
2341
2342 exit:
2343     fclose(f);
2344 }
2345
2346 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2347                   Error **errp)
2348 {
2349     FILE *f;
2350     uint32_t l;
2351     uint8_t buf[1024];
2352
2353     f = fopen(filename, "wb");
2354     if (!f) {
2355         error_setg_file_open(errp, errno, filename);
2356         return;
2357     }
2358
2359     while (size != 0) {
2360         l = sizeof(buf);
2361         if (l > size)
2362             l = size;
2363         cpu_physical_memory_read(addr, buf, l);
2364         if (fwrite(buf, 1, l, f) != l) {
2365             error_setg(errp, QERR_IO_ERROR);
2366             goto exit;
2367         }
2368         addr += l;
2369         size -= l;
2370     }
2371
2372 exit:
2373     fclose(f);
2374 }
2375
2376 void qmp_inject_nmi(Error **errp)
2377 {
2378     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2379 }
2380
2381 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2382 {
2383     if (!use_icount) {
2384         return;
2385     }
2386
2387     cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
2388                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2389     if (icount_align_option) {
2390         cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
2391         cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
2392     } else {
2393         cpu_fprintf(f, "Max guest delay     NA\n");
2394         cpu_fprintf(f, "Max guest advance   NA\n");
2395     }
2396 }