cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu/config-file.h"
  27 #include "cpu.h"
  28 #include "monitor/monitor.h"
  29 #include "qapi/error.h"
  30 #include "qapi/qapi-commands-misc.h"
  31 #include "qapi/qapi-events-run-state.h"
  32 #include "qapi/qmp/qerror.h"
  33 #include "qemu/error-report.h"
  34 #include "sysemu/sysemu.h"
  35 #include "sysemu/block-backend.h"
  36 #include "exec/gdbstub.h"
  37 #include "sysemu/dma.h"
  38 #include "sysemu/hw_accel.h"
  39 #include "sysemu/kvm.h"
  40 #include "sysemu/hax.h"
  41 #include "sysemu/hvf.h"
  42 #include "sysemu/whpx.h"
  43 #include "exec/exec-all.h"
  44
  45 #include "qemu/thread.h"
  46 #include "sysemu/cpus.h"
  47 #include "sysemu/qtest.h"
  48 #include "qemu/main-loop.h"
  49 #include "qemu/option.h"
  50 #include "qemu/bitmap.h"
  51 #include "qemu/seqlock.h"
  52 #include "tcg.h"
  53 #include "hw/nmi.h"
  54 #include "sysemu/replay.h"
  55 #include "hw/boards.h"
  56
  /*
   * Linux-only: include <sys/prctl.h> and supply fallback values for the
   * PR_MCE_KILL* constants when that header does not define them.
   */
  57 #ifdef CONFIG_LINUX
  58
  59 #include <sys/prctl.h>
  60
  61 #ifndef PR_MCE_KILL
  62 #define PR_MCE_KILL 33
  63 #endif
  64
  65 #ifndef PR_MCE_KILL_SET
  66 #define PR_MCE_KILL_SET 1
  67 #endif
  68
  69 #ifndef PR_MCE_KILL_EARLY
  70 #define PR_MCE_KILL_EARLY 1
  71 #endif
  72
  73 #endif /* CONFIG_LINUX */
  74
  /* NOTE(review): not referenced in the visible part of this file;
   * presumably icount alignment statistics -- confirm against the users.
   */
  75 int64_t max_delay;
  76 int64_t max_advance;
  77
  78 /* vcpu throttling controls */
     /* Timer that runs cpu_throttle_timer_tick(); created in cpu_ticks_init(). */
  79 static QEMUTimer *throttle_timer;
     /* Current throttle percentage; 0 means throttling is inactive. */
  80 static unsigned int throttle_percentage;
  81
     /* cpu_throttle_set() clamps the requested percentage to this range. */
  82 #define CPU_THROTTLE_PCT_MIN 1
  83 #define CPU_THROTTLE_PCT_MAX 99
     /* Base timeslice in ns, used to scale both the sleep time and the
      * timer period of the throttling code.
      */
  84 #define CPU_THROTTLE_TIMESLICE_NS 10000000
  85
  86 bool cpu_is_stopped(CPUState *cpu)
  87 {
  88     return cpu->stopped || !runstate_is_running();
  89 }
  90
  91 static bool cpu_thread_is_idle(CPUState *cpu)
  92 {
  93     if (cpu->stop || cpu->queued_work_first) {
  94         return false;
  95     }
  96     if (cpu_is_stopped(cpu)) {
  97         return true;
  98     }
  99     if (!cpu->halted || cpu_has_work(cpu) ||
 100         kvm_halt_in_kernel()) {
 101         return false;
 102     }
 103     return true;
 104 }
 105
 106 static bool all_cpu_threads_idle(void)
 107 {
 108     CPUState *cpu;
 109
 110     CPU_FOREACH(cpu) {
 111         if (!cpu_thread_is_idle(cpu)) {
 112             return false;
 113         }
 114     }
 115     return true;
 116 }
 117
 118 /***********************************************************/
 119 /* guest cycle counter */
 120
 /* NOTE(review): the comment below is not attached to any declaration in
  * the visible code -- confirm which fields it was meant to describe.
  */
 121 /* Protected by TimersState seqlock */
 122
     /* Set from the "sleep" option in configure_icount().  When false,
      * qemu_start_warp_timer() advances the virtual clock immediately
      * instead of arming the warp timer.
      */
 123 static bool icount_sleep = true;
 124 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
     /* Upper bound for icount_time_shift in icount_adjust(). */
 125 #define MAX_ICOUNT_SHIFT 10
 126
 /*
  * State shared by the tick counter, the VM clock and the icount machinery.
  * A single static instance (timers_state) is used by this file.
  */
 127 typedef struct TimersState {
 128     /* Protected by BQL.  */
         /* NOTE(review): cpu_get_ticks() accesses these two fields under
          * vm_clock_lock rather than the BQL -- confirm the locking rule.
          */
 129     int64_t cpu_ticks_prev;
 130     int64_t cpu_ticks_offset;
 131
 132     /* Protect fields that can be respectively read outside the
 133      * BQL, and written from multiple threads.
 134      */
 135     QemuSeqLock vm_clock_seqlock;
 136     QemuSpin vm_clock_lock;
 137
         /* Nonzero between cpu_enable_ticks() and cpu_disable_ticks(). */
 138     int16_t cpu_ticks_enabled;
 139
 140     /* Conversion factor from emulated instructions to virtual clock ticks.  */
         /* Used as a left shift: see cpu_icount_to_ns(). */
 141     int16_t icount_time_shift;
 142
 143     /* Compensate for varying guest execution speed.  */
 144     int64_t qemu_icount_bias;
 145
         /* -1 when no warp is pending; otherwise the QEMU_CLOCK_VIRTUAL_RT
          * value recorded by qemu_start_warp_timer().
          */
 146     int64_t vm_clock_warp_start;
         /* Offset combined with get_clock() in cpu_get_clock_locked(). */
 147     int64_t cpu_clock_offset;
 148
 149     /* Only written by TCG thread */
 150     int64_t qemu_icount;
 151
 152     /* for adjusting icount */
 153     QEMUTimer *icount_rt_timer;
 154     QEMUTimer *icount_vm_timer;
 155     QEMUTimer *icount_warp_timer;
 156 } TimersState;
 157
 /* The single TimersState instance; registered for migration in
  * cpu_ticks_init().
  */
 158 static TimersState timers_state;
     /* Whether multi-threaded TCG is in use; set by qemu_tcg_configure(). */
 159 bool mttcg_enabled;
 160
 161 /*
 162  * We default to false if we know other options have been enabled
 163  * which are currently incompatible with MTTCG. Otherwise when each
 164  * guest (target) has been updated to support:
 165  *   - atomic instructions
 166  *   - memory ordering primitives (barriers)
 167  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 168  *
 169  * Once a guest architecture has been converted to the new primitives
 170  * there are two remaining limitations to check.
 171  *
 172  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 173  * - The host must have a stronger memory order than the guest
 174  *
 175  * It may be possible in future to support strong guests on weak hosts
 176  * but that will require tagging all load/stores in a guest with their
 177  * implicit memory order requirements which would likely slow things
 178  * down a lot.
 179  */
 180
 /*
  * Return true when both default memory-order macros are defined and every
  * ordering bit required by the guest is also present in the host's set.
  * Without both macros the answer is always false.
  */
 static bool check_tcg_memory_orders_compatible(void)
 {
     bool compatible = false;

 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
     compatible = !(TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO);
 #endif
     return compatible;
 }
 189
 190 static bool default_mttcg_enabled(void)
 191 {
 192     if (use_icount || TCG_OVERSIZED_GUEST) {
 193         return false;
 194     } else {
 195 #ifdef TARGET_SUPPORTS_MTTCG
 196         return check_tcg_memory_orders_compatible();
 197 #else
 198         return false;
 199 #endif
 200     }
 201 }
 202
 203 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 204 {
 205     const char *t = qemu_opt_get(opts, "thread");
 206     if (t) {
 207         if (strcmp(t, "multi") == 0) {
 208             if (TCG_OVERSIZED_GUEST) {
 209                 error_setg(errp, "No MTTCG when guest word size > hosts");
 210             } else if (use_icount) {
 211                 error_setg(errp, "No MTTCG when icount is enabled");
 212             } else {
 213 #ifndef TARGET_SUPPORTS_MTTCG
 214                 error_report("Guest not yet converted to MTTCG - "
 215                              "you may get unexpected results");
 216 #endif
 217                 if (!check_tcg_memory_orders_compatible()) {
 218                     error_report("Guest expects a stronger memory ordering "
 219                                  "than the host provides");
 220                     error_printf("This may cause strange/hard to debug errors\n");
 221                 }
 222                 mttcg_enabled = true;
 223             }
 224         } else if (strcmp(t, "single") == 0) {
 225             mttcg_enabled = false;
 226         } else {
 227             error_setg(errp, "Invalid 'thread' setting %s", t);
 228         }
 229     } else {
 230         mttcg_enabled = default_mttcg_enabled();
 231     }
 232 }
 233
 234 /* The current number of executed instructions is based on what we
 235  * originally budgeted minus the current state of the decrementing
 236  * icount counters in extra/u16.low.
 237  */
 238 static int64_t cpu_get_icount_executed(CPUState *cpu)
 239 {
 240     return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
 241 }
 242
 243 /*
 244  * Update the global shared timer_state.qemu_icount to take into
 245  * account executed instructions. This is done by the TCG vCPU
 246  * thread so the main-loop can see time has moved forward.
 247  */
 248 static void cpu_update_icount_locked(CPUState *cpu)
 249 {
 250     int64_t executed = cpu_get_icount_executed(cpu);
 251     cpu->icount_budget -= executed;
 252
 253     atomic_set_i64(&timers_state.qemu_icount,
 254                    timers_state.qemu_icount + executed);
 255 }
 256
 257 /*
 258  * Update the global shared timer_state.qemu_icount to take into
 259  * account executed instructions. This is done by the TCG vCPU
 260  * thread so the main-loop can see time has moved forward.
 261  */
 262 void cpu_update_icount(CPUState *cpu)
 263 {
 264     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 265                        &timers_state.vm_clock_lock);
 266     cpu_update_icount_locked(cpu);
 267     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 268                          &timers_state.vm_clock_lock);
 269 }
 270
 271 static int64_t cpu_get_icount_raw_locked(void)
 272 {
 273     CPUState *cpu = current_cpu;
 274
 275     if (cpu && cpu->running) {
 276         if (!cpu->can_do_io) {
 277             error_report("Bad icount read");
 278             exit(1);
 279         }
 280         /* Take into account what has run */
 281         cpu_update_icount_locked(cpu);
 282     }
 283     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 284     return atomic_read_i64(&timers_state.qemu_icount);
 285 }
 286
 287 static int64_t cpu_get_icount_locked(void)
 288 {
 289     int64_t icount = cpu_get_icount_raw_locked();
 290     return atomic_read_i64(&timers_state.qemu_icount_bias) +
 291         cpu_icount_to_ns(icount);
 292 }
 293
 294 int64_t cpu_get_icount_raw(void)
 295 {
 296     int64_t icount;
 297     unsigned start;
 298
 299     do {
 300         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 301         icount = cpu_get_icount_raw_locked();
 302     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 303
 304     return icount;
 305 }
 306
 307 /* Return the virtual CPU time, based on the instruction counter.  */
 308 int64_t cpu_get_icount(void)
 309 {
 310     int64_t icount;
 311     unsigned start;
 312
 313     do {
 314         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 315         icount = cpu_get_icount_locked();
 316     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 317
 318     return icount;
 319 }
 320
 321 int64_t cpu_icount_to_ns(int64_t icount)
 322 {
 323     return icount << atomic_read(&timers_state.icount_time_shift);
 324 }
 325
 326 static int64_t cpu_get_ticks_locked(void)
 327 {
 328     int64_t ticks = timers_state.cpu_ticks_offset;
 329     if (timers_state.cpu_ticks_enabled) {
 330         ticks += cpu_get_host_ticks();
 331     }
 332
 333     if (timers_state.cpu_ticks_prev > ticks) {
 334         /* Non increasing ticks may happen if the host uses software suspend.  */
 335         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 336         ticks = timers_state.cpu_ticks_prev;
 337     }
 338
 339     timers_state.cpu_ticks_prev = ticks;
 340     return ticks;
 341 }
 342
 343 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 344  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 345  * counter.
 346  */
 347 int64_t cpu_get_ticks(void)
 348 {
 349     int64_t ticks;
 350
 351     if (use_icount) {
 352         return cpu_get_icount();
 353     }
 354
 355     qemu_spin_lock(&timers_state.vm_clock_lock);
 356     ticks = cpu_get_ticks_locked();
 357     qemu_spin_unlock(&timers_state.vm_clock_lock);
 358     return ticks;
 359 }
 360
 361 static int64_t cpu_get_clock_locked(void)
 362 {
 363     int64_t time;
 364
 365     time = timers_state.cpu_clock_offset;
 366     if (timers_state.cpu_ticks_enabled) {
 367         time += get_clock();
 368     }
 369
 370     return time;
 371 }
 372
 373 /* Return the monotonic time elapsed in VM, i.e.,
 374  * the time between vm_start and vm_stop
 375  */
 376 int64_t cpu_get_clock(void)
 377 {
 378     int64_t ti;
 379     unsigned start;
 380
 381     do {
 382         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 383         ti = cpu_get_clock_locked();
 384     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 385
 386     return ti;
 387 }
 388
 389 /* enable cpu_get_ticks()
 390  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 391  */
 392 void cpu_enable_ticks(void)
 393 {
 394     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 395                        &timers_state.vm_clock_lock);
 396     if (!timers_state.cpu_ticks_enabled) {
 397         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 398         timers_state.cpu_clock_offset -= get_clock();
 399         timers_state.cpu_ticks_enabled = 1;
 400     }
 401     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 402                        &timers_state.vm_clock_lock);
 403 }
 404
 405 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 406  * cpu_get_ticks() after that.
 407  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 408  */
 409 void cpu_disable_ticks(void)
 410 {
 411     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 412                        &timers_state.vm_clock_lock);
 413     if (timers_state.cpu_ticks_enabled) {
 414         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 415         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 416         timers_state.cpu_ticks_enabled = 0;
 417     }
 418     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 419                          &timers_state.vm_clock_lock);
 420 }
 421
 422 /* Correlation between real and virtual time is always going to be
 423    fairly approximate, so ignore small variation.
 424    When the guest is idle real and virtual time will be aligned in
 425    the IO wait loop.  */
 426 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 427
 428 static void icount_adjust(void)
 429 {
 430     int64_t cur_time;
 431     int64_t cur_icount;
 432     int64_t delta;
 433
 434     /* Protected by TimersState mutex.  */
 435     static int64_t last_delta;
 436
 437     /* If the VM is not running, then do nothing.  */
 438     if (!runstate_is_running()) {
 439         return;
 440     }
 441
 442     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 443                        &timers_state.vm_clock_lock);
 444     cur_time = cpu_get_clock_locked();
 445     cur_icount = cpu_get_icount_locked();
 446
 447     delta = cur_icount - cur_time;
 448     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 449     if (delta > 0
 450         && last_delta + ICOUNT_WOBBLE < delta * 2
 451         && timers_state.icount_time_shift > 0) {
 452         /* The guest is getting too far ahead.  Slow time down.  */
 453         atomic_set(&timers_state.icount_time_shift,
 454                    timers_state.icount_time_shift - 1);
 455     }
 456     if (delta < 0
 457         && last_delta - ICOUNT_WOBBLE > delta * 2
 458         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 459         /* The guest is getting too far behind.  Speed time up.  */
 460         atomic_set(&timers_state.icount_time_shift,
 461                    timers_state.icount_time_shift + 1);
 462     }
 463     last_delta = delta;
 464     atomic_set_i64(&timers_state.qemu_icount_bias,
 465                    cur_icount - (timers_state.qemu_icount
 466                                  << timers_state.icount_time_shift));
 467     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 468                          &timers_state.vm_clock_lock);
 469 }
 470
 471 static void icount_adjust_rt(void *opaque)
 472 {
 473     timer_mod(timers_state.icount_rt_timer,
 474               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 475     icount_adjust();
 476 }
 477
 478 static void icount_adjust_vm(void *opaque)
 479 {
 480     timer_mod(timers_state.icount_vm_timer,
 481                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 482                    NANOSECONDS_PER_SECOND / 10);
 483     icount_adjust();
 484 }
 485
 486 static int64_t qemu_icount_round(int64_t count)
 487 {
 488     int shift = atomic_read(&timers_state.icount_time_shift);
 489     return (count + (1 << shift) - 1) >> shift;
 490 }
 491
 492 static void icount_warp_rt(void)
 493 {
 494     unsigned seq;
 495     int64_t warp_start;
 496
 497     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 498      * changes from -1 to another value, so the race here is okay.
 499      */
 500     do {
 501         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 502         warp_start = timers_state.vm_clock_warp_start;
 503     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 504
 505     if (warp_start == -1) {
 506         return;
 507     }
 508
 509     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 510                        &timers_state.vm_clock_lock);
 511     if (runstate_is_running()) {
 512         int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
 513                                      cpu_get_clock_locked());
 514         int64_t warp_delta;
 515
 516         warp_delta = clock - timers_state.vm_clock_warp_start;
 517         if (use_icount == 2) {
 518             /*
 519              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 520              * far ahead of real time.
 521              */
 522             int64_t cur_icount = cpu_get_icount_locked();
 523             int64_t delta = clock - cur_icount;
 524             warp_delta = MIN(warp_delta, delta);
 525         }
 526         atomic_set_i64(&timers_state.qemu_icount_bias,
 527                        timers_state.qemu_icount_bias + warp_delta);
 528     }
 529     timers_state.vm_clock_warp_start = -1;
 530     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 531                        &timers_state.vm_clock_lock);
 532
 533     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 534         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 535     }
 536 }
 537
 /*
  * Callback of icount_warp_timer; simply accounts the pending warp.
  * No need for a checkpoint because the timer already synchronizes
  * with CHECKPOINT_CLOCK_VIRTUAL_RT.
  */
 static void icount_timer_cb(void *opaque)
 {
     (void)opaque;
     icount_warp_rt();
 }
 545
 546 void qtest_clock_warp(int64_t dest)
 547 {
 548     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 549     AioContext *aio_context;
 550     assert(qtest_enabled());
 551     aio_context = qemu_get_aio_context();
 552     while (clock < dest) {
 553         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 554         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 555
 556         seqlock_write_lock(&timers_state.vm_clock_seqlock,
 557                            &timers_state.vm_clock_lock);
 558         atomic_set_i64(&timers_state.qemu_icount_bias,
 559                        timers_state.qemu_icount_bias + warp);
 560         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 561                              &timers_state.vm_clock_lock);
 562
 563         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 564         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 565         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 566     }
 567     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 568 }
 569
 570 void qemu_start_warp_timer(void)
 571 {
 572     int64_t clock;
 573     int64_t deadline;
 574
 575     if (!use_icount) {
 576         return;
 577     }
 578
 579     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 580      * do not fire, so computing the deadline does not make sense.
 581      */
 582     if (!runstate_is_running()) {
 583         return;
 584     }
 585
 586     /* warp clock deterministically in record/replay mode */
 587     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 588         return;
 589     }
 590
 591     if (!all_cpu_threads_idle()) {
 592         return;
 593     }
 594
 595     if (qtest_enabled()) {
 596         /* When testing, qtest commands advance icount.  */
 597         return;
 598     }
 599
 600     /* We want to use the earliest deadline from ALL vm_clocks */
 601     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 602     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 603     if (deadline < 0) {
 604         static bool notified;
 605         if (!icount_sleep && !notified) {
 606             warn_report("icount sleep disabled and no active timers");
 607             notified = true;
 608         }
 609         return;
 610     }
 611
 612     if (deadline > 0) {
 613         /*
 614          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 615          * sleep.  Otherwise, the CPU might be waiting for a future timer
 616          * interrupt to wake it up, but the interrupt never comes because
 617          * the vCPU isn't running any insns and thus doesn't advance the
 618          * QEMU_CLOCK_VIRTUAL.
 619          */
 620         if (!icount_sleep) {
 621             /*
 622              * We never let VCPUs sleep in no sleep icount mode.
 623              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 624              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 625              * It is useful when we want a deterministic execution time,
 626              * isolated from host latencies.
 627              */
 628             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 629                                &timers_state.vm_clock_lock);
 630             atomic_set_i64(&timers_state.qemu_icount_bias,
 631                            timers_state.qemu_icount_bias + deadline);
 632             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 633                                  &timers_state.vm_clock_lock);
 634             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 635         } else {
 636             /*
 637              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 638              * "real" time, (related to the time left until the next event) has
 639              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 640              * This avoids that the warps are visible externally; for example,
 641              * you will not be sending network packets continuously instead of
 642              * every 100ms.
 643              */
 644             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 645                                &timers_state.vm_clock_lock);
 646             if (timers_state.vm_clock_warp_start == -1
 647                 || timers_state.vm_clock_warp_start > clock) {
 648                 timers_state.vm_clock_warp_start = clock;
 649             }
 650             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 651                                  &timers_state.vm_clock_lock);
 652             timer_mod_anticipate(timers_state.icount_warp_timer,
 653                                  clock + deadline);
 654         }
 655     } else if (deadline == 0) {
 656         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 657     }
 658 }
 659
 660 static void qemu_account_warp_timer(void)
 661 {
 662     if (!use_icount || !icount_sleep) {
 663         return;
 664     }
 665
 666     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 667      * do not fire, so computing the deadline does not make sense.
 668      */
 669     if (!runstate_is_running()) {
 670         return;
 671     }
 672
 673     /* warp clock deterministically in record/replay mode */
 674     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 675         return;
 676     }
 677
 678     timer_del(timers_state.icount_warp_timer);
 679     icount_warp_rt();
 680 }
 681
 682 static bool icount_state_needed(void *opaque)
 683 {
 684     return use_icount;
 685 }
 686
 687 static bool warp_timer_state_needed(void *opaque)
 688 {
 689     TimersState *s = opaque;
 690     return s->icount_warp_timer != NULL;
 691 }
 692
 693 static bool adjust_timers_state_needed(void *opaque)
 694 {
 695     TimersState *s = opaque;
 696     return s->icount_rt_timer != NULL;
 697 }
 698
 699 /*
 700  * The warp timer subsection is optional because the timer may not be created
 701  */
     /* Carries the pending warp start and the warp timer; only included when
      * warp_timer_state_needed() returns true.
      */
 702 static const VMStateDescription icount_vmstate_warp_timer = {
 703     .name = "timer/icount/warp_timer",
 704     .version_id = 1,
 705     .minimum_version_id = 1,
 706     .needed = warp_timer_state_needed,
 707     .fields = (VMStateField[]) {
 708         VMSTATE_INT64(vm_clock_warp_start, TimersState),
 709         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
 710         VMSTATE_END_OF_LIST()
 711     }
 712 };
 713
 /*
  * Subsection with the two icount adjustment timers; only included when
  * adjust_timers_state_needed() returns true, i.e. when icount_rt_timer
  * exists.
  */
 714 static const VMStateDescription icount_vmstate_adjust_timers = {
 715     .name = "timer/icount/timers",
 716     .version_id = 1,
 717     .minimum_version_id = 1,
 718     .needed = adjust_timers_state_needed,
 719     .fields = (VMStateField[]) {
 720         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
 721         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
 722         VMSTATE_END_OF_LIST()
 723     }
 724 };
 725
 726 /*
 727  * This is a subsection for icount migration.
 728  */
     /* Carries the icount bias and counter, with the warp timer and the
      * adjustment timers as nested subsections; only included when
      * icount_state_needed() returns true.
      */
 729 static const VMStateDescription icount_vmstate_timers = {
 730     .name = "timer/icount",
 731     .version_id = 1,
 732     .minimum_version_id = 1,
 733     .needed = icount_state_needed,
 734     .fields = (VMStateField[]) {
 735         VMSTATE_INT64(qemu_icount_bias, TimersState),
 736         VMSTATE_INT64(qemu_icount, TimersState),
 737         VMSTATE_END_OF_LIST()
 738     },
 739     .subsections = (const VMStateDescription*[]) {
 740         &icount_vmstate_warp_timer,
 741         &icount_vmstate_adjust_timers,
 742         NULL
 743     }
 744 };
 745
 /*
  * Top-level migration description for TimersState, registered by
  * cpu_ticks_init().  The icount state travels as a subsection.
  */
 746 static const VMStateDescription vmstate_timers = {
 747     .name = "timer",
 748     .version_id = 2,
 749     .minimum_version_id = 1,
 750     .fields = (VMStateField[]) {
 751         VMSTATE_INT64(cpu_ticks_offset, TimersState),
             /* NOTE(review): 8 unused bytes, presumably the slot of a field
              * that is no longer migrated -- confirm against history.
              */
 752         VMSTATE_UNUSED(8),
             /* Versioned field: tagged with version 2 of this description. */
 753         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
 754         VMSTATE_END_OF_LIST()
 755     },
 756     .subsections = (const VMStateDescription*[]) {
 757         &icount_vmstate_timers,
 758         NULL
 759     }
 760 };
 761
 762 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 763 {
 764     double pct;
 765     double throttle_ratio;
 766     long sleeptime_ns;
 767
 768     if (!cpu_throttle_get_percentage()) {
 769         return;
 770     }
 771
 772     pct = (double)cpu_throttle_get_percentage()/100;
 773     throttle_ratio = pct / (1 - pct);
 774     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
 775
 776     qemu_mutex_unlock_iothread();
 777     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
 778     qemu_mutex_lock_iothread();
 779     atomic_set(&cpu->throttle_thread_scheduled, 0);
 780 }
 781
 782 static void cpu_throttle_timer_tick(void *opaque)
 783 {
 784     CPUState *cpu;
 785     double pct;
 786
 787     /* Stop the timer if needed */
 788     if (!cpu_throttle_get_percentage()) {
 789         return;
 790     }
 791     CPU_FOREACH(cpu) {
 792         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 793             async_run_on_cpu(cpu, cpu_throttle_thread,
 794                              RUN_ON_CPU_NULL);
 795         }
 796     }
 797
 798     pct = (double)cpu_throttle_get_percentage()/100;
 799     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 800                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 801 }
 802
 803 void cpu_throttle_set(int new_throttle_pct)
 804 {
 805     /* Ensure throttle percentage is within valid range */
 806     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 807     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 808
 809     atomic_set(&throttle_percentage, new_throttle_pct);
 810
 811     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 812                                        CPU_THROTTLE_TIMESLICE_NS);
 813 }
 814
 815 void cpu_throttle_stop(void)
 816 {
 817     atomic_set(&throttle_percentage, 0);
 818 }
 819
 820 bool cpu_throttle_active(void)
 821 {
 822     return (cpu_throttle_get_percentage() != 0);
 823 }
 824
 825 int cpu_throttle_get_percentage(void)
 826 {
 827     return atomic_read(&throttle_percentage);
 828 }
 829
 830 void cpu_ticks_init(void)
 831 {
 832     seqlock_init(&timers_state.vm_clock_seqlock);
 833     qemu_spin_init(&timers_state.vm_clock_lock);
 834     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 835     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 836                                            cpu_throttle_timer_tick, NULL);
 837 }
 838
 839 void configure_icount(QemuOpts *opts, Error **errp)
 840 {
 841     const char *option;
 842     char *rem_str = NULL;
 843
 844     option = qemu_opt_get(opts, "shift");
 845     if (!option) {
 846         if (qemu_opt_get(opts, "align") != NULL) {
 847             error_setg(errp, "Please specify shift option when using align");
 848         }
 849         return;
 850     }
 851
 852     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 853     if (icount_sleep) {
 854         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 855                                          icount_timer_cb, NULL);
 856     }
 857
 858     icount_align_option = qemu_opt_get_bool(opts, "align", false);
 859
 860     if (icount_align_option && !icount_sleep) {
 861         error_setg(errp, "align=on and sleep=off are incompatible");
 862     }
 863     if (strcmp(option, "auto") != 0) {
 864         errno = 0;
 865         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
 866         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 867             error_setg(errp, "icount: Invalid shift value");
 868         }
 869         use_icount = 1;
 870         return;
 871     } else if (icount_align_option) {
 872         error_setg(errp, "shift=auto and align=on are incompatible");
 873     } else if (!icount_sleep) {
 874         error_setg(errp, "shift=auto and sleep=off are incompatible");
 875     }
 876
 877     use_icount = 2;
 878
 879     /* 125MIPS seems a reasonable initial guess at the guest speed.
 880        It will be corrected fairly quickly anyway.  */
 881     timers_state.icount_time_shift = 3;
 882
 883     /* Have both realtime and virtual time triggers for speed adjustment.
 884        The realtime trigger catches emulated time passing too slowly,
 885        the virtual time trigger catches emulated time passing too fast.
 886        Realtime triggers occur even when idle, so use them less frequently
 887        than VM triggers.  */
 888     timers_state.vm_clock_warp_start = -1;
 889     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 890                                    icount_adjust_rt, NULL);
 891     timer_mod(timers_state.icount_rt_timer,
 892                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 893     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 894                                         icount_adjust_vm, NULL);
 895     timer_mod(timers_state.icount_vm_timer,
 896                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 897                    NANOSECONDS_PER_SECOND / 10);
 898 }
 899
 900 /***********************************************************/
 901 /* TCG vCPU kick timer
 902  *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 907  *
 908  * The timer is removed if all vCPUs are idle and restarted again once
 909  * idleness is complete.
 910  */
 911
/* Timer that periodically kicks the running vCPU in round-robin mode;
 * NULL while no kick timer is armed.
 */
static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU currently being executed by the round-robin thread, or NULL.
 * Accessed with atomic_mb_read()/atomic_mb_set()/atomic_set().
 */
static CPUState *tcg_current_rr_cpu;

/* Interval between two kicks, in nanoseconds (a tenth of a second). */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 916
 917 static inline int64_t qemu_tcg_next_kick(void)
 918 {
 919     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 920 }
 921
 922 /* Kick the currently round-robin scheduled vCPU */
 923 static void qemu_cpu_kick_rr_cpu(void)
 924 {
 925     CPUState *cpu;
 926     do {
 927         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 928         if (cpu) {
 929             cpu_exit(cpu);
 930         }
 931     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 932 }
 933
/* Intentionally empty work item.  qemu_timer_notify_cb() queues it with
 * async_run_on_cpu() purely for the effect of having work pending on the
 * target vCPU; neither argument is used.
 */
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}
 937
 938 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 939 {
 940     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 941         qemu_notify_event();
 942         return;
 943     }
 944
 945     if (qemu_in_vcpu_thread()) {
 946         /* A CPU is currently running; kick it back out to the
 947          * tcg_cpu_exec() loop so it will recalculate its
 948          * icount deadline immediately.
 949          */
 950         qemu_cpu_kick(current_cpu);
 951     } else if (first_cpu) {
 952         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 953          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 954          * causes cpu_thread_is_idle to return false.  This way,
 955          * handle_icount_deadline can run.
 956          * If we have no CPUs at all for some reason, we don't
 957          * need to do anything.
 958          */
 959         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 960     }
 961 }
 962
/* Callback of tcg_kick_vcpu_timer: re-arm the timer for the next period,
 * then kick whichever vCPU the round-robin thread is currently running.
 * @opaque is unused.
 */
static void kick_tcg_thread(void *opaque)
{
    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    qemu_cpu_kick_rr_cpu();
}
 968
 969 static void start_tcg_kick_timer(void)
 970 {
 971     assert(!mttcg_enabled);
 972     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 973         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 974                                            kick_tcg_thread, NULL);
 975         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 976     }
 977 }
 978
 979 static void stop_tcg_kick_timer(void)
 980 {
 981     assert(!mttcg_enabled);
 982     if (tcg_kick_vcpu_timer) {
 983         timer_del(tcg_kick_vcpu_timer);
 984         tcg_kick_vcpu_timer = NULL;
 985     }
 986 }
 987
 988 /***********************************************************/
 989 void hw_error(const char *fmt, ...)
 990 {
 991     va_list ap;
 992     CPUState *cpu;
 993
 994     va_start(ap, fmt);
 995     fprintf(stderr, "qemu: hardware error: ");
 996     vfprintf(stderr, fmt, ap);
 997     fprintf(stderr, "\n");
 998     CPU_FOREACH(cpu) {
 999         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1000         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
1001     }
1002     va_end(ap);
1003     abort();
1004 }
1005
1006 void cpu_synchronize_all_states(void)
1007 {
1008     CPUState *cpu;
1009
1010     CPU_FOREACH(cpu) {
1011         cpu_synchronize_state(cpu);
1012         /* TODO: move to cpu_synchronize_state() */
1013         if (hvf_enabled()) {
1014             hvf_cpu_synchronize_state(cpu);
1015         }
1016     }
1017 }
1018
1019 void cpu_synchronize_all_post_reset(void)
1020 {
1021     CPUState *cpu;
1022
1023     CPU_FOREACH(cpu) {
1024         cpu_synchronize_post_reset(cpu);
1025         /* TODO: move to cpu_synchronize_post_reset() */
1026         if (hvf_enabled()) {
1027             hvf_cpu_synchronize_post_reset(cpu);
1028         }
1029     }
1030 }
1031
1032 void cpu_synchronize_all_post_init(void)
1033 {
1034     CPUState *cpu;
1035
1036     CPU_FOREACH(cpu) {
1037         cpu_synchronize_post_init(cpu);
1038         /* TODO: move to cpu_synchronize_post_init() */
1039         if (hvf_enabled()) {
1040             hvf_cpu_synchronize_post_init(cpu);
1041         }
1042     }
1043 }
1044
1045 void cpu_synchronize_all_pre_loadvm(void)
1046 {
1047     CPUState *cpu;
1048
1049     CPU_FOREACH(cpu) {
1050         cpu_synchronize_pre_loadvm(cpu);
1051     }
1052 }
1053
1054 static int do_vm_stop(RunState state, bool send_stop)
1055 {
1056     int ret = 0;
1057
1058     if (runstate_is_running()) {
1059         cpu_disable_ticks();
1060         pause_all_vcpus();
1061         runstate_set(state);
1062         vm_state_notify(0, state);
1063         if (send_stop) {
1064             qapi_event_send_stop();
1065         }
1066     }
1067
1068     bdrv_drain_all();
1069     replay_disable_events();
1070     ret = bdrv_flush_all();
1071
1072     return ret;
1073 }
1074
1075 /* Special vm_stop() variant for terminating the process.  Historically clients
1076  * did not expect a QMP STOP event and so we need to retain compatibility.
1077  */
1078 int vm_shutdown(void)
1079 {
1080     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1081 }
1082
1083 static bool cpu_can_run(CPUState *cpu)
1084 {
1085     if (cpu->stop) {
1086         return false;
1087     }
1088     if (cpu_is_stopped(cpu)) {
1089         return false;
1090     }
1091     return true;
1092 }
1093
/* Called by the vCPU thread loops when execution returned EXCP_DEBUG:
 * registers @cpu through gdb_set_stop_cpu(), raises a system debug
 * request and marks @cpu as stopped.
 */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1100
1101 #ifdef CONFIG_LINUX
/* Restore the default SIGBUS disposition and raise the signal again.
 * Does not return: if execution gets past the re-raise, or if
 * sigaction() fails, the failure is reported and the process aborts.
 */
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        /* Unblock SIGBUS in this thread (presumably so that the signal
         * just raised is delivered with the default action).
         */
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    /* NOTE(review): perror() appends ": <error text>" after the message,
     * which already ends in a newline.
     */
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}
1118
1119 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1120 {
1121     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1122         sigbus_reraise();
1123     }
1124
1125     if (current_cpu) {
1126         /* Called asynchronously in VCPU thread.  */
1127         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1128             sigbus_reraise();
1129         }
1130     } else {
1131         /* Called synchronously (via signalfd) in main thread.  */
1132         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1133             sigbus_reraise();
1134         }
1135     }
1136 }
1137
/* Install sigbus_handler() for SIGBUS (with SA_SIGINFO, so the handler
 * receives a siginfo_t) and set the PR_MCE_KILL policy to
 * PR_MCE_KILL_EARLY.  The return values of sigaction() and prctl() are
 * not checked.
 */
static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}
1149 #else /* !CONFIG_LINUX */
/* Without CONFIG_LINUX no SIGBUS handler is installed; this is a no-op. */
static void qemu_init_sigbus(void)
{
}
1153 #endif /* !CONFIG_LINUX */
1154
/* Mutex taken and released by qemu_mutex_lock_iothread_impl() and
 * qemu_mutex_unlock_iothread(); it is also the mutex handed to the
 * qemu_cond_wait() calls on the condition variables below and on the
 * per-CPU halt_cond.
 */
static QemuMutex qemu_global_mutex;

/* Thread that called qemu_init_cpu_loop(). */
static QemuThread io_thread;

/* cpu creation: signalled by vCPU threads after they set or clear
 * cpu->created
 */
static QemuCond qemu_cpu_cond;
/* system init; broadcast by qemu_cpu_stop() and waited on by
 * pause_all_vcpus()
 */
static QemuCond qemu_pause_cond;
1163
/* One-time set-up for the vCPU loop machinery: install the SIGBUS
 * handling, initialise the file-level condition variables and
 * qemu_global_mutex, and record the calling thread in io_thread.
 */
void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}
1173
/* Run @func with argument @data on @cpu.  Thin wrapper around
 * do_run_on_cpu() that supplies qemu_global_mutex as the mutex argument.
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1178
1179 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1180 {
1181     if (kvm_destroy_vcpu(cpu) < 0) {
1182         error_report("kvm_destroy_vcpu failed");
1183         exit(EXIT_FAILURE);
1184     }
1185 }
1186
/* TCG counterpart of qemu_kvm_destroy_vcpu(); currently does nothing. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1190
/* Mark @cpu as stopped and wake every waiter on qemu_pause_cond.
 * Must be called from @cpu's own thread (asserted).  The pending stop
 * request is cleared; when @exit is true, cpu_exit() is additionally
 * called on @cpu.
 */
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}
1201
/* Common tail of the wait-for-event helpers: clear the thread_kicked
 * flag, honour a pending stop request, then process the work queued
 * for @cpu.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
1210
/* Round-robin TCG variant of qemu_wait_io_event(): sleep on @cpu's
 * halt_cond for as long as all vCPU threads are idle, with the kick
 * timer stopped meanwhile, then restart the kick timer and run the
 * common event handling for @cpu.
 */
static void qemu_tcg_rr_wait_io_event(CPUState *cpu)
{
    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    qemu_wait_io_event_common(cpu);
}
1222
/* Sleep on @cpu's halt_cond while its thread is idle, then run the
 * common event handling (stop requests, queued work) for @cpu.
 */
static void qemu_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

#ifdef _WIN32
    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
    if (!tcg_enabled()) {
        SleepEx(0, TRUE);
    }
#endif
    qemu_wait_io_event_common(cpu);
}
1237
/* Thread function of a KVM vCPU.  @arg is the CPUState to run.
 *
 * Initialises the vCPU in KVM (a failure is fatal), announces creation on
 * qemu_cpu_cond and then alternates between executing the guest and
 * waiting for events until the CPU has been unplugged and can no longer
 * run.  On exit the KVM vCPU is destroyed and cpu->created is cleared.
 * Always returns NULL.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Tear down: the iothread lock is still held at this point. */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1280
1281 static void *qemu_dummy_cpu_thread_fn(void *arg)
1282 {
1283 #ifdef _WIN32
1284     error_report("qtest is not supported under Windows");
1285     exit(1);
1286 #else
1287     CPUState *cpu = arg;
1288     sigset_t waitset;
1289     int r;
1290
1291     rcu_register_thread();
1292
1293     qemu_mutex_lock_iothread();
1294     qemu_thread_get_self(cpu->thread);
1295     cpu->thread_id = qemu_get_thread_id();
1296     cpu->can_do_io = 1;
1297     current_cpu = cpu;
1298
1299     sigemptyset(&waitset);
1300     sigaddset(&waitset, SIG_IPI);
1301
1302     /* signal CPU creation */
1303     cpu->created = true;
1304     qemu_cond_signal(&qemu_cpu_cond);
1305
1306     do {
1307         qemu_mutex_unlock_iothread();
1308         do {
1309             int sig;
1310             r = sigwait(&waitset, &sig);
1311         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1312         if (r == -1) {
1313             perror("sigwait");
1314             exit(1);
1315         }
1316         qemu_mutex_lock_iothread();
1317         qemu_wait_io_event(cpu);
1318     } while (!cpu->unplug);
1319
1320     rcu_unregister_thread();
1321     return NULL;
1322 #endif
1323 }
1324
1325 static int64_t tcg_get_icount_limit(void)
1326 {
1327     int64_t deadline;
1328
1329     if (replay_mode != REPLAY_MODE_PLAY) {
1330         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1331
1332         /* Maintain prior (possibly buggy) behaviour where if no deadline
1333          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1334          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1335          * nanoseconds.
1336          */
1337         if ((deadline < 0) || (deadline > INT32_MAX)) {
1338             deadline = INT32_MAX;
1339         }
1340
1341         return qemu_icount_round(deadline);
1342     } else {
1343         return replay_get_instructions();
1344     }
1345 }
1346
1347 static void handle_icount_deadline(void)
1348 {
1349     assert(qemu_in_vcpu_thread());
1350     if (use_icount) {
1351         int64_t deadline =
1352             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1353
1354         if (deadline == 0) {
1355             /* Wake up other AioContexts.  */
1356             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1357             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1358         }
1359     }
1360 }
1361
1362 static void prepare_icount_for_run(CPUState *cpu)
1363 {
1364     if (use_icount) {
1365         int insns_left;
1366
1367         /* These should always be cleared by process_icount_data after
1368          * each vCPU execution. However u16.high can be raised
1369          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1370          */
1371         g_assert(cpu->icount_decr.u16.low == 0);
1372         g_assert(cpu->icount_extra == 0);
1373
1374         cpu->icount_budget = tcg_get_icount_limit();
1375         insns_left = MIN(0xffff, cpu->icount_budget);
1376         cpu->icount_decr.u16.low = insns_left;
1377         cpu->icount_extra = cpu->icount_budget - insns_left;
1378
1379         replay_mutex_lock();
1380     }
1381 }
1382
1383 static void process_icount_data(CPUState *cpu)
1384 {
1385     if (use_icount) {
1386         /* Account for executed instructions */
1387         cpu_update_icount(cpu);
1388
1389         /* Reset the counters */
1390         cpu->icount_decr.u16.low = 0;
1391         cpu->icount_extra = 0;
1392         cpu->icount_budget = 0;
1393
1394         replay_account_executed_instructions();
1395
1396         replay_mutex_unlock();
1397     }
1398 }
1399
1400
/* Execute guest code on @cpu with TCG, bracketed by cpu_exec_start() and
 * cpu_exec_end(), and return cpu_exec()'s result.  With CONFIG_PROFILER
 * the elapsed time is added to tcg_time.
 */
static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

    assert(tcg_enabled());
#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    tcg_time += profile_getclock() - ti;
#endif
    return ret;
}
1420
1421 /* Destroy any remaining vCPUs which have been unplugged and have
1422  * finished running
1423  */
1424 static void deal_with_unplugged_cpus(void)
1425 {
1426     CPUState *cpu;
1427
1428     CPU_FOREACH(cpu) {
1429         if (cpu->unplug && !cpu_can_run(cpu)) {
1430             qemu_tcg_destroy_vcpu(cpu);
1431             cpu->created = false;
1432             qemu_cond_signal(&qemu_cpu_cond);
1433             break;
1434         }
1435     }
1436 }
1437
1438 /* Single-threaded TCG
1439  *
1440  * In the single-threaded case each vCPU is simulated in turn. If
1441  * there is more than a single vCPU we create a simple timer to kick
1442  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1443  * This is done explicitly rather than relying on side-effects
1444  * elsewhere.
1445  */
1446
/* Thread function of the single thread that runs every vCPU in turn
 * (round-robin TCG).  @arg is the CPUState the thread was created for.
 * The main loop never terminates.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /* The iothread lock is dropped before the replay mutex is taken
         * and re-acquired afterwards.
         */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* Wrap around to the first vCPU once the end of the list is hit. */
        if (!cpu) {
            cpu = first_cpu;
        }

        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code runs without the iothread lock held. */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* An unplugged vCPU is skipped over before leaving the
                 * scheduling loop.
                 */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        qemu_tcg_rr_wait_io_event(cpu ? cpu : first_cpu);
        deal_with_unplugged_cpus();
    }

    /* Not reached: the loop above has no exit. */
    rcu_unregister_thread();
    return NULL;
}
1551
1552 static void *qemu_hax_cpu_thread_fn(void *arg)
1553 {
1554     CPUState *cpu = arg;
1555     int r;
1556
1557     rcu_register_thread();
1558     qemu_mutex_lock_iothread();
1559     qemu_thread_get_self(cpu->thread);
1560
1561     cpu->thread_id = qemu_get_thread_id();
1562     cpu->created = true;
1563     cpu->halted = 0;
1564     current_cpu = cpu;
1565
1566     hax_init_vcpu(cpu);
1567     qemu_cond_signal(&qemu_cpu_cond);
1568
1569     do {
1570         if (cpu_can_run(cpu)) {
1571             r = hax_smp_cpu_exec(cpu);
1572             if (r == EXCP_DEBUG) {
1573                 cpu_handle_guest_debug(cpu);
1574             }
1575         }
1576
1577         qemu_wait_io_event(cpu);
1578     } while (!cpu->unplug || cpu_can_run(cpu));
1579     rcu_unregister_thread();
1580     return NULL;
1581 }
1582
/* The HVF-specific vCPU thread function. This one should only run when the host
 * CPU supports the VMX "unrestricted guest" feature.
 *
 * @arg is the CPUState to run.  The thread initialises the vCPU in HVF,
 * announces creation on qemu_cpu_cond and loops executing the guest and
 * waiting for events until the CPU is unplugged and can no longer run;
 * it then destroys the HVF vCPU and clears cpu->created.
 * Always returns NULL.
 */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Tear down: the iothread lock is still held at this point. */
    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1625
/* Thread function of a WHPX vCPU.  @arg is the CPUState to run.
 *
 * Initialises the vCPU in WHPX (a failure is fatal), announces creation
 * on qemu_cpu_cond and loops executing the guest and waiting for events
 * until the CPU is unplugged and can no longer run; it then destroys the
 * WHPX vCPU and clears cpu->created.  Always returns NULL.
 */
static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        /* Same wait as qemu_wait_io_event(), but without its Windows
         * SleepEx() step.
         */
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Tear down: the iothread lock is still held at this point. */
    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1668
1669 #ifdef _WIN32
/* Empty APC routine; qemu_cpu_kick_thread() queues it on a vCPU thread
 * with QueueUserAPC().  The argument is unused.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1673 #endif
1674
1675 /* Multi-threaded TCG
1676  *
1677  * In the multi-threaded case each vCPU has its own thread. The TLS
1678  * variable current_cpu can be used deep in the code to find the
1679  * current CPUState for a given thread.
1680  */
1681
/* Thread function of one vCPU under multi-threaded TCG.  @arg is the
 * CPUState to run.  icount must be disabled (asserted).
 *
 * The thread loops executing guest code without the iothread lock and
 * handling the exit reason with the lock held, until the CPU has been
 * unplugged and can no longer run.  Always returns NULL.
 */
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    g_assert(!use_icount);

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);

    /* process any pending work */
    cpu->exit_request = 1;

    do {
        if (cpu_can_run(cpu)) {
            int r;
            qemu_mutex_unlock_iothread();
            r = tcg_cpu_exec(cpu);
            qemu_mutex_lock_iothread();
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start-up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
                /* fall through */
            default:
                /* Ignore everything else? */
                break;
            }
        }

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Tear down: the iothread lock is still held at this point. */
    qemu_tcg_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1745
1746 static void qemu_cpu_kick_thread(CPUState *cpu)
1747 {
1748 #ifndef _WIN32
1749     int err;
1750
1751     if (cpu->thread_kicked) {
1752         return;
1753     }
1754     cpu->thread_kicked = true;
1755     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1756     if (err) {
1757         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1758         exit(1);
1759     }
1760 #else /* _WIN32 */
1761     if (!qemu_cpu_is_self(cpu)) {
1762         if (whpx_enabled()) {
1763             whpx_vcpu_kick(cpu);
1764         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1765             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1766                     __func__, GetLastError());
1767             exit(1);
1768         }
1769     }
1770 #endif
1771 }
1772
/* Wake up @cpu: broadcast its halt_cond and then force it out of guest
 * execution.  Under TCG this uses cpu_exit() plus a kick of the current
 * round-robin vCPU; for the other accelerators the host thread is
 * kicked (HAX additionally gets exit_request set first).
 */
void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (tcg_enabled()) {
        cpu_exit(cpu);
        /* NOP unless doing single-thread RR */
        qemu_cpu_kick_rr_cpu();
    } else {
        if (hax_enabled()) {
            /*
             * FIXME: race condition with the exit_request check in
             * hax_vcpu_hax_exec
             */
            cpu->exit_request = 1;
        }
        qemu_cpu_kick_thread(cpu);
    }
}
1791
/* Kick the host thread of the calling vCPU.  current_cpu must be set
 * (asserted).
 */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1797
/* Return true if the calling thread is the thread recorded in
 * @cpu->thread.
 */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1802
/* Return true if current_cpu is set and the calling thread is that
 * CPU's thread.
 */
bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}
1807
/* Per-thread flag: true between qemu_mutex_lock_iothread_impl() and
 * qemu_mutex_unlock_iothread() on the calling thread.
 */
static __thread bool iothread_locked = false;

/* Return true if the calling thread has taken the iothread lock through
 * the functions below.
 */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1814
/*
 * The BQL is taken from so many places that it is worth profiling the
 * callers directly, instead of funneling them all through a single function.
 *
 * @file and @line identify the call site and are forwarded to the lock
 * function currently installed in qemu_bql_mutex_lock_func.  The calling
 * thread must not already hold the lock (asserted).
 */
void qemu_mutex_lock_iothread_impl(const char *file, int line)
{
    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);

    g_assert(!qemu_mutex_iothread_locked());
    bql_lock(&qemu_global_mutex, file, line);
    iothread_locked = true;
}
1827
/* Release the iothread lock.  The calling thread must hold it
 * (asserted); the per-thread flag is cleared before the mutex is
 * unlocked.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1834
1835 static bool all_vcpus_paused(void)
1836 {
1837     CPUState *cpu;
1838
1839     CPU_FOREACH(cpu) {
1840         if (!cpu->stopped) {
1841             return false;
1842         }
1843     }
1844
1845     return true;
1846 }
1847
/* Stop every vCPU and wait until all of them report being stopped.
 *
 * QEMU_CLOCK_VIRTUAL is disabled first.  The function waits on
 * qemu_pause_cond with qemu_global_mutex and releases the replay mutex
 * while waiting; both locks are held again on return.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            /* We are this vCPU's thread: stop it directly. */
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /* Re-take the replay mutex with the iothread lock dropped, then
     * re-acquire the iothread lock.
     */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1878
/* Let @cpu run again: clear both its stop request and its stopped flag,
 * then kick it.
 */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1885
1886 void resume_all_vcpus(void)
1887 {
1888     CPUState *cpu;
1889
1890     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1891     CPU_FOREACH(cpu) {
1892         cpu_resume(cpu);
1893     }
1894 }
1895
/* Request removal of @cpu and wait for its thread to terminate.
 *
 * The CPU is flagged as stopping and unplugged and then kicked; the
 * iothread lock is released around qemu_thread_join() and re-acquired
 * before returning.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1905
/* Size, in bytes, of the temporary stack buffers in which the vCPU
 * thread names are formatted
 */
#define VCPU_THREAD_NAME_SIZE 16
1908
/* Set up the host thread that will run @cpu under TCG.
 *
 * With MTTCG every vCPU gets its own thread.  Otherwise the first call
 * creates the single round-robin thread and later calls make their CPU
 * share that thread and its halt condition variable.  TCG regions are
 * initialised on the first call.
 */
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    /* Thread and condition variable shared by all CPUs in non-MTTCG mode */
    static QemuCond *single_tcg_halt_cond;
    static QemuThread *single_tcg_cpu_thread;
    static int tcg_region_inited;

    assert(tcg_enabled());
    /*
     * Initialize TCG regions--once. Now is a good time, because:
     * (1) TCG's init context, prologue and target globals have been set up.
     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
     *     -accel flag is processed, so the check doesn't work then).
     */
    if (!tcg_region_inited) {
        tcg_region_inited = 1;
        tcg_region_init();
    }

    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);

        if (qemu_tcg_mttcg_enabled()) {
            /* create a thread per vCPU with TCG (MTTCG) */
            parallel_cpus = true;
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);

            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

        } else {
            /* share a single thread for all cpus with TCG */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
            qemu_thread_create(cpu->thread, thread_name,
                               qemu_tcg_rr_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

            /* Remember them so that later CPUs can reuse them. */
            single_tcg_halt_cond = cpu->halt_cond;
            single_tcg_cpu_thread = cpu->thread;
        }
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    } else {
        /* For non-MTTCG cases we share the thread */
        cpu->thread = single_tcg_cpu_thread;
        cpu->halt_cond = single_tcg_halt_cond;
        cpu->thread_id = first_cpu->thread_id;
        cpu->can_do_io = 1;
        /* No new thread is started, so mark the CPU created right away. */
        cpu->created = true;
    }
}
1964
1965 static void qemu_hax_start_vcpu(CPUState *cpu)
1966 {
1967     char thread_name[VCPU_THREAD_NAME_SIZE];
1968
1969     cpu->thread = g_malloc0(sizeof(QemuThread));
1970     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1971     qemu_cond_init(cpu->halt_cond);
1972
1973     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1974              cpu->cpu_index);
1975     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1976                        cpu, QEMU_THREAD_JOINABLE);
1977 #ifdef _WIN32
1978     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1979 #endif
1980 }
1981
1982 static void qemu_kvm_start_vcpu(CPUState *cpu)
1983 {
1984     char thread_name[VCPU_THREAD_NAME_SIZE];
1985
1986     cpu->thread = g_malloc0(sizeof(QemuThread));
1987     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1988     qemu_cond_init(cpu->halt_cond);
1989     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1990              cpu->cpu_index);
1991     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1992                        cpu, QEMU_THREAD_JOINABLE);
1993 }
1994
1995 static void qemu_hvf_start_vcpu(CPUState *cpu)
1996 {
1997     char thread_name[VCPU_THREAD_NAME_SIZE];
1998
1999     /* HVF currently does not support TCG, and only runs in
2000      * unrestricted-guest mode. */
2001     assert(hvf_enabled());
2002
2003     cpu->thread = g_malloc0(sizeof(QemuThread));
2004     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2005     qemu_cond_init(cpu->halt_cond);
2006
2007     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2008              cpu->cpu_index);
2009     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2010                        cpu, QEMU_THREAD_JOINABLE);
2011 }
2012
2013 static void qemu_whpx_start_vcpu(CPUState *cpu)
2014 {
2015     char thread_name[VCPU_THREAD_NAME_SIZE];
2016
2017     cpu->thread = g_malloc0(sizeof(QemuThread));
2018     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2019     qemu_cond_init(cpu->halt_cond);
2020     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2021              cpu->cpu_index);
2022     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2023                        cpu, QEMU_THREAD_JOINABLE);
2024 #ifdef _WIN32
2025     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2026 #endif
2027 }
2028
2029 static void qemu_dummy_start_vcpu(CPUState *cpu)
2030 {
2031     char thread_name[VCPU_THREAD_NAME_SIZE];
2032
2033     cpu->thread = g_malloc0(sizeof(QemuThread));
2034     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2035     qemu_cond_init(cpu->halt_cond);
2036     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2037              cpu->cpu_index);
2038     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2039                        QEMU_THREAD_JOINABLE);
2040 }
2041
2042 void qemu_init_vcpu(CPUState *cpu)
2043 {
2044     cpu->nr_cores = smp_cores;
2045     cpu->nr_threads = smp_threads;
2046     cpu->stopped = true;
2047
2048     if (!cpu->as) {
2049         /* If the target cpu hasn't set up any address spaces itself,
2050          * give it the default one.
2051          */
2052         cpu->num_ases = 1;
2053         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2054     }
2055
2056     if (kvm_enabled()) {
2057         qemu_kvm_start_vcpu(cpu);
2058     } else if (hax_enabled()) {
2059         qemu_hax_start_vcpu(cpu);
2060     } else if (hvf_enabled()) {
2061         qemu_hvf_start_vcpu(cpu);
2062     } else if (tcg_enabled()) {
2063         qemu_tcg_init_vcpu(cpu);
2064     } else if (whpx_enabled()) {
2065         qemu_whpx_start_vcpu(cpu);
2066     } else {
2067         qemu_dummy_start_vcpu(cpu);
2068     }
2069
2070     while (!cpu->created) {
2071         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2072     }
2073 }
2074
2075 void cpu_stop_current(void)
2076 {
2077     if (current_cpu) {
2078         qemu_cpu_stop(current_cpu, true);
2079     }
2080 }
2081
2082 int vm_stop(RunState state)
2083 {
2084     if (qemu_in_vcpu_thread()) {
2085         qemu_system_vmstop_request_prepare();
2086         qemu_system_vmstop_request(state);
2087         /*
2088          * FIXME: should not return to device code in case
2089          * vm_stop() has been requested.
2090          */
2091         cpu_stop_current();
2092         return 0;
2093     }
2094
2095     return do_vm_stop(state, true);
2096 }
2097
2098 /**
2099  * Prepare for (re)starting the VM.
2100  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2101  * running or in case of an error condition), 0 otherwise.
2102  */
2103 int vm_prepare_start(void)
2104 {
2105     RunState requested;
2106
2107     qemu_vmstop_requested(&requested);
2108     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2109         return -1;
2110     }
2111
2112     /* Ensure that a STOP/RESUME pair of events is emitted if a
2113      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2114      * example, according to documentation is always followed by
2115      * the STOP event.
2116      */
2117     if (runstate_is_running()) {
2118         qapi_event_send_stop();
2119         qapi_event_send_resume();
2120         return -1;
2121     }
2122
2123     /* We are sending this now, but the CPUs will be resumed shortly later */
2124     qapi_event_send_resume();
2125
2126     replay_enable_events();
2127     cpu_enable_ticks();
2128     runstate_set(RUN_STATE_RUNNING);
2129     vm_state_notify(1, RUN_STATE_RUNNING);
2130     return 0;
2131 }
2132
/* Start the VM: resume all vCPUs, but only if vm_prepare_start() returns 0. */
void vm_start(void)
{
    if (vm_prepare_start() != 0) {
        return;
    }

    resume_all_vcpus();
}
2139
2140 /* does a state transition even if the VM is already stopped,
2141    current state is forgotten forever */
2142 int vm_stop_force_state(RunState state)
2143 {
2144     if (runstate_is_running()) {
2145         return vm_stop(state);
2146     } else {
2147         runstate_set(state);
2148
2149         bdrv_drain_all();
2150         /* Make sure to return an error if the flush in a previous vm_stop()
2151          * failed. */
2152         return bdrv_flush_all();
2153     }
2154 }
2155
2156 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
2157 {
2158     /* XXX: implement xxx_cpu_list for targets that still miss it */
2159 #if defined(cpu_list)
2160     cpu_list(f, cpu_fprintf);
2161 #endif
2162 }
2163
2164 CpuInfoList *qmp_query_cpus(Error **errp)
2165 {
2166     MachineState *ms = MACHINE(qdev_get_machine());
2167     MachineClass *mc = MACHINE_GET_CLASS(ms);
2168     CpuInfoList *head = NULL, *cur_item = NULL;
2169     CPUState *cpu;
2170
2171     CPU_FOREACH(cpu) {
2172         CpuInfoList *info;
2173 #if defined(TARGET_I386)
2174         X86CPU *x86_cpu = X86_CPU(cpu);
2175         CPUX86State *env = &x86_cpu->env;
2176 #elif defined(TARGET_PPC)
2177         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2178         CPUPPCState *env = &ppc_cpu->env;
2179 #elif defined(TARGET_SPARC)
2180         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2181         CPUSPARCState *env = &sparc_cpu->env;
2182 #elif defined(TARGET_RISCV)
2183         RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2184         CPURISCVState *env = &riscv_cpu->env;
2185 #elif defined(TARGET_MIPS)
2186         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2187         CPUMIPSState *env = &mips_cpu->env;
2188 #elif defined(TARGET_TRICORE)
2189         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2190         CPUTriCoreState *env = &tricore_cpu->env;
2191 #elif defined(TARGET_S390X)
2192         S390CPU *s390_cpu = S390_CPU(cpu);
2193         CPUS390XState *env = &s390_cpu->env;
2194 #endif
2195
2196         cpu_synchronize_state(cpu);
2197
2198         info = g_malloc0(sizeof(*info));
2199         info->value = g_malloc0(sizeof(*info->value));
2200         info->value->CPU = cpu->cpu_index;
2201         info->value->current = (cpu == first_cpu);
2202         info->value->halted = cpu->halted;
2203         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2204         info->value->thread_id = cpu->thread_id;
2205 #if defined(TARGET_I386)
2206         info->value->arch = CPU_INFO_ARCH_X86;
2207         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2208 #elif defined(TARGET_PPC)
2209         info->value->arch = CPU_INFO_ARCH_PPC;
2210         info->value->u.ppc.nip = env->nip;
2211 #elif defined(TARGET_SPARC)
2212         info->value->arch = CPU_INFO_ARCH_SPARC;
2213         info->value->u.q_sparc.pc = env->pc;
2214         info->value->u.q_sparc.npc = env->npc;
2215 #elif defined(TARGET_MIPS)
2216         info->value->arch = CPU_INFO_ARCH_MIPS;
2217         info->value->u.q_mips.PC = env->active_tc.PC;
2218 #elif defined(TARGET_TRICORE)
2219         info->value->arch = CPU_INFO_ARCH_TRICORE;
2220         info->value->u.tricore.PC = env->PC;
2221 #elif defined(TARGET_S390X)
2222         info->value->arch = CPU_INFO_ARCH_S390;
2223         info->value->u.s390.cpu_state = env->cpu_state;
2224 #elif defined(TARGET_RISCV)
2225         info->value->arch = CPU_INFO_ARCH_RISCV;
2226         info->value->u.riscv.pc = env->pc;
2227 #else
2228         info->value->arch = CPU_INFO_ARCH_OTHER;
2229 #endif
2230         info->value->has_props = !!mc->cpu_index_to_instance_props;
2231         if (info->value->has_props) {
2232             CpuInstanceProperties *props;
2233             props = g_malloc0(sizeof(*props));
2234             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2235             info->value->props = props;
2236         }
2237
2238         /* XXX: waiting for the qapi to support GSList */
2239         if (!cur_item) {
2240             head = cur_item = info;
2241         } else {
2242             cur_item->next = info;
2243             cur_item = info;
2244         }
2245     }
2246
2247     return head;
2248 }
2249
2250 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2251 {
2252     /*
2253      * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2254      * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2255      */
2256     switch (target) {
2257     case SYS_EMU_TARGET_I386:
2258     case SYS_EMU_TARGET_X86_64:
2259         return CPU_INFO_ARCH_X86;
2260
2261     case SYS_EMU_TARGET_PPC:
2262     case SYS_EMU_TARGET_PPC64:
2263         return CPU_INFO_ARCH_PPC;
2264
2265     case SYS_EMU_TARGET_SPARC:
2266     case SYS_EMU_TARGET_SPARC64:
2267         return CPU_INFO_ARCH_SPARC;
2268
2269     case SYS_EMU_TARGET_MIPS:
2270     case SYS_EMU_TARGET_MIPSEL:
2271     case SYS_EMU_TARGET_MIPS64:
2272     case SYS_EMU_TARGET_MIPS64EL:
2273         return CPU_INFO_ARCH_MIPS;
2274
2275     case SYS_EMU_TARGET_TRICORE:
2276         return CPU_INFO_ARCH_TRICORE;
2277
2278     case SYS_EMU_TARGET_S390X:
2279         return CPU_INFO_ARCH_S390;
2280
2281     case SYS_EMU_TARGET_RISCV32:
2282     case SYS_EMU_TARGET_RISCV64:
2283         return CPU_INFO_ARCH_RISCV;
2284
2285     default:
2286         return CPU_INFO_ARCH_OTHER;
2287     }
2288 }
2289
2290 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2291 {
2292 #ifdef TARGET_S390X
2293     S390CPU *s390_cpu = S390_CPU(cpu);
2294     CPUS390XState *env = &s390_cpu->env;
2295
2296     info->cpu_state = env->cpu_state;
2297 #else
2298     abort();
2299 #endif
2300 }
2301
2302 /*
2303  * fast means: we NEVER interrupt vCPU threads to retrieve
2304  * information from KVM.
2305  */
2306 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2307 {
2308     MachineState *ms = MACHINE(qdev_get_machine());
2309     MachineClass *mc = MACHINE_GET_CLASS(ms);
2310     CpuInfoFastList *head = NULL, *cur_item = NULL;
2311     SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2312                                           -1, &error_abort);
2313     CPUState *cpu;
2314
2315     CPU_FOREACH(cpu) {
2316         CpuInfoFastList *info = g_malloc0(sizeof(*info));
2317         info->value = g_malloc0(sizeof(*info->value));
2318
2319         info->value->cpu_index = cpu->cpu_index;
2320         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2321         info->value->thread_id = cpu->thread_id;
2322
2323         info->value->has_props = !!mc->cpu_index_to_instance_props;
2324         if (info->value->has_props) {
2325             CpuInstanceProperties *props;
2326             props = g_malloc0(sizeof(*props));
2327             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2328             info->value->props = props;
2329         }
2330
2331         info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2332         info->value->target = target;
2333         if (target == SYS_EMU_TARGET_S390X) {
2334             cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2335         }
2336
2337         if (!cur_item) {
2338             head = cur_item = info;
2339         } else {
2340             cur_item->next = info;
2341             cur_item = info;
2342         }
2343     }
2344
2345     return head;
2346 }
2347
2348 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2349                  bool has_cpu, int64_t cpu_index, Error **errp)
2350 {
2351     FILE *f;
2352     uint32_t l;
2353     CPUState *cpu;
2354     uint8_t buf[1024];
2355     int64_t orig_addr = addr, orig_size = size;
2356
2357     if (!has_cpu) {
2358         cpu_index = 0;
2359     }
2360
2361     cpu = qemu_get_cpu(cpu_index);
2362     if (cpu == NULL) {
2363         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2364                    "a CPU number");
2365         return;
2366     }
2367
2368     f = fopen(filename, "wb");
2369     if (!f) {
2370         error_setg_file_open(errp, errno, filename);
2371         return;
2372     }
2373
2374     while (size != 0) {
2375         l = sizeof(buf);
2376         if (l > size)
2377             l = size;
2378         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2379             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2380                              " specified", orig_addr, orig_size);
2381             goto exit;
2382         }
2383         if (fwrite(buf, 1, l, f) != l) {
2384             error_setg(errp, QERR_IO_ERROR);
2385             goto exit;
2386         }
2387         addr += l;
2388         size -= l;
2389     }
2390
2391 exit:
2392     fclose(f);
2393 }
2394
2395 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2396                   Error **errp)
2397 {
2398     FILE *f;
2399     uint32_t l;
2400     uint8_t buf[1024];
2401
2402     f = fopen(filename, "wb");
2403     if (!f) {
2404         error_setg_file_open(errp, errno, filename);
2405         return;
2406     }
2407
2408     while (size != 0) {
2409         l = sizeof(buf);
2410         if (l > size)
2411             l = size;
2412         cpu_physical_memory_read(addr, buf, l);
2413         if (fwrite(buf, 1, l, f) != l) {
2414             error_setg(errp, QERR_IO_ERROR);
2415             goto exit;
2416         }
2417         addr += l;
2418         size -= l;
2419     }
2420
2421 exit:
2422     fclose(f);
2423 }
2424
2425 void qmp_inject_nmi(Error **errp)
2426 {
2427     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2428 }
2429
2430 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2431 {
2432     if (!use_icount) {
2433         return;
2434     }
2435
2436     cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
2437                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2438     if (icount_align_option) {
2439         cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
2440         cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
2441     } else {
2442         cpu_fprintf(f, "Max guest delay     NA\n");
2443         cpu_fprintf(f, "Max guest advance   NA\n");
2444     }
2445 }