cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu/config-file.h"
  27 #include "cpu.h"
  28 #include "monitor/monitor.h"
  29 #include "qapi/error.h"
  30 #include "qapi/qapi-commands-misc.h"
  31 #include "qapi/qapi-events-run-state.h"
  32 #include "qapi/qmp/qerror.h"
  33 #include "qemu/error-report.h"
  34 #include "sysemu/sysemu.h"
  35 #include "sysemu/block-backend.h"
  36 #include "exec/gdbstub.h"
  37 #include "sysemu/dma.h"
  38 #include "sysemu/hw_accel.h"
  39 #include "sysemu/kvm.h"
  40 #include "sysemu/hax.h"
  41 #include "sysemu/hvf.h"
  42 #include "sysemu/whpx.h"
  43 #include "exec/exec-all.h"
  44
  45 #include "qemu/thread.h"
  46 #include "sysemu/cpus.h"
  47 #include "sysemu/qtest.h"
  48 #include "qemu/main-loop.h"
  49 #include "qemu/option.h"
  50 #include "qemu/bitmap.h"
  51 #include "qemu/seqlock.h"
  52 #include "tcg.h"
  53 #include "hw/nmi.h"
  54 #include "sysemu/replay.h"
  55 #include "hw/boards.h"
  56
  57 #ifdef CONFIG_LINUX
  58
  59 #include <sys/prctl.h>
  60
  61 #ifndef PR_MCE_KILL
  62 #define PR_MCE_KILL 33
  63 #endif
  64
  65 #ifndef PR_MCE_KILL_SET
  66 #define PR_MCE_KILL_SET 1
  67 #endif
  68
  69 #ifndef PR_MCE_KILL_EARLY
  70 #define PR_MCE_KILL_EARLY 1
  71 #endif
  72
  73 #endif /* CONFIG_LINUX */
  74
  75 int64_t max_delay;
  76 int64_t max_advance;
  77
  78 /* vcpu throttling controls */
  79 static QEMUTimer *throttle_timer;
  80 static unsigned int throttle_percentage;
  81
  82 #define CPU_THROTTLE_PCT_MIN 1
  83 #define CPU_THROTTLE_PCT_MAX 99
  84 #define CPU_THROTTLE_TIMESLICE_NS 10000000
  85
  86 bool cpu_is_stopped(CPUState *cpu)
  87 {
  88     return cpu->stopped || !runstate_is_running();
  89 }
  90
  91 static bool cpu_thread_is_idle(CPUState *cpu)
  92 {
  93     if (cpu->stop || cpu->queued_work_first) {
  94         return false;
  95     }
  96     if (cpu_is_stopped(cpu)) {
  97         return true;
  98     }
  99     if (!cpu->halted || cpu_has_work(cpu) ||
 100         kvm_halt_in_kernel()) {
 101         return false;
 102     }
 103     return true;
 104 }
 105
 106 static bool all_cpu_threads_idle(void)
 107 {
 108     CPUState *cpu;
 109
 110     CPU_FOREACH(cpu) {
 111         if (!cpu_thread_is_idle(cpu)) {
 112             return false;
 113         }
 114     }
 115     return true;
 116 }
 117
 118 /***********************************************************/
 119 /* guest cycle counter */
 120
 121 /* Protected by TimersState seqlock */
 122
 123 static bool icount_sleep = true;
 124 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
 125 #define MAX_ICOUNT_SHIFT 10
 126
/*
 * Global bookkeeping for the guest-visible tick counter, the VM clock
 * and the instruction counter (icount).  A single instance,
 * timers_state, is defined below and registered for migration.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    /* Last value handed out by cpu_get_ticks_locked(); used to keep the
     * returned tick count from going backwards.
     */
    int64_t cpu_ticks_prev;
    /* Added to the host tick counter to form the guest tick count.  */
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    /* Non-zero between cpu_enable_ticks() and cpu_disable_ticks().  */
    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks.  */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;

    /* Start of a pending clock warp; -1 means no warp is pending.  */
    int64_t vm_clock_warp_start;
    /* Added to get_clock() to form the VM clock while ticks are enabled.  */
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    QEMUTimer *icount_warp_timer;
} TimersState;
 157
 158 static TimersState timers_state;
 159 bool mttcg_enabled;
 160
 161 /*
 162  * We default to false if we know other options have been enabled
 163  * which are currently incompatible with MTTCG. Otherwise when each
 164  * guest (target) has been updated to support:
 165  *   - atomic instructions
 166  *   - memory ordering primitives (barriers)
 167  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 168  *
 169  * Once a guest architecture has been converted to the new primitives
 170  * there are two remaining limitations to check.
 171  *
 172  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 173  * - The host must have a stronger memory order than the guest
 174  *
 175  * It may be possible in future to support strong guests on weak hosts
 176  * but that will require tagging all load/stores in a guest with their
 177  * implicit memory order requirements which would likely slow things
 178  * down a lot.
 179  */
 180
 181 static bool check_tcg_memory_orders_compatible(void)
 182 {
 183 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 184     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 185 #else
 186     return false;
 187 #endif
 188 }
 189
 190 static bool default_mttcg_enabled(void)
 191 {
 192     if (use_icount || TCG_OVERSIZED_GUEST) {
 193         return false;
 194     } else {
 195 #ifdef TARGET_SUPPORTS_MTTCG
 196         return check_tcg_memory_orders_compatible();
 197 #else
 198         return false;
 199 #endif
 200     }
 201 }
 202
 203 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 204 {
 205     const char *t = qemu_opt_get(opts, "thread");
 206     if (t) {
 207         if (strcmp(t, "multi") == 0) {
 208             if (TCG_OVERSIZED_GUEST) {
 209                 error_setg(errp, "No MTTCG when guest word size > hosts");
 210             } else if (use_icount) {
 211                 error_setg(errp, "No MTTCG when icount is enabled");
 212             } else {
 213 #ifndef TARGET_SUPPORTS_MTTCG
 214                 warn_report("Guest not yet converted to MTTCG - "
 215                             "you may get unexpected results");
 216 #endif
 217                 if (!check_tcg_memory_orders_compatible()) {
 218                     warn_report("Guest expects a stronger memory ordering "
 219                                 "than the host provides");
 220                     error_printf("This may cause strange/hard to debug errors\n");
 221                 }
 222                 mttcg_enabled = true;
 223             }
 224         } else if (strcmp(t, "single") == 0) {
 225             mttcg_enabled = false;
 226         } else {
 227             error_setg(errp, "Invalid 'thread' setting %s", t);
 228         }
 229     } else {
 230         mttcg_enabled = default_mttcg_enabled();
 231     }
 232 }
 233
 234 /* The current number of executed instructions is based on what we
 235  * originally budgeted minus the current state of the decrementing
 236  * icount counters in extra/u16.low.
 237  */
 238 static int64_t cpu_get_icount_executed(CPUState *cpu)
 239 {
 240     return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
 241 }
 242
 243 /*
 244  * Update the global shared timer_state.qemu_icount to take into
 245  * account executed instructions. This is done by the TCG vCPU
 246  * thread so the main-loop can see time has moved forward.
 247  */
 248 static void cpu_update_icount_locked(CPUState *cpu)
 249 {
 250     int64_t executed = cpu_get_icount_executed(cpu);
 251     cpu->icount_budget -= executed;
 252
 253     atomic_set_i64(&timers_state.qemu_icount,
 254                    timers_state.qemu_icount + executed);
 255 }
 256
 257 /*
 258  * Update the global shared timer_state.qemu_icount to take into
 259  * account executed instructions. This is done by the TCG vCPU
 260  * thread so the main-loop can see time has moved forward.
 261  */
 262 void cpu_update_icount(CPUState *cpu)
 263 {
 264     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 265                        &timers_state.vm_clock_lock);
 266     cpu_update_icount_locked(cpu);
 267     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 268                          &timers_state.vm_clock_lock);
 269 }
 270
 271 static int64_t cpu_get_icount_raw_locked(void)
 272 {
 273     CPUState *cpu = current_cpu;
 274
 275     if (cpu && cpu->running) {
 276         if (!cpu->can_do_io) {
 277             error_report("Bad icount read");
 278             exit(1);
 279         }
 280         /* Take into account what has run */
 281         cpu_update_icount_locked(cpu);
 282     }
 283     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 284     return atomic_read_i64(&timers_state.qemu_icount);
 285 }
 286
 287 static int64_t cpu_get_icount_locked(void)
 288 {
 289     int64_t icount = cpu_get_icount_raw_locked();
 290     return atomic_read_i64(&timers_state.qemu_icount_bias) +
 291         cpu_icount_to_ns(icount);
 292 }
 293
 294 int64_t cpu_get_icount_raw(void)
 295 {
 296     int64_t icount;
 297     unsigned start;
 298
 299     do {
 300         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 301         icount = cpu_get_icount_raw_locked();
 302     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 303
 304     return icount;
 305 }
 306
 307 /* Return the virtual CPU time, based on the instruction counter.  */
 308 int64_t cpu_get_icount(void)
 309 {
 310     int64_t icount;
 311     unsigned start;
 312
 313     do {
 314         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 315         icount = cpu_get_icount_locked();
 316     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 317
 318     return icount;
 319 }
 320
 321 int64_t cpu_icount_to_ns(int64_t icount)
 322 {
 323     return icount << atomic_read(&timers_state.icount_time_shift);
 324 }
 325
 326 static int64_t cpu_get_ticks_locked(void)
 327 {
 328     int64_t ticks = timers_state.cpu_ticks_offset;
 329     if (timers_state.cpu_ticks_enabled) {
 330         ticks += cpu_get_host_ticks();
 331     }
 332
 333     if (timers_state.cpu_ticks_prev > ticks) {
 334         /* Non increasing ticks may happen if the host uses software suspend.  */
 335         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 336         ticks = timers_state.cpu_ticks_prev;
 337     }
 338
 339     timers_state.cpu_ticks_prev = ticks;
 340     return ticks;
 341 }
 342
 343 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 344  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 345  * counter.
 346  */
 347 int64_t cpu_get_ticks(void)
 348 {
 349     int64_t ticks;
 350
 351     if (use_icount) {
 352         return cpu_get_icount();
 353     }
 354
 355     qemu_spin_lock(&timers_state.vm_clock_lock);
 356     ticks = cpu_get_ticks_locked();
 357     qemu_spin_unlock(&timers_state.vm_clock_lock);
 358     return ticks;
 359 }
 360
 361 static int64_t cpu_get_clock_locked(void)
 362 {
 363     int64_t time;
 364
 365     time = timers_state.cpu_clock_offset;
 366     if (timers_state.cpu_ticks_enabled) {
 367         time += get_clock();
 368     }
 369
 370     return time;
 371 }
 372
 373 /* Return the monotonic time elapsed in VM, i.e.,
 374  * the time between vm_start and vm_stop
 375  */
 376 int64_t cpu_get_clock(void)
 377 {
 378     int64_t ti;
 379     unsigned start;
 380
 381     do {
 382         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 383         ti = cpu_get_clock_locked();
 384     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 385
 386     return ti;
 387 }
 388
 389 /* enable cpu_get_ticks()
 390  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 391  */
 392 void cpu_enable_ticks(void)
 393 {
 394     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 395                        &timers_state.vm_clock_lock);
 396     if (!timers_state.cpu_ticks_enabled) {
 397         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 398         timers_state.cpu_clock_offset -= get_clock();
 399         timers_state.cpu_ticks_enabled = 1;
 400     }
 401     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 402                        &timers_state.vm_clock_lock);
 403 }
 404
 405 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 406  * cpu_get_ticks() after that.
 407  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 408  */
 409 void cpu_disable_ticks(void)
 410 {
 411     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 412                        &timers_state.vm_clock_lock);
 413     if (timers_state.cpu_ticks_enabled) {
 414         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 415         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 416         timers_state.cpu_ticks_enabled = 0;
 417     }
 418     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 419                          &timers_state.vm_clock_lock);
 420 }
 421
 422 /* Correlation between real and virtual time is always going to be
 423    fairly approximate, so ignore small variation.
 424    When the guest is idle real and virtual time will be aligned in
 425    the IO wait loop.  */
 426 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 427
 428 static void icount_adjust(void)
 429 {
 430     int64_t cur_time;
 431     int64_t cur_icount;
 432     int64_t delta;
 433
 434     /* Protected by TimersState mutex.  */
 435     static int64_t last_delta;
 436
 437     /* If the VM is not running, then do nothing.  */
 438     if (!runstate_is_running()) {
 439         return;
 440     }
 441
 442     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 443                        &timers_state.vm_clock_lock);
 444     cur_time = cpu_get_clock_locked();
 445     cur_icount = cpu_get_icount_locked();
 446
 447     delta = cur_icount - cur_time;
 448     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 449     if (delta > 0
 450         && last_delta + ICOUNT_WOBBLE < delta * 2
 451         && timers_state.icount_time_shift > 0) {
 452         /* The guest is getting too far ahead.  Slow time down.  */
 453         atomic_set(&timers_state.icount_time_shift,
 454                    timers_state.icount_time_shift - 1);
 455     }
 456     if (delta < 0
 457         && last_delta - ICOUNT_WOBBLE > delta * 2
 458         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 459         /* The guest is getting too far behind.  Speed time up.  */
 460         atomic_set(&timers_state.icount_time_shift,
 461                    timers_state.icount_time_shift + 1);
 462     }
 463     last_delta = delta;
 464     atomic_set_i64(&timers_state.qemu_icount_bias,
 465                    cur_icount - (timers_state.qemu_icount
 466                                  << timers_state.icount_time_shift));
 467     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 468                          &timers_state.vm_clock_lock);
 469 }
 470
 471 static void icount_adjust_rt(void *opaque)
 472 {
 473     timer_mod(timers_state.icount_rt_timer,
 474               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 475     icount_adjust();
 476 }
 477
 478 static void icount_adjust_vm(void *opaque)
 479 {
 480     timer_mod(timers_state.icount_vm_timer,
 481                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 482                    NANOSECONDS_PER_SECOND / 10);
 483     icount_adjust();
 484 }
 485
 486 static int64_t qemu_icount_round(int64_t count)
 487 {
 488     int shift = atomic_read(&timers_state.icount_time_shift);
 489     return (count + (1 << shift) - 1) >> shift;
 490 }
 491
 492 static void icount_warp_rt(void)
 493 {
 494     unsigned seq;
 495     int64_t warp_start;
 496
 497     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 498      * changes from -1 to another value, so the race here is okay.
 499      */
 500     do {
 501         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 502         warp_start = timers_state.vm_clock_warp_start;
 503     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 504
 505     if (warp_start == -1) {
 506         return;
 507     }
 508
 509     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 510                        &timers_state.vm_clock_lock);
 511     if (runstate_is_running()) {
 512         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 513                                             cpu_get_clock_locked());
 514         int64_t warp_delta;
 515
 516         warp_delta = clock - timers_state.vm_clock_warp_start;
 517         if (use_icount == 2) {
 518             /*
 519              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 520              * far ahead of real time.
 521              */
 522             int64_t cur_icount = cpu_get_icount_locked();
 523             int64_t delta = clock - cur_icount;
 524             warp_delta = MIN(warp_delta, delta);
 525         }
 526         atomic_set_i64(&timers_state.qemu_icount_bias,
 527                        timers_state.qemu_icount_bias + warp_delta);
 528     }
 529     timers_state.vm_clock_warp_start = -1;
 530     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 531                        &timers_state.vm_clock_lock);
 532
 533     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 534         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 535     }
 536 }
 537
 538 static void icount_timer_cb(void *opaque)
 539 {
 540     /* No need for a checkpoint because the timer already synchronizes
 541      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 542      */
 543     icount_warp_rt();
 544 }
 545
 546 void qtest_clock_warp(int64_t dest)
 547 {
 548     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 549     AioContext *aio_context;
 550     assert(qtest_enabled());
 551     aio_context = qemu_get_aio_context();
 552     while (clock < dest) {
 553         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 554         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 555
 556         seqlock_write_lock(&timers_state.vm_clock_seqlock,
 557                            &timers_state.vm_clock_lock);
 558         atomic_set_i64(&timers_state.qemu_icount_bias,
 559                        timers_state.qemu_icount_bias + warp);
 560         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 561                              &timers_state.vm_clock_lock);
 562
 563         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 564         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 565         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 566     }
 567     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 568 }
 569
 570 void qemu_start_warp_timer(void)
 571 {
 572     int64_t clock;
 573     int64_t deadline;
 574
 575     if (!use_icount) {
 576         return;
 577     }
 578
 579     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 580      * do not fire, so computing the deadline does not make sense.
 581      */
 582     if (!runstate_is_running()) {
 583         return;
 584     }
 585
 586     if (replay_mode != REPLAY_MODE_PLAY) {
 587         if (!all_cpu_threads_idle()) {
 588             return;
 589         }
 590
 591         if (qtest_enabled()) {
 592             /* When testing, qtest commands advance icount.  */
 593             return;
 594         }
 595
 596         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
 597     } else {
 598         /* warp clock deterministically in record/replay mode */
 599         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 600             /* vCPU is sleeping and warp can't be started.
 601                It is probably a race condition: notification sent
 602                to vCPU was processed in advance and vCPU went to sleep.
 603                Therefore we have to wake it up for doing someting. */
 604             if (replay_has_checkpoint()) {
 605                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 606             }
 607             return;
 608         }
 609     }
 610
 611     /* We want to use the earliest deadline from ALL vm_clocks */
 612     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 613     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 614     if (deadline < 0) {
 615         static bool notified;
 616         if (!icount_sleep && !notified) {
 617             warn_report("icount sleep disabled and no active timers");
 618             notified = true;
 619         }
 620         return;
 621     }
 622
 623     if (deadline > 0) {
 624         /*
 625          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 626          * sleep.  Otherwise, the CPU might be waiting for a future timer
 627          * interrupt to wake it up, but the interrupt never comes because
 628          * the vCPU isn't running any insns and thus doesn't advance the
 629          * QEMU_CLOCK_VIRTUAL.
 630          */
 631         if (!icount_sleep) {
 632             /*
 633              * We never let VCPUs sleep in no sleep icount mode.
 634              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 635              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 636              * It is useful when we want a deterministic execution time,
 637              * isolated from host latencies.
 638              */
 639             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 640                                &timers_state.vm_clock_lock);
 641             atomic_set_i64(&timers_state.qemu_icount_bias,
 642                            timers_state.qemu_icount_bias + deadline);
 643             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 644                                  &timers_state.vm_clock_lock);
 645             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 646         } else {
 647             /*
 648              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 649              * "real" time, (related to the time left until the next event) has
 650              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 651              * This avoids that the warps are visible externally; for example,
 652              * you will not be sending network packets continuously instead of
 653              * every 100ms.
 654              */
 655             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 656                                &timers_state.vm_clock_lock);
 657             if (timers_state.vm_clock_warp_start == -1
 658                 || timers_state.vm_clock_warp_start > clock) {
 659                 timers_state.vm_clock_warp_start = clock;
 660             }
 661             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 662                                  &timers_state.vm_clock_lock);
 663             timer_mod_anticipate(timers_state.icount_warp_timer,
 664                                  clock + deadline);
 665         }
 666     } else if (deadline == 0) {
 667         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 668     }
 669 }
 670
 671 static void qemu_account_warp_timer(void)
 672 {
 673     if (!use_icount || !icount_sleep) {
 674         return;
 675     }
 676
 677     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 678      * do not fire, so computing the deadline does not make sense.
 679      */
 680     if (!runstate_is_running()) {
 681         return;
 682     }
 683
 684     /* warp clock deterministically in record/replay mode */
 685     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 686         return;
 687     }
 688
 689     timer_del(timers_state.icount_warp_timer);
 690     icount_warp_rt();
 691 }
 692
 693 static bool icount_state_needed(void *opaque)
 694 {
 695     return use_icount;
 696 }
 697
 698 static bool warp_timer_state_needed(void *opaque)
 699 {
 700     TimersState *s = opaque;
 701     return s->icount_warp_timer != NULL;
 702 }
 703
 704 static bool adjust_timers_state_needed(void *opaque)
 705 {
 706     TimersState *s = opaque;
 707     return s->icount_rt_timer != NULL;
 708 }
 709
/*
 * Optional subsection carrying the warp state (vm_clock_warp_start and
 * icount_warp_timer).  It is included only when
 * warp_timer_state_needed() reports that the warp timer was created.
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 724
/*
 * Optional subsection carrying the two icount adjustment timers.  It is
 * included only when adjust_timers_state_needed() reports that
 * icount_rt_timer was created.
 */
static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 736
/*
 * This is a subsection for icount migration.  It carries the icount
 * bias and instruction counter, is included only when
 * icount_state_needed() returns true, and nests the warp-timer and
 * adjustment-timer subsections.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        NULL
    }
};
 756
/*
 * Top-level migration description for timers_state (registered in
 * cpu_ticks_init()).  Carries cpu_ticks_offset, 8 unused bytes, and
 * cpu_clock_offset (present from version 2 on), with the icount state
 * as a subsection.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_UNUSED(8),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
 772
 773 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 774 {
 775     double pct;
 776     double throttle_ratio;
 777     long sleeptime_ns;
 778
 779     if (!cpu_throttle_get_percentage()) {
 780         return;
 781     }
 782
 783     pct = (double)cpu_throttle_get_percentage()/100;
 784     throttle_ratio = pct / (1 - pct);
 785     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
 786
 787     qemu_mutex_unlock_iothread();
 788     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
 789     qemu_mutex_lock_iothread();
 790     atomic_set(&cpu->throttle_thread_scheduled, 0);
 791 }
 792
 793 static void cpu_throttle_timer_tick(void *opaque)
 794 {
 795     CPUState *cpu;
 796     double pct;
 797
 798     /* Stop the timer if needed */
 799     if (!cpu_throttle_get_percentage()) {
 800         return;
 801     }
 802     CPU_FOREACH(cpu) {
 803         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 804             async_run_on_cpu(cpu, cpu_throttle_thread,
 805                              RUN_ON_CPU_NULL);
 806         }
 807     }
 808
 809     pct = (double)cpu_throttle_get_percentage()/100;
 810     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 811                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 812 }
 813
 814 void cpu_throttle_set(int new_throttle_pct)
 815 {
 816     /* Ensure throttle percentage is within valid range */
 817     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 818     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 819
 820     atomic_set(&throttle_percentage, new_throttle_pct);
 821
 822     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 823                                        CPU_THROTTLE_TIMESLICE_NS);
 824 }
 825
 826 void cpu_throttle_stop(void)
 827 {
 828     atomic_set(&throttle_percentage, 0);
 829 }
 830
 831 bool cpu_throttle_active(void)
 832 {
 833     return (cpu_throttle_get_percentage() != 0);
 834 }
 835
 836 int cpu_throttle_get_percentage(void)
 837 {
 838     return atomic_read(&throttle_percentage);
 839 }
 840
 841 void cpu_ticks_init(void)
 842 {
 843     seqlock_init(&timers_state.vm_clock_seqlock);
 844     qemu_spin_init(&timers_state.vm_clock_lock);
 845     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 846     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 847                                            cpu_throttle_timer_tick, NULL);
 848 }
 849
 850 void configure_icount(QemuOpts *opts, Error **errp)
 851 {
 852     const char *option;
 853     char *rem_str = NULL;
 854
 855     option = qemu_opt_get(opts, "shift");
 856     if (!option) {
 857         if (qemu_opt_get(opts, "align") != NULL) {
 858             error_setg(errp, "Please specify shift option when using align");
 859         }
 860         return;
 861     }
 862
 863     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 864     if (icount_sleep) {
 865         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 866                                          icount_timer_cb, NULL);
 867     }
 868
 869     icount_align_option = qemu_opt_get_bool(opts, "align", false);
 870
 871     if (icount_align_option && !icount_sleep) {
 872         error_setg(errp, "align=on and sleep=off are incompatible");
 873     }
 874     if (strcmp(option, "auto") != 0) {
 875         errno = 0;
 876         timers_state.icount_time_shift = strtol(option, &rem_str, 0);
 877         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 878             error_setg(errp, "icount: Invalid shift value");
 879         }
 880         use_icount = 1;
 881         return;
 882     } else if (icount_align_option) {
 883         error_setg(errp, "shift=auto and align=on are incompatible");
 884     } else if (!icount_sleep) {
 885         error_setg(errp, "shift=auto and sleep=off are incompatible");
 886     }
 887
 888     use_icount = 2;
 889
 890     /* 125MIPS seems a reasonable initial guess at the guest speed.
 891        It will be corrected fairly quickly anyway.  */
 892     timers_state.icount_time_shift = 3;
 893
 894     /* Have both realtime and virtual time triggers for speed adjustment.
 895        The realtime trigger catches emulated time passing too slowly,
 896        the virtual time trigger catches emulated time passing too fast.
 897        Realtime triggers occur even when idle, so use them less frequently
 898        than VM triggers.  */
 899     timers_state.vm_clock_warp_start = -1;
 900     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 901                                    icount_adjust_rt, NULL);
 902     timer_mod(timers_state.icount_rt_timer,
 903                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 904     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 905                                         icount_adjust_vm, NULL);
 906     timer_mod(timers_state.icount_vm_timer,
 907                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 908                    NANOSECONDS_PER_SECOND / 10);
 909 }
 910
 911 /***********************************************************/
 912 /* TCG vCPU kick timer
 913  *
 914  * The kick timer is responsible for moving single threaded vCPU
 915  * emulation on to the next vCPU. If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
 917  * scheduled.
 918  *
 919  * The timer is removed if all vCPUs are idle and restarted again once
 920  * idleness is complete.
 921  */
 922
/* Kick timer for round-robin TCG; NULL until start_tcg_kick_timer()
 * creates it.
 */
static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU the round-robin loop is currently running, or NULL when none is. */
static CPUState *tcg_current_rr_cpu;

/* Interval between two kicks, in nanoseconds (a tenth of a second). */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 927
 928 static inline int64_t qemu_tcg_next_kick(void)
 929 {
 930     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 931 }
 932
 933 /* Kick the currently round-robin scheduled vCPU */
 934 static void qemu_cpu_kick_rr_cpu(void)
 935 {
 936     CPUState *cpu;
 937     do {
 938         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 939         if (cpu) {
 940             cpu_exit(cpu);
 941         }
 942     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 943 }
 944
/* Work item that intentionally does nothing.  qemu_timer_notify_cb()
 * queues it with async_run_on_cpu() purely for the side effect of having
 * work pending on the vCPU.
 */
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}
 948
 949 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 950 {
 951     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 952         qemu_notify_event();
 953         return;
 954     }
 955
 956     if (qemu_in_vcpu_thread()) {
 957         /* A CPU is currently running; kick it back out to the
 958          * tcg_cpu_exec() loop so it will recalculate its
 959          * icount deadline immediately.
 960          */
 961         qemu_cpu_kick(current_cpu);
 962     } else if (first_cpu) {
 963         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 964          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 965          * causes cpu_thread_is_idle to return false.  This way,
 966          * handle_icount_deadline can run.
 967          * If we have no CPUs at all for some reason, we don't
 968          * need to do anything.
 969          */
 970         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 971     }
 972 }
 973
 974 static void kick_tcg_thread(void *opaque)
 975 {
 976     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 977     qemu_cpu_kick_rr_cpu();
 978 }
 979
 980 static void start_tcg_kick_timer(void)
 981 {
 982     assert(!mttcg_enabled);
 983     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 984         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 985                                            kick_tcg_thread, NULL);
 986     }
 987     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
 988         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 989     }
 990 }
 991
 992 static void stop_tcg_kick_timer(void)
 993 {
 994     assert(!mttcg_enabled);
 995     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
 996         timer_del(tcg_kick_vcpu_timer);
 997     }
 998 }
 999
1000 /***********************************************************/
1001 void hw_error(const char *fmt, ...)
1002 {
1003     va_list ap;
1004     CPUState *cpu;
1005
1006     va_start(ap, fmt);
1007     fprintf(stderr, "qemu: hardware error: ");
1008     vfprintf(stderr, fmt, ap);
1009     fprintf(stderr, "\n");
1010     CPU_FOREACH(cpu) {
1011         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1012         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
1013     }
1014     va_end(ap);
1015     abort();
1016 }
1017
1018 void cpu_synchronize_all_states(void)
1019 {
1020     CPUState *cpu;
1021
1022     CPU_FOREACH(cpu) {
1023         cpu_synchronize_state(cpu);
1024         /* TODO: move to cpu_synchronize_state() */
1025         if (hvf_enabled()) {
1026             hvf_cpu_synchronize_state(cpu);
1027         }
1028     }
1029 }
1030
1031 void cpu_synchronize_all_post_reset(void)
1032 {
1033     CPUState *cpu;
1034
1035     CPU_FOREACH(cpu) {
1036         cpu_synchronize_post_reset(cpu);
1037         /* TODO: move to cpu_synchronize_post_reset() */
1038         if (hvf_enabled()) {
1039             hvf_cpu_synchronize_post_reset(cpu);
1040         }
1041     }
1042 }
1043
1044 void cpu_synchronize_all_post_init(void)
1045 {
1046     CPUState *cpu;
1047
1048     CPU_FOREACH(cpu) {
1049         cpu_synchronize_post_init(cpu);
1050         /* TODO: move to cpu_synchronize_post_init() */
1051         if (hvf_enabled()) {
1052             hvf_cpu_synchronize_post_init(cpu);
1053         }
1054     }
1055 }
1056
1057 void cpu_synchronize_all_pre_loadvm(void)
1058 {
1059     CPUState *cpu;
1060
1061     CPU_FOREACH(cpu) {
1062         cpu_synchronize_pre_loadvm(cpu);
1063     }
1064 }
1065
1066 static int do_vm_stop(RunState state, bool send_stop)
1067 {
1068     int ret = 0;
1069
1070     if (runstate_is_running()) {
1071         cpu_disable_ticks();
1072         pause_all_vcpus();
1073         runstate_set(state);
1074         vm_state_notify(0, state);
1075         if (send_stop) {
1076             qapi_event_send_stop();
1077         }
1078     }
1079
1080     bdrv_drain_all();
1081     replay_disable_events();
1082     ret = bdrv_flush_all();
1083
1084     return ret;
1085 }
1086
1087 /* Special vm_stop() variant for terminating the process.  Historically clients
1088  * did not expect a QMP STOP event and so we need to retain compatibility.
1089  */
1090 int vm_shutdown(void)
1091 {
1092     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1093 }
1094
1095 static bool cpu_can_run(CPUState *cpu)
1096 {
1097     if (cpu->stop) {
1098         return false;
1099     }
1100     if (cpu_is_stopped(cpu)) {
1101         return false;
1102     }
1103     return true;
1104 }
1105
/* Called by the vCPU run loops when execution returns EXCP_DEBUG:
 * registers @cpu through gdb_set_stop_cpu(), raises a system debug
 * request and marks the vCPU as stopped.
 */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1112
1113 #ifdef CONFIG_LINUX
1114 static void sigbus_reraise(void)
1115 {
1116     sigset_t set;
1117     struct sigaction action;
1118
1119     memset(&action, 0, sizeof(action));
1120     action.sa_handler = SIG_DFL;
1121     if (!sigaction(SIGBUS, &action, NULL)) {
1122         raise(SIGBUS);
1123         sigemptyset(&set);
1124         sigaddset(&set, SIGBUS);
1125         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1126     }
1127     perror("Failed to re-raise SIGBUS!\n");
1128     abort();
1129 }
1130
1131 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1132 {
1133     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1134         sigbus_reraise();
1135     }
1136
1137     if (current_cpu) {
1138         /* Called asynchronously in VCPU thread.  */
1139         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1140             sigbus_reraise();
1141         }
1142     } else {
1143         /* Called synchronously (via signalfd) in main thread.  */
1144         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1145             sigbus_reraise();
1146         }
1147     }
1148 }
1149
1150 static void qemu_init_sigbus(void)
1151 {
1152     struct sigaction action;
1153
1154     memset(&action, 0, sizeof(action));
1155     action.sa_flags = SA_SIGINFO;
1156     action.sa_sigaction = sigbus_handler;
1157     sigaction(SIGBUS, &action, NULL);
1158
1159     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1160 }
1161 #else /* !CONFIG_LINUX */
/* Without CONFIG_LINUX no SIGBUS handler is installed; this is a no-op. */
static void qemu_init_sigbus(void)
{
}
1165 #endif /* !CONFIG_LINUX */
1166
/* The global iothread mutex (BQL); taken and released through
 * qemu_mutex_lock_iothread_impl() / qemu_mutex_unlock_iothread().
 */
static QemuMutex qemu_global_mutex;

/* Thread that called qemu_init_cpu_loop(). */
static QemuThread io_thread;

/* cpu creation: signalled by vCPU threads whenever cpu->created changes */
static QemuCond qemu_cpu_cond;
/* Broadcast by qemu_cpu_stop(); pause_all_vcpus() waits on it. */
static QemuCond qemu_pause_cond;
1175
1176 void qemu_init_cpu_loop(void)
1177 {
1178     qemu_init_sigbus();
1179     qemu_cond_init(&qemu_cpu_cond);
1180     qemu_cond_init(&qemu_pause_cond);
1181     qemu_mutex_init(&qemu_global_mutex);
1182
1183     qemu_thread_get_self(&io_thread);
1184 }
1185
/* Run @func with @data on @cpu.  Thin wrapper around do_run_on_cpu()
 * that supplies the global iothread mutex.
 * NOTE(review): presumably the caller must hold the iothread lock so that
 * do_run_on_cpu() can wait on it - confirm against do_run_on_cpu().
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1190
1191 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1192 {
1193     if (kvm_destroy_vcpu(cpu) < 0) {
1194         error_report("kvm_destroy_vcpu failed");
1195         exit(EXIT_FAILURE);
1196     }
1197 }
1198
/* TCG counterpart of qemu_kvm_destroy_vcpu(); currently a no-op. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1202
/* Move @cpu into the stopped state.  Must run on @cpu's own thread.
 *
 * @exit: when true, also call cpu_exit() on @cpu.
 *
 * Clears the pending stop request, sets ->stopped and wakes everyone
 * waiting on qemu_pause_cond (pause_all_vcpus() waits there).
 */
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}
1213
/* Housekeeping shared by all vCPU wait paths: clear the "kicked" flag,
 * honour a pending stop request, then run the work queued for @cpu.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
1222
1223 static void qemu_tcg_rr_wait_io_event(void)
1224 {
1225     CPUState *cpu;
1226
1227     while (all_cpu_threads_idle()) {
1228         stop_tcg_kick_timer();
1229         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1230     }
1231
1232     start_tcg_kick_timer();
1233
1234     CPU_FOREACH(cpu) {
1235         qemu_wait_io_event_common(cpu);
1236     }
1237 }
1238
/* Sleep on @cpu's halt condition for as long as the vCPU thread is idle,
 * then run the common wait-event handling.  The wait uses the global
 * iothread mutex.
 */
static void qemu_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

#ifdef _WIN32
    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
    if (!tcg_enabled()) {
        SleepEx(0, TRUE);
    }
#endif
    qemu_wait_io_event_common(cpu);
}
1253
/* Thread body for a KVM vCPU.
 *
 * @arg: the CPUState driven by this thread.
 *
 * Takes the iothread lock, initialises the KVM vCPU (exiting the process
 * on failure), announces creation on qemu_cpu_cond and then alternates
 * between kvm_cpu_exec() and qemu_wait_io_event() until the CPU has been
 * unplugged and can no longer run.  On the way out the vCPU is destroyed,
 * ->created is cleared and signalled, and the iothread lock is released.
 *
 * Always returns NULL.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU destruction */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1296
1297 static void *qemu_dummy_cpu_thread_fn(void *arg)
1298 {
1299 #ifdef _WIN32
1300     error_report("qtest is not supported under Windows");
1301     exit(1);
1302 #else
1303     CPUState *cpu = arg;
1304     sigset_t waitset;
1305     int r;
1306
1307     rcu_register_thread();
1308
1309     qemu_mutex_lock_iothread();
1310     qemu_thread_get_self(cpu->thread);
1311     cpu->thread_id = qemu_get_thread_id();
1312     cpu->can_do_io = 1;
1313     current_cpu = cpu;
1314
1315     sigemptyset(&waitset);
1316     sigaddset(&waitset, SIG_IPI);
1317
1318     /* signal CPU creation */
1319     cpu->created = true;
1320     qemu_cond_signal(&qemu_cpu_cond);
1321
1322     do {
1323         qemu_mutex_unlock_iothread();
1324         do {
1325             int sig;
1326             r = sigwait(&waitset, &sig);
1327         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1328         if (r == -1) {
1329             perror("sigwait");
1330             exit(1);
1331         }
1332         qemu_mutex_lock_iothread();
1333         qemu_wait_io_event(cpu);
1334     } while (!cpu->unplug);
1335
1336     rcu_unregister_thread();
1337     return NULL;
1338 #endif
1339 }
1340
1341 static int64_t tcg_get_icount_limit(void)
1342 {
1343     int64_t deadline;
1344
1345     if (replay_mode != REPLAY_MODE_PLAY) {
1346         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1347
1348         /* Maintain prior (possibly buggy) behaviour where if no deadline
1349          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1350          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1351          * nanoseconds.
1352          */
1353         if ((deadline < 0) || (deadline > INT32_MAX)) {
1354             deadline = INT32_MAX;
1355         }
1356
1357         return qemu_icount_round(deadline);
1358     } else {
1359         return replay_get_instructions();
1360     }
1361 }
1362
1363 static void handle_icount_deadline(void)
1364 {
1365     assert(qemu_in_vcpu_thread());
1366     if (use_icount) {
1367         int64_t deadline =
1368             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1369
1370         if (deadline == 0) {
1371             /* Wake up other AioContexts.  */
1372             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1373             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1374         }
1375     }
1376 }
1377
1378 static void prepare_icount_for_run(CPUState *cpu)
1379 {
1380     if (use_icount) {
1381         int insns_left;
1382
1383         /* These should always be cleared by process_icount_data after
1384          * each vCPU execution. However u16.high can be raised
1385          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1386          */
1387         g_assert(cpu->icount_decr.u16.low == 0);
1388         g_assert(cpu->icount_extra == 0);
1389
1390         cpu->icount_budget = tcg_get_icount_limit();
1391         insns_left = MIN(0xffff, cpu->icount_budget);
1392         cpu->icount_decr.u16.low = insns_left;
1393         cpu->icount_extra = cpu->icount_budget - insns_left;
1394
1395         replay_mutex_lock();
1396     }
1397 }
1398
1399 static void process_icount_data(CPUState *cpu)
1400 {
1401     if (use_icount) {
1402         /* Account for executed instructions */
1403         cpu_update_icount(cpu);
1404
1405         /* Reset the counters */
1406         cpu->icount_decr.u16.low = 0;
1407         cpu->icount_extra = 0;
1408         cpu->icount_budget = 0;
1409
1410         replay_account_executed_instructions();
1411
1412         replay_mutex_unlock();
1413     }
1414 }
1415
1416
/* Execute guest code on @cpu with TCG.
 *
 * Wraps cpu_exec() in cpu_exec_start()/cpu_exec_end() and, when
 * CONFIG_PROFILER is defined, adds the elapsed time to the TCG profile.
 *
 * Returns the value returned by cpu_exec().
 */
static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

    assert(tcg_enabled());
#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    atomic_set(&tcg_ctx->prof.cpu_exec_time,
               tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
#endif
    return ret;
}
1437
1438 /* Destroy any remaining vCPUs which have been unplugged and have
1439  * finished running
1440  */
1441 static void deal_with_unplugged_cpus(void)
1442 {
1443     CPUState *cpu;
1444
1445     CPU_FOREACH(cpu) {
1446         if (cpu->unplug && !cpu_can_run(cpu)) {
1447             qemu_tcg_destroy_vcpu(cpu);
1448             cpu->created = false;
1449             qemu_cond_signal(&qemu_cpu_cond);
1450             break;
1451         }
1452     }
1453 }
1454
1455 /* Single-threaded TCG
1456  *
1457  * In the single-threaded case each vCPU is simulated in turn. If
1458  * there is more than a single vCPU we create a simple timer to kick
1459  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1460  * This is done explicitly rather than relying on side-effects
1461  * elsewhere.
1462  */
1463
/* Thread body for single-threaded (round-robin) TCG.
 *
 * @arg: the CPUState the thread was created for; after start-up the same
 *       variable is reused as the cursor over all vCPUs.
 *
 * One thread runs every vCPU in turn.  Each pass of the outer loop
 * handles the icount/timer bookkeeping, then runs vCPUs one after the
 * other until one has queued work or an exit request, and finally waits
 * for events and reaps unplugged vCPUs.  The iothread lock is dropped
 * only around guest execution and around taking the replay mutex.
 *
 * The outer loop never terminates, so the trailing cleanup is not
 * reached.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /* The replay mutex is taken with the iothread lock released. */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* Wrap around once the end of the CPU list has been reached. */
        if (!cpu) {
            cpu = first_cpu;
        }

        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code runs without the iothread lock. */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* Skip past a vCPU that is being unplugged. */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    rcu_unregister_thread();
    return NULL;
}
1576
1577 static void *qemu_hax_cpu_thread_fn(void *arg)
1578 {
1579     CPUState *cpu = arg;
1580     int r;
1581
1582     rcu_register_thread();
1583     qemu_mutex_lock_iothread();
1584     qemu_thread_get_self(cpu->thread);
1585
1586     cpu->thread_id = qemu_get_thread_id();
1587     cpu->created = true;
1588     cpu->halted = 0;
1589     current_cpu = cpu;
1590
1591     hax_init_vcpu(cpu);
1592     qemu_cond_signal(&qemu_cpu_cond);
1593
1594     do {
1595         if (cpu_can_run(cpu)) {
1596             r = hax_smp_cpu_exec(cpu);
1597             if (r == EXCP_DEBUG) {
1598                 cpu_handle_guest_debug(cpu);
1599             }
1600         }
1601
1602         qemu_wait_io_event(cpu);
1603     } while (!cpu->unplug || cpu_can_run(cpu));
1604     rcu_unregister_thread();
1605     return NULL;
1606 }
1607
/* The HVF-specific vCPU thread function. This one should only run when the host
 * CPU supports the VMX "unrestricted guest" feature.
 *
 * @arg: the CPUState driven by this thread.
 *
 * Takes the iothread lock, initialises the HVF vCPU, announces creation
 * on qemu_cpu_cond and then alternates between hvf_vcpu_exec() and
 * qemu_wait_io_event() until the CPU has been unplugged and can no longer
 * run.  On exit the vCPU is destroyed, ->created is cleared and signalled,
 * and the iothread lock is released.  Always returns NULL.
 */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU destruction */
    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1650
/* Thread body for a WHPX vCPU.
 *
 * @arg: the CPUState driven by this thread.
 *
 * Takes the iothread lock, initialises the WHPX vCPU (exiting the process
 * on failure), announces creation on qemu_cpu_cond and then alternates
 * between whpx_vcpu_exec() and waiting for events until the CPU has been
 * unplugged and can no longer run.  The wait is open-coded: it sleeps on
 * the halt condition and calls qemu_wait_io_event_common() directly
 * rather than going through qemu_wait_io_event().
 *
 * Always returns NULL.
 */
static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU destruction */
    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1693
1694 #ifdef _WIN32
/* Empty APC routine; qemu_cpu_kick_thread() queues it with QueueUserAPC()
 * to interrupt a vCPU thread on Windows.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1698 #endif
1699
1700 /* Multi-threaded TCG
1701  *
1702  * In the multi-threaded case each vCPU has its own thread. The TLS
1703  * variable current_cpu can be used deep in the code to find the
1704  * current CPUState for a given thread.
1705  */
1706
/* Thread body for one vCPU under multi-threaded TCG.
 *
 * @arg: the CPUState driven by this thread.
 *
 * icount must not be in use.  The thread announces creation on
 * qemu_cpu_cond and then alternates between tcg_cpu_exec(), run without
 * the iothread lock, and qemu_wait_io_event() until the CPU has been
 * unplugged and can no longer run.  On exit the vCPU is destroyed,
 * ->created is cleared and signalled, and the iothread lock is released.
 *
 * Always returns NULL.
 */
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    g_assert(!use_icount);

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);

    /* process any pending work */
    cpu->exit_request = 1;

    do {
        if (cpu_can_run(cpu)) {
            int r;
            qemu_mutex_unlock_iothread();
            r = tcg_cpu_exec(cpu);
            qemu_mutex_lock_iothread();
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start-up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
                /* fall through */
            default:
                /* Ignore everything else? */
                break;
            }
        }

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU destruction */
    qemu_tcg_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1770
1771 static void qemu_cpu_kick_thread(CPUState *cpu)
1772 {
1773 #ifndef _WIN32
1774     int err;
1775
1776     if (cpu->thread_kicked) {
1777         return;
1778     }
1779     cpu->thread_kicked = true;
1780     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1781     if (err && err != ESRCH) {
1782         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1783         exit(1);
1784     }
1785 #else /* _WIN32 */
1786     if (!qemu_cpu_is_self(cpu)) {
1787         if (whpx_enabled()) {
1788             whpx_vcpu_kick(cpu);
1789         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1790             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1791                     __func__, GetLastError());
1792             exit(1);
1793         }
1794     }
1795 #endif
1796 }
1797
1798 void qemu_cpu_kick(CPUState *cpu)
1799 {
1800     qemu_cond_broadcast(cpu->halt_cond);
1801     if (tcg_enabled()) {
1802         cpu_exit(cpu);
1803         /* NOP unless doing single-thread RR */
1804         qemu_cpu_kick_rr_cpu();
1805     } else {
1806         if (hax_enabled()) {
1807             /*
1808              * FIXME: race condition with the exit_request check in
1809              * hax_vcpu_hax_exec
1810              */
1811             cpu->exit_request = 1;
1812         }
1813         qemu_cpu_kick_thread(cpu);
1814     }
1815 }
1816
/* Kick the host thread of the calling vCPU.  current_cpu must be set. */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1822
/* Return true if the calling thread is the thread recorded in
 * @cpu->thread.
 */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1827
1828 bool qemu_in_vcpu_thread(void)
1829 {
1830     return current_cpu && qemu_cpu_is_self(current_cpu);
1831 }
1832
/* Per-thread flag: true while this thread holds the iothread lock.  Set
 * and cleared only by the lock/unlock functions below.
 */
static __thread bool iothread_locked = false;

/* Return true if the calling thread currently holds the iothread lock. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1839
/*
 * The BQL is taken from so many places that it is worth profiling the
 * callers directly, instead of funneling them all through a single function.
 *
 * @file, @line: call site, forwarded to the lock function.
 *
 * Asserts that the calling thread does not already hold the lock, takes
 * qemu_global_mutex through the function currently stored in
 * qemu_bql_mutex_lock_func, then records ownership in the thread-local
 * iothread_locked flag.
 */
void qemu_mutex_lock_iothread_impl(const char *file, int line)
{
    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);

    g_assert(!qemu_mutex_iothread_locked());
    bql_lock(&qemu_global_mutex, file, line);
    iothread_locked = true;
}
1852
/* Release the iothread lock.  The calling thread must hold it.  The
 * ownership flag is cleared before the mutex itself is released.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1859
1860 static bool all_vcpus_paused(void)
1861 {
1862     CPUState *cpu;
1863
1864     CPU_FOREACH(cpu) {
1865         if (!cpu->stopped) {
1866             return false;
1867         }
1868     }
1869
1870     return true;
1871 }
1872
/* Stop every vCPU and wait until all of them report ->stopped.
 *
 * Disables QEMU_CLOCK_VIRTUAL, then requests a stop on each vCPU; when
 * called from a vCPU thread that vCPU is stopped directly.  The function
 * then sleeps on qemu_pause_cond, re-kicking all vCPUs after every
 * wakeup, until all_vcpus_paused() holds.
 *
 * The replay mutex is released for the duration of the wait and taken
 * again before returning; the iothread lock is dropped while it is
 * re-acquired.
 * NOTE(review): this appears to require both the iothread lock and the
 * replay mutex to be held on entry - confirm against callers.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /* Retake the replay mutex without holding the iothread lock. */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1903
/* Let @cpu run again: clear both the stop request and the stopped state,
 * then kick the vCPU so it notices the change.
 */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1910
1911 void resume_all_vcpus(void)
1912 {
1913     CPUState *cpu;
1914
1915     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1916     CPU_FOREACH(cpu) {
1917         cpu_resume(cpu);
1918     }
1919 }
1920
/* Request removal of @cpu and block until its thread has exited.
 *
 * Sets the stop and unplug flags, kicks the vCPU and joins its thread.
 * The iothread lock is released around the join and taken again
 * afterwards, so the caller must hold it on entry.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1930
/* Size, in bytes, of the temporary buffers used to build a vCPU thread
 * name (including the terminating NUL).
 */
#define VCPU_THREAD_NAME_SIZE 16
1933
1934 static void qemu_tcg_init_vcpu(CPUState *cpu)
1935 {
1936     char thread_name[VCPU_THREAD_NAME_SIZE];
1937     static QemuCond *single_tcg_halt_cond;
1938     static QemuThread *single_tcg_cpu_thread;
1939     static int tcg_region_inited;
1940
1941     assert(tcg_enabled());
1942     /*
1943      * Initialize TCG regions--once. Now is a good time, because:
1944      * (1) TCG's init context, prologue and target globals have been set up.
1945      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1946      *     -accel flag is processed, so the check doesn't work then).
1947      */
1948     if (!tcg_region_inited) {
1949         tcg_region_inited = 1;
1950         tcg_region_init();
1951     }
1952
1953     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1954         cpu->thread = g_malloc0(sizeof(QemuThread));
1955         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1956         qemu_cond_init(cpu->halt_cond);
1957
1958         if (qemu_tcg_mttcg_enabled()) {
1959             /* create a thread per vCPU with TCG (MTTCG) */
1960             parallel_cpus = true;
1961             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1962                  cpu->cpu_index);
1963
1964             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1965                                cpu, QEMU_THREAD_JOINABLE);
1966
1967         } else {
1968             /* share a single thread for all cpus with TCG */
1969             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1970             qemu_thread_create(cpu->thread, thread_name,
1971                                qemu_tcg_rr_cpu_thread_fn,
1972                                cpu, QEMU_THREAD_JOINABLE);
1973
1974             single_tcg_halt_cond = cpu->halt_cond;
1975             single_tcg_cpu_thread = cpu->thread;
1976         }
1977 #ifdef _WIN32
1978         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1979 #endif
1980     } else {
1981         /* For non-MTTCG cases we share the thread */
1982         cpu->thread = single_tcg_cpu_thread;
1983         cpu->halt_cond = single_tcg_halt_cond;
1984         cpu->thread_id = first_cpu->thread_id;
1985         cpu->can_do_io = 1;
1986         cpu->created = true;
1987     }
1988 }
1989
1990 static void qemu_hax_start_vcpu(CPUState *cpu)
1991 {
1992     char thread_name[VCPU_THREAD_NAME_SIZE];
1993
1994     cpu->thread = g_malloc0(sizeof(QemuThread));
1995     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1996     qemu_cond_init(cpu->halt_cond);
1997
1998     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1999              cpu->cpu_index);
2000     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2001                        cpu, QEMU_THREAD_JOINABLE);
2002 #ifdef _WIN32
2003     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2004 #endif
2005 }
2006
2007 static void qemu_kvm_start_vcpu(CPUState *cpu)
2008 {
2009     char thread_name[VCPU_THREAD_NAME_SIZE];
2010
2011     cpu->thread = g_malloc0(sizeof(QemuThread));
2012     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2013     qemu_cond_init(cpu->halt_cond);
2014     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2015              cpu->cpu_index);
2016     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2017                        cpu, QEMU_THREAD_JOINABLE);
2018 }
2019
2020 static void qemu_hvf_start_vcpu(CPUState *cpu)
2021 {
2022     char thread_name[VCPU_THREAD_NAME_SIZE];
2023
2024     /* HVF currently does not support TCG, and only runs in
2025      * unrestricted-guest mode. */
2026     assert(hvf_enabled());
2027
2028     cpu->thread = g_malloc0(sizeof(QemuThread));
2029     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2030     qemu_cond_init(cpu->halt_cond);
2031
2032     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2033              cpu->cpu_index);
2034     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2035                        cpu, QEMU_THREAD_JOINABLE);
2036 }
2037
2038 static void qemu_whpx_start_vcpu(CPUState *cpu)
2039 {
2040     char thread_name[VCPU_THREAD_NAME_SIZE];
2041
2042     cpu->thread = g_malloc0(sizeof(QemuThread));
2043     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2044     qemu_cond_init(cpu->halt_cond);
2045     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2046              cpu->cpu_index);
2047     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2048                        cpu, QEMU_THREAD_JOINABLE);
2049 #ifdef _WIN32
2050     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2051 #endif
2052 }
2053
2054 static void qemu_dummy_start_vcpu(CPUState *cpu)
2055 {
2056     char thread_name[VCPU_THREAD_NAME_SIZE];
2057
2058     cpu->thread = g_malloc0(sizeof(QemuThread));
2059     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2060     qemu_cond_init(cpu->halt_cond);
2061     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2062              cpu->cpu_index);
2063     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2064                        QEMU_THREAD_JOINABLE);
2065 }
2066
2067 void qemu_init_vcpu(CPUState *cpu)
2068 {
2069     cpu->nr_cores = smp_cores;
2070     cpu->nr_threads = smp_threads;
2071     cpu->stopped = true;
2072
2073     if (!cpu->as) {
2074         /* If the target cpu hasn't set up any address spaces itself,
2075          * give it the default one.
2076          */
2077         cpu->num_ases = 1;
2078         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2079     }
2080
2081     if (kvm_enabled()) {
2082         qemu_kvm_start_vcpu(cpu);
2083     } else if (hax_enabled()) {
2084         qemu_hax_start_vcpu(cpu);
2085     } else if (hvf_enabled()) {
2086         qemu_hvf_start_vcpu(cpu);
2087     } else if (tcg_enabled()) {
2088         qemu_tcg_init_vcpu(cpu);
2089     } else if (whpx_enabled()) {
2090         qemu_whpx_start_vcpu(cpu);
2091     } else {
2092         qemu_dummy_start_vcpu(cpu);
2093     }
2094
2095     while (!cpu->created) {
2096         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2097     }
2098 }
2099
2100 void cpu_stop_current(void)
2101 {
2102     if (current_cpu) {
2103         current_cpu->stop = true;
2104         cpu_exit(current_cpu);
2105     }
2106 }
2107
2108 int vm_stop(RunState state)
2109 {
2110     if (qemu_in_vcpu_thread()) {
2111         qemu_system_vmstop_request_prepare();
2112         qemu_system_vmstop_request(state);
2113         /*
2114          * FIXME: should not return to device code in case
2115          * vm_stop() has been requested.
2116          */
2117         cpu_stop_current();
2118         return 0;
2119     }
2120
2121     return do_vm_stop(state, true);
2122 }
2123
2124 /**
2125  * Prepare for (re)starting the VM.
2126  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2127  * running or in case of an error condition), 0 otherwise.
2128  */
2129 int vm_prepare_start(void)
2130 {
2131     RunState requested;
2132
2133     qemu_vmstop_requested(&requested);
2134     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2135         return -1;
2136     }
2137
2138     /* Ensure that a STOP/RESUME pair of events is emitted if a
2139      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2140      * example, according to documentation is always followed by
2141      * the STOP event.
2142      */
2143     if (runstate_is_running()) {
2144         qapi_event_send_stop();
2145         qapi_event_send_resume();
2146         return -1;
2147     }
2148
2149     /* We are sending this now, but the CPUs will be resumed shortly later */
2150     qapi_event_send_resume();
2151
2152     replay_enable_events();
2153     cpu_enable_ticks();
2154     runstate_set(RUN_STATE_RUNNING);
2155     vm_state_notify(1, RUN_STATE_RUNNING);
2156     return 0;
2157 }
2158
/*
 * Start (or restart) the VM: resume all vCPUs, but only when
 * vm_prepare_start() returns 0.
 */
void vm_start(void)
{
    if (vm_prepare_start() != 0) {
        return;
    }

    resume_all_vcpus();
}
2165
2166 /* does a state transition even if the VM is already stopped,
2167    current state is forgotten forever */
2168 int vm_stop_force_state(RunState state)
2169 {
2170     if (runstate_is_running()) {
2171         return vm_stop(state);
2172     } else {
2173         runstate_set(state);
2174
2175         bdrv_drain_all();
2176         /* Make sure to return an error if the flush in a previous vm_stop()
2177          * failed. */
2178         return bdrv_flush_all();
2179     }
2180 }
2181
2182 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
2183 {
2184     /* XXX: implement xxx_cpu_list for targets that still miss it */
2185 #if defined(cpu_list)
2186     cpu_list(f, cpu_fprintf);
2187 #endif
2188 }
2189
2190 CpuInfoList *qmp_query_cpus(Error **errp)
2191 {
2192     MachineState *ms = MACHINE(qdev_get_machine());
2193     MachineClass *mc = MACHINE_GET_CLASS(ms);
2194     CpuInfoList *head = NULL, *cur_item = NULL;
2195     CPUState *cpu;
2196
2197     CPU_FOREACH(cpu) {
2198         CpuInfoList *info;
2199 #if defined(TARGET_I386)
2200         X86CPU *x86_cpu = X86_CPU(cpu);
2201         CPUX86State *env = &x86_cpu->env;
2202 #elif defined(TARGET_PPC)
2203         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2204         CPUPPCState *env = &ppc_cpu->env;
2205 #elif defined(TARGET_SPARC)
2206         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2207         CPUSPARCState *env = &sparc_cpu->env;
2208 #elif defined(TARGET_RISCV)
2209         RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2210         CPURISCVState *env = &riscv_cpu->env;
2211 #elif defined(TARGET_MIPS)
2212         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2213         CPUMIPSState *env = &mips_cpu->env;
2214 #elif defined(TARGET_TRICORE)
2215         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2216         CPUTriCoreState *env = &tricore_cpu->env;
2217 #elif defined(TARGET_S390X)
2218         S390CPU *s390_cpu = S390_CPU(cpu);
2219         CPUS390XState *env = &s390_cpu->env;
2220 #endif
2221
2222         cpu_synchronize_state(cpu);
2223
2224         info = g_malloc0(sizeof(*info));
2225         info->value = g_malloc0(sizeof(*info->value));
2226         info->value->CPU = cpu->cpu_index;
2227         info->value->current = (cpu == first_cpu);
2228         info->value->halted = cpu->halted;
2229         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2230         info->value->thread_id = cpu->thread_id;
2231 #if defined(TARGET_I386)
2232         info->value->arch = CPU_INFO_ARCH_X86;
2233         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2234 #elif defined(TARGET_PPC)
2235         info->value->arch = CPU_INFO_ARCH_PPC;
2236         info->value->u.ppc.nip = env->nip;
2237 #elif defined(TARGET_SPARC)
2238         info->value->arch = CPU_INFO_ARCH_SPARC;
2239         info->value->u.q_sparc.pc = env->pc;
2240         info->value->u.q_sparc.npc = env->npc;
2241 #elif defined(TARGET_MIPS)
2242         info->value->arch = CPU_INFO_ARCH_MIPS;
2243         info->value->u.q_mips.PC = env->active_tc.PC;
2244 #elif defined(TARGET_TRICORE)
2245         info->value->arch = CPU_INFO_ARCH_TRICORE;
2246         info->value->u.tricore.PC = env->PC;
2247 #elif defined(TARGET_S390X)
2248         info->value->arch = CPU_INFO_ARCH_S390;
2249         info->value->u.s390.cpu_state = env->cpu_state;
2250 #elif defined(TARGET_RISCV)
2251         info->value->arch = CPU_INFO_ARCH_RISCV;
2252         info->value->u.riscv.pc = env->pc;
2253 #else
2254         info->value->arch = CPU_INFO_ARCH_OTHER;
2255 #endif
2256         info->value->has_props = !!mc->cpu_index_to_instance_props;
2257         if (info->value->has_props) {
2258             CpuInstanceProperties *props;
2259             props = g_malloc0(sizeof(*props));
2260             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2261             info->value->props = props;
2262         }
2263
2264         /* XXX: waiting for the qapi to support GSList */
2265         if (!cur_item) {
2266             head = cur_item = info;
2267         } else {
2268             cur_item->next = info;
2269             cur_item = info;
2270         }
2271     }
2272
2273     return head;
2274 }
2275
2276 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2277 {
2278     /*
2279      * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2280      * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2281      */
2282     switch (target) {
2283     case SYS_EMU_TARGET_I386:
2284     case SYS_EMU_TARGET_X86_64:
2285         return CPU_INFO_ARCH_X86;
2286
2287     case SYS_EMU_TARGET_PPC:
2288     case SYS_EMU_TARGET_PPC64:
2289         return CPU_INFO_ARCH_PPC;
2290
2291     case SYS_EMU_TARGET_SPARC:
2292     case SYS_EMU_TARGET_SPARC64:
2293         return CPU_INFO_ARCH_SPARC;
2294
2295     case SYS_EMU_TARGET_MIPS:
2296     case SYS_EMU_TARGET_MIPSEL:
2297     case SYS_EMU_TARGET_MIPS64:
2298     case SYS_EMU_TARGET_MIPS64EL:
2299         return CPU_INFO_ARCH_MIPS;
2300
2301     case SYS_EMU_TARGET_TRICORE:
2302         return CPU_INFO_ARCH_TRICORE;
2303
2304     case SYS_EMU_TARGET_S390X:
2305         return CPU_INFO_ARCH_S390;
2306
2307     case SYS_EMU_TARGET_RISCV32:
2308     case SYS_EMU_TARGET_RISCV64:
2309         return CPU_INFO_ARCH_RISCV;
2310
2311     default:
2312         return CPU_INFO_ARCH_OTHER;
2313     }
2314 }
2315
2316 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2317 {
2318 #ifdef TARGET_S390X
2319     S390CPU *s390_cpu = S390_CPU(cpu);
2320     CPUS390XState *env = &s390_cpu->env;
2321
2322     info->cpu_state = env->cpu_state;
2323 #else
2324     abort();
2325 #endif
2326 }
2327
2328 /*
2329  * fast means: we NEVER interrupt vCPU threads to retrieve
2330  * information from KVM.
2331  */
2332 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2333 {
2334     MachineState *ms = MACHINE(qdev_get_machine());
2335     MachineClass *mc = MACHINE_GET_CLASS(ms);
2336     CpuInfoFastList *head = NULL, *cur_item = NULL;
2337     SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2338                                           -1, &error_abort);
2339     CPUState *cpu;
2340
2341     CPU_FOREACH(cpu) {
2342         CpuInfoFastList *info = g_malloc0(sizeof(*info));
2343         info->value = g_malloc0(sizeof(*info->value));
2344
2345         info->value->cpu_index = cpu->cpu_index;
2346         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2347         info->value->thread_id = cpu->thread_id;
2348
2349         info->value->has_props = !!mc->cpu_index_to_instance_props;
2350         if (info->value->has_props) {
2351             CpuInstanceProperties *props;
2352             props = g_malloc0(sizeof(*props));
2353             *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2354             info->value->props = props;
2355         }
2356
2357         info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2358         info->value->target = target;
2359         if (target == SYS_EMU_TARGET_S390X) {
2360             cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2361         }
2362
2363         if (!cur_item) {
2364             head = cur_item = info;
2365         } else {
2366             cur_item->next = info;
2367             cur_item = info;
2368         }
2369     }
2370
2371     return head;
2372 }
2373
2374 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2375                  bool has_cpu, int64_t cpu_index, Error **errp)
2376 {
2377     FILE *f;
2378     uint32_t l;
2379     CPUState *cpu;
2380     uint8_t buf[1024];
2381     int64_t orig_addr = addr, orig_size = size;
2382
2383     if (!has_cpu) {
2384         cpu_index = 0;
2385     }
2386
2387     cpu = qemu_get_cpu(cpu_index);
2388     if (cpu == NULL) {
2389         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2390                    "a CPU number");
2391         return;
2392     }
2393
2394     f = fopen(filename, "wb");
2395     if (!f) {
2396         error_setg_file_open(errp, errno, filename);
2397         return;
2398     }
2399
2400     while (size != 0) {
2401         l = sizeof(buf);
2402         if (l > size)
2403             l = size;
2404         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2405             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2406                              " specified", orig_addr, orig_size);
2407             goto exit;
2408         }
2409         if (fwrite(buf, 1, l, f) != l) {
2410             error_setg(errp, QERR_IO_ERROR);
2411             goto exit;
2412         }
2413         addr += l;
2414         size -= l;
2415     }
2416
2417 exit:
2418     fclose(f);
2419 }
2420
2421 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2422                   Error **errp)
2423 {
2424     FILE *f;
2425     uint32_t l;
2426     uint8_t buf[1024];
2427
2428     f = fopen(filename, "wb");
2429     if (!f) {
2430         error_setg_file_open(errp, errno, filename);
2431         return;
2432     }
2433
2434     while (size != 0) {
2435         l = sizeof(buf);
2436         if (l > size)
2437             l = size;
2438         cpu_physical_memory_read(addr, buf, l);
2439         if (fwrite(buf, 1, l, f) != l) {
2440             error_setg(errp, QERR_IO_ERROR);
2441             goto exit;
2442         }
2443         addr += l;
2444         size -= l;
2445     }
2446
2447 exit:
2448     fclose(f);
2449 }
2450
/*
 * Forward an NMI injection request to nmi_monitor_handle(), passing the
 * CPU index reported by monitor_get_cpu_index().  @errp is handed through
 * unchanged, so error reporting is left to nmi_monitor_handle().
 */
void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}
2455
2456 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2457 {
2458     if (!use_icount) {
2459         return;
2460     }
2461
2462     cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
2463                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2464     if (icount_align_option) {
2465         cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
2466         cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
2467     } else {
2468         cpu_fprintf(f, "Max guest delay     NA\n");
2469         cpu_fprintf(f, "Max guest advance   NA\n");
2470     }
2471 }