cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu-common.h"
  27 #include "qemu/config-file.h"
  28 #include "qemu/cutils.h"
  29 #include "migration/vmstate.h"
  30 #include "monitor/monitor.h"
  31 #include "qapi/error.h"
  32 #include "qapi/qapi-commands-misc.h"
  33 #include "qapi/qapi-events-run-state.h"
  34 #include "qapi/qmp/qerror.h"
  35 #include "qemu/error-report.h"
  36 #include "qemu/qemu-print.h"
  37 #include "sysemu/tcg.h"
  38 #include "sysemu/block-backend.h"
  39 #include "exec/gdbstub.h"
  40 #include "sysemu/dma.h"
  41 #include "sysemu/hw_accel.h"
  42 #include "sysemu/kvm.h"
  43 #include "sysemu/hax.h"
  44 #include "sysemu/hvf.h"
  45 #include "sysemu/whpx.h"
  46 #include "exec/exec-all.h"
  47
  48 #include "qemu/thread.h"
  49 #include "qemu/plugin.h"
  50 #include "sysemu/cpus.h"
  51 #include "sysemu/qtest.h"
  52 #include "qemu/main-loop.h"
  53 #include "qemu/option.h"
  54 #include "qemu/bitmap.h"
  55 #include "qemu/seqlock.h"
  56 #include "qemu/guest-random.h"
  57 #include "tcg/tcg.h"
  58 #include "hw/nmi.h"
  59 #include "sysemu/replay.h"
  60 #include "sysemu/runstate.h"
  61 #include "hw/boards.h"
  62 #include "hw/hw.h"
  63
#ifdef CONFIG_LINUX

#include <sys/prctl.h>

/*
 * Fallback values for the PR_MCE_KILL* prctl() constants.  Each one is
 * defined here only when <sys/prctl.h> did not already provide it.
 */
#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */
  81
/* Global mutex; cpu_throttle_thread() passes it to qemu_cond_timedwait(). */
static QemuMutex qemu_global_mutex;

int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
/* Timer created in cpu_ticks_init(); its callback is cpu_throttle_timer_tick(). */
static QEMUTimer *throttle_timer;
/* Current throttle percentage, accessed atomically; 0 means "not throttling". */
static unsigned int throttle_percentage;

/* Bounds that cpu_throttle_set() clamps the requested percentage to. */
#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/* Base throttling time slice in nanoseconds (10 ms). */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
  94
  95 bool cpu_is_stopped(CPUState *cpu)
  96 {
  97     return cpu->stopped || !runstate_is_running();
  98 }
  99
 100 static inline bool cpu_work_list_empty(CPUState *cpu)
 101 {
 102     bool ret;
 103
 104     qemu_mutex_lock(&cpu->work_mutex);
 105     ret = QSIMPLEQ_EMPTY(&cpu->work_list);
 106     qemu_mutex_unlock(&cpu->work_mutex);
 107     return ret;
 108 }
 109
 110 static bool cpu_thread_is_idle(CPUState *cpu)
 111 {
 112     if (cpu->stop || !cpu_work_list_empty(cpu)) {
 113         return false;
 114     }
 115     if (cpu_is_stopped(cpu)) {
 116         return true;
 117     }
 118     if (!cpu->halted || cpu_has_work(cpu) ||
 119         kvm_halt_in_kernel()) {
 120         return false;
 121     }
 122     return true;
 123 }
 124
 125 static bool all_cpu_threads_idle(void)
 126 {
 127     CPUState *cpu;
 128
 129     CPU_FOREACH(cpu) {
 130         if (!cpu_thread_is_idle(cpu)) {
 131             return false;
 132         }
 133     }
 134     return true;
 135 }
 136
/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

/*
 * Copy of the icount "sleep" option (see configure_icount()).  When false,
 * qemu_start_warp_timer() jumps QEMU_CLOCK_VIRTUAL straight to the next
 * deadline instead of arming the warp timer.
 */
static bool icount_sleep = true;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
/* Largest value icount_time_shift may take. */
#define MAX_ICOUNT_SHIFT 10
 145
/*
 * State behind the guest-visible clocks: host tick/clock offsets, the
 * instruction counter with its bias and shift, and the icount timers.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    /* Non-zero between cpu_enable_ticks() and cpu_disable_ticks(). */
    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks.  */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;

    /* QEMU_CLOCK_VIRTUAL_RT value at which a warp started, or -1 if none. */
    int64_t vm_clock_warp_start;
    /* Offset added to get_clock() by cpu_get_clock_locked(). */
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    QEMUTimer *icount_warp_timer;
} TimersState;
 176
/* The single TimersState instance used by all clock/icount code in this file. */
static TimersState timers_state;
/* NOTE(review): presumably true when multi-threaded TCG is enabled; it is not assigned in the visible part of this file -- confirm. */
bool mttcg_enabled;
 179
 180
 181 /* The current number of executed instructions is based on what we
 182  * originally budgeted minus the current state of the decrementing
 183  * icount counters in extra/u16.low.
 184  */
 185 static int64_t cpu_get_icount_executed(CPUState *cpu)
 186 {
 187     return (cpu->icount_budget -
 188             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
 189 }
 190
 191 /*
 192  * Update the global shared timer_state.qemu_icount to take into
 193  * account executed instructions. This is done by the TCG vCPU
 194  * thread so the main-loop can see time has moved forward.
 195  */
 196 static void cpu_update_icount_locked(CPUState *cpu)
 197 {
 198     int64_t executed = cpu_get_icount_executed(cpu);
 199     cpu->icount_budget -= executed;
 200
 201     atomic_set_i64(&timers_state.qemu_icount,
 202                    timers_state.qemu_icount + executed);
 203 }
 204
 205 /*
 206  * Update the global shared timer_state.qemu_icount to take into
 207  * account executed instructions. This is done by the TCG vCPU
 208  * thread so the main-loop can see time has moved forward.
 209  */
 210 void cpu_update_icount(CPUState *cpu)
 211 {
 212     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 213                        &timers_state.vm_clock_lock);
 214     cpu_update_icount_locked(cpu);
 215     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 216                          &timers_state.vm_clock_lock);
 217 }
 218
 219 static int64_t cpu_get_icount_raw_locked(void)
 220 {
 221     CPUState *cpu = current_cpu;
 222
 223     if (cpu && cpu->running) {
 224         if (!cpu->can_do_io) {
 225             error_report("Bad icount read");
 226             exit(1);
 227         }
 228         /* Take into account what has run */
 229         cpu_update_icount_locked(cpu);
 230     }
 231     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 232     return atomic_read_i64(&timers_state.qemu_icount);
 233 }
 234
 235 static int64_t cpu_get_icount_locked(void)
 236 {
 237     int64_t icount = cpu_get_icount_raw_locked();
 238     return atomic_read_i64(&timers_state.qemu_icount_bias) +
 239         cpu_icount_to_ns(icount);
 240 }
 241
 242 int64_t cpu_get_icount_raw(void)
 243 {
 244     int64_t icount;
 245     unsigned start;
 246
 247     do {
 248         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 249         icount = cpu_get_icount_raw_locked();
 250     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 251
 252     return icount;
 253 }
 254
 255 /* Return the virtual CPU time, based on the instruction counter.  */
 256 int64_t cpu_get_icount(void)
 257 {
 258     int64_t icount;
 259     unsigned start;
 260
 261     do {
 262         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 263         icount = cpu_get_icount_locked();
 264     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 265
 266     return icount;
 267 }
 268
 269 int64_t cpu_icount_to_ns(int64_t icount)
 270 {
 271     return icount << atomic_read(&timers_state.icount_time_shift);
 272 }
 273
 274 static int64_t cpu_get_ticks_locked(void)
 275 {
 276     int64_t ticks = timers_state.cpu_ticks_offset;
 277     if (timers_state.cpu_ticks_enabled) {
 278         ticks += cpu_get_host_ticks();
 279     }
 280
 281     if (timers_state.cpu_ticks_prev > ticks) {
 282         /* Non increasing ticks may happen if the host uses software suspend.  */
 283         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 284         ticks = timers_state.cpu_ticks_prev;
 285     }
 286
 287     timers_state.cpu_ticks_prev = ticks;
 288     return ticks;
 289 }
 290
 291 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 292  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 293  * counter.
 294  */
 295 int64_t cpu_get_ticks(void)
 296 {
 297     int64_t ticks;
 298
 299     if (use_icount) {
 300         return cpu_get_icount();
 301     }
 302
 303     qemu_spin_lock(&timers_state.vm_clock_lock);
 304     ticks = cpu_get_ticks_locked();
 305     qemu_spin_unlock(&timers_state.vm_clock_lock);
 306     return ticks;
 307 }
 308
 309 static int64_t cpu_get_clock_locked(void)
 310 {
 311     int64_t time;
 312
 313     time = timers_state.cpu_clock_offset;
 314     if (timers_state.cpu_ticks_enabled) {
 315         time += get_clock();
 316     }
 317
 318     return time;
 319 }
 320
 321 /* Return the monotonic time elapsed in VM, i.e.,
 322  * the time between vm_start and vm_stop
 323  */
 324 int64_t cpu_get_clock(void)
 325 {
 326     int64_t ti;
 327     unsigned start;
 328
 329     do {
 330         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 331         ti = cpu_get_clock_locked();
 332     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 333
 334     return ti;
 335 }
 336
 337 /* enable cpu_get_ticks()
 338  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 339  */
 340 void cpu_enable_ticks(void)
 341 {
 342     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 343                        &timers_state.vm_clock_lock);
 344     if (!timers_state.cpu_ticks_enabled) {
 345         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 346         timers_state.cpu_clock_offset -= get_clock();
 347         timers_state.cpu_ticks_enabled = 1;
 348     }
 349     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 350                        &timers_state.vm_clock_lock);
 351 }
 352
 353 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 354  * cpu_get_ticks() after that.
 355  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 356  */
 357 void cpu_disable_ticks(void)
 358 {
 359     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 360                        &timers_state.vm_clock_lock);
 361     if (timers_state.cpu_ticks_enabled) {
 362         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 363         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 364         timers_state.cpu_ticks_enabled = 0;
 365     }
 366     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 367                          &timers_state.vm_clock_lock);
 368 }
 369
/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.
   The value is one tenth of a second, expressed in nanoseconds; it is
   used as the tolerance in icount_adjust().  */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 375
 376 static void icount_adjust(void)
 377 {
 378     int64_t cur_time;
 379     int64_t cur_icount;
 380     int64_t delta;
 381
 382     /* Protected by TimersState mutex.  */
 383     static int64_t last_delta;
 384
 385     /* If the VM is not running, then do nothing.  */
 386     if (!runstate_is_running()) {
 387         return;
 388     }
 389
 390     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 391                        &timers_state.vm_clock_lock);
 392     cur_time = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 393                                    cpu_get_clock_locked());
 394     cur_icount = cpu_get_icount_locked();
 395
 396     delta = cur_icount - cur_time;
 397     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 398     if (delta > 0
 399         && last_delta + ICOUNT_WOBBLE < delta * 2
 400         && timers_state.icount_time_shift > 0) {
 401         /* The guest is getting too far ahead.  Slow time down.  */
 402         atomic_set(&timers_state.icount_time_shift,
 403                    timers_state.icount_time_shift - 1);
 404     }
 405     if (delta < 0
 406         && last_delta - ICOUNT_WOBBLE > delta * 2
 407         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 408         /* The guest is getting too far behind.  Speed time up.  */
 409         atomic_set(&timers_state.icount_time_shift,
 410                    timers_state.icount_time_shift + 1);
 411     }
 412     last_delta = delta;
 413     atomic_set_i64(&timers_state.qemu_icount_bias,
 414                    cur_icount - (timers_state.qemu_icount
 415                                  << timers_state.icount_time_shift));
 416     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 417                          &timers_state.vm_clock_lock);
 418 }
 419
 420 static void icount_adjust_rt(void *opaque)
 421 {
 422     timer_mod(timers_state.icount_rt_timer,
 423               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 424     icount_adjust();
 425 }
 426
 427 static void icount_adjust_vm(void *opaque)
 428 {
 429     timer_mod(timers_state.icount_vm_timer,
 430                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 431                    NANOSECONDS_PER_SECOND / 10);
 432     icount_adjust();
 433 }
 434
 435 static int64_t qemu_icount_round(int64_t count)
 436 {
 437     int shift = atomic_read(&timers_state.icount_time_shift);
 438     return (count + (1 << shift) - 1) >> shift;
 439 }
 440
 441 static void icount_warp_rt(void)
 442 {
 443     unsigned seq;
 444     int64_t warp_start;
 445
 446     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 447      * changes from -1 to another value, so the race here is okay.
 448      */
 449     do {
 450         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 451         warp_start = timers_state.vm_clock_warp_start;
 452     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 453
 454     if (warp_start == -1) {
 455         return;
 456     }
 457
 458     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 459                        &timers_state.vm_clock_lock);
 460     if (runstate_is_running()) {
 461         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 462                                             cpu_get_clock_locked());
 463         int64_t warp_delta;
 464
 465         warp_delta = clock - timers_state.vm_clock_warp_start;
 466         if (use_icount == 2) {
 467             /*
 468              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 469              * far ahead of real time.
 470              */
 471             int64_t cur_icount = cpu_get_icount_locked();
 472             int64_t delta = clock - cur_icount;
 473             warp_delta = MIN(warp_delta, delta);
 474         }
 475         atomic_set_i64(&timers_state.qemu_icount_bias,
 476                        timers_state.qemu_icount_bias + warp_delta);
 477     }
 478     timers_state.vm_clock_warp_start = -1;
 479     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 480                        &timers_state.vm_clock_lock);
 481
 482     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 483         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 484     }
 485 }
 486
 487 static void icount_timer_cb(void *opaque)
 488 {
 489     /* No need for a checkpoint because the timer already synchronizes
 490      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 491      */
 492     icount_warp_rt();
 493 }
 494
 495 void qtest_clock_warp(int64_t dest)
 496 {
 497     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 498     AioContext *aio_context;
 499     assert(qtest_enabled());
 500     aio_context = qemu_get_aio_context();
 501     while (clock < dest) {
 502         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 503                                                       QEMU_TIMER_ATTR_ALL);
 504         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 505
 506         seqlock_write_lock(&timers_state.vm_clock_seqlock,
 507                            &timers_state.vm_clock_lock);
 508         atomic_set_i64(&timers_state.qemu_icount_bias,
 509                        timers_state.qemu_icount_bias + warp);
 510         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 511                              &timers_state.vm_clock_lock);
 512
 513         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 514         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 515         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 516     }
 517     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 518 }
 519
 520 void qemu_start_warp_timer(void)
 521 {
 522     int64_t clock;
 523     int64_t deadline;
 524
 525     if (!use_icount) {
 526         return;
 527     }
 528
 529     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 530      * do not fire, so computing the deadline does not make sense.
 531      */
 532     if (!runstate_is_running()) {
 533         return;
 534     }
 535
 536     if (replay_mode != REPLAY_MODE_PLAY) {
 537         if (!all_cpu_threads_idle()) {
 538             return;
 539         }
 540
 541         if (qtest_enabled()) {
 542             /* When testing, qtest commands advance icount.  */
 543             return;
 544         }
 545
 546         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
 547     } else {
 548         /* warp clock deterministically in record/replay mode */
 549         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 550             /* vCPU is sleeping and warp can't be started.
 551                It is probably a race condition: notification sent
 552                to vCPU was processed in advance and vCPU went to sleep.
 553                Therefore we have to wake it up for doing someting. */
 554             if (replay_has_checkpoint()) {
 555                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 556             }
 557             return;
 558         }
 559     }
 560
 561     /* We want to use the earliest deadline from ALL vm_clocks */
 562     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 563     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 564                                           ~QEMU_TIMER_ATTR_EXTERNAL);
 565     if (deadline < 0) {
 566         static bool notified;
 567         if (!icount_sleep && !notified) {
 568             warn_report("icount sleep disabled and no active timers");
 569             notified = true;
 570         }
 571         return;
 572     }
 573
 574     if (deadline > 0) {
 575         /*
 576          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 577          * sleep.  Otherwise, the CPU might be waiting for a future timer
 578          * interrupt to wake it up, but the interrupt never comes because
 579          * the vCPU isn't running any insns and thus doesn't advance the
 580          * QEMU_CLOCK_VIRTUAL.
 581          */
 582         if (!icount_sleep) {
 583             /*
 584              * We never let VCPUs sleep in no sleep icount mode.
 585              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 586              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 587              * It is useful when we want a deterministic execution time,
 588              * isolated from host latencies.
 589              */
 590             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 591                                &timers_state.vm_clock_lock);
 592             atomic_set_i64(&timers_state.qemu_icount_bias,
 593                            timers_state.qemu_icount_bias + deadline);
 594             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 595                                  &timers_state.vm_clock_lock);
 596             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 597         } else {
 598             /*
 599              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 600              * "real" time, (related to the time left until the next event) has
 601              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 602              * This avoids that the warps are visible externally; for example,
 603              * you will not be sending network packets continuously instead of
 604              * every 100ms.
 605              */
 606             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 607                                &timers_state.vm_clock_lock);
 608             if (timers_state.vm_clock_warp_start == -1
 609                 || timers_state.vm_clock_warp_start > clock) {
 610                 timers_state.vm_clock_warp_start = clock;
 611             }
 612             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 613                                  &timers_state.vm_clock_lock);
 614             timer_mod_anticipate(timers_state.icount_warp_timer,
 615                                  clock + deadline);
 616         }
 617     } else if (deadline == 0) {
 618         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 619     }
 620 }
 621
 622 static void qemu_account_warp_timer(void)
 623 {
 624     if (!use_icount || !icount_sleep) {
 625         return;
 626     }
 627
 628     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 629      * do not fire, so computing the deadline does not make sense.
 630      */
 631     if (!runstate_is_running()) {
 632         return;
 633     }
 634
 635     /* warp clock deterministically in record/replay mode */
 636     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 637         return;
 638     }
 639
 640     timer_del(timers_state.icount_warp_timer);
 641     icount_warp_rt();
 642 }
 643
 644 static bool icount_state_needed(void *opaque)
 645 {
 646     return use_icount;
 647 }
 648
 649 static bool warp_timer_state_needed(void *opaque)
 650 {
 651     TimersState *s = opaque;
 652     return s->icount_warp_timer != NULL;
 653 }
 654
 655 static bool adjust_timers_state_needed(void *opaque)
 656 {
 657     TimersState *s = opaque;
 658     return s->icount_rt_timer != NULL;
 659 }
 660
 661 static bool shift_state_needed(void *opaque)
 662 {
 663     return use_icount == 2;
 664 }
 665
/*
 * Subsection for warp timer migration.  It is optional because the warp
 * timer may not have been created: it is included only when
 * warp_timer_state_needed() returns true.  Carries vm_clock_warp_start
 * and the warp timer itself.
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 680
/*
 * Subsection carrying the two icount adjustment timers.  Included only
 * when adjust_timers_state_needed() returns true.
 */
static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 692
/*
 * Subsection carrying icount_time_shift.  Included only when
 * shift_state_needed() returns true, i.e. when use_icount == 2.
 */
static const VMStateDescription icount_vmstate_shift = {
    .name = "timer/icount/shift",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = shift_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT16(icount_time_shift, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 703
/*
 * This is a subsection for icount migration.  It is included only when
 * icount_state_needed() returns true and carries the icount bias and the
 * instruction counter, plus the optional warp-timer, adjustment-timer
 * and shift subsections.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        &icount_vmstate_shift,
        NULL
    }
};
 724
/*
 * Top-level migration description for TimersState, registered in
 * cpu_ticks_init().  cpu_clock_offset is only present from version 2 on.
 * The icount state travels in the "timer/icount" subsection.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        /* NOTE(review): 8 unused bytes, presumably a field that no longer exists -- confirm. */
        VMSTATE_UNUSED(8),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
 740
 741 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 742 {
 743     double pct;
 744     double throttle_ratio;
 745     int64_t sleeptime_ns, endtime_ns;
 746
 747     if (!cpu_throttle_get_percentage()) {
 748         return;
 749     }
 750
 751     pct = (double)cpu_throttle_get_percentage()/100;
 752     throttle_ratio = pct / (1 - pct);
 753     /* Add 1ns to fix double's rounding error (like 0.9999999...) */
 754     sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
 755     endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
 756     while (sleeptime_ns > 0 && !cpu->stop) {
 757         if (sleeptime_ns > SCALE_MS) {
 758             qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
 759                                 sleeptime_ns / SCALE_MS);
 760         } else {
 761             qemu_mutex_unlock_iothread();
 762             g_usleep(sleeptime_ns / SCALE_US);
 763             qemu_mutex_lock_iothread();
 764         }
 765         sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 766     }
 767     atomic_set(&cpu->throttle_thread_scheduled, 0);
 768 }
 769
 770 static void cpu_throttle_timer_tick(void *opaque)
 771 {
 772     CPUState *cpu;
 773     double pct;
 774
 775     /* Stop the timer if needed */
 776     if (!cpu_throttle_get_percentage()) {
 777         return;
 778     }
 779     CPU_FOREACH(cpu) {
 780         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 781             async_run_on_cpu(cpu, cpu_throttle_thread,
 782                              RUN_ON_CPU_NULL);
 783         }
 784     }
 785
 786     pct = (double)cpu_throttle_get_percentage()/100;
 787     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 788                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 789 }
 790
 791 void cpu_throttle_set(int new_throttle_pct)
 792 {
 793     /* Ensure throttle percentage is within valid range */
 794     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 795     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 796
 797     atomic_set(&throttle_percentage, new_throttle_pct);
 798
 799     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 800                                        CPU_THROTTLE_TIMESLICE_NS);
 801 }
 802
 803 void cpu_throttle_stop(void)
 804 {
 805     atomic_set(&throttle_percentage, 0);
 806 }
 807
 808 bool cpu_throttle_active(void)
 809 {
 810     return (cpu_throttle_get_percentage() != 0);
 811 }
 812
 813 int cpu_throttle_get_percentage(void)
 814 {
 815     return atomic_read(&throttle_percentage);
 816 }
 817
 818 void cpu_ticks_init(void)
 819 {
 820     seqlock_init(&timers_state.vm_clock_seqlock);
 821     qemu_spin_init(&timers_state.vm_clock_lock);
 822     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 823     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 824                                            cpu_throttle_timer_tick, NULL);
 825 }
 826
 827 void configure_icount(QemuOpts *opts, Error **errp)
 828 {
 829     const char *option = qemu_opt_get(opts, "shift");
 830     bool sleep = qemu_opt_get_bool(opts, "sleep", true);
 831     bool align = qemu_opt_get_bool(opts, "align", false);
 832     long time_shift = -1;
 833
 834     if (!option) {
 835         if (qemu_opt_get(opts, "align") != NULL) {
 836             error_setg(errp, "Please specify shift option when using align");
 837         }
 838         return;
 839     }
 840
 841     if (align && !sleep) {
 842         error_setg(errp, "align=on and sleep=off are incompatible");
 843         return;
 844     }
 845
 846     if (strcmp(option, "auto") != 0) {
 847         if (qemu_strtol(option, NULL, 0, &time_shift) < 0
 848             || time_shift < 0 || time_shift > MAX_ICOUNT_SHIFT) {
 849             error_setg(errp, "icount: Invalid shift value");
 850             return;
 851         }
 852     } else if (icount_align_option) {
 853         error_setg(errp, "shift=auto and align=on are incompatible");
 854         return;
 855     } else if (!icount_sleep) {
 856         error_setg(errp, "shift=auto and sleep=off are incompatible");
 857         return;
 858     }
 859
 860     icount_sleep = sleep;
 861     if (icount_sleep) {
 862         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 863                                          icount_timer_cb, NULL);
 864     }
 865
 866     icount_align_option = align;
 867
 868     if (time_shift >= 0) {
 869         timers_state.icount_time_shift = time_shift;
 870         use_icount = 1;
 871         return;
 872     }
 873
 874     use_icount = 2;
 875
 876     /* 125MIPS seems a reasonable initial guess at the guest speed.
 877        It will be corrected fairly quickly anyway.  */
 878     timers_state.icount_time_shift = 3;
 879
 880     /* Have both realtime and virtual time triggers for speed adjustment.
 881        The realtime trigger catches emulated time passing too slowly,
 882        the virtual time trigger catches emulated time passing too fast.
 883        Realtime triggers occur even when idle, so use them less frequently
 884        than VM triggers.  */
 885     timers_state.vm_clock_warp_start = -1;
 886     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 887                                    icount_adjust_rt, NULL);
 888     timer_mod(timers_state.icount_rt_timer,
 889                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 890     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 891                                         icount_adjust_vm, NULL);
 892     timer_mod(timers_state.icount_vm_timer,
 893                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 894                    NANOSECONDS_PER_SECOND / 10);
 895 }
 896
/***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */

static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU currently scheduled by the round-robin loop; read with atomic_mb_read(). */
static CPUState *tcg_current_rr_cpu;

/* Kick period: one tenth of a second, in nanoseconds. */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 913
 914 static inline int64_t qemu_tcg_next_kick(void)
 915 {
 916     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 917 }
 918
 919 /* Kick the currently round-robin scheduled vCPU to next */
 920 static void qemu_cpu_kick_rr_next_cpu(void)
 921 {
 922     CPUState *cpu;
 923     do {
 924         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 925         if (cpu) {
 926             cpu_exit(cpu);
 927         }
 928     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 929 }
 930
 931 /* Kick all RR vCPUs */
 932 static void qemu_cpu_kick_rr_cpus(void)
 933 {
 934     CPUState *cpu;
 935
 936     CPU_FOREACH(cpu) {
 937         cpu_exit(cpu);
 938     };
 939 }
 940
 941 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
 942 {
 943 }
 944
 945 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 946 {
 947     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 948         qemu_notify_event();
 949         return;
 950     }
 951
 952     if (qemu_in_vcpu_thread()) {
 953         /* A CPU is currently running; kick it back out to the
 954          * tcg_cpu_exec() loop so it will recalculate its
 955          * icount deadline immediately.
 956          */
 957         qemu_cpu_kick(current_cpu);
 958     } else if (first_cpu) {
 959         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 960          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 961          * causes cpu_thread_is_idle to return false.  This way,
 962          * handle_icount_deadline can run.
 963          * If we have no CPUs at all for some reason, we don't
 964          * need to do anything.
 965          */
 966         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 967     }
 968 }
 969
 970 static void kick_tcg_thread(void *opaque)
 971 {
 972     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 973     qemu_cpu_kick_rr_next_cpu();
 974 }
 975
 976 static void start_tcg_kick_timer(void)
 977 {
 978     assert(!mttcg_enabled);
 979     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 980         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 981                                            kick_tcg_thread, NULL);
 982     }
 983     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
 984         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 985     }
 986 }
 987
 988 static void stop_tcg_kick_timer(void)
 989 {
 990     assert(!mttcg_enabled);
 991     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
 992         timer_del(tcg_kick_vcpu_timer);
 993     }
 994 }
 995
 996 /***********************************************************/
 997 void hw_error(const char *fmt, ...)
 998 {
 999     va_list ap;
1000     CPUState *cpu;
1001
1002     va_start(ap, fmt);
1003     fprintf(stderr, "qemu: hardware error: ");
1004     vfprintf(stderr, fmt, ap);
1005     fprintf(stderr, "\n");
1006     CPU_FOREACH(cpu) {
1007         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1008         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1009     }
1010     va_end(ap);
1011     abort();
1012 }
1013
1014 void cpu_synchronize_all_states(void)
1015 {
1016     CPUState *cpu;
1017
1018     CPU_FOREACH(cpu) {
1019         cpu_synchronize_state(cpu);
1020         /* TODO: move to cpu_synchronize_state() */
1021         if (hvf_enabled()) {
1022             hvf_cpu_synchronize_state(cpu);
1023         }
1024     }
1025 }
1026
1027 void cpu_synchronize_all_post_reset(void)
1028 {
1029     CPUState *cpu;
1030
1031     CPU_FOREACH(cpu) {
1032         cpu_synchronize_post_reset(cpu);
1033         /* TODO: move to cpu_synchronize_post_reset() */
1034         if (hvf_enabled()) {
1035             hvf_cpu_synchronize_post_reset(cpu);
1036         }
1037     }
1038 }
1039
1040 void cpu_synchronize_all_post_init(void)
1041 {
1042     CPUState *cpu;
1043
1044     CPU_FOREACH(cpu) {
1045         cpu_synchronize_post_init(cpu);
1046         /* TODO: move to cpu_synchronize_post_init() */
1047         if (hvf_enabled()) {
1048             hvf_cpu_synchronize_post_init(cpu);
1049         }
1050     }
1051 }
1052
1053 void cpu_synchronize_all_pre_loadvm(void)
1054 {
1055     CPUState *cpu;
1056
1057     CPU_FOREACH(cpu) {
1058         cpu_synchronize_pre_loadvm(cpu);
1059     }
1060 }
1061
1062 static int do_vm_stop(RunState state, bool send_stop)
1063 {
1064     int ret = 0;
1065
1066     if (runstate_is_running()) {
1067         runstate_set(state);
1068         cpu_disable_ticks();
1069         pause_all_vcpus();
1070         vm_state_notify(0, state);
1071         if (send_stop) {
1072             qapi_event_send_stop();
1073         }
1074     }
1075
1076     bdrv_drain_all();
1077     ret = bdrv_flush_all();
1078
1079     return ret;
1080 }
1081
1082 /* Special vm_stop() variant for terminating the process.  Historically clients
1083  * did not expect a QMP STOP event and so we need to retain compatibility.
1084  */
1085 int vm_shutdown(void)
1086 {
1087     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1088 }
1089
1090 static bool cpu_can_run(CPUState *cpu)
1091 {
1092     if (cpu->stop) {
1093         return false;
1094     }
1095     if (cpu_is_stopped(cpu)) {
1096         return false;
1097     }
1098     return true;
1099 }
1100
1101 static void cpu_handle_guest_debug(CPUState *cpu)
1102 {
1103     gdb_set_stop_cpu(cpu);
1104     qemu_system_debug_request();
1105     cpu->stopped = true;
1106 }
1107
1108 #ifdef CONFIG_LINUX
1109 static void sigbus_reraise(void)
1110 {
1111     sigset_t set;
1112     struct sigaction action;
1113
1114     memset(&action, 0, sizeof(action));
1115     action.sa_handler = SIG_DFL;
1116     if (!sigaction(SIGBUS, &action, NULL)) {
1117         raise(SIGBUS);
1118         sigemptyset(&set);
1119         sigaddset(&set, SIGBUS);
1120         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1121     }
1122     perror("Failed to re-raise SIGBUS!\n");
1123     abort();
1124 }
1125
1126 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1127 {
1128     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1129         sigbus_reraise();
1130     }
1131
1132     if (current_cpu) {
1133         /* Called asynchronously in VCPU thread.  */
1134         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1135             sigbus_reraise();
1136         }
1137     } else {
1138         /* Called synchronously (via signalfd) in main thread.  */
1139         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1140             sigbus_reraise();
1141         }
1142     }
1143 }
1144
1145 static void qemu_init_sigbus(void)
1146 {
1147     struct sigaction action;
1148
1149     memset(&action, 0, sizeof(action));
1150     action.sa_flags = SA_SIGINFO;
1151     action.sa_sigaction = sigbus_handler;
1152     sigaction(SIGBUS, &action, NULL);
1153
1154     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1155 }
1156 #else /* !CONFIG_LINUX */
/* Non-Linux hosts: no SIGBUS handling is set up */
static void qemu_init_sigbus(void)
{
    /* nothing to do */
}
1160 #endif /* !CONFIG_LINUX */
1161
1162 static QemuThread io_thread;
1163
1164 /* cpu creation */
1165 static QemuCond qemu_cpu_cond;
1166 /* system init */
1167 static QemuCond qemu_pause_cond;
1168
1169 void qemu_init_cpu_loop(void)
1170 {
1171     qemu_init_sigbus();
1172     qemu_cond_init(&qemu_cpu_cond);
1173     qemu_cond_init(&qemu_pause_cond);
1174     qemu_mutex_init(&qemu_global_mutex);
1175
1176     qemu_thread_get_self(&io_thread);
1177 }
1178
1179 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1180 {
1181     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1182 }
1183
1184 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1185 {
1186     if (kvm_destroy_vcpu(cpu) < 0) {
1187         error_report("kvm_destroy_vcpu failed");
1188         exit(EXIT_FAILURE);
1189     }
1190 }
1191
1192 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1193 {
1194 }
1195
1196 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1197 {
1198     g_assert(qemu_cpu_is_self(cpu));
1199     cpu->stop = false;
1200     cpu->stopped = true;
1201     if (exit) {
1202         cpu_exit(cpu);
1203     }
1204     qemu_cond_broadcast(&qemu_pause_cond);
1205 }
1206
1207 static void qemu_wait_io_event_common(CPUState *cpu)
1208 {
1209     atomic_mb_set(&cpu->thread_kicked, false);
1210     if (cpu->stop) {
1211         qemu_cpu_stop(cpu, false);
1212     }
1213     process_queued_cpu_work(cpu);
1214 }
1215
1216 static void qemu_tcg_rr_wait_io_event(void)
1217 {
1218     CPUState *cpu;
1219
1220     while (all_cpu_threads_idle()) {
1221         stop_tcg_kick_timer();
1222         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1223     }
1224
1225     start_tcg_kick_timer();
1226
1227     CPU_FOREACH(cpu) {
1228         qemu_wait_io_event_common(cpu);
1229     }
1230 }
1231
1232 static void qemu_wait_io_event(CPUState *cpu)
1233 {
1234     bool slept = false;
1235
1236     while (cpu_thread_is_idle(cpu)) {
1237         if (!slept) {
1238             slept = true;
1239             qemu_plugin_vcpu_idle_cb(cpu);
1240         }
1241         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1242     }
1243     if (slept) {
1244         qemu_plugin_vcpu_resume_cb(cpu);
1245     }
1246
1247 #ifdef _WIN32
1248     /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1249     if (!tcg_enabled()) {
1250         SleepEx(0, TRUE);
1251     }
1252 #endif
1253     qemu_wait_io_event_common(cpu);
1254 }
1255
1256 static void *qemu_kvm_cpu_thread_fn(void *arg)
1257 {
1258     CPUState *cpu = arg;
1259     int r;
1260
1261     rcu_register_thread();
1262
1263     qemu_mutex_lock_iothread();
1264     qemu_thread_get_self(cpu->thread);
1265     cpu->thread_id = qemu_get_thread_id();
1266     cpu->can_do_io = 1;
1267     current_cpu = cpu;
1268
1269     r = kvm_init_vcpu(cpu);
1270     if (r < 0) {
1271         error_report("kvm_init_vcpu failed: %s", strerror(-r));
1272         exit(1);
1273     }
1274
1275     kvm_init_cpu_signals(cpu);
1276
1277     /* signal CPU creation */
1278     cpu->created = true;
1279     qemu_cond_signal(&qemu_cpu_cond);
1280     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1281
1282     do {
1283         if (cpu_can_run(cpu)) {
1284             r = kvm_cpu_exec(cpu);
1285             if (r == EXCP_DEBUG) {
1286                 cpu_handle_guest_debug(cpu);
1287             }
1288         }
1289         qemu_wait_io_event(cpu);
1290     } while (!cpu->unplug || cpu_can_run(cpu));
1291
1292     qemu_kvm_destroy_vcpu(cpu);
1293     cpu->created = false;
1294     qemu_cond_signal(&qemu_cpu_cond);
1295     qemu_mutex_unlock_iothread();
1296     rcu_unregister_thread();
1297     return NULL;
1298 }
1299
1300 static void *qemu_dummy_cpu_thread_fn(void *arg)
1301 {
1302 #ifdef _WIN32
1303     error_report("qtest is not supported under Windows");
1304     exit(1);
1305 #else
1306     CPUState *cpu = arg;
1307     sigset_t waitset;
1308     int r;
1309
1310     rcu_register_thread();
1311
1312     qemu_mutex_lock_iothread();
1313     qemu_thread_get_self(cpu->thread);
1314     cpu->thread_id = qemu_get_thread_id();
1315     cpu->can_do_io = 1;
1316     current_cpu = cpu;
1317
1318     sigemptyset(&waitset);
1319     sigaddset(&waitset, SIG_IPI);
1320
1321     /* signal CPU creation */
1322     cpu->created = true;
1323     qemu_cond_signal(&qemu_cpu_cond);
1324     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1325
1326     do {
1327         qemu_mutex_unlock_iothread();
1328         do {
1329             int sig;
1330             r = sigwait(&waitset, &sig);
1331         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1332         if (r == -1) {
1333             perror("sigwait");
1334             exit(1);
1335         }
1336         qemu_mutex_lock_iothread();
1337         qemu_wait_io_event(cpu);
1338     } while (!cpu->unplug);
1339
1340     qemu_mutex_unlock_iothread();
1341     rcu_unregister_thread();
1342     return NULL;
1343 #endif
1344 }
1345
1346 static int64_t tcg_get_icount_limit(void)
1347 {
1348     int64_t deadline;
1349
1350     if (replay_mode != REPLAY_MODE_PLAY) {
1351         /*
1352          * Include all the timers, because they may need an attention.
1353          * Too long CPU execution may create unnecessary delay in UI.
1354          */
1355         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1356                                               QEMU_TIMER_ATTR_ALL);
1357         /* Check realtime timers, because they help with input processing */
1358         deadline = qemu_soonest_timeout(deadline,
1359                 qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
1360                                            QEMU_TIMER_ATTR_ALL));
1361
1362         /* Maintain prior (possibly buggy) behaviour where if no deadline
1363          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1364          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1365          * nanoseconds.
1366          */
1367         if ((deadline < 0) || (deadline > INT32_MAX)) {
1368             deadline = INT32_MAX;
1369         }
1370
1371         return qemu_icount_round(deadline);
1372     } else {
1373         return replay_get_instructions();
1374     }
1375 }
1376
1377 static void notify_aio_contexts(void)
1378 {
1379     /* Wake up other AioContexts.  */
1380     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1381     qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1382 }
1383
1384 static void handle_icount_deadline(void)
1385 {
1386     assert(qemu_in_vcpu_thread());
1387     if (use_icount) {
1388         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1389                                                       QEMU_TIMER_ATTR_ALL);
1390
1391         if (deadline == 0) {
1392             notify_aio_contexts();
1393         }
1394     }
1395 }
1396
1397 static void prepare_icount_for_run(CPUState *cpu)
1398 {
1399     if (use_icount) {
1400         int insns_left;
1401
1402         /* These should always be cleared by process_icount_data after
1403          * each vCPU execution. However u16.high can be raised
1404          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1405          */
1406         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1407         g_assert(cpu->icount_extra == 0);
1408
1409         cpu->icount_budget = tcg_get_icount_limit();
1410         insns_left = MIN(0xffff, cpu->icount_budget);
1411         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1412         cpu->icount_extra = cpu->icount_budget - insns_left;
1413
1414         replay_mutex_lock();
1415
1416         if (cpu->icount_budget == 0 && replay_has_checkpoint()) {
1417             notify_aio_contexts();
1418         }
1419     }
1420 }
1421
1422 static void process_icount_data(CPUState *cpu)
1423 {
1424     if (use_icount) {
1425         /* Account for executed instructions */
1426         cpu_update_icount(cpu);
1427
1428         /* Reset the counters */
1429         cpu_neg(cpu)->icount_decr.u16.low = 0;
1430         cpu->icount_extra = 0;
1431         cpu->icount_budget = 0;
1432
1433         replay_account_executed_instructions();
1434
1435         replay_mutex_unlock();
1436     }
1437 }
1438
1439
1440 static int tcg_cpu_exec(CPUState *cpu)
1441 {
1442     int ret;
1443 #ifdef CONFIG_PROFILER
1444     int64_t ti;
1445 #endif
1446
1447     assert(tcg_enabled());
1448 #ifdef CONFIG_PROFILER
1449     ti = profile_getclock();
1450 #endif
1451     cpu_exec_start(cpu);
1452     ret = cpu_exec(cpu);
1453     cpu_exec_end(cpu);
1454 #ifdef CONFIG_PROFILER
1455     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1456                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1457 #endif
1458     return ret;
1459 }
1460
1461 /* Destroy any remaining vCPUs which have been unplugged and have
1462  * finished running
1463  */
1464 static void deal_with_unplugged_cpus(void)
1465 {
1466     CPUState *cpu;
1467
1468     CPU_FOREACH(cpu) {
1469         if (cpu->unplug && !cpu_can_run(cpu)) {
1470             qemu_tcg_destroy_vcpu(cpu);
1471             cpu->created = false;
1472             qemu_cond_signal(&qemu_cpu_cond);
1473             break;
1474         }
1475     }
1476 }
1477
1478 /* Single-threaded TCG
1479  *
1480  * In the single-threaded case each vCPU is simulated in turn. If
1481  * there is more than a single vCPU we create a simple timer to kick
1482  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1483  * This is done explicitly rather than relying on side-effects
1484  * elsewhere.
1485  */
1486
1487 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1488 {
1489     CPUState *cpu = arg;
1490
1491     assert(tcg_enabled());
1492     rcu_register_thread();
1493     tcg_register_thread();
1494
1495     qemu_mutex_lock_iothread();
1496     qemu_thread_get_self(cpu->thread);
1497
1498     cpu->thread_id = qemu_get_thread_id();
1499     cpu->created = true;
1500     cpu->can_do_io = 1;
1501     qemu_cond_signal(&qemu_cpu_cond);
1502     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1503
1504     /* wait for initial kick-off after machine start */
1505     while (first_cpu->stopped) {
1506         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1507
1508         /* process any pending work */
1509         CPU_FOREACH(cpu) {
1510             current_cpu = cpu;
1511             qemu_wait_io_event_common(cpu);
1512         }
1513     }
1514
1515     start_tcg_kick_timer();
1516
1517     cpu = first_cpu;
1518
1519     /* process any pending work */
1520     cpu->exit_request = 1;
1521
1522     while (1) {
1523         qemu_mutex_unlock_iothread();
1524         replay_mutex_lock();
1525         qemu_mutex_lock_iothread();
1526         /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1527         qemu_account_warp_timer();
1528
1529         /* Run the timers here.  This is much more efficient than
1530          * waking up the I/O thread and waiting for completion.
1531          */
1532         handle_icount_deadline();
1533
1534         replay_mutex_unlock();
1535
1536         if (!cpu) {
1537             cpu = first_cpu;
1538         }
1539
1540         while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {
1541
1542             atomic_mb_set(&tcg_current_rr_cpu, cpu);
1543             current_cpu = cpu;
1544
1545             qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1546                               (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1547
1548             if (cpu_can_run(cpu)) {
1549                 int r;
1550
1551                 qemu_mutex_unlock_iothread();
1552                 prepare_icount_for_run(cpu);
1553
1554                 r = tcg_cpu_exec(cpu);
1555
1556                 process_icount_data(cpu);
1557                 qemu_mutex_lock_iothread();
1558
1559                 if (r == EXCP_DEBUG) {
1560                     cpu_handle_guest_debug(cpu);
1561                     break;
1562                 } else if (r == EXCP_ATOMIC) {
1563                     qemu_mutex_unlock_iothread();
1564                     cpu_exec_step_atomic(cpu);
1565                     qemu_mutex_lock_iothread();
1566                     break;
1567                 }
1568             } else if (cpu->stop) {
1569                 if (cpu->unplug) {
1570                     cpu = CPU_NEXT(cpu);
1571                 }
1572                 break;
1573             }
1574
1575             cpu = CPU_NEXT(cpu);
1576         } /* while (cpu && !cpu->exit_request).. */
1577
1578         /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1579         atomic_set(&tcg_current_rr_cpu, NULL);
1580
1581         if (cpu && cpu->exit_request) {
1582             atomic_mb_set(&cpu->exit_request, 0);
1583         }
1584
1585         if (use_icount && all_cpu_threads_idle()) {
1586             /*
1587              * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1588              * in the main_loop, wake it up in order to start the warp timer.
1589              */
1590             qemu_notify_event();
1591         }
1592
1593         qemu_tcg_rr_wait_io_event();
1594         deal_with_unplugged_cpus();
1595     }
1596
1597     rcu_unregister_thread();
1598     return NULL;
1599 }
1600
1601 static void *qemu_hax_cpu_thread_fn(void *arg)
1602 {
1603     CPUState *cpu = arg;
1604     int r;
1605
1606     rcu_register_thread();
1607     qemu_mutex_lock_iothread();
1608     qemu_thread_get_self(cpu->thread);
1609
1610     cpu->thread_id = qemu_get_thread_id();
1611     cpu->created = true;
1612     current_cpu = cpu;
1613
1614     hax_init_vcpu(cpu);
1615     qemu_cond_signal(&qemu_cpu_cond);
1616     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1617
1618     do {
1619         if (cpu_can_run(cpu)) {
1620             r = hax_smp_cpu_exec(cpu);
1621             if (r == EXCP_DEBUG) {
1622                 cpu_handle_guest_debug(cpu);
1623             }
1624         }
1625
1626         qemu_wait_io_event(cpu);
1627     } while (!cpu->unplug || cpu_can_run(cpu));
1628     rcu_unregister_thread();
1629     return NULL;
1630 }
1631
1632 /* The HVF-specific vCPU thread function. This one should only run when the host
1633  * CPU supports the VMX "unrestricted guest" feature. */
1634 static void *qemu_hvf_cpu_thread_fn(void *arg)
1635 {
1636     CPUState *cpu = arg;
1637
1638     int r;
1639
1640     assert(hvf_enabled());
1641
1642     rcu_register_thread();
1643
1644     qemu_mutex_lock_iothread();
1645     qemu_thread_get_self(cpu->thread);
1646
1647     cpu->thread_id = qemu_get_thread_id();
1648     cpu->can_do_io = 1;
1649     current_cpu = cpu;
1650
1651     hvf_init_vcpu(cpu);
1652
1653     /* signal CPU creation */
1654     cpu->created = true;
1655     qemu_cond_signal(&qemu_cpu_cond);
1656     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1657
1658     do {
1659         if (cpu_can_run(cpu)) {
1660             r = hvf_vcpu_exec(cpu);
1661             if (r == EXCP_DEBUG) {
1662                 cpu_handle_guest_debug(cpu);
1663             }
1664         }
1665         qemu_wait_io_event(cpu);
1666     } while (!cpu->unplug || cpu_can_run(cpu));
1667
1668     hvf_vcpu_destroy(cpu);
1669     cpu->created = false;
1670     qemu_cond_signal(&qemu_cpu_cond);
1671     qemu_mutex_unlock_iothread();
1672     rcu_unregister_thread();
1673     return NULL;
1674 }
1675
1676 static void *qemu_whpx_cpu_thread_fn(void *arg)
1677 {
1678     CPUState *cpu = arg;
1679     int r;
1680
1681     rcu_register_thread();
1682
1683     qemu_mutex_lock_iothread();
1684     qemu_thread_get_self(cpu->thread);
1685     cpu->thread_id = qemu_get_thread_id();
1686     current_cpu = cpu;
1687
1688     r = whpx_init_vcpu(cpu);
1689     if (r < 0) {
1690         fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1691         exit(1);
1692     }
1693
1694     /* signal CPU creation */
1695     cpu->created = true;
1696     qemu_cond_signal(&qemu_cpu_cond);
1697     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1698
1699     do {
1700         if (cpu_can_run(cpu)) {
1701             r = whpx_vcpu_exec(cpu);
1702             if (r == EXCP_DEBUG) {
1703                 cpu_handle_guest_debug(cpu);
1704             }
1705         }
1706         while (cpu_thread_is_idle(cpu)) {
1707             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1708         }
1709         qemu_wait_io_event_common(cpu);
1710     } while (!cpu->unplug || cpu_can_run(cpu));
1711
1712     whpx_destroy_vcpu(cpu);
1713     cpu->created = false;
1714     qemu_cond_signal(&qemu_cpu_cond);
1715     qemu_mutex_unlock_iothread();
1716     rcu_unregister_thread();
1717     return NULL;
1718 }
1719
#ifdef _WIN32
/*
 * Empty APC routine.  qemu_cpu_kick_thread() queues it on a vCPU
 * thread with QueueUserAPC() as a way of kicking that thread.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
    /* intentionally empty */
}
#endif
1725
1726 /* Multi-threaded TCG
1727  *
1728  * In the multi-threaded case each vCPU has its own thread. The TLS
1729  * variable current_cpu can be used deep in the code to find the
1730  * current CPUState for a given thread.
1731  */
1732
1733 static void *qemu_tcg_cpu_thread_fn(void *arg)
1734 {
1735     CPUState *cpu = arg;
1736
1737     assert(tcg_enabled());
1738     g_assert(!use_icount);
1739
1740     rcu_register_thread();
1741     tcg_register_thread();
1742
1743     qemu_mutex_lock_iothread();
1744     qemu_thread_get_self(cpu->thread);
1745
1746     cpu->thread_id = qemu_get_thread_id();
1747     cpu->created = true;
1748     cpu->can_do_io = 1;
1749     current_cpu = cpu;
1750     qemu_cond_signal(&qemu_cpu_cond);
1751     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1752
1753     /* process any pending work */
1754     cpu->exit_request = 1;
1755
1756     do {
1757         if (cpu_can_run(cpu)) {
1758             int r;
1759             qemu_mutex_unlock_iothread();
1760             r = tcg_cpu_exec(cpu);
1761             qemu_mutex_lock_iothread();
1762             switch (r) {
1763             case EXCP_DEBUG:
1764                 cpu_handle_guest_debug(cpu);
1765                 break;
1766             case EXCP_HALTED:
1767                 /* during start-up the vCPU is reset and the thread is
1768                  * kicked several times. If we don't ensure we go back
1769                  * to sleep in the halted state we won't cleanly
1770                  * start-up when the vCPU is enabled.
1771                  *
1772                  * cpu->halted should ensure we sleep in wait_io_event
1773                  */
1774                 g_assert(cpu->halted);
1775                 break;
1776             case EXCP_ATOMIC:
1777                 qemu_mutex_unlock_iothread();
1778                 cpu_exec_step_atomic(cpu);
1779                 qemu_mutex_lock_iothread();
1780             default:
1781                 /* Ignore everything else? */
1782                 break;
1783             }
1784         }
1785
1786         atomic_mb_set(&cpu->exit_request, 0);
1787         qemu_wait_io_event(cpu);
1788     } while (!cpu->unplug || cpu_can_run(cpu));
1789
1790     qemu_tcg_destroy_vcpu(cpu);
1791     cpu->created = false;
1792     qemu_cond_signal(&qemu_cpu_cond);
1793     qemu_mutex_unlock_iothread();
1794     rcu_unregister_thread();
1795     return NULL;
1796 }
1797
1798 static void qemu_cpu_kick_thread(CPUState *cpu)
1799 {
1800 #ifndef _WIN32
1801     int err;
1802
1803     if (cpu->thread_kicked) {
1804         return;
1805     }
1806     cpu->thread_kicked = true;
1807     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1808     if (err && err != ESRCH) {
1809         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1810         exit(1);
1811     }
1812 #else /* _WIN32 */
1813     if (!qemu_cpu_is_self(cpu)) {
1814         if (whpx_enabled()) {
1815             whpx_vcpu_kick(cpu);
1816         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1817             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1818                     __func__, GetLastError());
1819             exit(1);
1820         }
1821     }
1822 #endif
1823 }
1824
1825 void qemu_cpu_kick(CPUState *cpu)
1826 {
1827     qemu_cond_broadcast(cpu->halt_cond);
1828     if (tcg_enabled()) {
1829         if (qemu_tcg_mttcg_enabled()) {
1830             cpu_exit(cpu);
1831         } else {
1832             qemu_cpu_kick_rr_cpus();
1833         }
1834     } else {
1835         if (hax_enabled()) {
1836             /*
1837              * FIXME: race condition with the exit_request check in
1838              * hax_vcpu_hax_exec
1839              */
1840             cpu->exit_request = 1;
1841         }
1842         qemu_cpu_kick_thread(cpu);
1843     }
1844 }
1845
1846 void qemu_cpu_kick_self(void)
1847 {
1848     assert(current_cpu);
1849     qemu_cpu_kick_thread(current_cpu);
1850 }
1851
1852 bool qemu_cpu_is_self(CPUState *cpu)
1853 {
1854     return qemu_thread_is_self(cpu->thread);
1855 }
1856
1857 bool qemu_in_vcpu_thread(void)
1858 {
1859     return current_cpu && qemu_cpu_is_self(current_cpu);
1860 }
1861
/* Per-thread flag: true while this thread holds the iothread mutex */
static __thread bool iothread_locked;

/* Return whether the calling thread currently holds the iothread mutex */
bool qemu_mutex_iothread_locked(void)
{
    bool held = iothread_locked;

    return held;
}
1868
1869 /*
1870  * The BQL is taken from so many places that it is worth profiling the
1871  * callers directly, instead of funneling them all through a single function.
1872  */
1873 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1874 {
1875     QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1876
1877     g_assert(!qemu_mutex_iothread_locked());
1878     bql_lock(&qemu_global_mutex, file, line);
1879     iothread_locked = true;
1880 }
1881
1882 void qemu_mutex_unlock_iothread(void)
1883 {
1884     g_assert(qemu_mutex_iothread_locked());
1885     iothread_locked = false;
1886     qemu_mutex_unlock(&qemu_global_mutex);
1887 }
1888
1889 void qemu_cond_wait_iothread(QemuCond *cond)
1890 {
1891     qemu_cond_wait(cond, &qemu_global_mutex);
1892 }
1893
1894 static bool all_vcpus_paused(void)
1895 {
1896     CPUState *cpu;
1897
1898     CPU_FOREACH(cpu) {
1899         if (!cpu->stopped) {
1900             return false;
1901         }
1902     }
1903
1904     return true;
1905 }
1906
1907 void pause_all_vcpus(void)
1908 {
1909     CPUState *cpu;
1910
1911     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1912     CPU_FOREACH(cpu) {
1913         if (qemu_cpu_is_self(cpu)) {
1914             qemu_cpu_stop(cpu, true);
1915         } else {
1916             cpu->stop = true;
1917             qemu_cpu_kick(cpu);
1918         }
1919     }
1920
1921     /* We need to drop the replay_lock so any vCPU threads woken up
1922      * can finish their replay tasks
1923      */
1924     replay_mutex_unlock();
1925
1926     while (!all_vcpus_paused()) {
1927         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1928         CPU_FOREACH(cpu) {
1929             qemu_cpu_kick(cpu);
1930         }
1931     }
1932
1933     qemu_mutex_unlock_iothread();
1934     replay_mutex_lock();
1935     qemu_mutex_lock_iothread();
1936 }
1937
1938 void cpu_resume(CPUState *cpu)
1939 {
1940     cpu->stop = false;
1941     cpu->stopped = false;
1942     qemu_cpu_kick(cpu);
1943 }
1944
1945 void resume_all_vcpus(void)
1946 {
1947     CPUState *cpu;
1948
1949     if (!runstate_is_running()) {
1950         return;
1951     }
1952
1953     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1954     CPU_FOREACH(cpu) {
1955         cpu_resume(cpu);
1956     }
1957 }
1958
1959 void cpu_remove_sync(CPUState *cpu)
1960 {
1961     cpu->stop = true;
1962     cpu->unplug = true;
1963     qemu_cpu_kick(cpu);
1964     qemu_mutex_unlock_iothread();
1965     qemu_thread_join(cpu->thread);
1966     qemu_mutex_lock_iothread();
1967 }
1968
1969 /* For temporary buffers for forming a name */
1970 #define VCPU_THREAD_NAME_SIZE 16
1971
1972 static void qemu_tcg_init_vcpu(CPUState *cpu)
1973 {
1974     char thread_name[VCPU_THREAD_NAME_SIZE];
1975     static QemuCond *single_tcg_halt_cond;
1976     static QemuThread *single_tcg_cpu_thread;
1977     static int tcg_region_inited;
1978
1979     assert(tcg_enabled());
1980     /*
1981      * Initialize TCG regions--once. Now is a good time, because:
1982      * (1) TCG's init context, prologue and target globals have been set up.
1983      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1984      *     -accel flag is processed, so the check doesn't work then).
1985      */
1986     if (!tcg_region_inited) {
1987         tcg_region_inited = 1;
1988         tcg_region_init();
1989     }
1990
1991     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1992         cpu->thread = g_malloc0(sizeof(QemuThread));
1993         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1994         qemu_cond_init(cpu->halt_cond);
1995
1996         if (qemu_tcg_mttcg_enabled()) {
1997             /* create a thread per vCPU with TCG (MTTCG) */
1998             parallel_cpus = true;
1999             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
2000                  cpu->cpu_index);
2001
2002             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
2003                                cpu, QEMU_THREAD_JOINABLE);
2004
2005         } else {
2006             /* share a single thread for all cpus with TCG */
2007             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
2008             qemu_thread_create(cpu->thread, thread_name,
2009                                qemu_tcg_rr_cpu_thread_fn,
2010                                cpu, QEMU_THREAD_JOINABLE);
2011
2012             single_tcg_halt_cond = cpu->halt_cond;
2013             single_tcg_cpu_thread = cpu->thread;
2014         }
2015 #ifdef _WIN32
2016         cpu->hThread = qemu_thread_get_handle(cpu->thread);
2017 #endif
2018     } else {
2019         /* For non-MTTCG cases we share the thread */
2020         cpu->thread = single_tcg_cpu_thread;
2021         cpu->halt_cond = single_tcg_halt_cond;
2022         cpu->thread_id = first_cpu->thread_id;
2023         cpu->can_do_io = 1;
2024         cpu->created = true;
2025     }
2026 }
2027
2028 static void qemu_hax_start_vcpu(CPUState *cpu)
2029 {
2030     char thread_name[VCPU_THREAD_NAME_SIZE];
2031
2032     cpu->thread = g_malloc0(sizeof(QemuThread));
2033     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2034     qemu_cond_init(cpu->halt_cond);
2035
2036     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2037              cpu->cpu_index);
2038     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2039                        cpu, QEMU_THREAD_JOINABLE);
2040 #ifdef _WIN32
2041     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2042 #endif
2043 }
2044
2045 static void qemu_kvm_start_vcpu(CPUState *cpu)
2046 {
2047     char thread_name[VCPU_THREAD_NAME_SIZE];
2048
2049     cpu->thread = g_malloc0(sizeof(QemuThread));
2050     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2051     qemu_cond_init(cpu->halt_cond);
2052     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2053              cpu->cpu_index);
2054     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2055                        cpu, QEMU_THREAD_JOINABLE);
2056 }
2057
2058 static void qemu_hvf_start_vcpu(CPUState *cpu)
2059 {
2060     char thread_name[VCPU_THREAD_NAME_SIZE];
2061
2062     /* HVF currently does not support TCG, and only runs in
2063      * unrestricted-guest mode. */
2064     assert(hvf_enabled());
2065
2066     cpu->thread = g_malloc0(sizeof(QemuThread));
2067     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2068     qemu_cond_init(cpu->halt_cond);
2069
2070     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2071              cpu->cpu_index);
2072     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2073                        cpu, QEMU_THREAD_JOINABLE);
2074 }
2075
2076 static void qemu_whpx_start_vcpu(CPUState *cpu)
2077 {
2078     char thread_name[VCPU_THREAD_NAME_SIZE];
2079
2080     cpu->thread = g_malloc0(sizeof(QemuThread));
2081     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2082     qemu_cond_init(cpu->halt_cond);
2083     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2084              cpu->cpu_index);
2085     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2086                        cpu, QEMU_THREAD_JOINABLE);
2087 #ifdef _WIN32
2088     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2089 #endif
2090 }
2091
2092 static void qemu_dummy_start_vcpu(CPUState *cpu)
2093 {
2094     char thread_name[VCPU_THREAD_NAME_SIZE];
2095
2096     cpu->thread = g_malloc0(sizeof(QemuThread));
2097     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2098     qemu_cond_init(cpu->halt_cond);
2099     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2100              cpu->cpu_index);
2101     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2102                        QEMU_THREAD_JOINABLE);
2103 }
2104
2105 void qemu_init_vcpu(CPUState *cpu)
2106 {
2107     MachineState *ms = MACHINE(qdev_get_machine());
2108
2109     cpu->nr_cores = ms->smp.cores;
2110     cpu->nr_threads =  ms->smp.threads;
2111     cpu->stopped = true;
2112     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2113
2114     if (!cpu->as) {
2115         /* If the target cpu hasn't set up any address spaces itself,
2116          * give it the default one.
2117          */
2118         cpu->num_ases = 1;
2119         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2120     }
2121
2122     if (kvm_enabled()) {
2123         qemu_kvm_start_vcpu(cpu);
2124     } else if (hax_enabled()) {
2125         qemu_hax_start_vcpu(cpu);
2126     } else if (hvf_enabled()) {
2127         qemu_hvf_start_vcpu(cpu);
2128     } else if (tcg_enabled()) {
2129         qemu_tcg_init_vcpu(cpu);
2130     } else if (whpx_enabled()) {
2131         qemu_whpx_start_vcpu(cpu);
2132     } else {
2133         qemu_dummy_start_vcpu(cpu);
2134     }
2135
2136     while (!cpu->created) {
2137         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2138     }
2139 }
2140
2141 void cpu_stop_current(void)
2142 {
2143     if (current_cpu) {
2144         current_cpu->stop = true;
2145         cpu_exit(current_cpu);
2146     }
2147 }
2148
2149 int vm_stop(RunState state)
2150 {
2151     if (qemu_in_vcpu_thread()) {
2152         qemu_system_vmstop_request_prepare();
2153         qemu_system_vmstop_request(state);
2154         /*
2155          * FIXME: should not return to device code in case
2156          * vm_stop() has been requested.
2157          */
2158         cpu_stop_current();
2159         return 0;
2160     }
2161
2162     return do_vm_stop(state, true);
2163 }
2164
2165 /**
2166  * Prepare for (re)starting the VM.
2167  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2168  * running or in case of an error condition), 0 otherwise.
2169  */
2170 int vm_prepare_start(void)
2171 {
2172     RunState requested;
2173
2174     qemu_vmstop_requested(&requested);
2175     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2176         return -1;
2177     }
2178
2179     /* Ensure that a STOP/RESUME pair of events is emitted if a
2180      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2181      * example, according to documentation is always followed by
2182      * the STOP event.
2183      */
2184     if (runstate_is_running()) {
2185         qapi_event_send_stop();
2186         qapi_event_send_resume();
2187         return -1;
2188     }
2189
2190     /* We are sending this now, but the CPUs will be resumed shortly later */
2191     qapi_event_send_resume();
2192
2193     cpu_enable_ticks();
2194     runstate_set(RUN_STATE_RUNNING);
2195     vm_state_notify(1, RUN_STATE_RUNNING);
2196     return 0;
2197 }
2198
/*
 * Start (or restart) the VM.
 *
 * The vCPUs are resumed only when vm_prepare_start() returns 0; on any
 * other result nothing further is done.
 */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
2205
2206 /* does a state transition even if the VM is already stopped,
2207    current state is forgotten forever */
2208 int vm_stop_force_state(RunState state)
2209 {
2210     if (runstate_is_running()) {
2211         return vm_stop(state);
2212     } else {
2213         runstate_set(state);
2214
2215         bdrv_drain_all();
2216         /* Make sure to return an error if the flush in a previous vm_stop()
2217          * failed. */
2218         return bdrv_flush_all();
2219     }
2220 }
2221
/*
 * List the available CPU models by invoking the target's cpu_list macro.
 *
 * @optarg is not used.  When the target does not define cpu_list this
 * function does nothing.
 */
void list_cpus(const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#ifdef cpu_list
    cpu_list();
#endif
}
2229
2230 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2231                  bool has_cpu, int64_t cpu_index, Error **errp)
2232 {
2233     FILE *f;
2234     uint32_t l;
2235     CPUState *cpu;
2236     uint8_t buf[1024];
2237     int64_t orig_addr = addr, orig_size = size;
2238
2239     if (!has_cpu) {
2240         cpu_index = 0;
2241     }
2242
2243     cpu = qemu_get_cpu(cpu_index);
2244     if (cpu == NULL) {
2245         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2246                    "a CPU number");
2247         return;
2248     }
2249
2250     f = fopen(filename, "wb");
2251     if (!f) {
2252         error_setg_file_open(errp, errno, filename);
2253         return;
2254     }
2255
2256     while (size != 0) {
2257         l = sizeof(buf);
2258         if (l > size)
2259             l = size;
2260         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2261             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2262                              " specified", orig_addr, orig_size);
2263             goto exit;
2264         }
2265         if (fwrite(buf, 1, l, f) != l) {
2266             error_setg(errp, QERR_IO_ERROR);
2267             goto exit;
2268         }
2269         addr += l;
2270         size -= l;
2271     }
2272
2273 exit:
2274     fclose(f);
2275 }
2276
2277 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2278                   Error **errp)
2279 {
2280     FILE *f;
2281     uint32_t l;
2282     uint8_t buf[1024];
2283
2284     f = fopen(filename, "wb");
2285     if (!f) {
2286         error_setg_file_open(errp, errno, filename);
2287         return;
2288     }
2289
2290     while (size != 0) {
2291         l = sizeof(buf);
2292         if (l > size)
2293             l = size;
2294         cpu_physical_memory_read(addr, buf, l);
2295         if (fwrite(buf, 1, l, f) != l) {
2296             error_setg(errp, QERR_IO_ERROR);
2297             goto exit;
2298         }
2299         addr += l;
2300         size -= l;
2301     }
2302
2303 exit:
2304     fclose(f);
2305 }
2306
2307 void qmp_inject_nmi(Error **errp)
2308 {
2309     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2310 }
2311
2312 void dump_drift_info(void)
2313 {
2314     if (!use_icount) {
2315         return;
2316     }
2317
2318     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2319                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2320     if (icount_align_option) {
2321         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2322                     -max_delay / SCALE_MS);
2323         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2324                     max_advance / SCALE_MS);
2325     } else {
2326         qemu_printf("Max guest delay     NA\n");
2327         qemu_printf("Max guest advance   NA\n");
2328     }
2329 }