cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "qemu-common.h"
  27 #include "qemu/config-file.h"
  28 #include "qemu/cutils.h"
  29 #include "migration/vmstate.h"
  30 #include "monitor/monitor.h"
  31 #include "qapi/error.h"
  32 #include "qapi/qapi-commands-misc.h"
  33 #include "qapi/qapi-events-run-state.h"
  34 #include "qapi/qmp/qerror.h"
  35 #include "qemu/error-report.h"
  36 #include "qemu/qemu-print.h"
  37 #include "sysemu/tcg.h"
  38 #include "sysemu/block-backend.h"
  39 #include "exec/gdbstub.h"
  40 #include "sysemu/dma.h"
  41 #include "sysemu/hw_accel.h"
  42 #include "sysemu/kvm.h"
  43 #include "sysemu/hax.h"
  44 #include "sysemu/hvf.h"
  45 #include "sysemu/whpx.h"
  46 #include "exec/exec-all.h"
  47
  48 #include "qemu/thread.h"
  49 #include "qemu/plugin.h"
  50 #include "sysemu/cpus.h"
  51 #include "sysemu/qtest.h"
  52 #include "qemu/main-loop.h"
  53 #include "qemu/option.h"
  54 #include "qemu/bitmap.h"
  55 #include "qemu/seqlock.h"
  56 #include "qemu/guest-random.h"
  57 #include "tcg/tcg.h"
  58 #include "hw/nmi.h"
  59 #include "sysemu/replay.h"
  60 #include "sysemu/runstate.h"
  61 #include "hw/boards.h"
  62 #include "hw/hw.h"
  63
  64 #ifdef CONFIG_LINUX
  65
  66 #include <sys/prctl.h>
  67
  68 #ifndef PR_MCE_KILL
  69 #define PR_MCE_KILL 33
  70 #endif
  71
  72 #ifndef PR_MCE_KILL_SET
  73 #define PR_MCE_KILL_SET 1
  74 #endif
  75
  76 #ifndef PR_MCE_KILL_EARLY
  77 #define PR_MCE_KILL_EARLY 1
  78 #endif
  79
  80 #endif /* CONFIG_LINUX */
  81
   82 static QemuMutex qemu_global_mutex; /* mutex handed to qemu_cond_timedwait() in cpu_throttle_thread() */
   83
   84 int64_t max_delay;   /* NOTE(review): not referenced in the visible part of this file */
   85 int64_t max_advance; /* NOTE(review): not referenced in the visible part of this file */
   86
   87 /* vcpu throttling controls */
   88 static QEMUTimer *throttle_timer; /* created in cpu_ticks_init(), re-armed by the throttle tick */
   89 static unsigned int throttle_percentage; /* 0 means throttling is off; accessed with atomic_read/atomic_set */
   90
   91 #define CPU_THROTTLE_PCT_MIN 1 /* lower clamp applied by cpu_throttle_set() */
   92 #define CPU_THROTTLE_PCT_MAX 99 /* upper clamp applied by cpu_throttle_set() */
   93 #define CPU_THROTTLE_TIMESLICE_NS 10000000 /* 10 ms, expressed in nanoseconds */
  94
  95 bool cpu_is_stopped(CPUState *cpu)
  96 {
  97     return cpu->stopped || !runstate_is_running();
  98 }
  99
 100 static inline bool cpu_work_list_empty(CPUState *cpu)
 101 {
 102     bool ret;
 103
 104     qemu_mutex_lock(&cpu->work_mutex);
 105     ret = QSIMPLEQ_EMPTY(&cpu->work_list);
 106     qemu_mutex_unlock(&cpu->work_mutex);
 107     return ret;
 108 }
 109
 110 static bool cpu_thread_is_idle(CPUState *cpu)
 111 {
 112     if (cpu->stop || !cpu_work_list_empty(cpu)) {
 113         return false;
 114     }
 115     if (cpu_is_stopped(cpu)) {
 116         return true;
 117     }
 118     if (!cpu->halted || cpu_has_work(cpu) ||
 119         kvm_halt_in_kernel()) {
 120         return false;
 121     }
 122     return true;
 123 }
 124
 125 static bool all_cpu_threads_idle(void)
 126 {
 127     CPUState *cpu;
 128
 129     CPU_FOREACH(cpu) {
 130         if (!cpu_thread_is_idle(cpu)) {
 131             return false;
 132         }
 133     }
 134     return true;
 135 }
 136
 137 /***********************************************************/
 138 /* guest cycle counter */
 139
 140 /* Protected by TimersState seqlock */
 141
  142 static bool icount_sleep = true; /* set from the "sleep" option in configure_icount() */
  143 /* Arbitrarily pick 1MIPS as the minimum allowable speed (shift 10: 1024 ns per insn).  */
  144 #define MAX_ICOUNT_SHIFT 10
 145
  146 typedef struct TimersState {
  147     /* Protected by BQL.  */
  148     int64_t cpu_ticks_prev; /* last value returned by cpu_get_ticks_locked() */
  149     int64_t cpu_ticks_offset; /* added to the host tick counter while ticks are enabled */
  150
  151     /* Protect fields that can be respectively read outside the
  152      * BQL, and written from multiple threads.
  153      */
  154     QemuSeqLock vm_clock_seqlock;
  155     QemuSpin vm_clock_lock;
  156
  157     int16_t cpu_ticks_enabled; /* 1 after cpu_enable_ticks(), 0 after cpu_disable_ticks() */
  158
  159     /* Conversion factor from emulated instructions to virtual clock ticks.  */
  160     int16_t icount_time_shift; /* used as a left-shift count by cpu_icount_to_ns() */
  161
  162     /* Compensate for varying guest execution speed.  */
  163     int64_t qemu_icount_bias; /* added to the shifted instruction count */
  164
  165     int64_t vm_clock_warp_start; /* -1 when no warp is pending (see icount_warp_rt) */
  166     int64_t cpu_clock_offset; /* added to get_clock() while ticks are enabled */
  167
  168     /* Only written by TCG thread */
  169     int64_t qemu_icount; /* running total updated by cpu_update_icount_locked() */
  170
  171     /* for adjusting icount */
  172     QEMUTimer *icount_rt_timer;
  173     QEMUTimer *icount_vm_timer;
  174     QEMUTimer *icount_warp_timer; /* only created when icount_sleep is set */
  175 } TimersState;
  176
  177 static TimersState timers_state; /* the single instance used throughout this file */
  178 bool mttcg_enabled; /* NOTE(review): not referenced in the visible part of this file */
 179
 180
 181 /* The current number of executed instructions is based on what we
 182  * originally budgeted minus the current state of the decrementing
 183  * icount counters in extra/u16.low.
 184  */
 185 static int64_t cpu_get_icount_executed(CPUState *cpu)
 186 {
 187     return (cpu->icount_budget -
 188             (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
 189 }
 190
 191 /*
 192  * Update the global shared timer_state.qemu_icount to take into
 193  * account executed instructions. This is done by the TCG vCPU
 194  * thread so the main-loop can see time has moved forward.
 195  */
 196 static void cpu_update_icount_locked(CPUState *cpu)
 197 {
 198     int64_t executed = cpu_get_icount_executed(cpu);
 199     cpu->icount_budget -= executed;
 200
 201     atomic_set_i64(&timers_state.qemu_icount,
 202                    timers_state.qemu_icount + executed);
 203 }
 204
 205 /*
 206  * Update the global shared timer_state.qemu_icount to take into
 207  * account executed instructions. This is done by the TCG vCPU
 208  * thread so the main-loop can see time has moved forward.
 209  */
 210 void cpu_update_icount(CPUState *cpu)
 211 {
 212     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 213                        &timers_state.vm_clock_lock);
 214     cpu_update_icount_locked(cpu);
 215     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 216                          &timers_state.vm_clock_lock);
 217 }
 218
 219 static int64_t cpu_get_icount_raw_locked(void)
 220 {
 221     CPUState *cpu = current_cpu;
 222
 223     if (cpu && cpu->running) {
 224         if (!cpu->can_do_io) {
 225             error_report("Bad icount read");
 226             exit(1);
 227         }
 228         /* Take into account what has run */
 229         cpu_update_icount_locked(cpu);
 230     }
 231     /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 232     return atomic_read_i64(&timers_state.qemu_icount);
 233 }
 234
 235 static int64_t cpu_get_icount_locked(void)
 236 {
 237     int64_t icount = cpu_get_icount_raw_locked();
 238     return atomic_read_i64(&timers_state.qemu_icount_bias) +
 239         cpu_icount_to_ns(icount);
 240 }
 241
 242 int64_t cpu_get_icount_raw(void)
 243 {
 244     int64_t icount;
 245     unsigned start;
 246
 247     do {
 248         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 249         icount = cpu_get_icount_raw_locked();
 250     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 251
 252     return icount;
 253 }
 254
 255 /* Return the virtual CPU time, based on the instruction counter.  */
 256 int64_t cpu_get_icount(void)
 257 {
 258     int64_t icount;
 259     unsigned start;
 260
 261     do {
 262         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 263         icount = cpu_get_icount_locked();
 264     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 265
 266     return icount;
 267 }
 268
 269 int64_t cpu_icount_to_ns(int64_t icount)
 270 {
 271     return icount << atomic_read(&timers_state.icount_time_shift);
 272 }
 273
 274 static int64_t cpu_get_ticks_locked(void)
 275 {
 276     int64_t ticks = timers_state.cpu_ticks_offset;
 277     if (timers_state.cpu_ticks_enabled) {
 278         ticks += cpu_get_host_ticks();
 279     }
 280
 281     if (timers_state.cpu_ticks_prev > ticks) {
 282         /* Non increasing ticks may happen if the host uses software suspend.  */
 283         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 284         ticks = timers_state.cpu_ticks_prev;
 285     }
 286
 287     timers_state.cpu_ticks_prev = ticks;
 288     return ticks;
 289 }
 290
 291 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 292  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 293  * counter.
 294  */
 295 int64_t cpu_get_ticks(void)
 296 {
 297     int64_t ticks;
 298
 299     if (use_icount) {
 300         return cpu_get_icount();
 301     }
 302
 303     qemu_spin_lock(&timers_state.vm_clock_lock);
 304     ticks = cpu_get_ticks_locked();
 305     qemu_spin_unlock(&timers_state.vm_clock_lock);
 306     return ticks;
 307 }
 308
 309 static int64_t cpu_get_clock_locked(void)
 310 {
 311     int64_t time;
 312
 313     time = timers_state.cpu_clock_offset;
 314     if (timers_state.cpu_ticks_enabled) {
 315         time += get_clock();
 316     }
 317
 318     return time;
 319 }
 320
 321 /* Return the monotonic time elapsed in VM, i.e.,
 322  * the time between vm_start and vm_stop
 323  */
 324 int64_t cpu_get_clock(void)
 325 {
 326     int64_t ti;
 327     unsigned start;
 328
 329     do {
 330         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 331         ti = cpu_get_clock_locked();
 332     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 333
 334     return ti;
 335 }
 336
 337 /* enable cpu_get_ticks()
 338  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 339  */
 340 void cpu_enable_ticks(void)
 341 {
 342     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 343                        &timers_state.vm_clock_lock);
 344     if (!timers_state.cpu_ticks_enabled) {
 345         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 346         timers_state.cpu_clock_offset -= get_clock();
 347         timers_state.cpu_ticks_enabled = 1;
 348     }
 349     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 350                        &timers_state.vm_clock_lock);
 351 }
 352
 353 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 354  * cpu_get_ticks() after that.
 355  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 356  */
 357 void cpu_disable_ticks(void)
 358 {
 359     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 360                        &timers_state.vm_clock_lock);
 361     if (timers_state.cpu_ticks_enabled) {
 362         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 363         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 364         timers_state.cpu_ticks_enabled = 0;
 365     }
 366     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 367                          &timers_state.vm_clock_lock);
 368 }
 369
 370 /* Correlation between real and virtual time is always going to be
 371    fairly approximate, so ignore small variation.
 372    When the guest is idle real and virtual time will be aligned in
 373    the IO wait loop.  */
 374 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 375
 376 static void icount_adjust(void)
 377 {
 378     int64_t cur_time;
 379     int64_t cur_icount;
 380     int64_t delta;
 381
 382     /* Protected by TimersState mutex.  */
 383     static int64_t last_delta;
 384
 385     /* If the VM is not running, then do nothing.  */
 386     if (!runstate_is_running()) {
 387         return;
 388     }
 389
 390     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 391                        &timers_state.vm_clock_lock);
 392     cur_time = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 393                                    cpu_get_clock_locked());
 394     cur_icount = cpu_get_icount_locked();
 395
 396     delta = cur_icount - cur_time;
 397     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 398     if (delta > 0
 399         && last_delta + ICOUNT_WOBBLE < delta * 2
 400         && timers_state.icount_time_shift > 0) {
 401         /* The guest is getting too far ahead.  Slow time down.  */
 402         atomic_set(&timers_state.icount_time_shift,
 403                    timers_state.icount_time_shift - 1);
 404     }
 405     if (delta < 0
 406         && last_delta - ICOUNT_WOBBLE > delta * 2
 407         && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 408         /* The guest is getting too far behind.  Speed time up.  */
 409         atomic_set(&timers_state.icount_time_shift,
 410                    timers_state.icount_time_shift + 1);
 411     }
 412     last_delta = delta;
 413     atomic_set_i64(&timers_state.qemu_icount_bias,
 414                    cur_icount - (timers_state.qemu_icount
 415                                  << timers_state.icount_time_shift));
 416     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 417                          &timers_state.vm_clock_lock);
 418 }
 419
 420 static void icount_adjust_rt(void *opaque)
 421 {
 422     timer_mod(timers_state.icount_rt_timer,
 423               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 424     icount_adjust();
 425 }
 426
 427 static void icount_adjust_vm(void *opaque)
 428 {
 429     timer_mod(timers_state.icount_vm_timer,
 430                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 431                    NANOSECONDS_PER_SECOND / 10);
 432     icount_adjust();
 433 }
 434
 435 static int64_t qemu_icount_round(int64_t count)
 436 {
 437     int shift = atomic_read(&timers_state.icount_time_shift);
 438     return (count + (1 << shift) - 1) >> shift;
 439 }
 440
 441 static void icount_warp_rt(void)
 442 {
 443     unsigned seq;
 444     int64_t warp_start;
 445
 446     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 447      * changes from -1 to another value, so the race here is okay.
 448      */
 449     do {
 450         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 451         warp_start = timers_state.vm_clock_warp_start;
 452     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 453
 454     if (warp_start == -1) {
 455         return;
 456     }
 457
 458     seqlock_write_lock(&timers_state.vm_clock_seqlock,
 459                        &timers_state.vm_clock_lock);
 460     if (runstate_is_running()) {
 461         int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 462                                             cpu_get_clock_locked());
 463         int64_t warp_delta;
 464
 465         warp_delta = clock - timers_state.vm_clock_warp_start;
 466         if (use_icount == 2) {
 467             /*
 468              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 469              * far ahead of real time.
 470              */
 471             int64_t cur_icount = cpu_get_icount_locked();
 472             int64_t delta = clock - cur_icount;
 473             warp_delta = MIN(warp_delta, delta);
 474         }
 475         atomic_set_i64(&timers_state.qemu_icount_bias,
 476                        timers_state.qemu_icount_bias + warp_delta);
 477     }
 478     timers_state.vm_clock_warp_start = -1;
 479     seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 480                        &timers_state.vm_clock_lock);
 481
 482     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 483         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 484     }
 485 }
 486
 487 static void icount_timer_cb(void *opaque)
 488 {
 489     /* No need for a checkpoint because the timer already synchronizes
 490      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 491      */
 492     icount_warp_rt();
 493 }
 494
 495 void qtest_clock_warp(int64_t dest)
 496 {
 497     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 498     AioContext *aio_context;
 499     assert(qtest_enabled());
 500     aio_context = qemu_get_aio_context();
 501     while (clock < dest) {
 502         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 503                                                       QEMU_TIMER_ATTR_ALL);
 504         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 505
 506         seqlock_write_lock(&timers_state.vm_clock_seqlock,
 507                            &timers_state.vm_clock_lock);
 508         atomic_set_i64(&timers_state.qemu_icount_bias,
 509                        timers_state.qemu_icount_bias + warp);
 510         seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 511                              &timers_state.vm_clock_lock);
 512
 513         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 514         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 515         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 516     }
 517     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 518 }
 519
 520 void qemu_start_warp_timer(void)
 521 {
 522     int64_t clock;
 523     int64_t deadline;
 524
 525     if (!use_icount) {
 526         return;
 527     }
 528
 529     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 530      * do not fire, so computing the deadline does not make sense.
 531      */
 532     if (!runstate_is_running()) {
 533         return;
 534     }
 535
 536     if (replay_mode != REPLAY_MODE_PLAY) {
 537         if (!all_cpu_threads_idle()) {
 538             return;
 539         }
 540
 541         if (qtest_enabled()) {
 542             /* When testing, qtest commands advance icount.  */
 543             return;
 544         }
 545
 546         replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
 547     } else {
 548         /* warp clock deterministically in record/replay mode */
 549         if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 550             /* vCPU is sleeping and warp can't be started.
 551                It is probably a race condition: notification sent
 552                to vCPU was processed in advance and vCPU went to sleep.
 553                Therefore we have to wake it up for doing someting. */
 554             if (replay_has_checkpoint()) {
 555                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 556             }
 557             return;
 558         }
 559     }
 560
 561     /* We want to use the earliest deadline from ALL vm_clocks */
 562     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 563     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 564                                           ~QEMU_TIMER_ATTR_EXTERNAL);
 565     if (deadline < 0) {
 566         static bool notified;
 567         if (!icount_sleep && !notified) {
 568             warn_report("icount sleep disabled and no active timers");
 569             notified = true;
 570         }
 571         return;
 572     }
 573
 574     if (deadline > 0) {
 575         /*
 576          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 577          * sleep.  Otherwise, the CPU might be waiting for a future timer
 578          * interrupt to wake it up, but the interrupt never comes because
 579          * the vCPU isn't running any insns and thus doesn't advance the
 580          * QEMU_CLOCK_VIRTUAL.
 581          */
 582         if (!icount_sleep) {
 583             /*
 584              * We never let VCPUs sleep in no sleep icount mode.
 585              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 586              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 587              * It is useful when we want a deterministic execution time,
 588              * isolated from host latencies.
 589              */
 590             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 591                                &timers_state.vm_clock_lock);
 592             atomic_set_i64(&timers_state.qemu_icount_bias,
 593                            timers_state.qemu_icount_bias + deadline);
 594             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 595                                  &timers_state.vm_clock_lock);
 596             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 597         } else {
 598             /*
 599              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 600              * "real" time, (related to the time left until the next event) has
 601              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 602              * This avoids that the warps are visible externally; for example,
 603              * you will not be sending network packets continuously instead of
 604              * every 100ms.
 605              */
 606             seqlock_write_lock(&timers_state.vm_clock_seqlock,
 607                                &timers_state.vm_clock_lock);
 608             if (timers_state.vm_clock_warp_start == -1
 609                 || timers_state.vm_clock_warp_start > clock) {
 610                 timers_state.vm_clock_warp_start = clock;
 611             }
 612             seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 613                                  &timers_state.vm_clock_lock);
 614             timer_mod_anticipate(timers_state.icount_warp_timer,
 615                                  clock + deadline);
 616         }
 617     } else if (deadline == 0) {
 618         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 619     }
 620 }
 621
 622 static void qemu_account_warp_timer(void)
 623 {
 624     if (!use_icount || !icount_sleep) {
 625         return;
 626     }
 627
 628     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 629      * do not fire, so computing the deadline does not make sense.
 630      */
 631     if (!runstate_is_running()) {
 632         return;
 633     }
 634
 635     /* warp clock deterministically in record/replay mode */
 636     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 637         return;
 638     }
 639
 640     timer_del(timers_state.icount_warp_timer);
 641     icount_warp_rt();
 642 }
 643
 644 static bool icount_state_needed(void *opaque)
 645 {
 646     return use_icount;
 647 }
 648
 649 static bool warp_timer_state_needed(void *opaque)
 650 {
 651     TimersState *s = opaque;
 652     return s->icount_warp_timer != NULL;
 653 }
 654
 655 static bool adjust_timers_state_needed(void *opaque)
 656 {
 657     TimersState *s = opaque;
 658     return s->icount_rt_timer != NULL;
 659 }
 660
 661 static bool shift_state_needed(void *opaque)
 662 {
 663     return use_icount == 2;
 664 }
 665
 666 /*
 667  * Subsection for warp timer migration is optional, because may not be created
 668  */
 669 static const VMStateDescription icount_vmstate_warp_timer = {
 670     .name = "timer/icount/warp_timer",
 671     .version_id = 1,
 672     .minimum_version_id = 1,
 673     .needed = warp_timer_state_needed,
 674     .fields = (VMStateField[]) {
 675         VMSTATE_INT64(vm_clock_warp_start, TimersState),
 676         VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
 677         VMSTATE_END_OF_LIST()
 678     }
 679 };
 680
 681 static const VMStateDescription icount_vmstate_adjust_timers = {
 682     .name = "timer/icount/timers",
 683     .version_id = 1,
 684     .minimum_version_id = 1,
 685     .needed = adjust_timers_state_needed,
 686     .fields = (VMStateField[]) {
 687         VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
 688         VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
 689         VMSTATE_END_OF_LIST()
 690     }
 691 };
 692
 693 static const VMStateDescription icount_vmstate_shift = {
 694     .name = "timer/icount/shift",
 695     .version_id = 1,
 696     .minimum_version_id = 1,
 697     .needed = shift_state_needed,
 698     .fields = (VMStateField[]) {
 699         VMSTATE_INT16(icount_time_shift, TimersState),
 700         VMSTATE_END_OF_LIST()
 701     }
 702 };
 703
 704 /*
 705  * This is a subsection for icount migration.
 706  */
 707 static const VMStateDescription icount_vmstate_timers = {
 708     .name = "timer/icount",
 709     .version_id = 1,
 710     .minimum_version_id = 1,
 711     .needed = icount_state_needed,
 712     .fields = (VMStateField[]) {
 713         VMSTATE_INT64(qemu_icount_bias, TimersState),
 714         VMSTATE_INT64(qemu_icount, TimersState),
 715         VMSTATE_END_OF_LIST()
 716     },
 717     .subsections = (const VMStateDescription*[]) {
 718         &icount_vmstate_warp_timer,
 719         &icount_vmstate_adjust_timers,
 720         &icount_vmstate_shift,
 721         NULL
 722     }
 723 };
 724
 725 static const VMStateDescription vmstate_timers = {
 726     .name = "timer",
 727     .version_id = 2,
 728     .minimum_version_id = 1,
 729     .fields = (VMStateField[]) {
 730         VMSTATE_INT64(cpu_ticks_offset, TimersState),
 731         VMSTATE_UNUSED(8),
 732         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
 733         VMSTATE_END_OF_LIST()
 734     },
 735     .subsections = (const VMStateDescription*[]) {
 736         &icount_vmstate_timers,
 737         NULL
 738     }
 739 };
 740
 741 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 742 {
 743     double pct;
 744     double throttle_ratio;
 745     int64_t sleeptime_ns, endtime_ns;
 746
 747     if (!cpu_throttle_get_percentage()) {
 748         return;
 749     }
 750
 751     pct = (double)cpu_throttle_get_percentage()/100;
 752     throttle_ratio = pct / (1 - pct);
 753     /* Add 1ns to fix double's rounding error (like 0.9999999...) */
 754     sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
 755     endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
 756     while (sleeptime_ns > 0 && !cpu->stop) {
 757         if (sleeptime_ns > SCALE_MS) {
 758             qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
 759                                 sleeptime_ns / SCALE_MS);
 760         } else {
 761             qemu_mutex_unlock_iothread();
 762             g_usleep(sleeptime_ns / SCALE_US);
 763             qemu_mutex_lock_iothread();
 764         }
 765         sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 766     }
 767     atomic_set(&cpu->throttle_thread_scheduled, 0);
 768 }
 769
 770 static void cpu_throttle_timer_tick(void *opaque)
 771 {
 772     CPUState *cpu;
 773     double pct;
 774
 775     /* Stop the timer if needed */
 776     if (!cpu_throttle_get_percentage()) {
 777         return;
 778     }
 779     CPU_FOREACH(cpu) {
 780         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 781             async_run_on_cpu(cpu, cpu_throttle_thread,
 782                              RUN_ON_CPU_NULL);
 783         }
 784     }
 785
 786     pct = (double)cpu_throttle_get_percentage()/100;
 787     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 788                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 789 }
 790
 791 void cpu_throttle_set(int new_throttle_pct)
 792 {
 793     /* Ensure throttle percentage is within valid range */
 794     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 795     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 796
 797     atomic_set(&throttle_percentage, new_throttle_pct);
 798
 799     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 800                                        CPU_THROTTLE_TIMESLICE_NS);
 801 }
 802
 803 void cpu_throttle_stop(void)
 804 {
 805     atomic_set(&throttle_percentage, 0);
 806 }
 807
 808 bool cpu_throttle_active(void)
 809 {
 810     return (cpu_throttle_get_percentage() != 0);
 811 }
 812
 813 int cpu_throttle_get_percentage(void)
 814 {
 815     return atomic_read(&throttle_percentage);
 816 }
 817
 818 void cpu_ticks_init(void)
 819 {
 820     seqlock_init(&timers_state.vm_clock_seqlock);
 821     qemu_spin_init(&timers_state.vm_clock_lock);
 822     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 823     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 824                                            cpu_throttle_timer_tick, NULL);
 825 }
 826
 827 void configure_icount(QemuOpts *opts, Error **errp)
 828 {
 829     const char *option = qemu_opt_get(opts, "shift");
 830     bool sleep = qemu_opt_get_bool(opts, "sleep", true);
 831     bool align = qemu_opt_get_bool(opts, "align", false);
 832     long time_shift = -1;
 833
 834     if (!option) {
 835         if (qemu_opt_get(opts, "align") != NULL) {
 836             error_setg(errp, "Please specify shift option when using align");
 837         }
 838         return;
 839     }
 840
 841     if (align && !sleep) {
 842         error_setg(errp, "align=on and sleep=off are incompatible");
 843         return;
 844     }
 845
 846     if (strcmp(option, "auto") != 0) {
 847         if (qemu_strtol(option, NULL, 0, &time_shift) < 0
 848             || time_shift < 0 || time_shift > MAX_ICOUNT_SHIFT) {
 849             error_setg(errp, "icount: Invalid shift value");
 850             return;
 851         }
 852     } else if (icount_align_option) {
 853         error_setg(errp, "shift=auto and align=on are incompatible");
 854         return;
 855     } else if (!icount_sleep) {
 856         error_setg(errp, "shift=auto and sleep=off are incompatible");
 857         return;
 858     }
 859
 860     icount_sleep = sleep;
 861     if (icount_sleep) {
 862         timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 863                                          icount_timer_cb, NULL);
 864     }
 865
 866     icount_align_option = align;
 867
 868     if (time_shift >= 0) {
 869         timers_state.icount_time_shift = time_shift;
 870         use_icount = 1;
 871         return;
 872     }
 873
 874     use_icount = 2;
 875
 876     /* 125MIPS seems a reasonable initial guess at the guest speed.
 877        It will be corrected fairly quickly anyway.  */
 878     timers_state.icount_time_shift = 3;
 879
 880     /* Have both realtime and virtual time triggers for speed adjustment.
 881        The realtime trigger catches emulated time passing too slowly,
 882        the virtual time trigger catches emulated time passing too fast.
 883        Realtime triggers occur even when idle, so use them less frequently
 884        than VM triggers.  */
 885     timers_state.vm_clock_warp_start = -1;
 886     timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 887                                    icount_adjust_rt, NULL);
 888     timer_mod(timers_state.icount_rt_timer,
 889                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 890     timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 891                                         icount_adjust_vm, NULL);
 892     timer_mod(timers_state.icount_vm_timer,
 893                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 894                    NANOSECONDS_PER_SECOND / 10);
 895 }
 896
 897 /***********************************************************/
 898 /* TCG vCPU kick timer
 899  *
 900  * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 904  *
 905  * The timer is removed if all vCPUs are idle and restarted again once
 906  * idleness is complete.
 907  */
 908
/* Timer whose callback (kick_tcg_thread) kicks the current round-robin vCPU */
static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU currently picked by the round-robin loop; NULL between rounds */
static CPUState *tcg_current_rr_cpu;

/* Interval between two kicks, in nanoseconds (one tenth of a second) */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 913
 914 static inline int64_t qemu_tcg_next_kick(void)
 915 {
 916     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 917 }
 918
/*
 * Kick the currently round-robin scheduled vCPU to next.
 *
 * tcg_current_rr_cpu is read again after the kick; if it no longer
 * matches the vCPU that was kicked, the loop repeats so that the newly
 * selected vCPU gets kicked as well.  Nothing is kicked while the
 * pointer reads as NULL.
 */
static void qemu_cpu_kick_rr_next_cpu(void)
{
    CPUState *cpu;
    do {
        cpu = atomic_mb_read(&tcg_current_rr_cpu);
        if (cpu) {
            cpu_exit(cpu);
        }
    } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
}
 930
 931 /* Kick all RR vCPUs */
 932 static void qemu_cpu_kick_rr_cpus(void)
 933 {
 934     CPUState *cpu;
 935
 936     CPU_FOREACH(cpu) {
 937         cpu_exit(cpu);
 938     };
 939 }
 940
/*
 * Deliberately empty work item.  qemu_timer_notify_cb() queues it with
 * async_run_on_cpu() purely for the side effect of having work pending.
 */
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}
 944
 945 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 946 {
 947     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 948         qemu_notify_event();
 949         return;
 950     }
 951
 952     if (qemu_in_vcpu_thread()) {
 953         /* A CPU is currently running; kick it back out to the
 954          * tcg_cpu_exec() loop so it will recalculate its
 955          * icount deadline immediately.
 956          */
 957         qemu_cpu_kick(current_cpu);
 958     } else if (first_cpu) {
 959         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 960          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 961          * causes cpu_thread_is_idle to return false.  This way,
 962          * handle_icount_deadline can run.
 963          * If we have no CPUs at all for some reason, we don't
 964          * need to do anything.
 965          */
 966         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 967     }
 968 }
 969
 970 static void kick_tcg_thread(void *opaque)
 971 {
 972     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 973     qemu_cpu_kick_rr_next_cpu();
 974 }
 975
 976 static void start_tcg_kick_timer(void)
 977 {
 978     assert(!mttcg_enabled);
 979     if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 980         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 981                                            kick_tcg_thread, NULL);
 982     }
 983     if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
 984         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 985     }
 986 }
 987
 988 static void stop_tcg_kick_timer(void)
 989 {
 990     assert(!mttcg_enabled);
 991     if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
 992         timer_del(tcg_kick_vcpu_timer);
 993     }
 994 }
 995
 996 /***********************************************************/
 997 void hw_error(const char *fmt, ...)
 998 {
 999     va_list ap;
1000     CPUState *cpu;
1001
1002     va_start(ap, fmt);
1003     fprintf(stderr, "qemu: hardware error: ");
1004     vfprintf(stderr, fmt, ap);
1005     fprintf(stderr, "\n");
1006     CPU_FOREACH(cpu) {
1007         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1008         cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1009     }
1010     va_end(ap);
1011     abort();
1012 }
1013
1014 void cpu_synchronize_all_states(void)
1015 {
1016     CPUState *cpu;
1017
1018     CPU_FOREACH(cpu) {
1019         cpu_synchronize_state(cpu);
1020         /* TODO: move to cpu_synchronize_state() */
1021         if (hvf_enabled()) {
1022             hvf_cpu_synchronize_state(cpu);
1023         }
1024     }
1025 }
1026
1027 void cpu_synchronize_all_post_reset(void)
1028 {
1029     CPUState *cpu;
1030
1031     CPU_FOREACH(cpu) {
1032         cpu_synchronize_post_reset(cpu);
1033         /* TODO: move to cpu_synchronize_post_reset() */
1034         if (hvf_enabled()) {
1035             hvf_cpu_synchronize_post_reset(cpu);
1036         }
1037     }
1038 }
1039
1040 void cpu_synchronize_all_post_init(void)
1041 {
1042     CPUState *cpu;
1043
1044     CPU_FOREACH(cpu) {
1045         cpu_synchronize_post_init(cpu);
1046         /* TODO: move to cpu_synchronize_post_init() */
1047         if (hvf_enabled()) {
1048             hvf_cpu_synchronize_post_init(cpu);
1049         }
1050     }
1051 }
1052
1053 void cpu_synchronize_all_pre_loadvm(void)
1054 {
1055     CPUState *cpu;
1056
1057     CPU_FOREACH(cpu) {
1058         cpu_synchronize_pre_loadvm(cpu);
1059     }
1060 }
1061
1062 static int do_vm_stop(RunState state, bool send_stop)
1063 {
1064     int ret = 0;
1065
1066     if (runstate_is_running()) {
1067         runstate_set(state);
1068         cpu_disable_ticks();
1069         pause_all_vcpus();
1070         vm_state_notify(0, state);
1071         if (send_stop) {
1072             qapi_event_send_stop();
1073         }
1074     }
1075
1076     bdrv_drain_all();
1077     ret = bdrv_flush_all();
1078
1079     return ret;
1080 }
1081
1082 /* Special vm_stop() variant for terminating the process.  Historically clients
1083  * did not expect a QMP STOP event and so we need to retain compatibility.
1084  */
1085 int vm_shutdown(void)
1086 {
1087     return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1088 }
1089
1090 static bool cpu_can_run(CPUState *cpu)
1091 {
1092     if (cpu->stop) {
1093         return false;
1094     }
1095     if (cpu_is_stopped(cpu)) {
1096         return false;
1097     }
1098     return true;
1099 }
1100
/*
 * Handle an EXCP_DEBUG exit: tell the gdbstub which vCPU stopped, raise a
 * system debug request and mark the vCPU as stopped.
 */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1107
1108 #ifdef CONFIG_LINUX
1109 static void sigbus_reraise(void)
1110 {
1111     sigset_t set;
1112     struct sigaction action;
1113
1114     memset(&action, 0, sizeof(action));
1115     action.sa_handler = SIG_DFL;
1116     if (!sigaction(SIGBUS, &action, NULL)) {
1117         raise(SIGBUS);
1118         sigemptyset(&set);
1119         sigaddset(&set, SIGBUS);
1120         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1121     }
1122     perror("Failed to re-raise SIGBUS!\n");
1123     abort();
1124 }
1125
1126 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1127 {
1128     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1129         sigbus_reraise();
1130     }
1131
1132     if (current_cpu) {
1133         /* Called asynchronously in VCPU thread.  */
1134         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1135             sigbus_reraise();
1136         }
1137     } else {
1138         /* Called synchronously (via signalfd) in main thread.  */
1139         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1140             sigbus_reraise();
1141         }
1142     }
1143 }
1144
1145 static void qemu_init_sigbus(void)
1146 {
1147     struct sigaction action;
1148
1149     memset(&action, 0, sizeof(action));
1150     action.sa_flags = SA_SIGINFO;
1151     action.sa_sigaction = sigbus_handler;
1152     sigaction(SIGBUS, &action, NULL);
1153
1154     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1155 }
1156 #else /* !CONFIG_LINUX */
/* Without CONFIG_LINUX no SIGBUS handling is set up. */
static void qemu_init_sigbus(void)
{
}
1160 #endif /* !CONFIG_LINUX */
1161
/* Identity of the thread that called qemu_init_cpu_loop() */
static QemuThread io_thread;

/* cpu creation: signalled by vCPU threads whenever cpu->created changes */
static QemuCond qemu_cpu_cond;
/* vCPU pausing: broadcast by qemu_cpu_stop(), waited on by pause_all_vcpus() */
static QemuCond qemu_pause_cond;
1168
1169 void qemu_init_cpu_loop(void)
1170 {
1171     qemu_init_sigbus();
1172     qemu_cond_init(&qemu_cpu_cond);
1173     qemu_cond_init(&qemu_pause_cond);
1174     qemu_mutex_init(&qemu_global_mutex);
1175
1176     qemu_thread_get_self(&io_thread);
1177 }
1178
1179 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1180 {
1181     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1182 }
1183
1184 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1185 {
1186     if (kvm_destroy_vcpu(cpu) < 0) {
1187         error_report("kvm_destroy_vcpu failed");
1188         exit(EXIT_FAILURE);
1189     }
1190 }
1191
/* TCG counterpart of the per-accelerator vCPU teardown; currently a no-op. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1195
/*
 * Mark @cpu as stopped and wake everyone waiting on qemu_pause_cond.
 * Must be called from the thread that owns @cpu (asserted).
 * @exit: when true, also call cpu_exit() on the vCPU.
 */
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    /* The stop request has been honoured: clear it and record the state */
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}
1206
/*
 * Common tail of the wait-for-event paths: clear the "kicked" flag,
 * honour a pending stop request, then run the work queued for @cpu.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    /* Re-enable kicks; qemu_cpu_kick_thread() skips already-kicked vCPUs */
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
1215
/*
 * Round-robin TCG wait: while every vCPU thread is idle, stop the kick
 * timer and sleep on first_cpu's halt condition.  Once there is work
 * again, restart the kick timer and run the common event handling for
 * each vCPU.
 */
static void qemu_tcg_rr_wait_io_event(void)
{
    CPUState *cpu;

    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}
1231
/*
 * Sleep on @cpu's halt condition for as long as its thread is idle, then
 * run the common event handling.  The plugin idle callback fires once
 * before the first sleep, and the resume callback fires only if the
 * vCPU actually slept.
 */
static void qemu_wait_io_event(CPUState *cpu)
{
    /* Set once the idle callback has been issued for this wait */
    bool slept = false;

    while (cpu_thread_is_idle(cpu)) {
        if (!slept) {
            slept = true;
            qemu_plugin_vcpu_idle_cb(cpu);
        }
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }
    if (slept) {
        qemu_plugin_vcpu_resume_cb(cpu);
    }

#ifdef _WIN32
    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
    if (!tcg_enabled()) {
        SleepEx(0, TRUE);
    }
#endif
    qemu_wait_io_event_common(cpu);
}
1255
/*
 * Thread function for one KVM vCPU; @arg is the CPUState.
 *
 * Takes the iothread lock, initialises the KVM vCPU and its signal
 * setup, announces creation through qemu_cpu_cond, then alternates
 * between kvm_cpu_exec() and qemu_wait_io_event() until the vCPU has
 * been unplugged and can no longer run.  On the way out the KVM vCPU is
 * destroyed, qemu_cpu_cond is signalled again and the lock is released.
 * Always returns NULL.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        /* kvm_init_vcpu() reports failure as a negated errno value */
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU teardown */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1299
1300 static void *qemu_dummy_cpu_thread_fn(void *arg)
1301 {
1302 #ifdef _WIN32
1303     error_report("qtest is not supported under Windows");
1304     exit(1);
1305 #else
1306     CPUState *cpu = arg;
1307     sigset_t waitset;
1308     int r;
1309
1310     rcu_register_thread();
1311
1312     qemu_mutex_lock_iothread();
1313     qemu_thread_get_self(cpu->thread);
1314     cpu->thread_id = qemu_get_thread_id();
1315     cpu->can_do_io = 1;
1316     current_cpu = cpu;
1317
1318     sigemptyset(&waitset);
1319     sigaddset(&waitset, SIG_IPI);
1320
1321     /* signal CPU creation */
1322     cpu->created = true;
1323     qemu_cond_signal(&qemu_cpu_cond);
1324     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1325
1326     do {
1327         qemu_mutex_unlock_iothread();
1328         do {
1329             int sig;
1330             r = sigwait(&waitset, &sig);
1331         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1332         if (r == -1) {
1333             perror("sigwait");
1334             exit(1);
1335         }
1336         qemu_mutex_lock_iothread();
1337         qemu_wait_io_event(cpu);
1338     } while (!cpu->unplug);
1339
1340     qemu_mutex_unlock_iothread();
1341     rcu_unregister_thread();
1342     return NULL;
1343 #endif
1344 }
1345
1346 static int64_t tcg_get_icount_limit(void)
1347 {
1348     int64_t deadline;
1349
1350     if (replay_mode != REPLAY_MODE_PLAY) {
1351         /*
1352          * Include all the timers, because they may need an attention.
1353          * Too long CPU execution may create unnecessary delay in UI.
1354          */
1355         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1356                                               QEMU_TIMER_ATTR_ALL);
1357         /* Check realtime timers, because they help with input processing */
1358         deadline = qemu_soonest_timeout(deadline,
1359                 qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
1360                                            QEMU_TIMER_ATTR_ALL));
1361
1362         /* Maintain prior (possibly buggy) behaviour where if no deadline
1363          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1364          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1365          * nanoseconds.
1366          */
1367         if ((deadline < 0) || (deadline > INT32_MAX)) {
1368             deadline = INT32_MAX;
1369         }
1370
1371         return qemu_icount_round(deadline);
1372     } else {
1373         return replay_get_instructions();
1374     }
1375 }
1376
1377 static void handle_icount_deadline(void)
1378 {
1379     assert(qemu_in_vcpu_thread());
1380     if (use_icount) {
1381         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1382                                                       QEMU_TIMER_ATTR_ALL);
1383
1384         if (deadline == 0) {
1385             /* Wake up other AioContexts.  */
1386             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1387             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1388         }
1389     }
1390 }
1391
1392 static void prepare_icount_for_run(CPUState *cpu)
1393 {
1394     if (use_icount) {
1395         int insns_left;
1396
1397         /* These should always be cleared by process_icount_data after
1398          * each vCPU execution. However u16.high can be raised
1399          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1400          */
1401         g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1402         g_assert(cpu->icount_extra == 0);
1403
1404         cpu->icount_budget = tcg_get_icount_limit();
1405         insns_left = MIN(0xffff, cpu->icount_budget);
1406         cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1407         cpu->icount_extra = cpu->icount_budget - insns_left;
1408
1409         replay_mutex_lock();
1410     }
1411 }
1412
1413 static void process_icount_data(CPUState *cpu)
1414 {
1415     if (use_icount) {
1416         /* Account for executed instructions */
1417         cpu_update_icount(cpu);
1418
1419         /* Reset the counters */
1420         cpu_neg(cpu)->icount_decr.u16.low = 0;
1421         cpu->icount_extra = 0;
1422         cpu->icount_budget = 0;
1423
1424         replay_account_executed_instructions();
1425
1426         replay_mutex_unlock();
1427     }
1428 }
1429
1430
1431 static int tcg_cpu_exec(CPUState *cpu)
1432 {
1433     int ret;
1434 #ifdef CONFIG_PROFILER
1435     int64_t ti;
1436 #endif
1437
1438     assert(tcg_enabled());
1439 #ifdef CONFIG_PROFILER
1440     ti = profile_getclock();
1441 #endif
1442     cpu_exec_start(cpu);
1443     ret = cpu_exec(cpu);
1444     cpu_exec_end(cpu);
1445 #ifdef CONFIG_PROFILER
1446     atomic_set(&tcg_ctx->prof.cpu_exec_time,
1447                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1448 #endif
1449     return ret;
1450 }
1451
1452 /* Destroy any remaining vCPUs which have been unplugged and have
1453  * finished running
1454  */
1455 static void deal_with_unplugged_cpus(void)
1456 {
1457     CPUState *cpu;
1458
1459     CPU_FOREACH(cpu) {
1460         if (cpu->unplug && !cpu_can_run(cpu)) {
1461             qemu_tcg_destroy_vcpu(cpu);
1462             cpu->created = false;
1463             qemu_cond_signal(&qemu_cpu_cond);
1464             break;
1465         }
1466     }
1467 }
1468
1469 /* Single-threaded TCG
1470  *
1471  * In the single-threaded case each vCPU is simulated in turn. If
1472  * there is more than a single vCPU we create a simple timer to kick
1473  * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1474  * This is done explicitly rather than relying on side-effects
1475  * elsewhere.
1476  */
1477
/*
 * Thread function for round-robin TCG; @arg is the CPUState the thread
 * was created for.
 *
 * After announcing creation through qemu_cpu_cond, the thread waits
 * until first_cpu is no longer stopped and then loops forever.  Each
 * iteration handles the icount deadline, runs vCPUs one after another
 * until one has queued work or an exit request, and finally waits for
 * events and deals with unplugged vCPUs.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /*
         * The iothread lock is dropped before the replay mutex is taken
         * and then reacquired, so the replay mutex is never requested
         * while the iothread lock is held.
         */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* Wrap around to the head of the list after the last vCPU */
        if (!cpu) {
            cpu = first_cpu;
        }

        while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code runs without the iothread lock held */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* Skip past a vCPU that is being unplugged */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    /* Not reached: the loop above has no exit path */
    rcu_unregister_thread();
    return NULL;
}
1591
1592 static void *qemu_hax_cpu_thread_fn(void *arg)
1593 {
1594     CPUState *cpu = arg;
1595     int r;
1596
1597     rcu_register_thread();
1598     qemu_mutex_lock_iothread();
1599     qemu_thread_get_self(cpu->thread);
1600
1601     cpu->thread_id = qemu_get_thread_id();
1602     cpu->created = true;
1603     current_cpu = cpu;
1604
1605     hax_init_vcpu(cpu);
1606     qemu_cond_signal(&qemu_cpu_cond);
1607     qemu_guest_random_seed_thread_part2(cpu->random_seed);
1608
1609     do {
1610         if (cpu_can_run(cpu)) {
1611             r = hax_smp_cpu_exec(cpu);
1612             if (r == EXCP_DEBUG) {
1613                 cpu_handle_guest_debug(cpu);
1614             }
1615         }
1616
1617         qemu_wait_io_event(cpu);
1618     } while (!cpu->unplug || cpu_can_run(cpu));
1619     rcu_unregister_thread();
1620     return NULL;
1621 }
1622
/* The HVF-specific vCPU thread function. This one should only run when the host
 * CPU supports the VMX "unrestricted guest" feature.
 *
 * @arg is the CPUState.  The thread takes the iothread lock, initialises
 * the HVF vCPU, announces creation through qemu_cpu_cond, then alternates
 * between hvf_vcpu_exec() and qemu_wait_io_event() until the vCPU has
 * been unplugged and can no longer run.  On exit the HVF vCPU is
 * destroyed, qemu_cpu_cond is signalled again and the lock is released.
 * Always returns NULL.
 */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU teardown */
    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1666
/*
 * Thread function for one WHPX vCPU; @arg is the CPUState.
 *
 * Takes the iothread lock, initialises the WHPX vCPU (exiting the process
 * on failure), announces creation through qemu_cpu_cond, then alternates
 * between whpx_vcpu_exec() and waiting for events until the vCPU has been
 * unplugged and can no longer run.  On exit the WHPX vCPU is destroyed,
 * qemu_cpu_cond is signalled again and the lock is released.
 * Always returns NULL.
 */
static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        /* whpx_init_vcpu() reports failure as a negated errno value */
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        /*
         * Open-coded wait: unlike qemu_wait_io_event(), this path has no
         * plugin callbacks and no SleepEx() call.
         */
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU teardown */
    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1710
1711 #ifdef _WIN32
/* Empty APC routine; qemu_cpu_kick_thread() queues it with QueueUserAPC(). */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1715 #endif
1716
1717 /* Multi-threaded TCG
1718  *
1719  * In the multi-threaded case each vCPU has its own thread. The TLS
1720  * variable current_cpu can be used deep in the code to find the
1721  * current CPUState for a given thread.
1722  */
1723
/*
 * Thread function for one vCPU under multi-threaded TCG; @arg is the
 * CPUState.  icount must be off (asserted).
 *
 * After announcing creation through qemu_cpu_cond, the thread alternates
 * between tcg_cpu_exec() (run without the iothread lock) and
 * qemu_wait_io_event() until the vCPU has been unplugged and can no
 * longer run.  On exit qemu_cpu_cond is signalled again and the lock is
 * released.  Always returns NULL.
 */
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    g_assert(!use_icount);

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* process any pending work */
    cpu->exit_request = 1;

    do {
        if (cpu_can_run(cpu)) {
            int r;
            /* Guest code runs without the iothread lock held */
            qemu_mutex_unlock_iothread();
            r = tcg_cpu_exec(cpu);
            qemu_mutex_lock_iothread();
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start-up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
                /* fall through */
            default:
                /* Ignore everything else? */
                break;
            }
        }

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU teardown */
    qemu_tcg_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1788
1789 static void qemu_cpu_kick_thread(CPUState *cpu)
1790 {
1791 #ifndef _WIN32
1792     int err;
1793
1794     if (cpu->thread_kicked) {
1795         return;
1796     }
1797     cpu->thread_kicked = true;
1798     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1799     if (err && err != ESRCH) {
1800         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1801         exit(1);
1802     }
1803 #else /* _WIN32 */
1804     if (!qemu_cpu_is_self(cpu)) {
1805         if (whpx_enabled()) {
1806             whpx_vcpu_kick(cpu);
1807         } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1808             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1809                     __func__, GetLastError());
1810             exit(1);
1811         }
1812     }
1813 #endif
1814 }
1815
1816 void qemu_cpu_kick(CPUState *cpu)
1817 {
1818     qemu_cond_broadcast(cpu->halt_cond);
1819     if (tcg_enabled()) {
1820         if (qemu_tcg_mttcg_enabled()) {
1821             cpu_exit(cpu);
1822         } else {
1823             qemu_cpu_kick_rr_cpus();
1824         }
1825     } else {
1826         if (hax_enabled()) {
1827             /*
1828              * FIXME: race condition with the exit_request check in
1829              * hax_vcpu_hax_exec
1830              */
1831             cpu->exit_request = 1;
1832         }
1833         qemu_cpu_kick_thread(cpu);
1834     }
1835 }
1836
1837 void qemu_cpu_kick_self(void)
1838 {
1839     assert(current_cpu);
1840     qemu_cpu_kick_thread(current_cpu);
1841 }
1842
1843 bool qemu_cpu_is_self(CPUState *cpu)
1844 {
1845     return qemu_thread_is_self(cpu->thread);
1846 }
1847
1848 bool qemu_in_vcpu_thread(void)
1849 {
1850     return current_cpu && qemu_cpu_is_self(current_cpu);
1851 }
1852
/* Per-thread flag: true while this thread holds the iothread lock */
static __thread bool iothread_locked = false;
1854
1855 bool qemu_mutex_iothread_locked(void)
1856 {
1857     return iothread_locked;
1858 }
1859
/*
 * The BQL is taken from so many places that it is worth profiling the
 * callers directly, instead of funneling them all through a single function.
 *
 * @file and @line identify the call site and are forwarded to the lock
 * function.  The calling thread must not already hold the lock (asserted).
 */
void qemu_mutex_lock_iothread_impl(const char *file, int line)
{
    /* The lock function is read atomically from qemu_bql_mutex_lock_func */
    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);

    g_assert(!qemu_mutex_iothread_locked());
    bql_lock(&qemu_global_mutex, file, line);
    iothread_locked = true;
}
1872
/*
 * Release the iothread lock.  The calling thread must hold it (asserted).
 * The per-thread flag is cleared before the mutex itself is released.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1879
1880 void qemu_cond_wait_iothread(QemuCond *cond)
1881 {
1882     qemu_cond_wait(cond, &qemu_global_mutex);
1883 }
1884
1885 static bool all_vcpus_paused(void)
1886 {
1887     CPUState *cpu;
1888
1889     CPU_FOREACH(cpu) {
1890         if (!cpu->stopped) {
1891             return false;
1892         }
1893     }
1894
1895     return true;
1896 }
1897
/*
 * Stop every vCPU and wait until all of them report being stopped.
 *
 * QEMU_CLOCK_VIRTUAL is disabled first.  A vCPU owned by the calling
 * thread is stopped directly; the others get a stop request and a kick.
 * NOTE(review): the replay_mutex_unlock() below presumably relies on the
 * caller holding the replay mutex -- confirm against callers.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    /* Keep kicking after each wakeup until every vCPU has stopped */
    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /*
     * Retake the replay mutex with the iothread lock dropped, then take
     * the iothread lock again.
     */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1928
/* Clear both the stop request and the stopped state of @cpu, then kick it. */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1935
1936 void resume_all_vcpus(void)
1937 {
1938     CPUState *cpu;
1939
1940     if (!runstate_is_running()) {
1941         return;
1942     }
1943
1944     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1945     CPU_FOREACH(cpu) {
1946         cpu_resume(cpu);
1947     }
1948 }
1949
/*
 * Request that @cpu stop and be unplugged, kick it, and wait for its
 * thread to finish.  The iothread lock is released around the join and
 * reacquired afterwards; the vCPU thread functions in this file hold
 * that lock while running their loops.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1959
/*
 * For temporary buffers for forming a name.  The value is passed to
 * snprintf() as the buffer size when vCPU thread names are built.
 */
#define VCPU_THREAD_NAME_SIZE 16
1962
1963 static void qemu_tcg_init_vcpu(CPUState *cpu)
1964 {
1965     char thread_name[VCPU_THREAD_NAME_SIZE];
1966     static QemuCond *single_tcg_halt_cond;
1967     static QemuThread *single_tcg_cpu_thread;
1968     static int tcg_region_inited;
1969
1970     assert(tcg_enabled());
1971     /*
1972      * Initialize TCG regions--once. Now is a good time, because:
1973      * (1) TCG's init context, prologue and target globals have been set up.
1974      * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1975      *     -accel flag is processed, so the check doesn't work then).
1976      */
1977     if (!tcg_region_inited) {
1978         tcg_region_inited = 1;
1979         tcg_region_init();
1980     }
1981
1982     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1983         cpu->thread = g_malloc0(sizeof(QemuThread));
1984         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1985         qemu_cond_init(cpu->halt_cond);
1986
1987         if (qemu_tcg_mttcg_enabled()) {
1988             /* create a thread per vCPU with TCG (MTTCG) */
1989             parallel_cpus = true;
1990             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1991                  cpu->cpu_index);
1992
1993             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1994                                cpu, QEMU_THREAD_JOINABLE);
1995
1996         } else {
1997             /* share a single thread for all cpus with TCG */
1998             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1999             qemu_thread_create(cpu->thread, thread_name,
2000                                qemu_tcg_rr_cpu_thread_fn,
2001                                cpu, QEMU_THREAD_JOINABLE);
2002
2003             single_tcg_halt_cond = cpu->halt_cond;
2004             single_tcg_cpu_thread = cpu->thread;
2005         }
2006 #ifdef _WIN32
2007         cpu->hThread = qemu_thread_get_handle(cpu->thread);
2008 #endif
2009     } else {
2010         /* For non-MTTCG cases we share the thread */
2011         cpu->thread = single_tcg_cpu_thread;
2012         cpu->halt_cond = single_tcg_halt_cond;
2013         cpu->thread_id = first_cpu->thread_id;
2014         cpu->can_do_io = 1;
2015         cpu->created = true;
2016     }
2017 }
2018
2019 static void qemu_hax_start_vcpu(CPUState *cpu)
2020 {
2021     char thread_name[VCPU_THREAD_NAME_SIZE];
2022
2023     cpu->thread = g_malloc0(sizeof(QemuThread));
2024     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2025     qemu_cond_init(cpu->halt_cond);
2026
2027     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2028              cpu->cpu_index);
2029     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2030                        cpu, QEMU_THREAD_JOINABLE);
2031 #ifdef _WIN32
2032     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2033 #endif
2034 }
2035
2036 static void qemu_kvm_start_vcpu(CPUState *cpu)
2037 {
2038     char thread_name[VCPU_THREAD_NAME_SIZE];
2039
2040     cpu->thread = g_malloc0(sizeof(QemuThread));
2041     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2042     qemu_cond_init(cpu->halt_cond);
2043     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2044              cpu->cpu_index);
2045     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2046                        cpu, QEMU_THREAD_JOINABLE);
2047 }
2048
2049 static void qemu_hvf_start_vcpu(CPUState *cpu)
2050 {
2051     char thread_name[VCPU_THREAD_NAME_SIZE];
2052
2053     /* HVF currently does not support TCG, and only runs in
2054      * unrestricted-guest mode. */
2055     assert(hvf_enabled());
2056
2057     cpu->thread = g_malloc0(sizeof(QemuThread));
2058     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2059     qemu_cond_init(cpu->halt_cond);
2060
2061     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2062              cpu->cpu_index);
2063     qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2064                        cpu, QEMU_THREAD_JOINABLE);
2065 }
2066
2067 static void qemu_whpx_start_vcpu(CPUState *cpu)
2068 {
2069     char thread_name[VCPU_THREAD_NAME_SIZE];
2070
2071     cpu->thread = g_malloc0(sizeof(QemuThread));
2072     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2073     qemu_cond_init(cpu->halt_cond);
2074     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2075              cpu->cpu_index);
2076     qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2077                        cpu, QEMU_THREAD_JOINABLE);
2078 #ifdef _WIN32
2079     cpu->hThread = qemu_thread_get_handle(cpu->thread);
2080 #endif
2081 }
2082
2083 static void qemu_dummy_start_vcpu(CPUState *cpu)
2084 {
2085     char thread_name[VCPU_THREAD_NAME_SIZE];
2086
2087     cpu->thread = g_malloc0(sizeof(QemuThread));
2088     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2089     qemu_cond_init(cpu->halt_cond);
2090     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2091              cpu->cpu_index);
2092     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2093                        QEMU_THREAD_JOINABLE);
2094 }
2095
2096 void qemu_init_vcpu(CPUState *cpu)
2097 {
2098     MachineState *ms = MACHINE(qdev_get_machine());
2099
2100     cpu->nr_cores = ms->smp.cores;
2101     cpu->nr_threads =  ms->smp.threads;
2102     cpu->stopped = true;
2103     cpu->random_seed = qemu_guest_random_seed_thread_part1();
2104
2105     if (!cpu->as) {
2106         /* If the target cpu hasn't set up any address spaces itself,
2107          * give it the default one.
2108          */
2109         cpu->num_ases = 1;
2110         cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2111     }
2112
2113     if (kvm_enabled()) {
2114         qemu_kvm_start_vcpu(cpu);
2115     } else if (hax_enabled()) {
2116         qemu_hax_start_vcpu(cpu);
2117     } else if (hvf_enabled()) {
2118         qemu_hvf_start_vcpu(cpu);
2119     } else if (tcg_enabled()) {
2120         qemu_tcg_init_vcpu(cpu);
2121     } else if (whpx_enabled()) {
2122         qemu_whpx_start_vcpu(cpu);
2123     } else {
2124         qemu_dummy_start_vcpu(cpu);
2125     }
2126
2127     while (!cpu->created) {
2128         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2129     }
2130 }
2131
2132 void cpu_stop_current(void)
2133 {
2134     if (current_cpu) {
2135         current_cpu->stop = true;
2136         cpu_exit(current_cpu);
2137     }
2138 }
2139
2140 int vm_stop(RunState state)
2141 {
2142     if (qemu_in_vcpu_thread()) {
2143         qemu_system_vmstop_request_prepare();
2144         qemu_system_vmstop_request(state);
2145         /*
2146          * FIXME: should not return to device code in case
2147          * vm_stop() has been requested.
2148          */
2149         cpu_stop_current();
2150         return 0;
2151     }
2152
2153     return do_vm_stop(state, true);
2154 }
2155
2156 /**
2157  * Prepare for (re)starting the VM.
2158  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2159  * running or in case of an error condition), 0 otherwise.
2160  */
2161 int vm_prepare_start(void)
2162 {
2163     RunState requested;
2164
2165     qemu_vmstop_requested(&requested);
2166     if (runstate_is_running() && requested == RUN_STATE__MAX) {
2167         return -1;
2168     }
2169
2170     /* Ensure that a STOP/RESUME pair of events is emitted if a
2171      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2172      * example, according to documentation is always followed by
2173      * the STOP event.
2174      */
2175     if (runstate_is_running()) {
2176         qapi_event_send_stop();
2177         qapi_event_send_resume();
2178         return -1;
2179     }
2180
2181     /* We are sending this now, but the CPUs will be resumed shortly later */
2182     qapi_event_send_resume();
2183
2184     cpu_enable_ticks();
2185     runstate_set(RUN_STATE_RUNNING);
2186     vm_state_notify(1, RUN_STATE_RUNNING);
2187     return 0;
2188 }
2189
/*
 * Start the VM: resume all vCPUs, but only if vm_prepare_start() returned
 * 0 to say that they are to be restarted.
 */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
2196
2197 /* does a state transition even if the VM is already stopped,
2198    current state is forgotten forever */
2199 int vm_stop_force_state(RunState state)
2200 {
2201     if (runstate_is_running()) {
2202         return vm_stop(state);
2203     } else {
2204         runstate_set(state);
2205
2206         bdrv_drain_all();
2207         /* Make sure to return an error if the flush in a previous vm_stop()
2208          * failed. */
2209         return bdrv_flush_all();
2210     }
2211 }
2212
/*
 * List the available CPU models by invoking the target's cpu_list macro,
 * if the target defines one.  @optarg is not used.
 */
void list_cpus(const char *optarg)
{
    /* XXX: targets still lacking xxx_cpu_list need to implement it */
#ifdef cpu_list
    cpu_list();
#endif
}
2220
2221 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2222                  bool has_cpu, int64_t cpu_index, Error **errp)
2223 {
2224     FILE *f;
2225     uint32_t l;
2226     CPUState *cpu;
2227     uint8_t buf[1024];
2228     int64_t orig_addr = addr, orig_size = size;
2229
2230     if (!has_cpu) {
2231         cpu_index = 0;
2232     }
2233
2234     cpu = qemu_get_cpu(cpu_index);
2235     if (cpu == NULL) {
2236         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2237                    "a CPU number");
2238         return;
2239     }
2240
2241     f = fopen(filename, "wb");
2242     if (!f) {
2243         error_setg_file_open(errp, errno, filename);
2244         return;
2245     }
2246
2247     while (size != 0) {
2248         l = sizeof(buf);
2249         if (l > size)
2250             l = size;
2251         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2252             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2253                              " specified", orig_addr, orig_size);
2254             goto exit;
2255         }
2256         if (fwrite(buf, 1, l, f) != l) {
2257             error_setg(errp, QERR_IO_ERROR);
2258             goto exit;
2259         }
2260         addr += l;
2261         size -= l;
2262     }
2263
2264 exit:
2265     fclose(f);
2266 }
2267
2268 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2269                   Error **errp)
2270 {
2271     FILE *f;
2272     uint32_t l;
2273     uint8_t buf[1024];
2274
2275     f = fopen(filename, "wb");
2276     if (!f) {
2277         error_setg_file_open(errp, errno, filename);
2278         return;
2279     }
2280
2281     while (size != 0) {
2282         l = sizeof(buf);
2283         if (l > size)
2284             l = size;
2285         cpu_physical_memory_read(addr, buf, l);
2286         if (fwrite(buf, 1, l, f) != l) {
2287             error_setg(errp, QERR_IO_ERROR);
2288             goto exit;
2289         }
2290         addr += l;
2291         size -= l;
2292     }
2293
2294 exit:
2295     fclose(f);
2296 }
2297
2298 void qmp_inject_nmi(Error **errp)
2299 {
2300     nmi_monitor_handle(monitor_get_cpu_index(), errp);
2301 }
2302
2303 void dump_drift_info(void)
2304 {
2305     if (!use_icount) {
2306         return;
2307     }
2308
2309     qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2310                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2311     if (icount_align_option) {
2312         qemu_printf("Max guest delay     %"PRIi64" ms\n",
2313                     -max_delay / SCALE_MS);
2314         qemu_printf("Max guest advance   %"PRIi64" ms\n",
2315                     max_advance / SCALE_MS);
2316     } else {
2317         qemu_printf("Max guest delay     NA\n");
2318         qemu_printf("Max guest advance   NA\n");
2319     }
2320 }