1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "qemu/osdep.h"
26 #include "qemu/config-file.h"
27 #include "cpu.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "qemu/qemu-print.h"
35 #include "sysemu/sysemu.h"
36 #include "sysemu/tcg.h"
37 #include "sysemu/block-backend.h"
38 #include "exec/gdbstub.h"
39 #include "sysemu/dma.h"
40 #include "sysemu/hw_accel.h"
41 #include "sysemu/kvm.h"
42 #include "sysemu/hax.h"
43 #include "sysemu/hvf.h"
44 #include "sysemu/whpx.h"
45 #include "exec/exec-all.h"
46
47 #include "qemu/thread.h"
48 #include "sysemu/cpus.h"
49 #include "sysemu/qtest.h"
50 #include "qemu/main-loop.h"
51 #include "qemu/option.h"
52 #include "qemu/bitmap.h"
53 #include "qemu/seqlock.h"
54 #include "qemu/guest-random.h"
55 #include "tcg.h"
56 #include "hw/nmi.h"
57 #include "sysemu/replay.h"
58 #include "hw/boards.h"
59
60 #ifdef CONFIG_LINUX
61
62 #include <sys/prctl.h>
63
64 #ifndef PR_MCE_KILL
65 #define PR_MCE_KILL 33
66 #endif
67
68 #ifndef PR_MCE_KILL_SET
69 #define PR_MCE_KILL_SET 1
70 #endif
71
72 #ifndef PR_MCE_KILL_EARLY
73 #define PR_MCE_KILL_EARLY 1
74 #endif
75
76 #endif /* CONFIG_LINUX */
77
78 int64_t max_delay;
79 int64_t max_advance;
80
81 /* vcpu throttling controls */
82 static QEMUTimer *throttle_timer;
83 static unsigned int throttle_percentage;
84
85 #define CPU_THROTTLE_PCT_MIN 1
86 #define CPU_THROTTLE_PCT_MAX 99
87 #define CPU_THROTTLE_TIMESLICE_NS 10000000
88
89 bool cpu_is_stopped(CPUState *cpu)
90 {
91 return cpu->stopped || !runstate_is_running();
92 }
93
94 static bool cpu_thread_is_idle(CPUState *cpu)
95 {
96 if (cpu->stop || cpu->queued_work_first) {
97 return false;
98 }
99 if (cpu_is_stopped(cpu)) {
100 return true;
101 }
102 if (!cpu->halted || cpu_has_work(cpu) ||
103 kvm_halt_in_kernel()) {
104 return false;
105 }
106 return true;
107 }
108
109 static bool all_cpu_threads_idle(void)
110 {
111 CPUState *cpu;
112
113 CPU_FOREACH(cpu) {
114 if (!cpu_thread_is_idle(cpu)) {
115 return false;
116 }
117 }
118 return true;
119 }
120
121 /***********************************************************/
122 /* guest cycle counter */
123
124 /* Protected by TimersState seqlock */
125
126 static bool icount_sleep = true;
127 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
128 #define MAX_ICOUNT_SHIFT 10
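/* A shift of 10 means 2^10 ns per instruction, i.e. roughly 1 MIPS. */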
129
130 typedef struct TimersState {
131 /* Protected by BQL. */
132 int64_t cpu_ticks_prev;
133 int64_t cpu_ticks_offset;
134
135     /* Protect fields that can be read outside the
136      * BQL and written from multiple threads.
137      */
138 QemuSeqLock vm_clock_seqlock;
139 QemuSpin vm_clock_lock;
140
141 int16_t cpu_ticks_enabled;
142
143 /* Conversion factor from emulated instructions to virtual clock ticks. */
144 int16_t icount_time_shift;
145
146 /* Compensate for varying guest execution speed. */
147 int64_t qemu_icount_bias;
148
149 int64_t vm_clock_warp_start;
150 int64_t cpu_clock_offset;
151
152 /* Only written by TCG thread */
153 int64_t qemu_icount;
154
155 /* for adjusting icount */
156 QEMUTimer *icount_rt_timer;
157 QEMUTimer *icount_vm_timer;
158 QEMUTimer *icount_warp_timer;
159 } TimersState;
160
161 static TimersState timers_state;
162 bool mttcg_enabled;
163
164 /*
165 * We default to false if we know other options have been enabled
166  * which are currently incompatible with MTTCG. Once each
167  * guest (target) has been updated to support:
168  *   - atomic instructions
169  *   - memory ordering primitives (barriers)
170  * it can set the appropriate CONFIG flags in ${target}-softmmu.mak
171 *
172 * Once a guest architecture has been converted to the new primitives
173 * there are two remaining limitations to check.
174 *
175 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
176 * - The host must have a stronger memory order than the guest
177 *
178 * It may be possible in future to support strong guests on weak hosts
179 * but that will require tagging all load/stores in a guest with their
180 * implicit memory order requirements which would likely slow things
181 * down a lot.
182 */
183
184 static bool check_tcg_memory_orders_compatible(void)
185 {
186 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
187 return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
188 #else
189 return false;
190 #endif
191 }
192
193 static bool default_mttcg_enabled(void)
194 {
195 if (use_icount || TCG_OVERSIZED_GUEST) {
196 return false;
197 } else {
198 #ifdef TARGET_SUPPORTS_MTTCG
199 return check_tcg_memory_orders_compatible();
200 #else
201 return false;
202 #endif
203 }
204 }
205
206 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
207 {
208 const char *t = qemu_opt_get(opts, "thread");
209 if (t) {
210 if (strcmp(t, "multi") == 0) {
211 if (TCG_OVERSIZED_GUEST) {
212                 error_setg(errp, "No MTTCG when guest word size > host's");
213 } else if (use_icount) {
214 error_setg(errp, "No MTTCG when icount is enabled");
215 } else {
216 #ifndef TARGET_SUPPORTS_MTTCG
217 warn_report("Guest not yet converted to MTTCG - "
218 "you may get unexpected results");
219 #endif
220 if (!check_tcg_memory_orders_compatible()) {
221 warn_report("Guest expects a stronger memory ordering "
222 "than the host provides");
223 error_printf("This may cause strange/hard to debug errors\n");
224 }
225 mttcg_enabled = true;
226 }
227 } else if (strcmp(t, "single") == 0) {
228 mttcg_enabled = false;
229 } else {
230 error_setg(errp, "Invalid 'thread' setting %s", t);
231 }
232 } else {
233 mttcg_enabled = default_mttcg_enabled();
234 }
235 }
236
237 /* The current number of executed instructions is based on what we
238 * originally budgeted minus the current state of the decrementing
239 * icount counters in extra/u16.low.
240 */
241 static int64_t cpu_get_icount_executed(CPUState *cpu)
242 {
243 return (cpu->icount_budget -
244 (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
245 }
246
247 /*
248 * Update the global shared timer_state.qemu_icount to take into
249 * account executed instructions. This is done by the TCG vCPU
250 * thread so the main-loop can see time has moved forward.
251 */
252 static void cpu_update_icount_locked(CPUState *cpu)
253 {
254 int64_t executed = cpu_get_icount_executed(cpu);
255 cpu->icount_budget -= executed;
256
257 atomic_set_i64(&timers_state.qemu_icount,
258 timers_state.qemu_icount + executed);
259 }
260
261 /*
262 * Update the global shared timer_state.qemu_icount to take into
263 * account executed instructions. This is done by the TCG vCPU
264 * thread so the main-loop can see time has moved forward.
265 */
266 void cpu_update_icount(CPUState *cpu)
267 {
268 seqlock_write_lock(&timers_state.vm_clock_seqlock,
269 &timers_state.vm_clock_lock);
270 cpu_update_icount_locked(cpu);
271 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
272 &timers_state.vm_clock_lock);
273 }
274
275 static int64_t cpu_get_icount_raw_locked(void)
276 {
277 CPUState *cpu = current_cpu;
278
279 if (cpu && cpu->running) {
280 if (!cpu->can_do_io) {
281 error_report("Bad icount read");
282 exit(1);
283 }
284 /* Take into account what has run */
285 cpu_update_icount_locked(cpu);
286 }
287 /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
288 return atomic_read_i64(&timers_state.qemu_icount);
289 }
290
291 static int64_t cpu_get_icount_locked(void)
292 {
293 int64_t icount = cpu_get_icount_raw_locked();
294 return atomic_read_i64(&timers_state.qemu_icount_bias) +
295 cpu_icount_to_ns(icount);
296 }
297
298 int64_t cpu_get_icount_raw(void)
299 {
300 int64_t icount;
301 unsigned start;
302
303 do {
304 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
305 icount = cpu_get_icount_raw_locked();
306 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
307
308 return icount;
309 }
310
311 /* Return the virtual CPU time, based on the instruction counter. */
312 int64_t cpu_get_icount(void)
313 {
314 int64_t icount;
315 unsigned start;
316
317 do {
318 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
319 icount = cpu_get_icount_locked();
320 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
321
322 return icount;
323 }
324
325 int64_t cpu_icount_to_ns(int64_t icount)
326 {
327 return icount << atomic_read(&timers_state.icount_time_shift);
328 }
329
330 static int64_t cpu_get_ticks_locked(void)
331 {
332 int64_t ticks = timers_state.cpu_ticks_offset;
333 if (timers_state.cpu_ticks_enabled) {
334 ticks += cpu_get_host_ticks();
335 }
336
337 if (timers_state.cpu_ticks_prev > ticks) {
338         /* Non-increasing ticks may happen if the host uses software suspend. */
339 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
340 ticks = timers_state.cpu_ticks_prev;
341 }
342
343 timers_state.cpu_ticks_prev = ticks;
344 return ticks;
345 }
346
347 /* return the time elapsed in VM between vm_start and vm_stop. Unless
348 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
349 * counter.
350 */
351 int64_t cpu_get_ticks(void)
352 {
353 int64_t ticks;
354
355 if (use_icount) {
356 return cpu_get_icount();
357 }
358
359 qemu_spin_lock(&timers_state.vm_clock_lock);
360 ticks = cpu_get_ticks_locked();
361 qemu_spin_unlock(&timers_state.vm_clock_lock);
362 return ticks;
363 }
364
365 static int64_t cpu_get_clock_locked(void)
366 {
367 int64_t time;
368
369 time = timers_state.cpu_clock_offset;
370 if (timers_state.cpu_ticks_enabled) {
371 time += get_clock();
372 }
373
374 return time;
375 }
376
377 /* Return the monotonic time elapsed in VM, i.e.,
378 * the time between vm_start and vm_stop
379 */
380 int64_t cpu_get_clock(void)
381 {
382 int64_t ti;
383 unsigned start;
384
385 do {
386 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
387 ti = cpu_get_clock_locked();
388 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
389
390 return ti;
391 }
392
393 /* enable cpu_get_ticks()
394 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
395 */
396 void cpu_enable_ticks(void)
397 {
398 seqlock_write_lock(&timers_state.vm_clock_seqlock,
399 &timers_state.vm_clock_lock);
400 if (!timers_state.cpu_ticks_enabled) {
401 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
402 timers_state.cpu_clock_offset -= get_clock();
403 timers_state.cpu_ticks_enabled = 1;
404 }
405 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
406 &timers_state.vm_clock_lock);
407 }
408
409 /* disable cpu_get_ticks(): the clock is stopped. You must not call
410  * cpu_get_ticks() after this.
411 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
412 */
413 void cpu_disable_ticks(void)
414 {
415 seqlock_write_lock(&timers_state.vm_clock_seqlock,
416 &timers_state.vm_clock_lock);
417 if (timers_state.cpu_ticks_enabled) {
418 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
419 timers_state.cpu_clock_offset = cpu_get_clock_locked();
420 timers_state.cpu_ticks_enabled = 0;
421 }
422 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
423 &timers_state.vm_clock_lock);
424 }
425
426 /* Correlation between real and virtual time is always going to be
427 fairly approximate, so ignore small variation.
428 When the guest is idle real and virtual time will be aligned in
429 the IO wait loop. */
430 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
431
432 static void icount_adjust(void)
433 {
434 int64_t cur_time;
435 int64_t cur_icount;
436 int64_t delta;
437
438 /* Protected by TimersState mutex. */
439 static int64_t last_delta;
440
441 /* If the VM is not running, then do nothing. */
442 if (!runstate_is_running()) {
443 return;
444 }
445
446 seqlock_write_lock(&timers_state.vm_clock_seqlock,
447 &timers_state.vm_clock_lock);
448 cur_time = cpu_get_clock_locked();
449 cur_icount = cpu_get_icount_locked();
450
451 delta = cur_icount - cur_time;
452 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
453 if (delta > 0
454 && last_delta + ICOUNT_WOBBLE < delta * 2
455 && timers_state.icount_time_shift > 0) {
456 /* The guest is getting too far ahead. Slow time down. */
457 atomic_set(&timers_state.icount_time_shift,
458 timers_state.icount_time_shift - 1);
459 }
460 if (delta < 0
461 && last_delta - ICOUNT_WOBBLE > delta * 2
462 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
463 /* The guest is getting too far behind. Speed time up. */
464 atomic_set(&timers_state.icount_time_shift,
465 timers_state.icount_time_shift + 1);
466 }
467 last_delta = delta;
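    /* Recompute the bias so that cpu_get_icount_locked(), which returns
     * qemu_icount_bias + (qemu_icount << icount_time_shift), stays
     * continuous across the change of icount_time_shift above. */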
468 atomic_set_i64(&timers_state.qemu_icount_bias,
469 cur_icount - (timers_state.qemu_icount
470 << timers_state.icount_time_shift));
471 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
472 &timers_state.vm_clock_lock);
473 }
474
475 static void icount_adjust_rt(void *opaque)
476 {
477 timer_mod(timers_state.icount_rt_timer,
478 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
479 icount_adjust();
480 }
481
482 static void icount_adjust_vm(void *opaque)
483 {
484 timer_mod(timers_state.icount_vm_timer,
485 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
486 NANOSECONDS_PER_SECOND / 10);
487 icount_adjust();
488 }
489
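/* Convert a QEMU_CLOCK_VIRTUAL deadline in nanoseconds into an
 * instruction budget, rounding up so the budget fully covers the
 * deadline (the inverse of cpu_icount_to_ns()). */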
490 static int64_t qemu_icount_round(int64_t count)
491 {
492 int shift = atomic_read(&timers_state.icount_time_shift);
493 return (count + (1 << shift) - 1) >> shift;
494 }
495
496 static void icount_warp_rt(void)
497 {
498 unsigned seq;
499 int64_t warp_start;
500
501 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
502 * changes from -1 to another value, so the race here is okay.
503 */
504 do {
505 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
506 warp_start = timers_state.vm_clock_warp_start;
507 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
508
509 if (warp_start == -1) {
510 return;
511 }
512
513 seqlock_write_lock(&timers_state.vm_clock_seqlock,
514 &timers_state.vm_clock_lock);
515 if (runstate_is_running()) {
516 int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
517 cpu_get_clock_locked());
518 int64_t warp_delta;
519
520 warp_delta = clock - timers_state.vm_clock_warp_start;
521 if (use_icount == 2) {
522 /*
523 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
524 * far ahead of real time.
525 */
526 int64_t cur_icount = cpu_get_icount_locked();
527 int64_t delta = clock - cur_icount;
528 warp_delta = MIN(warp_delta, delta);
529 }
530 atomic_set_i64(&timers_state.qemu_icount_bias,
531 timers_state.qemu_icount_bias + warp_delta);
532 }
533 timers_state.vm_clock_warp_start = -1;
534 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
535 &timers_state.vm_clock_lock);
536
537 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
538 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
539 }
540 }
541
542 static void icount_timer_cb(void *opaque)
543 {
544 /* No need for a checkpoint because the timer already synchronizes
545 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
546 */
547 icount_warp_rt();
548 }
549
550 void qtest_clock_warp(int64_t dest)
551 {
552 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
553 AioContext *aio_context;
554 assert(qtest_enabled());
555 aio_context = qemu_get_aio_context();
556 while (clock < dest) {
557 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
558 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
559
560 seqlock_write_lock(&timers_state.vm_clock_seqlock,
561 &timers_state.vm_clock_lock);
562 atomic_set_i64(&timers_state.qemu_icount_bias,
563 timers_state.qemu_icount_bias + warp);
564 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
565 &timers_state.vm_clock_lock);
566
567 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
568 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
569 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
570 }
571 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
572 }
573
574 void qemu_start_warp_timer(void)
575 {
576 int64_t clock;
577 int64_t deadline;
578
579 if (!use_icount) {
580 return;
581 }
582
583 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
584 * do not fire, so computing the deadline does not make sense.
585 */
586 if (!runstate_is_running()) {
587 return;
588 }
589
590 if (replay_mode != REPLAY_MODE_PLAY) {
591 if (!all_cpu_threads_idle()) {
592 return;
593 }
594
595 if (qtest_enabled()) {
596 /* When testing, qtest commands advance icount. */
597 return;
598 }
599
600 replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
601 } else {
602 /* warp clock deterministically in record/replay mode */
603 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
604             /* vCPU is sleeping and warp can't be started.
605                It is probably a race condition: the notification sent
606                to the vCPU was processed in advance and the vCPU went to sleep.
607                Therefore we have to wake it up so it can do something. */
608 if (replay_has_checkpoint()) {
609 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
610 }
611 return;
612 }
613 }
614
615 /* We want to use the earliest deadline from ALL vm_clocks */
616 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
617 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
618 if (deadline < 0) {
619 static bool notified;
620 if (!icount_sleep && !notified) {
621 warn_report("icount sleep disabled and no active timers");
622 notified = true;
623 }
624 return;
625 }
626
627 if (deadline > 0) {
628 /*
629 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
630 * sleep. Otherwise, the CPU might be waiting for a future timer
631 * interrupt to wake it up, but the interrupt never comes because
632 * the vCPU isn't running any insns and thus doesn't advance the
633 * QEMU_CLOCK_VIRTUAL.
634 */
635 if (!icount_sleep) {
636 /*
637 * We never let VCPUs sleep in no sleep icount mode.
638 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
639 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
640 * It is useful when we want a deterministic execution time,
641 * isolated from host latencies.
642 */
643 seqlock_write_lock(&timers_state.vm_clock_seqlock,
644 &timers_state.vm_clock_lock);
645 atomic_set_i64(&timers_state.qemu_icount_bias,
646 timers_state.qemu_icount_bias + deadline);
647 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
648 &timers_state.vm_clock_lock);
649 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
650 } else {
651 /*
652 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
653              * "real" time (related to the time left until the next event) has
654              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
655              * This prevents the warps from being visible externally; for example,
656 * you will not be sending network packets continuously instead of
657 * every 100ms.
658 */
659 seqlock_write_lock(&timers_state.vm_clock_seqlock,
660 &timers_state.vm_clock_lock);
661 if (timers_state.vm_clock_warp_start == -1
662 || timers_state.vm_clock_warp_start > clock) {
663 timers_state.vm_clock_warp_start = clock;
664 }
665 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
666 &timers_state.vm_clock_lock);
667 timer_mod_anticipate(timers_state.icount_warp_timer,
668 clock + deadline);
669 }
670 } else if (deadline == 0) {
671 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
672 }
673 }
674
675 static void qemu_account_warp_timer(void)
676 {
677 if (!use_icount || !icount_sleep) {
678 return;
679 }
680
681 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
682 * do not fire, so computing the deadline does not make sense.
683 */
684 if (!runstate_is_running()) {
685 return;
686 }
687
688 /* warp clock deterministically in record/replay mode */
689 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
690 return;
691 }
692
693 timer_del(timers_state.icount_warp_timer);
694 icount_warp_rt();
695 }
696
697 static bool icount_state_needed(void *opaque)
698 {
699 return use_icount;
700 }
701
702 static bool warp_timer_state_needed(void *opaque)
703 {
704 TimersState *s = opaque;
705 return s->icount_warp_timer != NULL;
706 }
707
708 static bool adjust_timers_state_needed(void *opaque)
709 {
710 TimersState *s = opaque;
711 return s->icount_rt_timer != NULL;
712 }
713
714 /*
715  * Subsection for warp timer migration is optional, because it may not be created
716 */
717 static const VMStateDescription icount_vmstate_warp_timer = {
718 .name = "timer/icount/warp_timer",
719 .version_id = 1,
720 .minimum_version_id = 1,
721 .needed = warp_timer_state_needed,
722 .fields = (VMStateField[]) {
723 VMSTATE_INT64(vm_clock_warp_start, TimersState),
724 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
725 VMSTATE_END_OF_LIST()
726 }
727 };
728
729 static const VMStateDescription icount_vmstate_adjust_timers = {
730 .name = "timer/icount/timers",
731 .version_id = 1,
732 .minimum_version_id = 1,
733 .needed = adjust_timers_state_needed,
734 .fields = (VMStateField[]) {
735 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
736 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
737 VMSTATE_END_OF_LIST()
738 }
739 };
740
741 /*
742 * This is a subsection for icount migration.
743 */
744 static const VMStateDescription icount_vmstate_timers = {
745 .name = "timer/icount",
746 .version_id = 1,
747 .minimum_version_id = 1,
748 .needed = icount_state_needed,
749 .fields = (VMStateField[]) {
750 VMSTATE_INT64(qemu_icount_bias, TimersState),
751 VMSTATE_INT64(qemu_icount, TimersState),
752 VMSTATE_END_OF_LIST()
753 },
754 .subsections = (const VMStateDescription*[]) {
755 &icount_vmstate_warp_timer,
756 &icount_vmstate_adjust_timers,
757 NULL
758 }
759 };
760
761 static const VMStateDescription vmstate_timers = {
762 .name = "timer",
763 .version_id = 2,
764 .minimum_version_id = 1,
765 .fields = (VMStateField[]) {
766 VMSTATE_INT64(cpu_ticks_offset, TimersState),
767 VMSTATE_UNUSED(8),
768 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
769 VMSTATE_END_OF_LIST()
770 },
771 .subsections = (const VMStateDescription*[]) {
772 &icount_vmstate_timers,
773 NULL
774 }
775 };
776
777 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
778 {
779 double pct;
780 double throttle_ratio;
781 long sleeptime_ns;
782
783 if (!cpu_throttle_get_percentage()) {
784 return;
785 }
786
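    /* Sleep long enough that, over one throttle period, the vCPU spends
     * the requested percentage of time asleep: with a run time of
     * CPU_THROTTLE_TIMESLICE_NS, a sleep of pct / (1 - pct) times that
     * gives sleep / (sleep + run) == pct. */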
787 pct = (double)cpu_throttle_get_percentage()/100;
788 throttle_ratio = pct / (1 - pct);
789 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
790
791 qemu_mutex_unlock_iothread();
792 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
793 qemu_mutex_lock_iothread();
794 atomic_set(&cpu->throttle_thread_scheduled, 0);
795 }
796
797 static void cpu_throttle_timer_tick(void *opaque)
798 {
799 CPUState *cpu;
800 double pct;
801
802 /* Stop the timer if needed */
803 if (!cpu_throttle_get_percentage()) {
804 return;
805 }
806 CPU_FOREACH(cpu) {
807 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
808 async_run_on_cpu(cpu, cpu_throttle_thread,
809 RUN_ON_CPU_NULL);
810 }
811 }
812
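    /* Rearm for one full run+sleep period: the guest runs for
     * CPU_THROTTLE_TIMESLICE_NS and then sleeps for pct of the period,
     * so the next tick is CPU_THROTTLE_TIMESLICE_NS / (1 - pct) away. */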
813 pct = (double)cpu_throttle_get_percentage()/100;
814 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
815 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
816 }
817
818 void cpu_throttle_set(int new_throttle_pct)
819 {
820 /* Ensure throttle percentage is within valid range */
821 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
822 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
823
824 atomic_set(&throttle_percentage, new_throttle_pct);
825
826 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
827 CPU_THROTTLE_TIMESLICE_NS);
828 }
829
830 void cpu_throttle_stop(void)
831 {
832 atomic_set(&throttle_percentage, 0);
833 }
834
835 bool cpu_throttle_active(void)
836 {
837 return (cpu_throttle_get_percentage() != 0);
838 }
839
840 int cpu_throttle_get_percentage(void)
841 {
842 return atomic_read(&throttle_percentage);
843 }
844
845 void cpu_ticks_init(void)
846 {
847 seqlock_init(&timers_state.vm_clock_seqlock);
848 qemu_spin_init(&timers_state.vm_clock_lock);
849 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
850 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
851 cpu_throttle_timer_tick, NULL);
852 }
853
854 void configure_icount(QemuOpts *opts, Error **errp)
855 {
856 const char *option;
857 char *rem_str = NULL;
858
859 option = qemu_opt_get(opts, "shift");
860 if (!option) {
861 if (qemu_opt_get(opts, "align") != NULL) {
862 error_setg(errp, "Please specify shift option when using align");
863 }
864 return;
865 }
866
867 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
868 if (icount_sleep) {
869 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
870 icount_timer_cb, NULL);
871 }
872
873 icount_align_option = qemu_opt_get_bool(opts, "align", false);
874
875 if (icount_align_option && !icount_sleep) {
876 error_setg(errp, "align=on and sleep=off are incompatible");
877 }
878 if (strcmp(option, "auto") != 0) {
879 errno = 0;
880 timers_state.icount_time_shift = strtol(option, &rem_str, 0);
881 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
882 error_setg(errp, "icount: Invalid shift value");
883 }
884 use_icount = 1;
885 return;
886 } else if (icount_align_option) {
887 error_setg(errp, "shift=auto and align=on are incompatible");
888 } else if (!icount_sleep) {
889 error_setg(errp, "shift=auto and sleep=off are incompatible");
890 }
891
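    /* shift=auto: use_icount == 2 selects adaptive mode, in which
     * icount_time_shift is retuned at runtime by icount_adjust(). */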
892 use_icount = 2;
893
894 /* 125MIPS seems a reasonable initial guess at the guest speed.
895 It will be corrected fairly quickly anyway. */
896 timers_state.icount_time_shift = 3;
897
898 /* Have both realtime and virtual time triggers for speed adjustment.
899 The realtime trigger catches emulated time passing too slowly,
900 the virtual time trigger catches emulated time passing too fast.
901 Realtime triggers occur even when idle, so use them less frequently
902 than VM triggers. */
903 timers_state.vm_clock_warp_start = -1;
904 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
905 icount_adjust_rt, NULL);
906 timer_mod(timers_state.icount_rt_timer,
907 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
908 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
909 icount_adjust_vm, NULL);
910 timer_mod(timers_state.icount_vm_timer,
911 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
912 NANOSECONDS_PER_SECOND / 10);
913 }
914
915 /***********************************************************/
916 /* TCG vCPU kick timer
917 *
918  * The kick timer is responsible for moving single-threaded vCPU
919  * emulation on to the next vCPU. If more than one vCPU is running, a
920  * timer event will force a cpu->exit so the next vCPU can get
921  * scheduled.
922  *
923  * The timer is removed while all vCPUs are idle and restarted once
924  * any of them has work to do again.
925 */
926
927 static QEMUTimer *tcg_kick_vcpu_timer;
928 static CPUState *tcg_current_rr_cpu;
929
930 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
931
932 static inline int64_t qemu_tcg_next_kick(void)
933 {
934 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
935 }
936
937 /* Kick the currently round-robin scheduled vCPU */
938 static void qemu_cpu_kick_rr_cpu(void)
939 {
940 CPUState *cpu;
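    /* Loop until the kick lands on the vCPU that is actually current:
     * the round-robin thread may switch tcg_current_rr_cpu between our
     * read and the cpu_exit() call. */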
941 do {
942 cpu = atomic_mb_read(&tcg_current_rr_cpu);
943 if (cpu) {
944 cpu_exit(cpu);
945 }
946 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
947 }
948
949 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
950 {
951 }
952
953 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
954 {
955 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
956 qemu_notify_event();
957 return;
958 }
959
960 if (qemu_in_vcpu_thread()) {
961 /* A CPU is currently running; kick it back out to the
962 * tcg_cpu_exec() loop so it will recalculate its
963 * icount deadline immediately.
964 */
965 qemu_cpu_kick(current_cpu);
966 } else if (first_cpu) {
967 /* qemu_cpu_kick is not enough to kick a halted CPU out of
968 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
969 * causes cpu_thread_is_idle to return false. This way,
970 * handle_icount_deadline can run.
971 * If we have no CPUs at all for some reason, we don't
972 * need to do anything.
973 */
974 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
975 }
976 }
977
978 static void kick_tcg_thread(void *opaque)
979 {
980 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
981 qemu_cpu_kick_rr_cpu();
982 }
983
984 static void start_tcg_kick_timer(void)
985 {
986 assert(!mttcg_enabled);
987 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
988 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
989 kick_tcg_thread, NULL);
990 }
991 if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
992 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
993 }
994 }
995
996 static void stop_tcg_kick_timer(void)
997 {
998 assert(!mttcg_enabled);
999 if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
1000 timer_del(tcg_kick_vcpu_timer);
1001 }
1002 }
1003
1004 /***********************************************************/
1005 void hw_error(const char *fmt, ...)
1006 {
1007 va_list ap;
1008 CPUState *cpu;
1009
1010 va_start(ap, fmt);
1011 fprintf(stderr, "qemu: hardware error: ");
1012 vfprintf(stderr, fmt, ap);
1013 fprintf(stderr, "\n");
1014 CPU_FOREACH(cpu) {
1015 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1016 cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1017 }
1018 va_end(ap);
1019 abort();
1020 }
1021
1022 void cpu_synchronize_all_states(void)
1023 {
1024 CPUState *cpu;
1025
1026 CPU_FOREACH(cpu) {
1027 cpu_synchronize_state(cpu);
1028 /* TODO: move to cpu_synchronize_state() */
1029 if (hvf_enabled()) {
1030 hvf_cpu_synchronize_state(cpu);
1031 }
1032 }
1033 }
1034
1035 void cpu_synchronize_all_post_reset(void)
1036 {
1037 CPUState *cpu;
1038
1039 CPU_FOREACH(cpu) {
1040 cpu_synchronize_post_reset(cpu);
1041 /* TODO: move to cpu_synchronize_post_reset() */
1042 if (hvf_enabled()) {
1043 hvf_cpu_synchronize_post_reset(cpu);
1044 }
1045 }
1046 }
1047
1048 void cpu_synchronize_all_post_init(void)
1049 {
1050 CPUState *cpu;
1051
1052 CPU_FOREACH(cpu) {
1053 cpu_synchronize_post_init(cpu);
1054 /* TODO: move to cpu_synchronize_post_init() */
1055 if (hvf_enabled()) {
1056 hvf_cpu_synchronize_post_init(cpu);
1057 }
1058 }
1059 }
1060
1061 void cpu_synchronize_all_pre_loadvm(void)
1062 {
1063 CPUState *cpu;
1064
1065 CPU_FOREACH(cpu) {
1066 cpu_synchronize_pre_loadvm(cpu);
1067 }
1068 }
1069
1070 static int do_vm_stop(RunState state, bool send_stop)
1071 {
1072 int ret = 0;
1073
1074 if (runstate_is_running()) {
1075 cpu_disable_ticks();
1076 pause_all_vcpus();
1077 runstate_set(state);
1078 vm_state_notify(0, state);
1079 if (send_stop) {
1080 qapi_event_send_stop();
1081 }
1082 }
1083
1084 bdrv_drain_all();
1085 replay_disable_events();
1086 ret = bdrv_flush_all();
1087
1088 return ret;
1089 }
1090
1091 /* Special vm_stop() variant for terminating the process. Historically clients
1092 * did not expect a QMP STOP event and so we need to retain compatibility.
1093 */
1094 int vm_shutdown(void)
1095 {
1096 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1097 }
1098
1099 static bool cpu_can_run(CPUState *cpu)
1100 {
1101 if (cpu->stop) {
1102 return false;
1103 }
1104 if (cpu_is_stopped(cpu)) {
1105 return false;
1106 }
1107 return true;
1108 }
1109
1110 static void cpu_handle_guest_debug(CPUState *cpu)
1111 {
1112 gdb_set_stop_cpu(cpu);
1113 qemu_system_debug_request();
1114 cpu->stopped = true;
1115 }
1116
1117 #ifdef CONFIG_LINUX
1118 static void sigbus_reraise(void)
1119 {
1120 sigset_t set;
1121 struct sigaction action;
1122
1123 memset(&action, 0, sizeof(action));
1124 action.sa_handler = SIG_DFL;
1125 if (!sigaction(SIGBUS, &action, NULL)) {
1126 raise(SIGBUS);
1127 sigemptyset(&set);
1128 sigaddset(&set, SIGBUS);
1129 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1130 }
1131 perror("Failed to re-raise SIGBUS!\n");
1132 abort();
1133 }
1134
1135 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1136 {
1137 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1138 sigbus_reraise();
1139 }
1140
1141 if (current_cpu) {
1142 /* Called asynchronously in VCPU thread. */
1143 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1144 sigbus_reraise();
1145 }
1146 } else {
1147 /* Called synchronously (via signalfd) in main thread. */
1148 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1149 sigbus_reraise();
1150 }
1151 }
1152 }
1153
1154 static void qemu_init_sigbus(void)
1155 {
1156 struct sigaction action;
1157
1158 memset(&action, 0, sizeof(action));
1159 action.sa_flags = SA_SIGINFO;
1160 action.sa_sigaction = sigbus_handler;
1161 sigaction(SIGBUS, &action, NULL);
1162
1163 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1164 }
1165 #else /* !CONFIG_LINUX */
1166 static void qemu_init_sigbus(void)
1167 {
1168 }
1169 #endif /* !CONFIG_LINUX */
1170
1171 static QemuMutex qemu_global_mutex;
1172
1173 static QemuThread io_thread;
1174
1175 /* cpu creation */
1176 static QemuCond qemu_cpu_cond;
1177 /* system init */
1178 static QemuCond qemu_pause_cond;
1179
1180 void qemu_init_cpu_loop(void)
1181 {
1182 qemu_init_sigbus();
1183 qemu_cond_init(&qemu_cpu_cond);
1184 qemu_cond_init(&qemu_pause_cond);
1185 qemu_mutex_init(&qemu_global_mutex);
1186
1187 qemu_thread_get_self(&io_thread);
1188 }
1189
1190 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1191 {
1192 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1193 }
1194
1195 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1196 {
1197 if (kvm_destroy_vcpu(cpu) < 0) {
1198 error_report("kvm_destroy_vcpu failed");
1199 exit(EXIT_FAILURE);
1200 }
1201 }
1202
1203 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1204 {
1205 }
1206
1207 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1208 {
1209 g_assert(qemu_cpu_is_self(cpu));
1210 cpu->stop = false;
1211 cpu->stopped = true;
1212 if (exit) {
1213 cpu_exit(cpu);
1214 }
1215 qemu_cond_broadcast(&qemu_pause_cond);
1216 }
1217
1218 static void qemu_wait_io_event_common(CPUState *cpu)
1219 {
1220 atomic_mb_set(&cpu->thread_kicked, false);
1221 if (cpu->stop) {
1222 qemu_cpu_stop(cpu, false);
1223 }
1224 process_queued_cpu_work(cpu);
1225 }
1226
1227 static void qemu_tcg_rr_wait_io_event(void)
1228 {
1229 CPUState *cpu;
1230
1231 while (all_cpu_threads_idle()) {
1232 stop_tcg_kick_timer();
1233 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1234 }
1235
1236 start_tcg_kick_timer();
1237
1238 CPU_FOREACH(cpu) {
1239 qemu_wait_io_event_common(cpu);
1240 }
1241 }
1242
1243 static void qemu_wait_io_event(CPUState *cpu)
1244 {
1245 while (cpu_thread_is_idle(cpu)) {
1246 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1247 }
1248
1249 #ifdef _WIN32
1250 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1251 if (!tcg_enabled()) {
1252 SleepEx(0, TRUE);
1253 }
1254 #endif
1255 qemu_wait_io_event_common(cpu);
1256 }
1257
1258 static void *qemu_kvm_cpu_thread_fn(void *arg)
1259 {
1260 CPUState *cpu = arg;
1261 int r;
1262
1263 rcu_register_thread();
1264
1265 qemu_mutex_lock_iothread();
1266 qemu_thread_get_self(cpu->thread);
1267 cpu->thread_id = qemu_get_thread_id();
1268 cpu->can_do_io = 1;
1269 current_cpu = cpu;
1270
1271 r = kvm_init_vcpu(cpu);
1272 if (r < 0) {
1273 error_report("kvm_init_vcpu failed: %s", strerror(-r));
1274 exit(1);
1275 }
1276
1277 kvm_init_cpu_signals(cpu);
1278
1279 /* signal CPU creation */
1280 cpu->created = true;
1281 qemu_cond_signal(&qemu_cpu_cond);
1282 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1283
1284 do {
1285 if (cpu_can_run(cpu)) {
1286 r = kvm_cpu_exec(cpu);
1287 if (r == EXCP_DEBUG) {
1288 cpu_handle_guest_debug(cpu);
1289 }
1290 }
1291 qemu_wait_io_event(cpu);
1292 } while (!cpu->unplug || cpu_can_run(cpu));
1293
1294 qemu_kvm_destroy_vcpu(cpu);
1295 cpu->created = false;
1296 qemu_cond_signal(&qemu_cpu_cond);
1297 qemu_mutex_unlock_iothread();
1298 rcu_unregister_thread();
1299 return NULL;
1300 }
1301
1302 static void *qemu_dummy_cpu_thread_fn(void *arg)
1303 {
1304 #ifdef _WIN32
1305 error_report("qtest is not supported under Windows");
1306 exit(1);
1307 #else
1308 CPUState *cpu = arg;
1309 sigset_t waitset;
1310 int r;
1311
1312 rcu_register_thread();
1313
1314 qemu_mutex_lock_iothread();
1315 qemu_thread_get_self(cpu->thread);
1316 cpu->thread_id = qemu_get_thread_id();
1317 cpu->can_do_io = 1;
1318 current_cpu = cpu;
1319
1320 sigemptyset(&waitset);
1321 sigaddset(&waitset, SIG_IPI);
1322
1323 /* signal CPU creation */
1324 cpu->created = true;
1325 qemu_cond_signal(&qemu_cpu_cond);
1326 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1327
1328 do {
1329 qemu_mutex_unlock_iothread();
1330 do {
1331 int sig;
1332 r = sigwait(&waitset, &sig);
1333 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1334 if (r == -1) {
1335 perror("sigwait");
1336 exit(1);
1337 }
1338 qemu_mutex_lock_iothread();
1339 qemu_wait_io_event(cpu);
1340 } while (!cpu->unplug);
1341
1342 qemu_mutex_unlock_iothread();
1343 rcu_unregister_thread();
1344 return NULL;
1345 #endif
1346 }
1347
1348 static int64_t tcg_get_icount_limit(void)
1349 {
1350 int64_t deadline;
1351
1352 if (replay_mode != REPLAY_MODE_PLAY) {
1353 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1354
1355 /* Maintain prior (possibly buggy) behaviour where if no deadline
1356 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1357 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1358 * nanoseconds.
1359 */
1360 if ((deadline < 0) || (deadline > INT32_MAX)) {
1361 deadline = INT32_MAX;
1362 }
1363
1364 return qemu_icount_round(deadline);
1365 } else {
1366 return replay_get_instructions();
1367 }
1368 }
1369
1370 static void handle_icount_deadline(void)
1371 {
1372 assert(qemu_in_vcpu_thread());
1373 if (use_icount) {
1374 int64_t deadline =
1375 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1376
1377 if (deadline == 0) {
1378 /* Wake up other AioContexts. */
1379 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1380 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1381 }
1382 }
1383 }
1384
1385 static void prepare_icount_for_run(CPUState *cpu)
1386 {
1387 if (use_icount) {
1388 int insns_left;
1389
1390 /* These should always be cleared by process_icount_data after
1391 * each vCPU execution. However u16.high can be raised
1392 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1393 */
1394 g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1395 g_assert(cpu->icount_extra == 0);
1396
1397 cpu->icount_budget = tcg_get_icount_limit();
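        /* Split the budget: at most 0xffff instructions go into the 16-bit
         * decrementer, the remainder is banked in icount_extra. */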
1398 insns_left = MIN(0xffff, cpu->icount_budget);
1399 cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1400 cpu->icount_extra = cpu->icount_budget - insns_left;
1401
1402 replay_mutex_lock();
1403 }
1404 }
1405
1406 static void process_icount_data(CPUState *cpu)
1407 {
1408 if (use_icount) {
1409 /* Account for executed instructions */
1410 cpu_update_icount(cpu);
1411
1412 /* Reset the counters */
1413 cpu_neg(cpu)->icount_decr.u16.low = 0;
1414 cpu->icount_extra = 0;
1415 cpu->icount_budget = 0;
1416
1417 replay_account_executed_instructions();
1418
1419 replay_mutex_unlock();
1420 }
1421 }
1422
1423
1424 static int tcg_cpu_exec(CPUState *cpu)
1425 {
1426 int ret;
1427 #ifdef CONFIG_PROFILER
1428 int64_t ti;
1429 #endif
1430
1431 assert(tcg_enabled());
1432 #ifdef CONFIG_PROFILER
1433 ti = profile_getclock();
1434 #endif
1435 cpu_exec_start(cpu);
1436 ret = cpu_exec(cpu);
1437 cpu_exec_end(cpu);
1438 #ifdef CONFIG_PROFILER
1439 atomic_set(&tcg_ctx->prof.cpu_exec_time,
1440 tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1441 #endif
1442 return ret;
1443 }
1444
1445 /* Destroy any remaining vCPUs which have been unplugged and have
1446 * finished running
1447 */
1448 static void deal_with_unplugged_cpus(void)
1449 {
1450 CPUState *cpu;
1451
1452 CPU_FOREACH(cpu) {
1453 if (cpu->unplug && !cpu_can_run(cpu)) {
1454 qemu_tcg_destroy_vcpu(cpu);
1455 cpu->created = false;
1456 qemu_cond_signal(&qemu_cpu_cond);
1457 break;
1458 }
1459 }
1460 }
1461
1462 /* Single-threaded TCG
1463 *
1464 * In the single-threaded case each vCPU is simulated in turn. If
1465 * there is more than a single vCPU we create a simple timer to kick
1466 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1467 * This is done explicitly rather than relying on side-effects
1468 * elsewhere.
1469 */
1470
1471 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1472 {
1473 CPUState *cpu = arg;
1474
1475 assert(tcg_enabled());
1476 rcu_register_thread();
1477 tcg_register_thread();
1478
1479 qemu_mutex_lock_iothread();
1480 qemu_thread_get_self(cpu->thread);
1481
1482 cpu->thread_id = qemu_get_thread_id();
1483 cpu->created = true;
1484 cpu->can_do_io = 1;
1485 qemu_cond_signal(&qemu_cpu_cond);
1486 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1487
1488 /* wait for initial kick-off after machine start */
1489 while (first_cpu->stopped) {
1490 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1491
1492 /* process any pending work */
1493 CPU_FOREACH(cpu) {
1494 current_cpu = cpu;
1495 qemu_wait_io_event_common(cpu);
1496 }
1497 }
1498
1499 start_tcg_kick_timer();
1500
1501 cpu = first_cpu;
1502
1503 /* process any pending work */
1504 cpu->exit_request = 1;
1505
1506 while (1) {
1507 qemu_mutex_unlock_iothread();
1508 replay_mutex_lock();
1509 qemu_mutex_lock_iothread();
1510 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1511 qemu_account_warp_timer();
1512
1513 /* Run the timers here. This is much more efficient than
1514 * waking up the I/O thread and waiting for completion.
1515 */
1516 handle_icount_deadline();
1517
1518 replay_mutex_unlock();
1519
1520 if (!cpu) {
1521 cpu = first_cpu;
1522 }
1523
1524 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1525
1526 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1527 current_cpu = cpu;
1528
1529 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1530 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1531
1532 if (cpu_can_run(cpu)) {
1533 int r;
1534
1535 qemu_mutex_unlock_iothread();
1536 prepare_icount_for_run(cpu);
1537
1538 r = tcg_cpu_exec(cpu);
1539
1540 process_icount_data(cpu);
1541 qemu_mutex_lock_iothread();
1542
1543 if (r == EXCP_DEBUG) {
1544 cpu_handle_guest_debug(cpu);
1545 break;
1546 } else if (r == EXCP_ATOMIC) {
1547 qemu_mutex_unlock_iothread();
1548 cpu_exec_step_atomic(cpu);
1549 qemu_mutex_lock_iothread();
1550 break;
1551 }
1552 } else if (cpu->stop) {
1553 if (cpu->unplug) {
1554 cpu = CPU_NEXT(cpu);
1555 }
1556 break;
1557 }
1558
1559 cpu = CPU_NEXT(cpu);
1560 } /* while (cpu && !cpu->exit_request).. */
1561
1562 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1563 atomic_set(&tcg_current_rr_cpu, NULL);
1564
1565 if (cpu && cpu->exit_request) {
1566 atomic_mb_set(&cpu->exit_request, 0);
1567 }
1568
1569 if (use_icount && all_cpu_threads_idle()) {
1570 /*
1571 * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1572 * in the main_loop, wake it up in order to start the warp timer.
1573 */
1574 qemu_notify_event();
1575 }
1576
1577 qemu_tcg_rr_wait_io_event();
1578 deal_with_unplugged_cpus();
1579 }
1580
1581 rcu_unregister_thread();
1582 return NULL;
1583 }
1584
1585 static void *qemu_hax_cpu_thread_fn(void *arg)
1586 {
1587 CPUState *cpu = arg;
1588 int r;
1589
1590 rcu_register_thread();
1591 qemu_mutex_lock_iothread();
1592 qemu_thread_get_self(cpu->thread);
1593
1594 cpu->thread_id = qemu_get_thread_id();
1595 cpu->created = true;
1596 cpu->halted = 0;
1597 current_cpu = cpu;
1598
1599 hax_init_vcpu(cpu);
1600 qemu_cond_signal(&qemu_cpu_cond);
1601 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1602
1603 do {
1604 if (cpu_can_run(cpu)) {
1605 r = hax_smp_cpu_exec(cpu);
1606 if (r == EXCP_DEBUG) {
1607 cpu_handle_guest_debug(cpu);
1608 }
1609 }
1610
1611 qemu_wait_io_event(cpu);
1612 } while (!cpu->unplug || cpu_can_run(cpu));
1613 rcu_unregister_thread();
1614 return NULL;
1615 }
1616
1617 /* The HVF-specific vCPU thread function. This one should only run when the host
1618 * CPU supports the VMX "unrestricted guest" feature. */
1619 static void *qemu_hvf_cpu_thread_fn(void *arg)
1620 {
1621 CPUState *cpu = arg;
1622
1623 int r;
1624
1625 assert(hvf_enabled());
1626
1627 rcu_register_thread();
1628
1629 qemu_mutex_lock_iothread();
1630 qemu_thread_get_self(cpu->thread);
1631
1632 cpu->thread_id = qemu_get_thread_id();
1633 cpu->can_do_io = 1;
1634 current_cpu = cpu;
1635
1636 hvf_init_vcpu(cpu);
1637
1638 /* signal CPU creation */
1639 cpu->created = true;
1640 qemu_cond_signal(&qemu_cpu_cond);
1641 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1642
1643 do {
1644 if (cpu_can_run(cpu)) {
1645 r = hvf_vcpu_exec(cpu);
1646 if (r == EXCP_DEBUG) {
1647 cpu_handle_guest_debug(cpu);
1648 }
1649 }
1650 qemu_wait_io_event(cpu);
1651 } while (!cpu->unplug || cpu_can_run(cpu));
1652
1653 hvf_vcpu_destroy(cpu);
1654 cpu->created = false;
1655 qemu_cond_signal(&qemu_cpu_cond);
1656 qemu_mutex_unlock_iothread();
1657 rcu_unregister_thread();
1658 return NULL;
1659 }
1660
1661 static void *qemu_whpx_cpu_thread_fn(void *arg)
1662 {
1663 CPUState *cpu = arg;
1664 int r;
1665
1666 rcu_register_thread();
1667
1668 qemu_mutex_lock_iothread();
1669 qemu_thread_get_self(cpu->thread);
1670 cpu->thread_id = qemu_get_thread_id();
1671 current_cpu = cpu;
1672
1673 r = whpx_init_vcpu(cpu);
1674 if (r < 0) {
1675 fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1676 exit(1);
1677 }
1678
1679 /* signal CPU creation */
1680 cpu->created = true;
1681 qemu_cond_signal(&qemu_cpu_cond);
1682 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1683
1684 do {
1685 if (cpu_can_run(cpu)) {
1686 r = whpx_vcpu_exec(cpu);
1687 if (r == EXCP_DEBUG) {
1688 cpu_handle_guest_debug(cpu);
1689 }
1690 }
1691 while (cpu_thread_is_idle(cpu)) {
1692 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1693 }
1694 qemu_wait_io_event_common(cpu);
1695 } while (!cpu->unplug || cpu_can_run(cpu));
1696
1697 whpx_destroy_vcpu(cpu);
1698 cpu->created = false;
1699 qemu_cond_signal(&qemu_cpu_cond);
1700 qemu_mutex_unlock_iothread();
1701 rcu_unregister_thread();
1702 return NULL;
1703 }
1704
1705 #ifdef _WIN32
1706 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1707 {
1708 }
1709 #endif
1710
1711 /* Multi-threaded TCG
1712 *
1713 * In the multi-threaded case each vCPU has its own thread. The TLS
1714 * variable current_cpu can be used deep in the code to find the
1715 * current CPUState for a given thread.
1716 */
1717
1718 static void *qemu_tcg_cpu_thread_fn(void *arg)
1719 {
1720 CPUState *cpu = arg;
1721
1722 assert(tcg_enabled());
1723 g_assert(!use_icount);
1724
1725 rcu_register_thread();
1726 tcg_register_thread();
1727
1728 qemu_mutex_lock_iothread();
1729 qemu_thread_get_self(cpu->thread);
1730
1731 cpu->thread_id = qemu_get_thread_id();
1732 cpu->created = true;
1733 cpu->can_do_io = 1;
1734 current_cpu = cpu;
1735 qemu_cond_signal(&qemu_cpu_cond);
1736 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1737
1738 /* process any pending work */
1739 cpu->exit_request = 1;
1740
1741 do {
1742 if (cpu_can_run(cpu)) {
1743 int r;
1744 qemu_mutex_unlock_iothread();
1745 r = tcg_cpu_exec(cpu);
1746 qemu_mutex_lock_iothread();
1747 switch (r) {
1748 case EXCP_DEBUG:
1749 cpu_handle_guest_debug(cpu);
1750 break;
1751 case EXCP_HALTED:
1752 /* during start-up the vCPU is reset and the thread is
1753 * kicked several times. If we don't ensure we go back
1754 * to sleep in the halted state we won't cleanly
1755                  * start up when the vCPU is enabled.
1756 *
1757 * cpu->halted should ensure we sleep in wait_io_event
1758 */
1759 g_assert(cpu->halted);
1760 break;
1761 case EXCP_ATOMIC:
1762 qemu_mutex_unlock_iothread();
1763 cpu_exec_step_atomic(cpu);
1764 qemu_mutex_lock_iothread();
1765 default:
1766 /* Ignore everything else? */
1767 break;
1768 }
1769 }
1770
1771 atomic_mb_set(&cpu->exit_request, 0);
1772 qemu_wait_io_event(cpu);
1773 } while (!cpu->unplug || cpu_can_run(cpu));
1774
1775 qemu_tcg_destroy_vcpu(cpu);
1776 cpu->created = false;
1777 qemu_cond_signal(&qemu_cpu_cond);
1778 qemu_mutex_unlock_iothread();
1779 rcu_unregister_thread();
1780 return NULL;
1781 }
1782
1783 static void qemu_cpu_kick_thread(CPUState *cpu)
1784 {
1785 #ifndef _WIN32
1786 int err;
1787
1788 if (cpu->thread_kicked) {
1789 return;
1790 }
1791 cpu->thread_kicked = true;
1792 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1793 if (err && err != ESRCH) {
1794 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1795 exit(1);
1796 }
1797 #else /* _WIN32 */
1798 if (!qemu_cpu_is_self(cpu)) {
1799 if (whpx_enabled()) {
1800 whpx_vcpu_kick(cpu);
1801 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1802 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1803 __func__, GetLastError());
1804 exit(1);
1805 }
1806 }
1807 #endif
1808 }
1809
1810 void qemu_cpu_kick(CPUState *cpu)
1811 {
1812 qemu_cond_broadcast(cpu->halt_cond);
1813 if (tcg_enabled()) {
1814 cpu_exit(cpu);
1815 /* NOP unless doing single-thread RR */
1816 qemu_cpu_kick_rr_cpu();
1817 } else {
1818 if (hax_enabled()) {
1819 /*
1820 * FIXME: race condition with the exit_request check in
1821 * hax_vcpu_hax_exec
1822 */
1823 cpu->exit_request = 1;
1824 }
1825 qemu_cpu_kick_thread(cpu);
1826 }
1827 }
1828
1829 void qemu_cpu_kick_self(void)
1830 {
1831 assert(current_cpu);
1832 qemu_cpu_kick_thread(current_cpu);
1833 }
1834
1835 bool qemu_cpu_is_self(CPUState *cpu)
1836 {
1837 return qemu_thread_is_self(cpu->thread);
1838 }
1839
1840 bool qemu_in_vcpu_thread(void)
1841 {
1842 return current_cpu && qemu_cpu_is_self(current_cpu);
1843 }
1844
1845 static __thread bool iothread_locked = false;
1846
1847 bool qemu_mutex_iothread_locked(void)
1848 {
1849 return iothread_locked;
1850 }
1851
1852 /*
1853 * The BQL is taken from so many places that it is worth profiling the
1854 * callers directly, instead of funneling them all through a single function.
1855 */
1856 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1857 {
1858 QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1859
1860 g_assert(!qemu_mutex_iothread_locked());
1861 bql_lock(&qemu_global_mutex, file, line);
1862 iothread_locked = true;
1863 }
1864
1865 void qemu_mutex_unlock_iothread(void)
1866 {
1867 g_assert(qemu_mutex_iothread_locked());
1868 iothread_locked = false;
1869 qemu_mutex_unlock(&qemu_global_mutex);
1870 }
1871
1872 static bool all_vcpus_paused(void)
1873 {
1874 CPUState *cpu;
1875
1876 CPU_FOREACH(cpu) {
1877 if (!cpu->stopped) {
1878 return false;
1879 }
1880 }
1881
1882 return true;
1883 }
1884
1885 void pause_all_vcpus(void)
1886 {
1887 CPUState *cpu;
1888
1889 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1890 CPU_FOREACH(cpu) {
1891 if (qemu_cpu_is_self(cpu)) {
1892 qemu_cpu_stop(cpu, true);
1893 } else {
1894 cpu->stop = true;
1895 qemu_cpu_kick(cpu);
1896 }
1897 }
1898
1899 /* We need to drop the replay_lock so any vCPU threads woken up
1900 * can finish their replay tasks
1901 */
1902 replay_mutex_unlock();
1903
1904 while (!all_vcpus_paused()) {
1905 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
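        /* Keep kicking: a vCPU may not have observed cpu->stop yet. */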
1906 CPU_FOREACH(cpu) {
1907 qemu_cpu_kick(cpu);
1908 }
1909 }
1910
1911 qemu_mutex_unlock_iothread();
1912 replay_mutex_lock();
1913 qemu_mutex_lock_iothread();
1914 }
1915
1916 void cpu_resume(CPUState *cpu)
1917 {
1918 cpu->stop = false;
1919 cpu->stopped = false;
1920 qemu_cpu_kick(cpu);
1921 }
1922
1923 void resume_all_vcpus(void)
1924 {
1925 CPUState *cpu;
1926
1927 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1928 CPU_FOREACH(cpu) {
1929 cpu_resume(cpu);
1930 }
1931 }
1932
1933 void cpu_remove_sync(CPUState *cpu)
1934 {
1935 cpu->stop = true;
1936 cpu->unplug = true;
1937 qemu_cpu_kick(cpu);
1938 qemu_mutex_unlock_iothread();
1939 qemu_thread_join(cpu->thread);
1940 qemu_mutex_lock_iothread();
1941 }
1942
1943 /* Size of the temporary buffer used to form a vCPU thread name */
1944 #define VCPU_THREAD_NAME_SIZE 16
1945
1946 static void qemu_tcg_init_vcpu(CPUState *cpu)
1947 {
1948 char thread_name[VCPU_THREAD_NAME_SIZE];
1949 static QemuCond *single_tcg_halt_cond;
1950 static QemuThread *single_tcg_cpu_thread;
1951 static int tcg_region_inited;
1952
1953 assert(tcg_enabled());
1954 /*
1955 * Initialize TCG regions--once. Now is a good time, because:
1956 * (1) TCG's init context, prologue and target globals have been set up.
1957 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1958 * -accel flag is processed, so the check doesn't work then).
1959 */
1960 if (!tcg_region_inited) {
1961 tcg_region_inited = 1;
1962 tcg_region_init();
1963 }
1964
1965 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1966 cpu->thread = g_malloc0(sizeof(QemuThread));
1967 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1968 qemu_cond_init(cpu->halt_cond);
1969
1970 if (qemu_tcg_mttcg_enabled()) {
1971 /* create a thread per vCPU with TCG (MTTCG) */
1972 parallel_cpus = true;
1973 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1974 cpu->cpu_index);
1975
1976 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1977 cpu, QEMU_THREAD_JOINABLE);
1978
1979 } else {
1980 /* share a single thread for all cpus with TCG */
1981 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1982 qemu_thread_create(cpu->thread, thread_name,
1983 qemu_tcg_rr_cpu_thread_fn,
1984 cpu, QEMU_THREAD_JOINABLE);
1985
1986 single_tcg_halt_cond = cpu->halt_cond;
1987 single_tcg_cpu_thread = cpu->thread;
1988 }
1989 #ifdef _WIN32
1990 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1991 #endif
1992 } else {
1993 /* For non-MTTCG cases we share the thread */
1994 cpu->thread = single_tcg_cpu_thread;
1995 cpu->halt_cond = single_tcg_halt_cond;
1996 cpu->thread_id = first_cpu->thread_id;
1997 cpu->can_do_io = 1;
1998 cpu->created = true;
1999 }
2000 }
2001
2002 static void qemu_hax_start_vcpu(CPUState *cpu)
2003 {
2004 char thread_name[VCPU_THREAD_NAME_SIZE];
2005
2006 cpu->thread = g_malloc0(sizeof(QemuThread));
2007 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2008 qemu_cond_init(cpu->halt_cond);
2009
2010 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2011 cpu->cpu_index);
2012 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2013 cpu, QEMU_THREAD_JOINABLE);
2014 #ifdef _WIN32
2015 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2016 #endif
2017 }
2018
2019 static void qemu_kvm_start_vcpu(CPUState *cpu)
2020 {
2021 char thread_name[VCPU_THREAD_NAME_SIZE];
2022
2023 cpu->thread = g_malloc0(sizeof(QemuThread));
2024 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2025 qemu_cond_init(cpu->halt_cond);
2026 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2027 cpu->cpu_index);
2028 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2029 cpu, QEMU_THREAD_JOINABLE);
2030 }
2031
2032 static void qemu_hvf_start_vcpu(CPUState *cpu)
2033 {
2034 char thread_name[VCPU_THREAD_NAME_SIZE];
2035
2036 /* HVF currently does not support TCG, and only runs in
2037 * unrestricted-guest mode. */
2038 assert(hvf_enabled());
2039
2040 cpu->thread = g_malloc0(sizeof(QemuThread));
2041 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2042 qemu_cond_init(cpu->halt_cond);
2043
2044 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2045 cpu->cpu_index);
2046 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2047 cpu, QEMU_THREAD_JOINABLE);
2048 }
2049
2050 static void qemu_whpx_start_vcpu(CPUState *cpu)
2051 {
2052 char thread_name[VCPU_THREAD_NAME_SIZE];
2053
2054 cpu->thread = g_malloc0(sizeof(QemuThread));
2055 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2056 qemu_cond_init(cpu->halt_cond);
2057 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2058 cpu->cpu_index);
2059 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2060 cpu, QEMU_THREAD_JOINABLE);
2061 #ifdef _WIN32
2062 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2063 #endif
2064 }
2065
2066 static void qemu_dummy_start_vcpu(CPUState *cpu)
2067 {
2068 char thread_name[VCPU_THREAD_NAME_SIZE];
2069
2070 cpu->thread = g_malloc0(sizeof(QemuThread));
2071 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2072 qemu_cond_init(cpu->halt_cond);
2073 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2074 cpu->cpu_index);
2075 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2076 QEMU_THREAD_JOINABLE);
2077 }
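/*
 * Editorial note, not part of the original file: the dummy thread is the
 * fallback when no real accelerator is active (for example under qtest);
 * such a vCPU never executes guest code, its thread only waits on
 * halt_cond and services queued work items.
 */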
2078
2079 void qemu_init_vcpu(CPUState *cpu)
2080 {
2081 cpu->nr_cores = smp_cores;
2082 cpu->nr_threads = smp_threads;
2083 cpu->stopped = true;
2084 cpu->random_seed = qemu_guest_random_seed_thread_part1();
2085
2086 if (!cpu->as) {
2087 /* If the target cpu hasn't set up any address spaces itself,
2088 * give it the default one.
2089 */
2090 cpu->num_ases = 1;
2091 cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2092 }
2093
2094 if (kvm_enabled()) {
2095 qemu_kvm_start_vcpu(cpu);
2096 } else if (hax_enabled()) {
2097 qemu_hax_start_vcpu(cpu);
2098 } else if (hvf_enabled()) {
2099 qemu_hvf_start_vcpu(cpu);
2100 } else if (tcg_enabled()) {
2101 qemu_tcg_init_vcpu(cpu);
2102 } else if (whpx_enabled()) {
2103 qemu_whpx_start_vcpu(cpu);
2104 } else {
2105 qemu_dummy_start_vcpu(cpu);
2106 }
2107
2108 while (!cpu->created) {
2109 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2110 }
2111 }
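/*
 * Editorial sketch, not part of the original file: the wait loop above
 * relies on every accelerator's thread function performing roughly this
 * handshake under the BQL before entering its run loop (the function name
 * below is hypothetical):
 *
 *   static void *example_cpu_thread_fn(void *arg)
 *   {
 *       CPUState *cpu = arg;
 *
 *       qemu_mutex_lock_iothread();
 *       qemu_thread_get_self(cpu->thread);
 *       cpu->thread_id = qemu_get_thread_id();
 *       cpu->can_do_io = 1;
 *       current_cpu = cpu;
 *       cpu->created = true;
 *       qemu_cond_signal(&qemu_cpu_cond);   <- this wakes qemu_init_vcpu()
 *
 *       ... per-accelerator vCPU run loop ...
 *
 *       qemu_mutex_unlock_iothread();
 *       return NULL;
 *   }
 */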
2112
2113 void cpu_stop_current(void)
2114 {
2115 if (current_cpu) {
2116 current_cpu->stop = true;
2117 cpu_exit(current_cpu);
2118 }
2119 }
2120
2121 int vm_stop(RunState state)
2122 {
2123 if (qemu_in_vcpu_thread()) {
2124 qemu_system_vmstop_request_prepare();
2125 qemu_system_vmstop_request(state);
2126 /*
2127 * FIXME: should not return to device code once a
2128 * vm_stop() has been requested.
2129 */
2130 cpu_stop_current();
2131 return 0;
2132 }
2133
2134 return do_vm_stop(state, true);
2135 }
2136
2137 /**
2138 * Prepare for (re)starting the VM.
2139 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2140 * running or in case of an error condition), 0 otherwise.
2141 */
2142 int vm_prepare_start(void)
2143 {
2144 RunState requested;
2145
2146 qemu_vmstop_requested(&requested);
2147 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2148 return -1;
2149 }
2150
2151 /* Ensure that a STOP/RESUME pair of events is emitted if a
2152 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2153 * example, is documented to always be followed by
2154 * the STOP event.
2155 */
2156 if (runstate_is_running()) {
2157 qapi_event_send_stop();
2158 qapi_event_send_resume();
2159 return -1;
2160 }
2161
2162 /* We send this event now, but the CPUs will only be resumed shortly afterwards */
2163 qapi_event_send_resume();
2164
2165 replay_enable_events();
2166 cpu_enable_ticks();
2167 runstate_set(RUN_STATE_RUNNING);
2168 vm_state_notify(1, RUN_STATE_RUNNING);
2169 return 0;
2170 }
2171
2172 void vm_start(void)
2173 {
2174 if (!vm_prepare_start()) {
2175 resume_all_vcpus();
2176 }
2177 }
2178
2179 /* Performs a state transition even if the VM is already stopped;
2180 * the current state is forgotten forever. */
2181 int vm_stop_force_state(RunState state)
2182 {
2183 if (runstate_is_running()) {
2184 return vm_stop(state);
2185 } else {
2186 runstate_set(state);
2187
2188 bdrv_drain_all();
2189 /* Make sure to return an error if the flush in a previous vm_stop()
2190 * failed. */
2191 return bdrv_flush_all();
2192 }
2193 }
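/*
 * Editorial note, not part of the original file: a typical caller of
 * vm_stop_force_state() is the migration code, which needs the new
 * runstate recorded even when the guest is already paused, roughly:
 *
 *   ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
 *   if (ret < 0) {
 *       ... fail the migration ...
 *   }
 */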
2194
2195 void list_cpus(const char *optarg)
2196 {
2197 /* XXX: implement xxx_cpu_list for targets that are still missing it */
2198 #if defined(cpu_list)
2199 cpu_list();
2200 #endif
2201 }
2202
2203 CpuInfoList *qmp_query_cpus(Error **errp)
2204 {
2205 MachineState *ms = MACHINE(qdev_get_machine());
2206 MachineClass *mc = MACHINE_GET_CLASS(ms);
2207 CpuInfoList *head = NULL, *cur_item = NULL;
2208 CPUState *cpu;
2209
2210 CPU_FOREACH(cpu) {
2211 CpuInfoList *info;
2212 #if defined(TARGET_I386)
2213 X86CPU *x86_cpu = X86_CPU(cpu);
2214 CPUX86State *env = &x86_cpu->env;
2215 #elif defined(TARGET_PPC)
2216 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2217 CPUPPCState *env = &ppc_cpu->env;
2218 #elif defined(TARGET_SPARC)
2219 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2220 CPUSPARCState *env = &sparc_cpu->env;
2221 #elif defined(TARGET_RISCV)
2222 RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2223 CPURISCVState *env = &riscv_cpu->env;
2224 #elif defined(TARGET_MIPS)
2225 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2226 CPUMIPSState *env = &mips_cpu->env;
2227 #elif defined(TARGET_TRICORE)
2228 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2229 CPUTriCoreState *env = &tricore_cpu->env;
2230 #elif defined(TARGET_S390X)
2231 S390CPU *s390_cpu = S390_CPU(cpu);
2232 CPUS390XState *env = &s390_cpu->env;
2233 #endif
2234
2235 cpu_synchronize_state(cpu);
2236
2237 info = g_malloc0(sizeof(*info));
2238 info->value = g_malloc0(sizeof(*info->value));
2239 info->value->CPU = cpu->cpu_index;
2240 info->value->current = (cpu == first_cpu);
2241 info->value->halted = cpu->halted;
2242 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2243 info->value->thread_id = cpu->thread_id;
2244 #if defined(TARGET_I386)
2245 info->value->arch = CPU_INFO_ARCH_X86;
2246 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2247 #elif defined(TARGET_PPC)
2248 info->value->arch = CPU_INFO_ARCH_PPC;
2249 info->value->u.ppc.nip = env->nip;
2250 #elif defined(TARGET_SPARC)
2251 info->value->arch = CPU_INFO_ARCH_SPARC;
2252 info->value->u.q_sparc.pc = env->pc;
2253 info->value->u.q_sparc.npc = env->npc;
2254 #elif defined(TARGET_MIPS)
2255 info->value->arch = CPU_INFO_ARCH_MIPS;
2256 info->value->u.q_mips.PC = env->active_tc.PC;
2257 #elif defined(TARGET_TRICORE)
2258 info->value->arch = CPU_INFO_ARCH_TRICORE;
2259 info->value->u.tricore.PC = env->PC;
2260 #elif defined(TARGET_S390X)
2261 info->value->arch = CPU_INFO_ARCH_S390;
2262 info->value->u.s390.cpu_state = env->cpu_state;
2263 #elif defined(TARGET_RISCV)
2264 info->value->arch = CPU_INFO_ARCH_RISCV;
2265 info->value->u.riscv.pc = env->pc;
2266 #else
2267 info->value->arch = CPU_INFO_ARCH_OTHER;
2268 #endif
2269 info->value->has_props = !!mc->cpu_index_to_instance_props;
2270 if (info->value->has_props) {
2271 CpuInstanceProperties *props;
2272 props = g_malloc0(sizeof(*props));
2273 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2274 info->value->props = props;
2275 }
2276
2277 /* XXX: waiting for the qapi to support GSList */
2278 if (!cur_item) {
2279 head = cur_item = info;
2280 } else {
2281 cur_item->next = info;
2282 cur_item = info;
2283 }
2284 }
2285
2286 return head;
2287 }
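/*
 * Editorial note, not part of the original file: this function backs the
 * QMP "query-cpus" command.  An abridged exchange might look like this
 * (field values invented for illustration):
 *
 *   -> { "execute": "query-cpus" }
 *   <- { "return": [ { "CPU": 0, "current": true, "halted": false,
 *                      "qom_path": "/machine/unattached/device[0]",
 *                      "thread_id": 3134, "arch": "x86",
 *                      "pc": 1048576 } ] }
 */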
2288
2289 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2290 {
2291 /*
2292 * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2293 * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2294 */
2295 switch (target) {
2296 case SYS_EMU_TARGET_I386:
2297 case SYS_EMU_TARGET_X86_64:
2298 return CPU_INFO_ARCH_X86;
2299
2300 case SYS_EMU_TARGET_PPC:
2301 case SYS_EMU_TARGET_PPC64:
2302 return CPU_INFO_ARCH_PPC;
2303
2304 case SYS_EMU_TARGET_SPARC:
2305 case SYS_EMU_TARGET_SPARC64:
2306 return CPU_INFO_ARCH_SPARC;
2307
2308 case SYS_EMU_TARGET_MIPS:
2309 case SYS_EMU_TARGET_MIPSEL:
2310 case SYS_EMU_TARGET_MIPS64:
2311 case SYS_EMU_TARGET_MIPS64EL:
2312 return CPU_INFO_ARCH_MIPS;
2313
2314 case SYS_EMU_TARGET_TRICORE:
2315 return CPU_INFO_ARCH_TRICORE;
2316
2317 case SYS_EMU_TARGET_S390X:
2318 return CPU_INFO_ARCH_S390;
2319
2320 case SYS_EMU_TARGET_RISCV32:
2321 case SYS_EMU_TARGET_RISCV64:
2322 return CPU_INFO_ARCH_RISCV;
2323
2324 default:
2325 return CPU_INFO_ARCH_OTHER;
2326 }
2327 }
2328
2329 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2330 {
2331 #ifdef TARGET_S390X
2332 S390CPU *s390_cpu = S390_CPU(cpu);
2333 CPUS390XState *env = &s390_cpu->env;
2334
2335 info->cpu_state = env->cpu_state;
2336 #else
2337 abort();
2338 #endif
2339 }
2340
2341 /*
2342 * "fast" means that we NEVER interrupt the vCPU threads to retrieve
2343 * information from KVM.
2344 */
2345 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2346 {
2347 MachineState *ms = MACHINE(qdev_get_machine());
2348 MachineClass *mc = MACHINE_GET_CLASS(ms);
2349 CpuInfoFastList *head = NULL, *cur_item = NULL;
2350 SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2351 -1, &error_abort);
2352 CPUState *cpu;
2353
2354 CPU_FOREACH(cpu) {
2355 CpuInfoFastList *info = g_malloc0(sizeof(*info));
2356 info->value = g_malloc0(sizeof(*info->value));
2357
2358 info->value->cpu_index = cpu->cpu_index;
2359 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2360 info->value->thread_id = cpu->thread_id;
2361
2362 info->value->has_props = !!mc->cpu_index_to_instance_props;
2363 if (info->value->has_props) {
2364 CpuInstanceProperties *props;
2365 props = g_malloc0(sizeof(*props));
2366 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2367 info->value->props = props;
2368 }
2369
2370 info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2371 info->value->target = target;
2372 if (target == SYS_EMU_TARGET_S390X) {
2373 cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2374 }
2375
2376 if (!cur_item) {
2377 head = cur_item = info;
2378 } else {
2379 cur_item->next = info;
2380 cur_item = info;
2381 }
2382 }
2383
2384 return head;
2385 }
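/*
 * Editorial note, not part of the original file: the corresponding
 * "query-cpus-fast" exchange, abridged (field values invented):
 *
 *   -> { "execute": "query-cpus-fast" }
 *   <- { "return": [ { "cpu-index": 0,
 *                      "qom-path": "/machine/unattached/device[0]",
 *                      "thread-id": 3134, "arch": "x86",
 *                      "target": "x86_64" } ] }
 */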
2386
2387 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2388 bool has_cpu, int64_t cpu_index, Error **errp)
2389 {
2390 FILE *f;
2391 uint32_t l;
2392 CPUState *cpu;
2393 uint8_t buf[1024];
2394 int64_t orig_addr = addr, orig_size = size;
2395
2396 if (!has_cpu) {
2397 cpu_index = 0;
2398 }
2399
2400 cpu = qemu_get_cpu(cpu_index);
2401 if (cpu == NULL) {
2402 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2403 "a CPU number");
2404 return;
2405 }
2406
2407 f = fopen(filename, "wb");
2408 if (!f) {
2409 error_setg_file_open(errp, errno, filename);
2410 return;
2411 }
2412
2413 while (size != 0) {
2414 l = sizeof(buf);
2415 if (l > size)
2416 l = size;
2417 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2418 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2419 " specified", orig_addr, orig_size);
2420 goto exit;
2421 }
2422 if (fwrite(buf, 1, l, f) != l) {
2423 error_setg(errp, QERR_IO_ERROR);
2424 goto exit;
2425 }
2426 addr += l;
2427 size -= l;
2428 }
2429
2430 exit:
2431 fclose(f);
2432 }
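/*
 * Editorial note, not part of the original file: qmp_memsave() backs the
 * QMP "memsave" command, which dumps guest *virtual* memory as seen by the
 * selected vCPU.  A possible invocation (argument names assumed from the
 * QAPI schema, values invented):
 *
 *   -> { "execute": "memsave",
 *        "arguments": { "val": 4194304, "size": 1024,
 *                       "filename": "/tmp/virtual-mem-dump" } }
 *   <- { "return": {} }
 */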
2433
2434 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2435 Error **errp)
2436 {
2437 FILE *f;
2438 uint32_t l;
2439 uint8_t buf[1024];
2440
2441 f = fopen(filename, "wb");
2442 if (!f) {
2443 error_setg_file_open(errp, errno, filename);
2444 return;
2445 }
2446
2447 while (size != 0) {
2448 l = sizeof(buf);
2449 if (l > size)
2450 l = size;
2451 cpu_physical_memory_read(addr, buf, l);
2452 if (fwrite(buf, 1, l, f) != l) {
2453 error_setg(errp, QERR_IO_ERROR);
2454 goto exit;
2455 }
2456 addr += l;
2457 size -= l;
2458 }
2459
2460 exit:
2461 fclose(f);
2462 }
2463
2464 void qmp_inject_nmi(Error **errp)
2465 {
2466 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2467 }
2468
2469 void dump_drift_info(void)
2470 {
2471 if (!use_icount) {
2472 return;
2473 }
2474
2475 qemu_printf("Host - Guest clock %"PRIi64" ms\n",
2476 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2477 if (icount_align_option) {
2478 qemu_printf("Max guest delay %"PRIi64" ms\n",
2479 -max_delay / SCALE_MS);
2480 qemu_printf("Max guest advance %"PRIi64" ms\n",
2481 max_advance / SCALE_MS);
2482 } else {
2483 qemu_printf("Max guest delay NA\n");
2484 qemu_printf("Max guest advance NA\n");
2485 }
2486 }
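/*
 * Editorial note, not part of the original file: with -icount enabled, the
 * output produced above looks roughly like (numbers invented):
 *
 *   Host - Guest clock -5 ms
 *   Max guest delay 12 ms
 *   Max guest advance 3 ms
 *
 * The two "Max guest ..." lines carry real values only when -icount was
 * given with align=on (icount_align_option); otherwise "NA" is printed.
 */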