cpus.c (mirror_qemu.git), at commit "cpu: Move icount_decr to CPUNegativeOffsetState"
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "qemu/osdep.h"
26 #include "qemu/config-file.h"
27 #include "cpu.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "qemu/qemu-print.h"
35 #include "sysemu/sysemu.h"
36 #include "sysemu/block-backend.h"
37 #include "exec/gdbstub.h"
38 #include "sysemu/dma.h"
39 #include "sysemu/hw_accel.h"
40 #include "sysemu/kvm.h"
41 #include "sysemu/hax.h"
42 #include "sysemu/hvf.h"
43 #include "sysemu/whpx.h"
44 #include "exec/exec-all.h"
45
46 #include "qemu/thread.h"
47 #include "sysemu/cpus.h"
48 #include "sysemu/qtest.h"
49 #include "qemu/main-loop.h"
50 #include "qemu/option.h"
51 #include "qemu/bitmap.h"
52 #include "qemu/seqlock.h"
53 #include "qemu/guest-random.h"
54 #include "tcg.h"
55 #include "hw/nmi.h"
56 #include "sysemu/replay.h"
57 #include "hw/boards.h"
58
59 #ifdef CONFIG_LINUX
60
61 #include <sys/prctl.h>
62
63 #ifndef PR_MCE_KILL
64 #define PR_MCE_KILL 33
65 #endif
66
67 #ifndef PR_MCE_KILL_SET
68 #define PR_MCE_KILL_SET 1
69 #endif
70
71 #ifndef PR_MCE_KILL_EARLY
72 #define PR_MCE_KILL_EARLY 1
73 #endif
74
75 #endif /* CONFIG_LINUX */
76
77 int64_t max_delay;
78 int64_t max_advance;
79
80 /* vcpu throttling controls */
81 static QEMUTimer *throttle_timer;
82 static unsigned int throttle_percentage;
83
84 #define CPU_THROTTLE_PCT_MIN 1
85 #define CPU_THROTTLE_PCT_MAX 99
86 #define CPU_THROTTLE_TIMESLICE_NS 10000000
87
88 bool cpu_is_stopped(CPUState *cpu)
89 {
90 return cpu->stopped || !runstate_is_running();
91 }
92
93 static bool cpu_thread_is_idle(CPUState *cpu)
94 {
95 if (cpu->stop || cpu->queued_work_first) {
96 return false;
97 }
98 if (cpu_is_stopped(cpu)) {
99 return true;
100 }
101 if (!cpu->halted || cpu_has_work(cpu) ||
102 kvm_halt_in_kernel()) {
103 return false;
104 }
105 return true;
106 }
107
108 static bool all_cpu_threads_idle(void)
109 {
110 CPUState *cpu;
111
112 CPU_FOREACH(cpu) {
113 if (!cpu_thread_is_idle(cpu)) {
114 return false;
115 }
116 }
117 return true;
118 }
119
120 /***********************************************************/
121 /* guest cycle counter */
122
123 /* Protected by TimersState seqlock */
124
125 static bool icount_sleep = true;
126 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
127 #define MAX_ICOUNT_SHIFT 10
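/*
 * cpu_icount_to_ns() below converts instructions to time by shifting left by
 * icount_time_shift, so a shift of 10 models 2^10 = 1024 ns per instruction,
 * i.e. roughly 1 MIPS; larger shifts would model an even slower guest.
 */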
128
129 typedef struct TimersState {
130 /* Protected by BQL. */
131 int64_t cpu_ticks_prev;
132 int64_t cpu_ticks_offset;
133
134 /* Protect fields that can be respectively read outside the
135 * BQL, and written from multiple threads.
136 */
137 QemuSeqLock vm_clock_seqlock;
138 QemuSpin vm_clock_lock;
139
140 int16_t cpu_ticks_enabled;
141
142 /* Conversion factor from emulated instructions to virtual clock ticks. */
143 int16_t icount_time_shift;
144
145 /* Compensate for varying guest execution speed. */
146 int64_t qemu_icount_bias;
147
148 int64_t vm_clock_warp_start;
149 int64_t cpu_clock_offset;
150
151 /* Only written by TCG thread */
152 int64_t qemu_icount;
153
154 /* for adjusting icount */
155 QEMUTimer *icount_rt_timer;
156 QEMUTimer *icount_vm_timer;
157 QEMUTimer *icount_warp_timer;
158 } TimersState;
159
160 static TimersState timers_state;
161 bool mttcg_enabled;
162
163 /*
164 * We default to false if we know other options have been enabled
165 * which are currently incompatible with MTTCG. Otherwise when each
166 * guest (target) has been updated to support:
167 * - atomic instructions
168 * - memory ordering primitives (barriers)
169 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
170 *
171 * Once a guest architecture has been converted to the new primitives
172 * there are two remaining limitations to check.
173 *
174 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
175 * - The host must have a stronger memory order than the guest
176 *
177 * It may be possible in future to support strong guests on weak hosts
178 * but that will require tagging all load/stores in a guest with their
179 * implicit memory order requirements which would likely slow things
180 * down a lot.
181 */
182
183 static bool check_tcg_memory_orders_compatible(void)
184 {
185 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
186 return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
187 #else
188 return false;
189 #endif
190 }
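/*
 * The check above succeeds when every ordering bit the guest relies on by
 * default (TCG_GUEST_DEFAULT_MO) is also guaranteed by the TCG backend on
 * this host (TCG_TARGET_DEFAULT_MO), i.e. the host memory model is at least
 * as strong as the guest's.
 */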
191
192 static bool default_mttcg_enabled(void)
193 {
194 if (use_icount || TCG_OVERSIZED_GUEST) {
195 return false;
196 } else {
197 #ifdef TARGET_SUPPORTS_MTTCG
198 return check_tcg_memory_orders_compatible();
199 #else
200 return false;
201 #endif
202 }
203 }
204
205 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
206 {
207 const char *t = qemu_opt_get(opts, "thread");
208 if (t) {
209 if (strcmp(t, "multi") == 0) {
210 if (TCG_OVERSIZED_GUEST) {
211 error_setg(errp, "No MTTCG when guest word size > hosts");
212 } else if (use_icount) {
213 error_setg(errp, "No MTTCG when icount is enabled");
214 } else {
215 #ifndef TARGET_SUPPORTS_MTTCG
216 warn_report("Guest not yet converted to MTTCG - "
217 "you may get unexpected results");
218 #endif
219 if (!check_tcg_memory_orders_compatible()) {
220 warn_report("Guest expects a stronger memory ordering "
221 "than the host provides");
222 error_printf("This may cause strange/hard to debug errors\n");
223 }
224 mttcg_enabled = true;
225 }
226 } else if (strcmp(t, "single") == 0) {
227 mttcg_enabled = false;
228 } else {
229 error_setg(errp, "Invalid 'thread' setting %s", t);
230 }
231 } else {
232 mttcg_enabled = default_mttcg_enabled();
233 }
234 }
235
236 /* The current number of executed instructions is based on what we
237 * originally budgeted minus the current state of the decrementing
238 * icount counters in extra/u16.low.
239 */
240 static int64_t cpu_get_icount_executed(CPUState *cpu)
241 {
242 return (cpu->icount_budget -
243 (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
244 }
245
246 /*
247 * Update the global shared timer_state.qemu_icount to take into
248 * account executed instructions. This is done by the TCG vCPU
249 * thread so the main-loop can see time has moved forward.
250 */
251 static void cpu_update_icount_locked(CPUState *cpu)
252 {
253 int64_t executed = cpu_get_icount_executed(cpu);
254 cpu->icount_budget -= executed;
255
256 atomic_set_i64(&timers_state.qemu_icount,
257 timers_state.qemu_icount + executed);
258 }
259
260 /*
261 * Update the global shared timer_state.qemu_icount to take into
262 * account executed instructions. This is done by the TCG vCPU
263 * thread so the main-loop can see time has moved forward.
264 */
265 void cpu_update_icount(CPUState *cpu)
266 {
267 seqlock_write_lock(&timers_state.vm_clock_seqlock,
268 &timers_state.vm_clock_lock);
269 cpu_update_icount_locked(cpu);
270 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
271 &timers_state.vm_clock_lock);
272 }
273
274 static int64_t cpu_get_icount_raw_locked(void)
275 {
276 CPUState *cpu = current_cpu;
277
278 if (cpu && cpu->running) {
279 if (!cpu->can_do_io) {
280 error_report("Bad icount read");
281 exit(1);
282 }
283 /* Take into account what has run */
284 cpu_update_icount_locked(cpu);
285 }
286 /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
287 return atomic_read_i64(&timers_state.qemu_icount);
288 }
289
290 static int64_t cpu_get_icount_locked(void)
291 {
292 int64_t icount = cpu_get_icount_raw_locked();
293 return atomic_read_i64(&timers_state.qemu_icount_bias) +
294 cpu_icount_to_ns(icount);
295 }
296
297 int64_t cpu_get_icount_raw(void)
298 {
299 int64_t icount;
300 unsigned start;
301
302 do {
303 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
304 icount = cpu_get_icount_raw_locked();
305 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
306
307 return icount;
308 }
309
310 /* Return the virtual CPU time, based on the instruction counter. */
311 int64_t cpu_get_icount(void)
312 {
313 int64_t icount;
314 unsigned start;
315
316 do {
317 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
318 icount = cpu_get_icount_locked();
319 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
320
321 return icount;
322 }
323
324 int64_t cpu_icount_to_ns(int64_t icount)
325 {
326 return icount << atomic_read(&timers_state.icount_time_shift);
327 }
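/*
 * Rough worked example: with the default shift of 3 set in configure_icount(),
 * each instruction accounts for 2^3 = 8 ns of QEMU_CLOCK_VIRTUAL time, i.e. a
 * nominal guest speed of 125 MIPS.
 */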
328
329 static int64_t cpu_get_ticks_locked(void)
330 {
331 int64_t ticks = timers_state.cpu_ticks_offset;
332 if (timers_state.cpu_ticks_enabled) {
333 ticks += cpu_get_host_ticks();
334 }
335
336 if (timers_state.cpu_ticks_prev > ticks) {
337 /* Non-increasing ticks may happen if the host uses software suspend. */
338 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
339 ticks = timers_state.cpu_ticks_prev;
340 }
341
342 timers_state.cpu_ticks_prev = ticks;
343 return ticks;
344 }
345
346 /* Return the time elapsed in the VM between vm_start and vm_stop. Unless
347 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
348 * counter.
349 */
350 int64_t cpu_get_ticks(void)
351 {
352 int64_t ticks;
353
354 if (use_icount) {
355 return cpu_get_icount();
356 }
357
358 qemu_spin_lock(&timers_state.vm_clock_lock);
359 ticks = cpu_get_ticks_locked();
360 qemu_spin_unlock(&timers_state.vm_clock_lock);
361 return ticks;
362 }
363
364 static int64_t cpu_get_clock_locked(void)
365 {
366 int64_t time;
367
368 time = timers_state.cpu_clock_offset;
369 if (timers_state.cpu_ticks_enabled) {
370 time += get_clock();
371 }
372
373 return time;
374 }
375
376 /* Return the monotonic time elapsed in VM, i.e.,
377 * the time between vm_start and vm_stop
378 */
379 int64_t cpu_get_clock(void)
380 {
381 int64_t ti;
382 unsigned start;
383
384 do {
385 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
386 ti = cpu_get_clock_locked();
387 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
388
389 return ti;
390 }
391
392 /* enable cpu_get_ticks()
393 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
394 */
395 void cpu_enable_ticks(void)
396 {
397 seqlock_write_lock(&timers_state.vm_clock_seqlock,
398 &timers_state.vm_clock_lock);
399 if (!timers_state.cpu_ticks_enabled) {
400 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
401 timers_state.cpu_clock_offset -= get_clock();
402 timers_state.cpu_ticks_enabled = 1;
403 }
404 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
405 &timers_state.vm_clock_lock);
406 }
407
408 /* disable cpu_get_ticks(): the clock is stopped. You must not call
409 * cpu_get_ticks() after that.
410 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
411 */
412 void cpu_disable_ticks(void)
413 {
414 seqlock_write_lock(&timers_state.vm_clock_seqlock,
415 &timers_state.vm_clock_lock);
416 if (timers_state.cpu_ticks_enabled) {
417 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
418 timers_state.cpu_clock_offset = cpu_get_clock_locked();
419 timers_state.cpu_ticks_enabled = 0;
420 }
421 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
422 &timers_state.vm_clock_lock);
423 }
424
425 /* Correlation between real and virtual time is always going to be
426 fairly approximate, so ignore small variation.
427 When the guest is idle real and virtual time will be aligned in
428 the IO wait loop. */
429 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
430
431 static void icount_adjust(void)
432 {
433 int64_t cur_time;
434 int64_t cur_icount;
435 int64_t delta;
436
437 /* Protected by TimersState mutex. */
438 static int64_t last_delta;
439
440 /* If the VM is not running, then do nothing. */
441 if (!runstate_is_running()) {
442 return;
443 }
444
445 seqlock_write_lock(&timers_state.vm_clock_seqlock,
446 &timers_state.vm_clock_lock);
447 cur_time = cpu_get_clock_locked();
448 cur_icount = cpu_get_icount_locked();
449
450 delta = cur_icount - cur_time;
451 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
452 if (delta > 0
453 && last_delta + ICOUNT_WOBBLE < delta * 2
454 && timers_state.icount_time_shift > 0) {
455 /* The guest is getting too far ahead. Slow time down. */
456 atomic_set(&timers_state.icount_time_shift,
457 timers_state.icount_time_shift - 1);
458 }
459 if (delta < 0
460 && last_delta - ICOUNT_WOBBLE > delta * 2
461 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
462 /* The guest is getting too far behind. Speed time up. */
463 atomic_set(&timers_state.icount_time_shift,
464 timers_state.icount_time_shift + 1);
465 }
466 last_delta = delta;
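    /*
     * Virtual time is qemu_icount_bias + (qemu_icount << icount_time_shift)
     * (see cpu_get_icount_locked()), so after a shift change the bias is
     * recomputed below to keep the current virtual time continuous.
     */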
467 atomic_set_i64(&timers_state.qemu_icount_bias,
468 cur_icount - (timers_state.qemu_icount
469 << timers_state.icount_time_shift));
470 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
471 &timers_state.vm_clock_lock);
472 }
473
474 static void icount_adjust_rt(void *opaque)
475 {
476 timer_mod(timers_state.icount_rt_timer,
477 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
478 icount_adjust();
479 }
480
481 static void icount_adjust_vm(void *opaque)
482 {
483 timer_mod(timers_state.icount_vm_timer,
484 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
485 NANOSECONDS_PER_SECOND / 10);
486 icount_adjust();
487 }
488
489 static int64_t qemu_icount_round(int64_t count)
490 {
491 int shift = atomic_read(&timers_state.icount_time_shift);
492 return (count + (1 << shift) - 1) >> shift;
493 }
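/*
 * qemu_icount_round() turns a nanosecond deadline into an instruction count,
 * rounding up so that the budget always covers the whole deadline; e.g. with
 * shift = 3 a 100 ns deadline becomes (100 + 7) >> 3 = 13 instructions.
 */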
494
495 static void icount_warp_rt(void)
496 {
497 unsigned seq;
498 int64_t warp_start;
499
500 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
501 * changes from -1 to another value, so the race here is okay.
502 */
503 do {
504 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
505 warp_start = timers_state.vm_clock_warp_start;
506 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
507
508 if (warp_start == -1) {
509 return;
510 }
511
512 seqlock_write_lock(&timers_state.vm_clock_seqlock,
513 &timers_state.vm_clock_lock);
514 if (runstate_is_running()) {
515 int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
516 cpu_get_clock_locked());
517 int64_t warp_delta;
518
519 warp_delta = clock - timers_state.vm_clock_warp_start;
520 if (use_icount == 2) {
521 /*
522 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
523 * far ahead of real time.
524 */
525 int64_t cur_icount = cpu_get_icount_locked();
526 int64_t delta = clock - cur_icount;
527 warp_delta = MIN(warp_delta, delta);
528 }
529 atomic_set_i64(&timers_state.qemu_icount_bias,
530 timers_state.qemu_icount_bias + warp_delta);
531 }
532 timers_state.vm_clock_warp_start = -1;
533 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
534 &timers_state.vm_clock_lock);
535
536 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
537 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
538 }
539 }
540
541 static void icount_timer_cb(void *opaque)
542 {
543 /* No need for a checkpoint because the timer already synchronizes
544 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
545 */
546 icount_warp_rt();
547 }
548
549 void qtest_clock_warp(int64_t dest)
550 {
551 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
552 AioContext *aio_context;
553 assert(qtest_enabled());
554 aio_context = qemu_get_aio_context();
555 while (clock < dest) {
556 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
557 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
558
559 seqlock_write_lock(&timers_state.vm_clock_seqlock,
560 &timers_state.vm_clock_lock);
561 atomic_set_i64(&timers_state.qemu_icount_bias,
562 timers_state.qemu_icount_bias + warp);
563 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
564 &timers_state.vm_clock_lock);
565
566 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
567 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
568 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
569 }
570 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
571 }
572
573 void qemu_start_warp_timer(void)
574 {
575 int64_t clock;
576 int64_t deadline;
577
578 if (!use_icount) {
579 return;
580 }
581
582 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
583 * do not fire, so computing the deadline does not make sense.
584 */
585 if (!runstate_is_running()) {
586 return;
587 }
588
589 if (replay_mode != REPLAY_MODE_PLAY) {
590 if (!all_cpu_threads_idle()) {
591 return;
592 }
593
594 if (qtest_enabled()) {
595 /* When testing, qtest commands advance icount. */
596 return;
597 }
598
599 replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
600 } else {
601 /* warp clock deterministically in record/replay mode */
602 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
603                 /* The vCPU is sleeping and the warp can't be started.
604                    It is probably a race condition: the notification sent
605                    to the vCPU was processed in advance and the vCPU went to sleep.
606                    Therefore we have to wake it up to do something. */
607 if (replay_has_checkpoint()) {
608 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
609 }
610 return;
611 }
612 }
613
614 /* We want to use the earliest deadline from ALL vm_clocks */
615 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
616 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
617 if (deadline < 0) {
618 static bool notified;
619 if (!icount_sleep && !notified) {
620 warn_report("icount sleep disabled and no active timers");
621 notified = true;
622 }
623 return;
624 }
625
626 if (deadline > 0) {
627 /*
628 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
629 * sleep. Otherwise, the CPU might be waiting for a future timer
630 * interrupt to wake it up, but the interrupt never comes because
631 * the vCPU isn't running any insns and thus doesn't advance the
632 * QEMU_CLOCK_VIRTUAL.
633 */
634 if (!icount_sleep) {
635 /*
636 * We never let VCPUs sleep in no sleep icount mode.
637 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
638 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
639 * It is useful when we want a deterministic execution time,
640 * isolated from host latencies.
641 */
642 seqlock_write_lock(&timers_state.vm_clock_seqlock,
643 &timers_state.vm_clock_lock);
644 atomic_set_i64(&timers_state.qemu_icount_bias,
645 timers_state.qemu_icount_bias + deadline);
646 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
647 &timers_state.vm_clock_lock);
648 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
649 } else {
650 /*
651              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
652              * "real" time (related to the time left until the next event) has
653              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
654              * This prevents the warps from being visible externally; for example,
655              * you will not be sending network packets continuously instead of
656              * every 100ms.
657 */
658 seqlock_write_lock(&timers_state.vm_clock_seqlock,
659 &timers_state.vm_clock_lock);
660 if (timers_state.vm_clock_warp_start == -1
661 || timers_state.vm_clock_warp_start > clock) {
662 timers_state.vm_clock_warp_start = clock;
663 }
664 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
665 &timers_state.vm_clock_lock);
666 timer_mod_anticipate(timers_state.icount_warp_timer,
667 clock + deadline);
668 }
669 } else if (deadline == 0) {
670 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
671 }
672 }
673
674 static void qemu_account_warp_timer(void)
675 {
676 if (!use_icount || !icount_sleep) {
677 return;
678 }
679
680 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
681 * do not fire, so computing the deadline does not make sense.
682 */
683 if (!runstate_is_running()) {
684 return;
685 }
686
687 /* warp clock deterministically in record/replay mode */
688 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
689 return;
690 }
691
692 timer_del(timers_state.icount_warp_timer);
693 icount_warp_rt();
694 }
695
696 static bool icount_state_needed(void *opaque)
697 {
698 return use_icount;
699 }
700
701 static bool warp_timer_state_needed(void *opaque)
702 {
703 TimersState *s = opaque;
704 return s->icount_warp_timer != NULL;
705 }
706
707 static bool adjust_timers_state_needed(void *opaque)
708 {
709 TimersState *s = opaque;
710 return s->icount_rt_timer != NULL;
711 }
712
713 /*
714  * Subsection for warp timer migration is optional, because it may not be created
715 */
716 static const VMStateDescription icount_vmstate_warp_timer = {
717 .name = "timer/icount/warp_timer",
718 .version_id = 1,
719 .minimum_version_id = 1,
720 .needed = warp_timer_state_needed,
721 .fields = (VMStateField[]) {
722 VMSTATE_INT64(vm_clock_warp_start, TimersState),
723 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
724 VMSTATE_END_OF_LIST()
725 }
726 };
727
728 static const VMStateDescription icount_vmstate_adjust_timers = {
729 .name = "timer/icount/timers",
730 .version_id = 1,
731 .minimum_version_id = 1,
732 .needed = adjust_timers_state_needed,
733 .fields = (VMStateField[]) {
734 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
735 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
736 VMSTATE_END_OF_LIST()
737 }
738 };
739
740 /*
741 * This is a subsection for icount migration.
742 */
743 static const VMStateDescription icount_vmstate_timers = {
744 .name = "timer/icount",
745 .version_id = 1,
746 .minimum_version_id = 1,
747 .needed = icount_state_needed,
748 .fields = (VMStateField[]) {
749 VMSTATE_INT64(qemu_icount_bias, TimersState),
750 VMSTATE_INT64(qemu_icount, TimersState),
751 VMSTATE_END_OF_LIST()
752 },
753 .subsections = (const VMStateDescription*[]) {
754 &icount_vmstate_warp_timer,
755 &icount_vmstate_adjust_timers,
756 NULL
757 }
758 };
759
760 static const VMStateDescription vmstate_timers = {
761 .name = "timer",
762 .version_id = 2,
763 .minimum_version_id = 1,
764 .fields = (VMStateField[]) {
765 VMSTATE_INT64(cpu_ticks_offset, TimersState),
766 VMSTATE_UNUSED(8),
767 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
768 VMSTATE_END_OF_LIST()
769 },
770 .subsections = (const VMStateDescription*[]) {
771 &icount_vmstate_timers,
772 NULL
773 }
774 };
775
776 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
777 {
778 double pct;
779 double throttle_ratio;
780 long sleeptime_ns;
781
782 if (!cpu_throttle_get_percentage()) {
783 return;
784 }
785
786 pct = (double)cpu_throttle_get_percentage()/100;
787 throttle_ratio = pct / (1 - pct);
788 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
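    /*
     * Rough worked example: at 50% throttle pct = 0.5 and throttle_ratio = 1,
     * so the vCPU sleeps one 10 ms timeslice for every 10 ms it runs; at 99%
     * it sleeps roughly 990 ms per timeslice.
     */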
789
790 qemu_mutex_unlock_iothread();
791 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
792 qemu_mutex_lock_iothread();
793 atomic_set(&cpu->throttle_thread_scheduled, 0);
794 }
795
796 static void cpu_throttle_timer_tick(void *opaque)
797 {
798 CPUState *cpu;
799 double pct;
800
801 /* Stop the timer if needed */
802 if (!cpu_throttle_get_percentage()) {
803 return;
804 }
805 CPU_FOREACH(cpu) {
806 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
807 async_run_on_cpu(cpu, cpu_throttle_thread,
808 RUN_ON_CPU_NULL);
809 }
810 }
811
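    /*
     * The period below is stretched to CPU_THROTTLE_TIMESLICE_NS / (1 - pct),
     * which leaves exactly one timeslice of run time between sleeps; e.g. at
     * 50% the timer fires every 20 ms and the vCPU sleeps for 10 ms of each
     * period.
     */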
812 pct = (double)cpu_throttle_get_percentage()/100;
813 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
814 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
815 }
816
817 void cpu_throttle_set(int new_throttle_pct)
818 {
819 /* Ensure throttle percentage is within valid range */
820 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
821 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
822
823 atomic_set(&throttle_percentage, new_throttle_pct);
824
825 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
826 CPU_THROTTLE_TIMESLICE_NS);
827 }
828
829 void cpu_throttle_stop(void)
830 {
831 atomic_set(&throttle_percentage, 0);
832 }
833
834 bool cpu_throttle_active(void)
835 {
836 return (cpu_throttle_get_percentage() != 0);
837 }
838
839 int cpu_throttle_get_percentage(void)
840 {
841 return atomic_read(&throttle_percentage);
842 }
843
844 void cpu_ticks_init(void)
845 {
846 seqlock_init(&timers_state.vm_clock_seqlock);
847 qemu_spin_init(&timers_state.vm_clock_lock);
848 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
849 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
850 cpu_throttle_timer_tick, NULL);
851 }
852
853 void configure_icount(QemuOpts *opts, Error **errp)
854 {
855 const char *option;
856 char *rem_str = NULL;
857
858 option = qemu_opt_get(opts, "shift");
859 if (!option) {
860 if (qemu_opt_get(opts, "align") != NULL) {
861 error_setg(errp, "Please specify shift option when using align");
862 }
863 return;
864 }
865
866 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
867 if (icount_sleep) {
868 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
869 icount_timer_cb, NULL);
870 }
871
872 icount_align_option = qemu_opt_get_bool(opts, "align", false);
873
874 if (icount_align_option && !icount_sleep) {
875 error_setg(errp, "align=on and sleep=off are incompatible");
876 }
877 if (strcmp(option, "auto") != 0) {
878 errno = 0;
879 timers_state.icount_time_shift = strtol(option, &rem_str, 0);
880 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
881 error_setg(errp, "icount: Invalid shift value");
882 }
883 use_icount = 1;
884 return;
885 } else if (icount_align_option) {
886 error_setg(errp, "shift=auto and align=on are incompatible");
887 } else if (!icount_sleep) {
888 error_setg(errp, "shift=auto and sleep=off are incompatible");
889 }
890
891 use_icount = 2;
892
893 /* 125MIPS seems a reasonable initial guess at the guest speed.
894 It will be corrected fairly quickly anyway. */
895 timers_state.icount_time_shift = 3;
896
897 /* Have both realtime and virtual time triggers for speed adjustment.
898 The realtime trigger catches emulated time passing too slowly,
899 the virtual time trigger catches emulated time passing too fast.
900 Realtime triggers occur even when idle, so use them less frequently
901 than VM triggers. */
902 timers_state.vm_clock_warp_start = -1;
903 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
904 icount_adjust_rt, NULL);
905 timer_mod(timers_state.icount_rt_timer,
906 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
907 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
908 icount_adjust_vm, NULL);
909 timer_mod(timers_state.icount_vm_timer,
910 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
911 NANOSECONDS_PER_SECOND / 10);
912 }
913
914 /***********************************************************/
915 /* TCG vCPU kick timer
916 *
917 * The kick timer is responsible for moving single threaded vCPU
918 * emulation on to the next vCPU. If more than one vCPU is running a
919  * timer event will force a cpu->exit so the next vCPU can get
920 * scheduled.
921 *
922  * The timer is removed while all vCPUs are idle and restarted once
923  * any of them stops being idle.
924 */
925
926 static QEMUTimer *tcg_kick_vcpu_timer;
927 static CPUState *tcg_current_rr_cpu;
928
929 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
930
931 static inline int64_t qemu_tcg_next_kick(void)
932 {
933 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
934 }
935
936 /* Kick the currently round-robin scheduled vCPU */
937 static void qemu_cpu_kick_rr_cpu(void)
938 {
939 CPUState *cpu;
940 do {
941 cpu = atomic_mb_read(&tcg_current_rr_cpu);
942 if (cpu) {
943 cpu_exit(cpu);
944 }
945 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
946 }
947
948 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
949 {
950 }
951
952 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
953 {
954 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
955 qemu_notify_event();
956 return;
957 }
958
959 if (qemu_in_vcpu_thread()) {
960 /* A CPU is currently running; kick it back out to the
961 * tcg_cpu_exec() loop so it will recalculate its
962 * icount deadline immediately.
963 */
964 qemu_cpu_kick(current_cpu);
965 } else if (first_cpu) {
966 /* qemu_cpu_kick is not enough to kick a halted CPU out of
967 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
968 * causes cpu_thread_is_idle to return false. This way,
969 * handle_icount_deadline can run.
970 * If we have no CPUs at all for some reason, we don't
971 * need to do anything.
972 */
973 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
974 }
975 }
976
977 static void kick_tcg_thread(void *opaque)
978 {
979 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
980 qemu_cpu_kick_rr_cpu();
981 }
982
983 static void start_tcg_kick_timer(void)
984 {
985 assert(!mttcg_enabled);
986 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
987 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
988 kick_tcg_thread, NULL);
989 }
990 if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
991 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
992 }
993 }
994
995 static void stop_tcg_kick_timer(void)
996 {
997 assert(!mttcg_enabled);
998 if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
999 timer_del(tcg_kick_vcpu_timer);
1000 }
1001 }
1002
1003 /***********************************************************/
1004 void hw_error(const char *fmt, ...)
1005 {
1006 va_list ap;
1007 CPUState *cpu;
1008
1009 va_start(ap, fmt);
1010 fprintf(stderr, "qemu: hardware error: ");
1011 vfprintf(stderr, fmt, ap);
1012 fprintf(stderr, "\n");
1013 CPU_FOREACH(cpu) {
1014 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1015 cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1016 }
1017 va_end(ap);
1018 abort();
1019 }
1020
1021 void cpu_synchronize_all_states(void)
1022 {
1023 CPUState *cpu;
1024
1025 CPU_FOREACH(cpu) {
1026 cpu_synchronize_state(cpu);
1027 /* TODO: move to cpu_synchronize_state() */
1028 if (hvf_enabled()) {
1029 hvf_cpu_synchronize_state(cpu);
1030 }
1031 }
1032 }
1033
1034 void cpu_synchronize_all_post_reset(void)
1035 {
1036 CPUState *cpu;
1037
1038 CPU_FOREACH(cpu) {
1039 cpu_synchronize_post_reset(cpu);
1040 /* TODO: move to cpu_synchronize_post_reset() */
1041 if (hvf_enabled()) {
1042 hvf_cpu_synchronize_post_reset(cpu);
1043 }
1044 }
1045 }
1046
1047 void cpu_synchronize_all_post_init(void)
1048 {
1049 CPUState *cpu;
1050
1051 CPU_FOREACH(cpu) {
1052 cpu_synchronize_post_init(cpu);
1053 /* TODO: move to cpu_synchronize_post_init() */
1054 if (hvf_enabled()) {
1055 hvf_cpu_synchronize_post_init(cpu);
1056 }
1057 }
1058 }
1059
1060 void cpu_synchronize_all_pre_loadvm(void)
1061 {
1062 CPUState *cpu;
1063
1064 CPU_FOREACH(cpu) {
1065 cpu_synchronize_pre_loadvm(cpu);
1066 }
1067 }
1068
1069 static int do_vm_stop(RunState state, bool send_stop)
1070 {
1071 int ret = 0;
1072
1073 if (runstate_is_running()) {
1074 cpu_disable_ticks();
1075 pause_all_vcpus();
1076 runstate_set(state);
1077 vm_state_notify(0, state);
1078 if (send_stop) {
1079 qapi_event_send_stop();
1080 }
1081 }
1082
1083 bdrv_drain_all();
1084 replay_disable_events();
1085 ret = bdrv_flush_all();
1086
1087 return ret;
1088 }
1089
1090 /* Special vm_stop() variant for terminating the process. Historically clients
1091 * did not expect a QMP STOP event and so we need to retain compatibility.
1092 */
1093 int vm_shutdown(void)
1094 {
1095 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1096 }
1097
1098 static bool cpu_can_run(CPUState *cpu)
1099 {
1100 if (cpu->stop) {
1101 return false;
1102 }
1103 if (cpu_is_stopped(cpu)) {
1104 return false;
1105 }
1106 return true;
1107 }
1108
1109 static void cpu_handle_guest_debug(CPUState *cpu)
1110 {
1111 gdb_set_stop_cpu(cpu);
1112 qemu_system_debug_request();
1113 cpu->stopped = true;
1114 }
1115
1116 #ifdef CONFIG_LINUX
1117 static void sigbus_reraise(void)
1118 {
1119 sigset_t set;
1120 struct sigaction action;
1121
1122 memset(&action, 0, sizeof(action));
1123 action.sa_handler = SIG_DFL;
1124 if (!sigaction(SIGBUS, &action, NULL)) {
1125 raise(SIGBUS);
1126 sigemptyset(&set);
1127 sigaddset(&set, SIGBUS);
1128 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1129 }
1130 perror("Failed to re-raise SIGBUS!\n");
1131 abort();
1132 }
1133
1134 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1135 {
1136 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1137 sigbus_reraise();
1138 }
1139
1140 if (current_cpu) {
1141 /* Called asynchronously in VCPU thread. */
1142 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1143 sigbus_reraise();
1144 }
1145 } else {
1146 /* Called synchronously (via signalfd) in main thread. */
1147 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1148 sigbus_reraise();
1149 }
1150 }
1151 }
1152
1153 static void qemu_init_sigbus(void)
1154 {
1155 struct sigaction action;
1156
1157 memset(&action, 0, sizeof(action));
1158 action.sa_flags = SA_SIGINFO;
1159 action.sa_sigaction = sigbus_handler;
1160 sigaction(SIGBUS, &action, NULL);
1161
1162 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1163 }
1164 #else /* !CONFIG_LINUX */
1165 static void qemu_init_sigbus(void)
1166 {
1167 }
1168 #endif /* !CONFIG_LINUX */
1169
1170 static QemuMutex qemu_global_mutex;
1171
1172 static QemuThread io_thread;
1173
1174 /* cpu creation */
1175 static QemuCond qemu_cpu_cond;
1176 /* system init */
1177 static QemuCond qemu_pause_cond;
1178
1179 void qemu_init_cpu_loop(void)
1180 {
1181 qemu_init_sigbus();
1182 qemu_cond_init(&qemu_cpu_cond);
1183 qemu_cond_init(&qemu_pause_cond);
1184 qemu_mutex_init(&qemu_global_mutex);
1185
1186 qemu_thread_get_self(&io_thread);
1187 }
1188
1189 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1190 {
1191 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1192 }
1193
1194 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1195 {
1196 if (kvm_destroy_vcpu(cpu) < 0) {
1197 error_report("kvm_destroy_vcpu failed");
1198 exit(EXIT_FAILURE);
1199 }
1200 }
1201
1202 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1203 {
1204 }
1205
1206 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1207 {
1208 g_assert(qemu_cpu_is_self(cpu));
1209 cpu->stop = false;
1210 cpu->stopped = true;
1211 if (exit) {
1212 cpu_exit(cpu);
1213 }
1214 qemu_cond_broadcast(&qemu_pause_cond);
1215 }
1216
1217 static void qemu_wait_io_event_common(CPUState *cpu)
1218 {
1219 atomic_mb_set(&cpu->thread_kicked, false);
1220 if (cpu->stop) {
1221 qemu_cpu_stop(cpu, false);
1222 }
1223 process_queued_cpu_work(cpu);
1224 }
1225
1226 static void qemu_tcg_rr_wait_io_event(void)
1227 {
1228 CPUState *cpu;
1229
1230 while (all_cpu_threads_idle()) {
1231 stop_tcg_kick_timer();
1232 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1233 }
1234
1235 start_tcg_kick_timer();
1236
1237 CPU_FOREACH(cpu) {
1238 qemu_wait_io_event_common(cpu);
1239 }
1240 }
1241
1242 static void qemu_wait_io_event(CPUState *cpu)
1243 {
1244 while (cpu_thread_is_idle(cpu)) {
1245 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1246 }
1247
1248 #ifdef _WIN32
1249 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1250 if (!tcg_enabled()) {
1251 SleepEx(0, TRUE);
1252 }
1253 #endif
1254 qemu_wait_io_event_common(cpu);
1255 }
1256
1257 static void *qemu_kvm_cpu_thread_fn(void *arg)
1258 {
1259 CPUState *cpu = arg;
1260 int r;
1261
1262 rcu_register_thread();
1263
1264 qemu_mutex_lock_iothread();
1265 qemu_thread_get_self(cpu->thread);
1266 cpu->thread_id = qemu_get_thread_id();
1267 cpu->can_do_io = 1;
1268 current_cpu = cpu;
1269
1270 r = kvm_init_vcpu(cpu);
1271 if (r < 0) {
1272 error_report("kvm_init_vcpu failed: %s", strerror(-r));
1273 exit(1);
1274 }
1275
1276 kvm_init_cpu_signals(cpu);
1277
1278 /* signal CPU creation */
1279 cpu->created = true;
1280 qemu_cond_signal(&qemu_cpu_cond);
1281 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1282
1283 do {
1284 if (cpu_can_run(cpu)) {
1285 r = kvm_cpu_exec(cpu);
1286 if (r == EXCP_DEBUG) {
1287 cpu_handle_guest_debug(cpu);
1288 }
1289 }
1290 qemu_wait_io_event(cpu);
1291 } while (!cpu->unplug || cpu_can_run(cpu));
1292
1293 qemu_kvm_destroy_vcpu(cpu);
1294 cpu->created = false;
1295 qemu_cond_signal(&qemu_cpu_cond);
1296 qemu_mutex_unlock_iothread();
1297 rcu_unregister_thread();
1298 return NULL;
1299 }
1300
1301 static void *qemu_dummy_cpu_thread_fn(void *arg)
1302 {
1303 #ifdef _WIN32
1304 error_report("qtest is not supported under Windows");
1305 exit(1);
1306 #else
1307 CPUState *cpu = arg;
1308 sigset_t waitset;
1309 int r;
1310
1311 rcu_register_thread();
1312
1313 qemu_mutex_lock_iothread();
1314 qemu_thread_get_self(cpu->thread);
1315 cpu->thread_id = qemu_get_thread_id();
1316 cpu->can_do_io = 1;
1317 current_cpu = cpu;
1318
1319 sigemptyset(&waitset);
1320 sigaddset(&waitset, SIG_IPI);
1321
1322 /* signal CPU creation */
1323 cpu->created = true;
1324 qemu_cond_signal(&qemu_cpu_cond);
1325 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1326
1327 do {
1328 qemu_mutex_unlock_iothread();
1329 do {
1330 int sig;
1331 r = sigwait(&waitset, &sig);
1332 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1333 if (r == -1) {
1334 perror("sigwait");
1335 exit(1);
1336 }
1337 qemu_mutex_lock_iothread();
1338 qemu_wait_io_event(cpu);
1339 } while (!cpu->unplug);
1340
1341 qemu_mutex_unlock_iothread();
1342 rcu_unregister_thread();
1343 return NULL;
1344 #endif
1345 }
1346
1347 static int64_t tcg_get_icount_limit(void)
1348 {
1349 int64_t deadline;
1350
1351 if (replay_mode != REPLAY_MODE_PLAY) {
1352 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1353
1354 /* Maintain prior (possibly buggy) behaviour where if no deadline
1355 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1356 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1357 * nanoseconds.
1358 */
1359 if ((deadline < 0) || (deadline > INT32_MAX)) {
1360 deadline = INT32_MAX;
1361 }
1362
1363 return qemu_icount_round(deadline);
1364 } else {
1365 return replay_get_instructions();
1366 }
1367 }
1368
1369 static void handle_icount_deadline(void)
1370 {
1371 assert(qemu_in_vcpu_thread());
1372 if (use_icount) {
1373 int64_t deadline =
1374 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1375
1376 if (deadline == 0) {
1377 /* Wake up other AioContexts. */
1378 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1379 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1380 }
1381 }
1382 }
1383
1384 static void prepare_icount_for_run(CPUState *cpu)
1385 {
1386 if (use_icount) {
1387 int insns_left;
1388
1389 /* These should always be cleared by process_icount_data after
1390          * each vCPU execution. However, u16.high can be raised
1391          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt.
1392 */
1393 g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1394 g_assert(cpu->icount_extra == 0);
1395
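        /*
         * The budget is split between the 16-bit decrementer consumed by the
         * translated code (at most 0xffff instructions per slice) and
         * icount_extra for the remainder; e.g. a budget of 100000 becomes
         * u16.low = 65535 and icount_extra = 34465.
         */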
1396 cpu->icount_budget = tcg_get_icount_limit();
1397 insns_left = MIN(0xffff, cpu->icount_budget);
1398 cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1399 cpu->icount_extra = cpu->icount_budget - insns_left;
1400
1401 replay_mutex_lock();
1402 }
1403 }
1404
1405 static void process_icount_data(CPUState *cpu)
1406 {
1407 if (use_icount) {
1408 /* Account for executed instructions */
1409 cpu_update_icount(cpu);
1410
1411 /* Reset the counters */
1412 cpu_neg(cpu)->icount_decr.u16.low = 0;
1413 cpu->icount_extra = 0;
1414 cpu->icount_budget = 0;
1415
1416 replay_account_executed_instructions();
1417
1418 replay_mutex_unlock();
1419 }
1420 }
1421
1422
1423 static int tcg_cpu_exec(CPUState *cpu)
1424 {
1425 int ret;
1426 #ifdef CONFIG_PROFILER
1427 int64_t ti;
1428 #endif
1429
1430 assert(tcg_enabled());
1431 #ifdef CONFIG_PROFILER
1432 ti = profile_getclock();
1433 #endif
1434 cpu_exec_start(cpu);
1435 ret = cpu_exec(cpu);
1436 cpu_exec_end(cpu);
1437 #ifdef CONFIG_PROFILER
1438 atomic_set(&tcg_ctx->prof.cpu_exec_time,
1439 tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1440 #endif
1441 return ret;
1442 }
1443
1444 /* Destroy any remaining vCPUs which have been unplugged and have
1445 * finished running
1446 */
1447 static void deal_with_unplugged_cpus(void)
1448 {
1449 CPUState *cpu;
1450
1451 CPU_FOREACH(cpu) {
1452 if (cpu->unplug && !cpu_can_run(cpu)) {
1453 qemu_tcg_destroy_vcpu(cpu);
1454 cpu->created = false;
1455 qemu_cond_signal(&qemu_cpu_cond);
1456 break;
1457 }
1458 }
1459 }
1460
1461 /* Single-threaded TCG
1462 *
1463 * In the single-threaded case each vCPU is simulated in turn. If
1464 * there is more than a single vCPU we create a simple timer to kick
1465 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1466 * This is done explicitly rather than relying on side-effects
1467 * elsewhere.
1468 */
1469
1470 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1471 {
1472 CPUState *cpu = arg;
1473
1474 assert(tcg_enabled());
1475 rcu_register_thread();
1476 tcg_register_thread();
1477
1478 qemu_mutex_lock_iothread();
1479 qemu_thread_get_self(cpu->thread);
1480
1481 cpu->thread_id = qemu_get_thread_id();
1482 cpu->created = true;
1483 cpu->can_do_io = 1;
1484 qemu_cond_signal(&qemu_cpu_cond);
1485 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1486
1487 /* wait for initial kick-off after machine start */
1488 while (first_cpu->stopped) {
1489 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1490
1491 /* process any pending work */
1492 CPU_FOREACH(cpu) {
1493 current_cpu = cpu;
1494 qemu_wait_io_event_common(cpu);
1495 }
1496 }
1497
1498 start_tcg_kick_timer();
1499
1500 cpu = first_cpu;
1501
1502 /* process any pending work */
1503 cpu->exit_request = 1;
1504
1505 while (1) {
1506 qemu_mutex_unlock_iothread();
1507 replay_mutex_lock();
1508 qemu_mutex_lock_iothread();
1509 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1510 qemu_account_warp_timer();
1511
1512 /* Run the timers here. This is much more efficient than
1513 * waking up the I/O thread and waiting for completion.
1514 */
1515 handle_icount_deadline();
1516
1517 replay_mutex_unlock();
1518
1519 if (!cpu) {
1520 cpu = first_cpu;
1521 }
1522
1523 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1524
1525 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1526 current_cpu = cpu;
1527
1528 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1529 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1530
1531 if (cpu_can_run(cpu)) {
1532 int r;
1533
1534 qemu_mutex_unlock_iothread();
1535 prepare_icount_for_run(cpu);
1536
1537 r = tcg_cpu_exec(cpu);
1538
1539 process_icount_data(cpu);
1540 qemu_mutex_lock_iothread();
1541
1542 if (r == EXCP_DEBUG) {
1543 cpu_handle_guest_debug(cpu);
1544 break;
1545 } else if (r == EXCP_ATOMIC) {
1546 qemu_mutex_unlock_iothread();
1547 cpu_exec_step_atomic(cpu);
1548 qemu_mutex_lock_iothread();
1549 break;
1550 }
1551 } else if (cpu->stop) {
1552 if (cpu->unplug) {
1553 cpu = CPU_NEXT(cpu);
1554 }
1555 break;
1556 }
1557
1558 cpu = CPU_NEXT(cpu);
1559 } /* while (cpu && !cpu->exit_request).. */
1560
1561 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1562 atomic_set(&tcg_current_rr_cpu, NULL);
1563
1564 if (cpu && cpu->exit_request) {
1565 atomic_mb_set(&cpu->exit_request, 0);
1566 }
1567
1568 if (use_icount && all_cpu_threads_idle()) {
1569 /*
1570 * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1571 * in the main_loop, wake it up in order to start the warp timer.
1572 */
1573 qemu_notify_event();
1574 }
1575
1576 qemu_tcg_rr_wait_io_event();
1577 deal_with_unplugged_cpus();
1578 }
1579
1580 rcu_unregister_thread();
1581 return NULL;
1582 }
1583
1584 static void *qemu_hax_cpu_thread_fn(void *arg)
1585 {
1586 CPUState *cpu = arg;
1587 int r;
1588
1589 rcu_register_thread();
1590 qemu_mutex_lock_iothread();
1591 qemu_thread_get_self(cpu->thread);
1592
1593 cpu->thread_id = qemu_get_thread_id();
1594 cpu->created = true;
1595 cpu->halted = 0;
1596 current_cpu = cpu;
1597
1598 hax_init_vcpu(cpu);
1599 qemu_cond_signal(&qemu_cpu_cond);
1600 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1601
1602 do {
1603 if (cpu_can_run(cpu)) {
1604 r = hax_smp_cpu_exec(cpu);
1605 if (r == EXCP_DEBUG) {
1606 cpu_handle_guest_debug(cpu);
1607 }
1608 }
1609
1610 qemu_wait_io_event(cpu);
1611 } while (!cpu->unplug || cpu_can_run(cpu));
1612 rcu_unregister_thread();
1613 return NULL;
1614 }
1615
1616 /* The HVF-specific vCPU thread function. This one should only run when the host
1617 * CPU supports the VMX "unrestricted guest" feature. */
1618 static void *qemu_hvf_cpu_thread_fn(void *arg)
1619 {
1620 CPUState *cpu = arg;
1621
1622 int r;
1623
1624 assert(hvf_enabled());
1625
1626 rcu_register_thread();
1627
1628 qemu_mutex_lock_iothread();
1629 qemu_thread_get_self(cpu->thread);
1630
1631 cpu->thread_id = qemu_get_thread_id();
1632 cpu->can_do_io = 1;
1633 current_cpu = cpu;
1634
1635 hvf_init_vcpu(cpu);
1636
1637 /* signal CPU creation */
1638 cpu->created = true;
1639 qemu_cond_signal(&qemu_cpu_cond);
1640 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1641
1642 do {
1643 if (cpu_can_run(cpu)) {
1644 r = hvf_vcpu_exec(cpu);
1645 if (r == EXCP_DEBUG) {
1646 cpu_handle_guest_debug(cpu);
1647 }
1648 }
1649 qemu_wait_io_event(cpu);
1650 } while (!cpu->unplug || cpu_can_run(cpu));
1651
1652 hvf_vcpu_destroy(cpu);
1653 cpu->created = false;
1654 qemu_cond_signal(&qemu_cpu_cond);
1655 qemu_mutex_unlock_iothread();
1656 rcu_unregister_thread();
1657 return NULL;
1658 }
1659
1660 static void *qemu_whpx_cpu_thread_fn(void *arg)
1661 {
1662 CPUState *cpu = arg;
1663 int r;
1664
1665 rcu_register_thread();
1666
1667 qemu_mutex_lock_iothread();
1668 qemu_thread_get_self(cpu->thread);
1669 cpu->thread_id = qemu_get_thread_id();
1670 current_cpu = cpu;
1671
1672 r = whpx_init_vcpu(cpu);
1673 if (r < 0) {
1674 fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1675 exit(1);
1676 }
1677
1678 /* signal CPU creation */
1679 cpu->created = true;
1680 qemu_cond_signal(&qemu_cpu_cond);
1681 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1682
1683 do {
1684 if (cpu_can_run(cpu)) {
1685 r = whpx_vcpu_exec(cpu);
1686 if (r == EXCP_DEBUG) {
1687 cpu_handle_guest_debug(cpu);
1688 }
1689 }
1690 while (cpu_thread_is_idle(cpu)) {
1691 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1692 }
1693 qemu_wait_io_event_common(cpu);
1694 } while (!cpu->unplug || cpu_can_run(cpu));
1695
1696 whpx_destroy_vcpu(cpu);
1697 cpu->created = false;
1698 qemu_cond_signal(&qemu_cpu_cond);
1699 qemu_mutex_unlock_iothread();
1700 rcu_unregister_thread();
1701 return NULL;
1702 }
1703
1704 #ifdef _WIN32
1705 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1706 {
1707 }
1708 #endif
1709
1710 /* Multi-threaded TCG
1711 *
1712 * In the multi-threaded case each vCPU has its own thread. The TLS
1713 * variable current_cpu can be used deep in the code to find the
1714 * current CPUState for a given thread.
1715 */
1716
1717 static void *qemu_tcg_cpu_thread_fn(void *arg)
1718 {
1719 CPUState *cpu = arg;
1720
1721 assert(tcg_enabled());
1722 g_assert(!use_icount);
1723
1724 rcu_register_thread();
1725 tcg_register_thread();
1726
1727 qemu_mutex_lock_iothread();
1728 qemu_thread_get_self(cpu->thread);
1729
1730 cpu->thread_id = qemu_get_thread_id();
1731 cpu->created = true;
1732 cpu->can_do_io = 1;
1733 current_cpu = cpu;
1734 qemu_cond_signal(&qemu_cpu_cond);
1735 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1736
1737 /* process any pending work */
1738 cpu->exit_request = 1;
1739
1740 do {
1741 if (cpu_can_run(cpu)) {
1742 int r;
1743 qemu_mutex_unlock_iothread();
1744 r = tcg_cpu_exec(cpu);
1745 qemu_mutex_lock_iothread();
1746 switch (r) {
1747 case EXCP_DEBUG:
1748 cpu_handle_guest_debug(cpu);
1749 break;
1750 case EXCP_HALTED:
1751 /* during start-up the vCPU is reset and the thread is
1752 * kicked several times. If we don't ensure we go back
1753 * to sleep in the halted state we won't cleanly
1754                  * start up when the vCPU is enabled.
1755 *
1756 * cpu->halted should ensure we sleep in wait_io_event
1757 */
1758 g_assert(cpu->halted);
1759 break;
1760 case EXCP_ATOMIC:
1761 qemu_mutex_unlock_iothread();
1762 cpu_exec_step_atomic(cpu);
1763 qemu_mutex_lock_iothread();
1764 default:
1765 /* Ignore everything else? */
1766 break;
1767 }
1768 }
1769
1770 atomic_mb_set(&cpu->exit_request, 0);
1771 qemu_wait_io_event(cpu);
1772 } while (!cpu->unplug || cpu_can_run(cpu));
1773
1774 qemu_tcg_destroy_vcpu(cpu);
1775 cpu->created = false;
1776 qemu_cond_signal(&qemu_cpu_cond);
1777 qemu_mutex_unlock_iothread();
1778 rcu_unregister_thread();
1779 return NULL;
1780 }
1781
1782 static void qemu_cpu_kick_thread(CPUState *cpu)
1783 {
1784 #ifndef _WIN32
1785 int err;
1786
1787 if (cpu->thread_kicked) {
1788 return;
1789 }
1790 cpu->thread_kicked = true;
1791 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1792 if (err && err != ESRCH) {
1793 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1794 exit(1);
1795 }
1796 #else /* _WIN32 */
1797 if (!qemu_cpu_is_self(cpu)) {
1798 if (whpx_enabled()) {
1799 whpx_vcpu_kick(cpu);
1800 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1801 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1802 __func__, GetLastError());
1803 exit(1);
1804 }
1805 }
1806 #endif
1807 }
1808
1809 void qemu_cpu_kick(CPUState *cpu)
1810 {
1811 qemu_cond_broadcast(cpu->halt_cond);
1812 if (tcg_enabled()) {
1813 cpu_exit(cpu);
1814 /* NOP unless doing single-thread RR */
1815 qemu_cpu_kick_rr_cpu();
1816 } else {
1817 if (hax_enabled()) {
1818 /*
1819 * FIXME: race condition with the exit_request check in
1820 * hax_vcpu_hax_exec
1821 */
1822 cpu->exit_request = 1;
1823 }
1824 qemu_cpu_kick_thread(cpu);
1825 }
1826 }
1827
1828 void qemu_cpu_kick_self(void)
1829 {
1830 assert(current_cpu);
1831 qemu_cpu_kick_thread(current_cpu);
1832 }
1833
1834 bool qemu_cpu_is_self(CPUState *cpu)
1835 {
1836 return qemu_thread_is_self(cpu->thread);
1837 }
1838
1839 bool qemu_in_vcpu_thread(void)
1840 {
1841 return current_cpu && qemu_cpu_is_self(current_cpu);
1842 }
1843
1844 static __thread bool iothread_locked = false;
1845
1846 bool qemu_mutex_iothread_locked(void)
1847 {
1848 return iothread_locked;
1849 }
1850
1851 /*
1852 * The BQL is taken from so many places that it is worth profiling the
1853 * callers directly, instead of funneling them all through a single function.
1854 */
1855 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1856 {
1857 QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1858
1859 g_assert(!qemu_mutex_iothread_locked());
1860 bql_lock(&qemu_global_mutex, file, line);
1861 iothread_locked = true;
1862 }
1863
1864 void qemu_mutex_unlock_iothread(void)
1865 {
1866 g_assert(qemu_mutex_iothread_locked());
1867 iothread_locked = false;
1868 qemu_mutex_unlock(&qemu_global_mutex);
1869 }
1870
1871 static bool all_vcpus_paused(void)
1872 {
1873 CPUState *cpu;
1874
1875 CPU_FOREACH(cpu) {
1876 if (!cpu->stopped) {
1877 return false;
1878 }
1879 }
1880
1881 return true;
1882 }
1883
1884 void pause_all_vcpus(void)
1885 {
1886 CPUState *cpu;
1887
1888 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1889 CPU_FOREACH(cpu) {
1890 if (qemu_cpu_is_self(cpu)) {
1891 qemu_cpu_stop(cpu, true);
1892 } else {
1893 cpu->stop = true;
1894 qemu_cpu_kick(cpu);
1895 }
1896 }
1897
1898 /* We need to drop the replay_lock so any vCPU threads woken up
1899 * can finish their replay tasks
1900 */
1901 replay_mutex_unlock();
1902
1903 while (!all_vcpus_paused()) {
1904 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1905 CPU_FOREACH(cpu) {
1906 qemu_cpu_kick(cpu);
1907 }
1908 }
1909
1910 qemu_mutex_unlock_iothread();
1911 replay_mutex_lock();
1912 qemu_mutex_lock_iothread();
1913 }
1914
1915 void cpu_resume(CPUState *cpu)
1916 {
1917 cpu->stop = false;
1918 cpu->stopped = false;
1919 qemu_cpu_kick(cpu);
1920 }
1921
1922 void resume_all_vcpus(void)
1923 {
1924 CPUState *cpu;
1925
1926 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1927 CPU_FOREACH(cpu) {
1928 cpu_resume(cpu);
1929 }
1930 }
1931
1932 void cpu_remove_sync(CPUState *cpu)
1933 {
1934 cpu->stop = true;
1935 cpu->unplug = true;
1936 qemu_cpu_kick(cpu);
1937 qemu_mutex_unlock_iothread();
1938 qemu_thread_join(cpu->thread);
1939 qemu_mutex_lock_iothread();
1940 }
1941
1942 /* Size of the temporary buffers used to form a vCPU thread name */
1943 #define VCPU_THREAD_NAME_SIZE 16
1944
1945 static void qemu_tcg_init_vcpu(CPUState *cpu)
1946 {
1947 char thread_name[VCPU_THREAD_NAME_SIZE];
1948 static QemuCond *single_tcg_halt_cond;
1949 static QemuThread *single_tcg_cpu_thread;
1950 static int tcg_region_inited;
1951
1952 assert(tcg_enabled());
1953 /*
1954 * Initialize TCG regions--once. Now is a good time, because:
1955 * (1) TCG's init context, prologue and target globals have been set up.
1956 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1957 * -accel flag is processed, so the check doesn't work then).
1958 */
1959 if (!tcg_region_inited) {
1960 tcg_region_inited = 1;
1961 tcg_region_init();
1962 }
1963
1964 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1965 cpu->thread = g_malloc0(sizeof(QemuThread));
1966 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1967 qemu_cond_init(cpu->halt_cond);
1968
1969 if (qemu_tcg_mttcg_enabled()) {
1970 /* create a thread per vCPU with TCG (MTTCG) */
1971 parallel_cpus = true;
1972 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1973 cpu->cpu_index);
1974
1975 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1976 cpu, QEMU_THREAD_JOINABLE);
1977
1978 } else {
1979 /* share a single thread for all cpus with TCG */
1980 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1981 qemu_thread_create(cpu->thread, thread_name,
1982 qemu_tcg_rr_cpu_thread_fn,
1983 cpu, QEMU_THREAD_JOINABLE);
1984
1985 single_tcg_halt_cond = cpu->halt_cond;
1986 single_tcg_cpu_thread = cpu->thread;
1987 }
1988 #ifdef _WIN32
1989 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1990 #endif
1991 } else {
1992 /* For non-MTTCG cases we share the thread */
1993 cpu->thread = single_tcg_cpu_thread;
1994 cpu->halt_cond = single_tcg_halt_cond;
1995 cpu->thread_id = first_cpu->thread_id;
1996 cpu->can_do_io = 1;
1997 cpu->created = true;
1998 }
1999 }
2000
2001 static void qemu_hax_start_vcpu(CPUState *cpu)
2002 {
2003 char thread_name[VCPU_THREAD_NAME_SIZE];
2004
2005 cpu->thread = g_malloc0(sizeof(QemuThread));
2006 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2007 qemu_cond_init(cpu->halt_cond);
2008
2009 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2010 cpu->cpu_index);
2011 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2012 cpu, QEMU_THREAD_JOINABLE);
2013 #ifdef _WIN32
2014 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2015 #endif
2016 }
2017
2018 static void qemu_kvm_start_vcpu(CPUState *cpu)
2019 {
2020 char thread_name[VCPU_THREAD_NAME_SIZE];
2021
2022 cpu->thread = g_malloc0(sizeof(QemuThread));
2023 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2024 qemu_cond_init(cpu->halt_cond);
2025 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2026 cpu->cpu_index);
2027 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2028 cpu, QEMU_THREAD_JOINABLE);
2029 }
2030
2031 static void qemu_hvf_start_vcpu(CPUState *cpu)
2032 {
2033 char thread_name[VCPU_THREAD_NAME_SIZE];
2034
2035 /* HVF currently does not support TCG, and only runs in
2036 * unrestricted-guest mode. */
2037 assert(hvf_enabled());
2038
2039 cpu->thread = g_malloc0(sizeof(QemuThread));
2040 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2041 qemu_cond_init(cpu->halt_cond);
2042
2043 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2044 cpu->cpu_index);
2045 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2046 cpu, QEMU_THREAD_JOINABLE);
2047 }
2048
2049 static void qemu_whpx_start_vcpu(CPUState *cpu)
2050 {
2051 char thread_name[VCPU_THREAD_NAME_SIZE];
2052
2053 cpu->thread = g_malloc0(sizeof(QemuThread));
2054 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2055 qemu_cond_init(cpu->halt_cond);
2056 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2057 cpu->cpu_index);
2058 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2059 cpu, QEMU_THREAD_JOINABLE);
2060 #ifdef _WIN32
2061 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2062 #endif
2063 }
2064
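/*
 * Start a placeholder vCPU thread for configurations where no
 * accelerator executes guest code (e.g. qtest); it only waits for and
 * processes queued CPU work.
 */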
2065 static void qemu_dummy_start_vcpu(CPUState *cpu)
2066 {
2067 char thread_name[VCPU_THREAD_NAME_SIZE];
2068
2069 cpu->thread = g_malloc0(sizeof(QemuThread));
2070 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2071 qemu_cond_init(cpu->halt_cond);
2072 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2073 cpu->cpu_index);
2074 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2075 QEMU_THREAD_JOINABLE);
2076 }
2077
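/*
 * Common vCPU bring-up: record the SMP topology on the CPU, mark it
 * stopped, give it a default address space if the target did not set
 * one up, start the accelerator-specific vCPU thread, and wait until
 * that thread reports the vCPU as created.
 */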
2078 void qemu_init_vcpu(CPUState *cpu)
2079 {
2080 cpu->nr_cores = smp_cores;
2081 cpu->nr_threads = smp_threads;
2082 cpu->stopped = true;
2083 cpu->random_seed = qemu_guest_random_seed_thread_part1();
2084
2085 if (!cpu->as) {
2086 /* If the target cpu hasn't set up any address spaces itself,
2087 * give it the default one.
2088 */
2089 cpu->num_ases = 1;
2090 cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2091 }
2092
2093 if (kvm_enabled()) {
2094 qemu_kvm_start_vcpu(cpu);
2095 } else if (hax_enabled()) {
2096 qemu_hax_start_vcpu(cpu);
2097 } else if (hvf_enabled()) {
2098 qemu_hvf_start_vcpu(cpu);
2099 } else if (tcg_enabled()) {
2100 qemu_tcg_init_vcpu(cpu);
2101 } else if (whpx_enabled()) {
2102 qemu_whpx_start_vcpu(cpu);
2103 } else {
2104 qemu_dummy_start_vcpu(cpu);
2105 }
2106
2107 while (!cpu->created) {
2108 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2109 }
2110 }
2111
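/* Request that the vCPU running on this thread (if any) stop, and kick
 * it out of its execution loop. */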
2112 void cpu_stop_current(void)
2113 {
2114 if (current_cpu) {
2115 current_cpu->stop = true;
2116 cpu_exit(current_cpu);
2117 }
2118 }
2119
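/*
 * Stop the VM.  From a vCPU thread the stop is deferred: a vmstop
 * request is queued for the main loop and only the current CPU is
 * stopped here; from other threads the VM is stopped synchronously
 * via do_vm_stop().
 */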
2120 int vm_stop(RunState state)
2121 {
2122 if (qemu_in_vcpu_thread()) {
2123 qemu_system_vmstop_request_prepare();
2124 qemu_system_vmstop_request(state);
2125 /*
2126 * FIXME: should not return to device code in case
2127 * vm_stop() has been requested.
2128 */
2129 cpu_stop_current();
2130 return 0;
2131 }
2132
2133 return do_vm_stop(state, true);
2134 }
2135
2136 /**
2137 * Prepare for (re)starting the VM.
2138 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2139 * running or in case of an error condition), 0 otherwise.
2140 */
2141 int vm_prepare_start(void)
2142 {
2143 RunState requested;
2144
2145 qemu_vmstop_requested(&requested);
2146 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2147 return -1;
2148 }
2149
2150 /* Ensure that a STOP/RESUME pair of events is emitted if a
2151 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2152 * example, is documented to always be followed by the STOP
2153 * event.
2154 */
2155 if (runstate_is_running()) {
2156 qapi_event_send_stop();
2157 qapi_event_send_resume();
2158 return -1;
2159 }
2160
2161 /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2162 qapi_event_send_resume();
2163
2164 replay_enable_events();
2165 cpu_enable_ticks();
2166 runstate_set(RUN_STATE_RUNNING);
2167 vm_state_notify(1, RUN_STATE_RUNNING);
2168 return 0;
2169 }
2170
2171 void vm_start(void)
2172 {
2173 if (!vm_prepare_start()) {
2174 resume_all_vcpus();
2175 }
2176 }
2177
2178 /* Does a state transition even if the VM is already stopped;
2179 the current state is forgotten forever. */
2180 int vm_stop_force_state(RunState state)
2181 {
2182 if (runstate_is_running()) {
2183 return vm_stop(state);
2184 } else {
2185 runstate_set(state);
2186
2187 bdrv_drain_all();
2188 /* Make sure to return an error if the flush in a previous vm_stop()
2189 * failed. */
2190 return bdrv_flush_all();
2191 }
2192 }
2193
2194 void list_cpus(const char *optarg)
2195 {
2196 /* XXX: implement xxx_cpu_list for targets that still lack it */
2197 #if defined(cpu_list)
2198 cpu_list();
2199 #endif
2200 }
2201
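/*
 * QMP 'query-cpus'.  Unlike qmp_query_cpus_fast() below, this
 * synchronizes each vCPU's register state first, which may interrupt
 * running vCPU threads.
 */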
2202 CpuInfoList *qmp_query_cpus(Error **errp)
2203 {
2204 MachineState *ms = MACHINE(qdev_get_machine());
2205 MachineClass *mc = MACHINE_GET_CLASS(ms);
2206 CpuInfoList *head = NULL, *cur_item = NULL;
2207 CPUState *cpu;
2208
2209 CPU_FOREACH(cpu) {
2210 CpuInfoList *info;
2211 #if defined(TARGET_I386)
2212 X86CPU *x86_cpu = X86_CPU(cpu);
2213 CPUX86State *env = &x86_cpu->env;
2214 #elif defined(TARGET_PPC)
2215 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2216 CPUPPCState *env = &ppc_cpu->env;
2217 #elif defined(TARGET_SPARC)
2218 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2219 CPUSPARCState *env = &sparc_cpu->env;
2220 #elif defined(TARGET_RISCV)
2221 RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2222 CPURISCVState *env = &riscv_cpu->env;
2223 #elif defined(TARGET_MIPS)
2224 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2225 CPUMIPSState *env = &mips_cpu->env;
2226 #elif defined(TARGET_TRICORE)
2227 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2228 CPUTriCoreState *env = &tricore_cpu->env;
2229 #elif defined(TARGET_S390X)
2230 S390CPU *s390_cpu = S390_CPU(cpu);
2231 CPUS390XState *env = &s390_cpu->env;
2232 #endif
2233
2234 cpu_synchronize_state(cpu);
2235
2236 info = g_malloc0(sizeof(*info));
2237 info->value = g_malloc0(sizeof(*info->value));
2238 info->value->CPU = cpu->cpu_index;
2239 info->value->current = (cpu == first_cpu);
2240 info->value->halted = cpu->halted;
2241 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2242 info->value->thread_id = cpu->thread_id;
2243 #if defined(TARGET_I386)
2244 info->value->arch = CPU_INFO_ARCH_X86;
2245 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2246 #elif defined(TARGET_PPC)
2247 info->value->arch = CPU_INFO_ARCH_PPC;
2248 info->value->u.ppc.nip = env->nip;
2249 #elif defined(TARGET_SPARC)
2250 info->value->arch = CPU_INFO_ARCH_SPARC;
2251 info->value->u.q_sparc.pc = env->pc;
2252 info->value->u.q_sparc.npc = env->npc;
2253 #elif defined(TARGET_MIPS)
2254 info->value->arch = CPU_INFO_ARCH_MIPS;
2255 info->value->u.q_mips.PC = env->active_tc.PC;
2256 #elif defined(TARGET_TRICORE)
2257 info->value->arch = CPU_INFO_ARCH_TRICORE;
2258 info->value->u.tricore.PC = env->PC;
2259 #elif defined(TARGET_S390X)
2260 info->value->arch = CPU_INFO_ARCH_S390;
2261 info->value->u.s390.cpu_state = env->cpu_state;
2262 #elif defined(TARGET_RISCV)
2263 info->value->arch = CPU_INFO_ARCH_RISCV;
2264 info->value->u.riscv.pc = env->pc;
2265 #else
2266 info->value->arch = CPU_INFO_ARCH_OTHER;
2267 #endif
2268 info->value->has_props = !!mc->cpu_index_to_instance_props;
2269 if (info->value->has_props) {
2270 CpuInstanceProperties *props;
2271 props = g_malloc0(sizeof(*props));
2272 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2273 info->value->props = props;
2274 }
2275
2276 /* XXX: waiting for the qapi to support GSList */
2277 if (!cur_item) {
2278 head = cur_item = info;
2279 } else {
2280 cur_item->next = info;
2281 cur_item = info;
2282 }
2283 }
2284
2285 return head;
2286 }
2287
2288 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2289 {
2290 /*
2291 * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2292 * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2293 */
2294 switch (target) {
2295 case SYS_EMU_TARGET_I386:
2296 case SYS_EMU_TARGET_X86_64:
2297 return CPU_INFO_ARCH_X86;
2298
2299 case SYS_EMU_TARGET_PPC:
2300 case SYS_EMU_TARGET_PPC64:
2301 return CPU_INFO_ARCH_PPC;
2302
2303 case SYS_EMU_TARGET_SPARC:
2304 case SYS_EMU_TARGET_SPARC64:
2305 return CPU_INFO_ARCH_SPARC;
2306
2307 case SYS_EMU_TARGET_MIPS:
2308 case SYS_EMU_TARGET_MIPSEL:
2309 case SYS_EMU_TARGET_MIPS64:
2310 case SYS_EMU_TARGET_MIPS64EL:
2311 return CPU_INFO_ARCH_MIPS;
2312
2313 case SYS_EMU_TARGET_TRICORE:
2314 return CPU_INFO_ARCH_TRICORE;
2315
2316 case SYS_EMU_TARGET_S390X:
2317 return CPU_INFO_ARCH_S390;
2318
2319 case SYS_EMU_TARGET_RISCV32:
2320 case SYS_EMU_TARGET_RISCV64:
2321 return CPU_INFO_ARCH_RISCV;
2322
2323 default:
2324 return CPU_INFO_ARCH_OTHER;
2325 }
2326 }
2327
2328 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2329 {
2330 #ifdef TARGET_S390X
2331 S390CPU *s390_cpu = S390_CPU(cpu);
2332 CPUS390XState *env = &s390_cpu->env;
2333
2334 info->cpu_state = env->cpu_state;
2335 #else
2336 abort();
2337 #endif
2338 }
2339
2340 /*
2341 * fast means: we NEVER interrupt vCPU threads to retrieve
2342 * information from KVM.
2343 */
2344 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2345 {
2346 MachineState *ms = MACHINE(qdev_get_machine());
2347 MachineClass *mc = MACHINE_GET_CLASS(ms);
2348 CpuInfoFastList *head = NULL, *cur_item = NULL;
2349 SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2350 -1, &error_abort);
2351 CPUState *cpu;
2352
2353 CPU_FOREACH(cpu) {
2354 CpuInfoFastList *info = g_malloc0(sizeof(*info));
2355 info->value = g_malloc0(sizeof(*info->value));
2356
2357 info->value->cpu_index = cpu->cpu_index;
2358 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2359 info->value->thread_id = cpu->thread_id;
2360
2361 info->value->has_props = !!mc->cpu_index_to_instance_props;
2362 if (info->value->has_props) {
2363 CpuInstanceProperties *props;
2364 props = g_malloc0(sizeof(*props));
2365 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2366 info->value->props = props;
2367 }
2368
2369 info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2370 info->value->target = target;
2371 if (target == SYS_EMU_TARGET_S390X) {
2372 cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2373 }
2374
2375 if (!cur_item) {
2376 head = cur_item = info;
2377 } else {
2378 cur_item->next = info;
2379 cur_item = info;
2380 }
2381 }
2382
2383 return head;
2384 }
2385
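/*
 * QMP 'memsave': write @size bytes of guest virtual memory starting at
 * @addr, as translated by the selected CPU (default: CPU 0), to
 * @filename.
 */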
2386 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2387 bool has_cpu, int64_t cpu_index, Error **errp)
2388 {
2389 FILE *f;
2390 uint32_t l;
2391 CPUState *cpu;
2392 uint8_t buf[1024];
2393 int64_t orig_addr = addr, orig_size = size;
2394
2395 if (!has_cpu) {
2396 cpu_index = 0;
2397 }
2398
2399 cpu = qemu_get_cpu(cpu_index);
2400 if (cpu == NULL) {
2401 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2402 "a CPU number");
2403 return;
2404 }
2405
2406 f = fopen(filename, "wb");
2407 if (!f) {
2408 error_setg_file_open(errp, errno, filename);
2409 return;
2410 }
2411
2412 while (size != 0) {
2413 l = sizeof(buf);
2414 if (l > size) {
2415 l = size;
}
2416 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2417 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2418 " specified", orig_addr, orig_size);
2419 goto exit;
2420 }
2421 if (fwrite(buf, 1, l, f) != l) {
2422 error_setg(errp, QERR_IO_ERROR);
2423 goto exit;
2424 }
2425 addr += l;
2426 size -= l;
2427 }
2428
2429 exit:
2430 fclose(f);
2431 }
2432
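/*
 * QMP 'pmemsave': write @size bytes of guest physical memory starting
 * at @addr to @filename.
 */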
2433 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2434 Error **errp)
2435 {
2436 FILE *f;
2437 uint32_t l;
2438 uint8_t buf[1024];
2439
2440 f = fopen(filename, "wb");
2441 if (!f) {
2442 error_setg_file_open(errp, errno, filename);
2443 return;
2444 }
2445
2446 while (size != 0) {
2447 l = sizeof(buf);
2448 if (l > size) {
2449 l = size;
}
2450 cpu_physical_memory_read(addr, buf, l);
2451 if (fwrite(buf, 1, l, f) != l) {
2452 error_setg(errp, QERR_IO_ERROR);
2453 goto exit;
2454 }
2455 addr += l;
2456 size -= l;
2457 }
2458
2459 exit:
2460 fclose(f);
2461 }
2462
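/* QMP 'inject-nmi': forward an NMI request to the machine's NMI
 * handler, passing the monitor's currently selected CPU index. */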
2463 void qmp_inject_nmi(Error **errp)
2464 {
2465 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2466 }
2467
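/*
 * Print icount drift statistics: the host vs. guest clock difference,
 * plus the maximum observed guest delay/advance when the -icount align
 * option is enabled.  Does nothing unless icount mode is in use.
 */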
2468 void dump_drift_info(void)
2469 {
2470 if (!use_icount) {
2471 return;
2472 }
2473
2474 qemu_printf("Host - Guest clock %"PRIi64" ms\n",
2475 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2476 if (icount_align_option) {
2477 qemu_printf("Max guest delay %"PRIi64" ms\n",
2478 -max_delay / SCALE_MS);
2479 qemu_printf("Max guest advance %"PRIi64" ms\n",
2480 max_advance / SCALE_MS);
2481 } else {
2482 qemu_printf("Max guest delay NA\n");
2483 qemu_printf("Max guest advance NA\n");
2484 }
2485 }