/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qemu/config-file.h"
#include "qemu/cutils.h"
#include "migration/vmstate.h"
#include "monitor/monitor.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-misc.h"
#include "qapi/qapi-events-run-state.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "qemu/qemu-print.h"
#include "sysemu/tcg.h"
#include "sysemu/block-backend.h"
#include "exec/gdbstub.h"
#include "sysemu/dma.h"
#include "sysemu/hw_accel.h"
#include "sysemu/kvm.h"
#include "sysemu/hax.h"
#include "sysemu/hvf.h"
#include "sysemu/whpx.h"
#include "exec/exec-all.h"

#include "qemu/thread.h"
#include "qemu/plugin.h"
#include "sysemu/cpus.h"
#include "sysemu/qtest.h"
#include "qemu/main-loop.h"
#include "qemu/option.h"
#include "qemu/bitmap.h"
#include "qemu/seqlock.h"
#include "qemu/guest-random.h"
#include "tcg/tcg.h"
#include "hw/nmi.h"
#include "sysemu/replay.h"
#include "sysemu/runstate.h"
#include "hw/boards.h"
#include "hw/hw.h"

#include "sysemu/cpu-throttle.h"

#ifdef CONFIG_LINUX

#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */

static QemuMutex qemu_global_mutex;

int64_t max_delay;
int64_t max_advance;

bool cpu_is_stopped(CPUState *cpu)
{
    return cpu->stopped || !runstate_is_running();
}

static inline bool cpu_work_list_empty(CPUState *cpu)
{
    bool ret;

    qemu_mutex_lock(&cpu->work_mutex);
    ret = QSIMPLEQ_EMPTY(&cpu->work_list);
    qemu_mutex_unlock(&cpu->work_mutex);
    return ret;
}

static bool cpu_thread_is_idle(CPUState *cpu)
{
    if (cpu->stop || !cpu_work_list_empty(cpu)) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return true;
    }
    if (!cpu->halted || cpu_has_work(cpu) ||
        kvm_halt_in_kernel()) {
        return false;
    }
    return true;
}

static bool all_cpu_threads_idle(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu_thread_is_idle(cpu)) {
            return false;
        }
    }
    return true;
}

/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

static bool icount_sleep = true;
/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
#define MAX_ICOUNT_SHIFT 10

typedef struct TimersState {
    /* Protected by BQL. */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks. */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed. */
    int64_t qemu_icount_bias;

    int64_t vm_clock_warp_start;
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    QEMUTimer *icount_warp_timer;
} TimersState;
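
/*
 * Locking discipline for TimersState, as used throughout this file:
 * writers take the seqlock together with vm_clock_lock via
 * seqlock_write_lock(), while lock-free readers sample the seqlock with
 * seqlock_read_begin() and retry via seqlock_read_retry().  Fields
 * documented as "Protected by BQL" are only touched with the BQL held.
 */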

static TimersState timers_state;
bool mttcg_enabled;


/* The current number of executed instructions is based on what we
 * originally budgeted minus the current state of the decrementing
 * icount counters in extra/u16.low.
 */
static int64_t cpu_get_icount_executed(CPUState *cpu)
{
    return (cpu->icount_budget -
            (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
}

/*
 * Update the global shared timer_state.qemu_icount to take into
 * account executed instructions. This is done by the TCG vCPU
 * thread so the main-loop can see time has moved forward.
 */
static void cpu_update_icount_locked(CPUState *cpu)
{
    int64_t executed = cpu_get_icount_executed(cpu);
    cpu->icount_budget -= executed;

    qatomic_set_i64(&timers_state.qemu_icount,
                    timers_state.qemu_icount + executed);
}

/*
 * Update the global shared timer_state.qemu_icount to take into
 * account executed instructions. This is done by the TCG vCPU
 * thread so the main-loop can see time has moved forward.
 */
void cpu_update_icount(CPUState *cpu)
{
    seqlock_write_lock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
    cpu_update_icount_locked(cpu);
    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                         &timers_state.vm_clock_lock);
}

static int64_t cpu_get_icount_raw_locked(void)
{
    CPUState *cpu = current_cpu;

    if (cpu && cpu->running) {
        if (!cpu->can_do_io) {
            error_report("Bad icount read");
            exit(1);
        }
        /* Take into account what has run */
        cpu_update_icount_locked(cpu);
    }
    /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
    return qatomic_read_i64(&timers_state.qemu_icount);
}

static int64_t cpu_get_icount_locked(void)
{
    int64_t icount = cpu_get_icount_raw_locked();
    return qatomic_read_i64(&timers_state.qemu_icount_bias) +
        cpu_icount_to_ns(icount);
}

int64_t cpu_get_icount_raw(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_raw_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}
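
/*
 * Reader pattern used above and below: sample the seqlock, read the
 * protected fields, and loop again if a writer raced with us.  Writers
 * (icount adjustment and clock warps) are comparatively rare, so in
 * practice readers rarely retry.
 */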

/* Return the virtual CPU time, based on the instruction counter.  */
int64_t cpu_get_icount(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}

int64_t cpu_icount_to_ns(int64_t icount)
{
    return icount << qatomic_read(&timers_state.icount_time_shift);
}
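
/*
 * The shift above is the number of virtual nanoseconds charged per guest
 * instruction: 2^icount_time_shift ns each.  For example, the default
 * adaptive shift of 3 charges 8 ns per instruction, i.e. the 125MIPS
 * initial guess mentioned in configure_icount() below.
 */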

static int64_t cpu_get_ticks_locked(void)
{
    int64_t ticks = timers_state.cpu_ticks_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += cpu_get_host_ticks();
    }

    if (timers_state.cpu_ticks_prev > ticks) {
        /* Non-increasing ticks may happen if the host uses software suspend. */
        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        ticks = timers_state.cpu_ticks_prev;
    }

    timers_state.cpu_ticks_prev = ticks;
    return ticks;
}

/* return the time elapsed in VM between vm_start and vm_stop.  Unless
 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 * counter.
 */
int64_t cpu_get_ticks(void)
{
    int64_t ticks;

    if (use_icount) {
        return cpu_get_icount();
    }

    qemu_spin_lock(&timers_state.vm_clock_lock);
    ticks = cpu_get_ticks_locked();
    qemu_spin_unlock(&timers_state.vm_clock_lock);
    return ticks;
}

static int64_t cpu_get_clock_locked(void)
{
    int64_t time;

    time = timers_state.cpu_clock_offset;
    if (timers_state.cpu_ticks_enabled) {
        time += get_clock();
    }

    return time;
}

/* Return the monotonic time elapsed in VM, i.e.,
 * the time between vm_start and vm_stop
 */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = cpu_get_clock_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return ti;
}

/* enable cpu_get_ticks()
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    seqlock_write_lock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                         &timers_state.vm_clock_lock);
}

/* disable cpu_get_ticks(): the clock is stopped.  You must not call
 * cpu_get_ticks() after that.
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    seqlock_write_lock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                         &timers_state.vm_clock_lock);
}

/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.  */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)

static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex.  */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_lock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
    cur_time = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
                                   cpu_get_clock_locked());
    cur_icount = cpu_get_icount_locked();

    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && timers_state.icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        qatomic_set(&timers_state.icount_time_shift,
                    timers_state.icount_time_shift - 1);
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        qatomic_set(&timers_state.icount_time_shift,
                    timers_state.icount_time_shift + 1);
    }
    last_delta = delta;
    qatomic_set_i64(&timers_state.qemu_icount_bias,
                    cur_icount - (timers_state.qemu_icount
                                  << timers_state.icount_time_shift));
    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                         &timers_state.vm_clock_lock);
}
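
/*
 * Note on the adjustment rule above: delta > 0 means virtual time is
 * running ahead of real time, so the shift is decreased to charge fewer
 * nanoseconds per instruction; delta < 0 speeds virtual time back up.
 * The bias is then recomputed so that the scaled instruction count plus
 * the bias still equals the current virtual clock value.
 */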

static void icount_adjust_rt(void *opaque)
{
    timer_mod(timers_state.icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_adjust();
}

static void icount_adjust_vm(void *opaque)
{
    timer_mod(timers_state.icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              NANOSECONDS_PER_SECOND / 10);
    icount_adjust();
}

static int64_t qemu_icount_round(int64_t count)
{
    int shift = qatomic_read(&timers_state.icount_time_shift);
    return (count + (1 << shift) - 1) >> shift;
}
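
/*
 * qemu_icount_round() is a ceiling division by 2^shift: it converts a
 * deadline in nanoseconds into the smallest instruction budget whose
 * scaled duration covers it.  For example, 100 ns with shift 3 rounds
 * to 13 instructions (13 * 8 ns = 104 ns) rather than 12.
 */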

static void icount_warp_rt(void)
{
    unsigned seq;
    int64_t warp_start;

    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    do {
        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        warp_start = timers_state.vm_clock_warp_start;
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));

    if (warp_start == -1) {
        return;
    }

    seqlock_write_lock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
    if (runstate_is_running()) {
        int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
                                            cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - timers_state.vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        qatomic_set_i64(&timers_state.qemu_icount_bias,
                        timers_state.qemu_icount_bias + warp_delta);
    }
    timers_state.vm_clock_warp_start = -1;
    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                         &timers_state.vm_clock_lock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}

void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    AioContext *aio_context;
    assert(qtest_enabled());
    aio_context = qemu_get_aio_context();
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
                                                      QEMU_TIMER_ATTR_ALL);
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);

        seqlock_write_lock(&timers_state.vm_clock_seqlock,
                           &timers_state.vm_clock_lock);
        qatomic_set_i64(&timers_state.qemu_icount_bias,
                        timers_state.qemu_icount_bias + warp);
        seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                             &timers_state.vm_clock_lock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}

void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    if (replay_mode != REPLAY_MODE_PLAY) {
        if (!all_cpu_threads_idle()) {
            return;
        }

        if (qtest_enabled()) {
            /* When testing, qtest commands advance icount.  */
            return;
        }

        replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
    } else {
        /* warp clock deterministically in record/replay mode */
        if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
            /* vCPU is sleeping and warp can't be started.
               It is probably a race condition: notification sent
               to vCPU was processed in advance and vCPU went to sleep.
               Therefore we have to wake it up to do something. */
            if (replay_has_checkpoint()) {
                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
            }
            return;
        }
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
                                          ~QEMU_TIMER_ATTR_EXTERNAL);
    if (deadline < 0) {
        static bool notified;
        if (!icount_sleep && !notified) {
            warn_report("icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no-sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * It is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_lock(&timers_state.vm_clock_seqlock,
                               &timers_state.vm_clock_lock);
            qatomic_set_i64(&timers_state.qemu_icount_bias,
                            timers_state.qemu_icount_bias + deadline);
            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                                 &timers_state.vm_clock_lock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time (related to the time left until the next event) has
             * passed.  The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This keeps the warps from being visible externally; for example,
             * you will not be sending network packets continuously instead of
             * every 100 ms.
             */
            seqlock_write_lock(&timers_state.vm_clock_seqlock,
                               &timers_state.vm_clock_lock);
            if (timers_state.vm_clock_warp_start == -1
                || timers_state.vm_clock_warp_start > clock) {
                timers_state.vm_clock_warp_start = clock;
            }
            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                                 &timers_state.vm_clock_lock);
            timer_mod_anticipate(timers_state.icount_warp_timer,
                                 clock + deadline);
        }
    } else if (deadline == 0) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void qemu_account_warp_timer(void)
{
    if (!use_icount || !icount_sleep) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
        return;
    }

    timer_del(timers_state.icount_warp_timer);
    icount_warp_rt();
}

static bool icount_state_needed(void *opaque)
{
    return use_icount;
}

static bool warp_timer_state_needed(void *opaque)
{
    TimersState *s = opaque;
    return s->icount_warp_timer != NULL;
}

static bool adjust_timers_state_needed(void *opaque)
{
    TimersState *s = opaque;
    return s->icount_rt_timer != NULL;
}

static bool shift_state_needed(void *opaque)
{
    return use_icount == 2;
}

/*
 * Subsection for warp timer migration is optional, because it may not be
 * created.
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription icount_vmstate_shift = {
    .name = "timer/icount/shift",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = shift_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT16(icount_time_shift, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

/*
 * This is a subsection for icount migration.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        &icount_vmstate_shift,
        NULL
    }
};

static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_UNUSED(8),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};

void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock);
    qemu_spin_init(&timers_state.vm_clock_lock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    cpu_throttle_init();
}

void configure_icount(QemuOpts *opts, Error **errp)
{
    const char *option = qemu_opt_get(opts, "shift");
    bool sleep = qemu_opt_get_bool(opts, "sleep", true);
    bool align = qemu_opt_get_bool(opts, "align", false);
    long time_shift = -1;

    if (!option) {
        if (qemu_opt_get(opts, "align") != NULL) {
            error_setg(errp, "Please specify shift option when using align");
        }
        return;
    }

    if (align && !sleep) {
        error_setg(errp, "align=on and sleep=off are incompatible");
        return;
    }

    if (strcmp(option, "auto") != 0) {
        if (qemu_strtol(option, NULL, 0, &time_shift) < 0
            || time_shift < 0 || time_shift > MAX_ICOUNT_SHIFT) {
            error_setg(errp, "icount: Invalid shift value");
            return;
        }
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
        return;
    } else if (!icount_sleep) {
        error_setg(errp, "shift=auto and sleep=off are incompatible");
        return;
    }

    icount_sleep = sleep;
    if (icount_sleep) {
        timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                                      icount_timer_cb, NULL);
    }

    icount_align_option = align;

    if (time_shift >= 0) {
        timers_state.icount_time_shift = time_shift;
        use_icount = 1;
        return;
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway.  */
    timers_state.icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers.  */
    timers_state.vm_clock_warp_start = -1;
    timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
                                                icount_adjust_rt, NULL);
    timer_mod(timers_state.icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                                icount_adjust_vm, NULL);
    timer_mod(timers_state.icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              NANOSECONDS_PER_SECOND / 10);
}
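
/*
 * To summarise the two modes configured above: with a fixed shift
 * (use_icount == 1) virtual time is purely a function of executed
 * instructions and is therefore deterministic; with shift=auto
 * (use_icount == 2) the rt/vm timers installed above keep nudging the
 * shift so that virtual time roughly tracks real time.
 */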

/***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU.  If more than one vCPU is running, a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * a vCPU has work to do.
 */

static QEMUTimer *tcg_kick_vcpu_timer;
static CPUState *tcg_current_rr_cpu;

#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)

static inline int64_t qemu_tcg_next_kick(void)
{
    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
}

/* Kick the currently round-robin scheduled vCPU to next */
static void qemu_cpu_kick_rr_next_cpu(void)
{
    CPUState *cpu;
    do {
        cpu = qatomic_mb_read(&tcg_current_rr_cpu);
        if (cpu) {
            cpu_exit(cpu);
        }
    } while (cpu != qatomic_mb_read(&tcg_current_rr_cpu));
}
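
/*
 * The re-read in the loop above closes a race: the RR thread may move
 * tcg_current_rr_cpu on to the next vCPU between our read and the
 * cpu_exit() call, in which case we go around again and kick the new
 * current vCPU too.
 */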

/* Kick all RR vCPUs */
static void qemu_cpu_kick_rr_cpus(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_exit(cpu);
    };
}

static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}

void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
{
    if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
        qemu_notify_event();
        return;
    }

    if (qemu_in_vcpu_thread()) {
        /* A CPU is currently running; kick it back out to the
         * tcg_cpu_exec() loop so it will recalculate its
         * icount deadline immediately.
         */
        qemu_cpu_kick(current_cpu);
    } else if (first_cpu) {
        /* qemu_cpu_kick is not enough to kick a halted CPU out of
         * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
         * causes cpu_thread_is_idle to return false.  This way,
         * handle_icount_deadline can run.
         * If we have no CPUs at all for some reason, we don't
         * need to do anything.
         */
        async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
    }
}

static void kick_tcg_thread(void *opaque)
{
    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    qemu_cpu_kick_rr_next_cpu();
}

static void start_tcg_kick_timer(void)
{
    assert(!mttcg_enabled);
    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                           kick_tcg_thread, NULL);
    }
    if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    }
}

static void stop_tcg_kick_timer(void)
{
    assert(!mttcg_enabled);
    if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
        timer_del(tcg_kick_vcpu_timer);
    }
}

/***********************************************************/
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    CPU_FOREACH(cpu) {
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}

void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_state(cpu);
    }
}

void cpu_synchronize_all_post_reset(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_reset(cpu);
    }
}

void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_init(cpu);
    }
}

void cpu_synchronize_all_pre_loadvm(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_pre_loadvm(cpu);
    }
}

static int do_vm_stop(RunState state, bool send_stop)
{
    int ret = 0;

    if (runstate_is_running()) {
        runstate_set(state);
        cpu_disable_ticks();
        pause_all_vcpus();
        vm_state_notify(0, state);
        if (send_stop) {
            qapi_event_send_stop();
        }
    }

    bdrv_drain_all();
    ret = bdrv_flush_all();

    return ret;
}

/* Special vm_stop() variant for terminating the process.  Historically clients
 * did not expect a QMP STOP event and so we need to retain compatibility.
 */
int vm_shutdown(void)
{
    return do_vm_stop(RUN_STATE_SHUTDOWN, false);
}

static bool cpu_can_run(CPUState *cpu)
{
    if (cpu->stop) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return false;
    }
    return true;
}

static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}

#ifdef CONFIG_LINUX
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}

static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
{
    if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
        sigbus_reraise();
    }

    if (current_cpu) {
        /* Called asynchronously in VCPU thread.  */
        if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
            sigbus_reraise();
        }
    } else {
        /* Called synchronously (via signalfd) in main thread.  */
        if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
            sigbus_reraise();
        }
    }
}

static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}
#else /* !CONFIG_LINUX */
static void qemu_init_sigbus(void)
{
}
#endif /* !CONFIG_LINUX */

static QemuThread io_thread;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;

void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}

void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
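
/*
 * run_on_cpu() queues func on the target vCPU and blocks until it has
 * run; callers typically hold the BQL, which do_run_on_cpu() is passed
 * precisely so it can be dropped while waiting and the vCPU thread can
 * make progress.  Use async_run_on_cpu() when the caller must not block.
 */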

static void qemu_kvm_destroy_vcpu(CPUState *cpu)
{
    if (kvm_destroy_vcpu(cpu) < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}

static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}

static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}

static void qemu_wait_io_event_common(CPUState *cpu)
{
    qatomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}

static void qemu_tcg_rr_wait_io_event(void)
{
    CPUState *cpu;

    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}

static void qemu_wait_io_event(CPUState *cpu)
{
    bool slept = false;

    while (cpu_thread_is_idle(cpu)) {
        if (!slept) {
            slept = true;
            qemu_plugin_vcpu_idle_cb(cpu);
        }
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }
    if (slept) {
        qemu_plugin_vcpu_resume_cb(cpu);
    }

#ifdef _WIN32
    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
    if (!tcg_enabled()) {
        SleepEx(0, TRUE);
    }
#endif
    qemu_wait_io_event_common(cpu);
}

static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
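
/*
 * All the vCPU thread functions in this file follow the same shape as
 * qemu_kvm_cpu_thread_fn() above: register with RCU, take the BQL,
 * initialise the accelerator, signal qemu_cpu_cond so qemu_init_vcpu()
 * can return, then loop between running the guest and
 * qemu_wait_io_event() until the vCPU is unplugged.
 */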

static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    error_report("qtest is not supported under Windows");
    exit(1);
#else
    CPUState *cpu = arg;
    sigset_t waitset;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug);

    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
#endif
}

static int64_t tcg_get_icount_limit(void)
{
    int64_t deadline;

    if (replay_mode != REPLAY_MODE_PLAY) {
        /*
         * Include all the timers, because they may need attention.
         * Overly long CPU execution may create unnecessary delay in
         * the UI.
         */
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
                                              QEMU_TIMER_ATTR_ALL);
        /* Check realtime timers, because they help with input processing */
        deadline = qemu_soonest_timeout(deadline,
                qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
                                           QEMU_TIMER_ATTR_ALL));

        /* Maintain prior (possibly buggy) behaviour where if no deadline
         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
         * nanoseconds.
         */
        if ((deadline < 0) || (deadline > INT32_MAX)) {
            deadline = INT32_MAX;
        }

        return qemu_icount_round(deadline);
    } else {
        return replay_get_instructions();
    }
}

static void notify_aio_contexts(void)
{
    /* Wake up other AioContexts.  */
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
}

static void handle_icount_deadline(void)
{
    assert(qemu_in_vcpu_thread());
    if (use_icount) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
                                                      QEMU_TIMER_ATTR_ALL);

        if (deadline == 0) {
            notify_aio_contexts();
        }
    }
}

static void prepare_icount_for_run(CPUState *cpu)
{
    if (use_icount) {
        int insns_left;

        /* These should always be cleared by process_icount_data after
         * each vCPU execution.  However u16.high can be raised
         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
         */
        g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
        g_assert(cpu->icount_extra == 0);

        cpu->icount_budget = tcg_get_icount_limit();
        insns_left = MIN(0xffff, cpu->icount_budget);
        cpu_neg(cpu)->icount_decr.u16.low = insns_left;
        cpu->icount_extra = cpu->icount_budget - insns_left;

        replay_mutex_lock();

        if (cpu->icount_budget == 0 && replay_has_checkpoint()) {
            notify_aio_contexts();
        }
    }
}
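
/*
 * Worked example for the budget split above: a limit of 100000
 * instructions yields icount_decr.u16.low = 0xffff (65535), which the
 * generated code decrements as it runs, and icount_extra = 34465,
 * handed out in later refills; cpu_get_icount_executed() recovers the
 * total executed from whatever is left in both counters.
 */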

static void process_icount_data(CPUState *cpu)
{
    if (use_icount) {
        /* Account for executed instructions */
        cpu_update_icount(cpu);

        /* Reset the counters */
        cpu_neg(cpu)->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        cpu->icount_budget = 0;

        replay_account_executed_instructions();

        replay_mutex_unlock();
    }
}


static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

    assert(tcg_enabled());
#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    qatomic_set(&tcg_ctx->prof.cpu_exec_time,
                tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
#endif
    return ret;
}

/* Destroy any remaining vCPUs which have been unplugged and have
 * finished running
 */
static void deal_with_unplugged_cpus(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (cpu->unplug && !cpu_can_run(cpu)) {
            qemu_tcg_destroy_vcpu(cpu);
            cpu->created = false;
            qemu_cond_signal(&qemu_cpu_cond);
            break;
        }
    }
}

/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn.  If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 */

static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        if (!cpu) {
            cpu = first_cpu;
        }

        while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) {

            qatomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need qatomic_mb_set because a spurious wakeup is okay.  */
        qatomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            qatomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g. in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    rcu_unregister_thread();
    return NULL;
}

static void *qemu_hax_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();
    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    current_cpu = cpu;

    hax_init_vcpu(cpu);
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = hax_smp_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }

        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));
    rcu_unregister_thread();
    return NULL;
}

/* The HVF-specific vCPU thread function.  This one should only run when the
 * host CPU supports the VMX "unrestricted guest" feature.
 */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}

static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}

#ifdef _WIN32
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
#endif

/* Multi-threaded TCG
 *
 * In the multi-threaded case each vCPU has its own thread.  The TLS
 * variable current_cpu can be used deep in the code to find the
 * current CPUState for a given thread.
 */

static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    g_assert(!use_icount);

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* process any pending work */
    cpu->exit_request = 1;

    do {
        if (cpu_can_run(cpu)) {
            int r;
            qemu_mutex_unlock_iothread();
            r = tcg_cpu_exec(cpu);
            qemu_mutex_lock_iothread();
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times.  If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
                /* fall through */
            default:
                /* Ignore everything else? */
                break;
            }
        }

        qatomic_mb_set(&cpu->exit_request, 0);
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    qemu_tcg_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}

static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    if (cpu->thread_kicked) {
        return;
    }
    cpu->thread_kicked = true;
    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err && err != ESRCH) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    if (!qemu_cpu_is_self(cpu)) {
        if (whpx_enabled()) {
            whpx_vcpu_kick(cpu);
        } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
            fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
                    __func__, GetLastError());
            exit(1);
        }
    }
#endif
}
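
/*
 * The kick mechanism above is host-specific: on POSIX hosts a SIG_IPI
 * signal interrupts the vCPU thread, while on Windows WHPX has a native
 * kick and other accelerators are woken with a dummy APC that
 * qemu_wait_io_event() later eats.
 */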

void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (tcg_enabled()) {
        if (qemu_tcg_mttcg_enabled()) {
            cpu_exit(cpu);
        } else {
            qemu_cpu_kick_rr_cpus();
        }
    } else {
        if (hax_enabled()) {
            /*
             * FIXME: race condition with the exit_request check in
             * hax_vcpu_hax_exec
             */
            cpu->exit_request = 1;
        }
        qemu_cpu_kick_thread(cpu);
    }
}

void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}

bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}

bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}

static __thread bool iothread_locked = false;

bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}

/*
 * The BQL is taken from so many places that it is worth profiling the
 * callers directly, instead of funneling them all through a single function.
 */
void qemu_mutex_lock_iothread_impl(const char *file, int line)
{
    QemuMutexLockFunc bql_lock = qatomic_read(&qemu_bql_mutex_lock_func);

    g_assert(!qemu_mutex_iothread_locked());
    bql_lock(&qemu_global_mutex, file, line);
    iothread_locked = true;
}

void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}

void qemu_cond_wait_iothread(QemuCond *cond)
{
    qemu_cond_wait(cond, &qemu_global_mutex);
}

void qemu_cond_timedwait_iothread(QemuCond *cond, int ms)
{
    qemu_cond_timedwait(cond, &qemu_global_mutex, ms);
}

static bool all_vcpus_paused(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu->stopped) {
            return false;
        }
    }

    return true;
}

void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
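
/*
 * Note the repeated qemu_cpu_kick() in the wait loop above: a vCPU may
 * consume a kick without yet observing cpu->stop (e.g. it was about to
 * re-enter the guest), so pausing keeps kicking until every vCPU has
 * reported stopped via qemu_pause_cond.
 */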

void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}

void resume_all_vcpus(void)
{
    CPUState *cpu;

    if (!runstate_is_running()) {
        return;
    }

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}

void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}

/* Size of the temporary buffers used to form a vCPU thread name */
#define VCPU_THREAD_NAME_SIZE 16

static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *single_tcg_halt_cond;
    static QemuThread *single_tcg_cpu_thread;
    static int tcg_region_inited;

    assert(tcg_enabled());
    /*
     * Initialize TCG regions--once.  Now is a good time, because:
     * (1) TCG's init context, prologue and target globals have been set up.
     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
     *     -accel flag is processed, so the check doesn't work then).
     */
    if (!tcg_region_inited) {
        tcg_region_inited = 1;
        tcg_region_init();
        /*
         * If MTTCG, and we will create multiple cpus,
         * then we will have cpus running in parallel.
         */
        if (qemu_tcg_mttcg_enabled()) {
            MachineState *ms = MACHINE(qdev_get_machine());
            if (ms->smp.max_cpus > 1) {
                parallel_cpus = true;
            }
        }
    }

    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);

        if (qemu_tcg_mttcg_enabled()) {
            /* create a thread per vCPU with TCG (MTTCG) */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                     cpu->cpu_index);

            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

        } else {
            /* share a single thread for all cpus with TCG */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
            qemu_thread_create(cpu->thread, thread_name,
                               qemu_tcg_rr_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

            single_tcg_halt_cond = cpu->halt_cond;
            single_tcg_cpu_thread = cpu->thread;
        }
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    } else {
        /* For non-MTTCG cases we share the thread */
        cpu->thread = single_tcg_cpu_thread;
        cpu->halt_cond = single_tcg_halt_cond;
        cpu->thread_id = first_cpu->thread_id;
        cpu->can_do_io = 1;
        cpu->created = true;
    }
}

static void qemu_hax_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);

    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
    cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
}

static void qemu_kvm_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
}

static void qemu_hvf_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    /* HVF currently does not support TCG, and only runs in
     * unrestricted-guest mode. */
    assert(hvf_enabled());

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);

    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
}

static void qemu_whpx_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
    cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
}

static void qemu_dummy_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
                       QEMU_THREAD_JOINABLE);
}

void qemu_init_vcpu(CPUState *cpu)
{
    MachineState *ms = MACHINE(qdev_get_machine());

    cpu->nr_cores = ms->smp.cores;
    cpu->nr_threads = ms->smp.threads;
    cpu->stopped = true;
    cpu->random_seed = qemu_guest_random_seed_thread_part1();

    if (!cpu->as) {
        /* If the target cpu hasn't set up any address spaces itself,
         * give it the default one.
         */
        cpu->num_ases = 1;
        cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
    }

    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
    } else if (hax_enabled()) {
        qemu_hax_start_vcpu(cpu);
    } else if (hvf_enabled()) {
        qemu_hvf_start_vcpu(cpu);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else if (whpx_enabled()) {
        qemu_whpx_start_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }

    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

void cpu_stop_current(void)
{
    if (current_cpu) {
        current_cpu->stop = true;
        cpu_exit(current_cpu);
    }
}

int vm_stop(RunState state)
{
    if (qemu_in_vcpu_thread()) {
        qemu_system_vmstop_request_prepare();
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return 0;
    }

    return do_vm_stop(state, true);
}

/**
 * Prepare for (re)starting the VM.
 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
 * running or in case of an error condition), 0 otherwise.
 */
int vm_prepare_start(void)
{
    RunState requested;

    qemu_vmstop_requested(&requested);
    if (runstate_is_running() && requested == RUN_STATE__MAX) {
        return -1;
    }

    /* Ensure that a STOP/RESUME pair of events is emitted if a
     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
     * example, according to documentation is always followed by
     * the STOP event.
     */
    if (runstate_is_running()) {
        qapi_event_send_stop();
        qapi_event_send_resume();
        return -1;
    }

    /* We are sending this now, but the CPUs will be resumed shortly later */
    qapi_event_send_resume();

    cpu_enable_ticks();
    runstate_set(RUN_STATE_RUNNING);
    vm_state_notify(1, RUN_STATE_RUNNING);
    return 0;
}

void vm_start(void)
{
    if (!vm_prepare_start()) {
        resume_all_vcpus();
    }
}

/* Does a state transition even if the VM is already stopped; the current
 * state is forgotten forever.
 */
int vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        return vm_stop(state);
    } else {
        runstate_set(state);

        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
        return bdrv_flush_all();
    }
}

void list_cpus(const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list();
#endif
}

void qmp_memsave(int64_t addr, int64_t size, const char *filename,
                 bool has_cpu, int64_t cpu_index, Error **errp)
{
    FILE *f;
    uint32_t l;
    CPUState *cpu;
    uint8_t buf[1024];
    int64_t orig_addr = addr, orig_size = size;

    if (!has_cpu) {
        cpu_index = 0;
    }

    cpu = qemu_get_cpu(cpu_index);
    if (cpu == NULL) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
                   "a CPU number");
        return;
    }

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
                             " specified", orig_addr, orig_size);
            goto exit;
        }
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
                  Error **errp)
{
    FILE *f;
    uint32_t l;
    uint8_t buf[1024];

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        cpu_physical_memory_read(addr, buf, l);
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}

void dump_drift_info(void)
{
    if (!use_icount) {
        return;
    }

    qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
                (cpu_get_clock() - cpu_get_icount()) / SCALE_MS);
    if (icount_align_option) {
        qemu_printf("Max guest delay     %"PRIi64" ms\n",
                    -max_delay / SCALE_MS);
        qemu_printf("Max guest advance   %"PRIi64" ms\n",
                    max_advance / SCALE_MS);
    } else {
        qemu_printf("Max guest delay     NA\n");
        qemu_printf("Max guest advance   NA\n");
    }
}