cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 /* Needed early for CONFIG_BSD etc. */
  26 #include "qemu/osdep.h"
  27 #include "qemu-common.h"
  28 #include "qemu/config-file.h"
  29 #include "cpu.h"
  30 #include "monitor/monitor.h"
  31 #include "qapi/qmp/qerror.h"
  32 #include "qemu/error-report.h"
  33 #include "sysemu/sysemu.h"
  34 #include "sysemu/block-backend.h"
  35 #include "exec/gdbstub.h"
  36 #include "sysemu/dma.h"
  37 #include "sysemu/hw_accel.h"
  38 #include "sysemu/kvm.h"
  39 #include "sysemu/hax.h"
  40 #include "qmp-commands.h"
  41 #include "exec/exec-all.h"
  42
  43 #include "qemu/thread.h"
  44 #include "sysemu/cpus.h"
  45 #include "sysemu/qtest.h"
  46 #include "qemu/main-loop.h"
  47 #include "qemu/bitmap.h"
  48 #include "qemu/seqlock.h"
  49 #include "tcg.h"
  50 #include "qapi-event.h"
  51 #include "hw/nmi.h"
  52 #include "sysemu/replay.h"
  53
  54 #ifdef CONFIG_LINUX
  55
  56 #include <sys/prctl.h>
  57
  58 #ifndef PR_MCE_KILL
  59 #define PR_MCE_KILL 33
  60 #endif
  61
  62 #ifndef PR_MCE_KILL_SET
  63 #define PR_MCE_KILL_SET 1
  64 #endif
  65
  66 #ifndef PR_MCE_KILL_EARLY
  67 #define PR_MCE_KILL_EARLY 1
  68 #endif
  69
  70 #endif /* CONFIG_LINUX */
  71
/* NOTE(review): neither variable is referenced in the visible part of this
 * file; presumably they hold icount "align" delay/advance statistics that
 * are updated elsewhere -- confirm against their users.
 */
int64_t max_delay;
int64_t max_advance;
  74
/* vcpu throttling controls */
/* Timer driving cpu_throttle_timer_tick(); created in cpu_ticks_init(). */
static QEMUTimer *throttle_timer;
/* Current throttle percentage, accessed atomically; 0 means throttling is
 * off (see cpu_throttle_stop() and cpu_throttle_active()).
 */
static unsigned int throttle_percentage;

/* Bounds that cpu_throttle_set() clamps the requested percentage to. */
#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/* Base time slice in nanoseconds (10 ms) used to derive both the vCPU sleep
 * time and the throttle timer period.
 */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
  82
  83 bool cpu_is_stopped(CPUState *cpu)
  84 {
  85     return cpu->stopped || !runstate_is_running();
  86 }
  87
  88 static bool cpu_thread_is_idle(CPUState *cpu)
  89 {
  90     if (cpu->stop || cpu->queued_work_first) {
  91         return false;
  92     }
  93     if (cpu_is_stopped(cpu)) {
  94         return true;
  95     }
  96     if (!cpu->halted || cpu_has_work(cpu) ||
  97         kvm_halt_in_kernel()) {
  98         return false;
  99     }
 100     return true;
 101 }
 102
 103 static bool all_cpu_threads_idle(void)
 104 {
 105     CPUState *cpu;
 106
 107     CPU_FOREACH(cpu) {
 108         if (!cpu_thread_is_idle(cpu)) {
 109             return false;
 110         }
 111     }
 112     return true;
 113 }
 114
 115 /***********************************************************/
 116 /* guest cycle counter */
 117
/* Protected by TimersState seqlock */

/* Set from the icount "sleep" option in configure_icount(). */
static bool icount_sleep = true;
/* QEMU_CLOCK_VIRTUAL_RT value at which the pending warp started, or -1 when
 * no warp is pending (icount_warp_rt() returns early on -1).
 */
static int64_t vm_clock_warp_start = -1;
/* Conversion factor from emulated instructions to virtual clock ticks.  */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
/* Upper bound that icount_adjust() lets icount_time_shift grow to. */
#define MAX_ICOUNT_SHIFT 10

/* Timers created in configure_icount(): the first two periodically run
 * icount_adjust(), the third fires icount_timer_cb() to apply a warp.
 */
static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;
 130
/* Bookkeeping for the guest-visible tick counter, the VM clock and the
 * instruction counter.  Parts of it are migrated via vmstate_timers.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    /* Last value returned by cpu_get_ticks(); keeps the result monotonic. */
    int64_t cpu_ticks_prev;
    /* Added to the host tick counter to obtain guest ticks. */
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read out of BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    int64_t cpu_clock_offset;
    /* Non-zero between cpu_enable_ticks() and cpu_disable_ticks(). */
    int32_t cpu_ticks_enabled;
    /* Only referenced by vmstate_timers in the visible code; presumably a
     * placeholder kept for migration stream layout -- TODO confirm.
     */
    int64_t dummy;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
} TimersState;

/* The single instance, registered for migration in cpu_ticks_init(). */
static TimersState timers_state;
/* Set by qemu_tcg_configure(): true when multi-threaded TCG is selected. */
bool mttcg_enabled;
 152
 153 /*
 154  * We default to false if we know other options have been enabled
 155  * which are currently incompatible with MTTCG. Otherwise when each
 156  * guest (target) has been updated to support:
 157  *   - atomic instructions
 158  *   - memory ordering primitives (barriers)
 159  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 160  *
 161  * Once a guest architecture has been converted to the new primitives
 162  * there are two remaining limitations to check.
 163  *
 164  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 165  * - The host must have a stronger memory order than the guest
 166  *
 167  * It may be possible in future to support strong guests on weak hosts
 168  * but that will require tagging all load/stores in a guest with their
 169  * implicit memory order requirements which would likely slow things
 170  * down a lot.
 171  */
 172
 173 static bool check_tcg_memory_orders_compatible(void)
 174 {
 175 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 176     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 177 #else
 178     return false;
 179 #endif
 180 }
 181
 182 static bool default_mttcg_enabled(void)
 183 {
 184     if (use_icount || TCG_OVERSIZED_GUEST) {
 185         return false;
 186     } else {
 187 #ifdef TARGET_SUPPORTS_MTTCG
 188         return check_tcg_memory_orders_compatible();
 189 #else
 190         return false;
 191 #endif
 192     }
 193 }
 194
 195 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 196 {
 197     const char *t = qemu_opt_get(opts, "thread");
 198     if (t) {
 199         if (strcmp(t, "multi") == 0) {
 200             if (TCG_OVERSIZED_GUEST) {
 201                 error_setg(errp, "No MTTCG when guest word size > hosts");
 202             } else if (use_icount) {
 203                 error_setg(errp, "No MTTCG when icount is enabled");
 204             } else {
 205 #ifndef TARGET_SUPPORT_MTTCG
 206                 error_report("Guest not yet converted to MTTCG - "
 207                              "you may get unexpected results");
 208 #endif
 209                 if (!check_tcg_memory_orders_compatible()) {
 210                     error_report("Guest expects a stronger memory ordering "
 211                                  "than the host provides");
 212                     error_printf("This may cause strange/hard to debug errors");
 213                 }
 214                 mttcg_enabled = true;
 215             }
 216         } else if (strcmp(t, "single") == 0) {
 217             mttcg_enabled = false;
 218         } else {
 219             error_setg(errp, "Invalid 'thread' setting %s", t);
 220         }
 221     } else {
 222         mttcg_enabled = default_mttcg_enabled();
 223     }
 224 }
 225
 226 int64_t cpu_get_icount_raw(void)
 227 {
 228     int64_t icount;
 229     CPUState *cpu = current_cpu;
 230
 231     icount = timers_state.qemu_icount;
 232     if (cpu) {
 233         if (!cpu->can_do_io) {
 234             fprintf(stderr, "Bad icount read\n");
 235             exit(1);
 236         }
 237         icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
 238     }
 239     return icount;
 240 }
 241
 242 /* Return the virtual CPU time, based on the instruction counter.  */
 243 static int64_t cpu_get_icount_locked(void)
 244 {
 245     int64_t icount = cpu_get_icount_raw();
 246     return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
 247 }
 248
 249 int64_t cpu_get_icount(void)
 250 {
 251     int64_t icount;
 252     unsigned start;
 253
 254     do {
 255         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 256         icount = cpu_get_icount_locked();
 257     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 258
 259     return icount;
 260 }
 261
 262 int64_t cpu_icount_to_ns(int64_t icount)
 263 {
 264     return icount << icount_time_shift;
 265 }
 266
 267 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 268  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 269  * counter.
 270  *
 271  * Caller must hold the BQL
 272  */
 273 int64_t cpu_get_ticks(void)
 274 {
 275     int64_t ticks;
 276
 277     if (use_icount) {
 278         return cpu_get_icount();
 279     }
 280
 281     ticks = timers_state.cpu_ticks_offset;
 282     if (timers_state.cpu_ticks_enabled) {
 283         ticks += cpu_get_host_ticks();
 284     }
 285
 286     if (timers_state.cpu_ticks_prev > ticks) {
 287         /* Note: non increasing ticks may happen if the host uses
 288            software suspend */
 289         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 290         ticks = timers_state.cpu_ticks_prev;
 291     }
 292
 293     timers_state.cpu_ticks_prev = ticks;
 294     return ticks;
 295 }
 296
 297 static int64_t cpu_get_clock_locked(void)
 298 {
 299     int64_t time;
 300
 301     time = timers_state.cpu_clock_offset;
 302     if (timers_state.cpu_ticks_enabled) {
 303         time += get_clock();
 304     }
 305
 306     return time;
 307 }
 308
 309 /* Return the monotonic time elapsed in VM, i.e.,
 310  * the time between vm_start and vm_stop
 311  */
 312 int64_t cpu_get_clock(void)
 313 {
 314     int64_t ti;
 315     unsigned start;
 316
 317     do {
 318         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 319         ti = cpu_get_clock_locked();
 320     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 321
 322     return ti;
 323 }
 324
 325 /* enable cpu_get_ticks()
 326  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 327  */
 328 void cpu_enable_ticks(void)
 329 {
 330     /* Here, the really thing protected by seqlock is cpu_clock_offset. */
 331     seqlock_write_begin(&timers_state.vm_clock_seqlock);
 332     if (!timers_state.cpu_ticks_enabled) {
 333         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 334         timers_state.cpu_clock_offset -= get_clock();
 335         timers_state.cpu_ticks_enabled = 1;
 336     }
 337     seqlock_write_end(&timers_state.vm_clock_seqlock);
 338 }
 339
 340 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 341  * cpu_get_ticks() after that.
 342  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 343  */
 344 void cpu_disable_ticks(void)
 345 {
 346     /* Here, the really thing protected by seqlock is cpu_clock_offset. */
 347     seqlock_write_begin(&timers_state.vm_clock_seqlock);
 348     if (timers_state.cpu_ticks_enabled) {
 349         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 350         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 351         timers_state.cpu_ticks_enabled = 0;
 352     }
 353     seqlock_write_end(&timers_state.vm_clock_seqlock);
 354 }
 355
 356 /* Correlation between real and virtual time is always going to be
 357    fairly approximate, so ignore small variation.
 358    When the guest is idle real and virtual time will be aligned in
 359    the IO wait loop.  */
 360 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 361
 362 static void icount_adjust(void)
 363 {
 364     int64_t cur_time;
 365     int64_t cur_icount;
 366     int64_t delta;
 367
 368     /* Protected by TimersState mutex.  */
 369     static int64_t last_delta;
 370
 371     /* If the VM is not running, then do nothing.  */
 372     if (!runstate_is_running()) {
 373         return;
 374     }
 375
 376     seqlock_write_begin(&timers_state.vm_clock_seqlock);
 377     cur_time = cpu_get_clock_locked();
 378     cur_icount = cpu_get_icount_locked();
 379
 380     delta = cur_icount - cur_time;
 381     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 382     if (delta > 0
 383         && last_delta + ICOUNT_WOBBLE < delta * 2
 384         && icount_time_shift > 0) {
 385         /* The guest is getting too far ahead.  Slow time down.  */
 386         icount_time_shift--;
 387     }
 388     if (delta < 0
 389         && last_delta - ICOUNT_WOBBLE > delta * 2
 390         && icount_time_shift < MAX_ICOUNT_SHIFT) {
 391         /* The guest is getting too far behind.  Speed time up.  */
 392         icount_time_shift++;
 393     }
 394     last_delta = delta;
 395     timers_state.qemu_icount_bias = cur_icount
 396                               - (timers_state.qemu_icount << icount_time_shift);
 397     seqlock_write_end(&timers_state.vm_clock_seqlock);
 398 }
 399
 400 static void icount_adjust_rt(void *opaque)
 401 {
 402     timer_mod(icount_rt_timer,
 403               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 404     icount_adjust();
 405 }
 406
 407 static void icount_adjust_vm(void *opaque)
 408 {
 409     timer_mod(icount_vm_timer,
 410                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 411                    NANOSECONDS_PER_SECOND / 10);
 412     icount_adjust();
 413 }
 414
 415 static int64_t qemu_icount_round(int64_t count)
 416 {
 417     return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
 418 }
 419
 420 static void icount_warp_rt(void)
 421 {
 422     unsigned seq;
 423     int64_t warp_start;
 424
 425     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 426      * changes from -1 to another value, so the race here is okay.
 427      */
 428     do {
 429         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 430         warp_start = vm_clock_warp_start;
 431     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 432
 433     if (warp_start == -1) {
 434         return;
 435     }
 436
 437     seqlock_write_begin(&timers_state.vm_clock_seqlock);
 438     if (runstate_is_running()) {
 439         int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
 440                                      cpu_get_clock_locked());
 441         int64_t warp_delta;
 442
 443         warp_delta = clock - vm_clock_warp_start;
 444         if (use_icount == 2) {
 445             /*
 446              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 447              * far ahead of real time.
 448              */
 449             int64_t cur_icount = cpu_get_icount_locked();
 450             int64_t delta = clock - cur_icount;
 451             warp_delta = MIN(warp_delta, delta);
 452         }
 453         timers_state.qemu_icount_bias += warp_delta;
 454     }
 455     vm_clock_warp_start = -1;
 456     seqlock_write_end(&timers_state.vm_clock_seqlock);
 457
 458     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 459         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 460     }
 461 }
 462
 463 static void icount_timer_cb(void *opaque)
 464 {
 465     /* No need for a checkpoint because the timer already synchronizes
 466      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 467      */
 468     icount_warp_rt();
 469 }
 470
 471 void qtest_clock_warp(int64_t dest)
 472 {
 473     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 474     AioContext *aio_context;
 475     assert(qtest_enabled());
 476     aio_context = qemu_get_aio_context();
 477     while (clock < dest) {
 478         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 479         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 480
 481         seqlock_write_begin(&timers_state.vm_clock_seqlock);
 482         timers_state.qemu_icount_bias += warp;
 483         seqlock_write_end(&timers_state.vm_clock_seqlock);
 484
 485         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 486         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 487         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 488     }
 489     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 490 }
 491
 492 void qemu_start_warp_timer(void)
 493 {
 494     int64_t clock;
 495     int64_t deadline;
 496
 497     if (!use_icount) {
 498         return;
 499     }
 500
 501     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 502      * do not fire, so computing the deadline does not make sense.
 503      */
 504     if (!runstate_is_running()) {
 505         return;
 506     }
 507
 508     /* warp clock deterministically in record/replay mode */
 509     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 510         return;
 511     }
 512
 513     if (!all_cpu_threads_idle()) {
 514         return;
 515     }
 516
 517     if (qtest_enabled()) {
 518         /* When testing, qtest commands advance icount.  */
 519         return;
 520     }
 521
 522     /* We want to use the earliest deadline from ALL vm_clocks */
 523     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 524     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 525     if (deadline < 0) {
 526         static bool notified;
 527         if (!icount_sleep && !notified) {
 528             error_report("WARNING: icount sleep disabled and no active timers");
 529             notified = true;
 530         }
 531         return;
 532     }
 533
 534     if (deadline > 0) {
 535         /*
 536          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 537          * sleep.  Otherwise, the CPU might be waiting for a future timer
 538          * interrupt to wake it up, but the interrupt never comes because
 539          * the vCPU isn't running any insns and thus doesn't advance the
 540          * QEMU_CLOCK_VIRTUAL.
 541          */
 542         if (!icount_sleep) {
 543             /*
 544              * We never let VCPUs sleep in no sleep icount mode.
 545              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 546              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 547              * It is useful when we want a deterministic execution time,
 548              * isolated from host latencies.
 549              */
 550             seqlock_write_begin(&timers_state.vm_clock_seqlock);
 551             timers_state.qemu_icount_bias += deadline;
 552             seqlock_write_end(&timers_state.vm_clock_seqlock);
 553             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 554         } else {
 555             /*
 556              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 557              * "real" time, (related to the time left until the next event) has
 558              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 559              * This avoids that the warps are visible externally; for example,
 560              * you will not be sending network packets continuously instead of
 561              * every 100ms.
 562              */
 563             seqlock_write_begin(&timers_state.vm_clock_seqlock);
 564             if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
 565                 vm_clock_warp_start = clock;
 566             }
 567             seqlock_write_end(&timers_state.vm_clock_seqlock);
 568             timer_mod_anticipate(icount_warp_timer, clock + deadline);
 569         }
 570     } else if (deadline == 0) {
 571         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 572     }
 573 }
 574
 575 static void qemu_account_warp_timer(void)
 576 {
 577     if (!use_icount || !icount_sleep) {
 578         return;
 579     }
 580
 581     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 582      * do not fire, so computing the deadline does not make sense.
 583      */
 584     if (!runstate_is_running()) {
 585         return;
 586     }
 587
 588     /* warp clock deterministically in record/replay mode */
 589     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 590         return;
 591     }
 592
 593     timer_del(icount_warp_timer);
 594     icount_warp_rt();
 595 }
 596
 597 static bool icount_state_needed(void *opaque)
 598 {
 599     return use_icount;
 600 }
 601
/*
 * This is a subsection for icount migration.
 * It carries the icount bias and the instruction counter, and is gated by
 * icount_state_needed().
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 616
/* Migration description of timers_state (registered in cpu_ticks_init()).
 * cpu_clock_offset is declared as a version-2 field; the icount state is
 * attached as a subsection.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
 632
 633 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 634 {
 635     double pct;
 636     double throttle_ratio;
 637     long sleeptime_ns;
 638
 639     if (!cpu_throttle_get_percentage()) {
 640         return;
 641     }
 642
 643     pct = (double)cpu_throttle_get_percentage()/100;
 644     throttle_ratio = pct / (1 - pct);
 645     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
 646
 647     qemu_mutex_unlock_iothread();
 648     atomic_set(&cpu->throttle_thread_scheduled, 0);
 649     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
 650     qemu_mutex_lock_iothread();
 651 }
 652
 653 static void cpu_throttle_timer_tick(void *opaque)
 654 {
 655     CPUState *cpu;
 656     double pct;
 657
 658     /* Stop the timer if needed */
 659     if (!cpu_throttle_get_percentage()) {
 660         return;
 661     }
 662     CPU_FOREACH(cpu) {
 663         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 664             async_run_on_cpu(cpu, cpu_throttle_thread,
 665                              RUN_ON_CPU_NULL);
 666         }
 667     }
 668
 669     pct = (double)cpu_throttle_get_percentage()/100;
 670     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 671                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 672 }
 673
 674 void cpu_throttle_set(int new_throttle_pct)
 675 {
 676     /* Ensure throttle percentage is within valid range */
 677     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 678     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 679
 680     atomic_set(&throttle_percentage, new_throttle_pct);
 681
 682     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 683                                        CPU_THROTTLE_TIMESLICE_NS);
 684 }
 685
 686 void cpu_throttle_stop(void)
 687 {
 688     atomic_set(&throttle_percentage, 0);
 689 }
 690
 691 bool cpu_throttle_active(void)
 692 {
 693     return (cpu_throttle_get_percentage() != 0);
 694 }
 695
 696 int cpu_throttle_get_percentage(void)
 697 {
 698     return atomic_read(&throttle_percentage);
 699 }
 700
 701 void cpu_ticks_init(void)
 702 {
 703     seqlock_init(&timers_state.vm_clock_seqlock);
 704     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 705     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 706                                            cpu_throttle_timer_tick, NULL);
 707 }
 708
 709 void configure_icount(QemuOpts *opts, Error **errp)
 710 {
 711     const char *option;
 712     char *rem_str = NULL;
 713
 714     option = qemu_opt_get(opts, "shift");
 715     if (!option) {
 716         if (qemu_opt_get(opts, "align") != NULL) {
 717             error_setg(errp, "Please specify shift option when using align");
 718         }
 719         return;
 720     }
 721
 722     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 723     if (icount_sleep) {
 724         icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 725                                          icount_timer_cb, NULL);
 726     }
 727
 728     icount_align_option = qemu_opt_get_bool(opts, "align", false);
 729
 730     if (icount_align_option && !icount_sleep) {
 731         error_setg(errp, "align=on and sleep=off are incompatible");
 732     }
 733     if (strcmp(option, "auto") != 0) {
 734         errno = 0;
 735         icount_time_shift = strtol(option, &rem_str, 0);
 736         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 737             error_setg(errp, "icount: Invalid shift value");
 738         }
 739         use_icount = 1;
 740         return;
 741     } else if (icount_align_option) {
 742         error_setg(errp, "shift=auto and align=on are incompatible");
 743     } else if (!icount_sleep) {
 744         error_setg(errp, "shift=auto and sleep=off are incompatible");
 745     }
 746
 747     use_icount = 2;
 748
 749     /* 125MIPS seems a reasonable initial guess at the guest speed.
 750        It will be corrected fairly quickly anyway.  */
 751     icount_time_shift = 3;
 752
 753     /* Have both realtime and virtual time triggers for speed adjustment.
 754        The realtime trigger catches emulated time passing too slowly,
 755        the virtual time trigger catches emulated time passing too fast.
 756        Realtime triggers occur even when idle, so use them less frequently
 757        than VM triggers.  */
 758     icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 759                                    icount_adjust_rt, NULL);
 760     timer_mod(icount_rt_timer,
 761                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 762     icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 763                                         icount_adjust_vm, NULL);
 764     timer_mod(icount_vm_timer,
 765                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 766                    NANOSECONDS_PER_SECOND / 10);
 767 }
 768
 769 /***********************************************************/
 770 /* TCG vCPU kick timer
 771  *
 772  * The kick timer is responsible for moving single threaded vCPU
 773  * emulation on to the next vCPU. If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
 775  * scheduled.
 776  *
 777  * The timer is removed if all vCPUs are idle and restarted again once
 778  * idleness is complete.
 779  */
 780
/* Kick timer; NULL while stopped (see start/stop_tcg_kick_timer()). */
static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU that qemu_cpu_kick_rr_cpu() forces out of execution; read with
 * atomic_mb_read().  The writer is not in the visible part of this file.
 */
static CPUState *tcg_current_rr_cpu;

/* Interval between kicks: one tenth of a second, in nanoseconds. */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 785
 786 static inline int64_t qemu_tcg_next_kick(void)
 787 {
 788     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 789 }
 790
 791 /* Kick the currently round-robin scheduled vCPU */
 792 static void qemu_cpu_kick_rr_cpu(void)
 793 {
 794     CPUState *cpu;
 795     do {
 796         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 797         if (cpu) {
 798             cpu_exit(cpu);
 799         }
 800     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 801 }
 802
 803 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 804 {
 805     qemu_notify_event();
 806 }
 807
 808 static void kick_tcg_thread(void *opaque)
 809 {
 810     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 811     qemu_cpu_kick_rr_cpu();
 812 }
 813
 814 static void start_tcg_kick_timer(void)
 815 {
 816     if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 817         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 818                                            kick_tcg_thread, NULL);
 819         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 820     }
 821 }
 822
 823 static void stop_tcg_kick_timer(void)
 824 {
 825     if (tcg_kick_vcpu_timer) {
 826         timer_del(tcg_kick_vcpu_timer);
 827         tcg_kick_vcpu_timer = NULL;
 828     }
 829 }
 830
 831 /***********************************************************/
 832 void hw_error(const char *fmt, ...)
 833 {
 834     va_list ap;
 835     CPUState *cpu;
 836
 837     va_start(ap, fmt);
 838     fprintf(stderr, "qemu: hardware error: ");
 839     vfprintf(stderr, fmt, ap);
 840     fprintf(stderr, "\n");
 841     CPU_FOREACH(cpu) {
 842         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
 843         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
 844     }
 845     va_end(ap);
 846     abort();
 847 }
 848
 849 void cpu_synchronize_all_states(void)
 850 {
 851     CPUState *cpu;
 852
 853     CPU_FOREACH(cpu) {
 854         cpu_synchronize_state(cpu);
 855     }
 856 }
 857
 858 void cpu_synchronize_all_post_reset(void)
 859 {
 860     CPUState *cpu;
 861
 862     CPU_FOREACH(cpu) {
 863         cpu_synchronize_post_reset(cpu);
 864     }
 865 }
 866
 867 void cpu_synchronize_all_post_init(void)
 868 {
 869     CPUState *cpu;
 870
 871     CPU_FOREACH(cpu) {
 872         cpu_synchronize_post_init(cpu);
 873     }
 874 }
 875
 876 static int do_vm_stop(RunState state)
 877 {
 878     int ret = 0;
 879
 880     if (runstate_is_running()) {
 881         cpu_disable_ticks();
 882         pause_all_vcpus();
 883         runstate_set(state);
 884         vm_state_notify(0, state);
 885         qapi_event_send_stop(&error_abort);
 886     }
 887
 888     bdrv_drain_all();
 889     replay_disable_events();
 890     ret = bdrv_flush_all();
 891
 892     return ret;
 893 }
 894
 895 static bool cpu_can_run(CPUState *cpu)
 896 {
 897     if (cpu->stop) {
 898         return false;
 899     }
 900     if (cpu_is_stopped(cpu)) {
 901         return false;
 902     }
 903     return true;
 904 }
 905
 906 static void cpu_handle_guest_debug(CPUState *cpu)
 907 {
 908     gdb_set_stop_cpu(cpu);
 909     qemu_system_debug_request();
 910     cpu->stopped = true;
 911 }
 912
 913 #ifdef CONFIG_LINUX
 914 static void sigbus_reraise(void)
 915 {
 916     sigset_t set;
 917     struct sigaction action;
 918
 919     memset(&action, 0, sizeof(action));
 920     action.sa_handler = SIG_DFL;
 921     if (!sigaction(SIGBUS, &action, NULL)) {
 922         raise(SIGBUS);
 923         sigemptyset(&set);
 924         sigaddset(&set, SIGBUS);
 925         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
 926     }
 927     perror("Failed to re-raise SIGBUS!\n");
 928     abort();
 929 }
 930
 931 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
 932 {
 933     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
 934         sigbus_reraise();
 935     }
 936
 937     if (current_cpu) {
 938         /* Called asynchronously in VCPU thread.  */
 939         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
 940             sigbus_reraise();
 941         }
 942     } else {
 943         /* Called synchronously (via signalfd) in main thread.  */
 944         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
 945             sigbus_reraise();
 946         }
 947     }
 948 }
 949
 950 static void qemu_init_sigbus(void)
 951 {
 952     struct sigaction action;
 953
 954     memset(&action, 0, sizeof(action));
 955     action.sa_flags = SA_SIGINFO;
 956     action.sa_sigaction = sigbus_handler;
 957     sigaction(SIGBUS, &action, NULL);
 958
 959     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
 960 }
 961 #else /* !CONFIG_LINUX */
 962 static void qemu_init_sigbus(void)
 963 {
 964 }
 965 #endif /* !CONFIG_LINUX */
 966
/* Global mutex: taken by qemu_mutex_lock_iothread() and used as the mutex
 * argument of every qemu_cond_wait() in this file.
 */
static QemuMutex qemu_global_mutex;

/* The thread that called qemu_init_cpu_loop() */
static QemuThread io_thread;

/* cpu creation: signalled whenever a vCPU's 'created' flag is changed */
static QemuCond qemu_cpu_cond;
/* broadcast when a vCPU becomes stopped; pause_all_vcpus() waits on it */
static QemuCond qemu_pause_cond;
 975
 976 void qemu_init_cpu_loop(void)
 977 {
 978     qemu_init_sigbus();
 979     qemu_cond_init(&qemu_cpu_cond);
 980     qemu_cond_init(&qemu_pause_cond);
 981     qemu_mutex_init(&qemu_global_mutex);
 982
 983     qemu_thread_get_self(&io_thread);
 984 }
 985
 986 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
 987 {
 988     do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
 989 }
 990
 991 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
 992 {
 993     if (kvm_destroy_vcpu(cpu) < 0) {
 994         error_report("kvm_destroy_vcpu failed");
 995         exit(EXIT_FAILURE);
 996     }
 997 }
 998
 999 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1000 {
1001 }
1002
1003 static void qemu_wait_io_event_common(CPUState *cpu)
1004 {
1005     atomic_mb_set(&cpu->thread_kicked, false);
1006     if (cpu->stop) {
1007         cpu->stop = false;
1008         cpu->stopped = true;
1009         qemu_cond_broadcast(&qemu_pause_cond);
1010     }
1011     process_queued_cpu_work(cpu);
1012 }
1013
1014 static bool qemu_tcg_should_sleep(CPUState *cpu)
1015 {
1016     if (mttcg_enabled) {
1017         return cpu_thread_is_idle(cpu);
1018     } else {
1019         return all_cpu_threads_idle();
1020     }
1021 }
1022
1023 static void qemu_tcg_wait_io_event(CPUState *cpu)
1024 {
1025     while (qemu_tcg_should_sleep(cpu)) {
1026         stop_tcg_kick_timer();
1027         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1028     }
1029
1030     start_tcg_kick_timer();
1031
1032     qemu_wait_io_event_common(cpu);
1033 }
1034
1035 static void qemu_kvm_wait_io_event(CPUState *cpu)
1036 {
1037     while (cpu_thread_is_idle(cpu)) {
1038         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1039     }
1040
1041     qemu_wait_io_event_common(cpu);
1042 }
1043
1044 static void *qemu_kvm_cpu_thread_fn(void *arg)
1045 {
1046     CPUState *cpu = arg;
1047     int r;
1048
1049     rcu_register_thread();
1050
1051     qemu_mutex_lock_iothread();
1052     qemu_thread_get_self(cpu->thread);
1053     cpu->thread_id = qemu_get_thread_id();
1054     cpu->can_do_io = 1;
1055     current_cpu = cpu;
1056
1057     r = kvm_init_vcpu(cpu);
1058     if (r < 0) {
1059         fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
1060         exit(1);
1061     }
1062
1063     kvm_init_cpu_signals(cpu);
1064
1065     /* signal CPU creation */
1066     cpu->created = true;
1067     qemu_cond_signal(&qemu_cpu_cond);
1068
1069     do {
1070         if (cpu_can_run(cpu)) {
1071             r = kvm_cpu_exec(cpu);
1072             if (r == EXCP_DEBUG) {
1073                 cpu_handle_guest_debug(cpu);
1074             }
1075         }
1076         qemu_kvm_wait_io_event(cpu);
1077     } while (!cpu->unplug || cpu_can_run(cpu));
1078
1079     qemu_kvm_destroy_vcpu(cpu);
1080     cpu->created = false;
1081     qemu_cond_signal(&qemu_cpu_cond);
1082     qemu_mutex_unlock_iothread();
1083     return NULL;
1084 }
1085
1086 static void *qemu_dummy_cpu_thread_fn(void *arg)
1087 {
1088 #ifdef _WIN32
1089     fprintf(stderr, "qtest is not supported under Windows\n");
1090     exit(1);
1091 #else
1092     CPUState *cpu = arg;
1093     sigset_t waitset;
1094     int r;
1095
1096     rcu_register_thread();
1097
1098     qemu_mutex_lock_iothread();
1099     qemu_thread_get_self(cpu->thread);
1100     cpu->thread_id = qemu_get_thread_id();
1101     cpu->can_do_io = 1;
1102     current_cpu = cpu;
1103
1104     sigemptyset(&waitset);
1105     sigaddset(&waitset, SIG_IPI);
1106
1107     /* signal CPU creation */
1108     cpu->created = true;
1109     qemu_cond_signal(&qemu_cpu_cond);
1110
1111     while (1) {
1112         qemu_mutex_unlock_iothread();
1113         do {
1114             int sig;
1115             r = sigwait(&waitset, &sig);
1116         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1117         if (r == -1) {
1118             perror("sigwait");
1119             exit(1);
1120         }
1121         qemu_mutex_lock_iothread();
1122         qemu_wait_io_event_common(cpu);
1123     }
1124
1125     return NULL;
1126 #endif
1127 }
1128
1129 static int64_t tcg_get_icount_limit(void)
1130 {
1131     int64_t deadline;
1132
1133     if (replay_mode != REPLAY_MODE_PLAY) {
1134         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1135
1136         /* Maintain prior (possibly buggy) behaviour where if no deadline
1137          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1138          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1139          * nanoseconds.
1140          */
1141         if ((deadline < 0) || (deadline > INT32_MAX)) {
1142             deadline = INT32_MAX;
1143         }
1144
1145         return qemu_icount_round(deadline);
1146     } else {
1147         return replay_get_instructions();
1148     }
1149 }
1150
1151 static void handle_icount_deadline(void)
1152 {
1153     if (use_icount) {
1154         int64_t deadline =
1155             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1156
1157         if (deadline == 0) {
1158             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1159         }
1160     }
1161 }
1162
1163 static int tcg_cpu_exec(CPUState *cpu)
1164 {
1165     int ret;
1166 #ifdef CONFIG_PROFILER
1167     int64_t ti;
1168 #endif
1169
1170 #ifdef CONFIG_PROFILER
1171     ti = profile_getclock();
1172 #endif
1173     if (use_icount) {
1174         int64_t count;
1175         int decr;
1176         timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1177                                     + cpu->icount_extra);
1178         cpu->icount_decr.u16.low = 0;
1179         cpu->icount_extra = 0;
1180         count = tcg_get_icount_limit();
1181         timers_state.qemu_icount += count;
1182         decr = (count > 0xffff) ? 0xffff : count;
1183         count -= decr;
1184         cpu->icount_decr.u16.low = decr;
1185         cpu->icount_extra = count;
1186     }
1187     qemu_mutex_unlock_iothread();
1188     cpu_exec_start(cpu);
1189     ret = cpu_exec(cpu);
1190     cpu_exec_end(cpu);
1191     qemu_mutex_lock_iothread();
1192 #ifdef CONFIG_PROFILER
1193     tcg_time += profile_getclock() - ti;
1194 #endif
1195     if (use_icount) {
1196         /* Fold pending instructions back into the
1197            instruction counter, and clear the interrupt flag.  */
1198         timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1199                         + cpu->icount_extra);
1200         cpu->icount_decr.u32 = 0;
1201         cpu->icount_extra = 0;
1202         replay_account_executed_instructions();
1203     }
1204     return ret;
1205 }
1206
1207 /* Destroy any remaining vCPUs which have been unplugged and have
1208  * finished running
1209  */
1210 static void deal_with_unplugged_cpus(void)
1211 {
1212     CPUState *cpu;
1213
1214     CPU_FOREACH(cpu) {
1215         if (cpu->unplug && !cpu_can_run(cpu)) {
1216             qemu_tcg_destroy_vcpu(cpu);
1217             cpu->created = false;
1218             qemu_cond_signal(&qemu_cpu_cond);
1219             break;
1220         }
1221     }
1222 }
1223
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 */

/* Thread body that executes all vCPUs in round-robin fashion.  @arg is
 * the CPUState the thread was created for.  The main loop below has no
 * exit, so the trailing return is never reached.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    /* every vCPU on the list is marked as created and reports the id of
     * this thread */
    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
        cpu->can_do_io = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* restart from the head once the end of the list was reached */
        if (!cpu) {
            cpu = first_cpu;
        }

        /* run vCPUs one after another until the list ends or the current
         * vCPU has queued work or an exit request pending */
        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            /* the virtual clock is disabled while this vCPU single-steps
             * with SSTEP_NOTIMER set */
            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;
                r = tcg_cpu_exec(cpu);
                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    /* the atomic step runs without the global mutex */
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* a stop request ends this pass; a vCPU that is also being
                 * unplugged is stepped over first, so the wait below uses
                 * its successor */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        handle_icount_deadline();

        /* cpu is NULL when the pass walked off the end of the list; the
         * first vCPU is used for the wait in that case */
        qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
        deal_with_unplugged_cpus();
    }

    return NULL;
}
1320
1321 static void *qemu_hax_cpu_thread_fn(void *arg)
1322 {
1323     CPUState *cpu = arg;
1324     int r;
1325     qemu_thread_get_self(cpu->thread);
1326     qemu_mutex_lock(&qemu_global_mutex);
1327
1328     cpu->thread_id = qemu_get_thread_id();
1329     cpu->created = true;
1330     cpu->halted = 0;
1331     current_cpu = cpu;
1332
1333     hax_init_vcpu(cpu);
1334     qemu_cond_signal(&qemu_cpu_cond);
1335
1336     while (1) {
1337         if (cpu_can_run(cpu)) {
1338             r = hax_smp_cpu_exec(cpu);
1339             if (r == EXCP_DEBUG) {
1340                 cpu_handle_guest_debug(cpu);
1341             }
1342         }
1343
1344         while (cpu_thread_is_idle(cpu)) {
1345             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1346         }
1347 #ifdef _WIN32
1348         SleepEx(0, TRUE);
1349 #endif
1350         qemu_wait_io_event_common(cpu);
1351     }
1352     return NULL;
1353 }
1354
1355 #ifdef _WIN32
1356 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1357 {
1358 }
1359 #endif
1360
1361 /* Multi-threaded TCG
1362  *
1363  * In the multi-threaded case each vCPU has its own thread. The TLS
1364  * variable current_cpu can be used deep in the code to find the
1365  * current CPUState for a given thread.
1366  */
1367
1368 static void *qemu_tcg_cpu_thread_fn(void *arg)
1369 {
1370     CPUState *cpu = arg;
1371
1372     rcu_register_thread();
1373
1374     qemu_mutex_lock_iothread();
1375     qemu_thread_get_self(cpu->thread);
1376
1377     cpu->thread_id = qemu_get_thread_id();
1378     cpu->created = true;
1379     cpu->can_do_io = 1;
1380     current_cpu = cpu;
1381     qemu_cond_signal(&qemu_cpu_cond);
1382
1383     /* process any pending work */
1384     cpu->exit_request = 1;
1385
1386     while (1) {
1387         if (cpu_can_run(cpu)) {
1388             int r;
1389             r = tcg_cpu_exec(cpu);
1390             switch (r) {
1391             case EXCP_DEBUG:
1392                 cpu_handle_guest_debug(cpu);
1393                 break;
1394             case EXCP_HALTED:
1395                 /* during start-up the vCPU is reset and the thread is
1396                  * kicked several times. If we don't ensure we go back
1397                  * to sleep in the halted state we won't cleanly
1398                  * start-up when the vCPU is enabled.
1399                  *
1400                  * cpu->halted should ensure we sleep in wait_io_event
1401                  */
1402                 g_assert(cpu->halted);
1403                 break;
1404             case EXCP_ATOMIC:
1405                 qemu_mutex_unlock_iothread();
1406                 cpu_exec_step_atomic(cpu);
1407                 qemu_mutex_lock_iothread();
1408             default:
1409                 /* Ignore everything else? */
1410                 break;
1411             }
1412         }
1413
1414         handle_icount_deadline();
1415
1416         atomic_mb_set(&cpu->exit_request, 0);
1417         qemu_tcg_wait_io_event(cpu);
1418     }
1419
1420     return NULL;
1421 }
1422
1423 static void qemu_cpu_kick_thread(CPUState *cpu)
1424 {
1425 #ifndef _WIN32
1426     int err;
1427
1428     if (cpu->thread_kicked) {
1429         return;
1430     }
1431     cpu->thread_kicked = true;
1432     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1433     if (err) {
1434         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1435         exit(1);
1436     }
1437 #else /* _WIN32 */
1438     if (!qemu_cpu_is_self(cpu)) {
1439         if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1440             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1441                     __func__, GetLastError());
1442             exit(1);
1443         }
1444     }
1445 #endif
1446 }
1447
1448 void qemu_cpu_kick(CPUState *cpu)
1449 {
1450     qemu_cond_broadcast(cpu->halt_cond);
1451     if (tcg_enabled()) {
1452         cpu_exit(cpu);
1453         /* NOP unless doing single-thread RR */
1454         qemu_cpu_kick_rr_cpu();
1455     } else {
1456         if (hax_enabled()) {
1457             /*
1458              * FIXME: race condition with the exit_request check in
1459              * hax_vcpu_hax_exec
1460              */
1461             cpu->exit_request = 1;
1462         }
1463         qemu_cpu_kick_thread(cpu);
1464     }
1465 }
1466
1467 void qemu_cpu_kick_self(void)
1468 {
1469     assert(current_cpu);
1470     qemu_cpu_kick_thread(current_cpu);
1471 }
1472
1473 bool qemu_cpu_is_self(CPUState *cpu)
1474 {
1475     return qemu_thread_is_self(cpu->thread);
1476 }
1477
1478 bool qemu_in_vcpu_thread(void)
1479 {
1480     return current_cpu && qemu_cpu_is_self(current_cpu);
1481 }
1482
/* Per-thread flag maintained by qemu_mutex_lock_iothread() and
 * qemu_mutex_unlock_iothread(); it starts out false.
 */
static __thread bool iothread_locked;

/* Report whether the calling thread took the global mutex through
 * qemu_mutex_lock_iothread() and has not released it yet.
 */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1489
1490 void qemu_mutex_lock_iothread(void)
1491 {
1492     g_assert(!qemu_mutex_iothread_locked());
1493     qemu_mutex_lock(&qemu_global_mutex);
1494     iothread_locked = true;
1495 }
1496
1497 void qemu_mutex_unlock_iothread(void)
1498 {
1499     g_assert(qemu_mutex_iothread_locked());
1500     iothread_locked = false;
1501     qemu_mutex_unlock(&qemu_global_mutex);
1502 }
1503
1504 static bool all_vcpus_paused(void)
1505 {
1506     CPUState *cpu;
1507
1508     CPU_FOREACH(cpu) {
1509         if (!cpu->stopped) {
1510             return false;
1511         }
1512     }
1513
1514     return true;
1515 }
1516
1517 void pause_all_vcpus(void)
1518 {
1519     CPUState *cpu;
1520
1521     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1522     CPU_FOREACH(cpu) {
1523         cpu->stop = true;
1524         qemu_cpu_kick(cpu);
1525     }
1526
1527     if (qemu_in_vcpu_thread()) {
1528         cpu_stop_current();
1529     }
1530
1531     while (!all_vcpus_paused()) {
1532         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1533         CPU_FOREACH(cpu) {
1534             qemu_cpu_kick(cpu);
1535         }
1536     }
1537 }
1538
1539 void cpu_resume(CPUState *cpu)
1540 {
1541     cpu->stop = false;
1542     cpu->stopped = false;
1543     qemu_cpu_kick(cpu);
1544 }
1545
1546 void resume_all_vcpus(void)
1547 {
1548     CPUState *cpu;
1549
1550     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1551     CPU_FOREACH(cpu) {
1552         cpu_resume(cpu);
1553     }
1554 }
1555
1556 void cpu_remove(CPUState *cpu)
1557 {
1558     cpu->stop = true;
1559     cpu->unplug = true;
1560     qemu_cpu_kick(cpu);
1561 }
1562
1563 void cpu_remove_sync(CPUState *cpu)
1564 {
1565     cpu_remove(cpu);
1566     while (cpu->created) {
1567         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1568     }
1569 }
1570
/* Size of the temporary buffers used to build vCPU thread names */
1572 #define VCPU_THREAD_NAME_SIZE 16
1573
1574 static void qemu_tcg_init_vcpu(CPUState *cpu)
1575 {
1576     char thread_name[VCPU_THREAD_NAME_SIZE];
1577     static QemuCond *single_tcg_halt_cond;
1578     static QemuThread *single_tcg_cpu_thread;
1579
1580     if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1581         cpu->thread = g_malloc0(sizeof(QemuThread));
1582         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1583         qemu_cond_init(cpu->halt_cond);
1584
1585         if (qemu_tcg_mttcg_enabled()) {
1586             /* create a thread per vCPU with TCG (MTTCG) */
1587             parallel_cpus = true;
1588             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1589                  cpu->cpu_index);
1590
1591             qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1592                                cpu, QEMU_THREAD_JOINABLE);
1593
1594         } else {
1595             /* share a single thread for all cpus with TCG */
1596             snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1597             qemu_thread_create(cpu->thread, thread_name,
1598                                qemu_tcg_rr_cpu_thread_fn,
1599                                cpu, QEMU_THREAD_JOINABLE);
1600
1601             single_tcg_halt_cond = cpu->halt_cond;
1602             single_tcg_cpu_thread = cpu->thread;
1603         }
1604 #ifdef _WIN32
1605         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1606 #endif
1607         while (!cpu->created) {
1608             qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1609         }
1610     } else {
1611         /* For non-MTTCG cases we share the thread */
1612         cpu->thread = single_tcg_cpu_thread;
1613         cpu->halt_cond = single_tcg_halt_cond;
1614     }
1615 }
1616
1617 static void qemu_hax_start_vcpu(CPUState *cpu)
1618 {
1619     char thread_name[VCPU_THREAD_NAME_SIZE];
1620
1621     cpu->thread = g_malloc0(sizeof(QemuThread));
1622     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1623     qemu_cond_init(cpu->halt_cond);
1624
1625     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1626              cpu->cpu_index);
1627     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1628                        cpu, QEMU_THREAD_JOINABLE);
1629 #ifdef _WIN32
1630     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1631 #endif
1632     while (!cpu->created) {
1633         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1634     }
1635 }
1636
1637 static void qemu_kvm_start_vcpu(CPUState *cpu)
1638 {
1639     char thread_name[VCPU_THREAD_NAME_SIZE];
1640
1641     cpu->thread = g_malloc0(sizeof(QemuThread));
1642     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1643     qemu_cond_init(cpu->halt_cond);
1644     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1645              cpu->cpu_index);
1646     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1647                        cpu, QEMU_THREAD_JOINABLE);
1648     while (!cpu->created) {
1649         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1650     }
1651 }
1652
1653 static void qemu_dummy_start_vcpu(CPUState *cpu)
1654 {
1655     char thread_name[VCPU_THREAD_NAME_SIZE];
1656
1657     cpu->thread = g_malloc0(sizeof(QemuThread));
1658     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1659     qemu_cond_init(cpu->halt_cond);
1660     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1661              cpu->cpu_index);
1662     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1663                        QEMU_THREAD_JOINABLE);
1664     while (!cpu->created) {
1665         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1666     }
1667 }
1668
1669 void qemu_init_vcpu(CPUState *cpu)
1670 {
1671     cpu->nr_cores = smp_cores;
1672     cpu->nr_threads = smp_threads;
1673     cpu->stopped = true;
1674
1675     if (!cpu->as) {
1676         /* If the target cpu hasn't set up any address spaces itself,
1677          * give it the default one.
1678          */
1679         AddressSpace *as = address_space_init_shareable(cpu->memory,
1680                                                         "cpu-memory");
1681         cpu->num_ases = 1;
1682         cpu_address_space_init(cpu, as, 0);
1683     }
1684
1685     if (kvm_enabled()) {
1686         qemu_kvm_start_vcpu(cpu);
1687     } else if (hax_enabled()) {
1688         qemu_hax_start_vcpu(cpu);
1689     } else if (tcg_enabled()) {
1690         qemu_tcg_init_vcpu(cpu);
1691     } else {
1692         qemu_dummy_start_vcpu(cpu);
1693     }
1694 }
1695
1696 void cpu_stop_current(void)
1697 {
1698     if (current_cpu) {
1699         current_cpu->stop = false;
1700         current_cpu->stopped = true;
1701         cpu_exit(current_cpu);
1702         qemu_cond_broadcast(&qemu_pause_cond);
1703     }
1704 }
1705
1706 int vm_stop(RunState state)
1707 {
1708     if (qemu_in_vcpu_thread()) {
1709         qemu_system_vmstop_request_prepare();
1710         qemu_system_vmstop_request(state);
1711         /*
1712          * FIXME: should not return to device code in case
1713          * vm_stop() has been requested.
1714          */
1715         cpu_stop_current();
1716         return 0;
1717     }
1718
1719     return do_vm_stop(state);
1720 }
1721
1722 /**
1723  * Prepare for (re)starting the VM.
1724  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
1725  * running or in case of an error condition), 0 otherwise.
1726  */
1727 int vm_prepare_start(void)
1728 {
1729     RunState requested;
1730     int res = 0;
1731
1732     qemu_vmstop_requested(&requested);
1733     if (runstate_is_running() && requested == RUN_STATE__MAX) {
1734         return -1;
1735     }
1736
1737     /* Ensure that a STOP/RESUME pair of events is emitted if a
1738      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
1739      * example, according to documentation is always followed by
1740      * the STOP event.
1741      */
1742     if (runstate_is_running()) {
1743         qapi_event_send_stop(&error_abort);
1744         res = -1;
1745     } else {
1746         replay_enable_events();
1747         cpu_enable_ticks();
1748         runstate_set(RUN_STATE_RUNNING);
1749         vm_state_notify(1, RUN_STATE_RUNNING);
1750     }
1751
1752     /* We are sending this now, but the CPUs will be resumed shortly later */
1753     qapi_event_send_resume(&error_abort);
1754     return res;
1755 }
1756
/* Start the VM: the vCPUs are resumed only when vm_prepare_start()
 * returns 0.
 */
void vm_start(void)
{
    if (vm_prepare_start() != 0) {
        return;
    }
    resume_all_vcpus();
}
1763
1764 /* does a state transition even if the VM is already stopped,
1765    current state is forgotten forever */
1766 int vm_stop_force_state(RunState state)
1767 {
1768     if (runstate_is_running()) {
1769         return vm_stop(state);
1770     } else {
1771         runstate_set(state);
1772
1773         bdrv_drain_all();
1774         /* Make sure to return an error if the flush in a previous vm_stop()
1775          * failed. */
1776         return bdrv_flush_all();
1777     }
1778 }
1779
1780 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1781 {
1782     /* XXX: implement xxx_cpu_list for targets that still miss it */
1783 #if defined(cpu_list)
1784     cpu_list(f, cpu_fprintf);
1785 #endif
1786 }
1787
1788 CpuInfoList *qmp_query_cpus(Error **errp)
1789 {
1790     CpuInfoList *head = NULL, *cur_item = NULL;
1791     CPUState *cpu;
1792
1793     CPU_FOREACH(cpu) {
1794         CpuInfoList *info;
1795 #if defined(TARGET_I386)
1796         X86CPU *x86_cpu = X86_CPU(cpu);
1797         CPUX86State *env = &x86_cpu->env;
1798 #elif defined(TARGET_PPC)
1799         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1800         CPUPPCState *env = &ppc_cpu->env;
1801 #elif defined(TARGET_SPARC)
1802         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1803         CPUSPARCState *env = &sparc_cpu->env;
1804 #elif defined(TARGET_MIPS)
1805         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1806         CPUMIPSState *env = &mips_cpu->env;
1807 #elif defined(TARGET_TRICORE)
1808         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1809         CPUTriCoreState *env = &tricore_cpu->env;
1810 #endif
1811
1812         cpu_synchronize_state(cpu);
1813
1814         info = g_malloc0(sizeof(*info));
1815         info->value = g_malloc0(sizeof(*info->value));
1816         info->value->CPU = cpu->cpu_index;
1817         info->value->current = (cpu == first_cpu);
1818         info->value->halted = cpu->halted;
1819         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1820         info->value->thread_id = cpu->thread_id;
1821 #if defined(TARGET_I386)
1822         info->value->arch = CPU_INFO_ARCH_X86;
1823         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1824 #elif defined(TARGET_PPC)
1825         info->value->arch = CPU_INFO_ARCH_PPC;
1826         info->value->u.ppc.nip = env->nip;
1827 #elif defined(TARGET_SPARC)
1828         info->value->arch = CPU_INFO_ARCH_SPARC;
1829         info->value->u.q_sparc.pc = env->pc;
1830         info->value->u.q_sparc.npc = env->npc;
1831 #elif defined(TARGET_MIPS)
1832         info->value->arch = CPU_INFO_ARCH_MIPS;
1833         info->value->u.q_mips.PC = env->active_tc.PC;
1834 #elif defined(TARGET_TRICORE)
1835         info->value->arch = CPU_INFO_ARCH_TRICORE;
1836         info->value->u.tricore.PC = env->PC;
1837 #else
1838         info->value->arch = CPU_INFO_ARCH_OTHER;
1839 #endif
1840
1841         /* XXX: waiting for the qapi to support GSList */
1842         if (!cur_item) {
1843             head = cur_item = info;
1844         } else {
1845             cur_item->next = info;
1846             cur_item = info;
1847         }
1848     }
1849
1850     return head;
1851 }
1852
1853 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1854                  bool has_cpu, int64_t cpu_index, Error **errp)
1855 {
1856     FILE *f;
1857     uint32_t l;
1858     CPUState *cpu;
1859     uint8_t buf[1024];
1860     int64_t orig_addr = addr, orig_size = size;
1861
1862     if (!has_cpu) {
1863         cpu_index = 0;
1864     }
1865
1866     cpu = qemu_get_cpu(cpu_index);
1867     if (cpu == NULL) {
1868         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1869                    "a CPU number");
1870         return;
1871     }
1872
1873     f = fopen(filename, "wb");
1874     if (!f) {
1875         error_setg_file_open(errp, errno, filename);
1876         return;
1877     }
1878
1879     while (size != 0) {
1880         l = sizeof(buf);
1881         if (l > size)
1882             l = size;
1883         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1884             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1885                              " specified", orig_addr, orig_size);
1886             goto exit;
1887         }
1888         if (fwrite(buf, 1, l, f) != l) {
1889             error_setg(errp, QERR_IO_ERROR);
1890             goto exit;
1891         }
1892         addr += l;
1893         size -= l;
1894     }
1895
1896 exit:
1897     fclose(f);
1898 }
1899
1900 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1901                   Error **errp)
1902 {
1903     FILE *f;
1904     uint32_t l;
1905     uint8_t buf[1024];
1906
1907     f = fopen(filename, "wb");
1908     if (!f) {
1909         error_setg_file_open(errp, errno, filename);
1910         return;
1911     }
1912
1913     while (size != 0) {
1914         l = sizeof(buf);
1915         if (l > size)
1916             l = size;
1917         cpu_physical_memory_read(addr, buf, l);
1918         if (fwrite(buf, 1, l, f) != l) {
1919             error_setg(errp, QERR_IO_ERROR);
1920             goto exit;
1921         }
1922         addr += l;
1923         size -= l;
1924     }
1925
1926 exit:
1927     fclose(f);
1928 }
1929
1930 void qmp_inject_nmi(Error **errp)
1931 {
1932     nmi_monitor_handle(monitor_get_cpu_index(), errp);
1933 }
1934
1935 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1936 {
1937     if (!use_icount) {
1938         return;
1939     }
1940
1941     cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
1942                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
1943     if (icount_align_option) {
1944         cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
1945         cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
1946     } else {
1947         cpu_fprintf(f, "Max guest delay     NA\n");
1948         cpu_fprintf(f, "Max guest advance   NA\n");
1949     }
1950 }