1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
28 #include "qemu/config-file.h"
29 #include "cpu.h"
30 #include "monitor/monitor.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qemu/error-report.h"
33 #include "sysemu/sysemu.h"
34 #include "sysemu/block-backend.h"
35 #include "exec/gdbstub.h"
36 #include "sysemu/dma.h"
37 #include "sysemu/hw_accel.h"
38 #include "sysemu/kvm.h"
39 #include "sysemu/hax.h"
40 #include "qmp-commands.h"
41 #include "exec/exec-all.h"
42
43 #include "qemu/thread.h"
44 #include "sysemu/cpus.h"
45 #include "sysemu/qtest.h"
46 #include "qemu/main-loop.h"
47 #include "qemu/bitmap.h"
48 #include "qemu/seqlock.h"
49 #include "tcg.h"
50 #include "qapi-event.h"
51 #include "hw/nmi.h"
52 #include "sysemu/replay.h"
53
54 #ifndef _WIN32
55 #include "qemu/compatfd.h"
56 #endif
57
58 #ifdef CONFIG_LINUX
59
60 #include <sys/prctl.h>
61
62 #ifndef PR_MCE_KILL
63 #define PR_MCE_KILL 33
64 #endif
65
66 #ifndef PR_MCE_KILL_SET
67 #define PR_MCE_KILL_SET 1
68 #endif
69
70 #ifndef PR_MCE_KILL_EARLY
71 #define PR_MCE_KILL_EARLY 1
72 #endif
73
74 #endif /* CONFIG_LINUX */
75
76 int64_t max_delay;
77 int64_t max_advance;
78
79 /* vcpu throttling controls */
80 static QEMUTimer *throttle_timer;
81 static unsigned int throttle_percentage;
82
83 #define CPU_THROTTLE_PCT_MIN 1
84 #define CPU_THROTTLE_PCT_MAX 99
85 #define CPU_THROTTLE_TIMESLICE_NS 10000000
86
87 bool cpu_is_stopped(CPUState *cpu)
88 {
89 return cpu->stopped || !runstate_is_running();
90 }
91
92 static bool cpu_thread_is_idle(CPUState *cpu)
93 {
94 if (cpu->stop || cpu->queued_work_first) {
95 return false;
96 }
97 if (cpu_is_stopped(cpu)) {
98 return true;
99 }
100 if (!cpu->halted || cpu_has_work(cpu) ||
101 kvm_halt_in_kernel()) {
102 return false;
103 }
104 return true;
105 }
106
107 static bool all_cpu_threads_idle(void)
108 {
109 CPUState *cpu;
110
111 CPU_FOREACH(cpu) {
112 if (!cpu_thread_is_idle(cpu)) {
113 return false;
114 }
115 }
116 return true;
117 }
118
119 /***********************************************************/
120 /* guest cycle counter */
121
122 /* Protected by TimersState seqlock */
123
124 static bool icount_sleep = true;
125 static int64_t vm_clock_warp_start = -1;
126 /* Conversion factor from emulated instructions to virtual clock ticks. */
127 static int icount_time_shift;
128 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
129 #define MAX_ICOUNT_SHIFT 10
130
131 static QEMUTimer *icount_rt_timer;
132 static QEMUTimer *icount_vm_timer;
133 static QEMUTimer *icount_warp_timer;
134
135 typedef struct TimersState {
136 /* Protected by BQL. */
137 int64_t cpu_ticks_prev;
138 int64_t cpu_ticks_offset;
139
140 /* cpu_clock_offset can be read out of BQL, so protect it with
141 * this lock.
142 */
143 QemuSeqLock vm_clock_seqlock;
144 int64_t cpu_clock_offset;
145 int32_t cpu_ticks_enabled;
146 int64_t dummy;
147
148 /* Compensate for varying guest execution speed. */
149 int64_t qemu_icount_bias;
150 /* Only written by TCG thread */
151 int64_t qemu_icount;
152 } TimersState;
153
154 static TimersState timers_state;
155 bool mttcg_enabled;
156
157 /*
158 * We default to false if we know other options have been enabled
159  * which are currently incompatible with MTTCG. Otherwise, once each
160  * guest (target) has been updated to support:
161  *   - atomic instructions
162  *   - memory ordering primitives (barriers)
163  * it can set the appropriate CONFIG flags in ${target}-softmmu.mak
164  *
165  * Once a guest architecture has been converted to the new primitives
166  * there are two remaining limitations to check:
167  *
168  * - The guest can't be oversized (e.g. a 64 bit guest on a 32 bit host)
169  * - The host must have a memory model at least as strong as the guest's
170  *
171  * It may be possible in future to support strong guests on weak hosts
172  * but that would require tagging all loads/stores in a guest with their
173  * implicit memory order requirements, which would likely slow things
174 * down a lot.
175 */
176
177 static bool check_tcg_memory_orders_compatible(void)
178 {
179 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
180 return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
181 #else
182 return false;
183 #endif
184 }
185
186 static bool default_mttcg_enabled(void)
187 {
188 QemuOpts *icount_opts = qemu_find_opts_singleton("icount");
189 const char *rr = qemu_opt_get(icount_opts, "rr");
190
191 if (rr || TCG_OVERSIZED_GUEST) {
192 return false;
193 } else {
194 #ifdef TARGET_SUPPORTS_MTTCG
195 return check_tcg_memory_orders_compatible();
196 #else
197 return false;
198 #endif
199 }
200 }
201
202 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
203 {
204 const char *t = qemu_opt_get(opts, "thread");
205 if (t) {
206 if (strcmp(t, "multi") == 0) {
207 if (TCG_OVERSIZED_GUEST) {
208                 error_setg(errp, "No MTTCG when guest word size > host's");
209 } else {
210 if (!check_tcg_memory_orders_compatible()) {
211 error_report("Guest expects a stronger memory ordering "
212 "than the host provides");
213 error_printf("This may cause strange/hard to debug errors");
214 }
215 mttcg_enabled = true;
216 }
217 } else if (strcmp(t, "single") == 0) {
218 mttcg_enabled = false;
219 } else {
220 error_setg(errp, "Invalid 'thread' setting %s", t);
221 }
222 } else {
223 mttcg_enabled = default_mttcg_enabled();
224 }
225 }
226
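/* Return the raw instruction counter: the count accounted so far by the
 * TCG thread, minus the instructions the current CPU has budgeted
 * (icount_decr.u16.low + icount_extra) but not yet executed. Exits with
 * "Bad icount read" if called while the CPU cannot do I/O.
 */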
227 int64_t cpu_get_icount_raw(void)
228 {
229 int64_t icount;
230 CPUState *cpu = current_cpu;
231
232 icount = timers_state.qemu_icount;
233 if (cpu) {
234 if (!cpu->can_do_io) {
235 fprintf(stderr, "Bad icount read\n");
236 exit(1);
237 }
238 icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
239 }
240 return icount;
241 }
242
243 /* Return the virtual CPU time, based on the instruction counter. */
244 static int64_t cpu_get_icount_locked(void)
245 {
246 int64_t icount = cpu_get_icount_raw();
247 return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
248 }
249
250 int64_t cpu_get_icount(void)
251 {
252 int64_t icount;
253 unsigned start;
254
255 do {
256 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
257 icount = cpu_get_icount_locked();
258 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
259
260 return icount;
261 }
262
263 int64_t cpu_icount_to_ns(int64_t icount)
264 {
265 return icount << icount_time_shift;
266 }
267
268 /* Return the time elapsed in the VM between vm_start and vm_stop.  Unless
269 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
270 * counter.
271 *
272 * Caller must hold the BQL
273 */
274 int64_t cpu_get_ticks(void)
275 {
276 int64_t ticks;
277
278 if (use_icount) {
279 return cpu_get_icount();
280 }
281
282 ticks = timers_state.cpu_ticks_offset;
283 if (timers_state.cpu_ticks_enabled) {
284 ticks += cpu_get_host_ticks();
285 }
286
287 if (timers_state.cpu_ticks_prev > ticks) {
288         /* Note: non-increasing ticks may happen if the host uses
289 software suspend */
290 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
291 ticks = timers_state.cpu_ticks_prev;
292 }
293
294 timers_state.cpu_ticks_prev = ticks;
295 return ticks;
296 }
297
298 static int64_t cpu_get_clock_locked(void)
299 {
300 int64_t time;
301
302 time = timers_state.cpu_clock_offset;
303 if (timers_state.cpu_ticks_enabled) {
304 time += get_clock();
305 }
306
307 return time;
308 }
309
310 /* Return the monotonic time elapsed in VM, i.e.,
311 * the time between vm_start and vm_stop
312 */
313 int64_t cpu_get_clock(void)
314 {
315 int64_t ti;
316 unsigned start;
317
318 do {
319 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
320 ti = cpu_get_clock_locked();
321 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
322
323 return ti;
324 }
325
326 /* enable cpu_get_ticks()
327 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
328 */
329 void cpu_enable_ticks(void)
330 {
331     /* Here, the real thing protected by the seqlock is cpu_clock_offset. */
332 seqlock_write_begin(&timers_state.vm_clock_seqlock);
333 if (!timers_state.cpu_ticks_enabled) {
334 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
335 timers_state.cpu_clock_offset -= get_clock();
336 timers_state.cpu_ticks_enabled = 1;
337 }
338 seqlock_write_end(&timers_state.vm_clock_seqlock);
339 }
340
341 /* disable cpu_get_ticks(): the clock is stopped. You must not call
342 * cpu_get_ticks() after that.
343 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
344 */
345 void cpu_disable_ticks(void)
346 {
348     /* Here, the real thing protected by the seqlock is cpu_clock_offset. */
348 seqlock_write_begin(&timers_state.vm_clock_seqlock);
349 if (timers_state.cpu_ticks_enabled) {
350 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
351 timers_state.cpu_clock_offset = cpu_get_clock_locked();
352 timers_state.cpu_ticks_enabled = 0;
353 }
354 seqlock_write_end(&timers_state.vm_clock_seqlock);
355 }
356
357 /* Correlation between real and virtual time is always going to be
358    fairly approximate, so ignore small variations.
359    When the guest is idle, real and virtual time will be aligned in
360 the IO wait loop. */
361 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
362
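/* Periodically adjust icount_time_shift so that virtual time (derived from
 * the instruction count) tracks real time: slow the virtual clock down when
 * the guest gets too far ahead and speed it up when it falls behind.
 */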
363 static void icount_adjust(void)
364 {
365 int64_t cur_time;
366 int64_t cur_icount;
367 int64_t delta;
368
369 /* Protected by TimersState mutex. */
370 static int64_t last_delta;
371
372 /* If the VM is not running, then do nothing. */
373 if (!runstate_is_running()) {
374 return;
375 }
376
377 seqlock_write_begin(&timers_state.vm_clock_seqlock);
378 cur_time = cpu_get_clock_locked();
379 cur_icount = cpu_get_icount_locked();
380
381 delta = cur_icount - cur_time;
382 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
383 if (delta > 0
384 && last_delta + ICOUNT_WOBBLE < delta * 2
385 && icount_time_shift > 0) {
386 /* The guest is getting too far ahead. Slow time down. */
387 icount_time_shift--;
388 }
389 if (delta < 0
390 && last_delta - ICOUNT_WOBBLE > delta * 2
391 && icount_time_shift < MAX_ICOUNT_SHIFT) {
392 /* The guest is getting too far behind. Speed time up. */
393 icount_time_shift++;
394 }
395 last_delta = delta;
396 timers_state.qemu_icount_bias = cur_icount
397 - (timers_state.qemu_icount << icount_time_shift);
398 seqlock_write_end(&timers_state.vm_clock_seqlock);
399 }
400
401 static void icount_adjust_rt(void *opaque)
402 {
403 timer_mod(icount_rt_timer,
404 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
405 icount_adjust();
406 }
407
408 static void icount_adjust_vm(void *opaque)
409 {
410 timer_mod(icount_vm_timer,
411 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
412 NANOSECONDS_PER_SECOND / 10);
413 icount_adjust();
414 }
415
416 static int64_t qemu_icount_round(int64_t count)
417 {
418 return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
419 }
420
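/* Apply a pending clock warp: credit the real time elapsed since
 * vm_clock_warp_start to the icount bias (limited, in adaptive mode, so
 * that QEMU_CLOCK_VIRTUAL does not get too far ahead of real time).
 */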
421 static void icount_warp_rt(void)
422 {
423 unsigned seq;
424 int64_t warp_start;
425
426 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
427 * changes from -1 to another value, so the race here is okay.
428 */
429 do {
430 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
431 warp_start = vm_clock_warp_start;
432 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
433
434 if (warp_start == -1) {
435 return;
436 }
437
438 seqlock_write_begin(&timers_state.vm_clock_seqlock);
439 if (runstate_is_running()) {
440 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
441 cpu_get_clock_locked());
442 int64_t warp_delta;
443
444 warp_delta = clock - vm_clock_warp_start;
445 if (use_icount == 2) {
446 /*
447 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
448 * far ahead of real time.
449 */
450 int64_t cur_icount = cpu_get_icount_locked();
451 int64_t delta = clock - cur_icount;
452 warp_delta = MIN(warp_delta, delta);
453 }
454 timers_state.qemu_icount_bias += warp_delta;
455 }
456 vm_clock_warp_start = -1;
457 seqlock_write_end(&timers_state.vm_clock_seqlock);
458
459 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
460 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
461 }
462 }
463
464 static void icount_timer_cb(void *opaque)
465 {
466 /* No need for a checkpoint because the timer already synchronizes
467 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
468 */
469 icount_warp_rt();
470 }
471
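/* qtest only: warp QEMU_CLOCK_VIRTUAL forward to 'dest', running any timers
 * that expire along the way by repeatedly adding the distance to the next
 * deadline to the icount bias.
 */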
472 void qtest_clock_warp(int64_t dest)
473 {
474 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
475 AioContext *aio_context;
476 assert(qtest_enabled());
477 aio_context = qemu_get_aio_context();
478 while (clock < dest) {
479 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
480 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
481
482 seqlock_write_begin(&timers_state.vm_clock_seqlock);
483 timers_state.qemu_icount_bias += warp;
484 seqlock_write_end(&timers_state.vm_clock_seqlock);
485
486 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
487 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
488 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
489 }
490 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
491 }
492
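/* Start a clock warp if every vCPU is idle under icount: either advance
 * QEMU_CLOCK_VIRTUAL immediately to the next timer deadline (when
 * icount_sleep is off) or arm icount_warp_timer so the warp is applied once
 * the corresponding amount of real time has elapsed.
 */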
493 void qemu_start_warp_timer(void)
494 {
495 int64_t clock;
496 int64_t deadline;
497
498 if (!use_icount) {
499 return;
500 }
501
502 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
503 * do not fire, so computing the deadline does not make sense.
504 */
505 if (!runstate_is_running()) {
506 return;
507 }
508
509 /* warp clock deterministically in record/replay mode */
510 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
511 return;
512 }
513
514 if (!all_cpu_threads_idle()) {
515 return;
516 }
517
518 if (qtest_enabled()) {
519 /* When testing, qtest commands advance icount. */
520 return;
521 }
522
523 /* We want to use the earliest deadline from ALL vm_clocks */
524 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
525 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
526 if (deadline < 0) {
527 static bool notified;
528 if (!icount_sleep && !notified) {
529 error_report("WARNING: icount sleep disabled and no active timers");
530 notified = true;
531 }
532 return;
533 }
534
535 if (deadline > 0) {
536 /*
537 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
538 * sleep. Otherwise, the CPU might be waiting for a future timer
539 * interrupt to wake it up, but the interrupt never comes because
540 * the vCPU isn't running any insns and thus doesn't advance the
541 * QEMU_CLOCK_VIRTUAL.
542 */
543 if (!icount_sleep) {
544 /*
545              * We never let VCPUs sleep in no-sleep icount mode.
546 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
547 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
548 * It is useful when we want a deterministic execution time,
549 * isolated from host latencies.
550 */
551 seqlock_write_begin(&timers_state.vm_clock_seqlock);
552 timers_state.qemu_icount_bias += deadline;
553 seqlock_write_end(&timers_state.vm_clock_seqlock);
554 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
555 } else {
556 /*
557 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
558 * "real" time, (related to the time left until the next event) has
559 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
560              * This prevents the warps from being visible externally; for example,
561              * you will not send network packets continuously instead of
562 * every 100ms.
563 */
564 seqlock_write_begin(&timers_state.vm_clock_seqlock);
565 if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
566 vm_clock_warp_start = clock;
567 }
568 seqlock_write_end(&timers_state.vm_clock_seqlock);
569 timer_mod_anticipate(icount_warp_timer, clock + deadline);
570 }
571 } else if (deadline == 0) {
572 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
573 }
574 }
575
576 static void qemu_account_warp_timer(void)
577 {
578 if (!use_icount || !icount_sleep) {
579 return;
580 }
581
582 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
583 * do not fire, so computing the deadline does not make sense.
584 */
585 if (!runstate_is_running()) {
586 return;
587 }
588
589 /* warp clock deterministically in record/replay mode */
590 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
591 return;
592 }
593
594 timer_del(icount_warp_timer);
595 icount_warp_rt();
596 }
597
598 static bool icount_state_needed(void *opaque)
599 {
600 return use_icount;
601 }
602
603 /*
604 * This is a subsection for icount migration.
605 */
606 static const VMStateDescription icount_vmstate_timers = {
607 .name = "timer/icount",
608 .version_id = 1,
609 .minimum_version_id = 1,
610 .needed = icount_state_needed,
611 .fields = (VMStateField[]) {
612 VMSTATE_INT64(qemu_icount_bias, TimersState),
613 VMSTATE_INT64(qemu_icount, TimersState),
614 VMSTATE_END_OF_LIST()
615 }
616 };
617
618 static const VMStateDescription vmstate_timers = {
619 .name = "timer",
620 .version_id = 2,
621 .minimum_version_id = 1,
622 .fields = (VMStateField[]) {
623 VMSTATE_INT64(cpu_ticks_offset, TimersState),
624 VMSTATE_INT64(dummy, TimersState),
625 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
626 VMSTATE_END_OF_LIST()
627 },
628 .subsections = (const VMStateDescription*[]) {
629 &icount_vmstate_timers,
630 NULL
631 }
632 };
633
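/* Run on each vCPU by the throttle timer: drop the BQL and sleep for
 * throttle_ratio * CPU_THROTTLE_TIMESLICE_NS, where throttle_ratio is
 * pct / (1 - pct). Combined with the timer period set in
 * cpu_throttle_timer_tick() this keeps the vCPU asleep for roughly 'pct'
 * of wall-clock time.
 */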
634 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
635 {
636 double pct;
637 double throttle_ratio;
638 long sleeptime_ns;
639
640 if (!cpu_throttle_get_percentage()) {
641 return;
642 }
643
644 pct = (double)cpu_throttle_get_percentage()/100;
645 throttle_ratio = pct / (1 - pct);
646 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
647
648 qemu_mutex_unlock_iothread();
649 atomic_set(&cpu->throttle_thread_scheduled, 0);
650 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
651 qemu_mutex_lock_iothread();
652 }
653
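/* Timer callback: queue cpu_throttle_thread() on every vCPU that does not
 * already have a throttle sleep pending, then re-arm the timer so that one
 * timeslice of run time fits into each period.
 */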
654 static void cpu_throttle_timer_tick(void *opaque)
655 {
656 CPUState *cpu;
657 double pct;
658
659 /* Stop the timer if needed */
660 if (!cpu_throttle_get_percentage()) {
661 return;
662 }
663 CPU_FOREACH(cpu) {
664 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
665 async_run_on_cpu(cpu, cpu_throttle_thread,
666 RUN_ON_CPU_NULL);
667 }
668 }
669
670 pct = (double)cpu_throttle_get_percentage()/100;
671 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
672 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
673 }
674
675 void cpu_throttle_set(int new_throttle_pct)
676 {
677 /* Ensure throttle percentage is within valid range */
678 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
679 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
680
681 atomic_set(&throttle_percentage, new_throttle_pct);
682
683 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
684 CPU_THROTTLE_TIMESLICE_NS);
685 }
686
687 void cpu_throttle_stop(void)
688 {
689 atomic_set(&throttle_percentage, 0);
690 }
691
692 bool cpu_throttle_active(void)
693 {
694 return (cpu_throttle_get_percentage() != 0);
695 }
696
697 int cpu_throttle_get_percentage(void)
698 {
699 return atomic_read(&throttle_percentage);
700 }
701
702 void cpu_ticks_init(void)
703 {
704 seqlock_init(&timers_state.vm_clock_seqlock);
705 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
706 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
707 cpu_throttle_timer_tick, NULL);
708 }
709
710 void configure_icount(QemuOpts *opts, Error **errp)
711 {
712 const char *option;
713 char *rem_str = NULL;
714
715 option = qemu_opt_get(opts, "shift");
716 if (!option) {
717 if (qemu_opt_get(opts, "align") != NULL) {
718 error_setg(errp, "Please specify shift option when using align");
719 }
720 return;
721 }
722
723 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
724 if (icount_sleep) {
725 icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
726 icount_timer_cb, NULL);
727 }
728
729 icount_align_option = qemu_opt_get_bool(opts, "align", false);
730
731 if (icount_align_option && !icount_sleep) {
732 error_setg(errp, "align=on and sleep=off are incompatible");
733 }
734 if (strcmp(option, "auto") != 0) {
735 errno = 0;
736 icount_time_shift = strtol(option, &rem_str, 0);
737 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
738 error_setg(errp, "icount: Invalid shift value");
739 }
740 use_icount = 1;
741 return;
742 } else if (icount_align_option) {
743 error_setg(errp, "shift=auto and align=on are incompatible");
744 } else if (!icount_sleep) {
745 error_setg(errp, "shift=auto and sleep=off are incompatible");
746 }
747
748 use_icount = 2;
749
750 /* 125MIPS seems a reasonable initial guess at the guest speed.
751 It will be corrected fairly quickly anyway. */
752 icount_time_shift = 3;
753
754 /* Have both realtime and virtual time triggers for speed adjustment.
755 The realtime trigger catches emulated time passing too slowly,
756 the virtual time trigger catches emulated time passing too fast.
757 Realtime triggers occur even when idle, so use them less frequently
758 than VM triggers. */
759 icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
760 icount_adjust_rt, NULL);
761 timer_mod(icount_rt_timer,
762 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
763 icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
764 icount_adjust_vm, NULL);
765 timer_mod(icount_vm_timer,
766 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
767 NANOSECONDS_PER_SECOND / 10);
768 }
769
770 /***********************************************************/
771 /* TCG vCPU kick timer
772 *
773 * The kick timer is responsible for moving single threaded vCPU
774  * emulation on to the next vCPU. If more than one vCPU is running, a
775  * timer event will force a cpu->exit so the next vCPU can get
776  * scheduled.
777  *
778  * The timer is removed while all vCPUs are idle and restarted again
779  * once any vCPU leaves the idle state.
780 */
781
782 static QEMUTimer *tcg_kick_vcpu_timer;
783 static CPUState *tcg_current_rr_cpu;
784
785 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
786
787 static inline int64_t qemu_tcg_next_kick(void)
788 {
789 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
790 }
791
792 /* Kick the currently round-robin scheduled vCPU */
793 static void qemu_cpu_kick_rr_cpu(void)
794 {
795 CPUState *cpu;
796 do {
797 cpu = atomic_mb_read(&tcg_current_rr_cpu);
798 if (cpu) {
799 cpu_exit(cpu);
800 }
801 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
802 }
803
804 static void kick_tcg_thread(void *opaque)
805 {
806 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
807 qemu_cpu_kick_rr_cpu();
808 }
809
810 static void start_tcg_kick_timer(void)
811 {
812 if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
813 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
814 kick_tcg_thread, NULL);
815 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
816 }
817 }
818
819 static void stop_tcg_kick_timer(void)
820 {
821 if (tcg_kick_vcpu_timer) {
822 timer_del(tcg_kick_vcpu_timer);
823 tcg_kick_vcpu_timer = NULL;
824 }
825 }
826
827 /***********************************************************/
828 void hw_error(const char *fmt, ...)
829 {
830 va_list ap;
831 CPUState *cpu;
832
833 va_start(ap, fmt);
834 fprintf(stderr, "qemu: hardware error: ");
835 vfprintf(stderr, fmt, ap);
836 fprintf(stderr, "\n");
837 CPU_FOREACH(cpu) {
838 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
839 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
840 }
841 va_end(ap);
842 abort();
843 }
844
845 void cpu_synchronize_all_states(void)
846 {
847 CPUState *cpu;
848
849 CPU_FOREACH(cpu) {
850 cpu_synchronize_state(cpu);
851 }
852 }
853
854 void cpu_synchronize_all_post_reset(void)
855 {
856 CPUState *cpu;
857
858 CPU_FOREACH(cpu) {
859 cpu_synchronize_post_reset(cpu);
860 }
861 }
862
863 void cpu_synchronize_all_post_init(void)
864 {
865 CPUState *cpu;
866
867 CPU_FOREACH(cpu) {
868 cpu_synchronize_post_init(cpu);
869 }
870 }
871
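/* Stop the VM: pause every vCPU, switch to the requested run state and emit
 * the STOP event, then drain and flush all block devices. Returns the result
 * of bdrv_flush_all().
 */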
872 static int do_vm_stop(RunState state)
873 {
874 int ret = 0;
875
876 if (runstate_is_running()) {
877 cpu_disable_ticks();
878 pause_all_vcpus();
879 runstate_set(state);
880 vm_state_notify(0, state);
881 qapi_event_send_stop(&error_abort);
882 }
883
884 bdrv_drain_all();
885 replay_disable_events();
886 ret = bdrv_flush_all();
887
888 return ret;
889 }
890
891 static bool cpu_can_run(CPUState *cpu)
892 {
893 if (cpu->stop) {
894 return false;
895 }
896 if (cpu_is_stopped(cpu)) {
897 return false;
898 }
899 return true;
900 }
901
902 static void cpu_handle_guest_debug(CPUState *cpu)
903 {
904 gdb_set_stop_cpu(cpu);
905 qemu_system_debug_request();
906 cpu->stopped = true;
907 }
908
909 #ifdef CONFIG_LINUX
910 static void sigbus_reraise(void)
911 {
912 sigset_t set;
913 struct sigaction action;
914
915 memset(&action, 0, sizeof(action));
916 action.sa_handler = SIG_DFL;
917 if (!sigaction(SIGBUS, &action, NULL)) {
918 raise(SIGBUS);
919 sigemptyset(&set);
920 sigaddset(&set, SIGBUS);
921 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
922 }
923 perror("Failed to re-raise SIGBUS!\n");
924 abort();
925 }
926
927 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
928 void *ctx)
929 {
930 if (kvm_on_sigbus(siginfo->ssi_code,
931 (void *)(intptr_t)siginfo->ssi_addr)) {
932 sigbus_reraise();
933 }
934 }
935
936 static void qemu_init_sigbus(void)
937 {
938 struct sigaction action;
939
940 memset(&action, 0, sizeof(action));
941 action.sa_flags = SA_SIGINFO;
942 action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
943 sigaction(SIGBUS, &action, NULL);
944
945 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
946 }
947
948 static void qemu_kvm_eat_signals(CPUState *cpu)
949 {
950 struct timespec ts = { 0, 0 };
951 siginfo_t siginfo;
952 sigset_t waitset;
953 sigset_t chkset;
954 int r;
955
956 sigemptyset(&waitset);
957 sigaddset(&waitset, SIG_IPI);
958 sigaddset(&waitset, SIGBUS);
959
960 do {
961 r = sigtimedwait(&waitset, &siginfo, &ts);
962 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
963 perror("sigtimedwait");
964 exit(1);
965 }
966
967 switch (r) {
968 case SIGBUS:
969 if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
970 sigbus_reraise();
971 }
972 break;
973 default:
974 break;
975 }
976
977 r = sigpending(&chkset);
978 if (r == -1) {
979 perror("sigpending");
980 exit(1);
981 }
982 } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
983 }
984
985 #else /* !CONFIG_LINUX */
986
987 static void qemu_init_sigbus(void)
988 {
989 }
990
991 static void qemu_kvm_eat_signals(CPUState *cpu)
992 {
993 }
994 #endif /* !CONFIG_LINUX */
995
996 #ifndef _WIN32
997 static void dummy_signal(int sig)
998 {
999 }
1000
1001 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
1002 {
1003 int r;
1004 sigset_t set;
1005 struct sigaction sigact;
1006
1007 memset(&sigact, 0, sizeof(sigact));
1008 sigact.sa_handler = dummy_signal;
1009 sigaction(SIG_IPI, &sigact, NULL);
1010
1011 pthread_sigmask(SIG_BLOCK, NULL, &set);
1012 sigdelset(&set, SIG_IPI);
1013 sigdelset(&set, SIGBUS);
1014 r = kvm_set_signal_mask(cpu, &set);
1015 if (r) {
1016 fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
1017 exit(1);
1018 }
1019 }
1020
1021 #else /* _WIN32 */
1022 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
1023 {
1024 abort();
1025 }
1026 #endif /* _WIN32 */
1027
1028 static QemuMutex qemu_global_mutex;
1029
1030 static QemuThread io_thread;
1031
1032 /* cpu creation */
1033 static QemuCond qemu_cpu_cond;
1034 /* system init */
1035 static QemuCond qemu_pause_cond;
1036
1037 void qemu_init_cpu_loop(void)
1038 {
1039 qemu_init_sigbus();
1040 qemu_cond_init(&qemu_cpu_cond);
1041 qemu_cond_init(&qemu_pause_cond);
1042 qemu_mutex_init(&qemu_global_mutex);
1043
1044 qemu_thread_get_self(&io_thread);
1045 }
1046
1047 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1048 {
1049 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1050 }
1051
1052 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1053 {
1054 if (kvm_destroy_vcpu(cpu) < 0) {
1055 error_report("kvm_destroy_vcpu failed");
1056 exit(EXIT_FAILURE);
1057 }
1058 }
1059
1060 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1061 {
1062 }
1063
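/* Common bookkeeping after a vCPU thread wakes up (called with the BQL
 * held): clear the kick flag, complete a pending stop request, and run any
 * work items queued via run_on_cpu()/async_run_on_cpu().
 */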
1064 static void qemu_wait_io_event_common(CPUState *cpu)
1065 {
1066 atomic_mb_set(&cpu->thread_kicked, false);
1067 if (cpu->stop) {
1068 cpu->stop = false;
1069 cpu->stopped = true;
1070 qemu_cond_broadcast(&qemu_pause_cond);
1071 }
1072 process_queued_cpu_work(cpu);
1073 }
1074
1075 static bool qemu_tcg_should_sleep(CPUState *cpu)
1076 {
1077 if (mttcg_enabled) {
1078 return cpu_thread_is_idle(cpu);
1079 } else {
1080 return all_cpu_threads_idle();
1081 }
1082 }
1083
1084 static void qemu_tcg_wait_io_event(CPUState *cpu)
1085 {
1086 while (qemu_tcg_should_sleep(cpu)) {
1087 stop_tcg_kick_timer();
1088 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1089 }
1090
1091 start_tcg_kick_timer();
1092
1093 qemu_wait_io_event_common(cpu);
1094 }
1095
1096 static void qemu_kvm_wait_io_event(CPUState *cpu)
1097 {
1098 while (cpu_thread_is_idle(cpu)) {
1099 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1100 }
1101
1102 qemu_kvm_eat_signals(cpu);
1103 qemu_wait_io_event_common(cpu);
1104 }
1105
1106 static void *qemu_kvm_cpu_thread_fn(void *arg)
1107 {
1108 CPUState *cpu = arg;
1109 int r;
1110
1111 rcu_register_thread();
1112
1113 qemu_mutex_lock_iothread();
1114 qemu_thread_get_self(cpu->thread);
1115 cpu->thread_id = qemu_get_thread_id();
1116 cpu->can_do_io = 1;
1117 current_cpu = cpu;
1118
1119 r = kvm_init_vcpu(cpu);
1120 if (r < 0) {
1121 fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
1122 exit(1);
1123 }
1124
1125 qemu_kvm_init_cpu_signals(cpu);
1126
1127 /* signal CPU creation */
1128 cpu->created = true;
1129 qemu_cond_signal(&qemu_cpu_cond);
1130
1131 do {
1132 if (cpu_can_run(cpu)) {
1133 r = kvm_cpu_exec(cpu);
1134 if (r == EXCP_DEBUG) {
1135 cpu_handle_guest_debug(cpu);
1136 }
1137 }
1138 qemu_kvm_wait_io_event(cpu);
1139 } while (!cpu->unplug || cpu_can_run(cpu));
1140
1141 qemu_kvm_destroy_vcpu(cpu);
1142 cpu->created = false;
1143 qemu_cond_signal(&qemu_cpu_cond);
1144 qemu_mutex_unlock_iothread();
1145 return NULL;
1146 }
1147
1148 static void *qemu_dummy_cpu_thread_fn(void *arg)
1149 {
1150 #ifdef _WIN32
1151 fprintf(stderr, "qtest is not supported under Windows\n");
1152 exit(1);
1153 #else
1154 CPUState *cpu = arg;
1155 sigset_t waitset;
1156 int r;
1157
1158 rcu_register_thread();
1159
1160 qemu_mutex_lock_iothread();
1161 qemu_thread_get_self(cpu->thread);
1162 cpu->thread_id = qemu_get_thread_id();
1163 cpu->can_do_io = 1;
1164 current_cpu = cpu;
1165
1166 sigemptyset(&waitset);
1167 sigaddset(&waitset, SIG_IPI);
1168
1169 /* signal CPU creation */
1170 cpu->created = true;
1171 qemu_cond_signal(&qemu_cpu_cond);
1172
1173 while (1) {
1174 qemu_mutex_unlock_iothread();
1175 do {
1176 int sig;
1177 r = sigwait(&waitset, &sig);
1178 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1179 if (r == -1) {
1180 perror("sigwait");
1181 exit(1);
1182 }
1183 qemu_mutex_lock_iothread();
1184 qemu_wait_io_event_common(cpu);
1185 }
1186
1187 return NULL;
1188 #endif
1189 }
1190
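/* Return how many instructions the vCPU may execute before the next
 * QEMU_CLOCK_VIRTUAL deadline (capped at INT32_MAX ns), or, in replay mode,
 * the instruction budget taken from the replay log.
 */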
1191 static int64_t tcg_get_icount_limit(void)
1192 {
1193 int64_t deadline;
1194
1195 if (replay_mode != REPLAY_MODE_PLAY) {
1196 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1197
1198 /* Maintain prior (possibly buggy) behaviour where if no deadline
1199 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1200 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1201 * nanoseconds.
1202 */
1203 if ((deadline < 0) || (deadline > INT32_MAX)) {
1204 deadline = INT32_MAX;
1205 }
1206
1207 return qemu_icount_round(deadline);
1208 } else {
1209 return replay_get_instructions();
1210 }
1211 }
1212
1213 static void handle_icount_deadline(void)
1214 {
1215 if (use_icount) {
1216 int64_t deadline =
1217 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1218
1219 if (deadline == 0) {
1220 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1221 }
1222 }
1223 }
1224
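/* Execute guest code on one vCPU without the BQL. In icount mode the
 * instruction budget from tcg_get_icount_limit() is split between the
 * 16-bit icount_decr.u16.low field (decremented by generated code) and
 * icount_extra; whatever remains unexecuted is folded back into
 * timers_state.qemu_icount afterwards.
 */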
1225 static int tcg_cpu_exec(CPUState *cpu)
1226 {
1227 int ret;
1228 #ifdef CONFIG_PROFILER
1229 int64_t ti;
1230 #endif
1231
1232 #ifdef CONFIG_PROFILER
1233 ti = profile_getclock();
1234 #endif
1235 if (use_icount) {
1236 int64_t count;
1237 int decr;
1238 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1239 + cpu->icount_extra);
1240 cpu->icount_decr.u16.low = 0;
1241 cpu->icount_extra = 0;
1242 count = tcg_get_icount_limit();
1243 timers_state.qemu_icount += count;
1244 decr = (count > 0xffff) ? 0xffff : count;
1245 count -= decr;
1246 cpu->icount_decr.u16.low = decr;
1247 cpu->icount_extra = count;
1248 }
1249 qemu_mutex_unlock_iothread();
1250 cpu_exec_start(cpu);
1251 ret = cpu_exec(cpu);
1252 cpu_exec_end(cpu);
1253 qemu_mutex_lock_iothread();
1254 #ifdef CONFIG_PROFILER
1255 tcg_time += profile_getclock() - ti;
1256 #endif
1257 if (use_icount) {
1258 /* Fold pending instructions back into the
1259 instruction counter, and clear the interrupt flag. */
1260 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1261 + cpu->icount_extra);
1262 cpu->icount_decr.u32 = 0;
1263 cpu->icount_extra = 0;
1264 replay_account_executed_instructions();
1265 }
1266 return ret;
1267 }
1268
1269 /* Destroy any remaining vCPUs which have been unplugged and have
1270 * finished running
1271 */
1272 static void deal_with_unplugged_cpus(void)
1273 {
1274 CPUState *cpu;
1275
1276 CPU_FOREACH(cpu) {
1277 if (cpu->unplug && !cpu_can_run(cpu)) {
1278 qemu_tcg_destroy_vcpu(cpu);
1279 cpu->created = false;
1280 qemu_cond_signal(&qemu_cpu_cond);
1281 break;
1282 }
1283 }
1284 }
1285
1286 /* Single-threaded TCG
1287 *
1288 * In the single-threaded case each vCPU is simulated in turn. If
1289 * there is more than a single vCPU we create a simple timer to kick
1290 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1291 * This is done explicitly rather than relying on side-effects
1292 * elsewhere.
1293 */
1294
1295 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1296 {
1297 CPUState *cpu = arg;
1298
1299 rcu_register_thread();
1300
1301 qemu_mutex_lock_iothread();
1302 qemu_thread_get_self(cpu->thread);
1303
1304 CPU_FOREACH(cpu) {
1305 cpu->thread_id = qemu_get_thread_id();
1306 cpu->created = true;
1307 cpu->can_do_io = 1;
1308 }
1309 qemu_cond_signal(&qemu_cpu_cond);
1310
1311 /* wait for initial kick-off after machine start */
1312 while (first_cpu->stopped) {
1313 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1314
1315 /* process any pending work */
1316 CPU_FOREACH(cpu) {
1317 current_cpu = cpu;
1318 qemu_wait_io_event_common(cpu);
1319 }
1320 }
1321
1322 start_tcg_kick_timer();
1323
1324 cpu = first_cpu;
1325
1326 /* process any pending work */
1327 cpu->exit_request = 1;
1328
1329 while (1) {
1330 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1331 qemu_account_warp_timer();
1332
1333 if (!cpu) {
1334 cpu = first_cpu;
1335 }
1336
1337 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1338
1339 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1340 current_cpu = cpu;
1341
1342 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1343 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1344
1345 if (cpu_can_run(cpu)) {
1346 int r;
1347 r = tcg_cpu_exec(cpu);
1348 if (r == EXCP_DEBUG) {
1349 cpu_handle_guest_debug(cpu);
1350 break;
1351 } else if (r == EXCP_ATOMIC) {
1352 qemu_mutex_unlock_iothread();
1353 cpu_exec_step_atomic(cpu);
1354 qemu_mutex_lock_iothread();
1355 break;
1356 }
1357 } else if (cpu->stop) {
1358 if (cpu->unplug) {
1359 cpu = CPU_NEXT(cpu);
1360 }
1361 break;
1362 }
1363
1364 cpu = CPU_NEXT(cpu);
1365 } /* while (cpu && !cpu->exit_request).. */
1366
1367 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1368 atomic_set(&tcg_current_rr_cpu, NULL);
1369
1370 if (cpu && cpu->exit_request) {
1371 atomic_mb_set(&cpu->exit_request, 0);
1372 }
1373
1374 handle_icount_deadline();
1375
1376 qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
1377 deal_with_unplugged_cpus();
1378 }
1379
1380 return NULL;
1381 }
1382
1383 static void *qemu_hax_cpu_thread_fn(void *arg)
1384 {
1385 CPUState *cpu = arg;
1386 int r;
1387 qemu_thread_get_self(cpu->thread);
1388 qemu_mutex_lock(&qemu_global_mutex);
1389
1390 cpu->thread_id = qemu_get_thread_id();
1391 cpu->created = true;
1392 cpu->halted = 0;
1393 current_cpu = cpu;
1394
1395 hax_init_vcpu(cpu);
1396 qemu_cond_signal(&qemu_cpu_cond);
1397
1398 while (1) {
1399 if (cpu_can_run(cpu)) {
1400 r = hax_smp_cpu_exec(cpu);
1401 if (r == EXCP_DEBUG) {
1402 cpu_handle_guest_debug(cpu);
1403 }
1404 }
1405
1406 while (cpu_thread_is_idle(cpu)) {
1407 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1408 }
1409 #ifdef _WIN32
1410 SleepEx(0, TRUE);
1411 #endif
1412 qemu_wait_io_event_common(cpu);
1413 }
1414 return NULL;
1415 }
1416
1417 #ifdef _WIN32
1418 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1419 {
1420 }
1421 #endif
1422
1423 /* Multi-threaded TCG
1424 *
1425 * In the multi-threaded case each vCPU has its own thread. The TLS
1426 * variable current_cpu can be used deep in the code to find the
1427 * current CPUState for a given thread.
1428 */
1429
1430 static void *qemu_tcg_cpu_thread_fn(void *arg)
1431 {
1432 CPUState *cpu = arg;
1433
1434 rcu_register_thread();
1435
1436 qemu_mutex_lock_iothread();
1437 qemu_thread_get_self(cpu->thread);
1438
1439 cpu->thread_id = qemu_get_thread_id();
1440 cpu->created = true;
1441 cpu->can_do_io = 1;
1442 current_cpu = cpu;
1443 qemu_cond_signal(&qemu_cpu_cond);
1444
1445 /* process any pending work */
1446 cpu->exit_request = 1;
1447
1448 while (1) {
1449 if (cpu_can_run(cpu)) {
1450 int r;
1451 r = tcg_cpu_exec(cpu);
1452 switch (r) {
1453 case EXCP_DEBUG:
1454 cpu_handle_guest_debug(cpu);
1455 break;
1456 case EXCP_HALTED:
1457 /* during start-up the vCPU is reset and the thread is
1458 * kicked several times. If we don't ensure we go back
1459 * to sleep in the halted state we won't cleanly
1460                  * start up when the vCPU is enabled.
1461 *
1462 * cpu->halted should ensure we sleep in wait_io_event
1463 */
1464 g_assert(cpu->halted);
1465 break;
1466 case EXCP_ATOMIC:
1467 qemu_mutex_unlock_iothread();
1468 cpu_exec_step_atomic(cpu);
1469 qemu_mutex_lock_iothread();
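/* fall through to the default case */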
1470 default:
1471 /* Ignore everything else? */
1472 break;
1473 }
1474 }
1475
1476 handle_icount_deadline();
1477
1478 atomic_mb_set(&cpu->exit_request, 0);
1479 qemu_tcg_wait_io_event(cpu);
1480 }
1481
1482 return NULL;
1483 }
1484
1485 static void qemu_cpu_kick_thread(CPUState *cpu)
1486 {
1487 #ifndef _WIN32
1488 int err;
1489
1490 if (cpu->thread_kicked) {
1491 return;
1492 }
1493 cpu->thread_kicked = true;
1494 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1495 if (err) {
1496 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1497 exit(1);
1498 }
1499 #else /* _WIN32 */
1500 if (!qemu_cpu_is_self(cpu)) {
1501 if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1502 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1503 __func__, GetLastError());
1504 exit(1);
1505 }
1506 }
1507 #endif
1508 }
1509
1510 void qemu_cpu_kick(CPUState *cpu)
1511 {
1512 qemu_cond_broadcast(cpu->halt_cond);
1513 if (tcg_enabled()) {
1514 cpu_exit(cpu);
1515 /* NOP unless doing single-thread RR */
1516 qemu_cpu_kick_rr_cpu();
1517 } else {
1518 if (hax_enabled()) {
1519 /*
1520 * FIXME: race condition with the exit_request check in
1521 * hax_vcpu_hax_exec
1522 */
1523 cpu->exit_request = 1;
1524 }
1525 qemu_cpu_kick_thread(cpu);
1526 }
1527 }
1528
1529 void qemu_cpu_kick_self(void)
1530 {
1531 assert(current_cpu);
1532 qemu_cpu_kick_thread(current_cpu);
1533 }
1534
1535 bool qemu_cpu_is_self(CPUState *cpu)
1536 {
1537 return qemu_thread_is_self(cpu->thread);
1538 }
1539
1540 bool qemu_in_vcpu_thread(void)
1541 {
1542 return current_cpu && qemu_cpu_is_self(current_cpu);
1543 }
1544
1545 static __thread bool iothread_locked = false;
1546
1547 bool qemu_mutex_iothread_locked(void)
1548 {
1549 return iothread_locked;
1550 }
1551
1552 void qemu_mutex_lock_iothread(void)
1553 {
1554 g_assert(!qemu_mutex_iothread_locked());
1555 qemu_mutex_lock(&qemu_global_mutex);
1556 iothread_locked = true;
1557 }
1558
1559 void qemu_mutex_unlock_iothread(void)
1560 {
1561 g_assert(qemu_mutex_iothread_locked());
1562 iothread_locked = false;
1563 qemu_mutex_unlock(&qemu_global_mutex);
1564 }
1565
1566 static bool all_vcpus_paused(void)
1567 {
1568 CPUState *cpu;
1569
1570 CPU_FOREACH(cpu) {
1571 if (!cpu->stopped) {
1572 return false;
1573 }
1574 }
1575
1576 return true;
1577 }
1578
1579 void pause_all_vcpus(void)
1580 {
1581 CPUState *cpu;
1582
1583 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1584 CPU_FOREACH(cpu) {
1585 cpu->stop = true;
1586 qemu_cpu_kick(cpu);
1587 }
1588
1589 if (qemu_in_vcpu_thread()) {
1590 cpu_stop_current();
1591 }
1592
1593 while (!all_vcpus_paused()) {
1594 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1595 CPU_FOREACH(cpu) {
1596 qemu_cpu_kick(cpu);
1597 }
1598 }
1599 }
1600
1601 void cpu_resume(CPUState *cpu)
1602 {
1603 cpu->stop = false;
1604 cpu->stopped = false;
1605 qemu_cpu_kick(cpu);
1606 }
1607
1608 void resume_all_vcpus(void)
1609 {
1610 CPUState *cpu;
1611
1612 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1613 CPU_FOREACH(cpu) {
1614 cpu_resume(cpu);
1615 }
1616 }
1617
1618 void cpu_remove(CPUState *cpu)
1619 {
1620 cpu->stop = true;
1621 cpu->unplug = true;
1622 qemu_cpu_kick(cpu);
1623 }
1624
1625 void cpu_remove_sync(CPUState *cpu)
1626 {
1627 cpu_remove(cpu);
1628 while (cpu->created) {
1629 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1630 }
1631 }
1632
1633 /* Size of the temporary buffers used when forming a vCPU thread name */
1634 #define VCPU_THREAD_NAME_SIZE 16
1635
1636 static void qemu_tcg_init_vcpu(CPUState *cpu)
1637 {
1638 char thread_name[VCPU_THREAD_NAME_SIZE];
1639 static QemuCond *single_tcg_halt_cond;
1640 static QemuThread *single_tcg_cpu_thread;
1641
1642 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1643 cpu->thread = g_malloc0(sizeof(QemuThread));
1644 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1645 qemu_cond_init(cpu->halt_cond);
1646
1647 if (qemu_tcg_mttcg_enabled()) {
1648 /* create a thread per vCPU with TCG (MTTCG) */
1649 parallel_cpus = true;
1650 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1651 cpu->cpu_index);
1652
1653 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1654 cpu, QEMU_THREAD_JOINABLE);
1655
1656 } else {
1657 /* share a single thread for all cpus with TCG */
1658 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1659 qemu_thread_create(cpu->thread, thread_name,
1660 qemu_tcg_rr_cpu_thread_fn,
1661 cpu, QEMU_THREAD_JOINABLE);
1662
1663 single_tcg_halt_cond = cpu->halt_cond;
1664 single_tcg_cpu_thread = cpu->thread;
1665 }
1666 #ifdef _WIN32
1667 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1668 #endif
1669 while (!cpu->created) {
1670 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1671 }
1672 } else {
1673 /* For non-MTTCG cases we share the thread */
1674 cpu->thread = single_tcg_cpu_thread;
1675 cpu->halt_cond = single_tcg_halt_cond;
1676 }
1677 }
1678
1679 static void qemu_hax_start_vcpu(CPUState *cpu)
1680 {
1681 char thread_name[VCPU_THREAD_NAME_SIZE];
1682
1683 cpu->thread = g_malloc0(sizeof(QemuThread));
1684 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1685 qemu_cond_init(cpu->halt_cond);
1686
1687 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1688 cpu->cpu_index);
1689 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1690 cpu, QEMU_THREAD_JOINABLE);
1691 #ifdef _WIN32
1692 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1693 #endif
1694 while (!cpu->created) {
1695 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1696 }
1697 }
1698
1699 static void qemu_kvm_start_vcpu(CPUState *cpu)
1700 {
1701 char thread_name[VCPU_THREAD_NAME_SIZE];
1702
1703 cpu->thread = g_malloc0(sizeof(QemuThread));
1704 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1705 qemu_cond_init(cpu->halt_cond);
1706 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1707 cpu->cpu_index);
1708 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1709 cpu, QEMU_THREAD_JOINABLE);
1710 while (!cpu->created) {
1711 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1712 }
1713 }
1714
1715 static void qemu_dummy_start_vcpu(CPUState *cpu)
1716 {
1717 char thread_name[VCPU_THREAD_NAME_SIZE];
1718
1719 cpu->thread = g_malloc0(sizeof(QemuThread));
1720 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1721 qemu_cond_init(cpu->halt_cond);
1722 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1723 cpu->cpu_index);
1724 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1725 QEMU_THREAD_JOINABLE);
1726 while (!cpu->created) {
1727 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1728 }
1729 }
1730
1731 void qemu_init_vcpu(CPUState *cpu)
1732 {
1733 cpu->nr_cores = smp_cores;
1734 cpu->nr_threads = smp_threads;
1735 cpu->stopped = true;
1736
1737 if (!cpu->as) {
1738 /* If the target cpu hasn't set up any address spaces itself,
1739 * give it the default one.
1740 */
1741 AddressSpace *as = address_space_init_shareable(cpu->memory,
1742 "cpu-memory");
1743 cpu->num_ases = 1;
1744 cpu_address_space_init(cpu, as, 0);
1745 }
1746
1747 if (kvm_enabled()) {
1748 qemu_kvm_start_vcpu(cpu);
1749 } else if (hax_enabled()) {
1750 qemu_hax_start_vcpu(cpu);
1751 } else if (tcg_enabled()) {
1752 qemu_tcg_init_vcpu(cpu);
1753 } else {
1754 qemu_dummy_start_vcpu(cpu);
1755 }
1756 }
1757
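/* Mark the calling vCPU as stopped (clearing any pending stop request),
 * request an exit from its execution loop and wake anyone waiting in
 * pause_all_vcpus().
 */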
1758 void cpu_stop_current(void)
1759 {
1760 if (current_cpu) {
1761 current_cpu->stop = false;
1762 current_cpu->stopped = true;
1763 cpu_exit(current_cpu);
1764 qemu_cond_broadcast(&qemu_pause_cond);
1765 }
1766 }
1767
1768 int vm_stop(RunState state)
1769 {
1770 if (qemu_in_vcpu_thread()) {
1771 qemu_system_vmstop_request_prepare();
1772 qemu_system_vmstop_request(state);
1773 /*
1774 * FIXME: should not return to device code in case
1775 * vm_stop() has been requested.
1776 */
1777 cpu_stop_current();
1778 return 0;
1779 }
1780
1781 return do_vm_stop(state);
1782 }
1783
1784 /**
1785 * Prepare for (re)starting the VM.
1786 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
1787 * running or in case of an error condition), 0 otherwise.
1788 */
1789 int vm_prepare_start(void)
1790 {
1791 RunState requested;
1792 int res = 0;
1793
1794 qemu_vmstop_requested(&requested);
1795 if (runstate_is_running() && requested == RUN_STATE__MAX) {
1796 return -1;
1797 }
1798
1799 /* Ensure that a STOP/RESUME pair of events is emitted if a
1800 * vmstop request was pending. The BLOCK_IO_ERROR event, for
1801      * example, is documented to always be followed by
1802 * the STOP event.
1803 */
1804 if (runstate_is_running()) {
1805 qapi_event_send_stop(&error_abort);
1806 res = -1;
1807 } else {
1808 replay_enable_events();
1809 cpu_enable_ticks();
1810 runstate_set(RUN_STATE_RUNNING);
1811 vm_state_notify(1, RUN_STATE_RUNNING);
1812 }
1813
1814     /* We are sending this now, but the CPUs will be resumed shortly afterwards */
1815 qapi_event_send_resume(&error_abort);
1816 return res;
1817 }
1818
1819 void vm_start(void)
1820 {
1821 if (!vm_prepare_start()) {
1822 resume_all_vcpus();
1823 }
1824 }
1825
1826 /* Does a state transition even if the VM is already stopped;
1827    the current state is forgotten forever. */
1828 int vm_stop_force_state(RunState state)
1829 {
1830 if (runstate_is_running()) {
1831 return vm_stop(state);
1832 } else {
1833 runstate_set(state);
1834
1835 bdrv_drain_all();
1836 /* Make sure to return an error if the flush in a previous vm_stop()
1837 * failed. */
1838 return bdrv_flush_all();
1839 }
1840 }
1841
1842 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1843 {
1844     /* XXX: implement xxx_cpu_list for targets that are still missing it */
1845 #if defined(cpu_list)
1846 cpu_list(f, cpu_fprintf);
1847 #endif
1848 }
1849
1850 CpuInfoList *qmp_query_cpus(Error **errp)
1851 {
1852 CpuInfoList *head = NULL, *cur_item = NULL;
1853 CPUState *cpu;
1854
1855 CPU_FOREACH(cpu) {
1856 CpuInfoList *info;
1857 #if defined(TARGET_I386)
1858 X86CPU *x86_cpu = X86_CPU(cpu);
1859 CPUX86State *env = &x86_cpu->env;
1860 #elif defined(TARGET_PPC)
1861 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1862 CPUPPCState *env = &ppc_cpu->env;
1863 #elif defined(TARGET_SPARC)
1864 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1865 CPUSPARCState *env = &sparc_cpu->env;
1866 #elif defined(TARGET_MIPS)
1867 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1868 CPUMIPSState *env = &mips_cpu->env;
1869 #elif defined(TARGET_TRICORE)
1870 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1871 CPUTriCoreState *env = &tricore_cpu->env;
1872 #endif
1873
1874 cpu_synchronize_state(cpu);
1875
1876 info = g_malloc0(sizeof(*info));
1877 info->value = g_malloc0(sizeof(*info->value));
1878 info->value->CPU = cpu->cpu_index;
1879 info->value->current = (cpu == first_cpu);
1880 info->value->halted = cpu->halted;
1881 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1882 info->value->thread_id = cpu->thread_id;
1883 #if defined(TARGET_I386)
1884 info->value->arch = CPU_INFO_ARCH_X86;
1885 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1886 #elif defined(TARGET_PPC)
1887 info->value->arch = CPU_INFO_ARCH_PPC;
1888 info->value->u.ppc.nip = env->nip;
1889 #elif defined(TARGET_SPARC)
1890 info->value->arch = CPU_INFO_ARCH_SPARC;
1891 info->value->u.q_sparc.pc = env->pc;
1892 info->value->u.q_sparc.npc = env->npc;
1893 #elif defined(TARGET_MIPS)
1894 info->value->arch = CPU_INFO_ARCH_MIPS;
1895 info->value->u.q_mips.PC = env->active_tc.PC;
1896 #elif defined(TARGET_TRICORE)
1897 info->value->arch = CPU_INFO_ARCH_TRICORE;
1898 info->value->u.tricore.PC = env->PC;
1899 #else
1900 info->value->arch = CPU_INFO_ARCH_OTHER;
1901 #endif
1902
1903 /* XXX: waiting for the qapi to support GSList */
1904 if (!cur_item) {
1905 head = cur_item = info;
1906 } else {
1907 cur_item->next = info;
1908 cur_item = info;
1909 }
1910 }
1911
1912 return head;
1913 }
1914
1915 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1916 bool has_cpu, int64_t cpu_index, Error **errp)
1917 {
1918 FILE *f;
1919 uint32_t l;
1920 CPUState *cpu;
1921 uint8_t buf[1024];
1922 int64_t orig_addr = addr, orig_size = size;
1923
1924 if (!has_cpu) {
1925 cpu_index = 0;
1926 }
1927
1928 cpu = qemu_get_cpu(cpu_index);
1929 if (cpu == NULL) {
1930 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1931 "a CPU number");
1932 return;
1933 }
1934
1935 f = fopen(filename, "wb");
1936 if (!f) {
1937 error_setg_file_open(errp, errno, filename);
1938 return;
1939 }
1940
1941 while (size != 0) {
1942 l = sizeof(buf);
1943 if (l > size)
1944 l = size;
1945 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1946 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1947 " specified", orig_addr, orig_size);
1948 goto exit;
1949 }
1950 if (fwrite(buf, 1, l, f) != l) {
1951 error_setg(errp, QERR_IO_ERROR);
1952 goto exit;
1953 }
1954 addr += l;
1955 size -= l;
1956 }
1957
1958 exit:
1959 fclose(f);
1960 }
1961
1962 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1963 Error **errp)
1964 {
1965 FILE *f;
1966 uint32_t l;
1967 uint8_t buf[1024];
1968
1969 f = fopen(filename, "wb");
1970 if (!f) {
1971 error_setg_file_open(errp, errno, filename);
1972 return;
1973 }
1974
1975 while (size != 0) {
1976 l = sizeof(buf);
1977 if (l > size)
1978 l = size;
1979 cpu_physical_memory_read(addr, buf, l);
1980 if (fwrite(buf, 1, l, f) != l) {
1981 error_setg(errp, QERR_IO_ERROR);
1982 goto exit;
1983 }
1984 addr += l;
1985 size -= l;
1986 }
1987
1988 exit:
1989 fclose(f);
1990 }
1991
1992 void qmp_inject_nmi(Error **errp)
1993 {
1994 nmi_monitor_handle(monitor_get_cpu_index(), errp);
1995 }
1996
1997 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1998 {
1999 if (!use_icount) {
2000 return;
2001 }
2002
2003 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
2004 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2005 if (icount_align_option) {
2006 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
2007 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
2008 } else {
2009 cpu_fprintf(f, "Max guest delay NA\n");
2010 cpu_fprintf(f, "Max guest advance NA\n");
2011 }
2012 }