/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Needed early for CONFIG_BSD etc. */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qemu/config-file.h"
#include "cpu.h"
#include "monitor/monitor.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "exec/gdbstub.h"
#include "sysemu/dma.h"
#include "sysemu/hw_accel.h"
#include "sysemu/kvm.h"
#include "sysemu/hax.h"
#include "qmp-commands.h"
#include "exec/exec-all.h"

#include "qemu/thread.h"
#include "sysemu/cpus.h"
#include "sysemu/qtest.h"
#include "qemu/main-loop.h"
#include "qemu/bitmap.h"
#include "qemu/seqlock.h"
#include "tcg.h"
#include "qapi-event.h"
#include "hw/nmi.h"
#include "sysemu/replay.h"

#ifndef _WIN32
#include "qemu/compatfd.h"
#endif

#ifdef CONFIG_LINUX

#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */

int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
static QEMUTimer *throttle_timer;
static unsigned int throttle_percentage;

#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
#define CPU_THROTTLE_TIMESLICE_NS 10000000

bool cpu_is_stopped(CPUState *cpu)
{
    return cpu->stopped || !runstate_is_running();
}

static bool cpu_thread_is_idle(CPUState *cpu)
{
    if (cpu->stop || cpu->queued_work_first) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return true;
    }
    if (!cpu->halted || cpu_has_work(cpu) ||
        kvm_halt_in_kernel()) {
        return false;
    }
    return true;
}

static bool all_cpu_threads_idle(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu_thread_is_idle(cpu)) {
            return false;
        }
    }
    return true;
}

/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

static bool icount_sleep = true;
static int64_t vm_clock_warp_start = -1;
/* Conversion factor from emulated instructions to virtual clock ticks. */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
#define MAX_ICOUNT_SHIFT 10

static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;

typedef struct TimersState {
    /* Protected by BQL. */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read outside of the BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;

    /* Compensate for varying guest execution speed. */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
} TimersState;

static TimersState timers_state;
bool mttcg_enabled;

/*
 * We default to false if we know other options have been enabled
 * which are currently incompatible with MTTCG. Otherwise, once each
 * guest (target) has been updated to support:
 *   - atomic instructions
 *   - memory ordering primitives (barriers)
 * it can set the appropriate CONFIG flags in ${target}-softmmu.mak.
 *
 * Once a guest architecture has been converted to the new primitives
 * there are two remaining limitations to check:
 *
 *   - The guest can't be oversized (e.g. a 64-bit guest on a 32-bit host)
 *   - The host must have a stronger memory order than the guest
 *
 * It may be possible in future to support strong guests on weak hosts
 * but that will require tagging all load/stores in a guest with their
 * implicit memory order requirements, which would likely slow things
 * down a lot.
 */

static bool check_tcg_memory_orders_compatible(void)
{
#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
    return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
#else
    return false;
#endif
}
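
/*
 * Illustrative note (not from the original source): both macros are
 * bitmasks of memory-ordering guarantees. The check above passes only
 * when every ordering bit the guest relies on is also provided by the
 * host. For example, an x86 guest (strong ordering) on a typical ARM
 * host (weak ordering) leaves guest bits unmatched, so the result is
 * false and MTTCG stays off by default.
 */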

static bool default_mttcg_enabled(void)
{
    QemuOpts *icount_opts = qemu_find_opts_singleton("icount");
    const char *rr = qemu_opt_get(icount_opts, "rr");

    if (rr || TCG_OVERSIZED_GUEST) {
        return false;
    } else {
#ifdef TARGET_SUPPORTS_MTTCG
        return check_tcg_memory_orders_compatible();
#else
        return false;
#endif
    }
}

void qemu_tcg_configure(QemuOpts *opts, Error **errp)
{
    const char *t = qemu_opt_get(opts, "thread");
    if (t) {
        if (strcmp(t, "multi") == 0) {
            if (TCG_OVERSIZED_GUEST) {
                error_setg(errp, "No MTTCG when guest word size > host's");
            } else {
                if (!check_tcg_memory_orders_compatible()) {
                    error_report("Guest expects a stronger memory ordering "
                                 "than the host provides");
                    error_printf("This may cause strange/hard to debug errors\n");
                }
                mttcg_enabled = true;
            }
        } else if (strcmp(t, "single") == 0) {
            mttcg_enabled = false;
        } else {
            error_setg(errp, "Invalid 'thread' setting %s", t);
        }
    } else {
        mttcg_enabled = default_mttcg_enabled();
    }
}
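
/*
 * Illustrative usage (not from the original source): this function is
 * driven by the "thread" suboption of the TCG accelerator, e.g.
 *
 *   qemu-system-foo -accel tcg,thread=multi ...
 *   qemu-system-foo -accel tcg,thread=single ...
 *
 * ("qemu-system-foo" is a placeholder binary name.)  Anything other
 * than "multi" or "single" is rejected with "Invalid 'thread' setting";
 * with no explicit setting, default_mttcg_enabled() decides.
 */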

int64_t cpu_get_icount_raw(void)
{
    int64_t icount;
    CPUState *cpu = current_cpu;

    icount = timers_state.qemu_icount;
    if (cpu) {
        if (!cpu->can_do_io) {
            fprintf(stderr, "Bad icount read\n");
            exit(1);
        }
        icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
    }
    return icount;
}

/* Return the virtual CPU time, based on the instruction counter.  */
static int64_t cpu_get_icount_locked(void)
{
    int64_t icount = cpu_get_icount_raw();
    return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
}

int64_t cpu_get_icount(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}
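
/*
 * Illustrative note (not from the original source): this is the classic
 * seqlock reader pattern.  The loop re-reads whenever a writer raced
 * with us (seqlock_read_retry() returns true), so lock-free readers
 * always see a consistent (bias, icount) pair.  Writers serialize on
 * the BQL, per the comment in TimersState.
 */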

int64_t cpu_icount_to_ns(int64_t icount)
{
    return icount << icount_time_shift;
}
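
/*
 * Worked example (not from the original source): each executed
 * instruction advances the virtual clock by 2^icount_time_shift ns.
 * With icount_time_shift == 3 an instruction costs 8 ns, i.e. the
 * guest appears to run at 1e9 / 8 = 125 million instructions per
 * second, matching the "125MIPS" default chosen in configure_icount().
 */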

/* Return the time elapsed in VM between vm_start and vm_stop.  Unless
 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 * counter.
 *
 * Caller must hold the BQL
 */
int64_t cpu_get_ticks(void)
{
    int64_t ticks;

    if (use_icount) {
        return cpu_get_icount();
    }

    ticks = timers_state.cpu_ticks_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += cpu_get_host_ticks();
    }

    if (timers_state.cpu_ticks_prev > ticks) {
        /* Note: non-increasing ticks may happen if the host uses
           software suspend */
        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        ticks = timers_state.cpu_ticks_prev;
    }

    timers_state.cpu_ticks_prev = ticks;
    return ticks;
}

static int64_t cpu_get_clock_locked(void)
{
    int64_t time;

    time = timers_state.cpu_clock_offset;
    if (timers_state.cpu_ticks_enabled) {
        time += get_clock();
    }

    return time;
}

/* Return the monotonic time elapsed in VM, i.e.,
 * the time between vm_start and vm_stop
 */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = cpu_get_clock_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return ti;
}

/* enable cpu_get_ticks()
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* disable cpu_get_ticks() : the clock is stopped. You must not call
 * cpu_get_ticks() after that.
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle, real and virtual time will be aligned in
   the IO wait loop.  */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)

static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex.  */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        icount_time_shift++;
    }
    last_delta = delta;
    timers_state.qemu_icount_bias = cur_icount
                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

static void icount_adjust_rt(void *opaque)
{
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_adjust();
}

static void icount_adjust_vm(void *opaque)
{
    timer_mod(icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              NANOSECONDS_PER_SECOND / 10);
    icount_adjust();
}

static int64_t qemu_icount_round(int64_t count)
{
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
}
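
/*
 * Worked example (not from the original source): qemu_icount_round()
 * converts a nanosecond budget into a whole number of instructions,
 * rounding up.  With icount_time_shift == 3, a 20 ns deadline yields
 * (20 + 7) >> 3 == 3 instructions, since 2 instructions would only
 * cover 16 ns of virtual time.
 */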

static void icount_warp_rt(void)
{
    unsigned seq;
    int64_t warp_start;

    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    do {
        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        warp_start = vm_clock_warp_start;
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));

    if (warp_start == -1) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
                                     cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        timers_state.qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;
    seqlock_write_end(&timers_state.vm_clock_seqlock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}

void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    AioContext *aio_context;
    assert(qtest_enabled());
    aio_context = qemu_get_aio_context();
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);

        seqlock_write_begin(&timers_state.vm_clock_seqlock);
        timers_state.qemu_icount_bias += warp;
        seqlock_write_end(&timers_state.vm_clock_seqlock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}

void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
        return;
    }

    if (!all_cpu_threads_idle()) {
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount.  */
        return;
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        static bool notified;
        if (!icount_sleep && !notified) {
            error_report("WARNING: icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no-sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * It is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            timers_state.qemu_icount_bias += deadline;
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time (related to the time left until the next event) has
             * passed.  The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This keeps the warps from being visible externally; for example,
             * you will not be sending network packets continuously instead of
             * every 100ms.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
                vm_clock_warp_start = clock;
            }
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            timer_mod_anticipate(icount_warp_timer, clock + deadline);
        }
    } else if (deadline == 0) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}
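
/*
 * Summary (not from the original source): once all vCPUs are idle, the
 * deadline above is handled in one of three ways:
 *   deadline < 0  -> no QEMU_CLOCK_VIRTUAL timer pending; nothing to warp
 *                    (warn once if sleep is disabled).
 *   deadline > 0  -> sleep=off: jump qemu_icount_bias forward immediately;
 *                    sleep=on: arm icount_warp_timer so the jump happens
 *                    only after comparable real time has passed.
 *   deadline == 0 -> a timer is already due; just notify the clock.
 */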

static void qemu_account_warp_timer(void)
{
    if (!use_icount || !icount_sleep) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
        return;
    }

    timer_del(icount_warp_timer);
    icount_warp_rt();
}

static bool icount_state_needed(void *opaque)
{
    return use_icount;
}

/*
 * This is a subsection for icount migration.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};

static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
{
    double pct;
    double throttle_ratio;
    long sleeptime_ns;

    if (!cpu_throttle_get_percentage()) {
        return;
    }

    pct = (double)cpu_throttle_get_percentage() / 100;
    throttle_ratio = pct / (1 - pct);
    sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);

    qemu_mutex_unlock_iothread();
    atomic_set(&cpu->throttle_thread_scheduled, 0);
    g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
    qemu_mutex_lock_iothread();
}
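
/*
 * Worked example (not from the original source): at 75% throttle,
 * pct = 0.75 and throttle_ratio = 0.75 / 0.25 = 3, so each vCPU sleeps
 * 3 * 10 ms = 30 ms per tick.  The timer below re-arms every
 * 10 ms / (1 - pct) = 40 ms, giving 10 ms of run time out of every
 * 40 ms, i.e. the requested 75% reduction.
 */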

static void cpu_throttle_timer_tick(void *opaque)
{
    CPUState *cpu;
    double pct;

    /* Stop the timer if needed */
    if (!cpu_throttle_get_percentage()) {
        return;
    }
    CPU_FOREACH(cpu) {
        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
            async_run_on_cpu(cpu, cpu_throttle_thread,
                             RUN_ON_CPU_NULL);
        }
    }

    pct = (double)cpu_throttle_get_percentage() / 100;
    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                              CPU_THROTTLE_TIMESLICE_NS / (1 - pct));
}

void cpu_throttle_set(int new_throttle_pct)
{
    /* Ensure throttle percentage is within valid range */
    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);

    atomic_set(&throttle_percentage, new_throttle_pct);

    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                              CPU_THROTTLE_TIMESLICE_NS);
}

void cpu_throttle_stop(void)
{
    atomic_set(&throttle_percentage, 0);
}

bool cpu_throttle_active(void)
{
    return (cpu_throttle_get_percentage() != 0);
}

int cpu_throttle_get_percentage(void)
{
    return atomic_read(&throttle_percentage);
}

void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                  cpu_throttle_timer_tick, NULL);
}

void configure_icount(QemuOpts *opts, Error **errp)
{
    const char *option;
    char *rem_str = NULL;

    option = qemu_opt_get(opts, "shift");
    if (!option) {
        if (qemu_opt_get(opts, "align") != NULL) {
            error_setg(errp, "Please specify shift option when using align");
        }
        return;
    }

    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                         icount_timer_cb, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);

    if (icount_align_option && !icount_sleep) {
        error_setg(errp, "align=on and sleep=off are incompatible");
    }
    if (strcmp(option, "auto") != 0) {
        errno = 0;
        icount_time_shift = strtol(option, &rem_str, 0);
        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
            error_setg(errp, "icount: Invalid shift value");
        }
        use_icount = 1;
        return;
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
    } else if (!icount_sleep) {
        error_setg(errp, "shift=auto and sleep=off are incompatible");
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway.  */
    icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers.  */
    icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
                                   icount_adjust_rt, NULL);
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                   icount_adjust_vm, NULL);
    timer_mod(icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              NANOSECONDS_PER_SECOND / 10);
}
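
/*
 * Illustrative usage (not from the original source): the options parsed
 * above correspond to the -icount command line switch, e.g.
 *
 *   qemu-system-foo -icount shift=7 ...            # fixed 128 ns/insn
 *   qemu-system-foo -icount shift=auto,sleep=on    # adaptive (use_icount == 2)
 *
 * "shift=auto" enables the self-tuning mode driven by icount_adjust(),
 * while a numeric shift fixes the instruction-to-ns conversion factor.
 */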

/***********************************************************/
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    CPU_FOREACH(cpu) {
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}

void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_state(cpu);
    }
}

void cpu_synchronize_all_post_reset(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_reset(cpu);
    }
}

void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_init(cpu);
    }
}

static int do_vm_stop(RunState state)
{
    int ret = 0;

    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        qapi_event_send_stop(&error_abort);
    }

    bdrv_drain_all();
    replay_disable_events();
    ret = bdrv_flush_all();

    return ret;
}

static bool cpu_can_run(CPUState *cpu)
{
    if (cpu->stop) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return false;
    }
    return true;
}

static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}

#ifdef CONFIG_LINUX
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!");
    abort();
}

static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code,
                      (void *)(intptr_t)siginfo->ssi_addr)) {
        sigbus_reraise();
    }
}

static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
                sigbus_reraise();
            }
            break;
        default:
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
}

#else /* !CONFIG_LINUX */

static void qemu_init_sigbus(void)
{
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
}
#endif /* !CONFIG_LINUX */

#ifndef _WIN32
static void dummy_signal(int sig)
{
}

static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);
    r = kvm_set_signal_mask(cpu, &set);
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

#else /* _WIN32 */
static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    abort();
}
#endif /* _WIN32 */

static QemuMutex qemu_global_mutex;
static QemuCond qemu_io_proceeded_cond;
static unsigned iothread_requesting_mutex;

static QemuThread io_thread;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;

void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}

void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}

static void qemu_kvm_destroy_vcpu(CPUState *cpu)
{
    if (kvm_destroy_vcpu(cpu) < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}

static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}

static void qemu_wait_io_event_common(CPUState *cpu)
{
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        qemu_cond_broadcast(&qemu_pause_cond);
    }
    process_queued_cpu_work(cpu);
    cpu->thread_kicked = false;
}

static void qemu_tcg_wait_io_event(CPUState *cpu)
{
    while (all_cpu_threads_idle()) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    while (iothread_requesting_mutex) {
        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
    }

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}

static void qemu_kvm_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    qemu_kvm_eat_signals(cpu);
    qemu_wait_io_event_common(cpu);
}

static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    qemu_kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    return NULL;
}

static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    fprintf(stderr, "qtest is not supported under Windows\n");
    exit(1);
#else
    CPUState *cpu = arg;
    sigset_t waitset;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    current_cpu = cpu;
    while (1) {
        current_cpu = NULL;
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        current_cpu = cpu;
        qemu_wait_io_event_common(cpu);
    }

    return NULL;
#endif
}

static int64_t tcg_get_icount_limit(void)
{
    int64_t deadline;

    if (replay_mode != REPLAY_MODE_PLAY) {
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        /* Maintain prior (possibly buggy) behaviour where if no deadline
         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
         * nanoseconds.
         */
        if ((deadline < 0) || (deadline > INT32_MAX)) {
            deadline = INT32_MAX;
        }

        return qemu_icount_round(deadline);
    } else {
        return replay_get_instructions();
    }
}

static void handle_icount_deadline(void)
{
    if (use_icount) {
        int64_t deadline =
            qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        if (deadline == 0) {
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        }
    }
}

static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    if (use_icount) {
        int64_t count;
        int decr;
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                     + cpu->icount_extra);
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        count = tcg_get_icount_limit();
        timers_state.qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        cpu->icount_decr.u16.low = decr;
        cpu->icount_extra = count;
    }
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    tcg_time += profile_getclock() - ti;
#endif
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag.  */
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                     + cpu->icount_extra);
        cpu->icount_decr.u32 = 0;
        cpu->icount_extra = 0;
        replay_account_executed_instructions();
    }
    return ret;
}
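
/*
 * Worked example (not from the original source): the instruction budget
 * is split between the 16-bit icount_decr.u16.low field, which the
 * translated code decrements directly, and icount_extra for the rest.
 * A limit of 100000 instructions becomes decr = 0xffff (65535) and
 * icount_extra = 34465; the execution loop inside cpu_exec() then
 * refills u16.low from icount_extra as it drains.
 */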

/* Destroy any remaining vCPUs which have been unplugged and have
 * finished running
 */
static void deal_with_unplugged_cpus(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (cpu->unplug && !cpu_can_run(cpu)) {
            qemu_tcg_destroy_vcpu(cpu);
            cpu->created = false;
            qemu_cond_signal(&qemu_cpu_cond);
            break;
        }
    }
}

static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
        cpu->can_do_io = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            qemu_wait_io_event_common(cpu);
        }
    }

    /* process any pending work */
    atomic_mb_set(&exit_request, 1);

    cpu = first_cpu;

    while (1) {
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        if (!cpu) {
            cpu = first_cpu;
        }

        for (; cpu != NULL && !exit_request; cpu = CPU_NEXT(cpu)) {

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;
                r = tcg_cpu_exec(cpu);
                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                }
            } else if (cpu->stop || cpu->stopped) {
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

        } /* for cpu.. */

        /* Pairs with smp_wmb in qemu_cpu_kick.  */
        atomic_mb_set(&exit_request, 0);

        handle_icount_deadline();

        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
        deal_with_unplugged_cpus();
    }

    return NULL;
}
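
/*
 * Note (not from the original source): this is the single-threaded TCG
 * scheduler.  One host thread round-robins every vCPU via CPU_NEXT(),
 * runs each until it exits or exit_request is raised, then services
 * I/O events and unplug requests before starting the next pass.  MTTCG,
 * once enabled, is intended to replace this with one such loop per vCPU
 * thread.
 */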

static void *qemu_hax_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;
    qemu_thread_get_self(cpu->thread);
    qemu_mutex_lock(&qemu_global_mutex);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->halted = 0;
    current_cpu = cpu;

    hax_init_vcpu(cpu);
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        if (cpu_can_run(cpu)) {
            r = hax_smp_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }

        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
#ifdef _WIN32
        SleepEx(0, TRUE);
#endif
        qemu_wait_io_event_common(cpu);
    }
    return NULL;
}

#ifdef _WIN32
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
#endif

static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    if (cpu->thread_kicked) {
        return;
    }
    cpu->thread_kicked = true;
    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    if (!qemu_cpu_is_self(cpu)) {
        if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
            fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
                    __func__, GetLastError());
            exit(1);
        }
    }
#endif
}

static void qemu_cpu_kick_no_halt(void)
{
    CPUState *cpu;
    /* Ensure whatever caused the exit has reached the CPU threads before
     * writing exit_request.
     */
    atomic_mb_set(&exit_request, 1);
    cpu = atomic_mb_read(&tcg_current_cpu);
    if (cpu) {
        cpu_exit(cpu);
    }
}

void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (tcg_enabled()) {
        qemu_cpu_kick_no_halt();
    } else {
        if (hax_enabled()) {
            /*
             * FIXME: race condition with the exit_request check in
             * hax_vcpu_hax_exec
             */
            cpu->exit_request = 1;
        }
        qemu_cpu_kick_thread(cpu);
    }
}

void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}

bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}

bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}

static __thread bool iothread_locked = false;

bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}

void qemu_mutex_lock_iothread(void)
{
    atomic_inc(&iothread_requesting_mutex);
    /* In the simple case there is no need to bump the VCPU thread out of
     * TCG code execution.
     */
    if (!tcg_enabled() || qemu_in_vcpu_thread() ||
        !first_cpu || !first_cpu->created) {
        qemu_mutex_lock(&qemu_global_mutex);
        atomic_dec(&iothread_requesting_mutex);
    } else {
        if (qemu_mutex_trylock(&qemu_global_mutex)) {
            qemu_cpu_kick_no_halt();
            qemu_mutex_lock(&qemu_global_mutex);
        }
        atomic_dec(&iothread_requesting_mutex);
        qemu_cond_broadcast(&qemu_io_proceeded_cond);
    }
    iothread_locked = true;
}

void qemu_mutex_unlock_iothread(void)
{
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
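
/*
 * Note (not from the original source): the trylock dance above exists
 * because the TCG thread holds qemu_global_mutex (the BQL) while it
 * executes translated code.  If the lock is contended, the caller kicks
 * the TCG thread out of its execution loop with qemu_cpu_kick_no_halt()
 * so the BQL is released promptly, then blocks on the lock as usual.
 */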

static bool all_vcpus_paused(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu->stopped) {
            return false;
        }
    }

    return true;
}

void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        cpu->stop = true;
        qemu_cpu_kick(cpu);
    }

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
        if (!kvm_enabled()) {
            CPU_FOREACH(cpu) {
                cpu->stop = false;
                cpu->stopped = true;
            }
            return;
        }
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }
}

void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}

void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}

void cpu_remove(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
}

void cpu_remove_sync(CPUState *cpu)
{
    cpu_remove(cpu);
    while (cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

/* Size of the temporary buffer used when forming a thread name */
#define VCPU_THREAD_NAME_SIZE 16

static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *tcg_halt_cond;
    static QemuThread *tcg_cpu_thread;

    /* share a single thread for all cpus with TCG */
    if (!tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);
        tcg_halt_cond = cpu->halt_cond;
        snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);
        qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                           cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
        tcg_cpu_thread = cpu->thread;
    } else {
        cpu->thread = tcg_cpu_thread;
        cpu->halt_cond = tcg_halt_cond;
    }
}

static void qemu_hax_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);

    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
    cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

static void qemu_kvm_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

static void qemu_dummy_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
                       QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

void qemu_init_vcpu(CPUState *cpu)
{
    cpu->nr_cores = smp_cores;
    cpu->nr_threads = smp_threads;
    cpu->stopped = true;

    if (!cpu->as) {
        /* If the target cpu hasn't set up any address spaces itself,
         * give it the default one.
         */
        AddressSpace *as = address_space_init_shareable(cpu->memory,
                                                        "cpu-memory");
        cpu->num_ases = 1;
        cpu_address_space_init(cpu, as, 0);
    }

    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
    } else if (hax_enabled()) {
        qemu_hax_start_vcpu(cpu);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }
}

void cpu_stop_current(void)
{
    if (current_cpu) {
        current_cpu->stop = false;
        current_cpu->stopped = true;
        cpu_exit(current_cpu);
        qemu_cond_broadcast(&qemu_pause_cond);
    }
}

int vm_stop(RunState state)
{
    if (qemu_in_vcpu_thread()) {
        qemu_system_vmstop_request_prepare();
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return 0;
    }

    return do_vm_stop(state);
}

/**
 * Prepare for (re)starting the VM.
 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
 * running or in case of an error condition), 0 otherwise.
 */
int vm_prepare_start(void)
{
    RunState requested;
    int res = 0;

    qemu_vmstop_requested(&requested);
    if (runstate_is_running() && requested == RUN_STATE__MAX) {
        return -1;
    }

    /* Ensure that a STOP/RESUME pair of events is emitted if a
     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
     * example, according to documentation is always followed by
     * the STOP event.
     */
    if (runstate_is_running()) {
        qapi_event_send_stop(&error_abort);
        res = -1;
    } else {
        replay_enable_events();
        cpu_enable_ticks();
        runstate_set(RUN_STATE_RUNNING);
        vm_state_notify(1, RUN_STATE_RUNNING);
    }

    /* We are sending this now, but the CPUs will be resumed shortly after */
    qapi_event_send_resume(&error_abort);
    return res;
}

void vm_start(void)
{
    if (!vm_prepare_start()) {
        resume_all_vcpus();
    }
}

/* Does a state transition even if the VM is already stopped;
   the current state is forgotten forever */
int vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        return vm_stop(state);
    } else {
        runstate_set(state);

        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
        return bdrv_flush_all();
    }
}

void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still lack it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}

CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
#elif defined(TARGET_TRICORE)
        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
        CPUTriCoreState *env = &tricore_cpu->env;
#endif

        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->arch = CPU_INFO_ARCH_X86;
        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->arch = CPU_INFO_ARCH_PPC;
        info->value->u.ppc.nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->arch = CPU_INFO_ARCH_SPARC;
        info->value->u.q_sparc.pc = env->pc;
        info->value->u.q_sparc.npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->arch = CPU_INFO_ARCH_MIPS;
        info->value->u.q_mips.PC = env->active_tc.PC;
#elif defined(TARGET_TRICORE)
        info->value->arch = CPU_INFO_ARCH_TRICORE;
        info->value->u.tricore.PC = env->PC;
#else
        info->value->arch = CPU_INFO_ARCH_OTHER;
#endif

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

void qmp_memsave(int64_t addr, int64_t size, const char *filename,
                 bool has_cpu, int64_t cpu_index, Error **errp)
{
    FILE *f;
    uint32_t l;
    CPUState *cpu;
    uint8_t buf[1024];
    int64_t orig_addr = addr, orig_size = size;

    if (!has_cpu) {
        cpu_index = 0;
    }

    cpu = qemu_get_cpu(cpu_index);
    if (cpu == NULL) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
                   "a CPU number");
        return;
    }

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
                             " specified", orig_addr, orig_size);
            goto exit;
        }
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
                  Error **errp)
{
    FILE *f;
    uint32_t l;
    uint8_t buf[1024];

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        cpu_physical_memory_read(addr, buf, l);
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}

void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
{
    if (!use_icount) {
        return;
    }

    cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
                (cpu_get_clock() - cpu_get_icount()) / SCALE_MS);
    if (icount_align_option) {
        cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay / SCALE_MS);
        cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n",
                    max_advance / SCALE_MS);
    } else {
        cpu_fprintf(f, "Max guest delay NA\n");
        cpu_fprintf(f, "Max guest advance NA\n");
    }
}