cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 /* Needed early for CONFIG_BSD etc. */
  26 #include "qemu/osdep.h"
  27 #include "qemu-common.h"
  28 #include "qemu/config-file.h"
  29 #include "cpu.h"
  30 #include "monitor/monitor.h"
  31 #include "qapi/qmp/qerror.h"
  32 #include "qemu/error-report.h"
  33 #include "sysemu/sysemu.h"
  34 #include "sysemu/block-backend.h"
  35 #include "exec/gdbstub.h"
  36 #include "sysemu/dma.h"
  37 #include "sysemu/hw_accel.h"
  38 #include "sysemu/kvm.h"
  39 #include "sysemu/hax.h"
  40 #include "qmp-commands.h"
  41 #include "exec/exec-all.h"
  42
  43 #include "qemu/thread.h"
  44 #include "sysemu/cpus.h"
  45 #include "sysemu/qtest.h"
  46 #include "qemu/main-loop.h"
  47 #include "qemu/bitmap.h"
  48 #include "qemu/seqlock.h"
  49 #include "tcg.h"
  50 #include "qapi-event.h"
  51 #include "hw/nmi.h"
  52 #include "sysemu/replay.h"
  53
#ifdef CONFIG_LINUX

#include <sys/prctl.h>

/* Fallback values for the machine-check prctl constants, used only when
 * <sys/prctl.h> does not already define them.  They are passed to prctl()
 * in qemu_init_sigbus().
 */
#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */
  71
/* Not referenced in the visible part of this file; presumably updated by
 * code elsewhere -- TODO confirm.
 */
int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
/* Timer created in cpu_ticks_init(); its callback is cpu_throttle_timer_tick(). */
static QEMUTimer *throttle_timer;
/* Accessed with atomic_set()/atomic_read(); 0 means throttling is off. */
static unsigned int throttle_percentage;

/* cpu_throttle_set() clamps the requested percentage to [MIN, MAX]. */
#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/* Base timeslice, in nanoseconds, used to derive sleep and re-arm times. */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
  82
  83 bool cpu_is_stopped(CPUState *cpu)
  84 {
  85     return cpu->stopped || !runstate_is_running();
  86 }
  87
  88 static bool cpu_thread_is_idle(CPUState *cpu)
  89 {
  90     if (cpu->stop || cpu->queued_work_first) {
  91         return false;
  92     }
  93     if (cpu_is_stopped(cpu)) {
  94         return true;
  95     }
  96     if (!cpu->halted || cpu_has_work(cpu) ||
  97         kvm_halt_in_kernel()) {
  98         return false;
  99     }
 100     return true;
 101 }
 102
 103 static bool all_cpu_threads_idle(void)
 104 {
 105     CPUState *cpu;
 106
 107     CPU_FOREACH(cpu) {
 108         if (!cpu_thread_is_idle(cpu)) {
 109             return false;
 110         }
 111     }
 112     return true;
 113 }
 114
/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

/* Loaded from the "sleep" icount option in configure_icount(). */
static bool icount_sleep = true;
/* -1 means no warp is pending; icount_warp_rt() resets it to -1. */
static int64_t vm_clock_warp_start = -1;
/* Conversion factor from emulated instructions to virtual clock ticks.  */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
#define MAX_ICOUNT_SHIFT 10

/* Adaptive-mode adjustment timers, created in configure_icount(). */
static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
/* Created in configure_icount() only when icount_sleep is true. */
static QEMUTimer *icount_warp_timer;
 130
/* Book-keeping for the tick/clock/icount values computed in this file. */
typedef struct TimersState {
    /* Protected by BQL.  */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read out of BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    int64_t cpu_clock_offset;
    /* Non-zero between cpu_enable_ticks() and cpu_disable_ticks(). */
    int32_t cpu_ticks_enabled;
    /* Only referenced by the vmstate_timers field list in the visible code. */
    int64_t dummy;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
} TimersState;
 149
/* The single TimersState instance; registered for migration in
 * cpu_ticks_init().
 */
static TimersState timers_state;
/* Set by qemu_tcg_configure() from the "thread" option or its default. */
bool mttcg_enabled;
 152
 153 /*
 154  * We default to false if we know other options have been enabled
 155  * which are currently incompatible with MTTCG. Otherwise when each
 156  * guest (target) has been updated to support:
 157  *   - atomic instructions
 158  *   - memory ordering primitives (barriers)
 159  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 160  *
 161  * Once a guest architecture has been converted to the new primitives
 162  * there are two remaining limitations to check.
 163  *
 164  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 165  * - The host must have a stronger memory order than the guest
 166  *
 167  * It may be possible in future to support strong guests on weak hosts
 168  * but that will require tagging all load/stores in a guest with their
 169  * implicit memory order requirements which would likely slow things
 170  * down a lot.
 171  */
 172
 173 static bool check_tcg_memory_orders_compatible(void)
 174 {
 175 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 176     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 177 #else
 178     return false;
 179 #endif
 180 }
 181
 182 static bool default_mttcg_enabled(void)
 183 {
 184     QemuOpts *icount_opts = qemu_find_opts_singleton("icount");
 185     const char *rr = qemu_opt_get(icount_opts, "rr");
 186
 187     if (rr || TCG_OVERSIZED_GUEST) {
 188         return false;
 189     } else {
 190 #ifdef TARGET_SUPPORTS_MTTCG
 191         return check_tcg_memory_orders_compatible();
 192 #else
 193         return false;
 194 #endif
 195     }
 196 }
 197
 198 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 199 {
 200     const char *t = qemu_opt_get(opts, "thread");
 201     if (t) {
 202         if (strcmp(t, "multi") == 0) {
 203             if (TCG_OVERSIZED_GUEST) {
 204                 error_setg(errp, "No MTTCG when guest word size > hosts");
 205             } else {
 206                 if (!check_tcg_memory_orders_compatible()) {
 207                     error_report("Guest expects a stronger memory ordering "
 208                                  "than the host provides");
 209                     error_printf("This may cause strange/hard to debug errors");
 210                 }
 211                 mttcg_enabled = true;
 212             }
 213         } else if (strcmp(t, "single") == 0) {
 214             mttcg_enabled = false;
 215         } else {
 216             error_setg(errp, "Invalid 'thread' setting %s", t);
 217         }
 218     } else {
 219         mttcg_enabled = default_mttcg_enabled();
 220     }
 221 }
 222
 223 int64_t cpu_get_icount_raw(void)
 224 {
 225     int64_t icount;
 226     CPUState *cpu = current_cpu;
 227
 228     icount = timers_state.qemu_icount;
 229     if (cpu) {
 230         if (!cpu->can_do_io) {
 231             fprintf(stderr, "Bad icount read\n");
 232             exit(1);
 233         }
 234         icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
 235     }
 236     return icount;
 237 }
 238
 239 /* Return the virtual CPU time, based on the instruction counter.  */
 240 static int64_t cpu_get_icount_locked(void)
 241 {
 242     int64_t icount = cpu_get_icount_raw();
 243     return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
 244 }
 245
 246 int64_t cpu_get_icount(void)
 247 {
 248     int64_t icount;
 249     unsigned start;
 250
 251     do {
 252         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 253         icount = cpu_get_icount_locked();
 254     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 255
 256     return icount;
 257 }
 258
 259 int64_t cpu_icount_to_ns(int64_t icount)
 260 {
 261     return icount << icount_time_shift;
 262 }
 263
 264 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 265  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 266  * counter.
 267  *
 268  * Caller must hold the BQL
 269  */
 270 int64_t cpu_get_ticks(void)
 271 {
 272     int64_t ticks;
 273
 274     if (use_icount) {
 275         return cpu_get_icount();
 276     }
 277
 278     ticks = timers_state.cpu_ticks_offset;
 279     if (timers_state.cpu_ticks_enabled) {
 280         ticks += cpu_get_host_ticks();
 281     }
 282
 283     if (timers_state.cpu_ticks_prev > ticks) {
 284         /* Note: non increasing ticks may happen if the host uses
 285            software suspend */
 286         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 287         ticks = timers_state.cpu_ticks_prev;
 288     }
 289
 290     timers_state.cpu_ticks_prev = ticks;
 291     return ticks;
 292 }
 293
 294 static int64_t cpu_get_clock_locked(void)
 295 {
 296     int64_t time;
 297
 298     time = timers_state.cpu_clock_offset;
 299     if (timers_state.cpu_ticks_enabled) {
 300         time += get_clock();
 301     }
 302
 303     return time;
 304 }
 305
 306 /* Return the monotonic time elapsed in VM, i.e.,
 307  * the time between vm_start and vm_stop
 308  */
 309 int64_t cpu_get_clock(void)
 310 {
 311     int64_t ti;
 312     unsigned start;
 313
 314     do {
 315         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 316         ti = cpu_get_clock_locked();
 317     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 318
 319     return ti;
 320 }
 321
 322 /* enable cpu_get_ticks()
 323  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 324  */
 325 void cpu_enable_ticks(void)
 326 {
 327     /* Here, the really thing protected by seqlock is cpu_clock_offset. */
 328     seqlock_write_begin(&timers_state.vm_clock_seqlock);
 329     if (!timers_state.cpu_ticks_enabled) {
 330         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 331         timers_state.cpu_clock_offset -= get_clock();
 332         timers_state.cpu_ticks_enabled = 1;
 333     }
 334     seqlock_write_end(&timers_state.vm_clock_seqlock);
 335 }
 336
 337 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 338  * cpu_get_ticks() after that.
 339  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 340  */
 341 void cpu_disable_ticks(void)
 342 {
 343     /* Here, the really thing protected by seqlock is cpu_clock_offset. */
 344     seqlock_write_begin(&timers_state.vm_clock_seqlock);
 345     if (timers_state.cpu_ticks_enabled) {
 346         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 347         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 348         timers_state.cpu_ticks_enabled = 0;
 349     }
 350     seqlock_write_end(&timers_state.vm_clock_seqlock);
 351 }
 352
/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.
   ICOUNT_WOBBLE is the tolerance icount_adjust() applies before it
   changes icount_time_shift.  */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 358
 359 static void icount_adjust(void)
 360 {
 361     int64_t cur_time;
 362     int64_t cur_icount;
 363     int64_t delta;
 364
 365     /* Protected by TimersState mutex.  */
 366     static int64_t last_delta;
 367
 368     /* If the VM is not running, then do nothing.  */
 369     if (!runstate_is_running()) {
 370         return;
 371     }
 372
 373     seqlock_write_begin(&timers_state.vm_clock_seqlock);
 374     cur_time = cpu_get_clock_locked();
 375     cur_icount = cpu_get_icount_locked();
 376
 377     delta = cur_icount - cur_time;
 378     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 379     if (delta > 0
 380         && last_delta + ICOUNT_WOBBLE < delta * 2
 381         && icount_time_shift > 0) {
 382         /* The guest is getting too far ahead.  Slow time down.  */
 383         icount_time_shift--;
 384     }
 385     if (delta < 0
 386         && last_delta - ICOUNT_WOBBLE > delta * 2
 387         && icount_time_shift < MAX_ICOUNT_SHIFT) {
 388         /* The guest is getting too far behind.  Speed time up.  */
 389         icount_time_shift++;
 390     }
 391     last_delta = delta;
 392     timers_state.qemu_icount_bias = cur_icount
 393                               - (timers_state.qemu_icount << icount_time_shift);
 394     seqlock_write_end(&timers_state.vm_clock_seqlock);
 395 }
 396
 397 static void icount_adjust_rt(void *opaque)
 398 {
 399     timer_mod(icount_rt_timer,
 400               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 401     icount_adjust();
 402 }
 403
 404 static void icount_adjust_vm(void *opaque)
 405 {
 406     timer_mod(icount_vm_timer,
 407                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 408                    NANOSECONDS_PER_SECOND / 10);
 409     icount_adjust();
 410 }
 411
 412 static int64_t qemu_icount_round(int64_t count)
 413 {
 414     return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
 415 }
 416
 417 static void icount_warp_rt(void)
 418 {
 419     unsigned seq;
 420     int64_t warp_start;
 421
 422     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 423      * changes from -1 to another value, so the race here is okay.
 424      */
 425     do {
 426         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 427         warp_start = vm_clock_warp_start;
 428     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 429
 430     if (warp_start == -1) {
 431         return;
 432     }
 433
 434     seqlock_write_begin(&timers_state.vm_clock_seqlock);
 435     if (runstate_is_running()) {
 436         int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
 437                                      cpu_get_clock_locked());
 438         int64_t warp_delta;
 439
 440         warp_delta = clock - vm_clock_warp_start;
 441         if (use_icount == 2) {
 442             /*
 443              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 444              * far ahead of real time.
 445              */
 446             int64_t cur_icount = cpu_get_icount_locked();
 447             int64_t delta = clock - cur_icount;
 448             warp_delta = MIN(warp_delta, delta);
 449         }
 450         timers_state.qemu_icount_bias += warp_delta;
 451     }
 452     vm_clock_warp_start = -1;
 453     seqlock_write_end(&timers_state.vm_clock_seqlock);
 454
 455     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 456         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 457     }
 458 }
 459
 460 static void icount_timer_cb(void *opaque)
 461 {
 462     /* No need for a checkpoint because the timer already synchronizes
 463      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 464      */
 465     icount_warp_rt();
 466 }
 467
 468 void qtest_clock_warp(int64_t dest)
 469 {
 470     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 471     AioContext *aio_context;
 472     assert(qtest_enabled());
 473     aio_context = qemu_get_aio_context();
 474     while (clock < dest) {
 475         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 476         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 477
 478         seqlock_write_begin(&timers_state.vm_clock_seqlock);
 479         timers_state.qemu_icount_bias += warp;
 480         seqlock_write_end(&timers_state.vm_clock_seqlock);
 481
 482         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 483         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 484         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 485     }
 486     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 487 }
 488
 489 void qemu_start_warp_timer(void)
 490 {
 491     int64_t clock;
 492     int64_t deadline;
 493
 494     if (!use_icount) {
 495         return;
 496     }
 497
 498     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 499      * do not fire, so computing the deadline does not make sense.
 500      */
 501     if (!runstate_is_running()) {
 502         return;
 503     }
 504
 505     /* warp clock deterministically in record/replay mode */
 506     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 507         return;
 508     }
 509
 510     if (!all_cpu_threads_idle()) {
 511         return;
 512     }
 513
 514     if (qtest_enabled()) {
 515         /* When testing, qtest commands advance icount.  */
 516         return;
 517     }
 518
 519     /* We want to use the earliest deadline from ALL vm_clocks */
 520     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 521     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 522     if (deadline < 0) {
 523         static bool notified;
 524         if (!icount_sleep && !notified) {
 525             error_report("WARNING: icount sleep disabled and no active timers");
 526             notified = true;
 527         }
 528         return;
 529     }
 530
 531     if (deadline > 0) {
 532         /*
 533          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 534          * sleep.  Otherwise, the CPU might be waiting for a future timer
 535          * interrupt to wake it up, but the interrupt never comes because
 536          * the vCPU isn't running any insns and thus doesn't advance the
 537          * QEMU_CLOCK_VIRTUAL.
 538          */
 539         if (!icount_sleep) {
 540             /*
 541              * We never let VCPUs sleep in no sleep icount mode.
 542              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 543              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 544              * It is useful when we want a deterministic execution time,
 545              * isolated from host latencies.
 546              */
 547             seqlock_write_begin(&timers_state.vm_clock_seqlock);
 548             timers_state.qemu_icount_bias += deadline;
 549             seqlock_write_end(&timers_state.vm_clock_seqlock);
 550             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 551         } else {
 552             /*
 553              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 554              * "real" time, (related to the time left until the next event) has
 555              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 556              * This avoids that the warps are visible externally; for example,
 557              * you will not be sending network packets continuously instead of
 558              * every 100ms.
 559              */
 560             seqlock_write_begin(&timers_state.vm_clock_seqlock);
 561             if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
 562                 vm_clock_warp_start = clock;
 563             }
 564             seqlock_write_end(&timers_state.vm_clock_seqlock);
 565             timer_mod_anticipate(icount_warp_timer, clock + deadline);
 566         }
 567     } else if (deadline == 0) {
 568         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 569     }
 570 }
 571
 572 static void qemu_account_warp_timer(void)
 573 {
 574     if (!use_icount || !icount_sleep) {
 575         return;
 576     }
 577
 578     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 579      * do not fire, so computing the deadline does not make sense.
 580      */
 581     if (!runstate_is_running()) {
 582         return;
 583     }
 584
 585     /* warp clock deterministically in record/replay mode */
 586     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 587         return;
 588     }
 589
 590     timer_del(icount_warp_timer);
 591     icount_warp_rt();
 592 }
 593
 594 static bool icount_state_needed(void *opaque)
 595 {
 596     return use_icount;
 597 }
 598
/*
 * This is a subsection for icount migration.
 * It carries the two icount fields of TimersState and is only included
 * when icount_state_needed() returns true.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 613
/*
 * Migration description for TimersState; registered against timers_state
 * in cpu_ticks_init().  cpu_clock_offset is declared with
 * VMSTATE_INT64_V(..., 2), i.e. tagged with version 2.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
 629
 630 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 631 {
 632     double pct;
 633     double throttle_ratio;
 634     long sleeptime_ns;
 635
 636     if (!cpu_throttle_get_percentage()) {
 637         return;
 638     }
 639
 640     pct = (double)cpu_throttle_get_percentage()/100;
 641     throttle_ratio = pct / (1 - pct);
 642     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
 643
 644     qemu_mutex_unlock_iothread();
 645     atomic_set(&cpu->throttle_thread_scheduled, 0);
 646     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
 647     qemu_mutex_lock_iothread();
 648 }
 649
 650 static void cpu_throttle_timer_tick(void *opaque)
 651 {
 652     CPUState *cpu;
 653     double pct;
 654
 655     /* Stop the timer if needed */
 656     if (!cpu_throttle_get_percentage()) {
 657         return;
 658     }
 659     CPU_FOREACH(cpu) {
 660         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 661             async_run_on_cpu(cpu, cpu_throttle_thread,
 662                              RUN_ON_CPU_NULL);
 663         }
 664     }
 665
 666     pct = (double)cpu_throttle_get_percentage()/100;
 667     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 668                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 669 }
 670
 671 void cpu_throttle_set(int new_throttle_pct)
 672 {
 673     /* Ensure throttle percentage is within valid range */
 674     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 675     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 676
 677     atomic_set(&throttle_percentage, new_throttle_pct);
 678
 679     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 680                                        CPU_THROTTLE_TIMESLICE_NS);
 681 }
 682
 683 void cpu_throttle_stop(void)
 684 {
 685     atomic_set(&throttle_percentage, 0);
 686 }
 687
 688 bool cpu_throttle_active(void)
 689 {
 690     return (cpu_throttle_get_percentage() != 0);
 691 }
 692
 693 int cpu_throttle_get_percentage(void)
 694 {
 695     return atomic_read(&throttle_percentage);
 696 }
 697
 698 void cpu_ticks_init(void)
 699 {
 700     seqlock_init(&timers_state.vm_clock_seqlock);
 701     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 702     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 703                                            cpu_throttle_timer_tick, NULL);
 704 }
 705
 706 void configure_icount(QemuOpts *opts, Error **errp)
 707 {
 708     const char *option;
 709     char *rem_str = NULL;
 710
 711     option = qemu_opt_get(opts, "shift");
 712     if (!option) {
 713         if (qemu_opt_get(opts, "align") != NULL) {
 714             error_setg(errp, "Please specify shift option when using align");
 715         }
 716         return;
 717     }
 718
 719     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 720     if (icount_sleep) {
 721         icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 722                                          icount_timer_cb, NULL);
 723     }
 724
 725     icount_align_option = qemu_opt_get_bool(opts, "align", false);
 726
 727     if (icount_align_option && !icount_sleep) {
 728         error_setg(errp, "align=on and sleep=off are incompatible");
 729     }
 730     if (strcmp(option, "auto") != 0) {
 731         errno = 0;
 732         icount_time_shift = strtol(option, &rem_str, 0);
 733         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 734             error_setg(errp, "icount: Invalid shift value");
 735         }
 736         use_icount = 1;
 737         return;
 738     } else if (icount_align_option) {
 739         error_setg(errp, "shift=auto and align=on are incompatible");
 740     } else if (!icount_sleep) {
 741         error_setg(errp, "shift=auto and sleep=off are incompatible");
 742     }
 743
 744     use_icount = 2;
 745
 746     /* 125MIPS seems a reasonable initial guess at the guest speed.
 747        It will be corrected fairly quickly anyway.  */
 748     icount_time_shift = 3;
 749
 750     /* Have both realtime and virtual time triggers for speed adjustment.
 751        The realtime trigger catches emulated time passing too slowly,
 752        the virtual time trigger catches emulated time passing too fast.
 753        Realtime triggers occur even when idle, so use them less frequently
 754        than VM triggers.  */
 755     icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 756                                    icount_adjust_rt, NULL);
 757     timer_mod(icount_rt_timer,
 758                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 759     icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 760                                         icount_adjust_vm, NULL);
 761     timer_mod(icount_vm_timer,
 762                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 763                    NANOSECONDS_PER_SECOND / 10);
 764 }
 765
 766 /***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * they are no longer idle.
 */
 777
/* NULL while the kick timer is not running; see start_tcg_kick_timer()
 * and stop_tcg_kick_timer().
 */
static QEMUTimer *tcg_kick_vcpu_timer;
/* Read with atomic_mb_read() by qemu_cpu_kick_rr_cpu(). */
static CPUState *tcg_current_rr_cpu;

/* Interval between kicks: one tenth of NANOSECONDS_PER_SECOND. */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 782
 783 static inline int64_t qemu_tcg_next_kick(void)
 784 {
 785     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 786 }
 787
 788 /* Kick the currently round-robin scheduled vCPU */
 789 static void qemu_cpu_kick_rr_cpu(void)
 790 {
 791     CPUState *cpu;
 792     do {
 793         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 794         if (cpu) {
 795             cpu_exit(cpu);
 796         }
 797     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 798 }
 799
 800 static void kick_tcg_thread(void *opaque)
 801 {
 802     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 803     qemu_cpu_kick_rr_cpu();
 804 }
 805
 806 static void start_tcg_kick_timer(void)
 807 {
 808     if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 809         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 810                                            kick_tcg_thread, NULL);
 811         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 812     }
 813 }
 814
 815 static void stop_tcg_kick_timer(void)
 816 {
 817     if (tcg_kick_vcpu_timer) {
 818         timer_del(tcg_kick_vcpu_timer);
 819         tcg_kick_vcpu_timer = NULL;
 820     }
 821 }
 822
 823 /***********************************************************/
 824 void hw_error(const char *fmt, ...)
 825 {
 826     va_list ap;
 827     CPUState *cpu;
 828
 829     va_start(ap, fmt);
 830     fprintf(stderr, "qemu: hardware error: ");
 831     vfprintf(stderr, fmt, ap);
 832     fprintf(stderr, "\n");
 833     CPU_FOREACH(cpu) {
 834         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
 835         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
 836     }
 837     va_end(ap);
 838     abort();
 839 }
 840
 841 void cpu_synchronize_all_states(void)
 842 {
 843     CPUState *cpu;
 844
 845     CPU_FOREACH(cpu) {
 846         cpu_synchronize_state(cpu);
 847     }
 848 }
 849
 850 void cpu_synchronize_all_post_reset(void)
 851 {
 852     CPUState *cpu;
 853
 854     CPU_FOREACH(cpu) {
 855         cpu_synchronize_post_reset(cpu);
 856     }
 857 }
 858
 859 void cpu_synchronize_all_post_init(void)
 860 {
 861     CPUState *cpu;
 862
 863     CPU_FOREACH(cpu) {
 864         cpu_synchronize_post_init(cpu);
 865     }
 866 }
 867
 868 static int do_vm_stop(RunState state)
 869 {
 870     int ret = 0;
 871
 872     if (runstate_is_running()) {
 873         cpu_disable_ticks();
 874         pause_all_vcpus();
 875         runstate_set(state);
 876         vm_state_notify(0, state);
 877         qapi_event_send_stop(&error_abort);
 878     }
 879
 880     bdrv_drain_all();
 881     replay_disable_events();
 882     ret = bdrv_flush_all();
 883
 884     return ret;
 885 }
 886
 887 static bool cpu_can_run(CPUState *cpu)
 888 {
 889     if (cpu->stop) {
 890         return false;
 891     }
 892     if (cpu_is_stopped(cpu)) {
 893         return false;
 894     }
 895     return true;
 896 }
 897
 898 static void cpu_handle_guest_debug(CPUState *cpu)
 899 {
 900     gdb_set_stop_cpu(cpu);
 901     qemu_system_debug_request();
 902     cpu->stopped = true;
 903 }
 904
 905 #ifdef CONFIG_LINUX
 906 static void sigbus_reraise(void)
 907 {
 908     sigset_t set;
 909     struct sigaction action;
 910
 911     memset(&action, 0, sizeof(action));
 912     action.sa_handler = SIG_DFL;
 913     if (!sigaction(SIGBUS, &action, NULL)) {
 914         raise(SIGBUS);
 915         sigemptyset(&set);
 916         sigaddset(&set, SIGBUS);
 917         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
 918     }
 919     perror("Failed to re-raise SIGBUS!\n");
 920     abort();
 921 }
 922
 923 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
 924 {
 925     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
 926         sigbus_reraise();
 927     }
 928
 929     if (current_cpu) {
 930         /* Called asynchronously in VCPU thread.  */
 931         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
 932             sigbus_reraise();
 933         }
 934     } else {
 935         /* Called synchronously (via signalfd) in main thread.  */
 936         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
 937             sigbus_reraise();
 938         }
 939     }
 940 }
 941
 942 static void qemu_init_sigbus(void)
 943 {
 944     struct sigaction action;
 945
 946     memset(&action, 0, sizeof(action));
 947     action.sa_flags = SA_SIGINFO;
 948     action.sa_sigaction = sigbus_handler;
 949     sigaction(SIGBUS, &action, NULL);
 950
 951     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
 952 }
 953
/* Deliberately empty handler; installed for SIG_IPI by
 * qemu_kvm_init_cpu_signals().
 */
static void dummy_signal(int sig)
{
}
 957
/*
 * Set up signal handling for the calling KVM vCPU thread.
 *
 * Installs dummy_signal() as the SIG_IPI handler, removes SIGBUS from the
 * thread's signal mask, and passes KVM (through kvm_set_signal_mask()) a
 * copy of that mask with SIG_IPI removed as well.  The process exits if
 * kvm_set_signal_mask() reports an error.
 */
static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    /* With a NULL new set this only fetches the current thread mask. */
    pthread_sigmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIGBUS);
    pthread_sigmask(SIG_SETMASK, &set, NULL);
    /* SIG_IPI is dropped only from the copy handed to KVM. */
    sigdelset(&set, SIG_IPI);
    r = kvm_set_signal_mask(cpu, &set);
    if (r) {
        /* r is treated as a negative errno value. */
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}
 978
 979 static void qemu_kvm_eat_signals(CPUState *cpu)
 980 {
 981     struct timespec ts = { 0, 0 };
 982     siginfo_t siginfo;
 983     sigset_t waitset;
 984     sigset_t chkset;
 985     int r;
 986
 987     sigemptyset(&waitset);
 988     sigaddset(&waitset, SIG_IPI);
 989
 990     do {
 991         r = sigtimedwait(&waitset, &siginfo, &ts);
 992         if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
 993             perror("sigtimedwait");
 994             exit(1);
 995         }
 996
 997         r = sigpending(&chkset);
 998         if (r == -1) {
 999             perror("sigpending");
1000             exit(1);
1001         }
1002     } while (sigismember(&chkset, SIG_IPI));
1003 }
1004 #else /* !CONFIG_LINUX */
/* Non-Linux build: no SIGBUS handling is installed. */
static void qemu_init_sigbus(void)
{
}

/* Non-Linux build: nothing to consume. */
static void qemu_kvm_eat_signals(CPUState *cpu)
{
}

/* Non-Linux build: no per-vCPU signal setup. */
static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
}
1016 #endif /* !CONFIG_LINUX */
1017
/* Mutex taken by qemu_mutex_lock_iothread() and used with the conds below */
static QemuMutex qemu_global_mutex;

/* Filled in with the calling thread by qemu_init_cpu_loop() */
static QemuThread io_thread;

/* cpu creation */
/* Signalled by the vCPU thread functions when cpu->created changes */
static QemuCond qemu_cpu_cond;
/* system init */
/* Broadcast when a vCPU moves to the stopped state */
static QemuCond qemu_pause_cond;
1026
/*
 * One-time setup for the vCPU loop machinery: installs SIGBUS handling,
 * initializes the creation/pause condition variables and the global
 * mutex, and records the calling thread in io_thread.
 */
void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}
1036
/*
 * Run @func with @data on @cpu by delegating to do_run_on_cpu(), passing
 * the global mutex.
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1041
1042 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1043 {
1044     if (kvm_destroy_vcpu(cpu) < 0) {
1045         error_report("kvm_destroy_vcpu failed");
1046         exit(EXIT_FAILURE);
1047     }
1048 }
1049
/*
 * TCG counterpart of qemu_kvm_destroy_vcpu(); currently there is nothing
 * to tear down.
 */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1053
/*
 * Shared tail of the per-accelerator wait functions.
 *
 * Clears the thread_kicked flag, completes a pending stop request by
 * turning cpu->stop into cpu->stopped and waking qemu_pause_cond waiters,
 * then runs the work queued for @cpu.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        /* Let pause_all_vcpus() re-evaluate all_vcpus_paused(). */
        qemu_cond_broadcast(&qemu_pause_cond);
    }
    process_queued_cpu_work(cpu);
}
1064
1065 static bool qemu_tcg_should_sleep(CPUState *cpu)
1066 {
1067     if (mttcg_enabled) {
1068         return cpu_thread_is_idle(cpu);
1069     } else {
1070         return all_cpu_threads_idle();
1071     }
1072 }
1073
1074 static void qemu_tcg_wait_io_event(CPUState *cpu)
1075 {
1076     while (qemu_tcg_should_sleep(cpu)) {
1077         stop_tcg_kick_timer();
1078         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1079     }
1080
1081     start_tcg_kick_timer();
1082
1083     qemu_wait_io_event_common(cpu);
1084 }
1085
1086 static void qemu_kvm_wait_io_event(CPUState *cpu)
1087 {
1088     while (cpu_thread_is_idle(cpu)) {
1089         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1090     }
1091
1092     qemu_kvm_eat_signals(cpu);
1093     qemu_wait_io_event_common(cpu);
1094 }
1095
/*
 * Thread function for a KVM vCPU; @arg is the CPUState to run.
 *
 * Registers the thread with RCU, takes the iothread lock, initializes the
 * KVM vCPU and its signal setup, announces creation through
 * qemu_cpu_cond, and then alternates between kvm_cpu_exec() and waiting
 * for I/O events.  The loop ends once the CPU has been marked for unplug
 * and can no longer run; the vCPU is then destroyed and the thread exits.
 *
 * Always returns NULL.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        /* r is treated as a negative errno value. */
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    qemu_kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Unplug path: tear down and wake anyone waiting on cpu->created. */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    return NULL;
}
1137
1138 static void *qemu_dummy_cpu_thread_fn(void *arg)
1139 {
1140 #ifdef _WIN32
1141     fprintf(stderr, "qtest is not supported under Windows\n");
1142     exit(1);
1143 #else
1144     CPUState *cpu = arg;
1145     sigset_t waitset;
1146     int r;
1147
1148     rcu_register_thread();
1149
1150     qemu_mutex_lock_iothread();
1151     qemu_thread_get_self(cpu->thread);
1152     cpu->thread_id = qemu_get_thread_id();
1153     cpu->can_do_io = 1;
1154     current_cpu = cpu;
1155
1156     sigemptyset(&waitset);
1157     sigaddset(&waitset, SIG_IPI);
1158
1159     /* signal CPU creation */
1160     cpu->created = true;
1161     qemu_cond_signal(&qemu_cpu_cond);
1162
1163     while (1) {
1164         qemu_mutex_unlock_iothread();
1165         do {
1166             int sig;
1167             r = sigwait(&waitset, &sig);
1168         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1169         if (r == -1) {
1170             perror("sigwait");
1171             exit(1);
1172         }
1173         qemu_mutex_lock_iothread();
1174         qemu_wait_io_event_common(cpu);
1175     }
1176
1177     return NULL;
1178 #endif
1179 }
1180
1181 static int64_t tcg_get_icount_limit(void)
1182 {
1183     int64_t deadline;
1184
1185     if (replay_mode != REPLAY_MODE_PLAY) {
1186         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1187
1188         /* Maintain prior (possibly buggy) behaviour where if no deadline
1189          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1190          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1191          * nanoseconds.
1192          */
1193         if ((deadline < 0) || (deadline > INT32_MAX)) {
1194             deadline = INT32_MAX;
1195         }
1196
1197         return qemu_icount_round(deadline);
1198     } else {
1199         return replay_get_instructions();
1200     }
1201 }
1202
1203 static void handle_icount_deadline(void)
1204 {
1205     if (use_icount) {
1206         int64_t deadline =
1207             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1208
1209         if (deadline == 0) {
1210             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1211         }
1212     }
1213 }
1214
/*
 * Run one TCG execution slice on @cpu and return cpu_exec()'s result.
 *
 * The iothread lock is released around cpu_exec() and re-taken afterwards.
 * With icount enabled, an instruction budget is programmed before
 * execution and any unexecuted remainder is folded back into
 * timers_state.qemu_icount afterwards.
 */
static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    if (use_icount) {
        int64_t count;
        int decr;
        /* Take back whatever is still left from a previous budget. */
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                    + cpu->icount_extra);
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        count = tcg_get_icount_limit();
        timers_state.qemu_icount += count;
        /* At most 0xffff goes into the 16-bit field; the rest is "extra". */
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        cpu->icount_decr.u16.low = decr;
        cpu->icount_extra = count;
    }
    qemu_mutex_unlock_iothread();
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
    qemu_mutex_lock_iothread();
#ifdef CONFIG_PROFILER
    tcg_time += profile_getclock() - ti;
#endif
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag.  */
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                        + cpu->icount_extra);
        cpu->icount_decr.u32 = 0;
        cpu->icount_extra = 0;
        replay_account_executed_instructions();
    }
    return ret;
}
1258
1259 /* Destroy any remaining vCPUs which have been unplugged and have
1260  * finished running
1261  */
1262 static void deal_with_unplugged_cpus(void)
1263 {
1264     CPUState *cpu;
1265
1266     CPU_FOREACH(cpu) {
1267         if (cpu->unplug && !cpu_can_run(cpu)) {
1268             qemu_tcg_destroy_vcpu(cpu);
1269             cpu->created = false;
1270             qemu_cond_signal(&qemu_cpu_cond);
1271             break;
1272         }
1273     }
1274 }
1275
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 */

/*
 * Thread function shared by all vCPUs in round-robin TCG mode; @arg is
 * the CPUState the thread was created for.
 *
 * Marks every existing vCPU as created, waits until first_cpu is no
 * longer stopped, and then loops forever: it runs vCPUs one after another
 * until one has queued work or an exit request (or a debug/atomic/stop
 * condition breaks the round), then handles the icount deadline, waits
 * for I/O events and reaps unplugged vCPUs.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    /* All vCPUs share this thread, so they all get its thread id. */
    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
        cpu->can_do_io = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Wrap around once the end of the vCPU list has been reached. */
        if (!cpu) {
            cpu = first_cpu;
        }

        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            /* The virtual clock is disabled while SSTEP_NOTIMER is set. */
            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;
                r = tcg_cpu_exec(cpu);
                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    /* The atomic step runs without the iothread lock. */
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* An unplugged vCPU is skipped before leaving the round. */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        handle_icount_deadline();

        /* cpu may be NULL here if the round ran off the end of the list. */
        qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
        deal_with_unplugged_cpus();
    }

    return NULL;
}
1372
1373 static void *qemu_hax_cpu_thread_fn(void *arg)
1374 {
1375     CPUState *cpu = arg;
1376     int r;
1377     qemu_thread_get_self(cpu->thread);
1378     qemu_mutex_lock(&qemu_global_mutex);
1379
1380     cpu->thread_id = qemu_get_thread_id();
1381     cpu->created = true;
1382     cpu->halted = 0;
1383     current_cpu = cpu;
1384
1385     hax_init_vcpu(cpu);
1386     qemu_cond_signal(&qemu_cpu_cond);
1387
1388     while (1) {
1389         if (cpu_can_run(cpu)) {
1390             r = hax_smp_cpu_exec(cpu);
1391             if (r == EXCP_DEBUG) {
1392                 cpu_handle_guest_debug(cpu);
1393             }
1394         }
1395
1396         while (cpu_thread_is_idle(cpu)) {
1397             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1398         }
1399 #ifdef _WIN32
1400         SleepEx(0, TRUE);
1401 #endif
1402         qemu_wait_io_event_common(cpu);
1403     }
1404     return NULL;
1405 }
1406
1407 #ifdef _WIN32
/*
 * APC callback that deliberately does nothing; it is queued by
 * qemu_cpu_kick_thread() on Windows hosts.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1411 #endif
1412
/* Multi-threaded TCG
 *
 * In the multi-threaded case each vCPU has its own thread. The TLS
 * variable current_cpu can be used deep in the code to find the
 * current CPUState for a given thread.
 */

/*
 * Thread function for one vCPU in multi-threaded TCG mode; @arg is the
 * CPUState to run.  Announces creation through qemu_cpu_cond and then
 * loops forever: execute the vCPU if it can run, handle the icount
 * deadline, clear exit_request and wait for I/O events.
 */
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        if (cpu_can_run(cpu)) {
            int r;
            r = tcg_cpu_exec(cpu);
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start-up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                /* The atomic step runs without the iothread lock. */
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
                /* fall through */
            default:
                /* Ignore everything else? */
                break;
            }
        }

        handle_icount_deadline();

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_tcg_wait_io_event(cpu);
    }

    return NULL;
}
1474
1475 static void qemu_cpu_kick_thread(CPUState *cpu)
1476 {
1477 #ifndef _WIN32
1478     int err;
1479
1480     if (cpu->thread_kicked) {
1481         return;
1482     }
1483     cpu->thread_kicked = true;
1484     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1485     if (err) {
1486         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1487         exit(1);
1488     }
1489 #else /* _WIN32 */
1490     if (!qemu_cpu_is_self(cpu)) {
1491         if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1492             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1493                     __func__, GetLastError());
1494             exit(1);
1495         }
1496     }
1497 #endif
1498 }
1499
1500 void qemu_cpu_kick(CPUState *cpu)
1501 {
1502     qemu_cond_broadcast(cpu->halt_cond);
1503     if (tcg_enabled()) {
1504         cpu_exit(cpu);
1505         /* NOP unless doing single-thread RR */
1506         qemu_cpu_kick_rr_cpu();
1507     } else {
1508         if (hax_enabled()) {
1509             /*
1510              * FIXME: race condition with the exit_request check in
1511              * hax_vcpu_hax_exec
1512              */
1513             cpu->exit_request = 1;
1514         }
1515         qemu_cpu_kick_thread(cpu);
1516     }
1517 }
1518
/*
 * Kick the host thread of the current vCPU.  Must be called with
 * current_cpu set (asserted).
 */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1524
/* Return true if the calling thread is the thread recorded for @cpu. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1529
1530 bool qemu_in_vcpu_thread(void)
1531 {
1532     return current_cpu && qemu_cpu_is_self(current_cpu);
1533 }
1534
/*
 * Per-thread flag maintained by qemu_mutex_lock_iothread() and
 * qemu_mutex_unlock_iothread().
 */
static __thread bool iothread_locked = false;

/*
 * Return true if the calling thread took the global mutex through
 * qemu_mutex_lock_iothread() and has not yet released it.
 */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1541
/*
 * Acquire the global mutex and record that in the calling thread's
 * iothread_locked flag.  Asserts that this thread does not already hold
 * it, so the lock is not recursive.
 */
void qemu_mutex_lock_iothread(void)
{
    g_assert(!qemu_mutex_iothread_locked());
    qemu_mutex_lock(&qemu_global_mutex);
    iothread_locked = true;
}
1548
/*
 * Release the global mutex.  Asserts that the calling thread took it via
 * qemu_mutex_lock_iothread(); the flag is cleared before unlocking.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1555
1556 static bool all_vcpus_paused(void)
1557 {
1558     CPUState *cpu;
1559
1560     CPU_FOREACH(cpu) {
1561         if (!cpu->stopped) {
1562             return false;
1563         }
1564     }
1565
1566     return true;
1567 }
1568
/*
 * Request every vCPU to stop and wait until all of them report stopped.
 *
 * The virtual clock is disabled first.  When called from a vCPU thread,
 * the calling vCPU is marked stopped directly via cpu_stop_current().
 * NOTE(review): the wait uses qemu_global_mutex, so the caller presumably
 * holds it -- confirm against callers.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        cpu->stop = true;
        qemu_cpu_kick(cpu);
    }

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        /* Kick everyone again after each wake-up. */
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }
}
1590
/*
 * Clear both the stop request and the stopped state of @cpu, then kick it
 * so that it notices the change.
 */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1597
/* Re-enable the virtual clock and resume every vCPU. */
void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}
1607
/*
 * Start removing @cpu: request a stop, mark it for unplug and kick it.
 * Does not wait for the removal to complete; see cpu_remove_sync().
 */
void cpu_remove(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
}
1614
/*
 * Remove @cpu and block on qemu_cpu_cond until cpu->created has been
 * cleared.
 * NOTE(review): the wait uses qemu_global_mutex, so the caller presumably
 * holds it -- confirm against callers.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu_remove(cpu);
    while (cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}
1622
/* Size in bytes of the temporary buffers used to format vCPU thread names */
#define VCPU_THREAD_NAME_SIZE 16
1625
/*
 * Create or attach the host thread for a TCG vCPU.
 *
 * With MTTCG every vCPU gets its own thread.  Otherwise the first call
 * creates one round-robin thread, and later calls attach their vCPU to
 * that thread and its halt condition, both remembered in function-local
 * statics.  When a thread is created, the function waits on
 * qemu_cpu_cond until the new thread has set cpu->created.
 */
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *single_tcg_halt_cond;
    static QemuThread *single_tcg_cpu_thread;

    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);

        if (qemu_tcg_mttcg_enabled()) {
            /* create a thread per vCPU with TCG (MTTCG) */
            parallel_cpus = true;
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);

            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

        } else {
            /* share a single thread for all cpus with TCG */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
            qemu_thread_create(cpu->thread, thread_name,
                               qemu_tcg_rr_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

            /* Remember the shared thread for the vCPUs that follow. */
            single_tcg_halt_cond = cpu->halt_cond;
            single_tcg_cpu_thread = cpu->thread;
        }
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        /* Wait for the new thread to report the vCPU as created. */
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
    } else {
        /* For non-MTTCG cases we share the thread */
        cpu->thread = single_tcg_cpu_thread;
        cpu->halt_cond = single_tcg_halt_cond;
    }
}
1668
1669 static void qemu_hax_start_vcpu(CPUState *cpu)
1670 {
1671     char thread_name[VCPU_THREAD_NAME_SIZE];
1672
1673     cpu->thread = g_malloc0(sizeof(QemuThread));
1674     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1675     qemu_cond_init(cpu->halt_cond);
1676
1677     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1678              cpu->cpu_index);
1679     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1680                        cpu, QEMU_THREAD_JOINABLE);
1681 #ifdef _WIN32
1682     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1683 #endif
1684     while (!cpu->created) {
1685         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1686     }
1687 }
1688
1689 static void qemu_kvm_start_vcpu(CPUState *cpu)
1690 {
1691     char thread_name[VCPU_THREAD_NAME_SIZE];
1692
1693     cpu->thread = g_malloc0(sizeof(QemuThread));
1694     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1695     qemu_cond_init(cpu->halt_cond);
1696     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1697              cpu->cpu_index);
1698     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1699                        cpu, QEMU_THREAD_JOINABLE);
1700     while (!cpu->created) {
1701         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1702     }
1703 }
1704
1705 static void qemu_dummy_start_vcpu(CPUState *cpu)
1706 {
1707     char thread_name[VCPU_THREAD_NAME_SIZE];
1708
1709     cpu->thread = g_malloc0(sizeof(QemuThread));
1710     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1711     qemu_cond_init(cpu->halt_cond);
1712     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1713              cpu->cpu_index);
1714     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1715                        QEMU_THREAD_JOINABLE);
1716     while (!cpu->created) {
1717         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1718     }
1719 }
1720
1721 void qemu_init_vcpu(CPUState *cpu)
1722 {
1723     cpu->nr_cores = smp_cores;
1724     cpu->nr_threads = smp_threads;
1725     cpu->stopped = true;
1726
1727     if (!cpu->as) {
1728         /* If the target cpu hasn't set up any address spaces itself,
1729          * give it the default one.
1730          */
1731         AddressSpace *as = address_space_init_shareable(cpu->memory,
1732                                                         "cpu-memory");
1733         cpu->num_ases = 1;
1734         cpu_address_space_init(cpu, as, 0);
1735     }
1736
1737     if (kvm_enabled()) {
1738         qemu_kvm_start_vcpu(cpu);
1739     } else if (hax_enabled()) {
1740         qemu_hax_start_vcpu(cpu);
1741     } else if (tcg_enabled()) {
1742         qemu_tcg_init_vcpu(cpu);
1743     } else {
1744         qemu_dummy_start_vcpu(cpu);
1745     }
1746 }
1747
1748 void cpu_stop_current(void)
1749 {
1750     if (current_cpu) {
1751         current_cpu->stop = false;
1752         current_cpu->stopped = true;
1753         cpu_exit(current_cpu);
1754         qemu_cond_broadcast(&qemu_pause_cond);
1755     }
1756 }
1757
1758 int vm_stop(RunState state)
1759 {
1760     if (qemu_in_vcpu_thread()) {
1761         qemu_system_vmstop_request_prepare();
1762         qemu_system_vmstop_request(state);
1763         /*
1764          * FIXME: should not return to device code in case
1765          * vm_stop() has been requested.
1766          */
1767         cpu_stop_current();
1768         return 0;
1769     }
1770
1771     return do_vm_stop(state);
1772 }
1773
/**
 * Prepare for (re)starting the VM.
 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
 * running or in case of an error condition), 0 otherwise.
 */
int vm_prepare_start(void)
{
    RunState requested;
    int res = 0;

    qemu_vmstop_requested(&requested);
    /* RUN_STATE__MAX is treated here as "no vmstop request pending". */
    if (runstate_is_running() && requested == RUN_STATE__MAX) {
        return -1;
    }

    /* Ensure that a STOP/RESUME pair of events is emitted if a
     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
     * example, according to documentation is always followed by
     * the STOP event.
     */
    if (runstate_is_running()) {
        qapi_event_send_stop(&error_abort);
        res = -1;
    } else {
        replay_enable_events();
        cpu_enable_ticks();
        runstate_set(RUN_STATE_RUNNING);
        vm_state_notify(1, RUN_STATE_RUNNING);
    }

    /* We are sending this now, but the CPUs will be resumed shortly later */
    qapi_event_send_resume(&error_abort);
    return res;
}
1808
/*
 * Start (or restart) the VM: resume all vCPUs, unless vm_prepare_start()
 * reports that they must not be restarted.
 */
void vm_start(void)
{
    int rc = vm_prepare_start();

    if (rc) {
        return;
    }
    resume_all_vcpus();
}
1815
1816 /* does a state transition even if the VM is already stopped,
1817    current state is forgotten forever */
1818 int vm_stop_force_state(RunState state)
1819 {
1820     if (runstate_is_running()) {
1821         return vm_stop(state);
1822     } else {
1823         runstate_set(state);
1824
1825         bdrv_drain_all();
1826         /* Make sure to return an error if the flush in a previous vm_stop()
1827          * failed. */
1828         return bdrv_flush_all();
1829     }
1830 }
1831
/*
 * Print the list of CPU models to @f using @cpu_fprintf, when the target
 * provides a cpu_list macro; otherwise do nothing.  @optarg is unused.
 */
void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}
1839
1840 CpuInfoList *qmp_query_cpus(Error **errp)
1841 {
1842     CpuInfoList *head = NULL, *cur_item = NULL;
1843     CPUState *cpu;
1844
1845     CPU_FOREACH(cpu) {
1846         CpuInfoList *info;
1847 #if defined(TARGET_I386)
1848         X86CPU *x86_cpu = X86_CPU(cpu);
1849         CPUX86State *env = &x86_cpu->env;
1850 #elif defined(TARGET_PPC)
1851         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1852         CPUPPCState *env = &ppc_cpu->env;
1853 #elif defined(TARGET_SPARC)
1854         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1855         CPUSPARCState *env = &sparc_cpu->env;
1856 #elif defined(TARGET_MIPS)
1857         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1858         CPUMIPSState *env = &mips_cpu->env;
1859 #elif defined(TARGET_TRICORE)
1860         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1861         CPUTriCoreState *env = &tricore_cpu->env;
1862 #endif
1863
1864         cpu_synchronize_state(cpu);
1865
1866         info = g_malloc0(sizeof(*info));
1867         info->value = g_malloc0(sizeof(*info->value));
1868         info->value->CPU = cpu->cpu_index;
1869         info->value->current = (cpu == first_cpu);
1870         info->value->halted = cpu->halted;
1871         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1872         info->value->thread_id = cpu->thread_id;
1873 #if defined(TARGET_I386)
1874         info->value->arch = CPU_INFO_ARCH_X86;
1875         info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1876 #elif defined(TARGET_PPC)
1877         info->value->arch = CPU_INFO_ARCH_PPC;
1878         info->value->u.ppc.nip = env->nip;
1879 #elif defined(TARGET_SPARC)
1880         info->value->arch = CPU_INFO_ARCH_SPARC;
1881         info->value->u.q_sparc.pc = env->pc;
1882         info->value->u.q_sparc.npc = env->npc;
1883 #elif defined(TARGET_MIPS)
1884         info->value->arch = CPU_INFO_ARCH_MIPS;
1885         info->value->u.q_mips.PC = env->active_tc.PC;
1886 #elif defined(TARGET_TRICORE)
1887         info->value->arch = CPU_INFO_ARCH_TRICORE;
1888         info->value->u.tricore.PC = env->PC;
1889 #else
1890         info->value->arch = CPU_INFO_ARCH_OTHER;
1891 #endif
1892
1893         /* XXX: waiting for the qapi to support GSList */
1894         if (!cur_item) {
1895             head = cur_item = info;
1896         } else {
1897             cur_item->next = info;
1898             cur_item = info;
1899         }
1900     }
1901
1902     return head;
1903 }
1904
1905 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1906                  bool has_cpu, int64_t cpu_index, Error **errp)
1907 {
1908     FILE *f;
1909     uint32_t l;
1910     CPUState *cpu;
1911     uint8_t buf[1024];
1912     int64_t orig_addr = addr, orig_size = size;
1913
1914     if (!has_cpu) {
1915         cpu_index = 0;
1916     }
1917
1918     cpu = qemu_get_cpu(cpu_index);
1919     if (cpu == NULL) {
1920         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1921                    "a CPU number");
1922         return;
1923     }
1924
1925     f = fopen(filename, "wb");
1926     if (!f) {
1927         error_setg_file_open(errp, errno, filename);
1928         return;
1929     }
1930
1931     while (size != 0) {
1932         l = sizeof(buf);
1933         if (l > size)
1934             l = size;
1935         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1936             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1937                              " specified", orig_addr, orig_size);
1938             goto exit;
1939         }
1940         if (fwrite(buf, 1, l, f) != l) {
1941             error_setg(errp, QERR_IO_ERROR);
1942             goto exit;
1943         }
1944         addr += l;
1945         size -= l;
1946     }
1947
1948 exit:
1949     fclose(f);
1950 }
1951
1952 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1953                   Error **errp)
1954 {
1955     FILE *f;
1956     uint32_t l;
1957     uint8_t buf[1024];
1958
1959     f = fopen(filename, "wb");
1960     if (!f) {
1961         error_setg_file_open(errp, errno, filename);
1962         return;
1963     }
1964
1965     while (size != 0) {
1966         l = sizeof(buf);
1967         if (l > size)
1968             l = size;
1969         cpu_physical_memory_read(addr, buf, l);
1970         if (fwrite(buf, 1, l, f) != l) {
1971             error_setg(errp, QERR_IO_ERROR);
1972             goto exit;
1973         }
1974         addr += l;
1975         size -= l;
1976     }
1977
1978 exit:
1979     fclose(f);
1980 }
1981
/*
 * QMP handler: forward an NMI request for the monitor's current CPU index
 * to nmi_monitor_handle(); errors are reported through @errp.
 */
void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}
1986
1987 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1988 {
1989     if (!use_icount) {
1990         return;
1991     }
1992
1993     cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
1994                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
1995     if (icount_align_option) {
1996         cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
1997         cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
1998     } else {
1999         cpu_fprintf(f, "Max guest delay     NA\n");
2000         cpu_fprintf(f, "Max guest advance   NA\n");
2001     }
2002 }