/*
 * cpus.c (QEMU) - as of commit "tcg: add kick timer for single-threaded
 * vCPU emulation"
 */

/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Needed early for CONFIG_BSD etc. */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qemu/config-file.h"
#include "cpu.h"
#include "monitor/monitor.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "exec/gdbstub.h"
#include "sysemu/dma.h"
#include "sysemu/hw_accel.h"
#include "sysemu/kvm.h"
#include "sysemu/hax.h"
#include "qmp-commands.h"
#include "exec/exec-all.h"

#include "qemu/thread.h"
#include "sysemu/cpus.h"
#include "sysemu/qtest.h"
#include "qemu/main-loop.h"
#include "qemu/bitmap.h"
#include "qemu/seqlock.h"
#include "tcg.h"
#include "qapi-event.h"
#include "hw/nmi.h"
#include "sysemu/replay.h"

#ifndef _WIN32
#include "qemu/compatfd.h"
#endif

#ifdef CONFIG_LINUX

#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */

int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
static QEMUTimer *throttle_timer;
static unsigned int throttle_percentage;

#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
#define CPU_THROTTLE_TIMESLICE_NS 10000000

bool cpu_is_stopped(CPUState *cpu)
{
    return cpu->stopped || !runstate_is_running();
}

static bool cpu_thread_is_idle(CPUState *cpu)
{
    if (cpu->stop || cpu->queued_work_first) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return true;
    }
    if (!cpu->halted || cpu_has_work(cpu) ||
        kvm_halt_in_kernel()) {
        return false;
    }
    return true;
}

static bool all_cpu_threads_idle(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu_thread_is_idle(cpu)) {
            return false;
        }
    }
    return true;
}

/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

static bool icount_sleep = true;
static int64_t vm_clock_warp_start = -1;
/* Conversion factor from emulated instructions to virtual clock ticks. */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
#define MAX_ICOUNT_SHIFT 10

static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;

typedef struct TimersState {
    /* Protected by BQL. */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read out of BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;

    /* Compensate for varying guest execution speed. */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
} TimersState;

static TimersState timers_state;
bool mttcg_enabled;

/*
 * We default to false if we know other options have been enabled
 * which are currently incompatible with MTTCG. Otherwise, once a
 * guest (target) has been updated to support:
 *   - atomic instructions
 *   - memory ordering primitives (barriers)
 * it can set the appropriate CONFIG flags in ${target}-softmmu.mak.
 *
 * Once a guest architecture has been converted to the new primitives
 * there are two remaining limitations to check:
 *
 * - The guest can't be oversized (e.g. a 64-bit guest on a 32-bit host)
 * - The host must have at least as strong a memory order as the guest
 *
 * It may be possible in future to support strong guests on weak hosts
 * but that will require tagging all loads/stores in a guest with their
 * implicit memory order requirements, which would likely slow things
 * down a lot.
 */

static bool check_tcg_memory_orders_compatible(void)
{
#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
    return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
#else
    return false;
#endif
}

static bool default_mttcg_enabled(void)
{
    QemuOpts *icount_opts = qemu_find_opts_singleton("icount");
    const char *rr = qemu_opt_get(icount_opts, "rr");

    if (rr || TCG_OVERSIZED_GUEST) {
        return false;
    } else {
#ifdef TARGET_SUPPORTS_MTTCG
        return check_tcg_memory_orders_compatible();
#else
        return false;
#endif
    }
}

void qemu_tcg_configure(QemuOpts *opts, Error **errp)
{
    const char *t = qemu_opt_get(opts, "thread");
    if (t) {
        if (strcmp(t, "multi") == 0) {
            if (TCG_OVERSIZED_GUEST) {
                error_setg(errp, "No MTTCG when guest word size > host's");
            } else {
                if (!check_tcg_memory_orders_compatible()) {
                    error_report("Guest expects a stronger memory ordering "
                                 "than the host provides");
                    error_printf("This may cause strange/hard to debug errors");
                }
                mttcg_enabled = true;
            }
        } else if (strcmp(t, "single") == 0) {
            mttcg_enabled = false;
        } else {
            error_setg(errp, "Invalid 'thread' setting %s", t);
        }
    } else {
        mttcg_enabled = default_mttcg_enabled();
    }
}
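
/* Usage sketch: in QEMU releases of this era the "tcg" option group above
 * is typically reached from the command line as
 *     -accel tcg,thread=single|multi
 * The exact spelling is wired up in vl.c, so treat it as an assumption here.
 */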

int64_t cpu_get_icount_raw(void)
{
    int64_t icount;
    CPUState *cpu = current_cpu;

    icount = timers_state.qemu_icount;
    if (cpu) {
        if (!cpu->can_do_io) {
            fprintf(stderr, "Bad icount read\n");
            exit(1);
        }
        icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
    }
    return icount;
}

/* Return the virtual CPU time, based on the instruction counter. */
static int64_t cpu_get_icount_locked(void)
{
    int64_t icount = cpu_get_icount_raw();
    return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
}

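/* Lock-free reader: take a snapshot under the vm_clock_seqlock and retry
 * if a writer raced with the read.  This is the usual seqlock read
 * pattern; the writers in this file run under the BQL.
 */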
int64_t cpu_get_icount(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}

int64_t cpu_icount_to_ns(int64_t icount)
{
    return icount << icount_time_shift;
}
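
/* Worked example: with icount_time_shift == 3, each guest instruction
 * accounts for 2^3 = 8 ns of QEMU_CLOCK_VIRTUAL, i.e. 125 MIPS; at
 * MAX_ICOUNT_SHIFT (10) an instruction costs 1024 ns, roughly the
 * 1 MIPS minimum mentioned above.
 */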

/* return the time elapsed in VM between vm_start and vm_stop.  Unless
 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 * counter.
 *
 * Caller must hold the BQL
 */
int64_t cpu_get_ticks(void)
{
    int64_t ticks;

    if (use_icount) {
        return cpu_get_icount();
    }

    ticks = timers_state.cpu_ticks_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += cpu_get_host_ticks();
    }

    if (timers_state.cpu_ticks_prev > ticks) {
        /* Note: non-increasing ticks may happen if the host uses
           software suspend. */
        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        ticks = timers_state.cpu_ticks_prev;
    }

    timers_state.cpu_ticks_prev = ticks;
    return ticks;
}

static int64_t cpu_get_clock_locked(void)
{
    int64_t time;

    time = timers_state.cpu_clock_offset;
    if (timers_state.cpu_ticks_enabled) {
        time += get_clock();
    }

    return time;
}

/* Return the monotonic time elapsed in VM, i.e.,
 * the time between vm_start and vm_stop
 */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = cpu_get_clock_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return ti;
}

/* enable cpu_get_ticks()
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* disable cpu_get_ticks(): the clock is stopped.  You must not call
 * cpu_get_ticks() after that.
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop. */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)

static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex. */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing. */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down. */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up. */
        icount_time_shift++;
    }
    last_delta = delta;
    timers_state.qemu_icount_bias = cur_icount
                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

static void icount_adjust_rt(void *opaque)
{
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_adjust();
}

static void icount_adjust_vm(void *opaque)
{
    timer_mod(icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              NANOSECONDS_PER_SECOND / 10);
    icount_adjust();
}

static int64_t qemu_icount_round(int64_t count)
{
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
}
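
/* Worked example: with icount_time_shift == 3, a 1000 ns deadline rounds
 * up to (1000 + 7) >> 3 = 125 instructions of execution budget.
 */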

static void icount_warp_rt(void)
{
    unsigned seq;
    int64_t warp_start;

    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    do {
        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        warp_start = vm_clock_warp_start;
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));

    if (warp_start == -1) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
                                     cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        timers_state.qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;
    seqlock_write_end(&timers_state.vm_clock_seqlock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}

void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    AioContext *aio_context;
    assert(qtest_enabled());
    aio_context = qemu_get_aio_context();
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);

        seqlock_write_begin(&timers_state.vm_clock_seqlock);
        timers_state.qemu_icount_bias += warp;
        seqlock_write_end(&timers_state.vm_clock_seqlock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}

void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
        return;
    }

    if (!all_cpu_threads_idle()) {
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount. */
        return;
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        static bool notified;
        if (!icount_sleep && !notified) {
            error_report("WARNING: icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * It is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            timers_state.qemu_icount_bias += deadline;
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time (related to the time left until the next event) has
             * passed.  The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This keeps the warps from being visible externally; for example,
             * you will not be sending network packets continuously instead of
             * every 100ms.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
                vm_clock_warp_start = clock;
            }
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            timer_mod_anticipate(icount_warp_timer, clock + deadline);
        }
    } else if (deadline == 0) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void qemu_account_warp_timer(void)
{
    if (!use_icount || !icount_sleep) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
        return;
    }

    timer_del(icount_warp_timer);
    icount_warp_rt();
}

static bool icount_state_needed(void *opaque)
{
    return use_icount;
}

/*
 * This is a subsection for icount migration.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};

static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
{
    double pct;
    double throttle_ratio;
    long sleeptime_ns;

    if (!cpu_throttle_get_percentage()) {
        return;
    }

    pct = (double)cpu_throttle_get_percentage() / 100;
    throttle_ratio = pct / (1 - pct);
    sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);

    qemu_mutex_unlock_iothread();
    atomic_set(&cpu->throttle_thread_scheduled, 0);
    g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
    qemu_mutex_lock_iothread();
}
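
/* Worked example of the throttle math: at 75% throttle, pct = 0.75 and
 * throttle_ratio = 0.75 / 0.25 = 3, so the vCPU sleeps 3 * 10 ms = 30 ms
 * per timeslice.  The tick below then re-arms itself every
 * CPU_THROTTLE_TIMESLICE_NS / (1 - pct) = 40 ms, i.e. a 25% duty cycle.
 */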

static void cpu_throttle_timer_tick(void *opaque)
{
    CPUState *cpu;
    double pct;

    /* Stop the timer if needed */
    if (!cpu_throttle_get_percentage()) {
        return;
    }
    CPU_FOREACH(cpu) {
        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
            async_run_on_cpu(cpu, cpu_throttle_thread,
                             RUN_ON_CPU_NULL);
        }
    }

    pct = (double)cpu_throttle_get_percentage() / 100;
    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                              CPU_THROTTLE_TIMESLICE_NS / (1 - pct));
}

void cpu_throttle_set(int new_throttle_pct)
{
    /* Ensure throttle percentage is within valid range */
    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);

    atomic_set(&throttle_percentage, new_throttle_pct);

    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                              CPU_THROTTLE_TIMESLICE_NS);
}

void cpu_throttle_stop(void)
{
    atomic_set(&throttle_percentage, 0);
}

bool cpu_throttle_active(void)
{
    return (cpu_throttle_get_percentage() != 0);
}

int cpu_throttle_get_percentage(void)
{
    return atomic_read(&throttle_percentage);
}

void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                  cpu_throttle_timer_tick, NULL);
}

void configure_icount(QemuOpts *opts, Error **errp)
{
    const char *option;
    char *rem_str = NULL;

    option = qemu_opt_get(opts, "shift");
    if (!option) {
        if (qemu_opt_get(opts, "align") != NULL) {
            error_setg(errp, "Please specify shift option when using align");
        }
        return;
    }

    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                         icount_timer_cb, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);

    if (icount_align_option && !icount_sleep) {
        error_setg(errp, "align=on and sleep=off are incompatible");
    }
    if (strcmp(option, "auto") != 0) {
        errno = 0;
        icount_time_shift = strtol(option, &rem_str, 0);
        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
            error_setg(errp, "icount: Invalid shift value");
        }
        use_icount = 1;
        return;
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
    } else if (!icount_sleep) {
        error_setg(errp, "shift=auto and sleep=off are incompatible");
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway. */
    icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers. */
    icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
                                   icount_adjust_rt, NULL);
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                   icount_adjust_vm, NULL);
    timer_mod(icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              NANOSECONDS_PER_SECOND / 10);
}
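
/* Usage sketch (options as parsed above; the command-line spelling is an
 * assumption, see vl.c for the actual wiring):
 *     -icount shift=7            fixed rate: 2^7 ns of virtual time per insn
 *     -icount shift=auto         adaptive rate, tuned by the two timers above
 *     -icount shift=7,sleep=off  never idle; the virtual clock jumps straight
 *                                to the next deadline
 * align=on requires a fixed shift and sleep=on, as the checks above enforce.
 */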

/***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single-threaded vCPU
 * emulation on to the next vCPU.  If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed while all vCPUs are idle and restarted once
 * any vCPU becomes active again.
 */

static QEMUTimer *tcg_kick_vcpu_timer;

static void qemu_cpu_kick_no_halt(void);

#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)

static inline int64_t qemu_tcg_next_kick(void)
{
    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
}

static void kick_tcg_thread(void *opaque)
{
    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    qemu_cpu_kick_no_halt();
}

static void start_tcg_kick_timer(void)
{
    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                           kick_tcg_thread, NULL);
        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    }
}

static void stop_tcg_kick_timer(void)
{
    if (tcg_kick_vcpu_timer) {
        timer_del(tcg_kick_vcpu_timer);
        tcg_kick_vcpu_timer = NULL;
    }
}
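
/* Lifecycle sketch: the timer is only created when there is more than one
 * vCPU (the CPU_NEXT(first_cpu) check above).  qemu_tcg_wait_io_event()
 * stops it while all vCPUs are idle and re-arms it when work resumes; the
 * TCG thread also arms it once the machine starts.  Each firing re-arms
 * the timer TCG_KICK_PERIOD ns ahead and kicks the currently running vCPU.
 */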


/***********************************************************/
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    CPU_FOREACH(cpu) {
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}

void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_state(cpu);
    }
}

void cpu_synchronize_all_post_reset(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_reset(cpu);
    }
}

void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_init(cpu);
    }
}

static int do_vm_stop(RunState state)
{
    int ret = 0;

    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        qapi_event_send_stop(&error_abort);
    }

    bdrv_drain_all();
    replay_disable_events();
    ret = bdrv_flush_all();

    return ret;
}

static bool cpu_can_run(CPUState *cpu)
{
    if (cpu->stop) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return false;
    }
    return true;
}

static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}

#ifdef CONFIG_LINUX
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}

static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code,
                      (void *)(intptr_t)siginfo->ssi_addr)) {
        sigbus_reraise();
    }
}

static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
                sigbus_reraise();
            }
            break;
        default:
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
}

#else /* !CONFIG_LINUX */

static void qemu_init_sigbus(void)
{
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
}
#endif /* !CONFIG_LINUX */

#ifndef _WIN32
static void dummy_signal(int sig)
{
}

static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);
    r = kvm_set_signal_mask(cpu, &set);
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

#else /* _WIN32 */
static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    abort();
}
#endif /* _WIN32 */

static QemuMutex qemu_global_mutex;
static QemuCond qemu_io_proceeded_cond;
static unsigned iothread_requesting_mutex;

static QemuThread io_thread;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;

void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}

void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}

static void qemu_kvm_destroy_vcpu(CPUState *cpu)
{
    if (kvm_destroy_vcpu(cpu) < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}

static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}

static void qemu_wait_io_event_common(CPUState *cpu)
{
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        qemu_cond_broadcast(&qemu_pause_cond);
    }
    process_queued_cpu_work(cpu);
    cpu->thread_kicked = false;
}

static void qemu_tcg_wait_io_event(CPUState *cpu)
{
    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    while (iothread_requesting_mutex) {
        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
    }

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}

static void qemu_kvm_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    qemu_kvm_eat_signals(cpu);
    qemu_wait_io_event_common(cpu);
}

static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    qemu_kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    return NULL;
}

static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    fprintf(stderr, "qtest is not supported under Windows\n");
    exit(1);
#else
    CPUState *cpu = arg;
    sigset_t waitset;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    current_cpu = cpu;
    while (1) {
        current_cpu = NULL;
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        current_cpu = cpu;
        qemu_wait_io_event_common(cpu);
    }

    return NULL;
#endif
}

static int64_t tcg_get_icount_limit(void)
{
    int64_t deadline;

    if (replay_mode != REPLAY_MODE_PLAY) {
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        /* Maintain prior (possibly buggy) behaviour where if no deadline
         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
         * nanoseconds.
         */
        if ((deadline < 0) || (deadline > INT32_MAX)) {
            deadline = INT32_MAX;
        }

        return qemu_icount_round(deadline);
    } else {
        return replay_get_instructions();
    }
}

static void handle_icount_deadline(void)
{
    if (use_icount) {
        int64_t deadline =
            qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        if (deadline == 0) {
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        }
    }
}

static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    if (use_icount) {
        int64_t count;
        int decr;
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                     + cpu->icount_extra);
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        count = tcg_get_icount_limit();
        timers_state.qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        cpu->icount_decr.u16.low = decr;
        cpu->icount_extra = count;
    }
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    tcg_time += profile_getclock() - ti;
#endif
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag. */
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                     + cpu->icount_extra);
        cpu->icount_decr.u32 = 0;
        cpu->icount_extra = 0;
        replay_account_executed_instructions();
    }
    return ret;
}
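
/* Worked example of the budget split above: tcg_get_icount_limit() might
 * return count = 100000; the 16-bit decrementer gets 0xffff (65535) and
 * icount_extra keeps the remaining 34465, handed out as the translated
 * code exhausts the decrementer.
 */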

/* Destroy any remaining vCPUs which have been unplugged and have
 * finished running
 */
static void deal_with_unplugged_cpus(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (cpu->unplug && !cpu_can_run(cpu)) {
            qemu_tcg_destroy_vcpu(cpu);
            cpu->created = false;
            qemu_cond_signal(&qemu_cpu_cond);
            break;
        }
    }
}

/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn.  If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 */

static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
        cpu->can_do_io = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    /* process any pending work */
    atomic_mb_set(&exit_request, 1);

    cpu = first_cpu;

    while (1) {
        /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
        qemu_account_warp_timer();

        if (!cpu) {
            cpu = first_cpu;
        }

        for (; cpu != NULL && !exit_request; cpu = CPU_NEXT(cpu)) {

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;
                r = tcg_cpu_exec(cpu);
                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                }
            } else if (cpu->stop || cpu->stopped) {
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

        } /* for cpu.. */

        /* Pairs with smp_wmb in qemu_cpu_kick. */
        atomic_mb_set(&exit_request, 0);

        handle_icount_deadline();

        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
        deal_with_unplugged_cpus();
    }

    return NULL;
}

static void *qemu_hax_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;
    qemu_thread_get_self(cpu->thread);
    qemu_mutex_lock(&qemu_global_mutex);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->halted = 0;
    current_cpu = cpu;

    hax_init_vcpu(cpu);
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        if (cpu_can_run(cpu)) {
            r = hax_smp_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }

        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
#ifdef _WIN32
        SleepEx(0, TRUE);
#endif
        qemu_wait_io_event_common(cpu);
    }
    return NULL;
}

#ifdef _WIN32
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
#endif

static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    if (cpu->thread_kicked) {
        return;
    }
    cpu->thread_kicked = true;
    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    if (!qemu_cpu_is_self(cpu)) {
        if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
            fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
                    __func__, GetLastError());
            exit(1);
        }
    }
#endif
}

static void qemu_cpu_kick_no_halt(void)
{
    CPUState *cpu;
    /* Ensure whatever caused the exit has reached the CPU threads before
     * writing exit_request.
     */
    atomic_mb_set(&exit_request, 1);
    cpu = atomic_mb_read(&tcg_current_cpu);
    if (cpu) {
        cpu_exit(cpu);
    }
}

void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (tcg_enabled()) {
        qemu_cpu_kick_no_halt();
    } else {
        if (hax_enabled()) {
            /*
             * FIXME: race condition with the exit_request check in
             * hax_vcpu_hax_exec
             */
            cpu->exit_request = 1;
        }
        qemu_cpu_kick_thread(cpu);
    }
}

void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}

bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}

bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}

static __thread bool iothread_locked = false;

bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}

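/* BQL acquisition: the fast path simply takes the mutex.  On the slow
 * path (a TCG vCPU thread may hold the lock while executing translated
 * code) we try the lock first and, on contention, kick the vCPU out of
 * its execution loop so the lock is released promptly;
 * iothread_requesting_mutex plus qemu_io_proceeded_cond stop the vCPU
 * thread from immediately re-acquiring it.
 */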
void qemu_mutex_lock_iothread(void)
{
    atomic_inc(&iothread_requesting_mutex);
    /* In the simple case there is no need to bump the VCPU thread out of
     * TCG code execution.
     */
    if (!tcg_enabled() || qemu_in_vcpu_thread() ||
        !first_cpu || !first_cpu->created) {
        qemu_mutex_lock(&qemu_global_mutex);
        atomic_dec(&iothread_requesting_mutex);
    } else {
        if (qemu_mutex_trylock(&qemu_global_mutex)) {
            qemu_cpu_kick_no_halt();
            qemu_mutex_lock(&qemu_global_mutex);
        }
        atomic_dec(&iothread_requesting_mutex);
        qemu_cond_broadcast(&qemu_io_proceeded_cond);
    }
    iothread_locked = true;
}

void qemu_mutex_unlock_iothread(void)
{
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}

static bool all_vcpus_paused(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu->stopped) {
            return false;
        }
    }

    return true;
}

void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        cpu->stop = true;
        qemu_cpu_kick(cpu);
    }

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
        if (!kvm_enabled()) {
            CPU_FOREACH(cpu) {
                cpu->stop = false;
                cpu->stopped = true;
            }
            return;
        }
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }
}

void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}

void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}

void cpu_remove(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
}

void cpu_remove_sync(CPUState *cpu)
{
    cpu_remove(cpu);
    while (cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

/* For temporary buffers for forming a name */
#define VCPU_THREAD_NAME_SIZE 16

static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *tcg_halt_cond;
    static QemuThread *tcg_cpu_thread;

    /* share a single thread for all cpus with TCG */
    if (!tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);
        tcg_halt_cond = cpu->halt_cond;
        snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);
        qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                           cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
        tcg_cpu_thread = cpu->thread;
    } else {
        cpu->thread = tcg_cpu_thread;
        cpu->halt_cond = tcg_halt_cond;
    }
}

static void qemu_hax_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);

    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
    cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

static void qemu_kvm_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

static void qemu_dummy_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
                       QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

void qemu_init_vcpu(CPUState *cpu)
{
    cpu->nr_cores = smp_cores;
    cpu->nr_threads = smp_threads;
    cpu->stopped = true;

    if (!cpu->as) {
        /* If the target cpu hasn't set up any address spaces itself,
         * give it the default one.
         */
        AddressSpace *as = address_space_init_shareable(cpu->memory,
                                                        "cpu-memory");
        cpu->num_ases = 1;
        cpu_address_space_init(cpu, as, 0);
    }

    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
    } else if (hax_enabled()) {
        qemu_hax_start_vcpu(cpu);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }
}

void cpu_stop_current(void)
{
    if (current_cpu) {
        current_cpu->stop = false;
        current_cpu->stopped = true;
        cpu_exit(current_cpu);
        qemu_cond_broadcast(&qemu_pause_cond);
    }
}

int vm_stop(RunState state)
{
    if (qemu_in_vcpu_thread()) {
        qemu_system_vmstop_request_prepare();
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return 0;
    }

    return do_vm_stop(state);
}

/**
 * Prepare for (re)starting the VM.
 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
 * running or in case of an error condition), 0 otherwise.
 */
int vm_prepare_start(void)
{
    RunState requested;
    int res = 0;

    qemu_vmstop_requested(&requested);
    if (runstate_is_running() && requested == RUN_STATE__MAX) {
        return -1;
    }

    /* Ensure that a STOP/RESUME pair of events is emitted if a
     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
     * example, according to documentation is always followed by
     * the STOP event.
     */
    if (runstate_is_running()) {
        qapi_event_send_stop(&error_abort);
        res = -1;
    } else {
        replay_enable_events();
        cpu_enable_ticks();
        runstate_set(RUN_STATE_RUNNING);
        vm_state_notify(1, RUN_STATE_RUNNING);
    }

    /* We are sending this now, but the CPUs will be resumed shortly afterwards */
    qapi_event_send_resume(&error_abort);
    return res;
}

void vm_start(void)
{
    if (!vm_prepare_start()) {
        resume_all_vcpus();
    }
}

/* Does a state transition even if the VM is already stopped;
   the current state is forgotten forever. */
int vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        return vm_stop(state);
    } else {
        runstate_set(state);

        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
        return bdrv_flush_all();
    }
}

void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still lack it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}

CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
#elif defined(TARGET_TRICORE)
        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
        CPUTriCoreState *env = &tricore_cpu->env;
#endif

        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->arch = CPU_INFO_ARCH_X86;
        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->arch = CPU_INFO_ARCH_PPC;
        info->value->u.ppc.nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->arch = CPU_INFO_ARCH_SPARC;
        info->value->u.q_sparc.pc = env->pc;
        info->value->u.q_sparc.npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->arch = CPU_INFO_ARCH_MIPS;
        info->value->u.q_mips.PC = env->active_tc.PC;
#elif defined(TARGET_TRICORE)
        info->value->arch = CPU_INFO_ARCH_TRICORE;
        info->value->u.tricore.PC = env->PC;
#else
        info->value->arch = CPU_INFO_ARCH_OTHER;
#endif

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

void qmp_memsave(int64_t addr, int64_t size, const char *filename,
                 bool has_cpu, int64_t cpu_index, Error **errp)
{
    FILE *f;
    uint32_t l;
    CPUState *cpu;
    uint8_t buf[1024];
    int64_t orig_addr = addr, orig_size = size;

    if (!has_cpu) {
        cpu_index = 0;
    }

    cpu = qemu_get_cpu(cpu_index);
    if (cpu == NULL) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
                   "a CPU number");
        return;
    }

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
                             " specified", orig_addr, orig_size);
            goto exit;
        }
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
                  Error **errp)
{
    FILE *f;
    uint32_t l;
    uint8_t buf[1024];

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        cpu_physical_memory_read(addr, buf, l);
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}

void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
{
    if (!use_icount) {
        return;
    }

    cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
                (cpu_get_clock() - cpu_get_icount()) / SCALE_MS);
    if (icount_align_option) {
        cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n",
                    -max_delay / SCALE_MS);
        cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n",
                    max_advance / SCALE_MS);
    } else {
        cpu_fprintf(f, "Max guest delay NA\n");
        cpu_fprintf(f, "Max guest advance NA\n");
    }
}