cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 /* Needed early for CONFIG_BSD etc. */
  26 #include "qemu/osdep.h"
  27 #include "qemu-common.h"
  28 #include "qemu/config-file.h"
  29 #include "cpu.h"
  30 #include "monitor/monitor.h"
  31 #include "qapi/qmp/qerror.h"
  32 #include "qemu/error-report.h"
  33 #include "sysemu/sysemu.h"
  34 #include "sysemu/block-backend.h"
  35 #include "exec/gdbstub.h"
  36 #include "sysemu/dma.h"
  37 #include "sysemu/hw_accel.h"
  38 #include "sysemu/kvm.h"
  39 #include "sysemu/hax.h"
  40 #include "qmp-commands.h"
  41 #include "exec/exec-all.h"
  42
  43 #include "qemu/thread.h"
  44 #include "sysemu/cpus.h"
  45 #include "sysemu/qtest.h"
  46 #include "qemu/main-loop.h"
  47 #include "qemu/bitmap.h"
  48 #include "qemu/seqlock.h"
  49 #include "tcg.h"
  50 #include "qapi-event.h"
  51 #include "hw/nmi.h"
  52 #include "sysemu/replay.h"
  53
  54 #ifdef CONFIG_LINUX
  55
  56 #include <sys/prctl.h>
  57
  58 #ifndef PR_MCE_KILL
  59 #define PR_MCE_KILL 33
  60 #endif
  61
  62 #ifndef PR_MCE_KILL_SET
  63 #define PR_MCE_KILL_SET 1
  64 #endif
  65
  66 #ifndef PR_MCE_KILL_EARLY
  67 #define PR_MCE_KILL_EARLY 1
  68 #endif
  69
  70 #endif /* CONFIG_LINUX */
  71
/* NOTE(review): max_delay and max_advance are not referenced in the
 * visible part of this file -- confirm where they are maintained.
 */
int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
/* Timer that runs cpu_throttle_timer_tick(); created in cpu_ticks_init(). */
static QEMUTimer *throttle_timer;
/* Current throttle percentage, 0 when throttling is off.  Accessed with
 * atomic_set()/atomic_read().
 */
static unsigned int throttle_percentage;

/* Bounds that cpu_throttle_set() clamps its argument to. */
#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/* Length of one throttling timeslice in nanoseconds (10 ms). */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
  82
  83 bool cpu_is_stopped(CPUState *cpu)
  84 {
  85     return cpu->stopped || !runstate_is_running();
  86 }
  87
  88 static bool cpu_thread_is_idle(CPUState *cpu)
  89 {
  90     if (cpu->stop || cpu->queued_work_first) {
  91         return false;
  92     }
  93     if (cpu_is_stopped(cpu)) {
  94         return true;
  95     }
  96     if (!cpu->halted || cpu_has_work(cpu) ||
  97         kvm_halt_in_kernel()) {
  98         return false;
  99     }
 100     return true;
 101 }
 102
 103 static bool all_cpu_threads_idle(void)
 104 {
 105     CPUState *cpu;
 106
 107     CPU_FOREACH(cpu) {
 108         if (!cpu_thread_is_idle(cpu)) {
 109             return false;
 110         }
 111     }
 112     return true;
 113 }
 114
 115 /***********************************************************/
 116 /* guest cycle counter */
 117
 118 /* Protected by TimersState seqlock */
 119
/* Set from the "sleep" option in configure_icount().  When false,
 * qemu_start_warp_timer() advances the clock immediately instead of arming
 * icount_warp_timer.
 */
static bool icount_sleep = true;
/* QEMU_CLOCK_VIRTUAL_RT value recorded by qemu_start_warp_timer() when a
 * warp begins; -1 while no warp is pending.
 */
static int64_t vm_clock_warp_start = -1;
/* Conversion factor from emulated instructions to virtual clock ticks.  */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
#define MAX_ICOUNT_SHIFT 10

/* Timers created in configure_icount(): the first two drive
 * icount_adjust_rt()/icount_adjust_vm(), the third icount_timer_cb().
 */
static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;
 130
/* Bookkeeping for the guest-visible tick counter, the VM clock and the
 * instruction counter.  A single instance, timers_state, is used by this
 * file.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    /* Last value returned by cpu_get_ticks(); used to keep it monotonic. */
    int64_t cpu_ticks_prev;
    /* Offset added to the host tick counter by cpu_get_ticks(). */
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read out of BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    /* Offset added to get_clock() by cpu_get_clock_locked(). */
    int64_t cpu_clock_offset;
    /* Non-zero between cpu_enable_ticks() and cpu_disable_ticks(). */
    int32_t cpu_ticks_enabled;
    /* Not used by the code visible here except as a vmstate_timers field;
     * presumably kept for migration stream compatibility -- TODO confirm.
     */
    int64_t dummy;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
} TimersState;
 149
/* The one TimersState instance used throughout this file. */
static TimersState timers_state;
/* Whether multi-threaded TCG is in use; assigned by qemu_tcg_configure(). */
bool mttcg_enabled;
 152
 153 /*
 154  * We default to false if we know other options have been enabled
 155  * which are currently incompatible with MTTCG. Otherwise when each
 156  * guest (target) has been updated to support:
 157  *   - atomic instructions
 158  *   - memory ordering primitives (barriers)
 159  * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 160  *
 161  * Once a guest architecture has been converted to the new primitives
 162  * there are two remaining limitations to check.
 163  *
 164  * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 165  * - The host must have a stronger memory order than the guest
 166  *
 167  * It may be possible in future to support strong guests on weak hosts
 168  * but that will require tagging all load/stores in a guest with their
 169  * implicit memory order requirements which would likely slow things
 170  * down a lot.
 171  */
 172
 173 static bool check_tcg_memory_orders_compatible(void)
 174 {
 175 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 176     return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 177 #else
 178     return false;
 179 #endif
 180 }
 181
 182 static bool default_mttcg_enabled(void)
 183 {
 184     if (use_icount || TCG_OVERSIZED_GUEST) {
 185         return false;
 186     } else {
 187 #ifdef TARGET_SUPPORTS_MTTCG
 188         return check_tcg_memory_orders_compatible();
 189 #else
 190         return false;
 191 #endif
 192     }
 193 }
 194
 195 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 196 {
 197     const char *t = qemu_opt_get(opts, "thread");
 198     if (t) {
 199         if (strcmp(t, "multi") == 0) {
 200             if (TCG_OVERSIZED_GUEST) {
 201                 error_setg(errp, "No MTTCG when guest word size > hosts");
 202             } else if (use_icount) {
 203                 error_setg(errp, "No MTTCG when icount is enabled");
 204             } else {
 205 #ifndef TARGET_SUPPORTS_MTTCG
 206                 error_report("Guest not yet converted to MTTCG - "
 207                              "you may get unexpected results");
 208 #endif
 209                 if (!check_tcg_memory_orders_compatible()) {
 210                     error_report("Guest expects a stronger memory ordering "
 211                                  "than the host provides");
 212                     error_printf("This may cause strange/hard to debug errors\n");
 213                 }
 214                 mttcg_enabled = true;
 215             }
 216         } else if (strcmp(t, "single") == 0) {
 217             mttcg_enabled = false;
 218         } else {
 219             error_setg(errp, "Invalid 'thread' setting %s", t);
 220         }
 221     } else {
 222         mttcg_enabled = default_mttcg_enabled();
 223     }
 224 }
 225
 226 int64_t cpu_get_icount_raw(void)
 227 {
 228     int64_t icount;
 229     CPUState *cpu = current_cpu;
 230
 231     icount = timers_state.qemu_icount;
 232     if (cpu && cpu->running) {
 233         if (!cpu->can_do_io) {
 234             fprintf(stderr, "Bad icount read\n");
 235             exit(1);
 236         }
 237         icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
 238     }
 239     return icount;
 240 }
 241
 242 /* Return the virtual CPU time, based on the instruction counter.  */
 243 static int64_t cpu_get_icount_locked(void)
 244 {
 245     int64_t icount = cpu_get_icount_raw();
 246     return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
 247 }
 248
 249 int64_t cpu_get_icount(void)
 250 {
 251     int64_t icount;
 252     unsigned start;
 253
 254     do {
 255         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 256         icount = cpu_get_icount_locked();
 257     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 258
 259     return icount;
 260 }
 261
 262 int64_t cpu_icount_to_ns(int64_t icount)
 263 {
 264     return icount << icount_time_shift;
 265 }
 266
 267 /* return the time elapsed in VM between vm_start and vm_stop.  Unless
 268  * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 269  * counter.
 270  *
 271  * Caller must hold the BQL
 272  */
 273 int64_t cpu_get_ticks(void)
 274 {
 275     int64_t ticks;
 276
 277     if (use_icount) {
 278         return cpu_get_icount();
 279     }
 280
 281     ticks = timers_state.cpu_ticks_offset;
 282     if (timers_state.cpu_ticks_enabled) {
 283         ticks += cpu_get_host_ticks();
 284     }
 285
 286     if (timers_state.cpu_ticks_prev > ticks) {
 287         /* Note: non increasing ticks may happen if the host uses
 288            software suspend */
 289         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 290         ticks = timers_state.cpu_ticks_prev;
 291     }
 292
 293     timers_state.cpu_ticks_prev = ticks;
 294     return ticks;
 295 }
 296
 297 static int64_t cpu_get_clock_locked(void)
 298 {
 299     int64_t time;
 300
 301     time = timers_state.cpu_clock_offset;
 302     if (timers_state.cpu_ticks_enabled) {
 303         time += get_clock();
 304     }
 305
 306     return time;
 307 }
 308
 309 /* Return the monotonic time elapsed in VM, i.e.,
 310  * the time between vm_start and vm_stop
 311  */
 312 int64_t cpu_get_clock(void)
 313 {
 314     int64_t ti;
 315     unsigned start;
 316
 317     do {
 318         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 319         ti = cpu_get_clock_locked();
 320     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 321
 322     return ti;
 323 }
 324
 325 /* enable cpu_get_ticks()
 326  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 327  */
 328 void cpu_enable_ticks(void)
 329 {
 330     /* Here, the really thing protected by seqlock is cpu_clock_offset. */
 331     seqlock_write_begin(&timers_state.vm_clock_seqlock);
 332     if (!timers_state.cpu_ticks_enabled) {
 333         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 334         timers_state.cpu_clock_offset -= get_clock();
 335         timers_state.cpu_ticks_enabled = 1;
 336     }
 337     seqlock_write_end(&timers_state.vm_clock_seqlock);
 338 }
 339
 340 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 341  * cpu_get_ticks() after that.
 342  * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 343  */
 344 void cpu_disable_ticks(void)
 345 {
 346     /* Here, the really thing protected by seqlock is cpu_clock_offset. */
 347     seqlock_write_begin(&timers_state.vm_clock_seqlock);
 348     if (timers_state.cpu_ticks_enabled) {
 349         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 350         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 351         timers_state.cpu_ticks_enabled = 0;
 352     }
 353     seqlock_write_end(&timers_state.vm_clock_seqlock);
 354 }
 355
 356 /* Correlation between real and virtual time is always going to be
 357    fairly approximate, so ignore small variation.
 358    When the guest is idle real and virtual time will be aligned in
 359    the IO wait loop.  */
 360 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 361
/* Re-tune icount_time_shift so that instruction-count based virtual time
 * follows the clock returned by cpu_get_clock_locked().
 *
 * Does nothing while the VM is not running.  The comparison and the update
 * of the shift and bias happen inside one vm_clock_seqlock write section.
 */
static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex.  */
    /* NOTE(review): the only lock taken below is vm_clock_seqlock. */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    /* Positive: icount time is ahead of the clock; negative: behind. */
    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        icount_time_shift++;
    }
    last_delta = delta;
    /* Recompute the bias for the (possibly changed) shift so that
     * bias + (qemu_icount << shift) still equals cur_icount.
     */
    timers_state.qemu_icount_bias = cur_icount
                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}
 399
 400 static void icount_adjust_rt(void *opaque)
 401 {
 402     timer_mod(icount_rt_timer,
 403               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 404     icount_adjust();
 405 }
 406
 407 static void icount_adjust_vm(void *opaque)
 408 {
 409     timer_mod(icount_vm_timer,
 410                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 411                    NANOSECONDS_PER_SECOND / 10);
 412     icount_adjust();
 413 }
 414
 415 static int64_t qemu_icount_round(int64_t count)
 416 {
 417     return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
 418 }
 419
/* Finish a pending clock warp: credit the time elapsed since
 * vm_clock_warp_start to qemu_icount_bias and mark the warp as done.
 *
 * Returns immediately when no warp is pending (vm_clock_warp_start == -1).
 * The bias is only changed while the VM is running, but the pending warp is
 * cleared either way.
 */
static void icount_warp_rt(void)
{
    unsigned seq;
    int64_t warp_start;

    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    do {
        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        warp_start = vm_clock_warp_start;
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));

    if (warp_start == -1) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
                                     cpu_get_clock_locked());
        int64_t warp_delta;

        /* Time that passed since the warp was started. */
        warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        timers_state.qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;
    seqlock_write_end(&timers_state.vm_clock_seqlock);

    /* The bias change may have made QEMU_CLOCK_VIRTUAL timers due. */
    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}
 462
 463 static void icount_timer_cb(void *opaque)
 464 {
 465     /* No need for a checkpoint because the timer already synchronizes
 466      * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 467      */
 468     icount_warp_rt();
 469 }
 470
/* Advance QEMU_CLOCK_VIRTUAL to @dest under qtest, running timers on the
 * way.
 *
 * @dest: target QEMU_CLOCK_VIRTUAL value in nanoseconds
 *
 * Asserts that qtest is enabled.  The clock is moved by adding to
 * qemu_icount_bias in steps that stop at the next timer deadline, so that
 * each timer runs at its own expiry time.
 */
void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    AioContext *aio_context;
    assert(qtest_enabled());
    aio_context = qemu_get_aio_context();
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        /* Step to the nearer of the destination and the next deadline. */
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);

        seqlock_write_begin(&timers_state.vm_clock_seqlock);
        timers_state.qemu_icount_bias += warp;
        seqlock_write_end(&timers_state.vm_clock_seqlock);

        /* Run the timers of the clock itself and of the main AioContext. */
        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}
 491
/* Arrange for QEMU_CLOCK_VIRTUAL to keep advancing while all vCPUs are idle
 * in icount mode.
 *
 * Does nothing unless icount is in use, the VM is running, the replay
 * checkpoint allows it, every vCPU thread is idle and qtest is not enabled.
 * Depending on icount_sleep the clock is either advanced straight to the
 * next deadline, or a warp is started and icount_warp_timer is armed.
 */
void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
        return;
    }

    if (!all_cpu_threads_idle()) {
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount.  */
        return;
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        /* Warn only once per process. */
        static bool notified;
        if (!icount_sleep && !notified) {
            error_report("WARNING: icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * It is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            timers_state.qemu_icount_bias += deadline;
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time, (related to the time left until the next event) has
             * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This avoids that the warps are visible externally; for example,
             * you will not be sending network packets continuously instead of
             * every 100ms.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            /* Keep the earliest start if a warp is already pending. */
            if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
                vm_clock_warp_start = clock;
            }
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            timer_mod_anticipate(icount_warp_timer, clock + deadline);
        }
    } else if (deadline == 0) {
        /* A timer is already due: just make sure it gets run. */
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}
 574
 575 static void qemu_account_warp_timer(void)
 576 {
 577     if (!use_icount || !icount_sleep) {
 578         return;
 579     }
 580
 581     /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 582      * do not fire, so computing the deadline does not make sense.
 583      */
 584     if (!runstate_is_running()) {
 585         return;
 586     }
 587
 588     /* warp clock deterministically in record/replay mode */
 589     if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 590         return;
 591     }
 592
 593     timer_del(icount_warp_timer);
 594     icount_warp_rt();
 595 }
 596
 597 static bool icount_state_needed(void *opaque)
 598 {
 599     return use_icount;
 600 }
 601
 602 /*
 603  * This is a subsection for icount migration.
 604  */
/* Subsection "timer/icount": carries the icount bias and instruction count
 * of TimersState.  Used only when icount_state_needed() returns true.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 616
/* Section "timer": the TimersState fields listed below, with
 * icount_vmstate_timers as an optional subsection.  Registered against
 * timers_state in cpu_ticks_init().
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        /* Declared with version 2 via VMSTATE_INT64_V. */
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
 632
 633 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 634 {
 635     double pct;
 636     double throttle_ratio;
 637     long sleeptime_ns;
 638
 639     if (!cpu_throttle_get_percentage()) {
 640         return;
 641     }
 642
 643     pct = (double)cpu_throttle_get_percentage()/100;
 644     throttle_ratio = pct / (1 - pct);
 645     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
 646
 647     qemu_mutex_unlock_iothread();
 648     atomic_set(&cpu->throttle_thread_scheduled, 0);
 649     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
 650     qemu_mutex_lock_iothread();
 651 }
 652
 653 static void cpu_throttle_timer_tick(void *opaque)
 654 {
 655     CPUState *cpu;
 656     double pct;
 657
 658     /* Stop the timer if needed */
 659     if (!cpu_throttle_get_percentage()) {
 660         return;
 661     }
 662     CPU_FOREACH(cpu) {
 663         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 664             async_run_on_cpu(cpu, cpu_throttle_thread,
 665                              RUN_ON_CPU_NULL);
 666         }
 667     }
 668
 669     pct = (double)cpu_throttle_get_percentage()/100;
 670     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 671                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 672 }
 673
 674 void cpu_throttle_set(int new_throttle_pct)
 675 {
 676     /* Ensure throttle percentage is within valid range */
 677     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 678     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 679
 680     atomic_set(&throttle_percentage, new_throttle_pct);
 681
 682     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 683                                        CPU_THROTTLE_TIMESLICE_NS);
 684 }
 685
/* Turn throttling off by resetting the percentage to 0.  The timer itself
 * is not touched here; cpu_throttle_timer_tick() stops re-arming it once it
 * sees a zero percentage.
 */
void cpu_throttle_stop(void)
{
    atomic_set(&throttle_percentage, 0);
}
 690
 691 bool cpu_throttle_active(void)
 692 {
 693     return (cpu_throttle_get_percentage() != 0);
 694 }
 695
/* Return the current throttle percentage (0 when throttling is off), read
 * with atomic_read().
 */
int cpu_throttle_get_percentage(void)
{
    return atomic_read(&throttle_percentage);
}
 700
/* One-time setup of the timekeeping state: initialise the seqlock, register
 * timers_state for vmstate handling and create (but do not arm) the
 * throttle timer.
 */
void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                           cpu_throttle_timer_tick, NULL);
}
 708
 709 void configure_icount(QemuOpts *opts, Error **errp)
 710 {
 711     const char *option;
 712     char *rem_str = NULL;
 713
 714     option = qemu_opt_get(opts, "shift");
 715     if (!option) {
 716         if (qemu_opt_get(opts, "align") != NULL) {
 717             error_setg(errp, "Please specify shift option when using align");
 718         }
 719         return;
 720     }
 721
 722     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 723     if (icount_sleep) {
 724         icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 725                                          icount_timer_cb, NULL);
 726     }
 727
 728     icount_align_option = qemu_opt_get_bool(opts, "align", false);
 729
 730     if (icount_align_option && !icount_sleep) {
 731         error_setg(errp, "align=on and sleep=off are incompatible");
 732     }
 733     if (strcmp(option, "auto") != 0) {
 734         errno = 0;
 735         icount_time_shift = strtol(option, &rem_str, 0);
 736         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 737             error_setg(errp, "icount: Invalid shift value");
 738         }
 739         use_icount = 1;
 740         return;
 741     } else if (icount_align_option) {
 742         error_setg(errp, "shift=auto and align=on are incompatible");
 743     } else if (!icount_sleep) {
 744         error_setg(errp, "shift=auto and sleep=off are incompatible");
 745     }
 746
 747     use_icount = 2;
 748
 749     /* 125MIPS seems a reasonable initial guess at the guest speed.
 750        It will be corrected fairly quickly anyway.  */
 751     icount_time_shift = 3;
 752
 753     /* Have both realtime and virtual time triggers for speed adjustment.
 754        The realtime trigger catches emulated time passing too slowly,
 755        the virtual time trigger catches emulated time passing too fast.
 756        Realtime triggers occur even when idle, so use them less frequently
 757        than VM triggers.  */
 758     icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 759                                    icount_adjust_rt, NULL);
 760     timer_mod(icount_rt_timer,
 761                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 762     icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 763                                         icount_adjust_vm, NULL);
 764     timer_mod(icount_vm_timer,
 765                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 766                    NANOSECONDS_PER_SECOND / 10);
 767 }
 768
 769 /***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */
 780
/* Created by start_tcg_kick_timer(); NULL while no kick timer exists. */
static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU that qemu_cpu_kick_rr_cpu() kicks; read with atomic_mb_read(). */
static CPUState *tcg_current_rr_cpu;

/* Interval between two kicks, in nanoseconds (a tenth of a second). */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 785
 786 static inline int64_t qemu_tcg_next_kick(void)
 787 {
 788     return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 789 }
 790
 791 /* Kick the currently round-robin scheduled vCPU */
 792 static void qemu_cpu_kick_rr_cpu(void)
 793 {
 794     CPUState *cpu;
 795     do {
 796         cpu = atomic_mb_read(&tcg_current_rr_cpu);
 797         if (cpu) {
 798             cpu_exit(cpu);
 799         }
 800     } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 801 }
 802
/* Empty work item.  qemu_timer_notify_cb() queues it with
 * async_run_on_cpu() purely for the side effect of having queued work on
 * the vCPU.
 */
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}
 806
 807 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 808 {
 809     if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 810         qemu_notify_event();
 811         return;
 812     }
 813
 814     if (!qemu_in_vcpu_thread() && first_cpu) {
 815         /* qemu_cpu_kick is not enough to kick a halted CPU out of
 816          * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 817          * causes cpu_thread_is_idle to return false.  This way,
 818          * handle_icount_deadline can run.
 819          */
 820         async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 821     }
 822 }
 823
 824 static void kick_tcg_thread(void *opaque)
 825 {
 826     timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 827     qemu_cpu_kick_rr_cpu();
 828 }
 829
 830 static void start_tcg_kick_timer(void)
 831 {
 832     if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 833         tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 834                                            kick_tcg_thread, NULL);
 835         timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 836     }
 837 }
 838
 839 static void stop_tcg_kick_timer(void)
 840 {
 841     if (tcg_kick_vcpu_timer) {
 842         timer_del(tcg_kick_vcpu_timer);
 843         tcg_kick_vcpu_timer = NULL;
 844     }
 845 }
 846
 847 /***********************************************************/
 848 void hw_error(const char *fmt, ...)
 849 {
 850     va_list ap;
 851     CPUState *cpu;
 852
 853     va_start(ap, fmt);
 854     fprintf(stderr, "qemu: hardware error: ");
 855     vfprintf(stderr, fmt, ap);
 856     fprintf(stderr, "\n");
 857     CPU_FOREACH(cpu) {
 858         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
 859         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
 860     }
 861     va_end(ap);
 862     abort();
 863 }
 864
 865 void cpu_synchronize_all_states(void)
 866 {
 867     CPUState *cpu;
 868
 869     CPU_FOREACH(cpu) {
 870         cpu_synchronize_state(cpu);
 871     }
 872 }
 873
 874 void cpu_synchronize_all_post_reset(void)
 875 {
 876     CPUState *cpu;
 877
 878     CPU_FOREACH(cpu) {
 879         cpu_synchronize_post_reset(cpu);
 880     }
 881 }
 882
 883 void cpu_synchronize_all_post_init(void)
 884 {
 885     CPUState *cpu;
 886
 887     CPU_FOREACH(cpu) {
 888         cpu_synchronize_post_init(cpu);
 889     }
 890 }
 891
 892 static int do_vm_stop(RunState state)
 893 {
 894     int ret = 0;
 895
 896     if (runstate_is_running()) {
 897         cpu_disable_ticks();
 898         pause_all_vcpus();
 899         runstate_set(state);
 900         vm_state_notify(0, state);
 901         qapi_event_send_stop(&error_abort);
 902     }
 903
 904     bdrv_drain_all();
 905     replay_disable_events();
 906     ret = bdrv_flush_all();
 907
 908     return ret;
 909 }
 910
 911 static bool cpu_can_run(CPUState *cpu)
 912 {
 913     if (cpu->stop) {
 914         return false;
 915     }
 916     if (cpu_is_stopped(cpu)) {
 917         return false;
 918     }
 919     return true;
 920 }
 921
/* React to a guest debug event on @cpu: record it as the gdb stop CPU,
 * raise a system debug request and mark the vCPU as stopped.
 */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
 928
 929 #ifdef CONFIG_LINUX
 930 static void sigbus_reraise(void)
 931 {
 932     sigset_t set;
 933     struct sigaction action;
 934
 935     memset(&action, 0, sizeof(action));
 936     action.sa_handler = SIG_DFL;
 937     if (!sigaction(SIGBUS, &action, NULL)) {
 938         raise(SIGBUS);
 939         sigemptyset(&set);
 940         sigaddset(&set, SIGBUS);
 941         pthread_sigmask(SIG_UNBLOCK, &set, NULL);
 942     }
 943     perror("Failed to re-raise SIGBUS!\n");
 944     abort();
 945 }
 946
 947 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
 948 {
 949     if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
 950         sigbus_reraise();
 951     }
 952
 953     if (current_cpu) {
 954         /* Called asynchronously in VCPU thread.  */
 955         if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
 956             sigbus_reraise();
 957         }
 958     } else {
 959         /* Called synchronously (via signalfd) in main thread.  */
 960         if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
 961             sigbus_reraise();
 962         }
 963     }
 964 }
 965
 966 static void qemu_init_sigbus(void)
 967 {
 968     struct sigaction action;
 969
 970     memset(&action, 0, sizeof(action));
 971     action.sa_flags = SA_SIGINFO;
 972     action.sa_sigaction = sigbus_handler;
 973     sigaction(SIGBUS, &action, NULL);
 974
 975     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
 976 }
 977 #else /* !CONFIG_LINUX */
/* Built when CONFIG_LINUX is not defined: no SIGBUS setup is performed. */
static void qemu_init_sigbus(void)
{
}
 981 #endif /* !CONFIG_LINUX */
 982
/*
 * Mutex taken by qemu_mutex_lock_iothread(); also the mutex paired with
 * every condition variable wait in this file.
 */
static QemuMutex qemu_global_mutex;

/* Identity of the thread that called qemu_init_cpu_loop(). */
static QemuThread io_thread;

/*
 * cpu creation/destruction: signalled whenever a vCPU's 'created' flag
 * changes; waited on while starting a vCPU and in cpu_remove_sync().
 */
static QemuCond qemu_cpu_cond;
/*
 * vCPU pausing: broadcast when a vCPU becomes stopped; pause_all_vcpus()
 * waits on it.
 */
static QemuCond qemu_pause_cond;
 991
 992 void qemu_init_cpu_loop(void)
 993 {
 994     qemu_init_sigbus();
 995     qemu_cond_init(&qemu_cpu_cond);
 996     qemu_cond_init(&qemu_pause_cond);
 997     qemu_mutex_init(&qemu_global_mutex);
 998
 999     qemu_thread_get_self(&io_thread);
1000 }
1001
/*
 * Forward @func/@data for @cpu to do_run_on_cpu(), supplying
 * qemu_global_mutex as the mutex argument.
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1006
1007 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1008 {
1009     if (kvm_destroy_vcpu(cpu) < 0) {
1010         error_report("kvm_destroy_vcpu failed");
1011         exit(EXIT_FAILURE);
1012     }
1013 }
1014
/* TCG counterpart of qemu_kvm_destroy_vcpu(); currently does nothing. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1018
/*
 * Housekeeping shared by all vCPU thread loops after they wake up:
 * clear the thread_kicked flag, turn a pending stop request into the
 * stopped state (waking waiters on qemu_pause_cond), then run the work
 * queued for @cpu.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        /* Acknowledge the stop request and let pause_all_vcpus() re-check. */
        cpu->stop = false;
        cpu->stopped = true;
        qemu_cond_broadcast(&qemu_pause_cond);
    }
    process_queued_cpu_work(cpu);
}
1029
1030 static bool qemu_tcg_should_sleep(CPUState *cpu)
1031 {
1032     if (mttcg_enabled) {
1033         return cpu_thread_is_idle(cpu);
1034     } else {
1035         return all_cpu_threads_idle();
1036     }
1037 }
1038
/*
 * Block the TCG vCPU thread on @cpu's halt_cond for as long as
 * qemu_tcg_should_sleep() holds, with the kick timer stopped while
 * sleeping.  On wake-up the kick timer is restarted and the common
 * post-wait housekeeping runs.  qemu_global_mutex is the mutex handed to
 * qemu_cond_wait().
 */
static void qemu_tcg_wait_io_event(CPUState *cpu)
{
    while (qemu_tcg_should_sleep(cpu)) {
        stop_tcg_kick_timer();
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    qemu_wait_io_event_common(cpu);
}
1050
/*
 * Block the KVM vCPU thread on @cpu's halt_cond while the vCPU thread is
 * idle, then run the common post-wait housekeeping.  qemu_global_mutex is
 * the mutex handed to qemu_cond_wait().
 */
static void qemu_kvm_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    qemu_wait_io_event_common(cpu);
}
1059
/*
 * Thread entry point for a KVM vCPU.
 *
 * @arg is the CPUState driven by this thread.  The thread registers with
 * RCU, takes the iothread lock, initialises the KVM vCPU (exiting QEMU on
 * failure) and announces creation on qemu_cpu_cond.  It then alternates
 * between kvm_cpu_exec() and waiting for events until the vCPU has been
 * unplugged and can no longer run, at which point the KVM vCPU is
 * destroyed and 'created' is cleared again.
 *
 * Always returns NULL.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Unplugged and no longer runnable: tear down and wake any waiter. */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    return NULL;
}
1101
1102 static void *qemu_dummy_cpu_thread_fn(void *arg)
1103 {
1104 #ifdef _WIN32
1105     fprintf(stderr, "qtest is not supported under Windows\n");
1106     exit(1);
1107 #else
1108     CPUState *cpu = arg;
1109     sigset_t waitset;
1110     int r;
1111
1112     rcu_register_thread();
1113
1114     qemu_mutex_lock_iothread();
1115     qemu_thread_get_self(cpu->thread);
1116     cpu->thread_id = qemu_get_thread_id();
1117     cpu->can_do_io = 1;
1118     current_cpu = cpu;
1119
1120     sigemptyset(&waitset);
1121     sigaddset(&waitset, SIG_IPI);
1122
1123     /* signal CPU creation */
1124     cpu->created = true;
1125     qemu_cond_signal(&qemu_cpu_cond);
1126
1127     while (1) {
1128         qemu_mutex_unlock_iothread();
1129         do {
1130             int sig;
1131             r = sigwait(&waitset, &sig);
1132         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1133         if (r == -1) {
1134             perror("sigwait");
1135             exit(1);
1136         }
1137         qemu_mutex_lock_iothread();
1138         qemu_wait_io_event_common(cpu);
1139     }
1140
1141     return NULL;
1142 #endif
1143 }
1144
1145 static int64_t tcg_get_icount_limit(void)
1146 {
1147     int64_t deadline;
1148
1149     if (replay_mode != REPLAY_MODE_PLAY) {
1150         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1151
1152         /* Maintain prior (possibly buggy) behaviour where if no deadline
1153          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1154          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1155          * nanoseconds.
1156          */
1157         if ((deadline < 0) || (deadline > INT32_MAX)) {
1158             deadline = INT32_MAX;
1159         }
1160
1161         return qemu_icount_round(deadline);
1162     } else {
1163         return replay_get_instructions();
1164     }
1165 }
1166
1167 static void handle_icount_deadline(void)
1168 {
1169     assert(qemu_in_vcpu_thread());
1170     if (use_icount) {
1171         int64_t deadline =
1172             qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1173
1174         if (deadline == 0) {
1175             /* Wake up other AioContexts.  */
1176             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1177             qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1178         }
1179     }
1180 }
1181
1182 static void prepare_icount_for_run(CPUState *cpu)
1183 {
1184     if (use_icount) {
1185         int64_t count;
1186         int decr;
1187
1188         /* These should always be cleared by process_icount_data after
1189          * each vCPU execution. However u16.high can be raised
1190          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1191          */
1192         g_assert(cpu->icount_decr.u16.low == 0);
1193         g_assert(cpu->icount_extra == 0);
1194
1195
1196         count = tcg_get_icount_limit();
1197
1198         timers_state.qemu_icount += count;
1199         decr = (count > 0xffff) ? 0xffff : count;
1200         count -= decr;
1201         cpu->icount_decr.u16.low = decr;
1202         cpu->icount_extra = count;
1203     }
1204 }
1205
1206 static void process_icount_data(CPUState *cpu)
1207 {
1208     if (use_icount) {
1209         /* Fold pending instructions back into the
1210            instruction counter, and clear the interrupt flag.  */
1211         timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1212                         + cpu->icount_extra);
1213
1214         /* Reset the counters */
1215         cpu->icount_decr.u16.low = 0;
1216         cpu->icount_extra = 0;
1217         replay_account_executed_instructions();
1218     }
1219 }
1220
1221
1222 static int tcg_cpu_exec(CPUState *cpu)
1223 {
1224     int ret;
1225 #ifdef CONFIG_PROFILER
1226     int64_t ti;
1227 #endif
1228
1229 #ifdef CONFIG_PROFILER
1230     ti = profile_getclock();
1231 #endif
1232     qemu_mutex_unlock_iothread();
1233     cpu_exec_start(cpu);
1234     ret = cpu_exec(cpu);
1235     cpu_exec_end(cpu);
1236     qemu_mutex_lock_iothread();
1237 #ifdef CONFIG_PROFILER
1238     tcg_time += profile_getclock() - ti;
1239 #endif
1240     return ret;
1241 }
1242
1243 /* Destroy any remaining vCPUs which have been unplugged and have
1244  * finished running
1245  */
1246 static void deal_with_unplugged_cpus(void)
1247 {
1248     CPUState *cpu;
1249
1250     CPU_FOREACH(cpu) {
1251         if (cpu->unplug && !cpu_can_run(cpu)) {
1252             qemu_tcg_destroy_vcpu(cpu);
1253             cpu->created = false;
1254             qemu_cond_signal(&qemu_cpu_cond);
1255             break;
1256         }
1257     }
1258 }
1259
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 *
 * @arg is the CPUState whose 'thread' field identifies this thread; it is
 * only used for qemu_thread_get_self().  After that the local 'cpu' is
 * reused as the round-robin cursor over all vCPUs.  The function loops
 * forever; the trailing return is never reached.
 */

static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    /* This one thread serves every vCPU: mark them all as created. */
    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
        cpu->can_do_io = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work: with exit_request set, the inner
     * execution loop below is skipped on the first pass and we go
     * straight to qemu_tcg_wait_io_event().
     */
    cpu->exit_request = 1;

    while (1) {
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        /* Wrapped past the end of the CPU list: start over. */
        if (!cpu) {
            cpu = first_cpu;
        }

        /* Execute vCPUs in turn until the list ends, or the current one
         * has queued work or an exit request pending.
         */
        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    /* Run the atomic step without the iothread lock. */
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* Stop requested: leave the loop; when the vCPU is being
                 * unplugged, first advance the cursor past it.
                 */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        /* 'cpu' is NULL when the cursor ran off the end of the list; wait
         * on the first CPU in that case.
         */
        qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
        deal_with_unplugged_cpus();
    }

    return NULL;
}
1365
1366 static void *qemu_hax_cpu_thread_fn(void *arg)
1367 {
1368     CPUState *cpu = arg;
1369     int r;
1370
1371     qemu_mutex_lock_iothread();
1372     qemu_thread_get_self(cpu->thread);
1373
1374     cpu->thread_id = qemu_get_thread_id();
1375     cpu->created = true;
1376     cpu->halted = 0;
1377     current_cpu = cpu;
1378
1379     hax_init_vcpu(cpu);
1380     qemu_cond_signal(&qemu_cpu_cond);
1381
1382     while (1) {
1383         if (cpu_can_run(cpu)) {
1384             r = hax_smp_cpu_exec(cpu);
1385             if (r == EXCP_DEBUG) {
1386                 cpu_handle_guest_debug(cpu);
1387             }
1388         }
1389
1390         while (cpu_thread_is_idle(cpu)) {
1391             qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1392         }
1393 #ifdef _WIN32
1394         SleepEx(0, TRUE);
1395 #endif
1396         qemu_wait_io_event_common(cpu);
1397     }
1398     return NULL;
1399 }
1400
1401 #ifdef _WIN32
/*
 * Empty APC routine.  qemu_cpu_kick_thread() queues it to a vCPU thread
 * with QueueUserAPC(); the routine itself has nothing to do.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1405 #endif
1406
/* Multi-threaded TCG
 *
 * In the multi-threaded case each vCPU has its own thread. The TLS
 * variable current_cpu can be used deep in the code to find the
 * current CPUState for a given thread.
 *
 * @arg is the CPUState driven by this thread.  icount must be disabled
 * (asserted).  The function loops forever; the trailing return is never
 * reached.
 */

static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    g_assert(!use_icount);

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        if (cpu_can_run(cpu)) {
            int r;
            r = tcg_cpu_exec(cpu);
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start-up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                /* Run the atomic step without the iothread lock. */
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
                /* fall through */
            default:
                /* Ignore everything else? */
                break;
            }
        }

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_tcg_wait_io_event(cpu);
    }

    return NULL;
}
1468
1469 static void qemu_cpu_kick_thread(CPUState *cpu)
1470 {
1471 #ifndef _WIN32
1472     int err;
1473
1474     if (cpu->thread_kicked) {
1475         return;
1476     }
1477     cpu->thread_kicked = true;
1478     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1479     if (err) {
1480         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1481         exit(1);
1482     }
1483 #else /* _WIN32 */
1484     if (!qemu_cpu_is_self(cpu)) {
1485         if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1486             fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1487                     __func__, GetLastError());
1488             exit(1);
1489         }
1490     }
1491 #endif
1492 }
1493
1494 void qemu_cpu_kick(CPUState *cpu)
1495 {
1496     qemu_cond_broadcast(cpu->halt_cond);
1497     if (tcg_enabled()) {
1498         cpu_exit(cpu);
1499         /* NOP unless doing single-thread RR */
1500         qemu_cpu_kick_rr_cpu();
1501     } else {
1502         if (hax_enabled()) {
1503             /*
1504              * FIXME: race condition with the exit_request check in
1505              * hax_vcpu_hax_exec
1506              */
1507             cpu->exit_request = 1;
1508         }
1509         qemu_cpu_kick_thread(cpu);
1510     }
1511 }
1512
/*
 * Kick the thread of the calling vCPU.  current_cpu must be set for the
 * calling thread (asserted).
 */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1518
/* Return true when the calling thread is the thread recorded in @cpu->thread. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1523
1524 bool qemu_in_vcpu_thread(void)
1525 {
1526     return current_cpu && qemu_cpu_is_self(current_cpu);
1527 }
1528
/*
 * Per-thread flag: true while this thread holds qemu_global_mutex through
 * qemu_mutex_lock_iothread().
 */
static __thread bool iothread_locked = false;
1530
/* Return the calling thread's iothread_locked flag. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1535
/*
 * Acquire qemu_global_mutex and record that in the calling thread's
 * iothread_locked flag.  Asserts that the flag was not already set for
 * this thread.
 */
void qemu_mutex_lock_iothread(void)
{
    g_assert(!qemu_mutex_iothread_locked());
    qemu_mutex_lock(&qemu_global_mutex);
    iothread_locked = true;
}
1542
/*
 * Release qemu_global_mutex.  Asserts that the calling thread's
 * iothread_locked flag is set; the flag is cleared before the mutex is
 * actually released.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1549
1550 static bool all_vcpus_paused(void)
1551 {
1552     CPUState *cpu;
1553
1554     CPU_FOREACH(cpu) {
1555         if (!cpu->stopped) {
1556             return false;
1557         }
1558     }
1559
1560     return true;
1561 }
1562
/*
 * Request every vCPU to stop and wait until all of them report stopped.
 *
 * QEMU_CLOCK_VIRTUAL is disabled first.  Each vCPU gets its 'stop' flag
 * set and is kicked.  When called from a vCPU thread, the calling vCPU is
 * marked stopped directly via cpu_stop_current().  The function then waits
 * on qemu_pause_cond (with qemu_global_mutex), re-kicking all vCPUs after
 * each wake-up, until all_vcpus_paused() holds.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        cpu->stop = true;
        qemu_cpu_kick(cpu);
    }

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }
}
1584
/* Clear @cpu's stop request and stopped state, then kick it. */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1591
1592 void resume_all_vcpus(void)
1593 {
1594     CPUState *cpu;
1595
1596     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1597     CPU_FOREACH(cpu) {
1598         cpu_resume(cpu);
1599     }
1600 }
1601
/*
 * Begin unplugging @cpu: set its stop and unplug flags and kick it.  Does
 * not wait for the vCPU to be torn down; see cpu_remove_sync().
 */
void cpu_remove(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
}
1608
/*
 * Unplug @cpu and wait on qemu_cpu_cond (with qemu_global_mutex) until its
 * 'created' flag has been cleared.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu_remove(cpu);
    while (cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}
1616
/* Size in bytes of the temporary buffers used to format vCPU thread names. */
#define VCPU_THREAD_NAME_SIZE 16
1619
/*
 * Attach @cpu to a TCG execution thread.
 *
 * With MTTCG every vCPU gets its own thread and halt condition.  Without
 * it, the first call creates the single round-robin thread and remembers
 * its thread and halt condition in function-local statics; later calls
 * make @cpu share them.  When a thread is created here, the function waits
 * on qemu_cpu_cond until the new thread has set cpu->created.
 */
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    /* Shared by all vCPUs in the non-MTTCG case; set on first use. */
    static QemuCond *single_tcg_halt_cond;
    static QemuThread *single_tcg_cpu_thread;

    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);

        if (qemu_tcg_mttcg_enabled()) {
            /* create a thread per vCPU with TCG (MTTCG) */
            parallel_cpus = true;
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);

            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

        } else {
            /* share a single thread for all cpus with TCG */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
            qemu_thread_create(cpu->thread, thread_name,
                               qemu_tcg_rr_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

            single_tcg_halt_cond = cpu->halt_cond;
            single_tcg_cpu_thread = cpu->thread;
        }
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        /* Wait for the new thread to announce the vCPU as created. */
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
    } else {
        /* For non-MTTCG cases we share the thread */
        cpu->thread = single_tcg_cpu_thread;
        cpu->halt_cond = single_tcg_halt_cond;
    }
}
1662
1663 static void qemu_hax_start_vcpu(CPUState *cpu)
1664 {
1665     char thread_name[VCPU_THREAD_NAME_SIZE];
1666
1667     cpu->thread = g_malloc0(sizeof(QemuThread));
1668     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1669     qemu_cond_init(cpu->halt_cond);
1670
1671     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1672              cpu->cpu_index);
1673     qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1674                        cpu, QEMU_THREAD_JOINABLE);
1675 #ifdef _WIN32
1676     cpu->hThread = qemu_thread_get_handle(cpu->thread);
1677 #endif
1678     while (!cpu->created) {
1679         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1680     }
1681 }
1682
1683 static void qemu_kvm_start_vcpu(CPUState *cpu)
1684 {
1685     char thread_name[VCPU_THREAD_NAME_SIZE];
1686
1687     cpu->thread = g_malloc0(sizeof(QemuThread));
1688     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1689     qemu_cond_init(cpu->halt_cond);
1690     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1691              cpu->cpu_index);
1692     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1693                        cpu, QEMU_THREAD_JOINABLE);
1694     while (!cpu->created) {
1695         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1696     }
1697 }
1698
1699 static void qemu_dummy_start_vcpu(CPUState *cpu)
1700 {
1701     char thread_name[VCPU_THREAD_NAME_SIZE];
1702
1703     cpu->thread = g_malloc0(sizeof(QemuThread));
1704     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1705     qemu_cond_init(cpu->halt_cond);
1706     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1707              cpu->cpu_index);
1708     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1709                        QEMU_THREAD_JOINABLE);
1710     while (!cpu->created) {
1711         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1712     }
1713 }
1714
1715 void qemu_init_vcpu(CPUState *cpu)
1716 {
1717     cpu->nr_cores = smp_cores;
1718     cpu->nr_threads = smp_threads;
1719     cpu->stopped = true;
1720
1721     if (!cpu->as) {
1722         /* If the target cpu hasn't set up any address spaces itself,
1723          * give it the default one.
1724          */
1725         AddressSpace *as = address_space_init_shareable(cpu->memory,
1726                                                         "cpu-memory");
1727         cpu->num_ases = 1;
1728         cpu_address_space_init(cpu, as, 0);
1729     }
1730
1731     if (kvm_enabled()) {
1732         qemu_kvm_start_vcpu(cpu);
1733     } else if (hax_enabled()) {
1734         qemu_hax_start_vcpu(cpu);
1735     } else if (tcg_enabled()) {
1736         qemu_tcg_init_vcpu(cpu);
1737     } else {
1738         qemu_dummy_start_vcpu(cpu);
1739     }
1740 }
1741
1742 void cpu_stop_current(void)
1743 {
1744     if (current_cpu) {
1745         current_cpu->stop = false;
1746         current_cpu->stopped = true;
1747         cpu_exit(current_cpu);
1748         qemu_cond_broadcast(&qemu_pause_cond);
1749     }
1750 }
1751
1752 int vm_stop(RunState state)
1753 {
1754     if (qemu_in_vcpu_thread()) {
1755         qemu_system_vmstop_request_prepare();
1756         qemu_system_vmstop_request(state);
1757         /*
1758          * FIXME: should not return to device code in case
1759          * vm_stop() has been requested.
1760          */
1761         cpu_stop_current();
1762         return 0;
1763     }
1764
1765     return do_vm_stop(state);
1766 }
1767
1768 /**
1769  * Prepare for (re)starting the VM.
1770  * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
1771  * running or in case of an error condition), 0 otherwise.
1772  */
1773 int vm_prepare_start(void)
1774 {
1775     RunState requested;
1776     int res = 0;
1777
1778     qemu_vmstop_requested(&requested);
1779     if (runstate_is_running() && requested == RUN_STATE__MAX) {
1780         return -1;
1781     }
1782
1783     /* Ensure that a STOP/RESUME pair of events is emitted if a
1784      * vmstop request was pending.  The BLOCK_IO_ERROR event, for
1785      * example, according to documentation is always followed by
1786      * the STOP event.
1787      */
1788     if (runstate_is_running()) {
1789         qapi_event_send_stop(&error_abort);
1790         res = -1;
1791     } else {
1792         replay_enable_events();
1793         cpu_enable_ticks();
1794         runstate_set(RUN_STATE_RUNNING);
1795         vm_state_notify(1, RUN_STATE_RUNNING);
1796     }
1797
1798     /* We are sending this now, but the CPUs will be resumed shortly later */
1799     qapi_event_send_resume(&error_abort);
1800     return res;
1801 }
1802
/* Start the VM: resume all vCPUs when vm_prepare_start() returns 0. */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
1809
1810 /* does a state transition even if the VM is already stopped,
1811    current state is forgotten forever */
1812 int vm_stop_force_state(RunState state)
1813 {
1814     if (runstate_is_running()) {
1815         return vm_stop(state);
1816     } else {
1817         runstate_set(state);
1818
1819         bdrv_drain_all();
1820         /* Make sure to return an error if the flush in a previous vm_stop()
1821          * failed. */
1822         return bdrv_flush_all();
1823     }
1824 }
1825
/*
 * Print the list of CPU models to @f using @cpu_fprintf, when the target
 * defines the cpu_list macro; otherwise print nothing.  @optarg is unused.
 */
void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}
1833
/*
 * QMP "query-cpus": build a list with one CpuInfo entry per vCPU, in CPU
 * list order.
 *
 * Each vCPU's state is synchronised with cpu_synchronize_state() before it
 * is read.  Besides the generic fields, a target-specific program counter
 * is filled in for x86, PPC, SPARC, MIPS and TriCore; other targets are
 * reported as CPU_INFO_ARCH_OTHER.  @errp is not written by this function.
 *
 * Returns the head of the newly allocated list (NULL when there are no
 * vCPUs).
 */
CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
        /* Target-specific view of the CPU state, used for the PC below. */
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
#elif defined(TARGET_TRICORE)
        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
        CPUTriCoreState *env = &tricore_cpu->env;
#endif

        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        /* "current" is reported for the first CPU in the list. */
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->arch = CPU_INFO_ARCH_X86;
        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->arch = CPU_INFO_ARCH_PPC;
        info->value->u.ppc.nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->arch = CPU_INFO_ARCH_SPARC;
        info->value->u.q_sparc.pc = env->pc;
        info->value->u.q_sparc.npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->arch = CPU_INFO_ARCH_MIPS;
        info->value->u.q_mips.PC = env->active_tc.PC;
#elif defined(TARGET_TRICORE)
        info->value->arch = CPU_INFO_ARCH_TRICORE;
        info->value->u.tricore.PC = env->PC;
#else
        info->value->arch = CPU_INFO_ARCH_OTHER;
#endif

        /* XXX: waiting for the qapi to support GSList */
        /* Append to the tail so the result keeps CPU list order. */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}
1898
1899 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1900                  bool has_cpu, int64_t cpu_index, Error **errp)
1901 {
1902     FILE *f;
1903     uint32_t l;
1904     CPUState *cpu;
1905     uint8_t buf[1024];
1906     int64_t orig_addr = addr, orig_size = size;
1907
1908     if (!has_cpu) {
1909         cpu_index = 0;
1910     }
1911
1912     cpu = qemu_get_cpu(cpu_index);
1913     if (cpu == NULL) {
1914         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1915                    "a CPU number");
1916         return;
1917     }
1918
1919     f = fopen(filename, "wb");
1920     if (!f) {
1921         error_setg_file_open(errp, errno, filename);
1922         return;
1923     }
1924
1925     while (size != 0) {
1926         l = sizeof(buf);
1927         if (l > size)
1928             l = size;
1929         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1930             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1931                              " specified", orig_addr, orig_size);
1932             goto exit;
1933         }
1934         if (fwrite(buf, 1, l, f) != l) {
1935             error_setg(errp, QERR_IO_ERROR);
1936             goto exit;
1937         }
1938         addr += l;
1939         size -= l;
1940     }
1941
1942 exit:
1943     fclose(f);
1944 }
1945
1946 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1947                   Error **errp)
1948 {
1949     FILE *f;
1950     uint32_t l;
1951     uint8_t buf[1024];
1952
1953     f = fopen(filename, "wb");
1954     if (!f) {
1955         error_setg_file_open(errp, errno, filename);
1956         return;
1957     }
1958
1959     while (size != 0) {
1960         l = sizeof(buf);
1961         if (l > size)
1962             l = size;
1963         cpu_physical_memory_read(addr, buf, l);
1964         if (fwrite(buf, 1, l, f) != l) {
1965             error_setg(errp, QERR_IO_ERROR);
1966             goto exit;
1967         }
1968         addr += l;
1969         size -= l;
1970     }
1971
1972 exit:
1973     fclose(f);
1974 }
1975
/*
 * QMP "inject-nmi": pass the monitor's current CPU index and @errp to
 * nmi_monitor_handle().
 */
void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}
1980
1981 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1982 {
1983     if (!use_icount) {
1984         return;
1985     }
1986
1987     cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
1988                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
1989     if (icount_align_option) {
1990         cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
1991         cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
1992     } else {
1993         cpu_fprintf(f, "Max guest delay     NA\n");
1994         cpu_fprintf(f, "Max guest advance   NA\n");
1995     }
1996 }