cpus.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 /* Needed early for CONFIG_BSD etc. */
  26 #include "config-host.h"
  27
  28 #include "monitor/monitor.h"
  29 #include "qapi/qmp/qerror.h"
  30 #include "qemu/error-report.h"
  31 #include "sysemu/sysemu.h"
  32 #include "exec/gdbstub.h"
  33 #include "sysemu/dma.h"
  34 #include "sysemu/kvm.h"
  35 #include "qmp-commands.h"
  36
  37 #include "qemu/thread.h"
  38 #include "sysemu/cpus.h"
  39 #include "sysemu/qtest.h"
  40 #include "qemu/main-loop.h"
  41 #include "qemu/bitmap.h"
  42 #include "qemu/seqlock.h"
  43 #include "qapi-event.h"
  44 #include "hw/nmi.h"
  45
  46 #ifndef _WIN32
  47 #include "qemu/compatfd.h"
  48 #endif
  49
  50 #ifdef CONFIG_LINUX
  51
  52 #include <sys/prctl.h>
  53
  54 #ifndef PR_MCE_KILL
  55 #define PR_MCE_KILL 33
  56 #endif
  57
  58 #ifndef PR_MCE_KILL_SET
  59 #define PR_MCE_KILL_SET 1
  60 #endif
  61
  62 #ifndef PR_MCE_KILL_EARLY
  63 #define PR_MCE_KILL_EARLY 1
  64 #endif
  65
  66 #endif /* CONFIG_LINUX */
  67
  68 static CPUState *next_cpu;
  69 int64_t max_delay;
  70 int64_t max_advance;
  71
  72 /* vcpu throttling controls */
  73 static QEMUTimer *throttle_timer;
  74 static unsigned int throttle_percentage;
  75
  76 #define CPU_THROTTLE_PCT_MIN 1
  77 #define CPU_THROTTLE_PCT_MAX 99
  78 #define CPU_THROTTLE_TIMESLICE_NS 10000000
  79
  80 bool cpu_is_stopped(CPUState *cpu)
  81 {
  82     return cpu->stopped || !runstate_is_running();
  83 }
  84
  85 static bool cpu_thread_is_idle(CPUState *cpu)
  86 {
  87     if (cpu->stop || cpu->queued_work_first) {
  88         return false;
  89     }
  90     if (cpu_is_stopped(cpu)) {
  91         return true;
  92     }
  93     if (!cpu->halted || cpu_has_work(cpu) ||
  94         kvm_halt_in_kernel()) {
  95         return false;
  96     }
  97     return true;
  98 }
  99
 100 static bool all_cpu_threads_idle(void)
 101 {
 102     CPUState *cpu;
 103
 104     CPU_FOREACH(cpu) {
 105         if (!cpu_thread_is_idle(cpu)) {
 106             return false;
 107         }
 108     }
 109     return true;
 110 }
 111
 112 /***********************************************************/
 113 /* guest cycle counter */
 114
 115 /* Protected by TimersState seqlock */
 116
 117 static bool icount_sleep = true;
 118 static int64_t vm_clock_warp_start = -1;
 119 /* Conversion factor from emulated instructions to virtual clock ticks.  */
 120 static int icount_time_shift;
 121 /* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
 122 #define MAX_ICOUNT_SHIFT 10
 123
 124 static QEMUTimer *icount_rt_timer;
 125 static QEMUTimer *icount_vm_timer;
 126 static QEMUTimer *icount_warp_timer;
 127
 128 typedef struct TimersState {
 129     /* Protected by BQL.  */
 130     int64_t cpu_ticks_prev;
 131     int64_t cpu_ticks_offset;
 132
 133     /* cpu_clock_offset can be read out of BQL, so protect it with
 134      * this lock.
 135      */
 136     QemuSeqLock vm_clock_seqlock;
 137     int64_t cpu_clock_offset;
 138     int32_t cpu_ticks_enabled;
 139     int64_t dummy;
 140
 141     /* Compensate for varying guest execution speed.  */
 142     int64_t qemu_icount_bias;
 143     /* Only written by TCG thread */
 144     int64_t qemu_icount;
 145 } TimersState;
 146
 147 static TimersState timers_state;
 148
 149 int64_t cpu_get_icount_raw(void)
 150 {
 151     int64_t icount;
 152     CPUState *cpu = current_cpu;
 153
 154     icount = timers_state.qemu_icount;
 155     if (cpu) {
 156         if (!cpu->can_do_io) {
 157             fprintf(stderr, "Bad icount read\n");
 158             exit(1);
 159         }
 160         icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
 161     }
 162     return icount;
 163 }
 164
 165 /* Return the virtual CPU time, based on the instruction counter.  */
 166 static int64_t cpu_get_icount_locked(void)
 167 {
 168     int64_t icount = cpu_get_icount_raw();
 169     return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
 170 }
 171
 172 int64_t cpu_get_icount(void)
 173 {
 174     int64_t icount;
 175     unsigned start;
 176
 177     do {
 178         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 179         icount = cpu_get_icount_locked();
 180     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 181
 182     return icount;
 183 }
 184
 185 int64_t cpu_icount_to_ns(int64_t icount)
 186 {
 187     return icount << icount_time_shift;
 188 }
 189
 190 /* return the host CPU cycle counter and handle stop/restart */
 191 /* Caller must hold the BQL */
 192 int64_t cpu_get_ticks(void)
 193 {
 194     int64_t ticks;
 195
 196     if (use_icount) {
 197         return cpu_get_icount();
 198     }
 199
 200     ticks = timers_state.cpu_ticks_offset;
 201     if (timers_state.cpu_ticks_enabled) {
 202         ticks += cpu_get_host_ticks();
 203     }
 204
 205     if (timers_state.cpu_ticks_prev > ticks) {
 206         /* Note: non increasing ticks may happen if the host uses
 207            software suspend */
 208         timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 209         ticks = timers_state.cpu_ticks_prev;
 210     }
 211
 212     timers_state.cpu_ticks_prev = ticks;
 213     return ticks;
 214 }
 215
 216 static int64_t cpu_get_clock_locked(void)
 217 {
 218     int64_t ticks;
 219
 220     ticks = timers_state.cpu_clock_offset;
 221     if (timers_state.cpu_ticks_enabled) {
 222         ticks += get_clock();
 223     }
 224
 225     return ticks;
 226 }
 227
 228 /* return the host CPU monotonic timer and handle stop/restart */
 229 int64_t cpu_get_clock(void)
 230 {
 231     int64_t ti;
 232     unsigned start;
 233
 234     do {
 235         start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 236         ti = cpu_get_clock_locked();
 237     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 238
 239     return ti;
 240 }
 241
 242 /* enable cpu_get_ticks()
 243  * Caller must hold BQL which server as mutex for vm_clock_seqlock.
 244  */
 245 void cpu_enable_ticks(void)
 246 {
 247     /* Here, the really thing protected by seqlock is cpu_clock_offset. */
 248     seqlock_write_lock(&timers_state.vm_clock_seqlock);
 249     if (!timers_state.cpu_ticks_enabled) {
 250         timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 251         timers_state.cpu_clock_offset -= get_clock();
 252         timers_state.cpu_ticks_enabled = 1;
 253     }
 254     seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 255 }
 256
 257 /* disable cpu_get_ticks() : the clock is stopped. You must not call
 258  * cpu_get_ticks() after that.
 259  * Caller must hold BQL which server as mutex for vm_clock_seqlock.
 260  */
 261 void cpu_disable_ticks(void)
 262 {
 263     /* Here, the really thing protected by seqlock is cpu_clock_offset. */
 264     seqlock_write_lock(&timers_state.vm_clock_seqlock);
 265     if (timers_state.cpu_ticks_enabled) {
 266         timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 267         timers_state.cpu_clock_offset = cpu_get_clock_locked();
 268         timers_state.cpu_ticks_enabled = 0;
 269     }
 270     seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 271 }
 272
 273 /* Correlation between real and virtual time is always going to be
 274    fairly approximate, so ignore small variation.
 275    When the guest is idle real and virtual time will be aligned in
 276    the IO wait loop.  */
 277 #define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)
 278
 279 static void icount_adjust(void)
 280 {
 281     int64_t cur_time;
 282     int64_t cur_icount;
 283     int64_t delta;
 284
 285     /* Protected by TimersState mutex.  */
 286     static int64_t last_delta;
 287
 288     /* If the VM is not running, then do nothing.  */
 289     if (!runstate_is_running()) {
 290         return;
 291     }
 292
 293     seqlock_write_lock(&timers_state.vm_clock_seqlock);
 294     cur_time = cpu_get_clock_locked();
 295     cur_icount = cpu_get_icount_locked();
 296
 297     delta = cur_icount - cur_time;
 298     /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 299     if (delta > 0
 300         && last_delta + ICOUNT_WOBBLE < delta * 2
 301         && icount_time_shift > 0) {
 302         /* The guest is getting too far ahead.  Slow time down.  */
 303         icount_time_shift--;
 304     }
 305     if (delta < 0
 306         && last_delta - ICOUNT_WOBBLE > delta * 2
 307         && icount_time_shift < MAX_ICOUNT_SHIFT) {
 308         /* The guest is getting too far behind.  Speed time up.  */
 309         icount_time_shift++;
 310     }
 311     last_delta = delta;
 312     timers_state.qemu_icount_bias = cur_icount
 313                               - (timers_state.qemu_icount << icount_time_shift);
 314     seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 315 }
 316
 317 static void icount_adjust_rt(void *opaque)
 318 {
 319     timer_mod(icount_rt_timer,
 320               qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 321     icount_adjust();
 322 }
 323
 324 static void icount_adjust_vm(void *opaque)
 325 {
 326     timer_mod(icount_vm_timer,
 327                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 328                    get_ticks_per_sec() / 10);
 329     icount_adjust();
 330 }
 331
 332 static int64_t qemu_icount_round(int64_t count)
 333 {
 334     return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
 335 }
 336
 337 static void icount_warp_rt(void *opaque)
 338 {
 339     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 340      * changes from -1 to another value, so the race here is okay.
 341      */
 342     if (atomic_read(&vm_clock_warp_start) == -1) {
 343         return;
 344     }
 345
 346     seqlock_write_lock(&timers_state.vm_clock_seqlock);
 347     if (runstate_is_running()) {
 348         int64_t clock = cpu_get_clock_locked();
 349         int64_t warp_delta;
 350
 351         warp_delta = clock - vm_clock_warp_start;
 352         if (use_icount == 2) {
 353             /*
 354              * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 355              * far ahead of real time.
 356              */
 357             int64_t cur_icount = cpu_get_icount_locked();
 358             int64_t delta = clock - cur_icount;
 359             warp_delta = MIN(warp_delta, delta);
 360         }
 361         timers_state.qemu_icount_bias += warp_delta;
 362     }
 363     vm_clock_warp_start = -1;
 364     seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 365
 366     if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 367         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 368     }
 369 }
 370
 371 void qtest_clock_warp(int64_t dest)
 372 {
 373     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 374     AioContext *aio_context;
 375     assert(qtest_enabled());
 376     aio_context = qemu_get_aio_context();
 377     while (clock < dest) {
 378         int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 379         int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 380
 381         seqlock_write_lock(&timers_state.vm_clock_seqlock);
 382         timers_state.qemu_icount_bias += warp;
 383         seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 384
 385         qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 386         timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 387         clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 388     }
 389     qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 390 }
 391
 392 void qemu_clock_warp(QEMUClockType type)
 393 {
 394     int64_t clock;
 395     int64_t deadline;
 396
 397     /*
 398      * There are too many global variables to make the "warp" behavior
 399      * applicable to other clocks.  But a clock argument removes the
 400      * need for if statements all over the place.
 401      */
 402     if (type != QEMU_CLOCK_VIRTUAL || !use_icount) {
 403         return;
 404     }
 405
 406     if (icount_sleep) {
 407         /*
 408          * If the CPUs have been sleeping, advance QEMU_CLOCK_VIRTUAL timer now.
 409          * This ensures that the deadline for the timer is computed correctly
 410          * below.
 411          * This also makes sure that the insn counter is synchronized before
 412          * the CPU starts running, in case the CPU is woken by an event other
 413          * than the earliest QEMU_CLOCK_VIRTUAL timer.
 414          */
 415         icount_warp_rt(NULL);
 416         timer_del(icount_warp_timer);
 417     }
 418     if (!all_cpu_threads_idle()) {
 419         return;
 420     }
 421
 422     if (qtest_enabled()) {
 423         /* When testing, qtest commands advance icount.  */
 424         return;
 425     }
 426
 427     /* We want to use the earliest deadline from ALL vm_clocks */
 428     clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 429     deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 430     if (deadline < 0) {
 431         static bool notified;
 432         if (!icount_sleep && !notified) {
 433             error_report("WARNING: icount sleep disabled and no active timers");
 434             notified = true;
 435         }
 436         return;
 437     }
 438
 439     if (deadline > 0) {
 440         /*
 441          * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 442          * sleep.  Otherwise, the CPU might be waiting for a future timer
 443          * interrupt to wake it up, but the interrupt never comes because
 444          * the vCPU isn't running any insns and thus doesn't advance the
 445          * QEMU_CLOCK_VIRTUAL.
 446          */
 447         if (!icount_sleep) {
 448             /*
 449              * We never let VCPUs sleep in no sleep icount mode.
 450              * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 451              * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 452              * It is useful when we want a deterministic execution time,
 453              * isolated from host latencies.
 454              */
 455             seqlock_write_lock(&timers_state.vm_clock_seqlock);
 456             timers_state.qemu_icount_bias += deadline;
 457             seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 458             qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 459         } else {
 460             /*
 461              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 462              * "real" time, (related to the time left until the next event) has
 463              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 464              * This avoids that the warps are visible externally; for example,
 465              * you will not be sending network packets continuously instead of
 466              * every 100ms.
 467              */
 468             seqlock_write_lock(&timers_state.vm_clock_seqlock);
 469             if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
 470                 vm_clock_warp_start = clock;
 471             }
 472             seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 473             timer_mod_anticipate(icount_warp_timer, clock + deadline);
 474         }
 475     } else if (deadline == 0) {
 476         qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 477     }
 478 }
 479
 480 static bool icount_state_needed(void *opaque)
 481 {
 482     return use_icount;
 483 }
 484
 485 /*
 486  * This is a subsection for icount migration.
 487  */
 488 static const VMStateDescription icount_vmstate_timers = {
 489     .name = "timer/icount",
 490     .version_id = 1,
 491     .minimum_version_id = 1,
 492     .needed = icount_state_needed,
 493     .fields = (VMStateField[]) {
 494         VMSTATE_INT64(qemu_icount_bias, TimersState),
 495         VMSTATE_INT64(qemu_icount, TimersState),
 496         VMSTATE_END_OF_LIST()
 497     }
 498 };
 499
 500 static const VMStateDescription vmstate_timers = {
 501     .name = "timer",
 502     .version_id = 2,
 503     .minimum_version_id = 1,
 504     .fields = (VMStateField[]) {
 505         VMSTATE_INT64(cpu_ticks_offset, TimersState),
 506         VMSTATE_INT64(dummy, TimersState),
 507         VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
 508         VMSTATE_END_OF_LIST()
 509     },
 510     .subsections = (const VMStateDescription*[]) {
 511         &icount_vmstate_timers,
 512         NULL
 513     }
 514 };
 515
 516 static void cpu_throttle_thread(void *opaque)
 517 {
 518     CPUState *cpu = opaque;
 519     double pct;
 520     double throttle_ratio;
 521     long sleeptime_ns;
 522
 523     if (!cpu_throttle_get_percentage()) {
 524         return;
 525     }
 526
 527     pct = (double)cpu_throttle_get_percentage()/100;
 528     throttle_ratio = pct / (1 - pct);
 529     sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
 530
 531     qemu_mutex_unlock_iothread();
 532     atomic_set(&cpu->throttle_thread_scheduled, 0);
 533     g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
 534     qemu_mutex_lock_iothread();
 535 }
 536
 537 static void cpu_throttle_timer_tick(void *opaque)
 538 {
 539     CPUState *cpu;
 540     double pct;
 541
 542     /* Stop the timer if needed */
 543     if (!cpu_throttle_get_percentage()) {
 544         return;
 545     }
 546     CPU_FOREACH(cpu) {
 547         if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 548             async_run_on_cpu(cpu, cpu_throttle_thread, cpu);
 549         }
 550     }
 551
 552     pct = (double)cpu_throttle_get_percentage()/100;
 553     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 554                                    CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 555 }
 556
 557 void cpu_throttle_set(int new_throttle_pct)
 558 {
 559     /* Ensure throttle percentage is within valid range */
 560     new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 561     new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 562
 563     atomic_set(&throttle_percentage, new_throttle_pct);
 564
 565     timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 566                                        CPU_THROTTLE_TIMESLICE_NS);
 567 }
 568
 569 void cpu_throttle_stop(void)
 570 {
 571     atomic_set(&throttle_percentage, 0);
 572 }
 573
 574 bool cpu_throttle_active(void)
 575 {
 576     return (cpu_throttle_get_percentage() != 0);
 577 }
 578
 579 int cpu_throttle_get_percentage(void)
 580 {
 581     return atomic_read(&throttle_percentage);
 582 }
 583
 584 void cpu_ticks_init(void)
 585 {
 586     seqlock_init(&timers_state.vm_clock_seqlock, NULL);
 587     vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 588     throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 589                                            cpu_throttle_timer_tick, NULL);
 590 }
 591
 592 void configure_icount(QemuOpts *opts, Error **errp)
 593 {
 594     const char *option;
 595     char *rem_str = NULL;
 596
 597     option = qemu_opt_get(opts, "shift");
 598     if (!option) {
 599         if (qemu_opt_get(opts, "align") != NULL) {
 600             error_setg(errp, "Please specify shift option when using align");
 601         }
 602         return;
 603     }
 604
 605     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 606     if (icount_sleep) {
 607         icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 608                                          icount_warp_rt, NULL);
 609     }
 610
 611     icount_align_option = qemu_opt_get_bool(opts, "align", false);
 612
 613     if (icount_align_option && !icount_sleep) {
 614         error_setg(errp, "align=on and sleep=no are incompatible");
 615     }
 616     if (strcmp(option, "auto") != 0) {
 617         errno = 0;
 618         icount_time_shift = strtol(option, &rem_str, 0);
 619         if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 620             error_setg(errp, "icount: Invalid shift value");
 621         }
 622         use_icount = 1;
 623         return;
 624     } else if (icount_align_option) {
 625         error_setg(errp, "shift=auto and align=on are incompatible");
 626     } else if (!icount_sleep) {
 627         error_setg(errp, "shift=auto and sleep=no are incompatible");
 628     }
 629
 630     use_icount = 2;
 631
 632     /* 125MIPS seems a reasonable initial guess at the guest speed.
 633        It will be corrected fairly quickly anyway.  */
 634     icount_time_shift = 3;
 635
 636     /* Have both realtime and virtual time triggers for speed adjustment.
 637        The realtime trigger catches emulated time passing too slowly,
 638        the virtual time trigger catches emulated time passing too fast.
 639        Realtime triggers occur even when idle, so use them less frequently
 640        than VM triggers.  */
 641     icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 642                                    icount_adjust_rt, NULL);
 643     timer_mod(icount_rt_timer,
 644                    qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 645     icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 646                                         icount_adjust_vm, NULL);
 647     timer_mod(icount_vm_timer,
 648                    qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 649                    get_ticks_per_sec() / 10);
 650 }
 651
 652 /***********************************************************/
 653 void hw_error(const char *fmt, ...)
 654 {
 655     va_list ap;
 656     CPUState *cpu;
 657
 658     va_start(ap, fmt);
 659     fprintf(stderr, "qemu: hardware error: ");
 660     vfprintf(stderr, fmt, ap);
 661     fprintf(stderr, "\n");
 662     CPU_FOREACH(cpu) {
 663         fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
 664         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
 665     }
 666     va_end(ap);
 667     abort();
 668 }
 669
 670 void cpu_synchronize_all_states(void)
 671 {
 672     CPUState *cpu;
 673
 674     CPU_FOREACH(cpu) {
 675         cpu_synchronize_state(cpu);
 676     }
 677 }
 678
 679 void cpu_synchronize_all_post_reset(void)
 680 {
 681     CPUState *cpu;
 682
 683     CPU_FOREACH(cpu) {
 684         cpu_synchronize_post_reset(cpu);
 685     }
 686 }
 687
 688 void cpu_synchronize_all_post_init(void)
 689 {
 690     CPUState *cpu;
 691
 692     CPU_FOREACH(cpu) {
 693         cpu_synchronize_post_init(cpu);
 694     }
 695 }
 696
 697 void cpu_clean_all_dirty(void)
 698 {
 699     CPUState *cpu;
 700
 701     CPU_FOREACH(cpu) {
 702         cpu_clean_state(cpu);
 703     }
 704 }
 705
 706 static int do_vm_stop(RunState state)
 707 {
 708     int ret = 0;
 709
 710     if (runstate_is_running()) {
 711         cpu_disable_ticks();
 712         pause_all_vcpus();
 713         runstate_set(state);
 714         vm_state_notify(0, state);
 715         qapi_event_send_stop(&error_abort);
 716     }
 717
 718     bdrv_drain_all();
 719     ret = bdrv_flush_all();
 720
 721     return ret;
 722 }
 723
 724 static bool cpu_can_run(CPUState *cpu)
 725 {
 726     if (cpu->stop) {
 727         return false;
 728     }
 729     if (cpu_is_stopped(cpu)) {
 730         return false;
 731     }
 732     return true;
 733 }
 734
 735 static void cpu_handle_guest_debug(CPUState *cpu)
 736 {
 737     gdb_set_stop_cpu(cpu);
 738     qemu_system_debug_request();
 739     cpu->stopped = true;
 740 }
 741
 742 #ifdef CONFIG_LINUX
 743 static void sigbus_reraise(void)
 744 {
 745     sigset_t set;
 746     struct sigaction action;
 747
 748     memset(&action, 0, sizeof(action));
 749     action.sa_handler = SIG_DFL;
 750     if (!sigaction(SIGBUS, &action, NULL)) {
 751         raise(SIGBUS);
 752         sigemptyset(&set);
 753         sigaddset(&set, SIGBUS);
 754         sigprocmask(SIG_UNBLOCK, &set, NULL);
 755     }
 756     perror("Failed to re-raise SIGBUS!\n");
 757     abort();
 758 }
 759
 760 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
 761                            void *ctx)
 762 {
 763     if (kvm_on_sigbus(siginfo->ssi_code,
 764                       (void *)(intptr_t)siginfo->ssi_addr)) {
 765         sigbus_reraise();
 766     }
 767 }
 768
 769 static void qemu_init_sigbus(void)
 770 {
 771     struct sigaction action;
 772
 773     memset(&action, 0, sizeof(action));
 774     action.sa_flags = SA_SIGINFO;
 775     action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
 776     sigaction(SIGBUS, &action, NULL);
 777
 778     prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
 779 }
 780
 781 static void qemu_kvm_eat_signals(CPUState *cpu)
 782 {
 783     struct timespec ts = { 0, 0 };
 784     siginfo_t siginfo;
 785     sigset_t waitset;
 786     sigset_t chkset;
 787     int r;
 788
 789     sigemptyset(&waitset);
 790     sigaddset(&waitset, SIG_IPI);
 791     sigaddset(&waitset, SIGBUS);
 792
 793     do {
 794         r = sigtimedwait(&waitset, &siginfo, &ts);
 795         if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
 796             perror("sigtimedwait");
 797             exit(1);
 798         }
 799
 800         switch (r) {
 801         case SIGBUS:
 802             if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
 803                 sigbus_reraise();
 804             }
 805             break;
 806         default:
 807             break;
 808         }
 809
 810         r = sigpending(&chkset);
 811         if (r == -1) {
 812             perror("sigpending");
 813             exit(1);
 814         }
 815     } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
 816 }
 817
 818 #else /* !CONFIG_LINUX */
 819
 820 static void qemu_init_sigbus(void)
 821 {
 822 }
 823
 824 static void qemu_kvm_eat_signals(CPUState *cpu)
 825 {
 826 }
 827 #endif /* !CONFIG_LINUX */
 828
 829 #ifndef _WIN32
 830 static void dummy_signal(int sig)
 831 {
 832 }
 833
 834 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
 835 {
 836     int r;
 837     sigset_t set;
 838     struct sigaction sigact;
 839
 840     memset(&sigact, 0, sizeof(sigact));
 841     sigact.sa_handler = dummy_signal;
 842     sigaction(SIG_IPI, &sigact, NULL);
 843
 844     pthread_sigmask(SIG_BLOCK, NULL, &set);
 845     sigdelset(&set, SIG_IPI);
 846     sigdelset(&set, SIGBUS);
 847     r = kvm_set_signal_mask(cpu, &set);
 848     if (r) {
 849         fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
 850         exit(1);
 851     }
 852 }
 853
 854 #else /* _WIN32 */
 855 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
 856 {
 857     abort();
 858 }
 859 #endif /* _WIN32 */
 860
 861 static QemuMutex qemu_global_mutex;
 862 static QemuCond qemu_io_proceeded_cond;
 863 static unsigned iothread_requesting_mutex;
 864
 865 static QemuThread io_thread;
 866
 867 /* cpu creation */
 868 static QemuCond qemu_cpu_cond;
 869 /* system init */
 870 static QemuCond qemu_pause_cond;
 871 static QemuCond qemu_work_cond;
 872
 873 void qemu_init_cpu_loop(void)
 874 {
 875     qemu_init_sigbus();
 876     qemu_cond_init(&qemu_cpu_cond);
 877     qemu_cond_init(&qemu_pause_cond);
 878     qemu_cond_init(&qemu_work_cond);
 879     qemu_cond_init(&qemu_io_proceeded_cond);
 880     qemu_mutex_init(&qemu_global_mutex);
 881
 882     qemu_thread_get_self(&io_thread);
 883 }
 884
 885 void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
 886 {
 887     struct qemu_work_item wi;
 888
 889     if (qemu_cpu_is_self(cpu)) {
 890         func(data);
 891         return;
 892     }
 893
 894     wi.func = func;
 895     wi.data = data;
 896     wi.free = false;
 897
 898     qemu_mutex_lock(&cpu->work_mutex);
 899     if (cpu->queued_work_first == NULL) {
 900         cpu->queued_work_first = &wi;
 901     } else {
 902         cpu->queued_work_last->next = &wi;
 903     }
 904     cpu->queued_work_last = &wi;
 905     wi.next = NULL;
 906     wi.done = false;
 907     qemu_mutex_unlock(&cpu->work_mutex);
 908
 909     qemu_cpu_kick(cpu);
 910     while (!atomic_mb_read(&wi.done)) {
 911         CPUState *self_cpu = current_cpu;
 912
 913         qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
 914         current_cpu = self_cpu;
 915     }
 916 }
 917
 918 void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
 919 {
 920     struct qemu_work_item *wi;
 921
 922     if (qemu_cpu_is_self(cpu)) {
 923         func(data);
 924         return;
 925     }
 926
 927     wi = g_malloc0(sizeof(struct qemu_work_item));
 928     wi->func = func;
 929     wi->data = data;
 930     wi->free = true;
 931
 932     qemu_mutex_lock(&cpu->work_mutex);
 933     if (cpu->queued_work_first == NULL) {
 934         cpu->queued_work_first = wi;
 935     } else {
 936         cpu->queued_work_last->next = wi;
 937     }
 938     cpu->queued_work_last = wi;
 939     wi->next = NULL;
 940     wi->done = false;
 941     qemu_mutex_unlock(&cpu->work_mutex);
 942
 943     qemu_cpu_kick(cpu);
 944 }
 945
 946 static void flush_queued_work(CPUState *cpu)
 947 {
 948     struct qemu_work_item *wi;
 949
 950     if (cpu->queued_work_first == NULL) {
 951         return;
 952     }
 953
 954     qemu_mutex_lock(&cpu->work_mutex);
 955     while (cpu->queued_work_first != NULL) {
 956         wi = cpu->queued_work_first;
 957         cpu->queued_work_first = wi->next;
 958         if (!cpu->queued_work_first) {
 959             cpu->queued_work_last = NULL;
 960         }
 961         qemu_mutex_unlock(&cpu->work_mutex);
 962         wi->func(wi->data);
 963         qemu_mutex_lock(&cpu->work_mutex);
 964         if (wi->free) {
 965             g_free(wi);
 966         } else {
 967             atomic_mb_set(&wi->done, true);
 968         }
 969     }
 970     qemu_mutex_unlock(&cpu->work_mutex);
 971     qemu_cond_broadcast(&qemu_work_cond);
 972 }
 973
 974 static void qemu_wait_io_event_common(CPUState *cpu)
 975 {
 976     if (cpu->stop) {
 977         cpu->stop = false;
 978         cpu->stopped = true;
 979         qemu_cond_signal(&qemu_pause_cond);
 980     }
 981     flush_queued_work(cpu);
 982     cpu->thread_kicked = false;
 983 }
 984
 985 static void qemu_tcg_wait_io_event(CPUState *cpu)
 986 {
 987     while (all_cpu_threads_idle()) {
 988        /* Start accounting real time to the virtual clock if the CPUs
 989           are idle.  */
 990         qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
 991         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
 992     }
 993
 994     while (iothread_requesting_mutex) {
 995         qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
 996     }
 997
 998     CPU_FOREACH(cpu) {
 999         qemu_wait_io_event_common(cpu);
1000     }
1001 }
1002
1003 static void qemu_kvm_wait_io_event(CPUState *cpu)
1004 {
1005     while (cpu_thread_is_idle(cpu)) {
1006         qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1007     }
1008
1009     qemu_kvm_eat_signals(cpu);
1010     qemu_wait_io_event_common(cpu);
1011 }
1012
1013 static void *qemu_kvm_cpu_thread_fn(void *arg)
1014 {
1015     CPUState *cpu = arg;
1016     int r;
1017
1018     rcu_register_thread();
1019
1020     qemu_mutex_lock_iothread();
1021     qemu_thread_get_self(cpu->thread);
1022     cpu->thread_id = qemu_get_thread_id();
1023     cpu->can_do_io = 1;
1024     current_cpu = cpu;
1025
1026     r = kvm_init_vcpu(cpu);
1027     if (r < 0) {
1028         fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
1029         exit(1);
1030     }
1031
1032     qemu_kvm_init_cpu_signals(cpu);
1033
1034     /* signal CPU creation */
1035     cpu->created = true;
1036     qemu_cond_signal(&qemu_cpu_cond);
1037
1038     while (1) {
1039         if (cpu_can_run(cpu)) {
1040             r = kvm_cpu_exec(cpu);
1041             if (r == EXCP_DEBUG) {
1042                 cpu_handle_guest_debug(cpu);
1043             }
1044         }
1045         qemu_kvm_wait_io_event(cpu);
1046     }
1047
1048     return NULL;
1049 }
1050
1051 static void *qemu_dummy_cpu_thread_fn(void *arg)
1052 {
1053 #ifdef _WIN32
1054     fprintf(stderr, "qtest is not supported under Windows\n");
1055     exit(1);
1056 #else
1057     CPUState *cpu = arg;
1058     sigset_t waitset;
1059     int r;
1060
1061     rcu_register_thread();
1062
1063     qemu_mutex_lock_iothread();
1064     qemu_thread_get_self(cpu->thread);
1065     cpu->thread_id = qemu_get_thread_id();
1066     cpu->can_do_io = 1;
1067
1068     sigemptyset(&waitset);
1069     sigaddset(&waitset, SIG_IPI);
1070
1071     /* signal CPU creation */
1072     cpu->created = true;
1073     qemu_cond_signal(&qemu_cpu_cond);
1074
1075     current_cpu = cpu;
1076     while (1) {
1077         current_cpu = NULL;
1078         qemu_mutex_unlock_iothread();
1079         do {
1080             int sig;
1081             r = sigwait(&waitset, &sig);
1082         } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1083         if (r == -1) {
1084             perror("sigwait");
1085             exit(1);
1086         }
1087         qemu_mutex_lock_iothread();
1088         current_cpu = cpu;
1089         qemu_wait_io_event_common(cpu);
1090     }
1091
1092     return NULL;
1093 #endif
1094 }
1095
1096 static void tcg_exec_all(void);
1097
1098 static void *qemu_tcg_cpu_thread_fn(void *arg)
1099 {
1100     CPUState *cpu = arg;
1101
1102     rcu_register_thread();
1103
1104     qemu_mutex_lock_iothread();
1105     qemu_thread_get_self(cpu->thread);
1106
1107     CPU_FOREACH(cpu) {
1108         cpu->thread_id = qemu_get_thread_id();
1109         cpu->created = true;
1110         cpu->can_do_io = 1;
1111     }
1112     qemu_cond_signal(&qemu_cpu_cond);
1113
1114     /* wait for initial kick-off after machine start */
1115     while (first_cpu->stopped) {
1116         qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1117
1118         /* process any pending work */
1119         CPU_FOREACH(cpu) {
1120             qemu_wait_io_event_common(cpu);
1121         }
1122     }
1123
1124     /* process any pending work */
1125     atomic_mb_set(&exit_request, 1);
1126
1127     while (1) {
1128         tcg_exec_all();
1129
1130         if (use_icount) {
1131             int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1132
1133             if (deadline == 0) {
1134                 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1135             }
1136         }
1137         qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
1138     }
1139
1140     return NULL;
1141 }
1142
1143 static void qemu_cpu_kick_thread(CPUState *cpu)
1144 {
1145 #ifndef _WIN32
1146     int err;
1147
1148     if (cpu->thread_kicked) {
1149         return;
1150     }
1151     cpu->thread_kicked = true;
1152     err = pthread_kill(cpu->thread->thread, SIG_IPI);
1153     if (err) {
1154         fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1155         exit(1);
1156     }
1157 #else /* _WIN32 */
1158     abort();
1159 #endif
1160 }
1161
1162 static void qemu_cpu_kick_no_halt(void)
1163 {
1164     CPUState *cpu;
1165     /* Ensure whatever caused the exit has reached the CPU threads before
1166      * writing exit_request.
1167      */
1168     atomic_mb_set(&exit_request, 1);
1169     cpu = atomic_mb_read(&tcg_current_cpu);
1170     if (cpu) {
1171         cpu_exit(cpu);
1172     }
1173 }
1174
1175 void qemu_cpu_kick(CPUState *cpu)
1176 {
1177     qemu_cond_broadcast(cpu->halt_cond);
1178     if (tcg_enabled()) {
1179         qemu_cpu_kick_no_halt();
1180     } else {
1181         qemu_cpu_kick_thread(cpu);
1182     }
1183 }
1184
/*
 * Kick the thread of the calling CPU.  current_cpu must be set; this is
 * asserted.
 */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1190
1191 bool qemu_cpu_is_self(CPUState *cpu)
1192 {
1193     return qemu_thread_is_self(cpu->thread);
1194 }
1195
1196 bool qemu_in_vcpu_thread(void)
1197 {
1198     return current_cpu && qemu_cpu_is_self(current_cpu);
1199 }
1200
/*
 * Per-thread flag: set by qemu_mutex_lock_iothread() and cleared by
 * qemu_mutex_unlock_iothread().
 */
static __thread bool iothread_locked = false;

/* Return the calling thread's iothread_locked flag. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1207
1208 void qemu_mutex_lock_iothread(void)
1209 {
1210     atomic_inc(&iothread_requesting_mutex);
1211     /* In the simple case there is no need to bump the VCPU thread out of
1212      * TCG code execution.
1213      */
1214     if (!tcg_enabled() || qemu_in_vcpu_thread() ||
1215         !first_cpu || !first_cpu->created) {
1216         qemu_mutex_lock(&qemu_global_mutex);
1217         atomic_dec(&iothread_requesting_mutex);
1218     } else {
1219         if (qemu_mutex_trylock(&qemu_global_mutex)) {
1220             qemu_cpu_kick_no_halt();
1221             qemu_mutex_lock(&qemu_global_mutex);
1222         }
1223         atomic_dec(&iothread_requesting_mutex);
1224         qemu_cond_broadcast(&qemu_io_proceeded_cond);
1225     }
1226     iothread_locked = true;
1227 }
1228
/*
 * Release qemu_global_mutex.  The calling thread's iothread_locked flag is
 * cleared before the mutex is dropped.
 */
void qemu_mutex_unlock_iothread(void)
{
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1234
1235 static int all_vcpus_paused(void)
1236 {
1237     CPUState *cpu;
1238
1239     CPU_FOREACH(cpu) {
1240         if (!cpu->stopped) {
1241             return 0;
1242         }
1243     }
1244
1245     return 1;
1246 }
1247
1248 void pause_all_vcpus(void)
1249 {
1250     CPUState *cpu;
1251
1252     qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1253     CPU_FOREACH(cpu) {
1254         cpu->stop = true;
1255         qemu_cpu_kick(cpu);
1256     }
1257
1258     if (qemu_in_vcpu_thread()) {
1259         cpu_stop_current();
1260         if (!kvm_enabled()) {
1261             CPU_FOREACH(cpu) {
1262                 cpu->stop = false;
1263                 cpu->stopped = true;
1264             }
1265             return;
1266         }
1267     }
1268
1269     while (!all_vcpus_paused()) {
1270         qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1271         CPU_FOREACH(cpu) {
1272             qemu_cpu_kick(cpu);
1273         }
1274     }
1275 }
1276
/* Clear the stop and stopped flags of @cpu and kick it. */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1283
/* Re-enable QEMU_CLOCK_VIRTUAL and resume every CPU. */
void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}
1293
1294 /* For temporary buffers for forming a name */
1295 #define VCPU_THREAD_NAME_SIZE 16
1296
1297 static void qemu_tcg_init_vcpu(CPUState *cpu)
1298 {
1299     char thread_name[VCPU_THREAD_NAME_SIZE];
1300     static QemuCond *tcg_halt_cond;
1301     static QemuThread *tcg_cpu_thread;
1302
1303     tcg_cpu_address_space_init(cpu, cpu->as);
1304
1305     /* share a single thread for all cpus with TCG */
1306     if (!tcg_cpu_thread) {
1307         cpu->thread = g_malloc0(sizeof(QemuThread));
1308         cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1309         qemu_cond_init(cpu->halt_cond);
1310         tcg_halt_cond = cpu->halt_cond;
1311         snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1312                  cpu->cpu_index);
1313         qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1314                            cpu, QEMU_THREAD_JOINABLE);
1315 #ifdef _WIN32
1316         cpu->hThread = qemu_thread_get_handle(cpu->thread);
1317 #endif
1318         while (!cpu->created) {
1319             qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1320         }
1321         tcg_cpu_thread = cpu->thread;
1322     } else {
1323         cpu->thread = tcg_cpu_thread;
1324         cpu->halt_cond = tcg_halt_cond;
1325     }
1326 }
1327
1328 static void qemu_kvm_start_vcpu(CPUState *cpu)
1329 {
1330     char thread_name[VCPU_THREAD_NAME_SIZE];
1331
1332     cpu->thread = g_malloc0(sizeof(QemuThread));
1333     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1334     qemu_cond_init(cpu->halt_cond);
1335     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1336              cpu->cpu_index);
1337     qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1338                        cpu, QEMU_THREAD_JOINABLE);
1339     while (!cpu->created) {
1340         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1341     }
1342 }
1343
1344 static void qemu_dummy_start_vcpu(CPUState *cpu)
1345 {
1346     char thread_name[VCPU_THREAD_NAME_SIZE];
1347
1348     cpu->thread = g_malloc0(sizeof(QemuThread));
1349     cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1350     qemu_cond_init(cpu->halt_cond);
1351     snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1352              cpu->cpu_index);
1353     qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1354                        QEMU_THREAD_JOINABLE);
1355     while (!cpu->created) {
1356         qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1357     }
1358 }
1359
1360 void qemu_init_vcpu(CPUState *cpu)
1361 {
1362     cpu->nr_cores = smp_cores;
1363     cpu->nr_threads = smp_threads;
1364     cpu->stopped = true;
1365     if (kvm_enabled()) {
1366         qemu_kvm_start_vcpu(cpu);
1367     } else if (tcg_enabled()) {
1368         qemu_tcg_init_vcpu(cpu);
1369     } else {
1370         qemu_dummy_start_vcpu(cpu);
1371     }
1372 }
1373
1374 void cpu_stop_current(void)
1375 {
1376     if (current_cpu) {
1377         current_cpu->stop = false;
1378         current_cpu->stopped = true;
1379         cpu_exit(current_cpu);
1380         qemu_cond_signal(&qemu_pause_cond);
1381     }
1382 }
1383
1384 int vm_stop(RunState state)
1385 {
1386     if (qemu_in_vcpu_thread()) {
1387         qemu_system_vmstop_request_prepare();
1388         qemu_system_vmstop_request(state);
1389         /*
1390          * FIXME: should not return to device code in case
1391          * vm_stop() has been requested.
1392          */
1393         cpu_stop_current();
1394         return 0;
1395     }
1396
1397     return do_vm_stop(state);
1398 }
1399
1400 /* does a state transition even if the VM is already stopped,
1401    current state is forgotten forever */
1402 int vm_stop_force_state(RunState state)
1403 {
1404     if (runstate_is_running()) {
1405         return vm_stop(state);
1406     } else {
1407         runstate_set(state);
1408         /* Make sure to return an error if the flush in a previous vm_stop()
1409          * failed. */
1410         return bdrv_flush_all();
1411     }
1412 }
1413
1414 static int tcg_cpu_exec(CPUState *cpu)
1415 {
1416     int ret;
1417 #ifdef CONFIG_PROFILER
1418     int64_t ti;
1419 #endif
1420
1421 #ifdef CONFIG_PROFILER
1422     ti = profile_getclock();
1423 #endif
1424     if (use_icount) {
1425         int64_t count;
1426         int64_t deadline;
1427         int decr;
1428         timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1429                                     + cpu->icount_extra);
1430         cpu->icount_decr.u16.low = 0;
1431         cpu->icount_extra = 0;
1432         deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1433
1434         /* Maintain prior (possibly buggy) behaviour where if no deadline
1435          * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1436          * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1437          * nanoseconds.
1438          */
1439         if ((deadline < 0) || (deadline > INT32_MAX)) {
1440             deadline = INT32_MAX;
1441         }
1442
1443         count = qemu_icount_round(deadline);
1444         timers_state.qemu_icount += count;
1445         decr = (count > 0xffff) ? 0xffff : count;
1446         count -= decr;
1447         cpu->icount_decr.u16.low = decr;
1448         cpu->icount_extra = count;
1449     }
1450     ret = cpu_exec(cpu);
1451 #ifdef CONFIG_PROFILER
1452     tcg_time += profile_getclock() - ti;
1453 #endif
1454     if (use_icount) {
1455         /* Fold pending instructions back into the
1456            instruction counter, and clear the interrupt flag.  */
1457         timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1458                         + cpu->icount_extra);
1459         cpu->icount_decr.u32 = 0;
1460         cpu->icount_extra = 0;
1461     }
1462     return ret;
1463 }
1464
1465 static void tcg_exec_all(void)
1466 {
1467     int r;
1468
1469     /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1470     qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
1471
1472     if (next_cpu == NULL) {
1473         next_cpu = first_cpu;
1474     }
1475     for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) {
1476         CPUState *cpu = next_cpu;
1477
1478         qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1479                           (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1480
1481         if (cpu_can_run(cpu)) {
1482             r = tcg_cpu_exec(cpu);
1483             if (r == EXCP_DEBUG) {
1484                 cpu_handle_guest_debug(cpu);
1485                 break;
1486             }
1487         } else if (cpu->stop || cpu->stopped) {
1488             break;
1489         }
1490     }
1491
1492     /* Pairs with smp_wmb in qemu_cpu_kick.  */
1493     atomic_mb_set(&exit_request, 0);
1494 }
1495
/*
 * Print the list of CPU models to @f through @cpu_fprintf when the target
 * defines cpu_list; otherwise do nothing.  @optarg is not used here.
 */
void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}
1503
1504 CpuInfoList *qmp_query_cpus(Error **errp)
1505 {
1506     CpuInfoList *head = NULL, *cur_item = NULL;
1507     CPUState *cpu;
1508
1509     CPU_FOREACH(cpu) {
1510         CpuInfoList *info;
1511 #if defined(TARGET_I386)
1512         X86CPU *x86_cpu = X86_CPU(cpu);
1513         CPUX86State *env = &x86_cpu->env;
1514 #elif defined(TARGET_PPC)
1515         PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1516         CPUPPCState *env = &ppc_cpu->env;
1517 #elif defined(TARGET_SPARC)
1518         SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1519         CPUSPARCState *env = &sparc_cpu->env;
1520 #elif defined(TARGET_MIPS)
1521         MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1522         CPUMIPSState *env = &mips_cpu->env;
1523 #elif defined(TARGET_TRICORE)
1524         TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1525         CPUTriCoreState *env = &tricore_cpu->env;
1526 #endif
1527
1528         cpu_synchronize_state(cpu);
1529
1530         info = g_malloc0(sizeof(*info));
1531         info->value = g_malloc0(sizeof(*info->value));
1532         info->value->CPU = cpu->cpu_index;
1533         info->value->current = (cpu == first_cpu);
1534         info->value->halted = cpu->halted;
1535         info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1536         info->value->thread_id = cpu->thread_id;
1537 #if defined(TARGET_I386)
1538         info->value->has_pc = true;
1539         info->value->pc = env->eip + env->segs[R_CS].base;
1540 #elif defined(TARGET_PPC)
1541         info->value->has_nip = true;
1542         info->value->nip = env->nip;
1543 #elif defined(TARGET_SPARC)
1544         info->value->has_pc = true;
1545         info->value->pc = env->pc;
1546         info->value->has_npc = true;
1547         info->value->npc = env->npc;
1548 #elif defined(TARGET_MIPS)
1549         info->value->has_PC = true;
1550         info->value->PC = env->active_tc.PC;
1551 #elif defined(TARGET_TRICORE)
1552         info->value->has_PC = true;
1553         info->value->PC = env->PC;
1554 #endif
1555
1556         /* XXX: waiting for the qapi to support GSList */
1557         if (!cur_item) {
1558             head = cur_item = info;
1559         } else {
1560             cur_item->next = info;
1561             cur_item = info;
1562         }
1563     }
1564
1565     return head;
1566 }
1567
1568 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1569                  bool has_cpu, int64_t cpu_index, Error **errp)
1570 {
1571     FILE *f;
1572     uint32_t l;
1573     CPUState *cpu;
1574     uint8_t buf[1024];
1575     int64_t orig_addr = addr, orig_size = size;
1576
1577     if (!has_cpu) {
1578         cpu_index = 0;
1579     }
1580
1581     cpu = qemu_get_cpu(cpu_index);
1582     if (cpu == NULL) {
1583         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1584                    "a CPU number");
1585         return;
1586     }
1587
1588     f = fopen(filename, "wb");
1589     if (!f) {
1590         error_setg_file_open(errp, errno, filename);
1591         return;
1592     }
1593
1594     while (size != 0) {
1595         l = sizeof(buf);
1596         if (l > size)
1597             l = size;
1598         if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1599             error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1600                              " specified", orig_addr, orig_size);
1601             goto exit;
1602         }
1603         if (fwrite(buf, 1, l, f) != l) {
1604             error_setg(errp, QERR_IO_ERROR);
1605             goto exit;
1606         }
1607         addr += l;
1608         size -= l;
1609     }
1610
1611 exit:
1612     fclose(f);
1613 }
1614
1615 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1616                   Error **errp)
1617 {
1618     FILE *f;
1619     uint32_t l;
1620     uint8_t buf[1024];
1621
1622     f = fopen(filename, "wb");
1623     if (!f) {
1624         error_setg_file_open(errp, errno, filename);
1625         return;
1626     }
1627
1628     while (size != 0) {
1629         l = sizeof(buf);
1630         if (l > size)
1631             l = size;
1632         cpu_physical_memory_read(addr, buf, l);
1633         if (fwrite(buf, 1, l, f) != l) {
1634             error_setg(errp, QERR_IO_ERROR);
1635             goto exit;
1636         }
1637         addr += l;
1638         size -= l;
1639     }
1640
1641 exit:
1642     fclose(f);
1643 }
1644
1645 void qmp_inject_nmi(Error **errp)
1646 {
1647 #if defined(TARGET_I386)
1648     CPUState *cs;
1649
1650     CPU_FOREACH(cs) {
1651         X86CPU *cpu = X86_CPU(cs);
1652
1653         if (!cpu->apic_state) {
1654             cpu_interrupt(cs, CPU_INTERRUPT_NMI);
1655         } else {
1656             apic_deliver_nmi(cpu->apic_state);
1657         }
1658     }
1659 #else
1660     nmi_monitor_handle(monitor_get_cpu_index(), errp);
1661 #endif
1662 }
1663
1664 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1665 {
1666     if (!use_icount) {
1667         return;
1668     }
1669
1670     cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
1671                 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
1672     if (icount_align_option) {
1673         cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
1674         cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
1675     } else {
1676         cpu_fprintf(f, "Max guest delay     NA\n");
1677         cpu_fprintf(f, "Max guest advance   NA\n");
1678     }
1679 }