/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Needed early for CONFIG_BSD etc. */
#include "config-host.h"

#include "monitor.h"
#include "sysemu.h"
#include "gdbstub.h"
#include "dma.h"
#include "kvm.h"

#include "qemu-thread.h"
#include "cpus.h"

#ifndef _WIN32
#include "compatfd.h"
#endif

#ifdef SIGRTMIN
#define SIG_IPI (SIGRTMIN+4)
#else
#define SIG_IPI SIGUSR1
#endif

#ifdef CONFIG_LINUX

#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */

static CPUState *next_cpu;

/***********************************************************/
/* guest cycle counter */

/* Conversion factor from emulated instructions to virtual clock ticks. */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
#define MAX_ICOUNT_SHIFT 10
/* Compensate for varying guest execution speed. */
static int64_t qemu_icount_bias;
static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;
static int64_t vm_clock_warp_start;
static int64_t qemu_icount;

typedef struct TimersState {
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;
} TimersState;

TimersState timers_state;

/* Return the virtual CPU time, based on the instruction counter. */
int64_t cpu_get_icount(void)
{
    int64_t icount;
    CPUState *env = cpu_single_env;

    icount = qemu_icount;
    if (env) {
        if (!can_do_io(env)) {
            fprintf(stderr, "Bad clock read\n");
        }
        icount -= (env->icount_decr.u16.low + env->icount_extra);
    }
    return qemu_icount_bias + (icount << icount_time_shift);
}

/* return the host CPU cycle counter and handle stop/restart */
int64_t cpu_get_ticks(void)
{
    if (use_icount) {
        return cpu_get_icount();
    }
    if (!timers_state.cpu_ticks_enabled) {
        return timers_state.cpu_ticks_offset;
    } else {
        int64_t ticks;
        ticks = cpu_get_real_ticks();
        if (timers_state.cpu_ticks_prev > ticks) {
            /* Note: non increasing ticks may happen if the host uses
               software suspend */
            timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        }
        timers_state.cpu_ticks_prev = ticks;
        return ticks + timers_state.cpu_ticks_offset;
    }
}

/* return the host CPU monotonic timer and handle stop/restart */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    if (!timers_state.cpu_ticks_enabled) {
        return timers_state.cpu_clock_offset;
    } else {
        ti = get_clock();
        return ti + timers_state.cpu_clock_offset;
    }
}

/* enable cpu_get_ticks() */
void cpu_enable_ticks(void)
{
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
}

/* disable cpu_get_ticks() : the clock is stopped. You must not call
   cpu_get_ticks() after that. */
void cpu_disable_ticks(void)
{
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset = cpu_get_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock();
        timers_state.cpu_ticks_enabled = 0;
    }
}

/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop. */
#define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)

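/* Adjust icount_time_shift so that virtual time, as seen through
 * cpu_get_icount(), tracks real time: slow the virtual clock down when
 * the guest runs ahead of cpu_get_clock(), speed it up when it falls
 * behind. */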
static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;
    static int64_t last_delta;
    /* If the VM is not running, then do nothing. */
    if (!runstate_is_running()) {
        return;
    }
    cur_time = cpu_get_clock();
    cur_icount = qemu_get_clock_ns(vm_clock);
    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead. Slow time down. */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind. Speed time up. */
        icount_time_shift++;
    }
    last_delta = delta;
    qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift);
}

static void icount_adjust_rt(void *opaque)
{
    qemu_mod_timer(icount_rt_timer,
                   qemu_get_clock_ms(rt_clock) + 1000);
    icount_adjust();
}

static void icount_adjust_vm(void *opaque)
{
    qemu_mod_timer(icount_vm_timer,
                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
    icount_adjust();
}

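/* Convert a vm_clock deadline in nanoseconds into the number of guest
 * instructions that may run before it expires, rounding up. */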
static int64_t qemu_icount_round(int64_t count)
{
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
}

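/* Catch vm_clock up with real time: fold the real time elapsed since
 * vm_clock_warp_start into qemu_icount_bias, capped in adaptive mode so
 * that vm_clock does not run ahead of cpu_get_clock(). */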
static void icount_warp_rt(void *opaque)
{
    if (vm_clock_warp_start == -1) {
        return;
    }

    if (runstate_is_running()) {
        int64_t clock = qemu_get_clock_ns(rt_clock);
        int64_t warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 1) {
            qemu_icount_bias += warp_delta;
        } else {
            /*
             * In adaptive mode, do not let the vm_clock run too
             * far ahead of real time.
             */
            int64_t cur_time = cpu_get_clock();
            int64_t cur_icount = qemu_get_clock_ns(vm_clock);
            int64_t delta = cur_time - cur_icount;
            qemu_icount_bias += MIN(warp_delta, delta);
        }
        if (qemu_clock_expired(vm_clock)) {
            qemu_notify_event();
        }
    }
    vm_clock_warp_start = -1;
}

void qemu_clock_warp(QEMUClock *clock)
{
    int64_t deadline;

    /*
     * There are too many global variables to make the "warp" behavior
     * applicable to other clocks. But a clock argument removes the
     * need for if statements all over the place.
     */
    if (clock != vm_clock || !use_icount) {
        return;
    }

    /*
     * If the CPUs have been sleeping, advance the vm_clock timer now. This
     * ensures that the deadline for the timer is computed correctly below.
     * This also makes sure that the insn counter is synchronized before the
     * CPU starts running, in case the CPU is woken by an event other than
     * the earliest vm_clock timer.
     */
    icount_warp_rt(NULL);
    if (!all_cpu_threads_idle() || !qemu_clock_has_timers(vm_clock)) {
        qemu_del_timer(icount_warp_timer);
        return;
    }

    vm_clock_warp_start = qemu_get_clock_ns(rt_clock);
    deadline = qemu_clock_deadline(vm_clock);
    if (deadline > 0) {
        /*
         * Ensure the vm_clock proceeds even when the virtual CPU goes to
         * sleep. Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * vm_clock.
         *
         * An extreme solution for this problem would be to never let VCPUs
         * sleep in icount mode if there is a pending vm_clock timer; rather,
         * time could just advance to the next vm_clock event. Instead, we
         * do stop VCPUs and only advance vm_clock after some "real" time
         * (related to the time left until the next event) has passed. The
         * rt_clock timer below does this. This avoids making the warps too
         * visible externally---for example, you will not be sending network
         * packets continuously instead of every 100ms.
         */
        qemu_mod_timer(icount_warp_timer, vm_clock_warp_start + deadline);
    } else {
        qemu_notify_event();
    }
}

static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .minimum_version_id_old = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    }
};

void configure_icount(const char *option)
{
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    if (!option) {
        return;
    }

    icount_warp_timer = qemu_new_timer_ns(rt_clock, icount_warp_rt, NULL);
    if (strcmp(option, "auto") != 0) {
        icount_time_shift = strtol(option, NULL, 0);
        use_icount = 1;
        return;
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway. */
    icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers. */
    icount_rt_timer = qemu_new_timer_ms(rt_clock, icount_adjust_rt, NULL);
    qemu_mod_timer(icount_rt_timer,
                   qemu_get_clock_ms(rt_clock) + 1000);
    icount_vm_timer = qemu_new_timer_ns(vm_clock, icount_adjust_vm, NULL);
    qemu_mod_timer(icount_vm_timer,
                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
}

/***********************************************************/
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *env;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        fprintf(stderr, "CPU #%d:\n", env->cpu_index);
#ifdef TARGET_I386
        cpu_dump_state(env, stderr, fprintf, X86_DUMP_FPU);
#else
        cpu_dump_state(env, stderr, fprintf, 0);
#endif
    }
    va_end(ap);
    abort();
}

void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
        cpu_synchronize_state(cpu);
    }
}

void cpu_synchronize_all_post_reset(void)
{
    CPUState *cpu;

    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
        cpu_synchronize_post_reset(cpu);
    }
}

void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
        cpu_synchronize_post_init(cpu);
    }
}

int cpu_is_stopped(CPUState *env)
{
    return !runstate_is_running() || env->stopped;
}

static void do_vm_stop(RunState state)
{
    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        qemu_aio_flush();
        bdrv_flush_all();
        monitor_protocol_event(QEVENT_STOP, NULL);
    }
}

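/* Return 1 if env may execute guest code: no stop request, not stopped,
 * and the VM as a whole is running. */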
static int cpu_can_run(CPUState *env)
{
    if (env->stop) {
        return 0;
    }
    if (env->stopped || !runstate_is_running()) {
        return 0;
    }
    return 1;
}

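/* A vCPU thread is idle when the vCPU is halted with no pending work or
 * stop request. With KVM's in-kernel irqchip the halt state lives inside
 * the kernel, so the thread is never treated as idle here. */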
static bool cpu_thread_is_idle(CPUState *env)
{
    if (env->stop || env->queued_work_first) {
        return false;
    }
    if (env->stopped || !runstate_is_running()) {
        return true;
    }
    if (!env->halted || qemu_cpu_has_work(env) ||
        (kvm_enabled() && kvm_irqchip_in_kernel())) {
        return false;
    }
    return true;
}

bool all_cpu_threads_idle(void)
{
    CPUState *env;

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        if (!cpu_thread_is_idle(env)) {
            return false;
        }
    }
    return true;
}

static void cpu_handle_guest_debug(CPUState *env)
{
    gdb_set_stop_cpu(env);
    qemu_system_debug_request();
    env->stopped = 1;
}

static void cpu_signal(int sig)
{
    if (cpu_single_env) {
        cpu_exit(cpu_single_env);
    }
    exit_request = 1;
}

#ifdef CONFIG_LINUX
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        sigprocmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}

static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code,
                      (void *)(intptr_t)siginfo->ssi_addr)) {
        sigbus_reraise();
    }
}

static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}

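/* Drain any pending SIG_IPI and SIGBUS without blocking, forwarding
 * SIGBUS machine-check reports for this vCPU to KVM. */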
static void qemu_kvm_eat_signals(CPUState *env)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr)) {
                sigbus_reraise();
            }
            break;
        default:
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
}

#else /* !CONFIG_LINUX */

static void qemu_init_sigbus(void)
{
}

static void qemu_kvm_eat_signals(CPUState *env)
{
}
#endif /* !CONFIG_LINUX */

#ifndef _WIN32
static int io_thread_fd = -1;

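/* Wake the main loop by writing to the notification pipe (or eventfd). */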
static void qemu_event_increment(void)
{
    /* Write 8 bytes to be compatible with eventfd. */
    static const uint64_t val = 1;
    ssize_t ret;

    if (io_thread_fd == -1) {
        return;
    }
    do {
        ret = write(io_thread_fd, &val, sizeof(val));
    } while (ret < 0 && errno == EINTR);

    /* EAGAIN is fine, a read must be pending. */
    if (ret < 0 && errno != EAGAIN) {
        fprintf(stderr, "qemu_event_increment: write() failed: %s\n",
                strerror(errno));
        exit(1);
    }
}

static void qemu_event_read(void *opaque)
{
    int fd = (intptr_t)opaque;
    ssize_t len;
    char buffer[512];

    /* Drain the notify pipe. For eventfd, only 8 bytes will be read. */
    do {
        len = read(fd, buffer, sizeof(buffer));
    } while ((len == -1 && errno == EINTR) || len == sizeof(buffer));
}

static int qemu_event_init(void)
{
    int err;
    int fds[2];

    err = qemu_eventfd(fds);
    if (err == -1) {
        return -errno;
    }
    err = fcntl_setfl(fds[0], O_NONBLOCK);
    if (err < 0) {
        goto fail;
    }
    err = fcntl_setfl(fds[1], O_NONBLOCK);
    if (err < 0) {
        goto fail;
    }
    qemu_set_fd_handler2(fds[0], NULL, qemu_event_read, NULL,
                         (void *)(intptr_t)fds[0]);

    io_thread_fd = fds[1];
    return 0;

fail:
    close(fds[0]);
    close(fds[1]);
    return err;
}

static void dummy_signal(int sig)
{
}

/* If we have signalfd, we mask out the signals we want to handle and then
 * use signalfd to listen for them. We rely on whatever the current signal
 * handler is to dispatch the signals when we receive them.
 */
static void sigfd_handler(void *opaque)
{
    int fd = (intptr_t)opaque;
    struct qemu_signalfd_siginfo info;
    struct sigaction action;
    ssize_t len;

    while (1) {
        do {
            len = read(fd, &info, sizeof(info));
        } while (len == -1 && errno == EINTR);

        if (len == -1 && errno == EAGAIN) {
            break;
        }

        if (len != sizeof(info)) {
            printf("read from sigfd returned %zd: %m\n", len);
            return;
        }

        sigaction(info.ssi_signo, NULL, &action);
        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
            action.sa_sigaction(info.ssi_signo,
                                (siginfo_t *)&info, NULL);
        } else if (action.sa_handler) {
            action.sa_handler(info.ssi_signo);
        }
    }
}

static int qemu_signal_init(void)
{
    int sigfd;
    sigset_t set;

    /*
     * SIG_IPI must be blocked in the main thread and must not be caught
     * by sigwait() in the signal thread. Otherwise, the cpu thread will
     * not catch it reliably.
     */
    sigemptyset(&set);
    sigaddset(&set, SIG_IPI);
    pthread_sigmask(SIG_BLOCK, &set, NULL);

    sigemptyset(&set);
    sigaddset(&set, SIGIO);
    sigaddset(&set, SIGALRM);
    sigaddset(&set, SIGBUS);
    pthread_sigmask(SIG_BLOCK, &set, NULL);

    sigfd = qemu_signalfd(&set);
    if (sigfd == -1) {
        fprintf(stderr, "failed to create signalfd\n");
        return -errno;
    }

    fcntl_setfl(sigfd, O_NONBLOCK);

    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
                         (void *)(intptr_t)sigfd);

    return 0;
}

static void qemu_kvm_init_cpu_signals(CPUState *env)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);
    r = kvm_set_signal_mask(env, &set);
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

static void qemu_tcg_init_cpu_signals(void)
{
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = cpu_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    sigemptyset(&set);
    sigaddset(&set, SIG_IPI);
    pthread_sigmask(SIG_UNBLOCK, &set, NULL);
}

#else /* _WIN32 */

HANDLE qemu_event_handle;

static void dummy_event_handler(void *opaque)
{
}

static int qemu_event_init(void)
{
    qemu_event_handle = CreateEvent(NULL, FALSE, FALSE, NULL);
    if (!qemu_event_handle) {
        fprintf(stderr, "Failed CreateEvent: %ld\n", GetLastError());
        return -1;
    }
    qemu_add_wait_object(qemu_event_handle, dummy_event_handler, NULL);
    return 0;
}

static void qemu_event_increment(void)
{
    if (!SetEvent(qemu_event_handle)) {
        fprintf(stderr, "qemu_event_increment: SetEvent failed: %ld\n",
                GetLastError());
        exit(1);
    }
}

static int qemu_signal_init(void)
{
    return 0;
}

static void qemu_kvm_init_cpu_signals(CPUState *env)
{
    abort();
}

static void qemu_tcg_init_cpu_signals(void)
{
}
#endif /* _WIN32 */

QemuMutex qemu_global_mutex;
static QemuCond qemu_io_proceeded_cond;
static bool iothread_requesting_mutex;

static QemuThread io_thread;

static QemuThread *tcg_cpu_thread;
static QemuCond *tcg_halt_cond;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;
static QemuCond qemu_work_cond;

int qemu_init_main_loop(void)
{
    int ret;

    qemu_init_sigbus();

    ret = qemu_signal_init();
    if (ret) {
        return ret;
    }

    /* Note eventfd must be drained before signalfd handlers run */
    ret = qemu_event_init();
    if (ret) {
        return ret;
    }

    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_cond_init(&qemu_work_cond);
    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);
    qemu_mutex_lock(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);

    return 0;
}

void qemu_main_loop_start(void)
{
    resume_all_vcpus();
}

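/* Run func(data) on env's vCPU thread and wait until it has completed.
 * The work item lives on the caller's stack; the target thread pops it
 * in flush_queued_work() and broadcasts qemu_work_cond when done. */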
void run_on_cpu(CPUState *env, void (*func)(void *data), void *data)
{
    struct qemu_work_item wi;

    if (qemu_cpu_is_self(env)) {
        func(data);
        return;
    }

    wi.func = func;
    wi.data = data;
    if (!env->queued_work_first) {
        env->queued_work_first = &wi;
    } else {
        env->queued_work_last->next = &wi;
    }
    env->queued_work_last = &wi;
    wi.next = NULL;
    wi.done = false;

    qemu_cpu_kick(env);
    while (!wi.done) {
        CPUState *self_env = cpu_single_env;

        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
        cpu_single_env = self_env;
    }
}

static void flush_queued_work(CPUState *env)
{
    struct qemu_work_item *wi;

    if (!env->queued_work_first) {
        return;
    }

    while ((wi = env->queued_work_first)) {
        env->queued_work_first = wi->next;
        wi->func(wi->data);
        wi->done = true;
    }
    env->queued_work_last = NULL;
    qemu_cond_broadcast(&qemu_work_cond);
}

static void qemu_wait_io_event_common(CPUState *env)
{
    if (env->stop) {
        env->stop = 0;
        env->stopped = 1;
        qemu_cond_signal(&qemu_pause_cond);
    }
    flush_queued_work(env);
    env->thread_kicked = false;
}

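/* Sleep the single TCG thread while every vCPU is idle, and let the
 * iothread take the global mutex when it has requested it. */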
static void qemu_tcg_wait_io_event(void)
{
    CPUState *env;

    while (all_cpu_threads_idle()) {
        /* Start accounting real time to the virtual clock if the CPUs
           are idle. */
        qemu_clock_warp(vm_clock);
        qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
    }

    while (iothread_requesting_mutex) {
        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        qemu_wait_io_event_common(env);
    }
}

static void qemu_kvm_wait_io_event(CPUState *env)
{
    while (cpu_thread_is_idle(env)) {
        qemu_cond_wait(env->halt_cond, &qemu_global_mutex);
    }

    qemu_kvm_eat_signals(env);
    qemu_wait_io_event_common(env);
}

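/* Per-vCPU thread body for KVM: create the kernel vCPU, set up its
 * signal mask, then loop between kvm_cpu_exec() and idle waiting. */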
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *env = arg;
    int r;

    qemu_mutex_lock(&qemu_global_mutex);
    qemu_thread_get_self(env->thread);
    env->thread_id = qemu_get_thread_id();

    r = kvm_init_vcpu(env);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    qemu_kvm_init_cpu_signals(env);

    /* signal CPU creation */
    env->created = 1;
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        if (cpu_can_run(env)) {
            r = kvm_cpu_exec(env);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(env);
            }
        }
        qemu_kvm_wait_io_event(env);
    }

    return NULL;
}

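/* Thread body for TCG: a single thread runs all vCPUs, multiplexed
 * round robin by cpu_exec_all(). */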
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *env = arg;

    qemu_tcg_init_cpu_signals();
    qemu_thread_get_self(env->thread);

    /* signal CPU creation */
    qemu_mutex_lock(&qemu_global_mutex);
    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        env->thread_id = qemu_get_thread_id();
        env->created = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
    }

    while (1) {
        cpu_exec_all();
        if (use_icount && qemu_clock_deadline(vm_clock) <= 0) {
            qemu_notify_event();
        }
        qemu_tcg_wait_io_event();
    }

    return NULL;
}

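/* Force a vCPU thread out of guest code: SIG_IPI on POSIX hosts; on
 * Windows, suspend the thread, raise the exit request, and resume it. */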
static void qemu_cpu_kick_thread(CPUState *env)
{
#ifndef _WIN32
    int err;

    err = pthread_kill(env->thread->thread, SIG_IPI);
    if (err) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    if (!qemu_cpu_is_self(env)) {
        SuspendThread(env->thread->thread);
        cpu_signal(0);
        ResumeThread(env->thread->thread);
    }
#endif
}

void qemu_cpu_kick(void *_env)
{
    CPUState *env = _env;

    qemu_cond_broadcast(env->halt_cond);
    if (kvm_enabled() && !env->thread_kicked) {
        qemu_cpu_kick_thread(env);
        env->thread_kicked = true;
    }
}

void qemu_cpu_kick_self(void)
{
#ifndef _WIN32
    assert(cpu_single_env);

    if (!cpu_single_env->thread_kicked) {
        qemu_cpu_kick_thread(cpu_single_env);
        cpu_single_env->thread_kicked = true;
    }
#else
    abort();
#endif
}

int qemu_cpu_is_self(void *_env)
{
    CPUState *env = _env;

    return qemu_thread_is_self(env->thread);
}

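/* Take the global mutex from the iothread. Under TCG, kick the vCPU
 * thread first so it drops the mutex instead of holding it for a whole
 * execution slice; iothread_requesting_mutex keeps the TCG thread from
 * re-acquiring it immediately. */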
void qemu_mutex_lock_iothread(void)
{
    if (kvm_enabled()) {
        qemu_mutex_lock(&qemu_global_mutex);
    } else {
        iothread_requesting_mutex = true;
        if (qemu_mutex_trylock(&qemu_global_mutex)) {
            qemu_cpu_kick_thread(first_cpu);
            qemu_mutex_lock(&qemu_global_mutex);
        }
        iothread_requesting_mutex = false;
        qemu_cond_broadcast(&qemu_io_proceeded_cond);
    }
}

void qemu_mutex_unlock_iothread(void)
{
    qemu_mutex_unlock(&qemu_global_mutex);
}

static int all_vcpus_paused(void)
{
    CPUState *penv = first_cpu;

    while (penv) {
        if (!penv->stopped) {
            return 0;
        }
        penv = (CPUState *)penv->next_cpu;
    }

    return 1;
}

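/* Request every vCPU to stop, then wait (releasing the global mutex in
 * qemu_cond_wait) until all of them have reported themselves stopped. */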
void pause_all_vcpus(void)
{
    CPUState *penv = first_cpu;

    qemu_clock_enable(vm_clock, false);
    while (penv) {
        penv->stop = 1;
        qemu_cpu_kick(penv);
        penv = (CPUState *)penv->next_cpu;
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        penv = first_cpu;
        while (penv) {
            qemu_cpu_kick(penv);
            penv = (CPUState *)penv->next_cpu;
        }
    }
}

void resume_all_vcpus(void)
{
    CPUState *penv = first_cpu;

    while (penv) {
        penv->stop = 0;
        penv->stopped = 0;
        qemu_cpu_kick(penv);
        penv = (CPUState *)penv->next_cpu;
    }
}

static void qemu_tcg_init_vcpu(void *_env)
{
    CPUState *env = _env;

    /* share a single thread for all cpus with TCG */
    if (!tcg_cpu_thread) {
        env->thread = g_malloc0(sizeof(QemuThread));
        env->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(env->halt_cond);
        tcg_halt_cond = env->halt_cond;
        qemu_thread_create(env->thread, qemu_tcg_cpu_thread_fn, env);
        while (env->created == 0) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
        tcg_cpu_thread = env->thread;
    } else {
        env->thread = tcg_cpu_thread;
        env->halt_cond = tcg_halt_cond;
    }
}

static void qemu_kvm_start_vcpu(CPUState *env)
{
    env->thread = g_malloc0(sizeof(QemuThread));
    env->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(env->halt_cond);
    qemu_thread_create(env->thread, qemu_kvm_cpu_thread_fn, env);
    while (env->created == 0) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

void qemu_init_vcpu(void *_env)
{
    CPUState *env = _env;

    env->nr_cores = smp_cores;
    env->nr_threads = smp_threads;
    env->stopped = 1;
    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(env);
    } else {
        qemu_tcg_init_vcpu(env);
    }
}

void qemu_notify_event(void)
{
    qemu_event_increment();
}

void cpu_stop_current(void)
{
    if (cpu_single_env) {
        cpu_single_env->stop = 0;
        cpu_single_env->stopped = 1;
        cpu_exit(cpu_single_env);
        qemu_cond_signal(&qemu_pause_cond);
    }
}

void vm_stop(RunState state)
{
    if (!qemu_thread_is_self(&io_thread)) {
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return;
    }
    do_vm_stop(state);
}

/* does a state transition even if the VM is already stopped,
   current state is forgotten forever */
void vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        vm_stop(state);
    } else {
        runstate_set(state);
    }
}

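/* Execute guest code on one vCPU. In icount mode, budget the number of
 * instructions until the next vm_clock deadline, split between the
 * 16-bit icount_decr counter and icount_extra, and fold any unexecuted
 * instructions back afterwards. */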
static int tcg_cpu_exec(CPUState *env)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    if (use_icount) {
        int64_t count;
        int decr;
        qemu_icount -= (env->icount_decr.u16.low + env->icount_extra);
        env->icount_decr.u16.low = 0;
        env->icount_extra = 0;
        count = qemu_icount_round(qemu_clock_deadline(vm_clock));
        qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        env->icount_decr.u16.low = decr;
        env->icount_extra = count;
    }
    ret = cpu_exec(env);
#ifdef CONFIG_PROFILER
    qemu_time += profile_getclock() - ti;
#endif
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag. */
        qemu_icount -= (env->icount_decr.u16.low
                        + env->icount_extra);
        env->icount_decr.u32 = 0;
        env->icount_extra = 0;
    }
    return ret;
}

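/* Round-robin over all vCPUs, running each in turn until a stop or exit
 * is requested; returns true while any vCPU still has work to do. */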
bool cpu_exec_all(void)
{
    int r;

    /* Account partial waits to the vm_clock. */
    qemu_clock_warp(vm_clock);

    if (next_cpu == NULL) {
        next_cpu = first_cpu;
    }
    for (; next_cpu != NULL && !exit_request; next_cpu = next_cpu->next_cpu) {
        CPUState *env = next_cpu;

        qemu_clock_enable(vm_clock,
                          (env->singlestep_enabled & SSTEP_NOTIMER) == 0);

        if (cpu_can_run(env)) {
            if (kvm_enabled()) {
                r = kvm_cpu_exec(env);
                qemu_kvm_eat_signals(env);
            } else {
                r = tcg_cpu_exec(env);
            }
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(env);
                break;
            }
        } else if (env->stop || env->stopped) {
            break;
        }
    }
    exit_request = 0;
    return !all_cpu_threads_idle();
}

void set_numa_modes(void)
{
    CPUState *env;
    int i;

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        for (i = 0; i < nb_numa_nodes; i++) {
            if (node_cpumask[i] & (1 << env->cpu_index)) {
                env->numa_node = i;
            }
        }
    }
}

void set_cpu_log(const char *optarg)
{
    int mask;
    const CPULogItem *item;

    mask = cpu_str_to_log_mask(optarg);
    if (!mask) {
        printf("Log items (comma separated):\n");
        for (item = cpu_log_items; item->mask != 0; item++) {
            printf("%-10s %s\n", item->name, item->help);
        }
        exit(1);
    }
    cpu_set_log(mask);
}

void set_cpu_log_filename(const char *optarg)
{
    cpu_set_log_filename(optarg);
}

void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list_id)
    cpu_list_id(f, cpu_fprintf, optarg);
#elif defined(cpu_list)
    cpu_list(f, cpu_fprintf); /* deprecated */
#endif
}