X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=cpus.c;h=740b8dc3f808b320cce92c434af93f8bc315eb79;hb=3ae74003b55d821a1b3ee92d04cfa18528e0334d;hp=326742f445eeffdcf3ba35758f453ec3e4440075;hpb=500acc9c410bcea17148a1072e323c08d12e6a6b;p=mirror_qemu.git diff --git a/cpus.c b/cpus.c index 326742f445..740b8dc3f8 100644 --- a/cpus.c +++ b/cpus.c @@ -25,6 +25,7 @@ /* Needed early for CONFIG_BSD etc. */ #include "qemu/osdep.h" #include "qemu-common.h" +#include "qemu/config-file.h" #include "cpu.h" #include "monitor/monitor.h" #include "qapi/qmp/qerror.h" @@ -33,7 +34,9 @@ #include "sysemu/block-backend.h" #include "exec/gdbstub.h" #include "sysemu/dma.h" +#include "sysemu/hw_accel.h" #include "sysemu/kvm.h" +#include "sysemu/hax.h" #include "qmp-commands.h" #include "exec/exec-all.h" @@ -43,14 +46,11 @@ #include "qemu/main-loop.h" #include "qemu/bitmap.h" #include "qemu/seqlock.h" +#include "tcg.h" #include "qapi-event.h" #include "hw/nmi.h" #include "sysemu/replay.h" -#ifndef _WIN32 -#include "qemu/compatfd.h" -#endif - #ifdef CONFIG_LINUX #include @@ -69,7 +69,6 @@ #endif /* CONFIG_LINUX */ -static CPUState *next_cpu; int64_t max_delay; int64_t max_advance; @@ -149,21 +148,126 @@ typedef struct TimersState { } TimersState; static TimersState timers_state; +bool mttcg_enabled; + +/* + * We default to false if we know other options have been enabled + * which are currently incompatible with MTTCG. Otherwise when each + * guest (target) has been updated to support: + * - atomic instructions + * - memory ordering primitives (barriers) + * they can set the appropriate CONFIG flags in ${target}-softmmu.mak + * + * Once a guest architecture has been converted to the new primitives + * there are two remaining limitations to check. + * + * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host) + * - The host must have a stronger memory order than the guest + * + * It may be possible in future to support strong guests on weak hosts + * but that will require tagging all load/stores in a guest with their + * implicit memory order requirements which would likely slow things + * down a lot. 
+ */ + +static bool check_tcg_memory_orders_compatible(void) +{ +#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO) + return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0; +#else + return false; +#endif +} + +static bool default_mttcg_enabled(void) +{ + if (use_icount || TCG_OVERSIZED_GUEST) { + return false; + } else { +#ifdef TARGET_SUPPORTS_MTTCG + return check_tcg_memory_orders_compatible(); +#else + return false; +#endif + } +} + +void qemu_tcg_configure(QemuOpts *opts, Error **errp) +{ + const char *t = qemu_opt_get(opts, "thread"); + if (t) { + if (strcmp(t, "multi") == 0) { + if (TCG_OVERSIZED_GUEST) { + error_setg(errp, "No MTTCG when guest word size > hosts"); + } else if (use_icount) { + error_setg(errp, "No MTTCG when icount is enabled"); + } else { +#ifndef TARGET_SUPPORTS_MTTCG + error_report("Guest not yet converted to MTTCG - " + "you may get unexpected results"); +#endif + if (!check_tcg_memory_orders_compatible()) { + error_report("Guest expects a stronger memory ordering " + "than the host provides"); + error_printf("This may cause strange/hard to debug errors\n"); + } + mttcg_enabled = true; + } + } else if (strcmp(t, "single") == 0) { + mttcg_enabled = false; + } else { + error_setg(errp, "Invalid 'thread' setting %s", t); + } + } else { + mttcg_enabled = default_mttcg_enabled(); + } +} + +/* The current number of executed instructions is based on what we + * originally budgeted minus the current state of the decrementing + * icount counters in extra/u16.low. + */ +static int64_t cpu_get_icount_executed(CPUState *cpu) +{ + return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra); +} + +/* + * Update the global shared timer_state.qemu_icount to take into + * account executed instructions. This is done by the TCG vCPU + * thread so the main-loop can see time has moved forward. + */ +void cpu_update_icount(CPUState *cpu) +{ + int64_t executed = cpu_get_icount_executed(cpu); + cpu->icount_budget -= executed; + +#ifdef CONFIG_ATOMIC64 + atomic_set__nocheck(&timers_state.qemu_icount, + atomic_read__nocheck(&timers_state.qemu_icount) + + executed); +#else /* FIXME: we need 64bit atomics to do this safely */ + timers_state.qemu_icount += executed; +#endif +} int64_t cpu_get_icount_raw(void) { - int64_t icount; CPUState *cpu = current_cpu; - icount = timers_state.qemu_icount; - if (cpu) { + if (cpu && cpu->running) { if (!cpu->can_do_io) { fprintf(stderr, "Bad icount read\n"); exit(1); } - icount -= (cpu->icount_decr.u16.low + cpu->icount_extra); + /* Take into account what has run */ + cpu_update_icount(cpu); } - return icount; +#ifdef CONFIG_ATOMIC64 + return atomic_read__nocheck(&timers_state.qemu_icount); +#else /* FIXME: we need 64bit atomics to do this safely */ + return timers_state.qemu_icount; +#endif } /* Return the virtual CPU time, based on the instruction counter. */ @@ -191,8 +295,12 @@ int64_t cpu_icount_to_ns(int64_t icount) return icount << icount_time_shift; } -/* return the host CPU cycle counter and handle stop/restart */ -/* Caller must hold the BQL */ +/* return the time elapsed in VM between vm_start and vm_stop. Unless + * icount is active, cpu_get_ticks() uses units of the host CPU cycle + * counter. 
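
/*
 * Illustrative sketch (not part of the patch): the MTTCG default computed by
 * check_tcg_memory_orders_compatible() above is a plain subset test on
 * ordering bitmasks - multi-threading is only safe when every ordering
 * guarantee the guest relies on is also provided by the TCG host backend.
 * The MO_* bits and sample masks below are invented for the demonstration;
 * QEMU's real masks are TCG_GUEST_DEFAULT_MO and TCG_TARGET_DEFAULT_MO.
 */
#include <stdbool.h>
#include <stdio.h>

enum {
    MO_LD_LD = 1 << 0,   /* loads ordered against later loads  */
    MO_LD_ST = 1 << 1,   /* loads ordered against later stores */
    MO_ST_LD = 1 << 2,   /* stores ordered against later loads */
    MO_ST_ST = 1 << 3,   /* stores ordered against later stores */
};

static bool memory_orders_compatible(unsigned guest_mo, unsigned host_mo)
{
    /* no bit of the guest's requirements may fall outside the host's */
    return (guest_mo & ~host_mo) == 0;
}

int main(void)
{
    unsigned strong_guest = MO_LD_LD | MO_LD_ST | MO_ST_LD | MO_ST_ST;
    unsigned weak_host    = MO_LD_LD | MO_LD_ST | MO_ST_ST;

    /* strong guest on a weaker host: MTTCG stays disabled by default */
    printf("strong on weak: %d\n", memory_orders_compatible(strong_guest, weak_host));
    /* weak guest on a stronger host: fine */
    printf("weak on strong: %d\n", memory_orders_compatible(weak_host, strong_guest));
    return 0;
}
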
+ * + * Caller must hold the BQL + */ int64_t cpu_get_ticks(void) { int64_t ticks; @@ -219,17 +327,19 @@ int64_t cpu_get_ticks(void) static int64_t cpu_get_clock_locked(void) { - int64_t ticks; + int64_t time; - ticks = timers_state.cpu_clock_offset; + time = timers_state.cpu_clock_offset; if (timers_state.cpu_ticks_enabled) { - ticks += get_clock(); + time += get_clock(); } - return ticks; + return time; } -/* return the host CPU monotonic timer and handle stop/restart */ +/* Return the monotonic time elapsed in VM, i.e., + * the time between vm_start and vm_stop + */ int64_t cpu_get_clock(void) { int64_t ti; @@ -244,34 +354,34 @@ int64_t cpu_get_clock(void) } /* enable cpu_get_ticks() - * Caller must hold BQL which server as mutex for vm_clock_seqlock. + * Caller must hold BQL which serves as mutex for vm_clock_seqlock. */ void cpu_enable_ticks(void) { /* Here, the really thing protected by seqlock is cpu_clock_offset. */ - seqlock_write_lock(&timers_state.vm_clock_seqlock); + seqlock_write_begin(&timers_state.vm_clock_seqlock); if (!timers_state.cpu_ticks_enabled) { timers_state.cpu_ticks_offset -= cpu_get_host_ticks(); timers_state.cpu_clock_offset -= get_clock(); timers_state.cpu_ticks_enabled = 1; } - seqlock_write_unlock(&timers_state.vm_clock_seqlock); + seqlock_write_end(&timers_state.vm_clock_seqlock); } /* disable cpu_get_ticks() : the clock is stopped. You must not call * cpu_get_ticks() after that. - * Caller must hold BQL which server as mutex for vm_clock_seqlock. + * Caller must hold BQL which serves as mutex for vm_clock_seqlock. */ void cpu_disable_ticks(void) { /* Here, the really thing protected by seqlock is cpu_clock_offset. */ - seqlock_write_lock(&timers_state.vm_clock_seqlock); + seqlock_write_begin(&timers_state.vm_clock_seqlock); if (timers_state.cpu_ticks_enabled) { timers_state.cpu_ticks_offset += cpu_get_host_ticks(); timers_state.cpu_clock_offset = cpu_get_clock_locked(); timers_state.cpu_ticks_enabled = 0; } - seqlock_write_unlock(&timers_state.vm_clock_seqlock); + seqlock_write_end(&timers_state.vm_clock_seqlock); } /* Correlation between real and virtual time is always going to be @@ -294,7 +404,7 @@ static void icount_adjust(void) return; } - seqlock_write_lock(&timers_state.vm_clock_seqlock); + seqlock_write_begin(&timers_state.vm_clock_seqlock); cur_time = cpu_get_clock_locked(); cur_icount = cpu_get_icount_locked(); @@ -315,7 +425,7 @@ static void icount_adjust(void) last_delta = delta; timers_state.qemu_icount_bias = cur_icount - (timers_state.qemu_icount << icount_time_shift); - seqlock_write_unlock(&timers_state.vm_clock_seqlock); + seqlock_write_end(&timers_state.vm_clock_seqlock); } static void icount_adjust_rt(void *opaque) @@ -355,7 +465,7 @@ static void icount_warp_rt(void) return; } - seqlock_write_lock(&timers_state.vm_clock_seqlock); + seqlock_write_begin(&timers_state.vm_clock_seqlock); if (runstate_is_running()) { int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT, cpu_get_clock_locked()); @@ -374,7 +484,7 @@ static void icount_warp_rt(void) timers_state.qemu_icount_bias += warp_delta; } vm_clock_warp_start = -1; - seqlock_write_unlock(&timers_state.vm_clock_seqlock); + seqlock_write_end(&timers_state.vm_clock_seqlock); if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) { qemu_clock_notify(QEMU_CLOCK_VIRTUAL); @@ -399,9 +509,9 @@ void qtest_clock_warp(int64_t dest) int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL); int64_t warp = qemu_soonest_timeout(dest - clock, deadline); - seqlock_write_lock(&timers_state.vm_clock_seqlock); 
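
/*
 * Illustrative sketch (not part of the patch): the seqlock_write_begin()/
 * seqlock_write_end() pairs introduced throughout these hunks follow the
 * classic seqcount pattern - the writer bumps a counter to odd before
 * touching the protected fields and back to even afterwards, while readers
 * retry until they see the same even counter before and after reading.
 * This is a simplified standalone model with C11 atomics, not QEMU's
 * implementation; the plain read of the protected field is a shortcut a
 * real seqlock has to tolerate until the retry check passes.
 */
#include <stdatomic.h>
#include <stdint.h>

typedef struct {
    atomic_uint sequence;
    int64_t clock_offset;            /* field protected by the seqlock */
} MiniSeqlock;

static void mini_write_begin(MiniSeqlock *sl)
{
    atomic_fetch_add_explicit(&sl->sequence, 1, memory_order_relaxed);
    atomic_thread_fence(memory_order_release);   /* odd: write in progress */
}

static void mini_write_end(MiniSeqlock *sl)
{
    atomic_thread_fence(memory_order_release);
    atomic_fetch_add_explicit(&sl->sequence, 1, memory_order_relaxed);  /* even again */
}

static int64_t mini_read(MiniSeqlock *sl)
{
    unsigned start;
    int64_t val;

    do {
        start = atomic_load_explicit(&sl->sequence, memory_order_acquire);
        val = sl->clock_offset;
        atomic_thread_fence(memory_order_acquire);
        /* retry while a write is in progress or one completed under us */
    } while ((start & 1) ||
             start != atomic_load_explicit(&sl->sequence, memory_order_relaxed));
    return val;
}
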
+ seqlock_write_begin(&timers_state.vm_clock_seqlock); timers_state.qemu_icount_bias += warp; - seqlock_write_unlock(&timers_state.vm_clock_seqlock); + seqlock_write_end(&timers_state.vm_clock_seqlock); qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL); timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]); @@ -468,9 +578,9 @@ void qemu_start_warp_timer(void) * It is useful when we want a deterministic execution time, * isolated from host latencies. */ - seqlock_write_lock(&timers_state.vm_clock_seqlock); + seqlock_write_begin(&timers_state.vm_clock_seqlock); timers_state.qemu_icount_bias += deadline; - seqlock_write_unlock(&timers_state.vm_clock_seqlock); + seqlock_write_end(&timers_state.vm_clock_seqlock); qemu_clock_notify(QEMU_CLOCK_VIRTUAL); } else { /* @@ -481,11 +591,11 @@ void qemu_start_warp_timer(void) * you will not be sending network packets continuously instead of * every 100ms. */ - seqlock_write_lock(&timers_state.vm_clock_seqlock); + seqlock_write_begin(&timers_state.vm_clock_seqlock); if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) { vm_clock_warp_start = clock; } - seqlock_write_unlock(&timers_state.vm_clock_seqlock); + seqlock_write_end(&timers_state.vm_clock_seqlock); timer_mod_anticipate(icount_warp_timer, clock + deadline); } } else if (deadline == 0) { @@ -551,9 +661,8 @@ static const VMStateDescription vmstate_timers = { } }; -static void cpu_throttle_thread(void *opaque) +static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque) { - CPUState *cpu = opaque; double pct; double throttle_ratio; long sleeptime_ns; @@ -583,7 +692,8 @@ static void cpu_throttle_timer_tick(void *opaque) } CPU_FOREACH(cpu) { if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) { - async_run_on_cpu(cpu, cpu_throttle_thread, cpu); + async_run_on_cpu(cpu, cpu_throttle_thread, + RUN_ON_CPU_NULL); } } @@ -621,7 +731,7 @@ int cpu_throttle_get_percentage(void) void cpu_ticks_init(void) { - seqlock_init(&timers_state.vm_clock_seqlock, NULL); + seqlock_init(&timers_state.vm_clock_seqlock); vmstate_register(NULL, 0, &vmstate_timers, &timers_state); throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT, cpu_throttle_timer_tick, NULL); @@ -687,6 +797,84 @@ void configure_icount(QemuOpts *opts, Error **errp) NANOSECONDS_PER_SECOND / 10); } +/***********************************************************/ +/* TCG vCPU kick timer + * + * The kick timer is responsible for moving single threaded vCPU + * emulation on to the next vCPU. If more than one vCPU is running a + * timer event with force a cpu->exit so the next vCPU can get + * scheduled. + * + * The timer is removed if all vCPUs are idle and restarted again once + * idleness is complete. 
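
/*
 * Illustrative sketch (not part of the patch): cpu_throttle_thread() above
 * makes a vCPU sleep in proportion to the requested throttle percentage.
 * For a throttle fraction pct the sleep per timeslice is pct / (1 - pct)
 * times the running timeslice, so 50% sleeps as long as it runs and 99%
 * sleeps 99x longer.  The timeslice constant below is an assumption for the
 * demo; the real constant and rounding live in QEMU's throttle code.
 */
#include <stdio.h>

#define DEMO_THROTTLE_TIMESLICE_NS 10000000L    /* assumed 10 ms active slice */

static long throttle_sleep_ns(double pct)
{
    double throttle_ratio = pct / (1 - pct);
    return (long)(throttle_ratio * DEMO_THROTTLE_TIMESLICE_NS);
}

int main(void)
{
    int percentages[] = { 10, 50, 90, 99 };
    for (unsigned i = 0; i < sizeof(percentages) / sizeof(percentages[0]); i++) {
        double pct = percentages[i] / 100.0;
        printf("throttle %2d%% -> sleep %ld ns per slice\n",
               percentages[i], throttle_sleep_ns(pct));
    }
    return 0;
}
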
+ */ + +static QEMUTimer *tcg_kick_vcpu_timer; +static CPUState *tcg_current_rr_cpu; + +#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10) + +static inline int64_t qemu_tcg_next_kick(void) +{ + return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD; +} + +/* Kick the currently round-robin scheduled vCPU */ +static void qemu_cpu_kick_rr_cpu(void) +{ + CPUState *cpu; + do { + cpu = atomic_mb_read(&tcg_current_rr_cpu); + if (cpu) { + cpu_exit(cpu); + } + } while (cpu != atomic_mb_read(&tcg_current_rr_cpu)); +} + +static void do_nothing(CPUState *cpu, run_on_cpu_data unused) +{ +} + +void qemu_timer_notify_cb(void *opaque, QEMUClockType type) +{ + if (!use_icount || type != QEMU_CLOCK_VIRTUAL) { + qemu_notify_event(); + return; + } + + if (!qemu_in_vcpu_thread() && first_cpu) { + /* qemu_cpu_kick is not enough to kick a halted CPU out of + * qemu_tcg_wait_io_event. async_run_on_cpu, instead, + * causes cpu_thread_is_idle to return false. This way, + * handle_icount_deadline can run. + */ + async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL); + } +} + +static void kick_tcg_thread(void *opaque) +{ + timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick()); + qemu_cpu_kick_rr_cpu(); +} + +static void start_tcg_kick_timer(void) +{ + if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) { + tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, + kick_tcg_thread, NULL); + timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick()); + } +} + +static void stop_tcg_kick_timer(void) +{ + if (tcg_kick_vcpu_timer) { + timer_del(tcg_kick_vcpu_timer); + tcg_kick_vcpu_timer = NULL; + } +} + /***********************************************************/ void hw_error(const char *fmt, ...) { @@ -745,7 +933,8 @@ static int do_vm_stop(RunState state) } bdrv_drain_all(); - ret = blk_flush_all(); + replay_disable_events(); + ret = bdrv_flush_all(); return ret; } @@ -786,13 +975,23 @@ static void sigbus_reraise(void) abort(); } -static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo, - void *ctx) +static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx) { - if (kvm_on_sigbus(siginfo->ssi_code, - (void *)(intptr_t)siginfo->ssi_addr)) { + if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) { sigbus_reraise(); } + + if (current_cpu) { + /* Called asynchronously in VCPU thread. */ + if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) { + sigbus_reraise(); + } + } else { + /* Called synchronously (via signalfd) in main thread. 
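
/*
 * Illustrative sketch (not part of the patch): qemu_cpu_kick_rr_cpu() above
 * has to land the kick on whichever vCPU is *currently* scheduled, and the
 * round-robin loop may move on to another vCPU while we are kicking.  Reading
 * the pointer, kicking, and re-checking that the pointer is unchanged closes
 * that race.  Minimal standalone model with C11 atomics; the types and the
 * kick action are invented for the demo.
 */
#include <stdatomic.h>
#include <stdbool.h>

typedef struct DemoCPU {
    atomic_bool exit_request;
} DemoCPU;

static DemoCPU * _Atomic current_rr_cpu;   /* set by the scheduling loop */

static void demo_kick_rr_cpu(void)
{
    DemoCPU *cpu;

    do {
        cpu = atomic_load(&current_rr_cpu);
        if (cpu) {
            atomic_store(&cpu->exit_request, true);   /* stand-in for cpu_exit() */
        }
        /* retry if the scheduler switched vCPUs under our feet */
    } while (cpu != atomic_load(&current_rr_cpu));
}
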
*/ + if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) { + sigbus_reraise(); + } + } } static void qemu_init_sigbus(void) @@ -801,95 +1000,18 @@ static void qemu_init_sigbus(void) memset(&action, 0, sizeof(action)); action.sa_flags = SA_SIGINFO; - action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler; + action.sa_sigaction = sigbus_handler; sigaction(SIGBUS, &action, NULL); prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0); } - -static void qemu_kvm_eat_signals(CPUState *cpu) -{ - struct timespec ts = { 0, 0 }; - siginfo_t siginfo; - sigset_t waitset; - sigset_t chkset; - int r; - - sigemptyset(&waitset); - sigaddset(&waitset, SIG_IPI); - sigaddset(&waitset, SIGBUS); - - do { - r = sigtimedwait(&waitset, &siginfo, &ts); - if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { - perror("sigtimedwait"); - exit(1); - } - - switch (r) { - case SIGBUS: - if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) { - sigbus_reraise(); - } - break; - default: - break; - } - - r = sigpending(&chkset); - if (r == -1) { - perror("sigpending"); - exit(1); - } - } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS)); -} - #else /* !CONFIG_LINUX */ - static void qemu_init_sigbus(void) { } - -static void qemu_kvm_eat_signals(CPUState *cpu) -{ -} #endif /* !CONFIG_LINUX */ -#ifndef _WIN32 -static void dummy_signal(int sig) -{ -} - -static void qemu_kvm_init_cpu_signals(CPUState *cpu) -{ - int r; - sigset_t set; - struct sigaction sigact; - - memset(&sigact, 0, sizeof(sigact)); - sigact.sa_handler = dummy_signal; - sigaction(SIG_IPI, &sigact, NULL); - - pthread_sigmask(SIG_BLOCK, NULL, &set); - sigdelset(&set, SIG_IPI); - sigdelset(&set, SIGBUS); - r = kvm_set_signal_mask(cpu, &set); - if (r) { - fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r)); - exit(1); - } -} - -#else /* _WIN32 */ -static void qemu_kvm_init_cpu_signals(CPUState *cpu) -{ - abort(); -} -#endif /* _WIN32 */ - static QemuMutex qemu_global_mutex; -static QemuCond qemu_io_proceeded_cond; -static unsigned iothread_requesting_mutex; static QemuThread io_thread; @@ -897,79 +1019,20 @@ static QemuThread io_thread; static QemuCond qemu_cpu_cond; /* system init */ static QemuCond qemu_pause_cond; -static QemuCond qemu_work_cond; void qemu_init_cpu_loop(void) { qemu_init_sigbus(); qemu_cond_init(&qemu_cpu_cond); qemu_cond_init(&qemu_pause_cond); - qemu_cond_init(&qemu_work_cond); - qemu_cond_init(&qemu_io_proceeded_cond); qemu_mutex_init(&qemu_global_mutex); qemu_thread_get_self(&io_thread); } -void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data) -{ - struct qemu_work_item wi; - - if (qemu_cpu_is_self(cpu)) { - func(data); - return; - } - - wi.func = func; - wi.data = data; - wi.free = false; - - qemu_mutex_lock(&cpu->work_mutex); - if (cpu->queued_work_first == NULL) { - cpu->queued_work_first = &wi; - } else { - cpu->queued_work_last->next = &wi; - } - cpu->queued_work_last = &wi; - wi.next = NULL; - wi.done = false; - qemu_mutex_unlock(&cpu->work_mutex); - - qemu_cpu_kick(cpu); - while (!atomic_mb_read(&wi.done)) { - CPUState *self_cpu = current_cpu; - - qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex); - current_cpu = self_cpu; - } -} - -void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data) +void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data) { - struct qemu_work_item *wi; - - if (qemu_cpu_is_self(cpu)) { - func(data); - return; - } - - wi = g_malloc0(sizeof(struct qemu_work_item)); - wi->func = func; - wi->data = 
data; - wi->free = true; - - qemu_mutex_lock(&cpu->work_mutex); - if (cpu->queued_work_first == NULL) { - cpu->queued_work_first = wi; - } else { - cpu->queued_work_last->next = wi; - } - cpu->queued_work_last = wi; - wi->next = NULL; - wi->done = false; - qemu_mutex_unlock(&cpu->work_mutex); - - qemu_cpu_kick(cpu); + do_run_on_cpu(cpu, func, data, &qemu_global_mutex); } static void qemu_kvm_destroy_vcpu(CPUState *cpu) @@ -984,58 +1047,36 @@ static void qemu_tcg_destroy_vcpu(CPUState *cpu) { } -static void flush_queued_work(CPUState *cpu) -{ - struct qemu_work_item *wi; - - if (cpu->queued_work_first == NULL) { - return; - } - - qemu_mutex_lock(&cpu->work_mutex); - while (cpu->queued_work_first != NULL) { - wi = cpu->queued_work_first; - cpu->queued_work_first = wi->next; - if (!cpu->queued_work_first) { - cpu->queued_work_last = NULL; - } - qemu_mutex_unlock(&cpu->work_mutex); - wi->func(wi->data); - qemu_mutex_lock(&cpu->work_mutex); - if (wi->free) { - g_free(wi); - } else { - atomic_mb_set(&wi->done, true); - } - } - qemu_mutex_unlock(&cpu->work_mutex); - qemu_cond_broadcast(&qemu_work_cond); -} - static void qemu_wait_io_event_common(CPUState *cpu) { + atomic_mb_set(&cpu->thread_kicked, false); if (cpu->stop) { cpu->stop = false; cpu->stopped = true; qemu_cond_broadcast(&qemu_pause_cond); } - flush_queued_work(cpu); - cpu->thread_kicked = false; + process_queued_cpu_work(cpu); +} + +static bool qemu_tcg_should_sleep(CPUState *cpu) +{ + if (mttcg_enabled) { + return cpu_thread_is_idle(cpu); + } else { + return all_cpu_threads_idle(); + } } static void qemu_tcg_wait_io_event(CPUState *cpu) { - while (all_cpu_threads_idle()) { + while (qemu_tcg_should_sleep(cpu)) { + stop_tcg_kick_timer(); qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex); } - while (iothread_requesting_mutex) { - qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex); - } + start_tcg_kick_timer(); - CPU_FOREACH(cpu) { - qemu_wait_io_event_common(cpu); - } + qemu_wait_io_event_common(cpu); } static void qemu_kvm_wait_io_event(CPUState *cpu) @@ -1044,7 +1085,6 @@ static void qemu_kvm_wait_io_event(CPUState *cpu) qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex); } - qemu_kvm_eat_signals(cpu); qemu_wait_io_event_common(cpu); } @@ -1067,7 +1107,7 @@ static void *qemu_kvm_cpu_thread_fn(void *arg) exit(1); } - qemu_kvm_init_cpu_signals(cpu); + kvm_init_cpu_signals(cpu); /* signal CPU creation */ cpu->created = true; @@ -1106,6 +1146,7 @@ static void *qemu_dummy_cpu_thread_fn(void *arg) qemu_thread_get_self(cpu->thread); cpu->thread_id = qemu_get_thread_id(); cpu->can_do_io = 1; + current_cpu = cpu; sigemptyset(&waitset); sigaddset(&waitset, SIG_IPI); @@ -1114,9 +1155,7 @@ static void *qemu_dummy_cpu_thread_fn(void *arg) cpu->created = true; qemu_cond_signal(&qemu_cpu_cond); - current_cpu = cpu; while (1) { - current_cpu = NULL; qemu_mutex_unlock_iothread(); do { int sig; @@ -1127,7 +1166,6 @@ static void *qemu_dummy_cpu_thread_fn(void *arg) exit(1); } qemu_mutex_lock_iothread(); - current_cpu = cpu; qemu_wait_io_event_common(cpu); } @@ -1135,12 +1173,128 @@ static void *qemu_dummy_cpu_thread_fn(void *arg) #endif } -static void tcg_exec_all(void); +static int64_t tcg_get_icount_limit(void) +{ + int64_t deadline; -static void *qemu_tcg_cpu_thread_fn(void *arg) + if (replay_mode != REPLAY_MODE_PLAY) { + deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL); + + /* Maintain prior (possibly buggy) behaviour where if no deadline + * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than + * INT32_MAX 
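
/*
 * Illustrative sketch (not part of the patch): the open-coded run_on_cpu()
 * and flush_queued_work() removed in this hunk implement a classic hand-off -
 * queue a work item for another thread, kick it, then block on a condition
 * variable until that thread marks the item done.  The patch moves this into
 * a shared do_run_on_cpu() helper; the standalone pthread model below only
 * shows the shape of the hand-off, with invented names, and elides how the
 * worker itself gets woken up (qemu_cpu_kick() in QEMU).
 */
#include <pthread.h>
#include <stdbool.h>

typedef struct DemoWorkItem {
    void (*func)(void *data);
    void *data;
    bool done;
    struct DemoWorkItem *next;
} DemoWorkItem;

typedef struct {
    pthread_mutex_t lock;
    pthread_cond_t done_cond;
    DemoWorkItem *queue;
} DemoWorker;

/* caller side: queue the item and sleep until the worker has run it */
static void demo_run_on_worker(DemoWorker *w, void (*func)(void *), void *data)
{
    DemoWorkItem wi = { .func = func, .data = data, .done = false };

    pthread_mutex_lock(&w->lock);
    wi.next = w->queue;
    w->queue = &wi;
    while (!wi.done) {
        pthread_cond_wait(&w->done_cond, &w->lock);
    }
    pthread_mutex_unlock(&w->lock);
}

/* worker side: drain the queue and wake up anyone waiting for completion */
static void demo_flush_work(DemoWorker *w)
{
    pthread_mutex_lock(&w->lock);
    while (w->queue) {
        DemoWorkItem *wi = w->queue;
        w->queue = wi->next;
        wi->func(wi->data);
        wi->done = true;
    }
    pthread_cond_broadcast(&w->done_cond);
    pthread_mutex_unlock(&w->lock);
}
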
nanoseconds ahead, we still use INT32_MAX + * nanoseconds. + */ + if ((deadline < 0) || (deadline > INT32_MAX)) { + deadline = INT32_MAX; + } + + return qemu_icount_round(deadline); + } else { + return replay_get_instructions(); + } +} + +static void handle_icount_deadline(void) +{ + assert(qemu_in_vcpu_thread()); + if (use_icount) { + int64_t deadline = + qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL); + + if (deadline == 0) { + /* Wake up other AioContexts. */ + qemu_clock_notify(QEMU_CLOCK_VIRTUAL); + qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL); + } + } +} + +static void prepare_icount_for_run(CPUState *cpu) +{ + if (use_icount) { + int insns_left; + + /* These should always be cleared by process_icount_data after + * each vCPU execution. However u16.high can be raised + * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt + */ + g_assert(cpu->icount_decr.u16.low == 0); + g_assert(cpu->icount_extra == 0); + + cpu->icount_budget = tcg_get_icount_limit(); + insns_left = MIN(0xffff, cpu->icount_budget); + cpu->icount_decr.u16.low = insns_left; + cpu->icount_extra = cpu->icount_budget - insns_left; + } +} + +static void process_icount_data(CPUState *cpu) +{ + if (use_icount) { + /* Account for executed instructions */ + cpu_update_icount(cpu); + + /* Reset the counters */ + cpu->icount_decr.u16.low = 0; + cpu->icount_extra = 0; + cpu->icount_budget = 0; + + replay_account_executed_instructions(); + } +} + + +static int tcg_cpu_exec(CPUState *cpu) +{ + int ret; +#ifdef CONFIG_PROFILER + int64_t ti; +#endif + +#ifdef CONFIG_PROFILER + ti = profile_getclock(); +#endif + qemu_mutex_unlock_iothread(); + cpu_exec_start(cpu); + ret = cpu_exec(cpu); + cpu_exec_end(cpu); + qemu_mutex_lock_iothread(); +#ifdef CONFIG_PROFILER + tcg_time += profile_getclock() - ti; +#endif + return ret; +} + +/* Destroy any remaining vCPUs which have been unplugged and have + * finished running + */ +static void deal_with_unplugged_cpus(void) +{ + CPUState *cpu; + + CPU_FOREACH(cpu) { + if (cpu->unplug && !cpu_can_run(cpu)) { + qemu_tcg_destroy_vcpu(cpu); + cpu->created = false; + qemu_cond_signal(&qemu_cpu_cond); + break; + } + } +} + +/* Single-threaded TCG + * + * In the single-threaded case each vCPU is simulated in turn. If + * there is more than a single vCPU we create a simple timer to kick + * the vCPU and ensure we don't get stuck in a tight loop in one vCPU. + * This is done explicitly rather than relying on side-effects + * elsewhere. + */ + +static void *qemu_tcg_rr_cpu_thread_fn(void *arg) { CPUState *cpu = arg; - CPUState *remove_cpu = NULL; rcu_register_thread(); @@ -1160,36 +1314,179 @@ static void *qemu_tcg_cpu_thread_fn(void *arg) /* process any pending work */ CPU_FOREACH(cpu) { + current_cpu = cpu; qemu_wait_io_event_common(cpu); } } + start_tcg_kick_timer(); + + cpu = first_cpu; + /* process any pending work */ - atomic_mb_set(&exit_request, 1); + cpu->exit_request = 1; while (1) { - tcg_exec_all(); + /* Account partial waits to QEMU_CLOCK_VIRTUAL. */ + qemu_account_warp_timer(); - if (use_icount) { - int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL); + /* Run the timers here. This is much more efficient than + * waking up the I/O thread and waiting for completion. 
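
/*
 * Illustrative sketch (not part of the patch): prepare_icount_for_run() above
 * splits the instruction budget between the 16-bit decrementer (at most
 * 0xffff instructions per entry into the translation loop) and icount_extra,
 * and cpu_get_icount_executed() recovers how many instructions actually ran
 * from whatever is left in those two counters.  Standalone arithmetic model
 * with invented struct/field names mirroring the patch.
 */
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    int64_t icount_budget;   /* instructions granted for this run */
    uint16_t u16_low;        /* models icount_decr.u16.low        */
    int64_t icount_extra;
} DemoCPU;

static void demo_prepare_run(DemoCPU *cpu, int64_t budget)
{
    int64_t insns_left = budget < 0xffff ? budget : 0xffff;

    cpu->icount_budget = budget;
    cpu->u16_low = insns_left;
    cpu->icount_extra = budget - insns_left;
}

static int64_t demo_icount_executed(const DemoCPU *cpu)
{
    /* executed = what we budgeted minus what is still left to run */
    return cpu->icount_budget - (cpu->u16_low + cpu->icount_extra);
}

int main(void)
{
    DemoCPU cpu;

    demo_prepare_run(&cpu, 100000);
    assert(demo_icount_executed(&cpu) == 0);      /* nothing run yet */

    /* pretend TCG consumed the whole decrementer, then refilled it from extra */
    cpu.u16_low = cpu.icount_extra < 0xffff ? cpu.icount_extra : 0xffff;
    cpu.icount_extra -= cpu.u16_low;
    assert(demo_icount_executed(&cpu) == 0xffff);

    /* then 1000 more instructions executed out of the decrementer */
    cpu.u16_low -= 1000;
    printf("executed: %" PRId64 "\n", demo_icount_executed(&cpu));
    return 0;
}
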
+ */ + handle_icount_deadline(); - if (deadline == 0) { - qemu_clock_notify(QEMU_CLOCK_VIRTUAL); - } + if (!cpu) { + cpu = first_cpu; } - qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus)); - CPU_FOREACH(cpu) { - if (cpu->unplug && !cpu_can_run(cpu)) { - remove_cpu = cpu; + + while (cpu && !cpu->queued_work_first && !cpu->exit_request) { + + atomic_mb_set(&tcg_current_rr_cpu, cpu); + current_cpu = cpu; + + qemu_clock_enable(QEMU_CLOCK_VIRTUAL, + (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0); + + if (cpu_can_run(cpu)) { + int r; + + prepare_icount_for_run(cpu); + + r = tcg_cpu_exec(cpu); + + process_icount_data(cpu); + + if (r == EXCP_DEBUG) { + cpu_handle_guest_debug(cpu); + break; + } else if (r == EXCP_ATOMIC) { + qemu_mutex_unlock_iothread(); + cpu_exec_step_atomic(cpu); + qemu_mutex_lock_iothread(); + break; + } + } else if (cpu->stop) { + if (cpu->unplug) { + cpu = CPU_NEXT(cpu); + } break; } + + cpu = CPU_NEXT(cpu); + } /* while (cpu && !cpu->exit_request).. */ + + /* Does not need atomic_mb_set because a spurious wakeup is okay. */ + atomic_set(&tcg_current_rr_cpu, NULL); + + if (cpu && cpu->exit_request) { + atomic_mb_set(&cpu->exit_request, 0); } - if (remove_cpu) { - qemu_tcg_destroy_vcpu(remove_cpu); - cpu->created = false; - qemu_cond_signal(&qemu_cpu_cond); - remove_cpu = NULL; + + qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus)); + deal_with_unplugged_cpus(); + } + + return NULL; +} + +static void *qemu_hax_cpu_thread_fn(void *arg) +{ + CPUState *cpu = arg; + int r; + + qemu_mutex_lock_iothread(); + qemu_thread_get_self(cpu->thread); + + cpu->thread_id = qemu_get_thread_id(); + cpu->created = true; + cpu->halted = 0; + current_cpu = cpu; + + hax_init_vcpu(cpu); + qemu_cond_signal(&qemu_cpu_cond); + + while (1) { + if (cpu_can_run(cpu)) { + r = hax_smp_cpu_exec(cpu); + if (r == EXCP_DEBUG) { + cpu_handle_guest_debug(cpu); + } + } + + while (cpu_thread_is_idle(cpu)) { + qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex); } +#ifdef _WIN32 + SleepEx(0, TRUE); +#endif + qemu_wait_io_event_common(cpu); + } + return NULL; +} + +#ifdef _WIN32 +static void CALLBACK dummy_apc_func(ULONG_PTR unused) +{ +} +#endif + +/* Multi-threaded TCG + * + * In the multi-threaded case each vCPU has its own thread. The TLS + * variable current_cpu can be used deep in the code to find the + * current CPUState for a given thread. + */ + +static void *qemu_tcg_cpu_thread_fn(void *arg) +{ + CPUState *cpu = arg; + + g_assert(!use_icount); + + rcu_register_thread(); + + qemu_mutex_lock_iothread(); + qemu_thread_get_self(cpu->thread); + + cpu->thread_id = qemu_get_thread_id(); + cpu->created = true; + cpu->can_do_io = 1; + current_cpu = cpu; + qemu_cond_signal(&qemu_cpu_cond); + + /* process any pending work */ + cpu->exit_request = 1; + + while (1) { + if (cpu_can_run(cpu)) { + int r; + r = tcg_cpu_exec(cpu); + switch (r) { + case EXCP_DEBUG: + cpu_handle_guest_debug(cpu); + break; + case EXCP_HALTED: + /* during start-up the vCPU is reset and the thread is + * kicked several times. If we don't ensure we go back + * to sleep in the halted state we won't cleanly + * start-up when the vCPU is enabled. + * + * cpu->halted should ensure we sleep in wait_io_event + */ + g_assert(cpu->halted); + break; + case EXCP_ATOMIC: + qemu_mutex_unlock_iothread(); + cpu_exec_step_atomic(cpu); + qemu_mutex_lock_iothread(); + default: + /* Ignore everything else? 
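
/*
 * Illustrative sketch (not part of the patch): qemu_tcg_rr_cpu_thread_fn()
 * above keeps a cursor into the vCPU list, runs each runnable vCPU in turn,
 * and wraps back to the first one when it falls off the end; cpu->exit_request
 * (set by the kick timer or the I/O thread) is what breaks it out of the inner
 * loop so no single vCPU can monopolise the thread.  Skeleton of that control
 * flow with invented types; execution and I/O-event waiting are stubbed out.
 */
#include <stdbool.h>
#include <stddef.h>

typedef struct DemoCPU {
    bool exit_request;
    bool can_run;
    struct DemoCPU *next;                  /* models CPU_NEXT() */
} DemoCPU;

static int demo_exec(DemoCPU *cpu) { (void)cpu; return 0; }   /* stand-in for tcg_cpu_exec() */
static void demo_wait_io_event(DemoCPU *cpu) { (void)cpu; }

static void demo_rr_loop(DemoCPU *first_cpu)
{
    DemoCPU *cpu = first_cpu;

    for (;;) {
        if (!cpu) {
            cpu = first_cpu;               /* wrap around the list */
        }

        while (cpu && !cpu->exit_request) {
            if (cpu->can_run) {
                demo_exec(cpu);            /* in QEMU a kick may set exit_request here */
            }
            cpu = cpu->next;               /* round-robin to the next vCPU */
        }

        if (cpu && cpu->exit_request) {
            cpu->exit_request = false;     /* acknowledge the kick */
        }

        demo_wait_io_event(cpu ? cpu : first_cpu);
    }
}
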
*/ + break; + } + } + + atomic_mb_set(&cpu->exit_request, 0); + qemu_tcg_wait_io_event(cpu); } return NULL; @@ -1210,29 +1507,31 @@ static void qemu_cpu_kick_thread(CPUState *cpu) exit(1); } #else /* _WIN32 */ - abort(); -#endif -} - -static void qemu_cpu_kick_no_halt(void) -{ - CPUState *cpu; - /* Ensure whatever caused the exit has reached the CPU threads before - * writing exit_request. - */ - atomic_mb_set(&exit_request, 1); - cpu = atomic_mb_read(&tcg_current_cpu); - if (cpu) { - cpu_exit(cpu); + if (!qemu_cpu_is_self(cpu)) { + if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) { + fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n", + __func__, GetLastError()); + exit(1); + } } +#endif } void qemu_cpu_kick(CPUState *cpu) { qemu_cond_broadcast(cpu->halt_cond); if (tcg_enabled()) { - qemu_cpu_kick_no_halt(); + cpu_exit(cpu); + /* NOP unless doing single-thread RR */ + qemu_cpu_kick_rr_cpu(); } else { + if (hax_enabled()) { + /* + * FIXME: race condition with the exit_request check in + * hax_vcpu_hax_exec + */ + cpu->exit_request = 1; + } qemu_cpu_kick_thread(cpu); } } @@ -1262,42 +1561,29 @@ bool qemu_mutex_iothread_locked(void) void qemu_mutex_lock_iothread(void) { - atomic_inc(&iothread_requesting_mutex); - /* In the simple case there is no need to bump the VCPU thread out of - * TCG code execution. - */ - if (!tcg_enabled() || qemu_in_vcpu_thread() || - !first_cpu || !first_cpu->created) { - qemu_mutex_lock(&qemu_global_mutex); - atomic_dec(&iothread_requesting_mutex); - } else { - if (qemu_mutex_trylock(&qemu_global_mutex)) { - qemu_cpu_kick_no_halt(); - qemu_mutex_lock(&qemu_global_mutex); - } - atomic_dec(&iothread_requesting_mutex); - qemu_cond_broadcast(&qemu_io_proceeded_cond); - } + g_assert(!qemu_mutex_iothread_locked()); + qemu_mutex_lock(&qemu_global_mutex); iothread_locked = true; } void qemu_mutex_unlock_iothread(void) { + g_assert(qemu_mutex_iothread_locked()); iothread_locked = false; qemu_mutex_unlock(&qemu_global_mutex); } -static int all_vcpus_paused(void) +static bool all_vcpus_paused(void) { CPUState *cpu; CPU_FOREACH(cpu) { if (!cpu->stopped) { - return 0; + return false; } } - return 1; + return true; } void pause_all_vcpus(void) @@ -1312,13 +1598,6 @@ void pause_all_vcpus(void) if (qemu_in_vcpu_thread()) { cpu_stop_current(); - if (!kvm_enabled()) { - CPU_FOREACH(cpu) { - cpu->stop = false; - cpu->stopped = true; - } - return; - } } while (!all_vcpus_paused()) { @@ -1367,29 +1646,63 @@ void cpu_remove_sync(CPUState *cpu) static void qemu_tcg_init_vcpu(CPUState *cpu) { char thread_name[VCPU_THREAD_NAME_SIZE]; - static QemuCond *tcg_halt_cond; - static QemuThread *tcg_cpu_thread; + static QemuCond *single_tcg_halt_cond; + static QemuThread *single_tcg_cpu_thread; - /* share a single thread for all cpus with TCG */ - if (!tcg_cpu_thread) { + if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) { cpu->thread = g_malloc0(sizeof(QemuThread)); cpu->halt_cond = g_malloc0(sizeof(QemuCond)); qemu_cond_init(cpu->halt_cond); - tcg_halt_cond = cpu->halt_cond; - snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG", + + if (qemu_tcg_mttcg_enabled()) { + /* create a thread per vCPU with TCG (MTTCG) */ + parallel_cpus = true; + snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG", cpu->cpu_index); - qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn, - cpu, QEMU_THREAD_JOINABLE); + + qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn, + cpu, QEMU_THREAD_JOINABLE); + + } else { + /* share a single thread for all cpus with 
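
/*
 * Illustrative sketch (not part of the patch): with the old
 * iothread_requesting_mutex dance gone, qemu_mutex_lock_iothread() and
 * qemu_mutex_unlock_iothread() above reduce to a plain mutex plus a
 * per-thread "do I hold it?" flag that the g_assert()s check.  Minimal
 * pthread model of that shape; the thread-local flag mirrors what the real
 * qemu_mutex_iothread_locked() reports.
 */
#include <assert.h>
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t demo_bql = PTHREAD_MUTEX_INITIALIZER;
static __thread bool demo_bql_locked;     /* per-thread ownership flag */

static bool demo_bql_held(void)
{
    return demo_bql_locked;
}

static void demo_bql_lock(void)
{
    assert(!demo_bql_held());             /* no recursive locking */
    pthread_mutex_lock(&demo_bql);
    demo_bql_locked = true;
}

static void demo_bql_unlock(void)
{
    assert(demo_bql_held());              /* only the owner may unlock */
    demo_bql_locked = false;
    pthread_mutex_unlock(&demo_bql);
}
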
TCG */ + snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG"); + qemu_thread_create(cpu->thread, thread_name, + qemu_tcg_rr_cpu_thread_fn, + cpu, QEMU_THREAD_JOINABLE); + + single_tcg_halt_cond = cpu->halt_cond; + single_tcg_cpu_thread = cpu->thread; + } #ifdef _WIN32 cpu->hThread = qemu_thread_get_handle(cpu->thread); #endif while (!cpu->created) { qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex); } - tcg_cpu_thread = cpu->thread; } else { - cpu->thread = tcg_cpu_thread; - cpu->halt_cond = tcg_halt_cond; + /* For non-MTTCG cases we share the thread */ + cpu->thread = single_tcg_cpu_thread; + cpu->halt_cond = single_tcg_halt_cond; + } +} + +static void qemu_hax_start_vcpu(CPUState *cpu) +{ + char thread_name[VCPU_THREAD_NAME_SIZE]; + + cpu->thread = g_malloc0(sizeof(QemuThread)); + cpu->halt_cond = g_malloc0(sizeof(QemuCond)); + qemu_cond_init(cpu->halt_cond); + + snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX", + cpu->cpu_index); + qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn, + cpu, QEMU_THREAD_JOINABLE); +#ifdef _WIN32 + cpu->hThread = qemu_thread_get_handle(cpu->thread); +#endif + while (!cpu->created) { + qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex); } } @@ -1443,6 +1756,8 @@ void qemu_init_vcpu(CPUState *cpu) if (kvm_enabled()) { qemu_kvm_start_vcpu(cpu); + } else if (hax_enabled()) { + qemu_hax_start_vcpu(cpu); } else if (tcg_enabled()) { qemu_tcg_init_vcpu(cpu); } else { @@ -1476,116 +1791,62 @@ int vm_stop(RunState state) return do_vm_stop(state); } -/* does a state transition even if the VM is already stopped, - current state is forgotten forever */ -int vm_stop_force_state(RunState state) +/** + * Prepare for (re)starting the VM. + * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already + * running or in case of an error condition), 0 otherwise. + */ +int vm_prepare_start(void) { - if (runstate_is_running()) { - return vm_stop(state); - } else { - runstate_set(state); + RunState requested; + int res = 0; - bdrv_drain_all(); - /* Make sure to return an error if the flush in a previous vm_stop() - * failed. */ - return blk_flush_all(); + qemu_vmstop_requested(&requested); + if (runstate_is_running() && requested == RUN_STATE__MAX) { + return -1; } -} -static int64_t tcg_get_icount_limit(void) -{ - int64_t deadline; - - if (replay_mode != REPLAY_MODE_PLAY) { - deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL); - - /* Maintain prior (possibly buggy) behaviour where if no deadline - * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than - * INT32_MAX nanoseconds ahead, we still use INT32_MAX - * nanoseconds. - */ - if ((deadline < 0) || (deadline > INT32_MAX)) { - deadline = INT32_MAX; - } - - return qemu_icount_round(deadline); + /* Ensure that a STOP/RESUME pair of events is emitted if a + * vmstop request was pending. The BLOCK_IO_ERROR event, for + * example, according to documentation is always followed by + * the STOP event. 
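
/*
 * Illustrative sketch (not part of the patch): qemu_tcg_init_vcpu() above
 * either gives every vCPU its own thread (MTTCG) or points all later vCPUs
 * at the thread and halt condition created for the first one (single-threaded
 * round-robin).  Skeleton of that decision with pthreads and invented names;
 * the thread functions are stubs.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

typedef struct {
    pthread_t *thread;
    pthread_cond_t *halt_cond;
} DemoVCPU;

static void *demo_mttcg_thread_fn(void *arg) { (void)arg; return NULL; }
static void *demo_rr_thread_fn(void *arg)    { (void)arg; return NULL; }

static void demo_init_vcpu(DemoVCPU *cpu, bool mttcg)
{
    static pthread_t *single_thread;        /* shared by all vCPUs in RR mode */
    static pthread_cond_t *single_halt_cond;

    if (mttcg || !single_thread) {
        cpu->thread = calloc(1, sizeof(*cpu->thread));
        cpu->halt_cond = calloc(1, sizeof(*cpu->halt_cond));
        pthread_cond_init(cpu->halt_cond, NULL);

        if (mttcg) {
            /* one execution thread per vCPU */
            pthread_create(cpu->thread, NULL, demo_mttcg_thread_fn, cpu);
        } else {
            /* the first vCPU creates the shared round-robin thread */
            pthread_create(cpu->thread, NULL, demo_rr_thread_fn, cpu);
            single_thread = cpu->thread;
            single_halt_cond = cpu->halt_cond;
        }
    } else {
        /* later vCPUs in RR mode just reuse the shared thread */
        cpu->thread = single_thread;
        cpu->halt_cond = single_halt_cond;
    }
}
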
+ */ + if (runstate_is_running()) { + qapi_event_send_stop(&error_abort); + res = -1; } else { - return replay_get_instructions(); + replay_enable_events(); + cpu_enable_ticks(); + runstate_set(RUN_STATE_RUNNING); + vm_state_notify(1, RUN_STATE_RUNNING); } + + /* We are sending this now, but the CPUs will be resumed shortly later */ + qapi_event_send_resume(&error_abort); + return res; } -static int tcg_cpu_exec(CPUState *cpu) +void vm_start(void) { - int ret; -#ifdef CONFIG_PROFILER - int64_t ti; -#endif - -#ifdef CONFIG_PROFILER - ti = profile_getclock(); -#endif - if (use_icount) { - int64_t count; - int decr; - timers_state.qemu_icount -= (cpu->icount_decr.u16.low - + cpu->icount_extra); - cpu->icount_decr.u16.low = 0; - cpu->icount_extra = 0; - count = tcg_get_icount_limit(); - timers_state.qemu_icount += count; - decr = (count > 0xffff) ? 0xffff : count; - count -= decr; - cpu->icount_decr.u16.low = decr; - cpu->icount_extra = count; + if (!vm_prepare_start()) { + resume_all_vcpus(); } - ret = cpu_exec(cpu); -#ifdef CONFIG_PROFILER - tcg_time += profile_getclock() - ti; -#endif - if (use_icount) { - /* Fold pending instructions back into the - instruction counter, and clear the interrupt flag. */ - timers_state.qemu_icount -= (cpu->icount_decr.u16.low - + cpu->icount_extra); - cpu->icount_decr.u32 = 0; - cpu->icount_extra = 0; - replay_account_executed_instructions(); - } - return ret; } -static void tcg_exec_all(void) +/* does a state transition even if the VM is already stopped, + current state is forgotten forever */ +int vm_stop_force_state(RunState state) { - int r; - - /* Account partial waits to QEMU_CLOCK_VIRTUAL. */ - qemu_account_warp_timer(); - - if (next_cpu == NULL) { - next_cpu = first_cpu; - } - for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) { - CPUState *cpu = next_cpu; - - qemu_clock_enable(QEMU_CLOCK_VIRTUAL, - (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0); + if (runstate_is_running()) { + return vm_stop(state); + } else { + runstate_set(state); - if (cpu_can_run(cpu)) { - r = tcg_cpu_exec(cpu); - if (r == EXCP_DEBUG) { - cpu_handle_guest_debug(cpu); - break; - } - } else if (cpu->stop || cpu->stopped) { - if (cpu->unplug) { - next_cpu = CPU_NEXT(cpu); - } - break; - } + bdrv_drain_all(); + /* Make sure to return an error if the flush in a previous vm_stop() + * failed. */ + return bdrv_flush_all(); } - - /* Pairs with smp_wmb in qemu_cpu_kick. */ - atomic_mb_set(&exit_request, 0); } void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
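
/*
 * Illustrative sketch (not part of the patch): vm_start() above now calls
 * vm_prepare_start(), which performs the state transition and event emission
 * and reports through its return value whether the vCPUs should actually be
 * resumed (0) or left alone (-1, e.g. the VM is already running).  Simplified
 * skeleton of that contract with invented names and stubbed notifications;
 * the pending-vmstop handling of the real function is omitted.
 */
#include <stdbool.h>
#include <stdio.h>

static bool demo_running;

static int demo_vm_prepare_start(void)
{
    if (demo_running) {
        return -1;                       /* already running: nothing to resume */
    }
    demo_running = true;                 /* runstate_set(RUN_STATE_RUNNING) */
    printf("state notify: RUNNING\n");   /* stand-in for vm_state_notify(1, ...) */
    return 0;
}

static void demo_vm_start(void)
{
    if (!demo_vm_prepare_start()) {
        printf("resume_all_vcpus()\n");  /* only when prepare said "go" */
    }
}

int main(void)
{
    demo_vm_start();   /* first start: prepares and resumes the vCPUs */
    demo_vm_start();   /* second start: prepare returns -1, no resume */
    return 0;
}
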