X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=cpus.c;h=e83f72b48b4bf21e93b8588f953659e64eb20645;hb=bea0279b72fcd7bf5b5305fabf3a3cbee3a3eaa6;hp=d1f16296de7ef526ca0d0ce403b3bc8e70d5679a;hpb=bbd87423ea0c436c55bbc3f9c23d4f811d1f3f29;p=mirror_qemu.git diff --git a/cpus.c b/cpus.c index d1f16296de..e83f72b48b 100644 --- a/cpus.c +++ b/cpus.c @@ -121,8 +121,6 @@ static bool all_cpu_threads_idle(void) /* Protected by TimersState seqlock */ static bool icount_sleep = true; -/* Conversion factor from emulated instructions to virtual clock ticks. */ -static int icount_time_shift; /* Arbitrarily pick 1MIPS as the minimum allowable speed. */ #define MAX_ICOUNT_SHIFT 10 @@ -131,20 +129,27 @@ typedef struct TimersState { int64_t cpu_ticks_prev; int64_t cpu_ticks_offset; - /* cpu_clock_offset can be read out of BQL, so protect it with - * this lock. + /* Protect fields that can be respectively read outside the + * BQL, and written from multiple threads. */ QemuSeqLock vm_clock_seqlock; - int64_t cpu_clock_offset; - int32_t cpu_ticks_enabled; - int64_t dummy; + QemuSpin vm_clock_lock; + + int16_t cpu_ticks_enabled; + + /* Conversion factor from emulated instructions to virtual clock ticks. */ + int16_t icount_time_shift; /* Compensate for varying guest execution speed. */ int64_t qemu_icount_bias; + + int64_t vm_clock_warp_start; + int64_t cpu_clock_offset; + /* Only written by TCG thread */ int64_t qemu_icount; + /* for adjusting icount */ - int64_t vm_clock_warp_start; QEMUTimer *icount_rt_timer; QEMUTimer *icount_vm_timer; QEMUTimer *icount_warp_timer; @@ -206,12 +211,12 @@ void qemu_tcg_configure(QemuOpts *opts, Error **errp) error_setg(errp, "No MTTCG when icount is enabled"); } else { #ifndef TARGET_SUPPORTS_MTTCG - error_report("Guest not yet converted to MTTCG - " - "you may get unexpected results"); + warn_report("Guest not yet converted to MTTCG - " + "you may get unexpected results"); #endif if (!check_tcg_memory_orders_compatible()) { - error_report("Guest expects a stronger memory ordering " - "than the host provides"); + warn_report("Guest expects a stronger memory ordering " + "than the host provides"); error_printf("This may cause strange/hard to debug errors\n"); } mttcg_enabled = true; @@ -240,21 +245,30 @@ static int64_t cpu_get_icount_executed(CPUState *cpu) * account executed instructions. This is done by the TCG vCPU * thread so the main-loop can see time has moved forward. */ -void cpu_update_icount(CPUState *cpu) +static void cpu_update_icount_locked(CPUState *cpu) { int64_t executed = cpu_get_icount_executed(cpu); cpu->icount_budget -= executed; -#ifdef CONFIG_ATOMIC64 - atomic_set__nocheck(&timers_state.qemu_icount, - atomic_read__nocheck(&timers_state.qemu_icount) + - executed); -#else /* FIXME: we need 64bit atomics to do this safely */ - timers_state.qemu_icount += executed; -#endif + atomic_set_i64(&timers_state.qemu_icount, + timers_state.qemu_icount + executed); } -int64_t cpu_get_icount_raw(void) +/* + * Update the global shared timer_state.qemu_icount to take into + * account executed instructions. This is done by the TCG vCPU + * thread so the main-loop can see time has moved forward. 
+ */ +void cpu_update_icount(CPUState *cpu) +{ + seqlock_write_lock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); + cpu_update_icount_locked(cpu); + seqlock_write_unlock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); +} + +static int64_t cpu_get_icount_raw_locked(void) { CPUState *cpu = current_cpu; @@ -264,22 +278,33 @@ int64_t cpu_get_icount_raw(void) exit(1); } /* Take into account what has run */ - cpu_update_icount(cpu); + cpu_update_icount_locked(cpu); } -#ifdef CONFIG_ATOMIC64 - return atomic_read__nocheck(&timers_state.qemu_icount); -#else /* FIXME: we need 64bit atomics to do this safely */ - return timers_state.qemu_icount; -#endif + /* The read is protected by the seqlock, but needs atomic64 to avoid UB */ + return atomic_read_i64(&timers_state.qemu_icount); } -/* Return the virtual CPU time, based on the instruction counter. */ static int64_t cpu_get_icount_locked(void) { - int64_t icount = cpu_get_icount_raw(); - return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount); + int64_t icount = cpu_get_icount_raw_locked(); + return atomic_read_i64(&timers_state.qemu_icount_bias) + + cpu_icount_to_ns(icount); } +int64_t cpu_get_icount_raw(void) +{ + int64_t icount; + unsigned start; + + do { + start = seqlock_read_begin(&timers_state.vm_clock_seqlock); + icount = cpu_get_icount_raw_locked(); + } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start)); + + return icount; +} + +/* Return the virtual CPU time, based on the instruction counter. */ int64_t cpu_get_icount(void) { int64_t icount; @@ -295,14 +320,29 @@ int64_t cpu_get_icount(void) int64_t cpu_icount_to_ns(int64_t icount) { - return icount << icount_time_shift; + return icount << atomic_read(&timers_state.icount_time_shift); +} + +static int64_t cpu_get_ticks_locked(void) +{ + int64_t ticks = timers_state.cpu_ticks_offset; + if (timers_state.cpu_ticks_enabled) { + ticks += cpu_get_host_ticks(); + } + + if (timers_state.cpu_ticks_prev > ticks) { + /* Non increasing ticks may happen if the host uses software suspend. */ + timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks; + ticks = timers_state.cpu_ticks_prev; + } + + timers_state.cpu_ticks_prev = ticks; + return ticks; } /* return the time elapsed in VM between vm_start and vm_stop. Unless * icount is active, cpu_get_ticks() uses units of the host CPU cycle * counter. - * - * Caller must hold the BQL */ int64_t cpu_get_ticks(void) { @@ -312,19 +352,9 @@ int64_t cpu_get_ticks(void) return cpu_get_icount(); } - ticks = timers_state.cpu_ticks_offset; - if (timers_state.cpu_ticks_enabled) { - ticks += cpu_get_host_ticks(); - } - - if (timers_state.cpu_ticks_prev > ticks) { - /* Note: non increasing ticks may happen if the host uses - software suspend */ - timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks; - ticks = timers_state.cpu_ticks_prev; - } - - timers_state.cpu_ticks_prev = ticks; + qemu_spin_lock(&timers_state.vm_clock_lock); + ticks = cpu_get_ticks_locked(); + qemu_spin_unlock(&timers_state.vm_clock_lock); return ticks; } @@ -361,14 +391,15 @@ int64_t cpu_get_clock(void) */ void cpu_enable_ticks(void) { - /* Here, the really thing protected by seqlock is cpu_clock_offset. 
*/ - seqlock_write_begin(&timers_state.vm_clock_seqlock); + seqlock_write_lock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); if (!timers_state.cpu_ticks_enabled) { timers_state.cpu_ticks_offset -= cpu_get_host_ticks(); timers_state.cpu_clock_offset -= get_clock(); timers_state.cpu_ticks_enabled = 1; } - seqlock_write_end(&timers_state.vm_clock_seqlock); + seqlock_write_unlock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); } /* disable cpu_get_ticks() : the clock is stopped. You must not call @@ -377,14 +408,15 @@ void cpu_enable_ticks(void) */ void cpu_disable_ticks(void) { - /* Here, the really thing protected by seqlock is cpu_clock_offset. */ - seqlock_write_begin(&timers_state.vm_clock_seqlock); + seqlock_write_lock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); if (timers_state.cpu_ticks_enabled) { timers_state.cpu_ticks_offset += cpu_get_host_ticks(); timers_state.cpu_clock_offset = cpu_get_clock_locked(); timers_state.cpu_ticks_enabled = 0; } - seqlock_write_end(&timers_state.vm_clock_seqlock); + seqlock_write_unlock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); } /* Correlation between real and virtual time is always going to be @@ -407,7 +439,8 @@ static void icount_adjust(void) return; } - seqlock_write_begin(&timers_state.vm_clock_seqlock); + seqlock_write_lock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); cur_time = cpu_get_clock_locked(); cur_icount = cpu_get_icount_locked(); @@ -415,20 +448,24 @@ static void icount_adjust(void) /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */ if (delta > 0 && last_delta + ICOUNT_WOBBLE < delta * 2 - && icount_time_shift > 0) { + && timers_state.icount_time_shift > 0) { /* The guest is getting too far ahead. Slow time down. */ - icount_time_shift--; + atomic_set(&timers_state.icount_time_shift, + timers_state.icount_time_shift - 1); } if (delta < 0 && last_delta - ICOUNT_WOBBLE > delta * 2 - && icount_time_shift < MAX_ICOUNT_SHIFT) { + && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) { /* The guest is getting too far behind. Speed time up. 
*/ - icount_time_shift++; + atomic_set(&timers_state.icount_time_shift, + timers_state.icount_time_shift + 1); } last_delta = delta; - timers_state.qemu_icount_bias = cur_icount - - (timers_state.qemu_icount << icount_time_shift); - seqlock_write_end(&timers_state.vm_clock_seqlock); + atomic_set_i64(&timers_state.qemu_icount_bias, + cur_icount - (timers_state.qemu_icount + << timers_state.icount_time_shift)); + seqlock_write_unlock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); } static void icount_adjust_rt(void *opaque) @@ -448,7 +485,8 @@ static void icount_adjust_vm(void *opaque) static int64_t qemu_icount_round(int64_t count) { - return (count + (1 << icount_time_shift) - 1) >> icount_time_shift; + int shift = atomic_read(&timers_state.icount_time_shift); + return (count + (1 << shift) - 1) >> shift; } static void icount_warp_rt(void) @@ -468,10 +506,11 @@ static void icount_warp_rt(void) return; } - seqlock_write_begin(&timers_state.vm_clock_seqlock); + seqlock_write_lock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); if (runstate_is_running()) { - int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT, - cpu_get_clock_locked()); + int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT, + cpu_get_clock_locked()); int64_t warp_delta; warp_delta = clock - timers_state.vm_clock_warp_start; @@ -484,10 +523,12 @@ static void icount_warp_rt(void) int64_t delta = clock - cur_icount; warp_delta = MIN(warp_delta, delta); } - timers_state.qemu_icount_bias += warp_delta; + atomic_set_i64(&timers_state.qemu_icount_bias, + timers_state.qemu_icount_bias + warp_delta); } timers_state.vm_clock_warp_start = -1; - seqlock_write_end(&timers_state.vm_clock_seqlock); + seqlock_write_unlock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) { qemu_clock_notify(QEMU_CLOCK_VIRTUAL); @@ -512,9 +553,12 @@ void qtest_clock_warp(int64_t dest) int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL); int64_t warp = qemu_soonest_timeout(dest - clock, deadline); - seqlock_write_begin(&timers_state.vm_clock_seqlock); - timers_state.qemu_icount_bias += warp; - seqlock_write_end(&timers_state.vm_clock_seqlock); + seqlock_write_lock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); + atomic_set_i64(&timers_state.qemu_icount_bias, + timers_state.qemu_icount_bias + warp); + seqlock_write_unlock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL); timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]); @@ -539,18 +583,29 @@ void qemu_start_warp_timer(void) return; } - /* warp clock deterministically in record/replay mode */ - if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) { - return; - } + if (replay_mode != REPLAY_MODE_PLAY) { + if (!all_cpu_threads_idle()) { + return; + } - if (!all_cpu_threads_idle()) { - return; - } + if (qtest_enabled()) { + /* When testing, qtest commands advance icount. */ + return; + } - if (qtest_enabled()) { - /* When testing, qtest commands advance icount. */ - return; + replay_checkpoint(CHECKPOINT_CLOCK_WARP_START); + } else { + /* warp clock deterministically in record/replay mode */ + if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) { + /* vCPU is sleeping and warp can't be started. + It is probably a race condition: notification sent + to vCPU was processed in advance and vCPU went to sleep. + Therefore we have to wake it up for doing someting. 
*/ + if (replay_has_checkpoint()) { + qemu_clock_notify(QEMU_CLOCK_VIRTUAL); + } + return; + } } /* We want to use the earliest deadline from ALL vm_clocks */ @@ -581,9 +636,12 @@ void qemu_start_warp_timer(void) * It is useful when we want a deterministic execution time, * isolated from host latencies. */ - seqlock_write_begin(&timers_state.vm_clock_seqlock); - timers_state.qemu_icount_bias += deadline; - seqlock_write_end(&timers_state.vm_clock_seqlock); + seqlock_write_lock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); + atomic_set_i64(&timers_state.qemu_icount_bias, + timers_state.qemu_icount_bias + deadline); + seqlock_write_unlock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); qemu_clock_notify(QEMU_CLOCK_VIRTUAL); } else { /* @@ -594,12 +652,14 @@ void qemu_start_warp_timer(void) * you will not be sending network packets continuously instead of * every 100ms. */ - seqlock_write_begin(&timers_state.vm_clock_seqlock); + seqlock_write_lock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); if (timers_state.vm_clock_warp_start == -1 || timers_state.vm_clock_warp_start > clock) { timers_state.vm_clock_warp_start = clock; } - seqlock_write_end(&timers_state.vm_clock_seqlock); + seqlock_write_unlock(&timers_state.vm_clock_seqlock, + &timers_state.vm_clock_lock); timer_mod_anticipate(timers_state.icount_warp_timer, clock + deadline); } @@ -700,7 +760,7 @@ static const VMStateDescription vmstate_timers = { .minimum_version_id = 1, .fields = (VMStateField[]) { VMSTATE_INT64(cpu_ticks_offset, TimersState), - VMSTATE_INT64(dummy, TimersState), + VMSTATE_UNUSED(8), VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2), VMSTATE_END_OF_LIST() }, @@ -781,6 +841,7 @@ int cpu_throttle_get_percentage(void) void cpu_ticks_init(void) { seqlock_init(&timers_state.vm_clock_seqlock); + qemu_spin_init(&timers_state.vm_clock_lock); vmstate_register(NULL, 0, &vmstate_timers, &timers_state); throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT, cpu_throttle_timer_tick, NULL); @@ -812,7 +873,7 @@ void configure_icount(QemuOpts *opts, Error **errp) } if (strcmp(option, "auto") != 0) { errno = 0; - icount_time_shift = strtol(option, &rem_str, 0); + timers_state.icount_time_shift = strtol(option, &rem_str, 0); if (errno != 0 || *rem_str != '\0' || !strlen(option)) { error_setg(errp, "icount: Invalid shift value"); } @@ -828,7 +889,7 @@ void configure_icount(QemuOpts *opts, Error **errp) /* 125MIPS seems a reasonable initial guess at the guest speed. It will be corrected fairly quickly anyway. */ - icount_time_shift = 3; + timers_state.icount_time_shift = 3; /* Have both realtime and virtual time triggers for speed adjustment. 
The realtime trigger catches emulated time passing too slowly, @@ -922,6 +983,8 @@ static void start_tcg_kick_timer(void) if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) { tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, kick_tcg_thread, NULL); + } + if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) { timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick()); } } @@ -929,9 +992,8 @@ static void start_tcg_kick_timer(void) static void stop_tcg_kick_timer(void) { assert(!mttcg_enabled); - if (tcg_kick_vcpu_timer) { + if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) { timer_del(tcg_kick_vcpu_timer); - tcg_kick_vcpu_timer = NULL; } } @@ -1011,7 +1073,7 @@ static int do_vm_stop(RunState state, bool send_stop) runstate_set(state); vm_state_notify(0, state); if (send_stop) { - qapi_event_send_stop(&error_abort); + qapi_event_send_stop(); } } @@ -1158,16 +1220,20 @@ static void qemu_wait_io_event_common(CPUState *cpu) process_queued_cpu_work(cpu); } -static void qemu_tcg_rr_wait_io_event(CPUState *cpu) +static void qemu_tcg_rr_wait_io_event(void) { + CPUState *cpu; + while (all_cpu_threads_idle()) { stop_tcg_kick_timer(); - qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex); + qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex); } start_tcg_kick_timer(); - qemu_wait_io_event_common(cpu); + CPU_FOREACH(cpu) { + qemu_wait_io_event_common(cpu); + } } static void qemu_wait_io_event(CPUState *cpu) @@ -1267,6 +1333,7 @@ static void *qemu_dummy_cpu_thread_fn(void *arg) qemu_wait_io_event(cpu); } while (!cpu->unplug); + qemu_mutex_unlock_iothread(); rcu_unregister_thread(); return NULL; #endif @@ -1355,6 +1422,7 @@ static int tcg_cpu_exec(CPUState *cpu) int64_t ti; #endif + assert(tcg_enabled()); #ifdef CONFIG_PROFILER ti = profile_getclock(); #endif @@ -1362,7 +1430,8 @@ static int tcg_cpu_exec(CPUState *cpu) ret = cpu_exec(cpu); cpu_exec_end(cpu); #ifdef CONFIG_PROFILER - tcg_time += profile_getclock() - ti; + atomic_set(&tcg_ctx->prof.cpu_exec_time, + tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti); #endif return ret; } @@ -1397,6 +1466,7 @@ static void *qemu_tcg_rr_cpu_thread_fn(void *arg) { CPUState *cpu = arg; + assert(tcg_enabled()); rcu_register_thread(); tcg_register_thread(); @@ -1489,7 +1559,15 @@ static void *qemu_tcg_rr_cpu_thread_fn(void *arg) atomic_mb_set(&cpu->exit_request, 0); } - qemu_tcg_rr_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus)); + if (use_icount && all_cpu_threads_idle()) { + /* + * When all cpus are sleeping (e.g in WFI), to avoid a deadlock + * in the main_loop, wake it up in order to start the warp timer. + */ + qemu_notify_event(); + } + + qemu_tcg_rr_wait_io_event(); deal_with_unplugged_cpus(); } @@ -1631,6 +1709,7 @@ static void *qemu_tcg_cpu_thread_fn(void *arg) { CPUState *cpu = arg; + assert(tcg_enabled()); g_assert(!use_icount); rcu_register_thread(); @@ -1700,7 +1779,7 @@ static void qemu_cpu_kick_thread(CPUState *cpu) } cpu->thread_kicked = true; err = pthread_kill(cpu->thread->thread, SIG_IPI); - if (err) { + if (err && err != ESRCH) { fprintf(stderr, "qemu:%s: %s", __func__, strerror(err)); exit(1); } @@ -1759,10 +1838,16 @@ bool qemu_mutex_iothread_locked(void) return iothread_locked; } -void qemu_mutex_lock_iothread(void) +/* + * The BQL is taken from so many places that it is worth profiling the + * callers directly, instead of funneling them all through a single function. 
+ */ +void qemu_mutex_lock_iothread_impl(const char *file, int line) { + QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func); + g_assert(!qemu_mutex_iothread_locked()); - qemu_mutex_lock(&qemu_global_mutex); + bql_lock(&qemu_global_mutex, file, line); iothread_locked = true; } @@ -1854,6 +1939,7 @@ static void qemu_tcg_init_vcpu(CPUState *cpu) static QemuThread *single_tcg_cpu_thread; static int tcg_region_inited; + assert(tcg_enabled()); /* * Initialize TCG regions--once. Now is a good time, because: * (1) TCG's init context, prologue and target globals have been set up. @@ -2015,7 +2101,8 @@ void qemu_init_vcpu(CPUState *cpu) void cpu_stop_current(void) { if (current_cpu) { - qemu_cpu_stop(current_cpu, true); + current_cpu->stop = true; + cpu_exit(current_cpu); } } @@ -2055,13 +2142,13 @@ int vm_prepare_start(void) * the STOP event. */ if (runstate_is_running()) { - qapi_event_send_stop(&error_abort); - qapi_event_send_resume(&error_abort); + qapi_event_send_stop(); + qapi_event_send_resume(); return -1; } /* We are sending this now, but the CPUs will be resumed shortly later */ - qapi_event_send_resume(&error_abort); + qapi_event_send_resume(); replay_enable_events(); cpu_enable_ticks(); @@ -2199,7 +2286,6 @@ static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target) return CPU_INFO_ARCH_X86; case SYS_EMU_TARGET_PPC: - case SYS_EMU_TARGET_PPCEMB: case SYS_EMU_TARGET_PPC64: return CPU_INFO_ARCH_PPC; @@ -2273,8 +2359,6 @@ CpuInfoFastList *qmp_query_cpus_fast(Error **errp) info->value->target = target; if (target == SYS_EMU_TARGET_S390X) { cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu); - } else { - /* do nothing for @CpuInfoOther */ } if (!cur_item) {
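
Editor's note on the locking pattern above: the bulk of this patch converts TimersState from a bare seqlock (seqlock_write_begin()/seqlock_write_end() around plain 64-bit fields) to a seqlock paired with a QemuSpin, taken together through seqlock_write_lock()/seqlock_write_unlock(), with atomic_read_i64()/atomic_set_i64() on the fields that lock-free readers consume (qemu_icount, qemu_icount_bias) so those accesses are not torn on hosts without native 64-bit loads and stores. The following sketch is a minimal, self-contained model of that reader/writer protocol written against C11 atomics with the default sequentially consistent ordering; the Demo* names are invented for illustration, and QEMU's real helpers (qemu/seqlock.h, qemu/atomic.h) are tuned differently, so treat this as a sketch of the idea rather than the implementation.

#include <inttypes.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for QemuSeqLock + QemuSpin; all names are invented. */
typedef struct {
    atomic_uint sequence;   /* odd while a writer is in its critical section */
    atomic_flag spin;       /* serializes concurrent writers                 */
} DemoSeqLock;

typedef struct {
    DemoSeqLock lock;
    _Atomic int64_t icount;        /* plays the role of qemu_icount      */
    _Atomic int64_t icount_bias;   /* plays the role of qemu_icount_bias */
} DemoTimersState;

/* Writer entry: take the spinlock, then make the sequence odd. */
static void demo_write_lock(DemoSeqLock *sl)
{
    while (atomic_flag_test_and_set(&sl->spin)) {
        /* busy-wait: another writer holds the lock */
    }
    atomic_fetch_add(&sl->sequence, 1);
}

/* Writer exit: make the sequence even again, then release the spinlock. */
static void demo_write_unlock(DemoSeqLock *sl)
{
    atomic_fetch_add(&sl->sequence, 1);
    atomic_flag_clear(&sl->spin);
}

/* Reader entry: wait out any in-flight writer and remember the sequence. */
static unsigned demo_read_begin(DemoSeqLock *sl)
{
    unsigned seq;
    do {
        seq = atomic_load(&sl->sequence);
    } while (seq & 1);
    return seq;
}

/* Reader exit: true means a writer raced with the read, so retry it. */
static bool demo_read_retry(DemoSeqLock *sl, unsigned seq)
{
    return atomic_load(&sl->sequence) != seq;
}

/* Writer side, shaped like cpu_update_icount() in the patch. */
static void demo_add_executed(DemoTimersState *ts, int64_t executed)
{
    demo_write_lock(&ts->lock);
    atomic_store(&ts->icount, atomic_load(&ts->icount) + executed);
    demo_write_unlock(&ts->lock);
}

/* Reader side, shaped like cpu_get_icount(): lock-free, retried on races. */
static int64_t demo_get_icount(DemoTimersState *ts)
{
    int64_t val;
    unsigned seq;

    do {
        seq = demo_read_begin(&ts->lock);
        val = atomic_load(&ts->icount) + atomic_load(&ts->icount_bias);
    } while (demo_read_retry(&ts->lock, seq));

    return val;
}

int main(void)
{
    static DemoTimersState ts;        /* static storage: zero-initialized */
    atomic_flag_clear(&ts.lock.spin);

    demo_add_executed(&ts, 1000);
    printf("icount = %" PRId64 "\n", demo_get_icount(&ts));
    return 0;
}

The split buys two things the hunks above rely on: readers such as cpu_get_icount_raw() never block, they merely redo the read when seqlock_read_retry() reports that a writer ran in between; and the new vm_clock_lock spinlock only has to serialize writers (plus cpu_get_ticks(), which updates cpu_ticks_prev under it) against each other.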