]> git.proxmox.com Git - mirror_qemu.git/blame_incremental - cpus.c
docs/devel/memory.txt: Document _with_attrs accessors
[mirror_qemu.git] / cpus.c
... / ...
CommitLineData
1/*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25#include "qemu/osdep.h"
26#include "qemu/config-file.h"
27#include "cpu.h"
28#include "monitor/monitor.h"
29#include "qapi/error.h"
30#include "qapi/qapi-commands-misc.h"
31#include "qapi/qapi-events-run-state.h"
32#include "qapi/qmp/qerror.h"
33#include "qemu/error-report.h"
34#include "sysemu/sysemu.h"
35#include "sysemu/block-backend.h"
36#include "exec/gdbstub.h"
37#include "sysemu/dma.h"
38#include "sysemu/hw_accel.h"
39#include "sysemu/kvm.h"
40#include "sysemu/hax.h"
41#include "sysemu/hvf.h"
42#include "sysemu/whpx.h"
43#include "exec/exec-all.h"
44
45#include "qemu/thread.h"
46#include "sysemu/cpus.h"
47#include "sysemu/qtest.h"
48#include "qemu/main-loop.h"
49#include "qemu/option.h"
50#include "qemu/bitmap.h"
51#include "qemu/seqlock.h"
52#include "tcg.h"
53#include "hw/nmi.h"
54#include "sysemu/replay.h"
55#include "hw/boards.h"
56
57#ifdef CONFIG_LINUX
58
59#include <sys/prctl.h>
60
61#ifndef PR_MCE_KILL
62#define PR_MCE_KILL 33
63#endif
64
65#ifndef PR_MCE_KILL_SET
66#define PR_MCE_KILL_SET 1
67#endif
68
69#ifndef PR_MCE_KILL_EARLY
70#define PR_MCE_KILL_EARLY 1
71#endif
72
73#endif /* CONFIG_LINUX */
74
75int64_t max_delay;
76int64_t max_advance;
77
78/* vcpu throttling controls */
79static QEMUTimer *throttle_timer;
80static unsigned int throttle_percentage;
81
82#define CPU_THROTTLE_PCT_MIN 1
83#define CPU_THROTTLE_PCT_MAX 99
84#define CPU_THROTTLE_TIMESLICE_NS 10000000
85
86bool cpu_is_stopped(CPUState *cpu)
87{
88 return cpu->stopped || !runstate_is_running();
89}
90
91static bool cpu_thread_is_idle(CPUState *cpu)
92{
93 if (cpu->stop || cpu->queued_work_first) {
94 return false;
95 }
96 if (cpu_is_stopped(cpu)) {
97 return true;
98 }
99 if (!cpu->halted || cpu_has_work(cpu) ||
100 kvm_halt_in_kernel()) {
101 return false;
102 }
103 return true;
104}
105
106static bool all_cpu_threads_idle(void)
107{
108 CPUState *cpu;
109
110 CPU_FOREACH(cpu) {
111 if (!cpu_thread_is_idle(cpu)) {
112 return false;
113 }
114 }
115 return true;
116}
117
118/***********************************************************/
119/* guest cycle counter */
120
121/* Protected by TimersState seqlock */
122
123static bool icount_sleep = true;
124/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
125#define MAX_ICOUNT_SHIFT 10
126
127typedef struct TimersState {
128 /* Protected by BQL. */
129 int64_t cpu_ticks_prev;
130 int64_t cpu_ticks_offset;
131
132 /* Protect fields that can be respectively read outside the
133 * BQL, and written from multiple threads.
134 */
135 QemuSeqLock vm_clock_seqlock;
136 QemuSpin vm_clock_lock;
137
138 int16_t cpu_ticks_enabled;
139
140 /* Conversion factor from emulated instructions to virtual clock ticks. */
141 int16_t icount_time_shift;
142
143 /* Compensate for varying guest execution speed. */
144 int64_t qemu_icount_bias;
145
146 int64_t vm_clock_warp_start;
147 int64_t cpu_clock_offset;
148
149 /* Only written by TCG thread */
150 int64_t qemu_icount;
151
152 /* for adjusting icount */
153 QEMUTimer *icount_rt_timer;
154 QEMUTimer *icount_vm_timer;
155 QEMUTimer *icount_warp_timer;
156} TimersState;
157
158static TimersState timers_state;
159bool mttcg_enabled;
160
/*
 * MTTCG defaults to off whenever an option known to be incompatible
 * with it has been enabled.  Beyond that, a guest (target) opts in
 * through its ${target}-softmmu.mak CONFIG flags once it supports:
 *   - atomic instructions
 *   - memory ordering primitives (barriers)
 *
 * For a converted guest architecture two limitations remain:
 *   - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 *   - The host must have a stronger memory order than the guest
 *
 * Supporting strong guests on weak hosts may become possible, but it
 * would mean tagging every guest load/store with its implicit memory
 * order requirements, which would likely slow things down a lot.
 */

/*
 * Return true when every ordering bit the guest requires is also
 * provided by the host.  If either default memory order macro is
 * missing the answer is conservatively false.
 */
static bool check_tcg_memory_orders_compatible(void)
{
    bool compatible = false;

#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
    compatible = (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
#endif
    return compatible;
}
189
190static bool default_mttcg_enabled(void)
191{
192 if (use_icount || TCG_OVERSIZED_GUEST) {
193 return false;
194 } else {
195#ifdef TARGET_SUPPORTS_MTTCG
196 return check_tcg_memory_orders_compatible();
197#else
198 return false;
199#endif
200 }
201}
202
203void qemu_tcg_configure(QemuOpts *opts, Error **errp)
204{
205 const char *t = qemu_opt_get(opts, "thread");
206 if (t) {
207 if (strcmp(t, "multi") == 0) {
208 if (TCG_OVERSIZED_GUEST) {
209 error_setg(errp, "No MTTCG when guest word size > hosts");
210 } else if (use_icount) {
211 error_setg(errp, "No MTTCG when icount is enabled");
212 } else {
213#ifndef TARGET_SUPPORTS_MTTCG
214 error_report("Guest not yet converted to MTTCG - "
215 "you may get unexpected results");
216#endif
217 if (!check_tcg_memory_orders_compatible()) {
218 error_report("Guest expects a stronger memory ordering "
219 "than the host provides");
220 error_printf("This may cause strange/hard to debug errors\n");
221 }
222 mttcg_enabled = true;
223 }
224 } else if (strcmp(t, "single") == 0) {
225 mttcg_enabled = false;
226 } else {
227 error_setg(errp, "Invalid 'thread' setting %s", t);
228 }
229 } else {
230 mttcg_enabled = default_mttcg_enabled();
231 }
232}
233
234/* The current number of executed instructions is based on what we
235 * originally budgeted minus the current state of the decrementing
236 * icount counters in extra/u16.low.
237 */
238static int64_t cpu_get_icount_executed(CPUState *cpu)
239{
240 return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
241}
242
243/*
244 * Update the global shared timer_state.qemu_icount to take into
245 * account executed instructions. This is done by the TCG vCPU
246 * thread so the main-loop can see time has moved forward.
247 */
248static void cpu_update_icount_locked(CPUState *cpu)
249{
250 int64_t executed = cpu_get_icount_executed(cpu);
251 cpu->icount_budget -= executed;
252
253 atomic_set_i64(&timers_state.qemu_icount,
254 timers_state.qemu_icount + executed);
255}
256
257/*
258 * Update the global shared timer_state.qemu_icount to take into
259 * account executed instructions. This is done by the TCG vCPU
260 * thread so the main-loop can see time has moved forward.
261 */
262void cpu_update_icount(CPUState *cpu)
263{
264 seqlock_write_lock(&timers_state.vm_clock_seqlock,
265 &timers_state.vm_clock_lock);
266 cpu_update_icount_locked(cpu);
267 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
268 &timers_state.vm_clock_lock);
269}
270
271static int64_t cpu_get_icount_raw_locked(void)
272{
273 CPUState *cpu = current_cpu;
274
275 if (cpu && cpu->running) {
276 if (!cpu->can_do_io) {
277 error_report("Bad icount read");
278 exit(1);
279 }
280 /* Take into account what has run */
281 cpu_update_icount_locked(cpu);
282 }
283 /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
284 return atomic_read_i64(&timers_state.qemu_icount);
285}
286
287static int64_t cpu_get_icount_locked(void)
288{
289 int64_t icount = cpu_get_icount_raw_locked();
290 return atomic_read_i64(&timers_state.qemu_icount_bias) +
291 cpu_icount_to_ns(icount);
292}
293
294int64_t cpu_get_icount_raw(void)
295{
296 int64_t icount;
297 unsigned start;
298
299 do {
300 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
301 icount = cpu_get_icount_raw_locked();
302 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
303
304 return icount;
305}
306
307/* Return the virtual CPU time, based on the instruction counter. */
308int64_t cpu_get_icount(void)
309{
310 int64_t icount;
311 unsigned start;
312
313 do {
314 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
315 icount = cpu_get_icount_locked();
316 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
317
318 return icount;
319}
320
321int64_t cpu_icount_to_ns(int64_t icount)
322{
323 return icount << atomic_read(&timers_state.icount_time_shift);
324}
325
326static int64_t cpu_get_ticks_locked(void)
327{
328 int64_t ticks = timers_state.cpu_ticks_offset;
329 if (timers_state.cpu_ticks_enabled) {
330 ticks += cpu_get_host_ticks();
331 }
332
333 if (timers_state.cpu_ticks_prev > ticks) {
334 /* Non increasing ticks may happen if the host uses software suspend. */
335 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
336 ticks = timers_state.cpu_ticks_prev;
337 }
338
339 timers_state.cpu_ticks_prev = ticks;
340 return ticks;
341}
342
343/* return the time elapsed in VM between vm_start and vm_stop. Unless
344 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
345 * counter.
346 */
347int64_t cpu_get_ticks(void)
348{
349 int64_t ticks;
350
351 if (use_icount) {
352 return cpu_get_icount();
353 }
354
355 qemu_spin_lock(&timers_state.vm_clock_lock);
356 ticks = cpu_get_ticks_locked();
357 qemu_spin_unlock(&timers_state.vm_clock_lock);
358 return ticks;
359}
360
361static int64_t cpu_get_clock_locked(void)
362{
363 int64_t time;
364
365 time = timers_state.cpu_clock_offset;
366 if (timers_state.cpu_ticks_enabled) {
367 time += get_clock();
368 }
369
370 return time;
371}
372
373/* Return the monotonic time elapsed in VM, i.e.,
374 * the time between vm_start and vm_stop
375 */
376int64_t cpu_get_clock(void)
377{
378 int64_t ti;
379 unsigned start;
380
381 do {
382 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
383 ti = cpu_get_clock_locked();
384 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
385
386 return ti;
387}
388
389/* enable cpu_get_ticks()
390 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
391 */
392void cpu_enable_ticks(void)
393{
394 seqlock_write_lock(&timers_state.vm_clock_seqlock,
395 &timers_state.vm_clock_lock);
396 if (!timers_state.cpu_ticks_enabled) {
397 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
398 timers_state.cpu_clock_offset -= get_clock();
399 timers_state.cpu_ticks_enabled = 1;
400 }
401 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
402 &timers_state.vm_clock_lock);
403}
404
405/* disable cpu_get_ticks() : the clock is stopped. You must not call
406 * cpu_get_ticks() after that.
407 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
408 */
409void cpu_disable_ticks(void)
410{
411 seqlock_write_lock(&timers_state.vm_clock_seqlock,
412 &timers_state.vm_clock_lock);
413 if (timers_state.cpu_ticks_enabled) {
414 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
415 timers_state.cpu_clock_offset = cpu_get_clock_locked();
416 timers_state.cpu_ticks_enabled = 0;
417 }
418 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
419 &timers_state.vm_clock_lock);
420}
421
422/* Correlation between real and virtual time is always going to be
423 fairly approximate, so ignore small variation.
424 When the guest is idle real and virtual time will be aligned in
425 the IO wait loop. */
426#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
427
428static void icount_adjust(void)
429{
430 int64_t cur_time;
431 int64_t cur_icount;
432 int64_t delta;
433
434 /* Protected by TimersState mutex. */
435 static int64_t last_delta;
436
437 /* If the VM is not running, then do nothing. */
438 if (!runstate_is_running()) {
439 return;
440 }
441
442 seqlock_write_lock(&timers_state.vm_clock_seqlock,
443 &timers_state.vm_clock_lock);
444 cur_time = cpu_get_clock_locked();
445 cur_icount = cpu_get_icount_locked();
446
447 delta = cur_icount - cur_time;
448 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
449 if (delta > 0
450 && last_delta + ICOUNT_WOBBLE < delta * 2
451 && timers_state.icount_time_shift > 0) {
452 /* The guest is getting too far ahead. Slow time down. */
453 atomic_set(&timers_state.icount_time_shift,
454 timers_state.icount_time_shift - 1);
455 }
456 if (delta < 0
457 && last_delta - ICOUNT_WOBBLE > delta * 2
458 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
459 /* The guest is getting too far behind. Speed time up. */
460 atomic_set(&timers_state.icount_time_shift,
461 timers_state.icount_time_shift + 1);
462 }
463 last_delta = delta;
464 atomic_set_i64(&timers_state.qemu_icount_bias,
465 cur_icount - (timers_state.qemu_icount
466 << timers_state.icount_time_shift));
467 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
468 &timers_state.vm_clock_lock);
469}
470
471static void icount_adjust_rt(void *opaque)
472{
473 timer_mod(timers_state.icount_rt_timer,
474 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
475 icount_adjust();
476}
477
478static void icount_adjust_vm(void *opaque)
479{
480 timer_mod(timers_state.icount_vm_timer,
481 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
482 NANOSECONDS_PER_SECOND / 10);
483 icount_adjust();
484}
485
486static int64_t qemu_icount_round(int64_t count)
487{
488 int shift = atomic_read(&timers_state.icount_time_shift);
489 return (count + (1 << shift) - 1) >> shift;
490}
491
492static void icount_warp_rt(void)
493{
494 unsigned seq;
495 int64_t warp_start;
496
497 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
498 * changes from -1 to another value, so the race here is okay.
499 */
500 do {
501 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
502 warp_start = timers_state.vm_clock_warp_start;
503 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
504
505 if (warp_start == -1) {
506 return;
507 }
508
509 seqlock_write_lock(&timers_state.vm_clock_seqlock,
510 &timers_state.vm_clock_lock);
511 if (runstate_is_running()) {
512 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
513 cpu_get_clock_locked());
514 int64_t warp_delta;
515
516 warp_delta = clock - timers_state.vm_clock_warp_start;
517 if (use_icount == 2) {
518 /*
519 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
520 * far ahead of real time.
521 */
522 int64_t cur_icount = cpu_get_icount_locked();
523 int64_t delta = clock - cur_icount;
524 warp_delta = MIN(warp_delta, delta);
525 }
526 atomic_set_i64(&timers_state.qemu_icount_bias,
527 timers_state.qemu_icount_bias + warp_delta);
528 }
529 timers_state.vm_clock_warp_start = -1;
530 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
531 &timers_state.vm_clock_lock);
532
533 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
534 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
535 }
536}
537
/*
 * Callback of the icount warp timer; simply performs the warp.
 * @opaque is unused.
 */
static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}
545
546void qtest_clock_warp(int64_t dest)
547{
548 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
549 AioContext *aio_context;
550 assert(qtest_enabled());
551 aio_context = qemu_get_aio_context();
552 while (clock < dest) {
553 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
554 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
555
556 seqlock_write_lock(&timers_state.vm_clock_seqlock,
557 &timers_state.vm_clock_lock);
558 atomic_set_i64(&timers_state.qemu_icount_bias,
559 timers_state.qemu_icount_bias + warp);
560 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
561 &timers_state.vm_clock_lock);
562
563 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
564 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
565 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
566 }
567 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
568}
569
570void qemu_start_warp_timer(void)
571{
572 int64_t clock;
573 int64_t deadline;
574
575 if (!use_icount) {
576 return;
577 }
578
579 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
580 * do not fire, so computing the deadline does not make sense.
581 */
582 if (!runstate_is_running()) {
583 return;
584 }
585
586 if (replay_mode != REPLAY_MODE_PLAY) {
587 if (!all_cpu_threads_idle()) {
588 return;
589 }
590
591 if (qtest_enabled()) {
592 /* When testing, qtest commands advance icount. */
593 return;
594 }
595
596 replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
597 } else {
598 /* warp clock deterministically in record/replay mode */
599 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
600 /* vCPU is sleeping and warp can't be started.
601 It is probably a race condition: notification sent
602 to vCPU was processed in advance and vCPU went to sleep.
603 Therefore we have to wake it up for doing someting. */
604 if (replay_has_checkpoint()) {
605 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
606 }
607 return;
608 }
609 }
610
611 /* We want to use the earliest deadline from ALL vm_clocks */
612 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
613 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
614 if (deadline < 0) {
615 static bool notified;
616 if (!icount_sleep && !notified) {
617 warn_report("icount sleep disabled and no active timers");
618 notified = true;
619 }
620 return;
621 }
622
623 if (deadline > 0) {
624 /*
625 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
626 * sleep. Otherwise, the CPU might be waiting for a future timer
627 * interrupt to wake it up, but the interrupt never comes because
628 * the vCPU isn't running any insns and thus doesn't advance the
629 * QEMU_CLOCK_VIRTUAL.
630 */
631 if (!icount_sleep) {
632 /*
633 * We never let VCPUs sleep in no sleep icount mode.
634 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
635 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
636 * It is useful when we want a deterministic execution time,
637 * isolated from host latencies.
638 */
639 seqlock_write_lock(&timers_state.vm_clock_seqlock,
640 &timers_state.vm_clock_lock);
641 atomic_set_i64(&timers_state.qemu_icount_bias,
642 timers_state.qemu_icount_bias + deadline);
643 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
644 &timers_state.vm_clock_lock);
645 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
646 } else {
647 /*
648 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
649 * "real" time, (related to the time left until the next event) has
650 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
651 * This avoids that the warps are visible externally; for example,
652 * you will not be sending network packets continuously instead of
653 * every 100ms.
654 */
655 seqlock_write_lock(&timers_state.vm_clock_seqlock,
656 &timers_state.vm_clock_lock);
657 if (timers_state.vm_clock_warp_start == -1
658 || timers_state.vm_clock_warp_start > clock) {
659 timers_state.vm_clock_warp_start = clock;
660 }
661 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
662 &timers_state.vm_clock_lock);
663 timer_mod_anticipate(timers_state.icount_warp_timer,
664 clock + deadline);
665 }
666 } else if (deadline == 0) {
667 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
668 }
669}
670
671static void qemu_account_warp_timer(void)
672{
673 if (!use_icount || !icount_sleep) {
674 return;
675 }
676
677 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
678 * do not fire, so computing the deadline does not make sense.
679 */
680 if (!runstate_is_running()) {
681 return;
682 }
683
684 /* warp clock deterministically in record/replay mode */
685 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
686 return;
687 }
688
689 timer_del(timers_state.icount_warp_timer);
690 icount_warp_rt();
691}
692
693static bool icount_state_needed(void *opaque)
694{
695 return use_icount;
696}
697
698static bool warp_timer_state_needed(void *opaque)
699{
700 TimersState *s = opaque;
701 return s->icount_warp_timer != NULL;
702}
703
704static bool adjust_timers_state_needed(void *opaque)
705{
706 TimersState *s = opaque;
707 return s->icount_rt_timer != NULL;
708}
709
710/*
711 * Subsection for warp timer migration is optional, because may not be created
712 */
713static const VMStateDescription icount_vmstate_warp_timer = {
714 .name = "timer/icount/warp_timer",
715 .version_id = 1,
716 .minimum_version_id = 1,
717 .needed = warp_timer_state_needed,
718 .fields = (VMStateField[]) {
719 VMSTATE_INT64(vm_clock_warp_start, TimersState),
720 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
721 VMSTATE_END_OF_LIST()
722 }
723};
724
725static const VMStateDescription icount_vmstate_adjust_timers = {
726 .name = "timer/icount/timers",
727 .version_id = 1,
728 .minimum_version_id = 1,
729 .needed = adjust_timers_state_needed,
730 .fields = (VMStateField[]) {
731 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
732 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
733 VMSTATE_END_OF_LIST()
734 }
735};
736
737/*
738 * This is a subsection for icount migration.
739 */
740static const VMStateDescription icount_vmstate_timers = {
741 .name = "timer/icount",
742 .version_id = 1,
743 .minimum_version_id = 1,
744 .needed = icount_state_needed,
745 .fields = (VMStateField[]) {
746 VMSTATE_INT64(qemu_icount_bias, TimersState),
747 VMSTATE_INT64(qemu_icount, TimersState),
748 VMSTATE_END_OF_LIST()
749 },
750 .subsections = (const VMStateDescription*[]) {
751 &icount_vmstate_warp_timer,
752 &icount_vmstate_adjust_timers,
753 NULL
754 }
755};
756
757static const VMStateDescription vmstate_timers = {
758 .name = "timer",
759 .version_id = 2,
760 .minimum_version_id = 1,
761 .fields = (VMStateField[]) {
762 VMSTATE_INT64(cpu_ticks_offset, TimersState),
763 VMSTATE_UNUSED(8),
764 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
765 VMSTATE_END_OF_LIST()
766 },
767 .subsections = (const VMStateDescription*[]) {
768 &icount_vmstate_timers,
769 NULL
770 }
771};
772
773static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
774{
775 double pct;
776 double throttle_ratio;
777 long sleeptime_ns;
778
779 if (!cpu_throttle_get_percentage()) {
780 return;
781 }
782
783 pct = (double)cpu_throttle_get_percentage()/100;
784 throttle_ratio = pct / (1 - pct);
785 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
786
787 qemu_mutex_unlock_iothread();
788 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
789 qemu_mutex_lock_iothread();
790 atomic_set(&cpu->throttle_thread_scheduled, 0);
791}
792
793static void cpu_throttle_timer_tick(void *opaque)
794{
795 CPUState *cpu;
796 double pct;
797
798 /* Stop the timer if needed */
799 if (!cpu_throttle_get_percentage()) {
800 return;
801 }
802 CPU_FOREACH(cpu) {
803 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
804 async_run_on_cpu(cpu, cpu_throttle_thread,
805 RUN_ON_CPU_NULL);
806 }
807 }
808
809 pct = (double)cpu_throttle_get_percentage()/100;
810 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
811 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
812}
813
814void cpu_throttle_set(int new_throttle_pct)
815{
816 /* Ensure throttle percentage is within valid range */
817 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
818 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
819
820 atomic_set(&throttle_percentage, new_throttle_pct);
821
822 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
823 CPU_THROTTLE_TIMESLICE_NS);
824}
825
826void cpu_throttle_stop(void)
827{
828 atomic_set(&throttle_percentage, 0);
829}
830
/* Throttling is active whenever the percentage is non-zero. */
bool cpu_throttle_active(void)
{
    int pct = cpu_throttle_get_percentage();

    return pct != 0;
}
835
836int cpu_throttle_get_percentage(void)
837{
838 return atomic_read(&throttle_percentage);
839}
840
841void cpu_ticks_init(void)
842{
843 seqlock_init(&timers_state.vm_clock_seqlock);
844 qemu_spin_init(&timers_state.vm_clock_lock);
845 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
846 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
847 cpu_throttle_timer_tick, NULL);
848}
849
850void configure_icount(QemuOpts *opts, Error **errp)
851{
852 const char *option;
853 char *rem_str = NULL;
854
855 option = qemu_opt_get(opts, "shift");
856 if (!option) {
857 if (qemu_opt_get(opts, "align") != NULL) {
858 error_setg(errp, "Please specify shift option when using align");
859 }
860 return;
861 }
862
863 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
864 if (icount_sleep) {
865 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
866 icount_timer_cb, NULL);
867 }
868
869 icount_align_option = qemu_opt_get_bool(opts, "align", false);
870
871 if (icount_align_option && !icount_sleep) {
872 error_setg(errp, "align=on and sleep=off are incompatible");
873 }
874 if (strcmp(option, "auto") != 0) {
875 errno = 0;
876 timers_state.icount_time_shift = strtol(option, &rem_str, 0);
877 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
878 error_setg(errp, "icount: Invalid shift value");
879 }
880 use_icount = 1;
881 return;
882 } else if (icount_align_option) {
883 error_setg(errp, "shift=auto and align=on are incompatible");
884 } else if (!icount_sleep) {
885 error_setg(errp, "shift=auto and sleep=off are incompatible");
886 }
887
888 use_icount = 2;
889
890 /* 125MIPS seems a reasonable initial guess at the guest speed.
891 It will be corrected fairly quickly anyway. */
892 timers_state.icount_time_shift = 3;
893
894 /* Have both realtime and virtual time triggers for speed adjustment.
895 The realtime trigger catches emulated time passing too slowly,
896 the virtual time trigger catches emulated time passing too fast.
897 Realtime triggers occur even when idle, so use them less frequently
898 than VM triggers. */
899 timers_state.vm_clock_warp_start = -1;
900 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
901 icount_adjust_rt, NULL);
902 timer_mod(timers_state.icount_rt_timer,
903 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
904 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
905 icount_adjust_vm, NULL);
906 timer_mod(timers_state.icount_vm_timer,
907 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
908 NANOSECONDS_PER_SECOND / 10);
909}
910
911/***********************************************************/
912/* TCG vCPU kick timer
913 *
914 * The kick timer is responsible for moving single threaded vCPU
915 * emulation on to the next vCPU. If more than one vCPU is running a
916 * timer event with force a cpu->exit so the next vCPU can get
917 * scheduled.
918 *
919 * The timer is removed if all vCPUs are idle and restarted again once
920 * idleness is complete.
921 */
922
923static QEMUTimer *tcg_kick_vcpu_timer;
924static CPUState *tcg_current_rr_cpu;
925
926#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
927
928static inline int64_t qemu_tcg_next_kick(void)
929{
930 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
931}
932
933/* Kick the currently round-robin scheduled vCPU */
934static void qemu_cpu_kick_rr_cpu(void)
935{
936 CPUState *cpu;
937 do {
938 cpu = atomic_mb_read(&tcg_current_rr_cpu);
939 if (cpu) {
940 cpu_exit(cpu);
941 }
942 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
943}
944
945static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
946{
947}
948
949void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
950{
951 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
952 qemu_notify_event();
953 return;
954 }
955
956 if (qemu_in_vcpu_thread()) {
957 /* A CPU is currently running; kick it back out to the
958 * tcg_cpu_exec() loop so it will recalculate its
959 * icount deadline immediately.
960 */
961 qemu_cpu_kick(current_cpu);
962 } else if (first_cpu) {
963 /* qemu_cpu_kick is not enough to kick a halted CPU out of
964 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
965 * causes cpu_thread_is_idle to return false. This way,
966 * handle_icount_deadline can run.
967 * If we have no CPUs at all for some reason, we don't
968 * need to do anything.
969 */
970 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
971 }
972}
973
974static void kick_tcg_thread(void *opaque)
975{
976 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
977 qemu_cpu_kick_rr_cpu();
978}
979
980static void start_tcg_kick_timer(void)
981{
982 assert(!mttcg_enabled);
983 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
984 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
985 kick_tcg_thread, NULL);
986 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
987 }
988}
989
990static void stop_tcg_kick_timer(void)
991{
992 assert(!mttcg_enabled);
993 if (tcg_kick_vcpu_timer) {
994 timer_del(tcg_kick_vcpu_timer);
995 tcg_kick_vcpu_timer = NULL;
996 }
997}
998
999/***********************************************************/
1000void hw_error(const char *fmt, ...)
1001{
1002 va_list ap;
1003 CPUState *cpu;
1004
1005 va_start(ap, fmt);
1006 fprintf(stderr, "qemu: hardware error: ");
1007 vfprintf(stderr, fmt, ap);
1008 fprintf(stderr, "\n");
1009 CPU_FOREACH(cpu) {
1010 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1011 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
1012 }
1013 va_end(ap);
1014 abort();
1015}
1016
1017void cpu_synchronize_all_states(void)
1018{
1019 CPUState *cpu;
1020
1021 CPU_FOREACH(cpu) {
1022 cpu_synchronize_state(cpu);
1023 /* TODO: move to cpu_synchronize_state() */
1024 if (hvf_enabled()) {
1025 hvf_cpu_synchronize_state(cpu);
1026 }
1027 }
1028}
1029
1030void cpu_synchronize_all_post_reset(void)
1031{
1032 CPUState *cpu;
1033
1034 CPU_FOREACH(cpu) {
1035 cpu_synchronize_post_reset(cpu);
1036 /* TODO: move to cpu_synchronize_post_reset() */
1037 if (hvf_enabled()) {
1038 hvf_cpu_synchronize_post_reset(cpu);
1039 }
1040 }
1041}
1042
1043void cpu_synchronize_all_post_init(void)
1044{
1045 CPUState *cpu;
1046
1047 CPU_FOREACH(cpu) {
1048 cpu_synchronize_post_init(cpu);
1049 /* TODO: move to cpu_synchronize_post_init() */
1050 if (hvf_enabled()) {
1051 hvf_cpu_synchronize_post_init(cpu);
1052 }
1053 }
1054}
1055
1056void cpu_synchronize_all_pre_loadvm(void)
1057{
1058 CPUState *cpu;
1059
1060 CPU_FOREACH(cpu) {
1061 cpu_synchronize_pre_loadvm(cpu);
1062 }
1063}
1064
1065static int do_vm_stop(RunState state, bool send_stop)
1066{
1067 int ret = 0;
1068
1069 if (runstate_is_running()) {
1070 cpu_disable_ticks();
1071 pause_all_vcpus();
1072 runstate_set(state);
1073 vm_state_notify(0, state);
1074 if (send_stop) {
1075 qapi_event_send_stop();
1076 }
1077 }
1078
1079 bdrv_drain_all();
1080 replay_disable_events();
1081 ret = bdrv_flush_all();
1082
1083 return ret;
1084}
1085
1086/* Special vm_stop() variant for terminating the process. Historically clients
1087 * did not expect a QMP STOP event and so we need to retain compatibility.
1088 */
1089int vm_shutdown(void)
1090{
1091 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1092}
1093
1094static bool cpu_can_run(CPUState *cpu)
1095{
1096 if (cpu->stop) {
1097 return false;
1098 }
1099 if (cpu_is_stopped(cpu)) {
1100 return false;
1101 }
1102 return true;
1103}
1104
/*
 * Handle an EXCP_DEBUG exit from a vCPU execution loop: record @cpu as the
 * CPU the gdbstub stopped on, raise a system debug request and mark the
 * vCPU as stopped.
 */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1111
1112#ifdef CONFIG_LINUX
1113static void sigbus_reraise(void)
1114{
1115 sigset_t set;
1116 struct sigaction action;
1117
1118 memset(&action, 0, sizeof(action));
1119 action.sa_handler = SIG_DFL;
1120 if (!sigaction(SIGBUS, &action, NULL)) {
1121 raise(SIGBUS);
1122 sigemptyset(&set);
1123 sigaddset(&set, SIGBUS);
1124 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1125 }
1126 perror("Failed to re-raise SIGBUS!\n");
1127 abort();
1128}
1129
1130static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1131{
1132 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1133 sigbus_reraise();
1134 }
1135
1136 if (current_cpu) {
1137 /* Called asynchronously in VCPU thread. */
1138 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1139 sigbus_reraise();
1140 }
1141 } else {
1142 /* Called synchronously (via signalfd) in main thread. */
1143 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1144 sigbus_reraise();
1145 }
1146 }
1147}
1148
1149static void qemu_init_sigbus(void)
1150{
1151 struct sigaction action;
1152
1153 memset(&action, 0, sizeof(action));
1154 action.sa_flags = SA_SIGINFO;
1155 action.sa_sigaction = sigbus_handler;
1156 sigaction(SIGBUS, &action, NULL);
1157
1158 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1159}
1160#else /* !CONFIG_LINUX */
/* Without CONFIG_LINUX no SIGBUS handling is set up; this is a no-op. */
static void qemu_init_sigbus(void)
{
}
1164#endif /* !CONFIG_LINUX */
1165
/*
 * The global mutex taken by qemu_mutex_lock_iothread_impl() and released by
 * qemu_mutex_unlock_iothread(); also the mutex paired with the condition
 * variables below.
 */
static QemuMutex qemu_global_mutex;

/* Set by qemu_init_cpu_loop() to the thread that calls it. */
static QemuThread io_thread;

/* cpu creation: signalled by vCPU threads when cpu->created changes */
static QemuCond qemu_cpu_cond;
/* vCPU pause: broadcast by qemu_cpu_stop(), waited on by pause_all_vcpus() */
static QemuCond qemu_pause_cond;
1174
1175void qemu_init_cpu_loop(void)
1176{
1177 qemu_init_sigbus();
1178 qemu_cond_init(&qemu_cpu_cond);
1179 qemu_cond_init(&qemu_pause_cond);
1180 qemu_mutex_init(&qemu_global_mutex);
1181
1182 qemu_thread_get_self(&io_thread);
1183}
1184
/*
 * Ask @cpu to run @func with @data.  Thin wrapper around do_run_on_cpu()
 * that supplies the global mutex as the lock argument.
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1189
1190static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1191{
1192 if (kvm_destroy_vcpu(cpu) < 0) {
1193 error_report("kvm_destroy_vcpu failed");
1194 exit(EXIT_FAILURE);
1195 }
1196}
1197
/* TCG counterpart of qemu_kvm_destroy_vcpu(); currently does nothing. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1201
/*
 * Move @cpu into the stopped state.  Must be called from @cpu's own thread
 * (asserted).
 *
 * Clears the pending stop request, marks the vCPU stopped, optionally calls
 * cpu_exit() when @exit is true, and wakes every waiter on qemu_pause_cond.
 */
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}
1212
/*
 * Work done by every vCPU thread after it has been woken up: clear the
 * thread_kicked flag, honour a pending stop request, then run any work
 * queued for @cpu.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        /* exit == false: no cpu_exit() on this path */
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
1221
/*
 * Wait step of the single-threaded (round-robin) TCG loop.
 *
 * While all_cpu_threads_idle() holds, the kick timer is stopped and the
 * thread sleeps on @cpu's halt condition (with the global mutex).  Once
 * there is something to do the kick timer is restarted and the common
 * wake-up processing runs for @cpu.
 */
static void qemu_tcg_rr_wait_io_event(CPUState *cpu)
{
    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    qemu_wait_io_event_common(cpu);
}
1233
/*
 * Wait step of the per-vCPU thread loops: sleep on @cpu's halt condition
 * (with the global mutex) for as long as cpu_thread_is_idle() reports the
 * thread idle, then run the common wake-up processing.
 */
static void qemu_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

#ifdef _WIN32
    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
    if (!tcg_enabled()) {
        SleepEx(0, TRUE);
    }
#endif
    qemu_wait_io_event_common(cpu);
}
1248
/*
 * Thread entry point for a KVM vCPU.  @arg is the CPUState of the vCPU.
 *
 * Takes the iothread lock, initialises the KVM vCPU and its signals, signals
 * creation on qemu_cpu_cond and then alternates between kvm_cpu_exec() and
 * qemu_wait_io_event().  The loop ends once the vCPU has been unplugged and
 * can no longer run; the vCPU is then destroyed and the lock released.
 *
 * Always returns NULL.  A kvm_init_vcpu() failure terminates the process.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU destruction */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1291
1292static void *qemu_dummy_cpu_thread_fn(void *arg)
1293{
1294#ifdef _WIN32
1295 error_report("qtest is not supported under Windows");
1296 exit(1);
1297#else
1298 CPUState *cpu = arg;
1299 sigset_t waitset;
1300 int r;
1301
1302 rcu_register_thread();
1303
1304 qemu_mutex_lock_iothread();
1305 qemu_thread_get_self(cpu->thread);
1306 cpu->thread_id = qemu_get_thread_id();
1307 cpu->can_do_io = 1;
1308 current_cpu = cpu;
1309
1310 sigemptyset(&waitset);
1311 sigaddset(&waitset, SIG_IPI);
1312
1313 /* signal CPU creation */
1314 cpu->created = true;
1315 qemu_cond_signal(&qemu_cpu_cond);
1316
1317 do {
1318 qemu_mutex_unlock_iothread();
1319 do {
1320 int sig;
1321 r = sigwait(&waitset, &sig);
1322 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1323 if (r == -1) {
1324 perror("sigwait");
1325 exit(1);
1326 }
1327 qemu_mutex_lock_iothread();
1328 qemu_wait_io_event(cpu);
1329 } while (!cpu->unplug);
1330
1331 rcu_unregister_thread();
1332 return NULL;
1333#endif
1334}
1335
1336static int64_t tcg_get_icount_limit(void)
1337{
1338 int64_t deadline;
1339
1340 if (replay_mode != REPLAY_MODE_PLAY) {
1341 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1342
1343 /* Maintain prior (possibly buggy) behaviour where if no deadline
1344 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1345 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1346 * nanoseconds.
1347 */
1348 if ((deadline < 0) || (deadline > INT32_MAX)) {
1349 deadline = INT32_MAX;
1350 }
1351
1352 return qemu_icount_round(deadline);
1353 } else {
1354 return replay_get_instructions();
1355 }
1356}
1357
1358static void handle_icount_deadline(void)
1359{
1360 assert(qemu_in_vcpu_thread());
1361 if (use_icount) {
1362 int64_t deadline =
1363 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1364
1365 if (deadline == 0) {
1366 /* Wake up other AioContexts. */
1367 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1368 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1369 }
1370 }
1371}
1372
1373static void prepare_icount_for_run(CPUState *cpu)
1374{
1375 if (use_icount) {
1376 int insns_left;
1377
1378 /* These should always be cleared by process_icount_data after
1379 * each vCPU execution. However u16.high can be raised
1380 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1381 */
1382 g_assert(cpu->icount_decr.u16.low == 0);
1383 g_assert(cpu->icount_extra == 0);
1384
1385 cpu->icount_budget = tcg_get_icount_limit();
1386 insns_left = MIN(0xffff, cpu->icount_budget);
1387 cpu->icount_decr.u16.low = insns_left;
1388 cpu->icount_extra = cpu->icount_budget - insns_left;
1389
1390 replay_mutex_lock();
1391 }
1392}
1393
1394static void process_icount_data(CPUState *cpu)
1395{
1396 if (use_icount) {
1397 /* Account for executed instructions */
1398 cpu_update_icount(cpu);
1399
1400 /* Reset the counters */
1401 cpu->icount_decr.u16.low = 0;
1402 cpu->icount_extra = 0;
1403 cpu->icount_budget = 0;
1404
1405 replay_account_executed_instructions();
1406
1407 replay_mutex_unlock();
1408 }
1409}
1410
1411
1412static int tcg_cpu_exec(CPUState *cpu)
1413{
1414 int ret;
1415#ifdef CONFIG_PROFILER
1416 int64_t ti;
1417#endif
1418
1419 assert(tcg_enabled());
1420#ifdef CONFIG_PROFILER
1421 ti = profile_getclock();
1422#endif
1423 cpu_exec_start(cpu);
1424 ret = cpu_exec(cpu);
1425 cpu_exec_end(cpu);
1426#ifdef CONFIG_PROFILER
1427 tcg_time += profile_getclock() - ti;
1428#endif
1429 return ret;
1430}
1431
1432/* Destroy any remaining vCPUs which have been unplugged and have
1433 * finished running
1434 */
1435static void deal_with_unplugged_cpus(void)
1436{
1437 CPUState *cpu;
1438
1439 CPU_FOREACH(cpu) {
1440 if (cpu->unplug && !cpu_can_run(cpu)) {
1441 qemu_tcg_destroy_vcpu(cpu);
1442 cpu->created = false;
1443 qemu_cond_signal(&qemu_cpu_cond);
1444 break;
1445 }
1446 }
1447}
1448
1449/* Single-threaded TCG
1450 *
1451 * In the single-threaded case each vCPU is simulated in turn. If
1452 * there is more than a single vCPU we create a simple timer to kick
1453 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1454 * This is done explicitly rather than relying on side-effects
1455 * elsewhere.
1456 */
1457
/*
 * Thread entry point for single-threaded (round-robin) TCG.  @arg is the
 * CPUState of the vCPU that caused the thread to be created; afterwards the
 * local `cpu` is reused as the cursor over all vCPUs.
 *
 * After signalling creation, the thread waits until first_cpu is no longer
 * stopped, then loops forever: handle timers, run vCPUs in turn until one
 * has queued work or an exit request (or a debug/atomic/stop condition
 * breaks the inner loop), then wait for I/O events and reap unplugged vCPUs.
 *
 * NOTE(review): the outer loop is `while (1)` with no break, so the
 * rcu_unregister_thread()/return at the end look unreachable -- confirm.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /*
         * Drop the iothread lock before taking the replay mutex, then
         * re-take it, so the replay mutex is acquired first.
         */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* Ran off the end of the CPU list: start over from the first vCPU */
        if (!cpu) {
            cpu = first_cpu;
        }

        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code runs without the iothread lock held */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* Skip past a vCPU that is being unplugged */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        qemu_tcg_rr_wait_io_event(cpu ? cpu : first_cpu);
        deal_with_unplugged_cpus();
    }

    rcu_unregister_thread();
    return NULL;
}
1562
1563static void *qemu_hax_cpu_thread_fn(void *arg)
1564{
1565 CPUState *cpu = arg;
1566 int r;
1567
1568 rcu_register_thread();
1569 qemu_mutex_lock_iothread();
1570 qemu_thread_get_self(cpu->thread);
1571
1572 cpu->thread_id = qemu_get_thread_id();
1573 cpu->created = true;
1574 cpu->halted = 0;
1575 current_cpu = cpu;
1576
1577 hax_init_vcpu(cpu);
1578 qemu_cond_signal(&qemu_cpu_cond);
1579
1580 do {
1581 if (cpu_can_run(cpu)) {
1582 r = hax_smp_cpu_exec(cpu);
1583 if (r == EXCP_DEBUG) {
1584 cpu_handle_guest_debug(cpu);
1585 }
1586 }
1587
1588 qemu_wait_io_event(cpu);
1589 } while (!cpu->unplug || cpu_can_run(cpu));
1590 rcu_unregister_thread();
1591 return NULL;
1592}
1593
/* The HVF-specific vCPU thread function. This one should only run when the host
 * CPU supports the VMX "unrestricted guest" feature.
 *
 * @arg is the CPUState of the vCPU.  The thread takes the iothread lock,
 * initialises the HVF vCPU, signals creation on qemu_cpu_cond and then
 * alternates between hvf_vcpu_exec() and qemu_wait_io_event() until the
 * vCPU has been unplugged and can no longer run.  The vCPU is then
 * destroyed and the lock released.  Always returns NULL.
 */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU destruction */
    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1636
/*
 * Thread entry point for a WHPX vCPU.  @arg is the CPUState of the vCPU.
 *
 * Takes the iothread lock, initialises the WHPX vCPU, signals creation on
 * qemu_cpu_cond and then alternates between whpx_vcpu_exec() and waiting
 * for events until the vCPU has been unplugged and can no longer run.  The
 * vCPU is then destroyed and the lock released.
 *
 * Always returns NULL.  A whpx_init_vcpu() failure terminates the process.
 */
static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        /*
         * Open-coded wait: same idle loop as qemu_wait_io_event(), but
         * without its _WIN32 SleepEx() call.
         */
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU destruction */
    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1679
1680#ifdef _WIN32
/*
 * Empty APC routine.  qemu_cpu_kick_thread() queues it with QueueUserAPC()
 * to kick a vCPU thread; the routine itself has nothing to do.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1684#endif
1685
1686/* Multi-threaded TCG
1687 *
1688 * In the multi-threaded case each vCPU has its own thread. The TLS
1689 * variable current_cpu can be used deep in the code to find the
1690 * current CPUState for a given thread.
1691 */
1692
/*
 * Thread entry point for one vCPU under multi-threaded TCG.  @arg is the
 * CPUState of the vCPU.  icount must not be in use (asserted).
 *
 * Takes the iothread lock, signals creation on qemu_cpu_cond and then
 * alternates between tcg_cpu_exec() (run without the iothread lock) and
 * qemu_wait_io_event() until the vCPU has been unplugged and can no longer
 * run.  Always returns NULL.
 */
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    g_assert(!use_icount);

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);

    /* process any pending work */
    cpu->exit_request = 1;

    do {
        if (cpu_can_run(cpu)) {
            int r;
            qemu_mutex_unlock_iothread();
            r = tcg_cpu_exec(cpu);
            qemu_mutex_lock_iothread();
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* during start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start-up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
                /* fall through: nothing more to do than the default case */
            default:
                /* Ignore everything else? */
                break;
            }
        }

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* signal CPU destruction */
    qemu_tcg_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1756
1757static void qemu_cpu_kick_thread(CPUState *cpu)
1758{
1759#ifndef _WIN32
1760 int err;
1761
1762 if (cpu->thread_kicked) {
1763 return;
1764 }
1765 cpu->thread_kicked = true;
1766 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1767 if (err) {
1768 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1769 exit(1);
1770 }
1771#else /* _WIN32 */
1772 if (!qemu_cpu_is_self(cpu)) {
1773 if (whpx_enabled()) {
1774 whpx_vcpu_kick(cpu);
1775 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1776 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1777 __func__, GetLastError());
1778 exit(1);
1779 }
1780 }
1781#endif
1782}
1783
1784void qemu_cpu_kick(CPUState *cpu)
1785{
1786 qemu_cond_broadcast(cpu->halt_cond);
1787 if (tcg_enabled()) {
1788 cpu_exit(cpu);
1789 /* NOP unless doing single-thread RR */
1790 qemu_cpu_kick_rr_cpu();
1791 } else {
1792 if (hax_enabled()) {
1793 /*
1794 * FIXME: race condition with the exit_request check in
1795 * hax_vcpu_hax_exec
1796 */
1797 cpu->exit_request = 1;
1798 }
1799 qemu_cpu_kick_thread(cpu);
1800 }
1801}
1802
/*
 * Kick the host thread of the calling vCPU.  current_cpu must be set
 * (asserted).
 */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1808
/* Returns true if the calling thread is the host thread recorded for @cpu. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1813
1814bool qemu_in_vcpu_thread(void)
1815{
1816 return current_cpu && qemu_cpu_is_self(current_cpu);
1817}
1818
/*
 * Per-thread flag: set by qemu_mutex_lock_iothread_impl() and cleared by
 * qemu_mutex_unlock_iothread() in the calling thread.
 */
static __thread bool iothread_locked = false;

/* Returns the calling thread's iothread_locked flag. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1825
/*
 * The BQL is taken from so many places that it is worth profiling the
 * callers directly, instead of funneling them all through a single function.
 *
 * @file and @line identify the caller and are passed on to the lock
 * function read from qemu_bql_mutex_lock_func.  The calling thread must not
 * already hold the lock (asserted).
 */
void qemu_mutex_lock_iothread_impl(const char *file, int line)
{
    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);

    g_assert(!qemu_mutex_iothread_locked());
    bql_lock(&qemu_global_mutex, file, line);
    /* Only mark the lock as held once it has actually been acquired */
    iothread_locked = true;
}
1838
/*
 * Release the global mutex.  The calling thread must currently hold it
 * (asserted); the per-thread flag is cleared before the mutex is unlocked.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1845
1846static bool all_vcpus_paused(void)
1847{
1848 CPUState *cpu;
1849
1850 CPU_FOREACH(cpu) {
1851 if (!cpu->stopped) {
1852 return false;
1853 }
1854 }
1855
1856 return true;
1857}
1858
/*
 * Stop every vCPU and wait until all of them report stopped.
 *
 * QEMU_CLOCK_VIRTUAL is disabled first.  The calling vCPU (if any) is
 * stopped directly; every other vCPU gets a stop request and a kick.  The
 * function then waits on qemu_pause_cond, re-kicking all vCPUs after each
 * wake-up, until all_vcpus_paused() holds.
 *
 * NOTE(review): the unconditional replay_mutex_unlock() and the
 * qemu_cond_wait() on qemu_global_mutex suggest the caller must hold both
 * the replay mutex and the iothread lock on entry -- confirm against
 * callers.  Both are held again on return.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /*
     * Re-take the replay mutex with the iothread lock dropped, so that it
     * is acquired before the iothread lock.
     */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1889
/*
 * Let @cpu run again: clear both the pending stop request and the stopped
 * state, then kick the vCPU so it notices.
 */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1896
1897void resume_all_vcpus(void)
1898{
1899 CPUState *cpu;
1900
1901 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1902 CPU_FOREACH(cpu) {
1903 cpu_resume(cpu);
1904 }
1905}
1906
/*
 * Request removal of @cpu and wait for its host thread to exit.
 *
 * Sets the stop and unplug flags, kicks the vCPU, then joins its thread.
 * The iothread lock is dropped around the join and re-taken afterwards, so
 * the caller must hold it on entry (qemu_mutex_unlock_iothread() asserts
 * this).
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1916
1917/* For temporary buffers for forming a name */
1918#define VCPU_THREAD_NAME_SIZE 16
1919
1920static void qemu_tcg_init_vcpu(CPUState *cpu)
1921{
1922 char thread_name[VCPU_THREAD_NAME_SIZE];
1923 static QemuCond *single_tcg_halt_cond;
1924 static QemuThread *single_tcg_cpu_thread;
1925 static int tcg_region_inited;
1926
1927 assert(tcg_enabled());
1928 /*
1929 * Initialize TCG regions--once. Now is a good time, because:
1930 * (1) TCG's init context, prologue and target globals have been set up.
1931 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1932 * -accel flag is processed, so the check doesn't work then).
1933 */
1934 if (!tcg_region_inited) {
1935 tcg_region_inited = 1;
1936 tcg_region_init();
1937 }
1938
1939 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1940 cpu->thread = g_malloc0(sizeof(QemuThread));
1941 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1942 qemu_cond_init(cpu->halt_cond);
1943
1944 if (qemu_tcg_mttcg_enabled()) {
1945 /* create a thread per vCPU with TCG (MTTCG) */
1946 parallel_cpus = true;
1947 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1948 cpu->cpu_index);
1949
1950 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1951 cpu, QEMU_THREAD_JOINABLE);
1952
1953 } else {
1954 /* share a single thread for all cpus with TCG */
1955 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1956 qemu_thread_create(cpu->thread, thread_name,
1957 qemu_tcg_rr_cpu_thread_fn,
1958 cpu, QEMU_THREAD_JOINABLE);
1959
1960 single_tcg_halt_cond = cpu->halt_cond;
1961 single_tcg_cpu_thread = cpu->thread;
1962 }
1963#ifdef _WIN32
1964 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1965#endif
1966 } else {
1967 /* For non-MTTCG cases we share the thread */
1968 cpu->thread = single_tcg_cpu_thread;
1969 cpu->halt_cond = single_tcg_halt_cond;
1970 cpu->thread_id = first_cpu->thread_id;
1971 cpu->can_do_io = 1;
1972 cpu->created = true;
1973 }
1974}
1975
1976static void qemu_hax_start_vcpu(CPUState *cpu)
1977{
1978 char thread_name[VCPU_THREAD_NAME_SIZE];
1979
1980 cpu->thread = g_malloc0(sizeof(QemuThread));
1981 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1982 qemu_cond_init(cpu->halt_cond);
1983
1984 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1985 cpu->cpu_index);
1986 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1987 cpu, QEMU_THREAD_JOINABLE);
1988#ifdef _WIN32
1989 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1990#endif
1991}
1992
1993static void qemu_kvm_start_vcpu(CPUState *cpu)
1994{
1995 char thread_name[VCPU_THREAD_NAME_SIZE];
1996
1997 cpu->thread = g_malloc0(sizeof(QemuThread));
1998 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1999 qemu_cond_init(cpu->halt_cond);
2000 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2001 cpu->cpu_index);
2002 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2003 cpu, QEMU_THREAD_JOINABLE);
2004}
2005
2006static void qemu_hvf_start_vcpu(CPUState *cpu)
2007{
2008 char thread_name[VCPU_THREAD_NAME_SIZE];
2009
2010 /* HVF currently does not support TCG, and only runs in
2011 * unrestricted-guest mode. */
2012 assert(hvf_enabled());
2013
2014 cpu->thread = g_malloc0(sizeof(QemuThread));
2015 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2016 qemu_cond_init(cpu->halt_cond);
2017
2018 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2019 cpu->cpu_index);
2020 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2021 cpu, QEMU_THREAD_JOINABLE);
2022}
2023
2024static void qemu_whpx_start_vcpu(CPUState *cpu)
2025{
2026 char thread_name[VCPU_THREAD_NAME_SIZE];
2027
2028 cpu->thread = g_malloc0(sizeof(QemuThread));
2029 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2030 qemu_cond_init(cpu->halt_cond);
2031 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2032 cpu->cpu_index);
2033 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2034 cpu, QEMU_THREAD_JOINABLE);
2035#ifdef _WIN32
2036 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2037#endif
2038}
2039
2040static void qemu_dummy_start_vcpu(CPUState *cpu)
2041{
2042 char thread_name[VCPU_THREAD_NAME_SIZE];
2043
2044 cpu->thread = g_malloc0(sizeof(QemuThread));
2045 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2046 qemu_cond_init(cpu->halt_cond);
2047 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2048 cpu->cpu_index);
2049 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2050 QEMU_THREAD_JOINABLE);
2051}
2052
/*
 * Bring up the host-side execution context for @cpu.
 *
 * Records the SMP core/thread counts, marks the vCPU stopped, gives it a
 * default address space if the target set none up, starts the thread for
 * whichever accelerator is enabled (falling back to the dummy thread), and
 * finally blocks on qemu_cpu_cond until the vCPU reports itself created.
 *
 * NOTE(review): the wait uses qemu_global_mutex, so the caller presumably
 * holds the iothread lock -- confirm.
 */
void qemu_init_vcpu(CPUState *cpu)
{
    cpu->nr_cores = smp_cores;
    cpu->nr_threads = smp_threads;
    cpu->stopped = true;

    if (!cpu->as) {
        /* If the target cpu hasn't set up any address spaces itself,
         * give it the default one.
         */
        cpu->num_ases = 1;
        cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
    }

    /* The first enabled accelerator in this order wins */
    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
    } else if (hax_enabled()) {
        qemu_hax_start_vcpu(cpu);
    } else if (hvf_enabled()) {
        qemu_hvf_start_vcpu(cpu);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else if (whpx_enabled()) {
        qemu_whpx_start_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }

    /* Wait for the vCPU thread to signal creation */
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}
2085
2086void cpu_stop_current(void)
2087{
2088 if (current_cpu) {
2089 qemu_cpu_stop(current_cpu, true);
2090 }
2091}
2092
2093int vm_stop(RunState state)
2094{
2095 if (qemu_in_vcpu_thread()) {
2096 qemu_system_vmstop_request_prepare();
2097 qemu_system_vmstop_request(state);
2098 /*
2099 * FIXME: should not return to device code in case
2100 * vm_stop() has been requested.
2101 */
2102 cpu_stop_current();
2103 return 0;
2104 }
2105
2106 return do_vm_stop(state, true);
2107}
2108
2109/**
2110 * Prepare for (re)starting the VM.
2111 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2112 * running or in case of an error condition), 0 otherwise.
2113 */
2114int vm_prepare_start(void)
2115{
2116 RunState requested;
2117
2118 qemu_vmstop_requested(&requested);
2119 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2120 return -1;
2121 }
2122
2123 /* Ensure that a STOP/RESUME pair of events is emitted if a
2124 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2125 * example, according to documentation is always followed by
2126 * the STOP event.
2127 */
2128 if (runstate_is_running()) {
2129 qapi_event_send_stop();
2130 qapi_event_send_resume();
2131 return -1;
2132 }
2133
2134 /* We are sending this now, but the CPUs will be resumed shortly later */
2135 qapi_event_send_resume();
2136
2137 replay_enable_events();
2138 cpu_enable_ticks();
2139 runstate_set(RUN_STATE_RUNNING);
2140 vm_state_notify(1, RUN_STATE_RUNNING);
2141 return 0;
2142}
2143
/* Start the VM: resume all vCPUs if vm_prepare_start() returns 0. */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
2150
2151/* does a state transition even if the VM is already stopped,
2152 current state is forgotten forever */
2153int vm_stop_force_state(RunState state)
2154{
2155 if (runstate_is_running()) {
2156 return vm_stop(state);
2157 } else {
2158 runstate_set(state);
2159
2160 bdrv_drain_all();
2161 /* Make sure to return an error if the flush in a previous vm_stop()
2162 * failed. */
2163 return bdrv_flush_all();
2164 }
2165}
2166
/*
 * Print the list of CPU models for the current target to @f using
 * @cpu_fprintf.  Only does anything when the target defines the cpu_list
 * macro; @optarg is not used here.
 */
void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}
2174
/*
 * QMP handler building one CpuInfo entry per vCPU, in CPU-list order.
 *
 * Each vCPU is synchronised with cpu_synchronize_state() before its
 * registers are read; the architecture-specific part of the entry is
 * chosen at compile time from the TARGET_* macros.  @errp is not written
 * to by this function.
 *
 * Returns the head of the newly allocated list (NULL if there are no
 * vCPUs).
 */
CpuInfoList *qmp_query_cpus(Error **errp)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    MachineClass *mc = MACHINE_GET_CLASS(ms);
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
        /* Target-specific view of the vCPU state, used further down */
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_RISCV)
        RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
        CPURISCVState *env = &riscv_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
#elif defined(TARGET_TRICORE)
        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
        CPUTriCoreState *env = &tricore_cpu->env;
#elif defined(TARGET_S390X)
        S390CPU *s390_cpu = S390_CPU(cpu);
        CPUS390XState *env = &s390_cpu->env;
#endif

        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        /* "current" is reported for the first vCPU only */
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->arch = CPU_INFO_ARCH_X86;
        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->arch = CPU_INFO_ARCH_PPC;
        info->value->u.ppc.nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->arch = CPU_INFO_ARCH_SPARC;
        info->value->u.q_sparc.pc = env->pc;
        info->value->u.q_sparc.npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->arch = CPU_INFO_ARCH_MIPS;
        info->value->u.q_mips.PC = env->active_tc.PC;
#elif defined(TARGET_TRICORE)
        info->value->arch = CPU_INFO_ARCH_TRICORE;
        info->value->u.tricore.PC = env->PC;
#elif defined(TARGET_S390X)
        info->value->arch = CPU_INFO_ARCH_S390;
        info->value->u.s390.cpu_state = env->cpu_state;
#elif defined(TARGET_RISCV)
        info->value->arch = CPU_INFO_ARCH_RISCV;
        info->value->u.riscv.pc = env->pc;
#else
        info->value->arch = CPU_INFO_ARCH_OTHER;
#endif
        /* Instance properties exist only if the machine class has the hook */
        info->value->has_props = !!mc->cpu_index_to_instance_props;
        if (info->value->has_props) {
            CpuInstanceProperties *props;
            props = g_malloc0(sizeof(*props));
            *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
            info->value->props = props;
        }

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}
2260
2261static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2262{
2263 /*
2264 * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2265 * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2266 */
2267 switch (target) {
2268 case SYS_EMU_TARGET_I386:
2269 case SYS_EMU_TARGET_X86_64:
2270 return CPU_INFO_ARCH_X86;
2271
2272 case SYS_EMU_TARGET_PPC:
2273 case SYS_EMU_TARGET_PPC64:
2274 return CPU_INFO_ARCH_PPC;
2275
2276 case SYS_EMU_TARGET_SPARC:
2277 case SYS_EMU_TARGET_SPARC64:
2278 return CPU_INFO_ARCH_SPARC;
2279
2280 case SYS_EMU_TARGET_MIPS:
2281 case SYS_EMU_TARGET_MIPSEL:
2282 case SYS_EMU_TARGET_MIPS64:
2283 case SYS_EMU_TARGET_MIPS64EL:
2284 return CPU_INFO_ARCH_MIPS;
2285
2286 case SYS_EMU_TARGET_TRICORE:
2287 return CPU_INFO_ARCH_TRICORE;
2288
2289 case SYS_EMU_TARGET_S390X:
2290 return CPU_INFO_ARCH_S390;
2291
2292 case SYS_EMU_TARGET_RISCV32:
2293 case SYS_EMU_TARGET_RISCV64:
2294 return CPU_INFO_ARCH_RISCV;
2295
2296 default:
2297 return CPU_INFO_ARCH_OTHER;
2298 }
2299}
2300
2301static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2302{
2303#ifdef TARGET_S390X
2304 S390CPU *s390_cpu = S390_CPU(cpu);
2305 CPUS390XState *env = &s390_cpu->env;
2306
2307 info->cpu_state = env->cpu_state;
2308#else
2309 abort();
2310#endif
2311}
2312
2313/*
2314 * fast means: we NEVER interrupt vCPU threads to retrieve
2315 * information from KVM.
2316 */
2317CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2318{
2319 MachineState *ms = MACHINE(qdev_get_machine());
2320 MachineClass *mc = MACHINE_GET_CLASS(ms);
2321 CpuInfoFastList *head = NULL, *cur_item = NULL;
2322 SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2323 -1, &error_abort);
2324 CPUState *cpu;
2325
2326 CPU_FOREACH(cpu) {
2327 CpuInfoFastList *info = g_malloc0(sizeof(*info));
2328 info->value = g_malloc0(sizeof(*info->value));
2329
2330 info->value->cpu_index = cpu->cpu_index;
2331 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2332 info->value->thread_id = cpu->thread_id;
2333
2334 info->value->has_props = !!mc->cpu_index_to_instance_props;
2335 if (info->value->has_props) {
2336 CpuInstanceProperties *props;
2337 props = g_malloc0(sizeof(*props));
2338 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2339 info->value->props = props;
2340 }
2341
2342 info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2343 info->value->target = target;
2344 if (target == SYS_EMU_TARGET_S390X) {
2345 cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2346 }
2347
2348 if (!cur_item) {
2349 head = cur_item = info;
2350 } else {
2351 cur_item->next = info;
2352 cur_item = info;
2353 }
2354 }
2355
2356 return head;
2357}
2358
2359void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2360 bool has_cpu, int64_t cpu_index, Error **errp)
2361{
2362 FILE *f;
2363 uint32_t l;
2364 CPUState *cpu;
2365 uint8_t buf[1024];
2366 int64_t orig_addr = addr, orig_size = size;
2367
2368 if (!has_cpu) {
2369 cpu_index = 0;
2370 }
2371
2372 cpu = qemu_get_cpu(cpu_index);
2373 if (cpu == NULL) {
2374 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2375 "a CPU number");
2376 return;
2377 }
2378
2379 f = fopen(filename, "wb");
2380 if (!f) {
2381 error_setg_file_open(errp, errno, filename);
2382 return;
2383 }
2384
2385 while (size != 0) {
2386 l = sizeof(buf);
2387 if (l > size)
2388 l = size;
2389 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2390 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2391 " specified", orig_addr, orig_size);
2392 goto exit;
2393 }
2394 if (fwrite(buf, 1, l, f) != l) {
2395 error_setg(errp, QERR_IO_ERROR);
2396 goto exit;
2397 }
2398 addr += l;
2399 size -= l;
2400 }
2401
2402exit:
2403 fclose(f);
2404}
2405
2406void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2407 Error **errp)
2408{
2409 FILE *f;
2410 uint32_t l;
2411 uint8_t buf[1024];
2412
2413 f = fopen(filename, "wb");
2414 if (!f) {
2415 error_setg_file_open(errp, errno, filename);
2416 return;
2417 }
2418
2419 while (size != 0) {
2420 l = sizeof(buf);
2421 if (l > size)
2422 l = size;
2423 cpu_physical_memory_read(addr, buf, l);
2424 if (fwrite(buf, 1, l, f) != l) {
2425 error_setg(errp, QERR_IO_ERROR);
2426 goto exit;
2427 }
2428 addr += l;
2429 size -= l;
2430 }
2431
2432exit:
2433 fclose(f);
2434}
2435
2436void qmp_inject_nmi(Error **errp)
2437{
2438 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2439}
2440
2441void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2442{
2443 if (!use_icount) {
2444 return;
2445 }
2446
2447 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
2448 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2449 if (icount_align_option) {
2450 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
2451 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
2452 } else {
2453 cpu_fprintf(f, "Max guest delay NA\n");
2454 cpu_fprintf(f, "Max guest advance NA\n");
2455 }
2456}