]> git.proxmox.com Git - mirror_qemu.git/blob - cpus.c
Merge remote-tracking branch 'remotes/armbru/tags/pull-monitor-2018-09-01' into staging
[mirror_qemu.git] / cpus.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "qemu/osdep.h"
26 #include "qemu/config-file.h"
27 #include "cpu.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "sysemu/sysemu.h"
35 #include "sysemu/block-backend.h"
36 #include "exec/gdbstub.h"
37 #include "sysemu/dma.h"
38 #include "sysemu/hw_accel.h"
39 #include "sysemu/kvm.h"
40 #include "sysemu/hax.h"
41 #include "sysemu/hvf.h"
42 #include "sysemu/whpx.h"
43 #include "exec/exec-all.h"
44
45 #include "qemu/thread.h"
46 #include "sysemu/cpus.h"
47 #include "sysemu/qtest.h"
48 #include "qemu/main-loop.h"
49 #include "qemu/option.h"
50 #include "qemu/bitmap.h"
51 #include "qemu/seqlock.h"
52 #include "tcg.h"
53 #include "hw/nmi.h"
54 #include "sysemu/replay.h"
55 #include "hw/boards.h"
56
57 #ifdef CONFIG_LINUX
58
59 #include <sys/prctl.h>
60
61 #ifndef PR_MCE_KILL
62 #define PR_MCE_KILL 33
63 #endif
64
65 #ifndef PR_MCE_KILL_SET
66 #define PR_MCE_KILL_SET 1
67 #endif
68
69 #ifndef PR_MCE_KILL_EARLY
70 #define PR_MCE_KILL_EARLY 1
71 #endif
72
73 #endif /* CONFIG_LINUX */
74
75 int64_t max_delay;
76 int64_t max_advance;
77
78 /* vcpu throttling controls */
79 static QEMUTimer *throttle_timer;
80 static unsigned int throttle_percentage;
81
82 #define CPU_THROTTLE_PCT_MIN 1
83 #define CPU_THROTTLE_PCT_MAX 99
84 #define CPU_THROTTLE_TIMESLICE_NS 10000000
85
86 bool cpu_is_stopped(CPUState *cpu)
87 {
88 return cpu->stopped || !runstate_is_running();
89 }
90
91 static bool cpu_thread_is_idle(CPUState *cpu)
92 {
93 if (cpu->stop || cpu->queued_work_first) {
94 return false;
95 }
96 if (cpu_is_stopped(cpu)) {
97 return true;
98 }
99 if (!cpu->halted || cpu_has_work(cpu) ||
100 kvm_halt_in_kernel()) {
101 return false;
102 }
103 return true;
104 }
105
106 static bool all_cpu_threads_idle(void)
107 {
108 CPUState *cpu;
109
110 CPU_FOREACH(cpu) {
111 if (!cpu_thread_is_idle(cpu)) {
112 return false;
113 }
114 }
115 return true;
116 }
117
118 /***********************************************************/
119 /* guest cycle counter */
120
121 /* Protected by TimersState seqlock */
122
123 static bool icount_sleep = true;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
125 #define MAX_ICOUNT_SHIFT 10
126
127 typedef struct TimersState {
128 /* Protected by BQL. */
129 int64_t cpu_ticks_prev;
130 int64_t cpu_ticks_offset;
131
132 /* Protect fields that can be respectively read outside the
133 * BQL, and written from multiple threads.
134 */
135 QemuSeqLock vm_clock_seqlock;
136 QemuSpin vm_clock_lock;
137
138 int16_t cpu_ticks_enabled;
139
140 /* Conversion factor from emulated instructions to virtual clock ticks. */
141 int16_t icount_time_shift;
142
143 /* Compensate for varying guest execution speed. */
144 int64_t qemu_icount_bias;
145
146 int64_t vm_clock_warp_start;
147 int64_t cpu_clock_offset;
148
149 /* Only written by TCG thread */
150 int64_t qemu_icount;
151
152 /* for adjusting icount */
153 QEMUTimer *icount_rt_timer;
154 QEMUTimer *icount_vm_timer;
155 QEMUTimer *icount_warp_timer;
156 } TimersState;
157
158 static TimersState timers_state;
159 bool mttcg_enabled;
160
161 /*
162 * We default to false if we know other options have been enabled
163 * which are currently incompatible with MTTCG. Otherwise when each
164 * guest (target) has been updated to support:
165 * - atomic instructions
166 * - memory ordering primitives (barriers)
167 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
168 *
169 * Once a guest architecture has been converted to the new primitives
170 * there are two remaining limitations to check.
171 *
172 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
173 * - The host must have a stronger memory order than the guest
174 *
175 * It may be possible in future to support strong guests on weak hosts
176 * but that will require tagging all load/stores in a guest with their
177 * implicit memory order requirements which would likely slow things
178 * down a lot.
179 */
180
/*
 * Return true when every ordering bit in TCG_GUEST_DEFAULT_MO is also
 * set in TCG_TARGET_DEFAULT_MO.  If either macro is undefined the
 * answer is conservatively false.
 */
static bool check_tcg_memory_orders_compatible(void)
{
    bool compatible = false;

#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
    compatible = !(TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO);
#endif
    return compatible;
}
189
190 static bool default_mttcg_enabled(void)
191 {
192 if (use_icount || TCG_OVERSIZED_GUEST) {
193 return false;
194 } else {
195 #ifdef TARGET_SUPPORTS_MTTCG
196 return check_tcg_memory_orders_compatible();
197 #else
198 return false;
199 #endif
200 }
201 }
202
203 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
204 {
205 const char *t = qemu_opt_get(opts, "thread");
206 if (t) {
207 if (strcmp(t, "multi") == 0) {
208 if (TCG_OVERSIZED_GUEST) {
209 error_setg(errp, "No MTTCG when guest word size > hosts");
210 } else if (use_icount) {
211 error_setg(errp, "No MTTCG when icount is enabled");
212 } else {
213 #ifndef TARGET_SUPPORTS_MTTCG
214 error_report("Guest not yet converted to MTTCG - "
215 "you may get unexpected results");
216 #endif
217 if (!check_tcg_memory_orders_compatible()) {
218 error_report("Guest expects a stronger memory ordering "
219 "than the host provides");
220 error_printf("This may cause strange/hard to debug errors\n");
221 }
222 mttcg_enabled = true;
223 }
224 } else if (strcmp(t, "single") == 0) {
225 mttcg_enabled = false;
226 } else {
227 error_setg(errp, "Invalid 'thread' setting %s", t);
228 }
229 } else {
230 mttcg_enabled = default_mttcg_enabled();
231 }
232 }
233
234 /* The current number of executed instructions is based on what we
235 * originally budgeted minus the current state of the decrementing
236 * icount counters in extra/u16.low.
237 */
238 static int64_t cpu_get_icount_executed(CPUState *cpu)
239 {
240 return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
241 }
242
/*
 * Update the global shared timers_state.qemu_icount to take into
 * account executed instructions. This is done by the TCG vCPU
 * thread so the main-loop can see time has moved forward.
 */
248 void cpu_update_icount(CPUState *cpu)
249 {
250 int64_t executed = cpu_get_icount_executed(cpu);
251 cpu->icount_budget -= executed;
252
253 #ifndef CONFIG_ATOMIC64
254 seqlock_write_lock(&timers_state.vm_clock_seqlock,
255 &timers_state.vm_clock_lock);
256 #endif
257 atomic_set__nocheck(&timers_state.qemu_icount,
258 timers_state.qemu_icount + executed);
259 #ifndef CONFIG_ATOMIC64
260 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
261 &timers_state.vm_clock_lock);
262 #endif
263 }
264
265 static int64_t cpu_get_icount_raw_locked(void)
266 {
267 CPUState *cpu = current_cpu;
268
269 if (cpu && cpu->running) {
270 if (!cpu->can_do_io) {
271 error_report("Bad icount read");
272 exit(1);
273 }
274 /* Take into account what has run */
275 cpu_update_icount(cpu);
276 }
277 /* The read is protected by the seqlock, so __nocheck is okay. */
278 return atomic_read__nocheck(&timers_state.qemu_icount);
279 }
280
281 static int64_t cpu_get_icount_locked(void)
282 {
283 int64_t icount = cpu_get_icount_raw_locked();
284 return atomic_read__nocheck(&timers_state.qemu_icount_bias) + cpu_icount_to_ns(icount);
285 }
286
287 int64_t cpu_get_icount_raw(void)
288 {
289 int64_t icount;
290 unsigned start;
291
292 do {
293 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
294 icount = cpu_get_icount_raw_locked();
295 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
296
297 return icount;
298 }
299
300 /* Return the virtual CPU time, based on the instruction counter. */
301 int64_t cpu_get_icount(void)
302 {
303 int64_t icount;
304 unsigned start;
305
306 do {
307 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
308 icount = cpu_get_icount_locked();
309 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
310
311 return icount;
312 }
313
314 int64_t cpu_icount_to_ns(int64_t icount)
315 {
316 return icount << atomic_read(&timers_state.icount_time_shift);
317 }
318
319 static int64_t cpu_get_ticks_locked(void)
320 {
321 int64_t ticks = timers_state.cpu_ticks_offset;
322 if (timers_state.cpu_ticks_enabled) {
323 ticks += cpu_get_host_ticks();
324 }
325
326 if (timers_state.cpu_ticks_prev > ticks) {
327 /* Non increasing ticks may happen if the host uses software suspend. */
328 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
329 ticks = timers_state.cpu_ticks_prev;
330 }
331
332 timers_state.cpu_ticks_prev = ticks;
333 return ticks;
334 }
335
336 /* return the time elapsed in VM between vm_start and vm_stop. Unless
337 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
338 * counter.
339 */
340 int64_t cpu_get_ticks(void)
341 {
342 int64_t ticks;
343
344 if (use_icount) {
345 return cpu_get_icount();
346 }
347
348 qemu_spin_lock(&timers_state.vm_clock_lock);
349 ticks = cpu_get_ticks_locked();
350 qemu_spin_unlock(&timers_state.vm_clock_lock);
351 return ticks;
352 }
353
354 static int64_t cpu_get_clock_locked(void)
355 {
356 int64_t time;
357
358 time = timers_state.cpu_clock_offset;
359 if (timers_state.cpu_ticks_enabled) {
360 time += get_clock();
361 }
362
363 return time;
364 }
365
366 /* Return the monotonic time elapsed in VM, i.e.,
367 * the time between vm_start and vm_stop
368 */
369 int64_t cpu_get_clock(void)
370 {
371 int64_t ti;
372 unsigned start;
373
374 do {
375 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
376 ti = cpu_get_clock_locked();
377 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
378
379 return ti;
380 }
381
382 /* enable cpu_get_ticks()
383 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
384 */
385 void cpu_enable_ticks(void)
386 {
387 seqlock_write_lock(&timers_state.vm_clock_seqlock,
388 &timers_state.vm_clock_lock);
389 if (!timers_state.cpu_ticks_enabled) {
390 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
391 timers_state.cpu_clock_offset -= get_clock();
392 timers_state.cpu_ticks_enabled = 1;
393 }
394 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
395 &timers_state.vm_clock_lock);
396 }
397
398 /* disable cpu_get_ticks() : the clock is stopped. You must not call
399 * cpu_get_ticks() after that.
400 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
401 */
402 void cpu_disable_ticks(void)
403 {
404 seqlock_write_lock(&timers_state.vm_clock_seqlock,
405 &timers_state.vm_clock_lock);
406 if (timers_state.cpu_ticks_enabled) {
407 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
408 timers_state.cpu_clock_offset = cpu_get_clock_locked();
409 timers_state.cpu_ticks_enabled = 0;
410 }
411 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
412 &timers_state.vm_clock_lock);
413 }
414
415 /* Correlation between real and virtual time is always going to be
416 fairly approximate, so ignore small variation.
417 When the guest is idle real and virtual time will be aligned in
418 the IO wait loop. */
419 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
420
421 static void icount_adjust(void)
422 {
423 int64_t cur_time;
424 int64_t cur_icount;
425 int64_t delta;
426
427 /* Protected by TimersState mutex. */
428 static int64_t last_delta;
429
430 /* If the VM is not running, then do nothing. */
431 if (!runstate_is_running()) {
432 return;
433 }
434
435 seqlock_write_lock(&timers_state.vm_clock_seqlock,
436 &timers_state.vm_clock_lock);
437 cur_time = cpu_get_clock_locked();
438 cur_icount = cpu_get_icount_locked();
439
440 delta = cur_icount - cur_time;
441 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
442 if (delta > 0
443 && last_delta + ICOUNT_WOBBLE < delta * 2
444 && timers_state.icount_time_shift > 0) {
445 /* The guest is getting too far ahead. Slow time down. */
446 atomic_set(&timers_state.icount_time_shift,
447 timers_state.icount_time_shift - 1);
448 }
449 if (delta < 0
450 && last_delta - ICOUNT_WOBBLE > delta * 2
451 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
452 /* The guest is getting too far behind. Speed time up. */
453 atomic_set(&timers_state.icount_time_shift,
454 timers_state.icount_time_shift + 1);
455 }
456 last_delta = delta;
457 atomic_set__nocheck(&timers_state.qemu_icount_bias,
458 cur_icount - (timers_state.qemu_icount
459 << timers_state.icount_time_shift));
460 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
461 &timers_state.vm_clock_lock);
462 }
463
464 static void icount_adjust_rt(void *opaque)
465 {
466 timer_mod(timers_state.icount_rt_timer,
467 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
468 icount_adjust();
469 }
470
471 static void icount_adjust_vm(void *opaque)
472 {
473 timer_mod(timers_state.icount_vm_timer,
474 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
475 NANOSECONDS_PER_SECOND / 10);
476 icount_adjust();
477 }
478
479 static int64_t qemu_icount_round(int64_t count)
480 {
481 int shift = atomic_read(&timers_state.icount_time_shift);
482 return (count + (1 << shift) - 1) >> shift;
483 }
484
485 static void icount_warp_rt(void)
486 {
487 unsigned seq;
488 int64_t warp_start;
489
490 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
491 * changes from -1 to another value, so the race here is okay.
492 */
493 do {
494 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
495 warp_start = timers_state.vm_clock_warp_start;
496 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
497
498 if (warp_start == -1) {
499 return;
500 }
501
502 seqlock_write_lock(&timers_state.vm_clock_seqlock,
503 &timers_state.vm_clock_lock);
504 if (runstate_is_running()) {
505 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
506 cpu_get_clock_locked());
507 int64_t warp_delta;
508
509 warp_delta = clock - timers_state.vm_clock_warp_start;
510 if (use_icount == 2) {
511 /*
512 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
513 * far ahead of real time.
514 */
515 int64_t cur_icount = cpu_get_icount_locked();
516 int64_t delta = clock - cur_icount;
517 warp_delta = MIN(warp_delta, delta);
518 }
519 atomic_set__nocheck(&timers_state.qemu_icount_bias,
520 timers_state.qemu_icount_bias + warp_delta);
521 }
522 timers_state.vm_clock_warp_start = -1;
523 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
524 &timers_state.vm_clock_lock);
525
526 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
527 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
528 }
529 }
530
/* Callback of icount_warp_timer; @opaque is unused. */
static void icount_timer_cb(void *opaque)
{
    /*
     * No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}
538
539 void qtest_clock_warp(int64_t dest)
540 {
541 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
542 AioContext *aio_context;
543 assert(qtest_enabled());
544 aio_context = qemu_get_aio_context();
545 while (clock < dest) {
546 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
547 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
548
549 seqlock_write_lock(&timers_state.vm_clock_seqlock,
550 &timers_state.vm_clock_lock);
551 atomic_set__nocheck(&timers_state.qemu_icount_bias,
552 timers_state.qemu_icount_bias + warp);
553 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
554 &timers_state.vm_clock_lock);
555
556 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
557 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
558 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
559 }
560 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
561 }
562
563 void qemu_start_warp_timer(void)
564 {
565 int64_t clock;
566 int64_t deadline;
567
568 if (!use_icount) {
569 return;
570 }
571
572 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
573 * do not fire, so computing the deadline does not make sense.
574 */
575 if (!runstate_is_running()) {
576 return;
577 }
578
579 /* warp clock deterministically in record/replay mode */
580 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
581 return;
582 }
583
584 if (!all_cpu_threads_idle()) {
585 return;
586 }
587
588 if (qtest_enabled()) {
589 /* When testing, qtest commands advance icount. */
590 return;
591 }
592
593 /* We want to use the earliest deadline from ALL vm_clocks */
594 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
595 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
596 if (deadline < 0) {
597 static bool notified;
598 if (!icount_sleep && !notified) {
599 warn_report("icount sleep disabled and no active timers");
600 notified = true;
601 }
602 return;
603 }
604
605 if (deadline > 0) {
606 /*
607 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
608 * sleep. Otherwise, the CPU might be waiting for a future timer
609 * interrupt to wake it up, but the interrupt never comes because
610 * the vCPU isn't running any insns and thus doesn't advance the
611 * QEMU_CLOCK_VIRTUAL.
612 */
613 if (!icount_sleep) {
614 /*
615 * We never let VCPUs sleep in no sleep icount mode.
616 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
617 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
618 * It is useful when we want a deterministic execution time,
619 * isolated from host latencies.
620 */
621 seqlock_write_lock(&timers_state.vm_clock_seqlock,
622 &timers_state.vm_clock_lock);
623 atomic_set__nocheck(&timers_state.qemu_icount_bias,
624 timers_state.qemu_icount_bias + deadline);
625 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
626 &timers_state.vm_clock_lock);
627 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
628 } else {
629 /*
630 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
631 * "real" time, (related to the time left until the next event) has
632 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
633 * This avoids that the warps are visible externally; for example,
634 * you will not be sending network packets continuously instead of
635 * every 100ms.
636 */
637 seqlock_write_lock(&timers_state.vm_clock_seqlock,
638 &timers_state.vm_clock_lock);
639 if (timers_state.vm_clock_warp_start == -1
640 || timers_state.vm_clock_warp_start > clock) {
641 timers_state.vm_clock_warp_start = clock;
642 }
643 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
644 &timers_state.vm_clock_lock);
645 timer_mod_anticipate(timers_state.icount_warp_timer,
646 clock + deadline);
647 }
648 } else if (deadline == 0) {
649 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
650 }
651 }
652
653 static void qemu_account_warp_timer(void)
654 {
655 if (!use_icount || !icount_sleep) {
656 return;
657 }
658
659 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
660 * do not fire, so computing the deadline does not make sense.
661 */
662 if (!runstate_is_running()) {
663 return;
664 }
665
666 /* warp clock deterministically in record/replay mode */
667 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
668 return;
669 }
670
671 timer_del(timers_state.icount_warp_timer);
672 icount_warp_rt();
673 }
674
675 static bool icount_state_needed(void *opaque)
676 {
677 return use_icount;
678 }
679
680 static bool warp_timer_state_needed(void *opaque)
681 {
682 TimersState *s = opaque;
683 return s->icount_warp_timer != NULL;
684 }
685
686 static bool adjust_timers_state_needed(void *opaque)
687 {
688 TimersState *s = opaque;
689 return s->icount_rt_timer != NULL;
690 }
691
/*
 * The subsection for warp timer migration is optional, because the warp
 * timer may not have been created.
 */
695 static const VMStateDescription icount_vmstate_warp_timer = {
696 .name = "timer/icount/warp_timer",
697 .version_id = 1,
698 .minimum_version_id = 1,
699 .needed = warp_timer_state_needed,
700 .fields = (VMStateField[]) {
701 VMSTATE_INT64(vm_clock_warp_start, TimersState),
702 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
703 VMSTATE_END_OF_LIST()
704 }
705 };
706
707 static const VMStateDescription icount_vmstate_adjust_timers = {
708 .name = "timer/icount/timers",
709 .version_id = 1,
710 .minimum_version_id = 1,
711 .needed = adjust_timers_state_needed,
712 .fields = (VMStateField[]) {
713 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
714 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
715 VMSTATE_END_OF_LIST()
716 }
717 };
718
719 /*
720 * This is a subsection for icount migration.
721 */
722 static const VMStateDescription icount_vmstate_timers = {
723 .name = "timer/icount",
724 .version_id = 1,
725 .minimum_version_id = 1,
726 .needed = icount_state_needed,
727 .fields = (VMStateField[]) {
728 VMSTATE_INT64(qemu_icount_bias, TimersState),
729 VMSTATE_INT64(qemu_icount, TimersState),
730 VMSTATE_END_OF_LIST()
731 },
732 .subsections = (const VMStateDescription*[]) {
733 &icount_vmstate_warp_timer,
734 &icount_vmstate_adjust_timers,
735 NULL
736 }
737 };
738
739 static const VMStateDescription vmstate_timers = {
740 .name = "timer",
741 .version_id = 2,
742 .minimum_version_id = 1,
743 .fields = (VMStateField[]) {
744 VMSTATE_INT64(cpu_ticks_offset, TimersState),
745 VMSTATE_UNUSED(8),
746 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
747 VMSTATE_END_OF_LIST()
748 },
749 .subsections = (const VMStateDescription*[]) {
750 &icount_vmstate_timers,
751 NULL
752 }
753 };
754
755 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
756 {
757 double pct;
758 double throttle_ratio;
759 long sleeptime_ns;
760
761 if (!cpu_throttle_get_percentage()) {
762 return;
763 }
764
765 pct = (double)cpu_throttle_get_percentage()/100;
766 throttle_ratio = pct / (1 - pct);
767 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
768
769 qemu_mutex_unlock_iothread();
770 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
771 qemu_mutex_lock_iothread();
772 atomic_set(&cpu->throttle_thread_scheduled, 0);
773 }
774
775 static void cpu_throttle_timer_tick(void *opaque)
776 {
777 CPUState *cpu;
778 double pct;
779
780 /* Stop the timer if needed */
781 if (!cpu_throttle_get_percentage()) {
782 return;
783 }
784 CPU_FOREACH(cpu) {
785 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
786 async_run_on_cpu(cpu, cpu_throttle_thread,
787 RUN_ON_CPU_NULL);
788 }
789 }
790
791 pct = (double)cpu_throttle_get_percentage()/100;
792 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
793 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
794 }
795
796 void cpu_throttle_set(int new_throttle_pct)
797 {
798 /* Ensure throttle percentage is within valid range */
799 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
800 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
801
802 atomic_set(&throttle_percentage, new_throttle_pct);
803
804 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
805 CPU_THROTTLE_TIMESLICE_NS);
806 }
807
808 void cpu_throttle_stop(void)
809 {
810 atomic_set(&throttle_percentage, 0);
811 }
812
/* Throttling is active whenever the percentage is non-zero. */
bool cpu_throttle_active(void)
{
    int pct = cpu_throttle_get_percentage();

    return pct != 0;
}
817
818 int cpu_throttle_get_percentage(void)
819 {
820 return atomic_read(&throttle_percentage);
821 }
822
823 void cpu_ticks_init(void)
824 {
825 seqlock_init(&timers_state.vm_clock_seqlock);
826 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
827 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
828 cpu_throttle_timer_tick, NULL);
829 }
830
831 void configure_icount(QemuOpts *opts, Error **errp)
832 {
833 const char *option;
834 char *rem_str = NULL;
835
836 option = qemu_opt_get(opts, "shift");
837 if (!option) {
838 if (qemu_opt_get(opts, "align") != NULL) {
839 error_setg(errp, "Please specify shift option when using align");
840 }
841 return;
842 }
843
844 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
845 if (icount_sleep) {
846 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
847 icount_timer_cb, NULL);
848 }
849
850 icount_align_option = qemu_opt_get_bool(opts, "align", false);
851
852 if (icount_align_option && !icount_sleep) {
853 error_setg(errp, "align=on and sleep=off are incompatible");
854 }
855 if (strcmp(option, "auto") != 0) {
856 errno = 0;
857 timers_state.icount_time_shift = strtol(option, &rem_str, 0);
858 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
859 error_setg(errp, "icount: Invalid shift value");
860 }
861 use_icount = 1;
862 return;
863 } else if (icount_align_option) {
864 error_setg(errp, "shift=auto and align=on are incompatible");
865 } else if (!icount_sleep) {
866 error_setg(errp, "shift=auto and sleep=off are incompatible");
867 }
868
869 use_icount = 2;
870
871 /* 125MIPS seems a reasonable initial guess at the guest speed.
872 It will be corrected fairly quickly anyway. */
873 timers_state.icount_time_shift = 3;
874
875 /* Have both realtime and virtual time triggers for speed adjustment.
876 The realtime trigger catches emulated time passing too slowly,
877 the virtual time trigger catches emulated time passing too fast.
878 Realtime triggers occur even when idle, so use them less frequently
879 than VM triggers. */
880 timers_state.vm_clock_warp_start = -1;
881 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
882 icount_adjust_rt, NULL);
883 timer_mod(timers_state.icount_rt_timer,
884 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
885 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
886 icount_adjust_vm, NULL);
887 timer_mod(timers_state.icount_vm_timer,
888 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
889 NANOSECONDS_PER_SECOND / 10);
890 }
891
892 /***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */
903
904 static QEMUTimer *tcg_kick_vcpu_timer;
905 static CPUState *tcg_current_rr_cpu;
906
907 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
908
909 static inline int64_t qemu_tcg_next_kick(void)
910 {
911 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
912 }
913
914 /* Kick the currently round-robin scheduled vCPU */
915 static void qemu_cpu_kick_rr_cpu(void)
916 {
917 CPUState *cpu;
918 do {
919 cpu = atomic_mb_read(&tcg_current_rr_cpu);
920 if (cpu) {
921 cpu_exit(cpu);
922 }
923 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
924 }
925
926 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
927 {
928 }
929
930 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
931 {
932 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
933 qemu_notify_event();
934 return;
935 }
936
937 if (qemu_in_vcpu_thread()) {
938 /* A CPU is currently running; kick it back out to the
939 * tcg_cpu_exec() loop so it will recalculate its
940 * icount deadline immediately.
941 */
942 qemu_cpu_kick(current_cpu);
943 } else if (first_cpu) {
944 /* qemu_cpu_kick is not enough to kick a halted CPU out of
945 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
946 * causes cpu_thread_is_idle to return false. This way,
947 * handle_icount_deadline can run.
948 * If we have no CPUs at all for some reason, we don't
949 * need to do anything.
950 */
951 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
952 }
953 }
954
955 static void kick_tcg_thread(void *opaque)
956 {
957 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
958 qemu_cpu_kick_rr_cpu();
959 }
960
961 static void start_tcg_kick_timer(void)
962 {
963 assert(!mttcg_enabled);
964 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
965 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
966 kick_tcg_thread, NULL);
967 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
968 }
969 }
970
971 static void stop_tcg_kick_timer(void)
972 {
973 assert(!mttcg_enabled);
974 if (tcg_kick_vcpu_timer) {
975 timer_del(tcg_kick_vcpu_timer);
976 tcg_kick_vcpu_timer = NULL;
977 }
978 }
979
980 /***********************************************************/
981 void hw_error(const char *fmt, ...)
982 {
983 va_list ap;
984 CPUState *cpu;
985
986 va_start(ap, fmt);
987 fprintf(stderr, "qemu: hardware error: ");
988 vfprintf(stderr, fmt, ap);
989 fprintf(stderr, "\n");
990 CPU_FOREACH(cpu) {
991 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
992 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
993 }
994 va_end(ap);
995 abort();
996 }
997
998 void cpu_synchronize_all_states(void)
999 {
1000 CPUState *cpu;
1001
1002 CPU_FOREACH(cpu) {
1003 cpu_synchronize_state(cpu);
1004 /* TODO: move to cpu_synchronize_state() */
1005 if (hvf_enabled()) {
1006 hvf_cpu_synchronize_state(cpu);
1007 }
1008 }
1009 }
1010
1011 void cpu_synchronize_all_post_reset(void)
1012 {
1013 CPUState *cpu;
1014
1015 CPU_FOREACH(cpu) {
1016 cpu_synchronize_post_reset(cpu);
1017 /* TODO: move to cpu_synchronize_post_reset() */
1018 if (hvf_enabled()) {
1019 hvf_cpu_synchronize_post_reset(cpu);
1020 }
1021 }
1022 }
1023
1024 void cpu_synchronize_all_post_init(void)
1025 {
1026 CPUState *cpu;
1027
1028 CPU_FOREACH(cpu) {
1029 cpu_synchronize_post_init(cpu);
1030 /* TODO: move to cpu_synchronize_post_init() */
1031 if (hvf_enabled()) {
1032 hvf_cpu_synchronize_post_init(cpu);
1033 }
1034 }
1035 }
1036
1037 void cpu_synchronize_all_pre_loadvm(void)
1038 {
1039 CPUState *cpu;
1040
1041 CPU_FOREACH(cpu) {
1042 cpu_synchronize_pre_loadvm(cpu);
1043 }
1044 }
1045
1046 static int do_vm_stop(RunState state, bool send_stop)
1047 {
1048 int ret = 0;
1049
1050 if (runstate_is_running()) {
1051 cpu_disable_ticks();
1052 pause_all_vcpus();
1053 runstate_set(state);
1054 vm_state_notify(0, state);
1055 if (send_stop) {
1056 qapi_event_send_stop();
1057 }
1058 }
1059
1060 bdrv_drain_all();
1061 replay_disable_events();
1062 ret = bdrv_flush_all();
1063
1064 return ret;
1065 }
1066
1067 /* Special vm_stop() variant for terminating the process. Historically clients
1068 * did not expect a QMP STOP event and so we need to retain compatibility.
1069 */
1070 int vm_shutdown(void)
1071 {
1072 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1073 }
1074
1075 static bool cpu_can_run(CPUState *cpu)
1076 {
1077 if (cpu->stop) {
1078 return false;
1079 }
1080 if (cpu_is_stopped(cpu)) {
1081 return false;
1082 }
1083 return true;
1084 }
1085
1086 static void cpu_handle_guest_debug(CPUState *cpu)
1087 {
1088 gdb_set_stop_cpu(cpu);
1089 qemu_system_debug_request();
1090 cpu->stopped = true;
1091 }
1092
1093 #ifdef CONFIG_LINUX
1094 static void sigbus_reraise(void)
1095 {
1096 sigset_t set;
1097 struct sigaction action;
1098
1099 memset(&action, 0, sizeof(action));
1100 action.sa_handler = SIG_DFL;
1101 if (!sigaction(SIGBUS, &action, NULL)) {
1102 raise(SIGBUS);
1103 sigemptyset(&set);
1104 sigaddset(&set, SIGBUS);
1105 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1106 }
1107 perror("Failed to re-raise SIGBUS!\n");
1108 abort();
1109 }
1110
1111 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1112 {
1113 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1114 sigbus_reraise();
1115 }
1116
1117 if (current_cpu) {
1118 /* Called asynchronously in VCPU thread. */
1119 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1120 sigbus_reraise();
1121 }
1122 } else {
1123 /* Called synchronously (via signalfd) in main thread. */
1124 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1125 sigbus_reraise();
1126 }
1127 }
1128 }
1129
1130 static void qemu_init_sigbus(void)
1131 {
1132 struct sigaction action;
1133
1134 memset(&action, 0, sizeof(action));
1135 action.sa_flags = SA_SIGINFO;
1136 action.sa_sigaction = sigbus_handler;
1137 sigaction(SIGBUS, &action, NULL);
1138
1139 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1140 }
1141 #else /* !CONFIG_LINUX */
/* Variant for builds without CONFIG_LINUX: no SIGBUS setup is done. */
static void qemu_init_sigbus(void)
{
    /* nothing to do */
}
1145 #endif /* !CONFIG_LINUX */
1146
1147 static QemuMutex qemu_global_mutex;
1148
1149 static QemuThread io_thread;
1150
1151 /* cpu creation */
1152 static QemuCond qemu_cpu_cond;
1153 /* system init */
1154 static QemuCond qemu_pause_cond;
1155
1156 void qemu_init_cpu_loop(void)
1157 {
1158 qemu_init_sigbus();
1159 qemu_cond_init(&qemu_cpu_cond);
1160 qemu_cond_init(&qemu_pause_cond);
1161 qemu_mutex_init(&qemu_global_mutex);
1162
1163 qemu_thread_get_self(&io_thread);
1164 }
1165
1166 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1167 {
1168 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1169 }
1170
1171 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1172 {
1173 if (kvm_destroy_vcpu(cpu) < 0) {
1174 error_report("kvm_destroy_vcpu failed");
1175 exit(EXIT_FAILURE);
1176 }
1177 }
1178
1179 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1180 {
1181 }
1182
1183 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1184 {
1185 g_assert(qemu_cpu_is_self(cpu));
1186 cpu->stop = false;
1187 cpu->stopped = true;
1188 if (exit) {
1189 cpu_exit(cpu);
1190 }
1191 qemu_cond_broadcast(&qemu_pause_cond);
1192 }
1193
1194 static void qemu_wait_io_event_common(CPUState *cpu)
1195 {
1196 atomic_mb_set(&cpu->thread_kicked, false);
1197 if (cpu->stop) {
1198 qemu_cpu_stop(cpu, false);
1199 }
1200 process_queued_cpu_work(cpu);
1201 }
1202
1203 static void qemu_tcg_rr_wait_io_event(CPUState *cpu)
1204 {
1205 while (all_cpu_threads_idle()) {
1206 stop_tcg_kick_timer();
1207 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1208 }
1209
1210 start_tcg_kick_timer();
1211
1212 qemu_wait_io_event_common(cpu);
1213 }
1214
1215 static void qemu_wait_io_event(CPUState *cpu)
1216 {
1217 while (cpu_thread_is_idle(cpu)) {
1218 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1219 }
1220
1221 #ifdef _WIN32
1222 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1223 if (!tcg_enabled()) {
1224 SleepEx(0, TRUE);
1225 }
1226 #endif
1227 qemu_wait_io_event_common(cpu);
1228 }
1229
1230 static void *qemu_kvm_cpu_thread_fn(void *arg)
1231 {
1232 CPUState *cpu = arg;
1233 int r;
1234
1235 rcu_register_thread();
1236
1237 qemu_mutex_lock_iothread();
1238 qemu_thread_get_self(cpu->thread);
1239 cpu->thread_id = qemu_get_thread_id();
1240 cpu->can_do_io = 1;
1241 current_cpu = cpu;
1242
1243 r = kvm_init_vcpu(cpu);
1244 if (r < 0) {
1245 error_report("kvm_init_vcpu failed: %s", strerror(-r));
1246 exit(1);
1247 }
1248
1249 kvm_init_cpu_signals(cpu);
1250
1251 /* signal CPU creation */
1252 cpu->created = true;
1253 qemu_cond_signal(&qemu_cpu_cond);
1254
1255 do {
1256 if (cpu_can_run(cpu)) {
1257 r = kvm_cpu_exec(cpu);
1258 if (r == EXCP_DEBUG) {
1259 cpu_handle_guest_debug(cpu);
1260 }
1261 }
1262 qemu_wait_io_event(cpu);
1263 } while (!cpu->unplug || cpu_can_run(cpu));
1264
1265 qemu_kvm_destroy_vcpu(cpu);
1266 cpu->created = false;
1267 qemu_cond_signal(&qemu_cpu_cond);
1268 qemu_mutex_unlock_iothread();
1269 rcu_unregister_thread();
1270 return NULL;
1271 }
1272
1273 static void *qemu_dummy_cpu_thread_fn(void *arg)
1274 {
1275 #ifdef _WIN32
1276 error_report("qtest is not supported under Windows");
1277 exit(1);
1278 #else
1279 CPUState *cpu = arg;
1280 sigset_t waitset;
1281 int r;
1282
1283 rcu_register_thread();
1284
1285 qemu_mutex_lock_iothread();
1286 qemu_thread_get_self(cpu->thread);
1287 cpu->thread_id = qemu_get_thread_id();
1288 cpu->can_do_io = 1;
1289 current_cpu = cpu;
1290
1291 sigemptyset(&waitset);
1292 sigaddset(&waitset, SIG_IPI);
1293
1294 /* signal CPU creation */
1295 cpu->created = true;
1296 qemu_cond_signal(&qemu_cpu_cond);
1297
1298 do {
1299 qemu_mutex_unlock_iothread();
1300 do {
1301 int sig;
1302 r = sigwait(&waitset, &sig);
1303 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1304 if (r == -1) {
1305 perror("sigwait");
1306 exit(1);
1307 }
1308 qemu_mutex_lock_iothread();
1309 qemu_wait_io_event(cpu);
1310 } while (!cpu->unplug);
1311
1312 rcu_unregister_thread();
1313 return NULL;
1314 #endif
1315 }
1316
1317 static int64_t tcg_get_icount_limit(void)
1318 {
1319 int64_t deadline;
1320
1321 if (replay_mode != REPLAY_MODE_PLAY) {
1322 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1323
1324 /* Maintain prior (possibly buggy) behaviour where if no deadline
1325 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1326 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1327 * nanoseconds.
1328 */
1329 if ((deadline < 0) || (deadline > INT32_MAX)) {
1330 deadline = INT32_MAX;
1331 }
1332
1333 return qemu_icount_round(deadline);
1334 } else {
1335 return replay_get_instructions();
1336 }
1337 }
1338
1339 static void handle_icount_deadline(void)
1340 {
1341 assert(qemu_in_vcpu_thread());
1342 if (use_icount) {
1343 int64_t deadline =
1344 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1345
1346 if (deadline == 0) {
1347 /* Wake up other AioContexts. */
1348 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1349 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1350 }
1351 }
1352 }
1353
1354 static void prepare_icount_for_run(CPUState *cpu)
1355 {
1356 if (use_icount) {
1357 int insns_left;
1358
1359 /* These should always be cleared by process_icount_data after
1360 * each vCPU execution. However u16.high can be raised
1361 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1362 */
1363 g_assert(cpu->icount_decr.u16.low == 0);
1364 g_assert(cpu->icount_extra == 0);
1365
1366 cpu->icount_budget = tcg_get_icount_limit();
1367 insns_left = MIN(0xffff, cpu->icount_budget);
1368 cpu->icount_decr.u16.low = insns_left;
1369 cpu->icount_extra = cpu->icount_budget - insns_left;
1370
1371 replay_mutex_lock();
1372 }
1373 }
1374
1375 static void process_icount_data(CPUState *cpu)
1376 {
1377 if (use_icount) {
1378 /* Account for executed instructions */
1379 cpu_update_icount(cpu);
1380
1381 /* Reset the counters */
1382 cpu->icount_decr.u16.low = 0;
1383 cpu->icount_extra = 0;
1384 cpu->icount_budget = 0;
1385
1386 replay_account_executed_instructions();
1387
1388 replay_mutex_unlock();
1389 }
1390 }
1391
1392
1393 static int tcg_cpu_exec(CPUState *cpu)
1394 {
1395 int ret;
1396 #ifdef CONFIG_PROFILER
1397 int64_t ti;
1398 #endif
1399
1400 assert(tcg_enabled());
1401 #ifdef CONFIG_PROFILER
1402 ti = profile_getclock();
1403 #endif
1404 cpu_exec_start(cpu);
1405 ret = cpu_exec(cpu);
1406 cpu_exec_end(cpu);
1407 #ifdef CONFIG_PROFILER
1408 tcg_time += profile_getclock() - ti;
1409 #endif
1410 return ret;
1411 }
1412
1413 /* Destroy any remaining vCPUs which have been unplugged and have
1414 * finished running
1415 */
1416 static void deal_with_unplugged_cpus(void)
1417 {
1418 CPUState *cpu;
1419
1420 CPU_FOREACH(cpu) {
1421 if (cpu->unplug && !cpu_can_run(cpu)) {
1422 qemu_tcg_destroy_vcpu(cpu);
1423 cpu->created = false;
1424 qemu_cond_signal(&qemu_cpu_cond);
1425 break;
1426 }
1427 }
1428 }
1429
1430 /* Single-threaded TCG
1431 *
1432 * In the single-threaded case each vCPU is simulated in turn. If
1433 * there is more than a single vCPU we create a simple timer to kick
1434 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1435 * This is done explicitly rather than relying on side-effects
1436 * elsewhere.
1437 */
1438
/*
 * Thread function of the single thread shared by all vCPUs in
 * round-robin TCG mode.
 *
 * @arg is the CPUState the thread was created for.  After start-up the
 * local @cpu variable is reused as the cursor over the CPU list.
 *
 * The main loop has no exit path, so the statements following it are
 * not reached.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    /* Announce creation to the thread waiting in qemu_init_vcpu(). */
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /*
         * Drop the iothread lock before taking the replay mutex, then
         * take the iothread lock again.
         */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* The cursor ran off the end of the list: start over. */
        if (!cpu) {
            cpu = first_cpu;
        }

        /*
         * Run vCPUs in turn until one has queued work or an exit
         * request, or the end of the list is reached.
         */
        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            /* The virtual clock is disabled while SSTEP_NOTIMER is set. */
            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code executes without the iothread lock held. */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* An unplugged vCPU is skipped before leaving the loop. */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        qemu_tcg_rr_wait_io_event(cpu ? cpu : first_cpu);
        deal_with_unplugged_cpus();
    }

    rcu_unregister_thread();
    return NULL;
}
1543
1544 static void *qemu_hax_cpu_thread_fn(void *arg)
1545 {
1546 CPUState *cpu = arg;
1547 int r;
1548
1549 rcu_register_thread();
1550 qemu_mutex_lock_iothread();
1551 qemu_thread_get_self(cpu->thread);
1552
1553 cpu->thread_id = qemu_get_thread_id();
1554 cpu->created = true;
1555 cpu->halted = 0;
1556 current_cpu = cpu;
1557
1558 hax_init_vcpu(cpu);
1559 qemu_cond_signal(&qemu_cpu_cond);
1560
1561 do {
1562 if (cpu_can_run(cpu)) {
1563 r = hax_smp_cpu_exec(cpu);
1564 if (r == EXCP_DEBUG) {
1565 cpu_handle_guest_debug(cpu);
1566 }
1567 }
1568
1569 qemu_wait_io_event(cpu);
1570 } while (!cpu->unplug || cpu_can_run(cpu));
1571 rcu_unregister_thread();
1572 return NULL;
1573 }
1574
1575 /* The HVF-specific vCPU thread function. This one should only run when the host
1576 * CPU supports the VMX "unrestricted guest" feature. */
1577 static void *qemu_hvf_cpu_thread_fn(void *arg)
1578 {
1579 CPUState *cpu = arg;
1580
1581 int r;
1582
1583 assert(hvf_enabled());
1584
1585 rcu_register_thread();
1586
1587 qemu_mutex_lock_iothread();
1588 qemu_thread_get_self(cpu->thread);
1589
1590 cpu->thread_id = qemu_get_thread_id();
1591 cpu->can_do_io = 1;
1592 current_cpu = cpu;
1593
1594 hvf_init_vcpu(cpu);
1595
1596 /* signal CPU creation */
1597 cpu->created = true;
1598 qemu_cond_signal(&qemu_cpu_cond);
1599
1600 do {
1601 if (cpu_can_run(cpu)) {
1602 r = hvf_vcpu_exec(cpu);
1603 if (r == EXCP_DEBUG) {
1604 cpu_handle_guest_debug(cpu);
1605 }
1606 }
1607 qemu_wait_io_event(cpu);
1608 } while (!cpu->unplug || cpu_can_run(cpu));
1609
1610 hvf_vcpu_destroy(cpu);
1611 cpu->created = false;
1612 qemu_cond_signal(&qemu_cpu_cond);
1613 qemu_mutex_unlock_iothread();
1614 rcu_unregister_thread();
1615 return NULL;
1616 }
1617
1618 static void *qemu_whpx_cpu_thread_fn(void *arg)
1619 {
1620 CPUState *cpu = arg;
1621 int r;
1622
1623 rcu_register_thread();
1624
1625 qemu_mutex_lock_iothread();
1626 qemu_thread_get_self(cpu->thread);
1627 cpu->thread_id = qemu_get_thread_id();
1628 current_cpu = cpu;
1629
1630 r = whpx_init_vcpu(cpu);
1631 if (r < 0) {
1632 fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1633 exit(1);
1634 }
1635
1636 /* signal CPU creation */
1637 cpu->created = true;
1638 qemu_cond_signal(&qemu_cpu_cond);
1639
1640 do {
1641 if (cpu_can_run(cpu)) {
1642 r = whpx_vcpu_exec(cpu);
1643 if (r == EXCP_DEBUG) {
1644 cpu_handle_guest_debug(cpu);
1645 }
1646 }
1647 while (cpu_thread_is_idle(cpu)) {
1648 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1649 }
1650 qemu_wait_io_event_common(cpu);
1651 } while (!cpu->unplug || cpu_can_run(cpu));
1652
1653 whpx_destroy_vcpu(cpu);
1654 cpu->created = false;
1655 qemu_cond_signal(&qemu_cpu_cond);
1656 qemu_mutex_unlock_iothread();
1657 rcu_unregister_thread();
1658 return NULL;
1659 }
1660
1661 #ifdef _WIN32
1662 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1663 {
1664 }
1665 #endif
1666
1667 /* Multi-threaded TCG
1668 *
1669 * In the multi-threaded case each vCPU has its own thread. The TLS
1670 * variable current_cpu can be used deep in the code to find the
1671 * current CPUState for a given thread.
1672 */
1673
1674 static void *qemu_tcg_cpu_thread_fn(void *arg)
1675 {
1676 CPUState *cpu = arg;
1677
1678 assert(tcg_enabled());
1679 g_assert(!use_icount);
1680
1681 rcu_register_thread();
1682 tcg_register_thread();
1683
1684 qemu_mutex_lock_iothread();
1685 qemu_thread_get_self(cpu->thread);
1686
1687 cpu->thread_id = qemu_get_thread_id();
1688 cpu->created = true;
1689 cpu->can_do_io = 1;
1690 current_cpu = cpu;
1691 qemu_cond_signal(&qemu_cpu_cond);
1692
1693 /* process any pending work */
1694 cpu->exit_request = 1;
1695
1696 do {
1697 if (cpu_can_run(cpu)) {
1698 int r;
1699 qemu_mutex_unlock_iothread();
1700 r = tcg_cpu_exec(cpu);
1701 qemu_mutex_lock_iothread();
1702 switch (r) {
1703 case EXCP_DEBUG:
1704 cpu_handle_guest_debug(cpu);
1705 break;
1706 case EXCP_HALTED:
1707 /* during start-up the vCPU is reset and the thread is
1708 * kicked several times. If we don't ensure we go back
1709 * to sleep in the halted state we won't cleanly
1710 * start-up when the vCPU is enabled.
1711 *
1712 * cpu->halted should ensure we sleep in wait_io_event
1713 */
1714 g_assert(cpu->halted);
1715 break;
1716 case EXCP_ATOMIC:
1717 qemu_mutex_unlock_iothread();
1718 cpu_exec_step_atomic(cpu);
1719 qemu_mutex_lock_iothread();
1720 default:
1721 /* Ignore everything else? */
1722 break;
1723 }
1724 }
1725
1726 atomic_mb_set(&cpu->exit_request, 0);
1727 qemu_wait_io_event(cpu);
1728 } while (!cpu->unplug || cpu_can_run(cpu));
1729
1730 qemu_tcg_destroy_vcpu(cpu);
1731 cpu->created = false;
1732 qemu_cond_signal(&qemu_cpu_cond);
1733 qemu_mutex_unlock_iothread();
1734 rcu_unregister_thread();
1735 return NULL;
1736 }
1737
1738 static void qemu_cpu_kick_thread(CPUState *cpu)
1739 {
1740 #ifndef _WIN32
1741 int err;
1742
1743 if (cpu->thread_kicked) {
1744 return;
1745 }
1746 cpu->thread_kicked = true;
1747 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1748 if (err) {
1749 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1750 exit(1);
1751 }
1752 #else /* _WIN32 */
1753 if (!qemu_cpu_is_self(cpu)) {
1754 if (whpx_enabled()) {
1755 whpx_vcpu_kick(cpu);
1756 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1757 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1758 __func__, GetLastError());
1759 exit(1);
1760 }
1761 }
1762 #endif
1763 }
1764
1765 void qemu_cpu_kick(CPUState *cpu)
1766 {
1767 qemu_cond_broadcast(cpu->halt_cond);
1768 if (tcg_enabled()) {
1769 cpu_exit(cpu);
1770 /* NOP unless doing single-thread RR */
1771 qemu_cpu_kick_rr_cpu();
1772 } else {
1773 if (hax_enabled()) {
1774 /*
1775 * FIXME: race condition with the exit_request check in
1776 * hax_vcpu_hax_exec
1777 */
1778 cpu->exit_request = 1;
1779 }
1780 qemu_cpu_kick_thread(cpu);
1781 }
1782 }
1783
1784 void qemu_cpu_kick_self(void)
1785 {
1786 assert(current_cpu);
1787 qemu_cpu_kick_thread(current_cpu);
1788 }
1789
1790 bool qemu_cpu_is_self(CPUState *cpu)
1791 {
1792 return qemu_thread_is_self(cpu->thread);
1793 }
1794
1795 bool qemu_in_vcpu_thread(void)
1796 {
1797 return current_cpu && qemu_cpu_is_self(current_cpu);
1798 }
1799
/*
 * Per-thread flag: true while the calling thread holds the iothread
 * lock.  Maintained by qemu_mutex_lock_iothread_impl() and
 * qemu_mutex_unlock_iothread().
 */
static __thread bool iothread_locked;

/* Return true if the calling thread currently holds the iothread lock. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1806
1807 /*
1808 * The BQL is taken from so many places that it is worth profiling the
1809 * callers directly, instead of funneling them all through a single function.
1810 */
1811 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1812 {
1813 QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1814
1815 g_assert(!qemu_mutex_iothread_locked());
1816 bql_lock(&qemu_global_mutex, file, line);
1817 iothread_locked = true;
1818 }
1819
1820 void qemu_mutex_unlock_iothread(void)
1821 {
1822 g_assert(qemu_mutex_iothread_locked());
1823 iothread_locked = false;
1824 qemu_mutex_unlock(&qemu_global_mutex);
1825 }
1826
1827 static bool all_vcpus_paused(void)
1828 {
1829 CPUState *cpu;
1830
1831 CPU_FOREACH(cpu) {
1832 if (!cpu->stopped) {
1833 return false;
1834 }
1835 }
1836
1837 return true;
1838 }
1839
1840 void pause_all_vcpus(void)
1841 {
1842 CPUState *cpu;
1843
1844 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1845 CPU_FOREACH(cpu) {
1846 if (qemu_cpu_is_self(cpu)) {
1847 qemu_cpu_stop(cpu, true);
1848 } else {
1849 cpu->stop = true;
1850 qemu_cpu_kick(cpu);
1851 }
1852 }
1853
1854 /* We need to drop the replay_lock so any vCPU threads woken up
1855 * can finish their replay tasks
1856 */
1857 replay_mutex_unlock();
1858
1859 while (!all_vcpus_paused()) {
1860 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1861 CPU_FOREACH(cpu) {
1862 qemu_cpu_kick(cpu);
1863 }
1864 }
1865
1866 qemu_mutex_unlock_iothread();
1867 replay_mutex_lock();
1868 qemu_mutex_lock_iothread();
1869 }
1870
1871 void cpu_resume(CPUState *cpu)
1872 {
1873 cpu->stop = false;
1874 cpu->stopped = false;
1875 qemu_cpu_kick(cpu);
1876 }
1877
1878 void resume_all_vcpus(void)
1879 {
1880 CPUState *cpu;
1881
1882 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1883 CPU_FOREACH(cpu) {
1884 cpu_resume(cpu);
1885 }
1886 }
1887
1888 void cpu_remove_sync(CPUState *cpu)
1889 {
1890 cpu->stop = true;
1891 cpu->unplug = true;
1892 qemu_cpu_kick(cpu);
1893 qemu_mutex_unlock_iothread();
1894 qemu_thread_join(cpu->thread);
1895 qemu_mutex_lock_iothread();
1896 }
1897
/*
 * Size, in bytes, of the temporary buffers used to format vCPU thread
 * names before they are passed to qemu_thread_create().
 */
#define VCPU_THREAD_NAME_SIZE 16
1900
1901 static void qemu_tcg_init_vcpu(CPUState *cpu)
1902 {
1903 char thread_name[VCPU_THREAD_NAME_SIZE];
1904 static QemuCond *single_tcg_halt_cond;
1905 static QemuThread *single_tcg_cpu_thread;
1906 static int tcg_region_inited;
1907
1908 assert(tcg_enabled());
1909 /*
1910 * Initialize TCG regions--once. Now is a good time, because:
1911 * (1) TCG's init context, prologue and target globals have been set up.
1912 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1913 * -accel flag is processed, so the check doesn't work then).
1914 */
1915 if (!tcg_region_inited) {
1916 tcg_region_inited = 1;
1917 tcg_region_init();
1918 }
1919
1920 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1921 cpu->thread = g_malloc0(sizeof(QemuThread));
1922 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1923 qemu_cond_init(cpu->halt_cond);
1924
1925 if (qemu_tcg_mttcg_enabled()) {
1926 /* create a thread per vCPU with TCG (MTTCG) */
1927 parallel_cpus = true;
1928 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1929 cpu->cpu_index);
1930
1931 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1932 cpu, QEMU_THREAD_JOINABLE);
1933
1934 } else {
1935 /* share a single thread for all cpus with TCG */
1936 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1937 qemu_thread_create(cpu->thread, thread_name,
1938 qemu_tcg_rr_cpu_thread_fn,
1939 cpu, QEMU_THREAD_JOINABLE);
1940
1941 single_tcg_halt_cond = cpu->halt_cond;
1942 single_tcg_cpu_thread = cpu->thread;
1943 }
1944 #ifdef _WIN32
1945 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1946 #endif
1947 } else {
1948 /* For non-MTTCG cases we share the thread */
1949 cpu->thread = single_tcg_cpu_thread;
1950 cpu->halt_cond = single_tcg_halt_cond;
1951 cpu->thread_id = first_cpu->thread_id;
1952 cpu->can_do_io = 1;
1953 cpu->created = true;
1954 }
1955 }
1956
1957 static void qemu_hax_start_vcpu(CPUState *cpu)
1958 {
1959 char thread_name[VCPU_THREAD_NAME_SIZE];
1960
1961 cpu->thread = g_malloc0(sizeof(QemuThread));
1962 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1963 qemu_cond_init(cpu->halt_cond);
1964
1965 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1966 cpu->cpu_index);
1967 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1968 cpu, QEMU_THREAD_JOINABLE);
1969 #ifdef _WIN32
1970 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1971 #endif
1972 }
1973
1974 static void qemu_kvm_start_vcpu(CPUState *cpu)
1975 {
1976 char thread_name[VCPU_THREAD_NAME_SIZE];
1977
1978 cpu->thread = g_malloc0(sizeof(QemuThread));
1979 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1980 qemu_cond_init(cpu->halt_cond);
1981 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1982 cpu->cpu_index);
1983 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1984 cpu, QEMU_THREAD_JOINABLE);
1985 }
1986
1987 static void qemu_hvf_start_vcpu(CPUState *cpu)
1988 {
1989 char thread_name[VCPU_THREAD_NAME_SIZE];
1990
1991 /* HVF currently does not support TCG, and only runs in
1992 * unrestricted-guest mode. */
1993 assert(hvf_enabled());
1994
1995 cpu->thread = g_malloc0(sizeof(QemuThread));
1996 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1997 qemu_cond_init(cpu->halt_cond);
1998
1999 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2000 cpu->cpu_index);
2001 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2002 cpu, QEMU_THREAD_JOINABLE);
2003 }
2004
2005 static void qemu_whpx_start_vcpu(CPUState *cpu)
2006 {
2007 char thread_name[VCPU_THREAD_NAME_SIZE];
2008
2009 cpu->thread = g_malloc0(sizeof(QemuThread));
2010 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2011 qemu_cond_init(cpu->halt_cond);
2012 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2013 cpu->cpu_index);
2014 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2015 cpu, QEMU_THREAD_JOINABLE);
2016 #ifdef _WIN32
2017 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2018 #endif
2019 }
2020
2021 static void qemu_dummy_start_vcpu(CPUState *cpu)
2022 {
2023 char thread_name[VCPU_THREAD_NAME_SIZE];
2024
2025 cpu->thread = g_malloc0(sizeof(QemuThread));
2026 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2027 qemu_cond_init(cpu->halt_cond);
2028 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2029 cpu->cpu_index);
2030 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2031 QEMU_THREAD_JOINABLE);
2032 }
2033
2034 void qemu_init_vcpu(CPUState *cpu)
2035 {
2036 cpu->nr_cores = smp_cores;
2037 cpu->nr_threads = smp_threads;
2038 cpu->stopped = true;
2039
2040 if (!cpu->as) {
2041 /* If the target cpu hasn't set up any address spaces itself,
2042 * give it the default one.
2043 */
2044 cpu->num_ases = 1;
2045 cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2046 }
2047
2048 if (kvm_enabled()) {
2049 qemu_kvm_start_vcpu(cpu);
2050 } else if (hax_enabled()) {
2051 qemu_hax_start_vcpu(cpu);
2052 } else if (hvf_enabled()) {
2053 qemu_hvf_start_vcpu(cpu);
2054 } else if (tcg_enabled()) {
2055 qemu_tcg_init_vcpu(cpu);
2056 } else if (whpx_enabled()) {
2057 qemu_whpx_start_vcpu(cpu);
2058 } else {
2059 qemu_dummy_start_vcpu(cpu);
2060 }
2061
2062 while (!cpu->created) {
2063 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2064 }
2065 }
2066
2067 void cpu_stop_current(void)
2068 {
2069 if (current_cpu) {
2070 qemu_cpu_stop(current_cpu, true);
2071 }
2072 }
2073
2074 int vm_stop(RunState state)
2075 {
2076 if (qemu_in_vcpu_thread()) {
2077 qemu_system_vmstop_request_prepare();
2078 qemu_system_vmstop_request(state);
2079 /*
2080 * FIXME: should not return to device code in case
2081 * vm_stop() has been requested.
2082 */
2083 cpu_stop_current();
2084 return 0;
2085 }
2086
2087 return do_vm_stop(state, true);
2088 }
2089
2090 /**
2091 * Prepare for (re)starting the VM.
2092 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2093 * running or in case of an error condition), 0 otherwise.
2094 */
2095 int vm_prepare_start(void)
2096 {
2097 RunState requested;
2098
2099 qemu_vmstop_requested(&requested);
2100 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2101 return -1;
2102 }
2103
2104 /* Ensure that a STOP/RESUME pair of events is emitted if a
2105 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2106 * example, according to documentation is always followed by
2107 * the STOP event.
2108 */
2109 if (runstate_is_running()) {
2110 qapi_event_send_stop();
2111 qapi_event_send_resume();
2112 return -1;
2113 }
2114
2115 /* We are sending this now, but the CPUs will be resumed shortly later */
2116 qapi_event_send_resume();
2117
2118 replay_enable_events();
2119 cpu_enable_ticks();
2120 runstate_set(RUN_STATE_RUNNING);
2121 vm_state_notify(1, RUN_STATE_RUNNING);
2122 return 0;
2123 }
2124
/*
 * Start (or resume) the VM: resume all vCPUs when vm_prepare_start()
 * returns 0.
 */
void vm_start(void)
{
    if (vm_prepare_start() != 0) {
        return;
    }
    resume_all_vcpus();
}
2131
2132 /* does a state transition even if the VM is already stopped,
2133 current state is forgotten forever */
2134 int vm_stop_force_state(RunState state)
2135 {
2136 if (runstate_is_running()) {
2137 return vm_stop(state);
2138 } else {
2139 runstate_set(state);
2140
2141 bdrv_drain_all();
2142 /* Make sure to return an error if the flush in a previous vm_stop()
2143 * failed. */
2144 return bdrv_flush_all();
2145 }
2146 }
2147
2148 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
2149 {
2150 /* XXX: implement xxx_cpu_list for targets that still miss it */
2151 #if defined(cpu_list)
2152 cpu_list(f, cpu_fprintf);
2153 #endif
2154 }
2155
2156 CpuInfoList *qmp_query_cpus(Error **errp)
2157 {
2158 MachineState *ms = MACHINE(qdev_get_machine());
2159 MachineClass *mc = MACHINE_GET_CLASS(ms);
2160 CpuInfoList *head = NULL, *cur_item = NULL;
2161 CPUState *cpu;
2162
2163 CPU_FOREACH(cpu) {
2164 CpuInfoList *info;
2165 #if defined(TARGET_I386)
2166 X86CPU *x86_cpu = X86_CPU(cpu);
2167 CPUX86State *env = &x86_cpu->env;
2168 #elif defined(TARGET_PPC)
2169 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2170 CPUPPCState *env = &ppc_cpu->env;
2171 #elif defined(TARGET_SPARC)
2172 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2173 CPUSPARCState *env = &sparc_cpu->env;
2174 #elif defined(TARGET_RISCV)
2175 RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2176 CPURISCVState *env = &riscv_cpu->env;
2177 #elif defined(TARGET_MIPS)
2178 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2179 CPUMIPSState *env = &mips_cpu->env;
2180 #elif defined(TARGET_TRICORE)
2181 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2182 CPUTriCoreState *env = &tricore_cpu->env;
2183 #elif defined(TARGET_S390X)
2184 S390CPU *s390_cpu = S390_CPU(cpu);
2185 CPUS390XState *env = &s390_cpu->env;
2186 #endif
2187
2188 cpu_synchronize_state(cpu);
2189
2190 info = g_malloc0(sizeof(*info));
2191 info->value = g_malloc0(sizeof(*info->value));
2192 info->value->CPU = cpu->cpu_index;
2193 info->value->current = (cpu == first_cpu);
2194 info->value->halted = cpu->halted;
2195 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2196 info->value->thread_id = cpu->thread_id;
2197 #if defined(TARGET_I386)
2198 info->value->arch = CPU_INFO_ARCH_X86;
2199 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2200 #elif defined(TARGET_PPC)
2201 info->value->arch = CPU_INFO_ARCH_PPC;
2202 info->value->u.ppc.nip = env->nip;
2203 #elif defined(TARGET_SPARC)
2204 info->value->arch = CPU_INFO_ARCH_SPARC;
2205 info->value->u.q_sparc.pc = env->pc;
2206 info->value->u.q_sparc.npc = env->npc;
2207 #elif defined(TARGET_MIPS)
2208 info->value->arch = CPU_INFO_ARCH_MIPS;
2209 info->value->u.q_mips.PC = env->active_tc.PC;
2210 #elif defined(TARGET_TRICORE)
2211 info->value->arch = CPU_INFO_ARCH_TRICORE;
2212 info->value->u.tricore.PC = env->PC;
2213 #elif defined(TARGET_S390X)
2214 info->value->arch = CPU_INFO_ARCH_S390;
2215 info->value->u.s390.cpu_state = env->cpu_state;
2216 #elif defined(TARGET_RISCV)
2217 info->value->arch = CPU_INFO_ARCH_RISCV;
2218 info->value->u.riscv.pc = env->pc;
2219 #else
2220 info->value->arch = CPU_INFO_ARCH_OTHER;
2221 #endif
2222 info->value->has_props = !!mc->cpu_index_to_instance_props;
2223 if (info->value->has_props) {
2224 CpuInstanceProperties *props;
2225 props = g_malloc0(sizeof(*props));
2226 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2227 info->value->props = props;
2228 }
2229
2230 /* XXX: waiting for the qapi to support GSList */
2231 if (!cur_item) {
2232 head = cur_item = info;
2233 } else {
2234 cur_item->next = info;
2235 cur_item = info;
2236 }
2237 }
2238
2239 return head;
2240 }
2241
2242 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2243 {
2244 /*
2245 * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2246 * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2247 */
2248 switch (target) {
2249 case SYS_EMU_TARGET_I386:
2250 case SYS_EMU_TARGET_X86_64:
2251 return CPU_INFO_ARCH_X86;
2252
2253 case SYS_EMU_TARGET_PPC:
2254 case SYS_EMU_TARGET_PPCEMB:
2255 case SYS_EMU_TARGET_PPC64:
2256 return CPU_INFO_ARCH_PPC;
2257
2258 case SYS_EMU_TARGET_SPARC:
2259 case SYS_EMU_TARGET_SPARC64:
2260 return CPU_INFO_ARCH_SPARC;
2261
2262 case SYS_EMU_TARGET_MIPS:
2263 case SYS_EMU_TARGET_MIPSEL:
2264 case SYS_EMU_TARGET_MIPS64:
2265 case SYS_EMU_TARGET_MIPS64EL:
2266 return CPU_INFO_ARCH_MIPS;
2267
2268 case SYS_EMU_TARGET_TRICORE:
2269 return CPU_INFO_ARCH_TRICORE;
2270
2271 case SYS_EMU_TARGET_S390X:
2272 return CPU_INFO_ARCH_S390;
2273
2274 case SYS_EMU_TARGET_RISCV32:
2275 case SYS_EMU_TARGET_RISCV64:
2276 return CPU_INFO_ARCH_RISCV;
2277
2278 default:
2279 return CPU_INFO_ARCH_OTHER;
2280 }
2281 }
2282
2283 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2284 {
2285 #ifdef TARGET_S390X
2286 S390CPU *s390_cpu = S390_CPU(cpu);
2287 CPUS390XState *env = &s390_cpu->env;
2288
2289 info->cpu_state = env->cpu_state;
2290 #else
2291 abort();
2292 #endif
2293 }
2294
2295 /*
2296 * fast means: we NEVER interrupt vCPU threads to retrieve
2297 * information from KVM.
2298 */
2299 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2300 {
2301 MachineState *ms = MACHINE(qdev_get_machine());
2302 MachineClass *mc = MACHINE_GET_CLASS(ms);
2303 CpuInfoFastList *head = NULL, *cur_item = NULL;
2304 SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2305 -1, &error_abort);
2306 CPUState *cpu;
2307
2308 CPU_FOREACH(cpu) {
2309 CpuInfoFastList *info = g_malloc0(sizeof(*info));
2310 info->value = g_malloc0(sizeof(*info->value));
2311
2312 info->value->cpu_index = cpu->cpu_index;
2313 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2314 info->value->thread_id = cpu->thread_id;
2315
2316 info->value->has_props = !!mc->cpu_index_to_instance_props;
2317 if (info->value->has_props) {
2318 CpuInstanceProperties *props;
2319 props = g_malloc0(sizeof(*props));
2320 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2321 info->value->props = props;
2322 }
2323
2324 info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2325 info->value->target = target;
2326 if (target == SYS_EMU_TARGET_S390X) {
2327 cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2328 }
2329
2330 if (!cur_item) {
2331 head = cur_item = info;
2332 } else {
2333 cur_item->next = info;
2334 cur_item = info;
2335 }
2336 }
2337
2338 return head;
2339 }
2340
2341 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2342 bool has_cpu, int64_t cpu_index, Error **errp)
2343 {
2344 FILE *f;
2345 uint32_t l;
2346 CPUState *cpu;
2347 uint8_t buf[1024];
2348 int64_t orig_addr = addr, orig_size = size;
2349
2350 if (!has_cpu) {
2351 cpu_index = 0;
2352 }
2353
2354 cpu = qemu_get_cpu(cpu_index);
2355 if (cpu == NULL) {
2356 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2357 "a CPU number");
2358 return;
2359 }
2360
2361 f = fopen(filename, "wb");
2362 if (!f) {
2363 error_setg_file_open(errp, errno, filename);
2364 return;
2365 }
2366
2367 while (size != 0) {
2368 l = sizeof(buf);
2369 if (l > size)
2370 l = size;
2371 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2372 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2373 " specified", orig_addr, orig_size);
2374 goto exit;
2375 }
2376 if (fwrite(buf, 1, l, f) != l) {
2377 error_setg(errp, QERR_IO_ERROR);
2378 goto exit;
2379 }
2380 addr += l;
2381 size -= l;
2382 }
2383
2384 exit:
2385 fclose(f);
2386 }
2387
2388 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2389 Error **errp)
2390 {
2391 FILE *f;
2392 uint32_t l;
2393 uint8_t buf[1024];
2394
2395 f = fopen(filename, "wb");
2396 if (!f) {
2397 error_setg_file_open(errp, errno, filename);
2398 return;
2399 }
2400
2401 while (size != 0) {
2402 l = sizeof(buf);
2403 if (l > size)
2404 l = size;
2405 cpu_physical_memory_read(addr, buf, l);
2406 if (fwrite(buf, 1, l, f) != l) {
2407 error_setg(errp, QERR_IO_ERROR);
2408 goto exit;
2409 }
2410 addr += l;
2411 size -= l;
2412 }
2413
2414 exit:
2415 fclose(f);
2416 }
2417
2418 void qmp_inject_nmi(Error **errp)
2419 {
2420 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2421 }
2422
2423 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2424 {
2425 if (!use_icount) {
2426 return;
2427 }
2428
2429 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
2430 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2431 if (icount_align_option) {
2432 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
2433 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
2434 } else {
2435 cpu_fprintf(f, "Max guest delay NA\n");
2436 cpu_fprintf(f, "Max guest advance NA\n");
2437 }
2438 }