1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "qemu/osdep.h"
26 #include "qemu/config-file.h"
27 #include "cpu.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "qemu/qemu-print.h"
35 #include "sysemu/sysemu.h"
36 #include "sysemu/tcg.h"
37 #include "sysemu/block-backend.h"
38 #include "exec/gdbstub.h"
39 #include "sysemu/dma.h"
40 #include "sysemu/hw_accel.h"
41 #include "sysemu/kvm.h"
42 #include "sysemu/hax.h"
43 #include "sysemu/hvf.h"
44 #include "sysemu/whpx.h"
45 #include "exec/exec-all.h"
46
47 #include "qemu/thread.h"
48 #include "sysemu/cpus.h"
49 #include "sysemu/qtest.h"
50 #include "qemu/main-loop.h"
51 #include "qemu/option.h"
52 #include "qemu/bitmap.h"
53 #include "qemu/seqlock.h"
54 #include "qemu/guest-random.h"
55 #include "tcg.h"
56 #include "hw/nmi.h"
57 #include "sysemu/replay.h"
58 #include "hw/boards.h"
59
60 #ifdef CONFIG_LINUX
61
62 #include <sys/prctl.h>
63
64 #ifndef PR_MCE_KILL
65 #define PR_MCE_KILL 33
66 #endif
67
68 #ifndef PR_MCE_KILL_SET
69 #define PR_MCE_KILL_SET 1
70 #endif
71
72 #ifndef PR_MCE_KILL_EARLY
73 #define PR_MCE_KILL_EARLY 1
74 #endif
75
76 #endif /* CONFIG_LINUX */
77
78 int64_t max_delay;
79 int64_t max_advance;
80
81 /* vcpu throttling controls */
82 static QEMUTimer *throttle_timer;
83 static unsigned int throttle_percentage;
84
85 #define CPU_THROTTLE_PCT_MIN 1
86 #define CPU_THROTTLE_PCT_MAX 99
87 #define CPU_THROTTLE_TIMESLICE_NS 10000000
88
89 bool cpu_is_stopped(CPUState *cpu)
90 {
91 return cpu->stopped || !runstate_is_running();
92 }
93
94 static bool cpu_thread_is_idle(CPUState *cpu)
95 {
96 if (cpu->stop || cpu->queued_work_first) {
97 return false;
98 }
99 if (cpu_is_stopped(cpu)) {
100 return true;
101 }
102 if (!cpu->halted || cpu_has_work(cpu) ||
103 kvm_halt_in_kernel()) {
104 return false;
105 }
106 return true;
107 }
108
109 static bool all_cpu_threads_idle(void)
110 {
111 CPUState *cpu;
112
113 CPU_FOREACH(cpu) {
114 if (!cpu_thread_is_idle(cpu)) {
115 return false;
116 }
117 }
118 return true;
119 }
120
121 /***********************************************************/
122 /* guest cycle counter */
123
124 /* Protected by TimersState seqlock */
125
126 static bool icount_sleep = true;
127 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
128 #define MAX_ICOUNT_SHIFT 10
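/* A shift of 10 means 2^10 ns per instruction, i.e. roughly 1 MIPS. */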
129
130 typedef struct TimersState {
131 /* Protected by BQL. */
132 int64_t cpu_ticks_prev;
133 int64_t cpu_ticks_offset;
134
135     /* Protect fields that can be read outside the
136      * BQL and written from multiple threads.
137      */
138 QemuSeqLock vm_clock_seqlock;
139 QemuSpin vm_clock_lock;
140
141 int16_t cpu_ticks_enabled;
142
143 /* Conversion factor from emulated instructions to virtual clock ticks. */
144 int16_t icount_time_shift;
145
146 /* Compensate for varying guest execution speed. */
147 int64_t qemu_icount_bias;
148
149 int64_t vm_clock_warp_start;
150 int64_t cpu_clock_offset;
151
152 /* Only written by TCG thread */
153 int64_t qemu_icount;
154
155 /* for adjusting icount */
156 QEMUTimer *icount_rt_timer;
157 QEMUTimer *icount_vm_timer;
158 QEMUTimer *icount_warp_timer;
159 } TimersState;
160
161 static TimersState timers_state;
162 bool mttcg_enabled;
163
164 /*
165 * We default to false if we know other options have been enabled
166  * which are currently incompatible with MTTCG. Once each
167  * guest (target) has been updated to support:
168  *   - atomic instructions
169  *   - memory ordering primitives (barriers)
170  * it can set the appropriate CONFIG flags in ${target}-softmmu.mak
171 *
172 * Once a guest architecture has been converted to the new primitives
173 * there are two remaining limitations to check.
174 *
175 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
176 * - The host must have a stronger memory order than the guest
177 *
178 * It may be possible in future to support strong guests on weak hosts
179 * but that will require tagging all load/stores in a guest with their
180 * implicit memory order requirements which would likely slow things
181 * down a lot.
182 */
183
184 static bool check_tcg_memory_orders_compatible(void)
185 {
186 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
187 return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
188 #else
189 return false;
190 #endif
191 }
192
193 static bool default_mttcg_enabled(void)
194 {
195 if (use_icount || TCG_OVERSIZED_GUEST) {
196 return false;
197 } else {
198 #ifdef TARGET_SUPPORTS_MTTCG
199 return check_tcg_memory_orders_compatible();
200 #else
201 return false;
202 #endif
203 }
204 }
205
206 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
207 {
208 const char *t = qemu_opt_get(opts, "thread");
209 if (t) {
210 if (strcmp(t, "multi") == 0) {
211 if (TCG_OVERSIZED_GUEST) {
212                 error_setg(errp, "No MTTCG when guest word size > host's");
213 } else if (use_icount) {
214 error_setg(errp, "No MTTCG when icount is enabled");
215 } else {
216 #ifndef TARGET_SUPPORTS_MTTCG
217 warn_report("Guest not yet converted to MTTCG - "
218 "you may get unexpected results");
219 #endif
220 if (!check_tcg_memory_orders_compatible()) {
221 warn_report("Guest expects a stronger memory ordering "
222 "than the host provides");
223 error_printf("This may cause strange/hard to debug errors\n");
224 }
225 mttcg_enabled = true;
226 }
227 } else if (strcmp(t, "single") == 0) {
228 mttcg_enabled = false;
229 } else {
230 error_setg(errp, "Invalid 'thread' setting %s", t);
231 }
232 } else {
233 mttcg_enabled = default_mttcg_enabled();
234 }
235 }
236
237 /* The current number of executed instructions is based on what we
238 * originally budgeted minus the current state of the decrementing
239 * icount counters in extra/u16.low.
240 */
241 static int64_t cpu_get_icount_executed(CPUState *cpu)
242 {
243 return (cpu->icount_budget -
244 (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
245 }
246
247 /*
248 * Update the global shared timer_state.qemu_icount to take into
249 * account executed instructions. This is done by the TCG vCPU
250 * thread so the main-loop can see time has moved forward.
251 */
252 static void cpu_update_icount_locked(CPUState *cpu)
253 {
254 int64_t executed = cpu_get_icount_executed(cpu);
255 cpu->icount_budget -= executed;
256
257 atomic_set_i64(&timers_state.qemu_icount,
258 timers_state.qemu_icount + executed);
259 }
260
261 /*
262 * Update the global shared timer_state.qemu_icount to take into
263 * account executed instructions. This is done by the TCG vCPU
264 * thread so the main-loop can see time has moved forward.
265 */
266 void cpu_update_icount(CPUState *cpu)
267 {
268 seqlock_write_lock(&timers_state.vm_clock_seqlock,
269 &timers_state.vm_clock_lock);
270 cpu_update_icount_locked(cpu);
271 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
272 &timers_state.vm_clock_lock);
273 }
274
275 static int64_t cpu_get_icount_raw_locked(void)
276 {
277 CPUState *cpu = current_cpu;
278
279 if (cpu && cpu->running) {
280 if (!cpu->can_do_io) {
281 error_report("Bad icount read");
282 exit(1);
283 }
284 /* Take into account what has run */
285 cpu_update_icount_locked(cpu);
286 }
287 /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
288 return atomic_read_i64(&timers_state.qemu_icount);
289 }
290
291 static int64_t cpu_get_icount_locked(void)
292 {
293 int64_t icount = cpu_get_icount_raw_locked();
294 return atomic_read_i64(&timers_state.qemu_icount_bias) +
295 cpu_icount_to_ns(icount);
296 }
297
298 int64_t cpu_get_icount_raw(void)
299 {
300 int64_t icount;
301 unsigned start;
302
303 do {
304 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
305 icount = cpu_get_icount_raw_locked();
306 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
307
308 return icount;
309 }
310
311 /* Return the virtual CPU time, based on the instruction counter. */
312 int64_t cpu_get_icount(void)
313 {
314 int64_t icount;
315 unsigned start;
316
317 do {
318 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
319 icount = cpu_get_icount_locked();
320 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
321
322 return icount;
323 }
324
325 int64_t cpu_icount_to_ns(int64_t icount)
326 {
327 return icount << atomic_read(&timers_state.icount_time_shift);
328 }
329
330 static int64_t cpu_get_ticks_locked(void)
331 {
332 int64_t ticks = timers_state.cpu_ticks_offset;
333 if (timers_state.cpu_ticks_enabled) {
334 ticks += cpu_get_host_ticks();
335 }
336
337 if (timers_state.cpu_ticks_prev > ticks) {
338         /* Non-increasing ticks may happen if the host uses software suspend. */
339 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
340 ticks = timers_state.cpu_ticks_prev;
341 }
342
343 timers_state.cpu_ticks_prev = ticks;
344 return ticks;
345 }
346
347 /* return the time elapsed in VM between vm_start and vm_stop. Unless
348 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
349 * counter.
350 */
351 int64_t cpu_get_ticks(void)
352 {
353 int64_t ticks;
354
355 if (use_icount) {
356 return cpu_get_icount();
357 }
358
359 qemu_spin_lock(&timers_state.vm_clock_lock);
360 ticks = cpu_get_ticks_locked();
361 qemu_spin_unlock(&timers_state.vm_clock_lock);
362 return ticks;
363 }
364
365 static int64_t cpu_get_clock_locked(void)
366 {
367 int64_t time;
368
369 time = timers_state.cpu_clock_offset;
370 if (timers_state.cpu_ticks_enabled) {
371 time += get_clock();
372 }
373
374 return time;
375 }
376
377 /* Return the monotonic time elapsed in VM, i.e.,
378 * the time between vm_start and vm_stop
379 */
380 int64_t cpu_get_clock(void)
381 {
382 int64_t ti;
383 unsigned start;
384
385 do {
386 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
387 ti = cpu_get_clock_locked();
388 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
389
390 return ti;
391 }
392
393 /* enable cpu_get_ticks()
394 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
395 */
396 void cpu_enable_ticks(void)
397 {
398 seqlock_write_lock(&timers_state.vm_clock_seqlock,
399 &timers_state.vm_clock_lock);
400 if (!timers_state.cpu_ticks_enabled) {
401 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
402 timers_state.cpu_clock_offset -= get_clock();
403 timers_state.cpu_ticks_enabled = 1;
404 }
405 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
406 &timers_state.vm_clock_lock);
407 }
408
409 /* disable cpu_get_ticks(): the clock is stopped. You must not call
410  * cpu_get_ticks() after this.
411 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
412 */
413 void cpu_disable_ticks(void)
414 {
415 seqlock_write_lock(&timers_state.vm_clock_seqlock,
416 &timers_state.vm_clock_lock);
417 if (timers_state.cpu_ticks_enabled) {
418 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
419 timers_state.cpu_clock_offset = cpu_get_clock_locked();
420 timers_state.cpu_ticks_enabled = 0;
421 }
422 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
423 &timers_state.vm_clock_lock);
424 }
425
426 /* Correlation between real and virtual time is always going to be
427 fairly approximate, so ignore small variation.
428 When the guest is idle real and virtual time will be aligned in
429 the IO wait loop. */
430 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
431
432 static void icount_adjust(void)
433 {
434 int64_t cur_time;
435 int64_t cur_icount;
436 int64_t delta;
437
438 /* Protected by TimersState mutex. */
439 static int64_t last_delta;
440
441 /* If the VM is not running, then do nothing. */
442 if (!runstate_is_running()) {
443 return;
444 }
445
446 seqlock_write_lock(&timers_state.vm_clock_seqlock,
447 &timers_state.vm_clock_lock);
448 cur_time = cpu_get_clock_locked();
449 cur_icount = cpu_get_icount_locked();
450
451 delta = cur_icount - cur_time;
452 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
453 if (delta > 0
454 && last_delta + ICOUNT_WOBBLE < delta * 2
455 && timers_state.icount_time_shift > 0) {
456 /* The guest is getting too far ahead. Slow time down. */
457 atomic_set(&timers_state.icount_time_shift,
458 timers_state.icount_time_shift - 1);
459 }
460 if (delta < 0
461 && last_delta - ICOUNT_WOBBLE > delta * 2
462 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
463 /* The guest is getting too far behind. Speed time up. */
464 atomic_set(&timers_state.icount_time_shift,
465 timers_state.icount_time_shift + 1);
466 }
467 last_delta = delta;
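    /* Recompute the bias so that cpu_get_icount_locked(), which returns
     * qemu_icount_bias + (qemu_icount << icount_time_shift), stays
     * continuous across the change of icount_time_shift above. */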
468 atomic_set_i64(&timers_state.qemu_icount_bias,
469 cur_icount - (timers_state.qemu_icount
470 << timers_state.icount_time_shift));
471 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
472 &timers_state.vm_clock_lock);
473 }
474
475 static void icount_adjust_rt(void *opaque)
476 {
477 timer_mod(timers_state.icount_rt_timer,
478 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
479 icount_adjust();
480 }
481
482 static void icount_adjust_vm(void *opaque)
483 {
484 timer_mod(timers_state.icount_vm_timer,
485 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
486 NANOSECONDS_PER_SECOND / 10);
487 icount_adjust();
488 }
489
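/* Convert a QEMU_CLOCK_VIRTUAL deadline in nanoseconds into an
 * instruction budget, rounding up so the budget fully covers the
 * deadline (the inverse of cpu_icount_to_ns()). */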
490 static int64_t qemu_icount_round(int64_t count)
491 {
492 int shift = atomic_read(&timers_state.icount_time_shift);
493 return (count + (1 << shift) - 1) >> shift;
494 }
495
496 static void icount_warp_rt(void)
497 {
498 unsigned seq;
499 int64_t warp_start;
500
501 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
502 * changes from -1 to another value, so the race here is okay.
503 */
504 do {
505 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
506 warp_start = timers_state.vm_clock_warp_start;
507 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
508
509 if (warp_start == -1) {
510 return;
511 }
512
513 seqlock_write_lock(&timers_state.vm_clock_seqlock,
514 &timers_state.vm_clock_lock);
515 if (runstate_is_running()) {
516 int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
517 cpu_get_clock_locked());
518 int64_t warp_delta;
519
520 warp_delta = clock - timers_state.vm_clock_warp_start;
521 if (use_icount == 2) {
522 /*
523 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
524 * far ahead of real time.
525 */
526 int64_t cur_icount = cpu_get_icount_locked();
527 int64_t delta = clock - cur_icount;
528 warp_delta = MIN(warp_delta, delta);
529 }
530 atomic_set_i64(&timers_state.qemu_icount_bias,
531 timers_state.qemu_icount_bias + warp_delta);
532 }
533 timers_state.vm_clock_warp_start = -1;
534 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
535 &timers_state.vm_clock_lock);
536
537 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
538 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
539 }
540 }
541
542 static void icount_timer_cb(void *opaque)
543 {
544 /* No need for a checkpoint because the timer already synchronizes
545 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
546 */
547 icount_warp_rt();
548 }
549
550 void qtest_clock_warp(int64_t dest)
551 {
552 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
553 AioContext *aio_context;
554 assert(qtest_enabled());
555 aio_context = qemu_get_aio_context();
556 while (clock < dest) {
557 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
558 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
559
560 seqlock_write_lock(&timers_state.vm_clock_seqlock,
561 &timers_state.vm_clock_lock);
562 atomic_set_i64(&timers_state.qemu_icount_bias,
563 timers_state.qemu_icount_bias + warp);
564 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
565 &timers_state.vm_clock_lock);
566
567 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
568 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
569 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
570 }
571 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
572 }
573
574 void qemu_start_warp_timer(void)
575 {
576 int64_t clock;
577 int64_t deadline;
578
579 if (!use_icount) {
580 return;
581 }
582
583 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
584 * do not fire, so computing the deadline does not make sense.
585 */
586 if (!runstate_is_running()) {
587 return;
588 }
589
590 if (replay_mode != REPLAY_MODE_PLAY) {
591 if (!all_cpu_threads_idle()) {
592 return;
593 }
594
595 if (qtest_enabled()) {
596 /* When testing, qtest commands advance icount. */
597 return;
598 }
599
600 replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
601 } else {
602 /* warp clock deterministically in record/replay mode */
603 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
604             /* vCPU is sleeping and warp can't be started.
605                It is probably a race condition: the notification sent
606                to the vCPU was processed in advance and the vCPU went to sleep.
607                Therefore we have to wake it up so it can do something. */
608 if (replay_has_checkpoint()) {
609 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
610 }
611 return;
612 }
613 }
614
615 /* We want to use the earliest deadline from ALL vm_clocks */
616 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
617 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
618 if (deadline < 0) {
619 static bool notified;
620 if (!icount_sleep && !notified) {
621 warn_report("icount sleep disabled and no active timers");
622 notified = true;
623 }
624 return;
625 }
626
627 if (deadline > 0) {
628 /*
629 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
630 * sleep. Otherwise, the CPU might be waiting for a future timer
631 * interrupt to wake it up, but the interrupt never comes because
632 * the vCPU isn't running any insns and thus doesn't advance the
633 * QEMU_CLOCK_VIRTUAL.
634 */
635 if (!icount_sleep) {
636 /*
637 * We never let VCPUs sleep in no sleep icount mode.
638 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
639 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
640 * It is useful when we want a deterministic execution time,
641 * isolated from host latencies.
642 */
643 seqlock_write_lock(&timers_state.vm_clock_seqlock,
644 &timers_state.vm_clock_lock);
645 atomic_set_i64(&timers_state.qemu_icount_bias,
646 timers_state.qemu_icount_bias + deadline);
647 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
648 &timers_state.vm_clock_lock);
649 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
650 } else {
651 /*
652 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
653              * "real" time (related to the time left until the next event) has
654              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
655              * This prevents the warps from being visible externally; for example,
656 * you will not be sending network packets continuously instead of
657 * every 100ms.
658 */
659 seqlock_write_lock(&timers_state.vm_clock_seqlock,
660 &timers_state.vm_clock_lock);
661 if (timers_state.vm_clock_warp_start == -1
662 || timers_state.vm_clock_warp_start > clock) {
663 timers_state.vm_clock_warp_start = clock;
664 }
665 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
666 &timers_state.vm_clock_lock);
667 timer_mod_anticipate(timers_state.icount_warp_timer,
668 clock + deadline);
669 }
670 } else if (deadline == 0) {
671 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
672 }
673 }
674
675 static void qemu_account_warp_timer(void)
676 {
677 if (!use_icount || !icount_sleep) {
678 return;
679 }
680
681 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
682 * do not fire, so computing the deadline does not make sense.
683 */
684 if (!runstate_is_running()) {
685 return;
686 }
687
688 /* warp clock deterministically in record/replay mode */
689 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
690 return;
691 }
692
693 timer_del(timers_state.icount_warp_timer);
694 icount_warp_rt();
695 }
696
697 static bool icount_state_needed(void *opaque)
698 {
699 return use_icount;
700 }
701
702 static bool warp_timer_state_needed(void *opaque)
703 {
704 TimersState *s = opaque;
705 return s->icount_warp_timer != NULL;
706 }
707
708 static bool adjust_timers_state_needed(void *opaque)
709 {
710 TimersState *s = opaque;
711 return s->icount_rt_timer != NULL;
712 }
713
714 /*
715  * Subsection for warp timer migration is optional, because it may not be created
716 */
717 static const VMStateDescription icount_vmstate_warp_timer = {
718 .name = "timer/icount/warp_timer",
719 .version_id = 1,
720 .minimum_version_id = 1,
721 .needed = warp_timer_state_needed,
722 .fields = (VMStateField[]) {
723 VMSTATE_INT64(vm_clock_warp_start, TimersState),
724 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
725 VMSTATE_END_OF_LIST()
726 }
727 };
728
729 static const VMStateDescription icount_vmstate_adjust_timers = {
730 .name = "timer/icount/timers",
731 .version_id = 1,
732 .minimum_version_id = 1,
733 .needed = adjust_timers_state_needed,
734 .fields = (VMStateField[]) {
735 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
736 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
737 VMSTATE_END_OF_LIST()
738 }
739 };
740
741 /*
742 * This is a subsection for icount migration.
743 */
744 static const VMStateDescription icount_vmstate_timers = {
745 .name = "timer/icount",
746 .version_id = 1,
747 .minimum_version_id = 1,
748 .needed = icount_state_needed,
749 .fields = (VMStateField[]) {
750 VMSTATE_INT64(qemu_icount_bias, TimersState),
751 VMSTATE_INT64(qemu_icount, TimersState),
752 VMSTATE_END_OF_LIST()
753 },
754 .subsections = (const VMStateDescription*[]) {
755 &icount_vmstate_warp_timer,
756 &icount_vmstate_adjust_timers,
757 NULL
758 }
759 };
760
761 static const VMStateDescription vmstate_timers = {
762 .name = "timer",
763 .version_id = 2,
764 .minimum_version_id = 1,
765 .fields = (VMStateField[]) {
766 VMSTATE_INT64(cpu_ticks_offset, TimersState),
767 VMSTATE_UNUSED(8),
768 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
769 VMSTATE_END_OF_LIST()
770 },
771 .subsections = (const VMStateDescription*[]) {
772 &icount_vmstate_timers,
773 NULL
774 }
775 };
776
777 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
778 {
779 double pct;
780 double throttle_ratio;
781 long sleeptime_ns;
782
783 if (!cpu_throttle_get_percentage()) {
784 return;
785 }
786
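    /* Sleep long enough that, over one throttle period, the vCPU spends
     * the requested percentage of time asleep: with a run time of
     * CPU_THROTTLE_TIMESLICE_NS, a sleep of pct / (1 - pct) times that
     * gives sleep / (sleep + run) == pct. */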
787 pct = (double)cpu_throttle_get_percentage()/100;
788 throttle_ratio = pct / (1 - pct);
789 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
790
791 qemu_mutex_unlock_iothread();
792 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
793 qemu_mutex_lock_iothread();
794 atomic_set(&cpu->throttle_thread_scheduled, 0);
795 }
796
797 static void cpu_throttle_timer_tick(void *opaque)
798 {
799 CPUState *cpu;
800 double pct;
801
802 /* Stop the timer if needed */
803 if (!cpu_throttle_get_percentage()) {
804 return;
805 }
806 CPU_FOREACH(cpu) {
807 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
808 async_run_on_cpu(cpu, cpu_throttle_thread,
809 RUN_ON_CPU_NULL);
810 }
811 }
812
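    /* Rearm for one full run+sleep period: the guest runs for
     * CPU_THROTTLE_TIMESLICE_NS and then sleeps for pct of the period,
     * so the next tick is CPU_THROTTLE_TIMESLICE_NS / (1 - pct) away. */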
813 pct = (double)cpu_throttle_get_percentage()/100;
814 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
815 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
816 }
817
818 void cpu_throttle_set(int new_throttle_pct)
819 {
820 /* Ensure throttle percentage is within valid range */
821 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
822 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
823
824 atomic_set(&throttle_percentage, new_throttle_pct);
825
826 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
827 CPU_THROTTLE_TIMESLICE_NS);
828 }
829
830 void cpu_throttle_stop(void)
831 {
832 atomic_set(&throttle_percentage, 0);
833 }
834
835 bool cpu_throttle_active(void)
836 {
837 return (cpu_throttle_get_percentage() != 0);
838 }
839
840 int cpu_throttle_get_percentage(void)
841 {
842 return atomic_read(&throttle_percentage);
843 }
844
845 void cpu_ticks_init(void)
846 {
847 seqlock_init(&timers_state.vm_clock_seqlock);
848 qemu_spin_init(&timers_state.vm_clock_lock);
849 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
850 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
851 cpu_throttle_timer_tick, NULL);
852 }
853
854 void configure_icount(QemuOpts *opts, Error **errp)
855 {
856 const char *option;
857 char *rem_str = NULL;
858
859 option = qemu_opt_get(opts, "shift");
860 if (!option) {
861 if (qemu_opt_get(opts, "align") != NULL) {
862 error_setg(errp, "Please specify shift option when using align");
863 }
864 return;
865 }
866
867 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
868 if (icount_sleep) {
869 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
870 icount_timer_cb, NULL);
871 }
872
873 icount_align_option = qemu_opt_get_bool(opts, "align", false);
874
875 if (icount_align_option && !icount_sleep) {
876 error_setg(errp, "align=on and sleep=off are incompatible");
877 }
878 if (strcmp(option, "auto") != 0) {
879 errno = 0;
880 timers_state.icount_time_shift = strtol(option, &rem_str, 0);
881 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
882 error_setg(errp, "icount: Invalid shift value");
883 }
884 use_icount = 1;
885 return;
886 } else if (icount_align_option) {
887 error_setg(errp, "shift=auto and align=on are incompatible");
888 } else if (!icount_sleep) {
889 error_setg(errp, "shift=auto and sleep=off are incompatible");
890 }
891
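    /* shift=auto: use_icount == 2 selects adaptive mode, in which
     * icount_time_shift is retuned at runtime by icount_adjust(). */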
892 use_icount = 2;
893
894 /* 125MIPS seems a reasonable initial guess at the guest speed.
895 It will be corrected fairly quickly anyway. */
896 timers_state.icount_time_shift = 3;
897
898 /* Have both realtime and virtual time triggers for speed adjustment.
899 The realtime trigger catches emulated time passing too slowly,
900 the virtual time trigger catches emulated time passing too fast.
901 Realtime triggers occur even when idle, so use them less frequently
902 than VM triggers. */
903 timers_state.vm_clock_warp_start = -1;
904 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
905 icount_adjust_rt, NULL);
906 timer_mod(timers_state.icount_rt_timer,
907 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
908 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
909 icount_adjust_vm, NULL);
910 timer_mod(timers_state.icount_vm_timer,
911 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
912 NANOSECONDS_PER_SECOND / 10);
913 }
914
915 /***********************************************************/
916 /* TCG vCPU kick timer
917 *
918  * The kick timer is responsible for moving single-threaded vCPU
919  * emulation on to the next vCPU. If more than one vCPU is running, a
920  * timer event will force a cpu->exit so the next vCPU can get
921  * scheduled.
922  *
923  * The timer is removed while all vCPUs are idle and restarted once
924  * any of them has work to do again.
925 */
926
927 static QEMUTimer *tcg_kick_vcpu_timer;
928 static CPUState *tcg_current_rr_cpu;
929
930 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
931
932 static inline int64_t qemu_tcg_next_kick(void)
933 {
934 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
935 }
936
937 /* Kick the currently round-robin scheduled vCPU */
938 static void qemu_cpu_kick_rr_cpu(void)
939 {
940 CPUState *cpu;
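    /* Loop until the kick lands on the vCPU that is actually current:
     * the round-robin thread may switch tcg_current_rr_cpu between our
     * read and the cpu_exit() call. */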
941 do {
942 cpu = atomic_mb_read(&tcg_current_rr_cpu);
943 if (cpu) {
944 cpu_exit(cpu);
945 }
946 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
947 }
948
949 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
950 {
951 }
952
953 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
954 {
955 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
956 qemu_notify_event();
957 return;
958 }
959
960 if (qemu_in_vcpu_thread()) {
961 /* A CPU is currently running; kick it back out to the
962 * tcg_cpu_exec() loop so it will recalculate its
963 * icount deadline immediately.
964 */
965 qemu_cpu_kick(current_cpu);
966 } else if (first_cpu) {
967 /* qemu_cpu_kick is not enough to kick a halted CPU out of
968 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
969 * causes cpu_thread_is_idle to return false. This way,
970 * handle_icount_deadline can run.
971 * If we have no CPUs at all for some reason, we don't
972 * need to do anything.
973 */
974 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
975 }
976 }
977
978 static void kick_tcg_thread(void *opaque)
979 {
980 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
981 qemu_cpu_kick_rr_cpu();
982 }
983
984 static void start_tcg_kick_timer(void)
985 {
986 assert(!mttcg_enabled);
987 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
988 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
989 kick_tcg_thread, NULL);
990 }
991 if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
992 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
993 }
994 }
995
996 static void stop_tcg_kick_timer(void)
997 {
998 assert(!mttcg_enabled);
999 if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
1000 timer_del(tcg_kick_vcpu_timer);
1001 }
1002 }
1003
1004 /***********************************************************/
1005 void hw_error(const char *fmt, ...)
1006 {
1007 va_list ap;
1008 CPUState *cpu;
1009
1010 va_start(ap, fmt);
1011 fprintf(stderr, "qemu: hardware error: ");
1012 vfprintf(stderr, fmt, ap);
1013 fprintf(stderr, "\n");
1014 CPU_FOREACH(cpu) {
1015 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1016 cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1017 }
1018 va_end(ap);
1019 abort();
1020 }
1021
1022 void cpu_synchronize_all_states(void)
1023 {
1024 CPUState *cpu;
1025
1026 CPU_FOREACH(cpu) {
1027 cpu_synchronize_state(cpu);
1028 /* TODO: move to cpu_synchronize_state() */
1029 if (hvf_enabled()) {
1030 hvf_cpu_synchronize_state(cpu);
1031 }
1032 }
1033 }
1034
1035 void cpu_synchronize_all_post_reset(void)
1036 {
1037 CPUState *cpu;
1038
1039 CPU_FOREACH(cpu) {
1040 cpu_synchronize_post_reset(cpu);
1041 /* TODO: move to cpu_synchronize_post_reset() */
1042 if (hvf_enabled()) {
1043 hvf_cpu_synchronize_post_reset(cpu);
1044 }
1045 }
1046 }
1047
1048 void cpu_synchronize_all_post_init(void)
1049 {
1050 CPUState *cpu;
1051
1052 CPU_FOREACH(cpu) {
1053 cpu_synchronize_post_init(cpu);
1054 /* TODO: move to cpu_synchronize_post_init() */
1055 if (hvf_enabled()) {
1056 hvf_cpu_synchronize_post_init(cpu);
1057 }
1058 }
1059 }
1060
1061 void cpu_synchronize_all_pre_loadvm(void)
1062 {
1063 CPUState *cpu;
1064
1065 CPU_FOREACH(cpu) {
1066 cpu_synchronize_pre_loadvm(cpu);
1067 }
1068 }
1069
1070 static int do_vm_stop(RunState state, bool send_stop)
1071 {
1072 int ret = 0;
1073
1074 if (runstate_is_running()) {
1075 cpu_disable_ticks();
1076 pause_all_vcpus();
1077 runstate_set(state);
1078 vm_state_notify(0, state);
1079 if (send_stop) {
1080 qapi_event_send_stop();
1081 }
1082 }
1083
1084 bdrv_drain_all();
1085 replay_disable_events();
1086 ret = bdrv_flush_all();
1087
1088 return ret;
1089 }
1090
1091 /* Special vm_stop() variant for terminating the process. Historically clients
1092 * did not expect a QMP STOP event and so we need to retain compatibility.
1093 */
1094 int vm_shutdown(void)
1095 {
1096 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1097 }
1098
1099 static bool cpu_can_run(CPUState *cpu)
1100 {
1101 if (cpu->stop) {
1102 return false;
1103 }
1104 if (cpu_is_stopped(cpu)) {
1105 return false;
1106 }
1107 return true;
1108 }
1109
1110 static void cpu_handle_guest_debug(CPUState *cpu)
1111 {
1112 gdb_set_stop_cpu(cpu);
1113 qemu_system_debug_request();
1114 cpu->stopped = true;
1115 }
1116
1117 #ifdef CONFIG_LINUX
1118 static void sigbus_reraise(void)
1119 {
1120 sigset_t set;
1121 struct sigaction action;
1122
1123 memset(&action, 0, sizeof(action));
1124 action.sa_handler = SIG_DFL;
1125 if (!sigaction(SIGBUS, &action, NULL)) {
1126 raise(SIGBUS);
1127 sigemptyset(&set);
1128 sigaddset(&set, SIGBUS);
1129 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1130 }
1131 perror("Failed to re-raise SIGBUS!\n");
1132 abort();
1133 }
1134
1135 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1136 {
1137 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1138 sigbus_reraise();
1139 }
1140
1141 if (current_cpu) {
1142 /* Called asynchronously in VCPU thread. */
1143 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1144 sigbus_reraise();
1145 }
1146 } else {
1147 /* Called synchronously (via signalfd) in main thread. */
1148 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1149 sigbus_reraise();
1150 }
1151 }
1152 }
1153
1154 static void qemu_init_sigbus(void)
1155 {
1156 struct sigaction action;
1157
1158 memset(&action, 0, sizeof(action));
1159 action.sa_flags = SA_SIGINFO;
1160 action.sa_sigaction = sigbus_handler;
1161 sigaction(SIGBUS, &action, NULL);
1162
1163 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1164 }
1165 #else /* !CONFIG_LINUX */
1166 static void qemu_init_sigbus(void)
1167 {
1168 }
1169 #endif /* !CONFIG_LINUX */
1170
1171 static QemuMutex qemu_global_mutex;
1172
1173 static QemuThread io_thread;
1174
1175 /* cpu creation */
1176 static QemuCond qemu_cpu_cond;
1177 /* system init */
1178 static QemuCond qemu_pause_cond;
1179
1180 void qemu_init_cpu_loop(void)
1181 {
1182 qemu_init_sigbus();
1183 qemu_cond_init(&qemu_cpu_cond);
1184 qemu_cond_init(&qemu_pause_cond);
1185 qemu_mutex_init(&qemu_global_mutex);
1186
1187 qemu_thread_get_self(&io_thread);
1188 }
1189
1190 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1191 {
1192 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1193 }
1194
1195 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1196 {
1197 if (kvm_destroy_vcpu(cpu) < 0) {
1198 error_report("kvm_destroy_vcpu failed");
1199 exit(EXIT_FAILURE);
1200 }
1201 }
1202
1203 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1204 {
1205 }
1206
1207 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1208 {
1209 g_assert(qemu_cpu_is_self(cpu));
1210 cpu->stop = false;
1211 cpu->stopped = true;
1212 if (exit) {
1213 cpu_exit(cpu);
1214 }
1215 qemu_cond_broadcast(&qemu_pause_cond);
1216 }
1217
1218 static void qemu_wait_io_event_common(CPUState *cpu)
1219 {
1220 atomic_mb_set(&cpu->thread_kicked, false);
1221 if (cpu->stop) {
1222 qemu_cpu_stop(cpu, false);
1223 }
1224 process_queued_cpu_work(cpu);
1225 }
1226
1227 static void qemu_tcg_rr_wait_io_event(void)
1228 {
1229 CPUState *cpu;
1230
1231 while (all_cpu_threads_idle()) {
1232 stop_tcg_kick_timer();
1233 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1234 }
1235
1236 start_tcg_kick_timer();
1237
1238 CPU_FOREACH(cpu) {
1239 qemu_wait_io_event_common(cpu);
1240 }
1241 }
1242
1243 static void qemu_wait_io_event(CPUState *cpu)
1244 {
1245 while (cpu_thread_is_idle(cpu)) {
1246 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1247 }
1248
1249 #ifdef _WIN32
1250 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1251 if (!tcg_enabled()) {
1252 SleepEx(0, TRUE);
1253 }
1254 #endif
1255 qemu_wait_io_event_common(cpu);
1256 }
1257
1258 static void *qemu_kvm_cpu_thread_fn(void *arg)
1259 {
1260 CPUState *cpu = arg;
1261 int r;
1262
1263 rcu_register_thread();
1264
1265 qemu_mutex_lock_iothread();
1266 qemu_thread_get_self(cpu->thread);
1267 cpu->thread_id = qemu_get_thread_id();
1268 cpu->can_do_io = 1;
1269 current_cpu = cpu;
1270
1271 r = kvm_init_vcpu(cpu);
1272 if (r < 0) {
1273 error_report("kvm_init_vcpu failed: %s", strerror(-r));
1274 exit(1);
1275 }
1276
1277 kvm_init_cpu_signals(cpu);
1278
1279 /* signal CPU creation */
1280 cpu->created = true;
1281 qemu_cond_signal(&qemu_cpu_cond);
1282 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1283
1284 do {
1285 if (cpu_can_run(cpu)) {
1286 r = kvm_cpu_exec(cpu);
1287 if (r == EXCP_DEBUG) {
1288 cpu_handle_guest_debug(cpu);
1289 }
1290 }
1291 qemu_wait_io_event(cpu);
1292 } while (!cpu->unplug || cpu_can_run(cpu));
1293
1294 qemu_kvm_destroy_vcpu(cpu);
1295 cpu->created = false;
1296 qemu_cond_signal(&qemu_cpu_cond);
1297 qemu_mutex_unlock_iothread();
1298 rcu_unregister_thread();
1299 return NULL;
1300 }
1301
1302 static void *qemu_dummy_cpu_thread_fn(void *arg)
1303 {
1304 #ifdef _WIN32
1305 error_report("qtest is not supported under Windows");
1306 exit(1);
1307 #else
1308 CPUState *cpu = arg;
1309 sigset_t waitset;
1310 int r;
1311
1312 rcu_register_thread();
1313
1314 qemu_mutex_lock_iothread();
1315 qemu_thread_get_self(cpu->thread);
1316 cpu->thread_id = qemu_get_thread_id();
1317 cpu->can_do_io = 1;
1318 current_cpu = cpu;
1319
1320 sigemptyset(&waitset);
1321 sigaddset(&waitset, SIG_IPI);
1322
1323 /* signal CPU creation */
1324 cpu->created = true;
1325 qemu_cond_signal(&qemu_cpu_cond);
1326 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1327
1328 do {
1329 qemu_mutex_unlock_iothread();
1330 do {
1331 int sig;
1332 r = sigwait(&waitset, &sig);
1333 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1334 if (r == -1) {
1335 perror("sigwait");
1336 exit(1);
1337 }
1338 qemu_mutex_lock_iothread();
1339 qemu_wait_io_event(cpu);
1340 } while (!cpu->unplug);
1341
1342 qemu_mutex_unlock_iothread();
1343 rcu_unregister_thread();
1344 return NULL;
1345 #endif
1346 }
1347
1348 static int64_t tcg_get_icount_limit(void)
1349 {
1350 int64_t deadline;
1351
1352 if (replay_mode != REPLAY_MODE_PLAY) {
1353 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1354
1355 /* Maintain prior (possibly buggy) behaviour where if no deadline
1356 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1357 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1358 * nanoseconds.
1359 */
1360 if ((deadline < 0) || (deadline > INT32_MAX)) {
1361 deadline = INT32_MAX;
1362 }
1363
1364 return qemu_icount_round(deadline);
1365 } else {
1366 return replay_get_instructions();
1367 }
1368 }
1369
1370 static void handle_icount_deadline(void)
1371 {
1372 assert(qemu_in_vcpu_thread());
1373 if (use_icount) {
1374 int64_t deadline =
1375 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1376
1377 if (deadline == 0) {
1378 /* Wake up other AioContexts. */
1379 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1380 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1381 }
1382 }
1383 }
1384
1385 static void prepare_icount_for_run(CPUState *cpu)
1386 {
1387 if (use_icount) {
1388 int insns_left;
1389
1390 /* These should always be cleared by process_icount_data after
1391 * each vCPU execution. However u16.high can be raised
1392 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1393 */
1394 g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1395 g_assert(cpu->icount_extra == 0);
1396
1397 cpu->icount_budget = tcg_get_icount_limit();
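        /* Split the budget: at most 0xffff instructions go into the 16-bit
         * decrementer, the remainder is banked in icount_extra. */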
1398 insns_left = MIN(0xffff, cpu->icount_budget);
1399 cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1400 cpu->icount_extra = cpu->icount_budget - insns_left;
1401
1402 replay_mutex_lock();
1403 }
1404 }
1405
1406 static void process_icount_data(CPUState *cpu)
1407 {
1408 if (use_icount) {
1409 /* Account for executed instructions */
1410 cpu_update_icount(cpu);
1411
1412 /* Reset the counters */
1413 cpu_neg(cpu)->icount_decr.u16.low = 0;
1414 cpu->icount_extra = 0;
1415 cpu->icount_budget = 0;
1416
1417 replay_account_executed_instructions();
1418
1419 replay_mutex_unlock();
1420 }
1421 }
1422
1423
1424 static int tcg_cpu_exec(CPUState *cpu)
1425 {
1426 int ret;
1427 #ifdef CONFIG_PROFILER
1428 int64_t ti;
1429 #endif
1430
1431 assert(tcg_enabled());
1432 #ifdef CONFIG_PROFILER
1433 ti = profile_getclock();
1434 #endif
1435 cpu_exec_start(cpu);
1436 ret = cpu_exec(cpu);
1437 cpu_exec_end(cpu);
1438 #ifdef CONFIG_PROFILER
1439 atomic_set(&tcg_ctx->prof.cpu_exec_time,
1440 tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1441 #endif
1442 return ret;
1443 }
1444
1445 /* Destroy any remaining vCPUs which have been unplugged and have
1446 * finished running
1447 */
1448 static void deal_with_unplugged_cpus(void)
1449 {
1450 CPUState *cpu;
1451
1452 CPU_FOREACH(cpu) {
1453 if (cpu->unplug && !cpu_can_run(cpu)) {
1454 qemu_tcg_destroy_vcpu(cpu);
1455 cpu->created = false;
1456 qemu_cond_signal(&qemu_cpu_cond);
1457 break;
1458 }
1459 }
1460 }
1461
1462 /* Single-threaded TCG
1463 *
1464 * In the single-threaded case each vCPU is simulated in turn. If
1465 * there is more than a single vCPU we create a simple timer to kick
1466 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1467 * This is done explicitly rather than relying on side-effects
1468 * elsewhere.
1469 */
1470
1471 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1472 {
1473 CPUState *cpu = arg;
1474
1475 assert(tcg_enabled());
1476 rcu_register_thread();
1477 tcg_register_thread();
1478
1479 qemu_mutex_lock_iothread();
1480 qemu_thread_get_self(cpu->thread);
1481
1482 cpu->thread_id = qemu_get_thread_id();
1483 cpu->created = true;
1484 cpu->can_do_io = 1;
1485 qemu_cond_signal(&qemu_cpu_cond);
1486 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1487
1488 /* wait for initial kick-off after machine start */
1489 while (first_cpu->stopped) {
1490 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1491
1492 /* process any pending work */
1493 CPU_FOREACH(cpu) {
1494 current_cpu = cpu;
1495 qemu_wait_io_event_common(cpu);
1496 }
1497 }
1498
1499 start_tcg_kick_timer();
1500
1501 cpu = first_cpu;
1502
1503 /* process any pending work */
1504 cpu->exit_request = 1;
1505
1506 while (1) {
1507 qemu_mutex_unlock_iothread();
1508 replay_mutex_lock();
1509 qemu_mutex_lock_iothread();
1510 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1511 qemu_account_warp_timer();
1512
1513 /* Run the timers here. This is much more efficient than
1514 * waking up the I/O thread and waiting for completion.
1515 */
1516 handle_icount_deadline();
1517
1518 replay_mutex_unlock();
1519
1520 if (!cpu) {
1521 cpu = first_cpu;
1522 }
1523
1524 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1525
1526 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1527 current_cpu = cpu;
1528
1529 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1530 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1531
1532 if (cpu_can_run(cpu)) {
1533 int r;
1534
1535 qemu_mutex_unlock_iothread();
1536 prepare_icount_for_run(cpu);
1537
1538 r = tcg_cpu_exec(cpu);
1539
1540 process_icount_data(cpu);
1541 qemu_mutex_lock_iothread();
1542
1543 if (r == EXCP_DEBUG) {
1544 cpu_handle_guest_debug(cpu);
1545 break;
1546 } else if (r == EXCP_ATOMIC) {
1547 qemu_mutex_unlock_iothread();
1548 cpu_exec_step_atomic(cpu);
1549 qemu_mutex_lock_iothread();
1550 break;
1551 }
1552 } else if (cpu->stop) {
1553 if (cpu->unplug) {
1554 cpu = CPU_NEXT(cpu);
1555 }
1556 break;
1557 }
1558
1559 cpu = CPU_NEXT(cpu);
1560 } /* while (cpu && !cpu->exit_request).. */
1561
1562 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1563 atomic_set(&tcg_current_rr_cpu, NULL);
1564
1565 if (cpu && cpu->exit_request) {
1566 atomic_mb_set(&cpu->exit_request, 0);
1567 }
1568
1569 if (use_icount && all_cpu_threads_idle()) {
1570 /*
1571 * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1572 * in the main_loop, wake it up in order to start the warp timer.
1573 */
1574 qemu_notify_event();
1575 }
1576
1577 qemu_tcg_rr_wait_io_event();
1578 deal_with_unplugged_cpus();
1579 }
1580
1581 rcu_unregister_thread();
1582 return NULL;
1583 }
1584
1585 static void *qemu_hax_cpu_thread_fn(void *arg)
1586 {
1587 CPUState *cpu = arg;
1588 int r;
1589
1590 rcu_register_thread();
1591 qemu_mutex_lock_iothread();
1592 qemu_thread_get_self(cpu->thread);
1593
1594 cpu->thread_id = qemu_get_thread_id();
1595 cpu->created = true;
1596 cpu->halted = 0;
1597 current_cpu = cpu;
1598
1599 hax_init_vcpu(cpu);
1600 qemu_cond_signal(&qemu_cpu_cond);
1601 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1602
1603 do {
1604 if (cpu_can_run(cpu)) {
1605 r = hax_smp_cpu_exec(cpu);
1606 if (r == EXCP_DEBUG) {
1607 cpu_handle_guest_debug(cpu);
1608 }
1609 }
1610
1611 qemu_wait_io_event(cpu);
1612 } while (!cpu->unplug || cpu_can_run(cpu));
1613 rcu_unregister_thread();
1614 return NULL;
1615 }
1616
1617 /* The HVF-specific vCPU thread function. This one should only run when the host
1618 * CPU supports the VMX "unrestricted guest" feature. */
1619 static void *qemu_hvf_cpu_thread_fn(void *arg)
1620 {
1621 CPUState *cpu = arg;
1622
1623 int r;
1624
1625 assert(hvf_enabled());
1626
1627 rcu_register_thread();
1628
1629 qemu_mutex_lock_iothread();
1630 qemu_thread_get_self(cpu->thread);
1631
1632 cpu->thread_id = qemu_get_thread_id();
1633 cpu->can_do_io = 1;
1634 current_cpu = cpu;
1635
1636 hvf_init_vcpu(cpu);
1637
1638 /* signal CPU creation */
1639 cpu->created = true;
1640 qemu_cond_signal(&qemu_cpu_cond);
1641 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1642
1643 do {
1644 if (cpu_can_run(cpu)) {
1645 r = hvf_vcpu_exec(cpu);
1646 if (r == EXCP_DEBUG) {
1647 cpu_handle_guest_debug(cpu);
1648 }
1649 }
1650 qemu_wait_io_event(cpu);
1651 } while (!cpu->unplug || cpu_can_run(cpu));
1652
1653 hvf_vcpu_destroy(cpu);
1654 cpu->created = false;
1655 qemu_cond_signal(&qemu_cpu_cond);
1656 qemu_mutex_unlock_iothread();
1657 rcu_unregister_thread();
1658 return NULL;
1659 }
1660
1661 static void *qemu_whpx_cpu_thread_fn(void *arg)
1662 {
1663 CPUState *cpu = arg;
1664 int r;
1665
1666 rcu_register_thread();
1667
1668 qemu_mutex_lock_iothread();
1669 qemu_thread_get_self(cpu->thread);
1670 cpu->thread_id = qemu_get_thread_id();
1671 current_cpu = cpu;
1672
1673 r = whpx_init_vcpu(cpu);
1674 if (r < 0) {
1675 fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1676 exit(1);
1677 }
1678
1679 /* signal CPU creation */
1680 cpu->created = true;
1681 qemu_cond_signal(&qemu_cpu_cond);
1682 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1683
1684 do {
1685 if (cpu_can_run(cpu)) {
1686 r = whpx_vcpu_exec(cpu);
1687 if (r == EXCP_DEBUG) {
1688 cpu_handle_guest_debug(cpu);
1689 }
1690 }
1691 while (cpu_thread_is_idle(cpu)) {
1692 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1693 }
1694 qemu_wait_io_event_common(cpu);
1695 } while (!cpu->unplug || cpu_can_run(cpu));
1696
1697 whpx_destroy_vcpu(cpu);
1698 cpu->created = false;
1699 qemu_cond_signal(&qemu_cpu_cond);
1700 qemu_mutex_unlock_iothread();
1701 rcu_unregister_thread();
1702 return NULL;
1703 }
1704
1705 #ifdef _WIN32
1706 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1707 {
1708 }
1709 #endif
1710
1711 /* Multi-threaded TCG
1712 *
1713 * In the multi-threaded case each vCPU has its own thread. The TLS
1714 * variable current_cpu can be used deep in the code to find the
1715 * current CPUState for a given thread.
1716 */
1717
1718 static void *qemu_tcg_cpu_thread_fn(void *arg)
1719 {
1720 CPUState *cpu = arg;
1721
1722 assert(tcg_enabled());
1723 g_assert(!use_icount);
1724
1725 rcu_register_thread();
1726 tcg_register_thread();
1727
1728 qemu_mutex_lock_iothread();
1729 qemu_thread_get_self(cpu->thread);
1730
1731 cpu->thread_id = qemu_get_thread_id();
1732 cpu->created = true;
1733 cpu->can_do_io = 1;
1734 current_cpu = cpu;
1735 qemu_cond_signal(&qemu_cpu_cond);
1736 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1737
1738 /* process any pending work */
1739 cpu->exit_request = 1;
1740
1741 do {
1742 if (cpu_can_run(cpu)) {
1743 int r;
1744 qemu_mutex_unlock_iothread();
1745 r = tcg_cpu_exec(cpu);
1746 qemu_mutex_lock_iothread();
1747 switch (r) {
1748 case EXCP_DEBUG:
1749 cpu_handle_guest_debug(cpu);
1750 break;
1751 case EXCP_HALTED:
1752 /* during start-up the vCPU is reset and the thread is
1753 * kicked several times. If we don't ensure we go back
1754 * to sleep in the halted state we won't cleanly
1755                  * start up when the vCPU is enabled.
1756 *
1757 * cpu->halted should ensure we sleep in wait_io_event
1758 */
1759 g_assert(cpu->halted);
1760 break;
1761 case EXCP_ATOMIC:
1762 qemu_mutex_unlock_iothread();
1763 cpu_exec_step_atomic(cpu);
1764 qemu_mutex_lock_iothread();
1765 default:
1766 /* Ignore everything else? */
1767 break;
1768 }
1769 }
1770
1771 atomic_mb_set(&cpu->exit_request, 0);
1772 qemu_wait_io_event(cpu);
1773 } while (!cpu->unplug || cpu_can_run(cpu));
1774
1775 qemu_tcg_destroy_vcpu(cpu);
1776 cpu->created = false;
1777 qemu_cond_signal(&qemu_cpu_cond);
1778 qemu_mutex_unlock_iothread();
1779 rcu_unregister_thread();
1780 return NULL;
1781 }
1782
1783 static void qemu_cpu_kick_thread(CPUState *cpu)
1784 {
1785 #ifndef _WIN32
1786 int err;
1787
1788 if (cpu->thread_kicked) {
1789 return;
1790 }
1791 cpu->thread_kicked = true;
1792 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1793 if (err && err != ESRCH) {
1794 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1795 exit(1);
1796 }
1797 #else /* _WIN32 */
1798 if (!qemu_cpu_is_self(cpu)) {
1799 if (whpx_enabled()) {
1800 whpx_vcpu_kick(cpu);
1801 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1802 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1803 __func__, GetLastError());
1804 exit(1);
1805 }
1806 }
1807 #endif
1808 }
1809
1810 void qemu_cpu_kick(CPUState *cpu)
1811 {
1812 qemu_cond_broadcast(cpu->halt_cond);
1813 if (tcg_enabled()) {
1814 cpu_exit(cpu);
1815 /* NOP unless doing single-thread RR */
1816 qemu_cpu_kick_rr_cpu();
1817 } else {
1818 if (hax_enabled()) {
1819 /*
1820 * FIXME: race condition with the exit_request check in
1821 * hax_vcpu_hax_exec
1822 */
1823 cpu->exit_request = 1;
1824 }
1825 qemu_cpu_kick_thread(cpu);
1826 }
1827 }
1828
1829 void qemu_cpu_kick_self(void)
1830 {
1831 assert(current_cpu);
1832 qemu_cpu_kick_thread(current_cpu);
1833 }
1834
1835 bool qemu_cpu_is_self(CPUState *cpu)
1836 {
1837 return qemu_thread_is_self(cpu->thread);
1838 }
1839
1840 bool qemu_in_vcpu_thread(void)
1841 {
1842 return current_cpu && qemu_cpu_is_self(current_cpu);
1843 }
1844
1845 static __thread bool iothread_locked = false;
1846
1847 bool qemu_mutex_iothread_locked(void)
1848 {
1849 return iothread_locked;
1850 }
1851
1852 /*
1853 * The BQL is taken from so many places that it is worth profiling the
1854 * callers directly, instead of funneling them all through a single function.
1855 */
1856 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1857 {
1858 QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1859
1860 g_assert(!qemu_mutex_iothread_locked());
1861 bql_lock(&qemu_global_mutex, file, line);
1862 iothread_locked = true;
1863 }
1864
1865 void qemu_mutex_unlock_iothread(void)
1866 {
1867 g_assert(qemu_mutex_iothread_locked());
1868 iothread_locked = false;
1869 qemu_mutex_unlock(&qemu_global_mutex);
1870 }
1871
1872 static bool all_vcpus_paused(void)
1873 {
1874 CPUState *cpu;
1875
1876 CPU_FOREACH(cpu) {
1877 if (!cpu->stopped) {
1878 return false;
1879 }
1880 }
1881
1882 return true;
1883 }
1884
1885 void pause_all_vcpus(void)
1886 {
1887 CPUState *cpu;
1888
1889 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1890 CPU_FOREACH(cpu) {
1891 if (qemu_cpu_is_self(cpu)) {
1892 qemu_cpu_stop(cpu, true);
1893 } else {
1894 cpu->stop = true;
1895 qemu_cpu_kick(cpu);
1896 }
1897 }
1898
1899 /* We need to drop the replay_lock so any vCPU threads woken up
1900 * can finish their replay tasks
1901 */
1902 replay_mutex_unlock();
1903
1904 while (!all_vcpus_paused()) {
1905 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
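        /* Keep kicking: a vCPU may not have observed cpu->stop yet. */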
1906 CPU_FOREACH(cpu) {
1907 qemu_cpu_kick(cpu);
1908 }
1909 }
1910
1911 qemu_mutex_unlock_iothread();
1912 replay_mutex_lock();
1913 qemu_mutex_lock_iothread();
1914 }
1915
1916 void cpu_resume(CPUState *cpu)
1917 {
1918 cpu->stop = false;
1919 cpu->stopped = false;
1920 qemu_cpu_kick(cpu);
1921 }
1922
1923 void resume_all_vcpus(void)
1924 {
1925 CPUState *cpu;
1926
1927 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1928 CPU_FOREACH(cpu) {
1929 cpu_resume(cpu);
1930 }
1931 }
1932
1933 void cpu_remove_sync(CPUState *cpu)
1934 {
1935 cpu->stop = true;
1936 cpu->unplug = true;
1937 qemu_cpu_kick(cpu);
1938 qemu_mutex_unlock_iothread();
1939 qemu_thread_join(cpu->thread);
1940 qemu_mutex_lock_iothread();
1941 }
1942
1943 /* Size of the temporary buffer used to form a vCPU thread name */
1944 #define VCPU_THREAD_NAME_SIZE 16
1945
1946 static void qemu_tcg_init_vcpu(CPUState *cpu)
1947 {
1948 char thread_name[VCPU_THREAD_NAME_SIZE];
1949 static QemuCond *single_tcg_halt_cond;
1950 static QemuThread *single_tcg_cpu_thread;
1951 static int tcg_region_inited;
1952
1953 assert(tcg_enabled());
1954 /*
1955 * Initialize TCG regions--once. Now is a good time, because:
1956 * (1) TCG's init context, prologue and target globals have been set up.
1957 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1958 * -accel flag is processed, so the check doesn't work then).
1959 */
1960 if (!tcg_region_inited) {
1961 tcg_region_inited = 1;
1962 tcg_region_init();
1963 }
1964
1965 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1966 cpu->thread = g_malloc0(sizeof(QemuThread));
1967 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1968 qemu_cond_init(cpu->halt_cond);
1969
1970 if (qemu_tcg_mttcg_enabled()) {
1971 /* create a thread per vCPU with TCG (MTTCG) */
1972 parallel_cpus = true;
1973 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1974 cpu->cpu_index);
1975
1976 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1977 cpu, QEMU_THREAD_JOINABLE);
1978
1979 } else {
1980 /* share a single thread for all cpus with TCG */
1981 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1982 qemu_thread_create(cpu->thread, thread_name,
1983 qemu_tcg_rr_cpu_thread_fn,
1984 cpu, QEMU_THREAD_JOINABLE);
1985
1986 single_tcg_halt_cond = cpu->halt_cond;
1987 single_tcg_cpu_thread = cpu->thread;
1988 }
1989 #ifdef _WIN32
1990 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1991 #endif
1992 } else {
1993 /* For non-MTTCG cases we share the thread */
1994 cpu->thread = single_tcg_cpu_thread;
1995 cpu->halt_cond = single_tcg_halt_cond;
1996 cpu->thread_id = first_cpu->thread_id;
1997 cpu->can_do_io = 1;
1998 cpu->created = true;
1999 }
2000 }
2001
2002 static void qemu_hax_start_vcpu(CPUState *cpu)
2003 {
2004 char thread_name[VCPU_THREAD_NAME_SIZE];
2005
2006 cpu->thread = g_malloc0(sizeof(QemuThread));
2007 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2008 qemu_cond_init(cpu->halt_cond);
2009
2010 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2011 cpu->cpu_index);
2012 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2013 cpu, QEMU_THREAD_JOINABLE);
2014 #ifdef _WIN32
2015 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2016 #endif
2017 }
2018
2019 static void qemu_kvm_start_vcpu(CPUState *cpu)
2020 {
2021 char thread_name[VCPU_THREAD_NAME_SIZE];
2022
2023 cpu->thread = g_malloc0(sizeof(QemuThread));
2024 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2025 qemu_cond_init(cpu->halt_cond);
2026 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2027 cpu->cpu_index);
2028 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2029 cpu, QEMU_THREAD_JOINABLE);
2030 }
2031
2032 static void qemu_hvf_start_vcpu(CPUState *cpu)
2033 {
2034 char thread_name[VCPU_THREAD_NAME_SIZE];
2035
2036 /* HVF currently does not support TCG, and only runs in
2037 * unrestricted-guest mode. */
2038 assert(hvf_enabled());
2039
2040 cpu->thread = g_malloc0(sizeof(QemuThread));
2041 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2042 qemu_cond_init(cpu->halt_cond);
2043
2044 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2045 cpu->cpu_index);
2046 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2047 cpu, QEMU_THREAD_JOINABLE);
2048 }
2049
2050 static void qemu_whpx_start_vcpu(CPUState *cpu)
2051 {
2052 char thread_name[VCPU_THREAD_NAME_SIZE];
2053
2054 cpu->thread = g_malloc0(sizeof(QemuThread));
2055 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2056 qemu_cond_init(cpu->halt_cond);
2057 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2058 cpu->cpu_index);
2059 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2060 cpu, QEMU_THREAD_JOINABLE);
2061 #ifdef _WIN32
2062 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2063 #endif
2064 }
2065
2066 static void qemu_dummy_start_vcpu(CPUState *cpu)
2067 {
2068 char thread_name[VCPU_THREAD_NAME_SIZE];
2069
2070 cpu->thread = g_malloc0(sizeof(QemuThread));
2071 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2072 qemu_cond_init(cpu->halt_cond);
2073 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2074 cpu->cpu_index);
2075 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2076 QEMU_THREAD_JOINABLE);
2077 }
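/*
 * Editorial note, not part of the original file: the dummy thread is the
 * fallback when no real accelerator is active (for example under qtest);
 * such a vCPU never executes guest code, its thread only waits on
 * halt_cond and services queued work items.
 */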
2078
2079 void qemu_init_vcpu(CPUState *cpu)
2080 {
2081 cpu->nr_cores = smp_cores;
2082 cpu->nr_threads = smp_threads;
2083 cpu->stopped = true;
2084 cpu->random_seed = qemu_guest_random_seed_thread_part1();
2085
2086 if (!cpu->as) {
2087 /* If the target cpu hasn't set up any address spaces itself,
2088 * give it the default one.
2089 */
2090 cpu->num_ases = 1;
2091 cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2092 }
2093
2094 if (kvm_enabled()) {
2095 qemu_kvm_start_vcpu(cpu);
2096 } else if (hax_enabled()) {
2097 qemu_hax_start_vcpu(cpu);
2098 } else if (hvf_enabled()) {
2099 qemu_hvf_start_vcpu(cpu);
2100 } else if (tcg_enabled()) {
2101 qemu_tcg_init_vcpu(cpu);
2102 } else if (whpx_enabled()) {
2103 qemu_whpx_start_vcpu(cpu);
2104 } else {
2105 qemu_dummy_start_vcpu(cpu);
2106 }
2107
2108 while (!cpu->created) {
2109 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2110 }
2111 }
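/*
 * Editorial sketch, not part of the original file: the wait loop above
 * relies on every accelerator's thread function performing roughly this
 * handshake under the BQL before entering its run loop (the function name
 * below is hypothetical):
 *
 *   static void *example_cpu_thread_fn(void *arg)
 *   {
 *       CPUState *cpu = arg;
 *
 *       qemu_mutex_lock_iothread();
 *       qemu_thread_get_self(cpu->thread);
 *       cpu->thread_id = qemu_get_thread_id();
 *       cpu->can_do_io = 1;
 *       current_cpu = cpu;
 *       cpu->created = true;
 *       qemu_cond_signal(&qemu_cpu_cond);   <- this wakes qemu_init_vcpu()
 *
 *       ... per-accelerator vCPU run loop ...
 *
 *       qemu_mutex_unlock_iothread();
 *       return NULL;
 *   }
 */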
2112
2113 void cpu_stop_current(void)
2114 {
2115 if (current_cpu) {
2116 current_cpu->stop = true;
2117 cpu_exit(current_cpu);
2118 }
2119 }
2120
2121 int vm_stop(RunState state)
2122 {
2123 if (qemu_in_vcpu_thread()) {
2124 qemu_system_vmstop_request_prepare();
2125 qemu_system_vmstop_request(state);
2126 /*
2127 * FIXME: should not return to device code once a
2128 * vm_stop() has been requested.
2129 */
2130 cpu_stop_current();
2131 return 0;
2132 }
2133
2134 return do_vm_stop(state, true);
2135 }
2136
2137 /**
2138 * Prepare for (re)starting the VM.
2139 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2140 * running or in case of an error condition), 0 otherwise.
2141 */
2142 int vm_prepare_start(void)
2143 {
2144 RunState requested;
2145
2146 qemu_vmstop_requested(&requested);
2147 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2148 return -1;
2149 }
2150
2151 /* Ensure that a STOP/RESUME pair of events is emitted if a
2152 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2153 * example, is documented to always be followed by
2154 * the STOP event.
2155 */
2156 if (runstate_is_running()) {
2157 qapi_event_send_stop();
2158 qapi_event_send_resume();
2159 return -1;
2160 }
2161
2162 /* We send this event now, but the CPUs will only be resumed shortly afterwards */
2163 qapi_event_send_resume();
2164
2165 replay_enable_events();
2166 cpu_enable_ticks();
2167 runstate_set(RUN_STATE_RUNNING);
2168 vm_state_notify(1, RUN_STATE_RUNNING);
2169 return 0;
2170 }
2171
2172 void vm_start(void)
2173 {
2174 if (!vm_prepare_start()) {
2175 resume_all_vcpus();
2176 }
2177 }
2178
2179 /* Performs a state transition even if the VM is already stopped;
2180 * the current state is forgotten forever. */
2181 int vm_stop_force_state(RunState state)
2182 {
2183 if (runstate_is_running()) {
2184 return vm_stop(state);
2185 } else {
2186 runstate_set(state);
2187
2188 bdrv_drain_all();
2189 /* Make sure to return an error if the flush in a previous vm_stop()
2190 * failed. */
2191 return bdrv_flush_all();
2192 }
2193 }
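/*
 * Editorial note, not part of the original file: a typical caller of
 * vm_stop_force_state() is the migration code, which needs the new
 * runstate recorded even when the guest is already paused, roughly:
 *
 *   ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
 *   if (ret < 0) {
 *       ... fail the migration ...
 *   }
 */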
2194
2195 void list_cpus(const char *optarg)
2196 {
2197 /* XXX: implement xxx_cpu_list for targets that are still missing it */
2198 #if defined(cpu_list)
2199 cpu_list();
2200 #endif
2201 }
2202
2203 CpuInfoList *qmp_query_cpus(Error **errp)
2204 {
2205 MachineState *ms = MACHINE(qdev_get_machine());
2206 MachineClass *mc = MACHINE_GET_CLASS(ms);
2207 CpuInfoList *head = NULL, *cur_item = NULL;
2208 CPUState *cpu;
2209
2210 CPU_FOREACH(cpu) {
2211 CpuInfoList *info;
2212 #if defined(TARGET_I386)
2213 X86CPU *x86_cpu = X86_CPU(cpu);
2214 CPUX86State *env = &x86_cpu->env;
2215 #elif defined(TARGET_PPC)
2216 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2217 CPUPPCState *env = &ppc_cpu->env;
2218 #elif defined(TARGET_SPARC)
2219 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2220 CPUSPARCState *env = &sparc_cpu->env;
2221 #elif defined(TARGET_RISCV)
2222 RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2223 CPURISCVState *env = &riscv_cpu->env;
2224 #elif defined(TARGET_MIPS)
2225 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2226 CPUMIPSState *env = &mips_cpu->env;
2227 #elif defined(TARGET_TRICORE)
2228 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2229 CPUTriCoreState *env = &tricore_cpu->env;
2230 #elif defined(TARGET_S390X)
2231 S390CPU *s390_cpu = S390_CPU(cpu);
2232 CPUS390XState *env = &s390_cpu->env;
2233 #endif
2234
2235 cpu_synchronize_state(cpu);
2236
2237 info = g_malloc0(sizeof(*info));
2238 info->value = g_malloc0(sizeof(*info->value));
2239 info->value->CPU = cpu->cpu_index;
2240 info->value->current = (cpu == first_cpu);
2241 info->value->halted = cpu->halted;
2242 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2243 info->value->thread_id = cpu->thread_id;
2244 #if defined(TARGET_I386)
2245 info->value->arch = CPU_INFO_ARCH_X86;
2246 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2247 #elif defined(TARGET_PPC)
2248 info->value->arch = CPU_INFO_ARCH_PPC;
2249 info->value->u.ppc.nip = env->nip;
2250 #elif defined(TARGET_SPARC)
2251 info->value->arch = CPU_INFO_ARCH_SPARC;
2252 info->value->u.q_sparc.pc = env->pc;
2253 info->value->u.q_sparc.npc = env->npc;
2254 #elif defined(TARGET_MIPS)
2255 info->value->arch = CPU_INFO_ARCH_MIPS;
2256 info->value->u.q_mips.PC = env->active_tc.PC;
2257 #elif defined(TARGET_TRICORE)
2258 info->value->arch = CPU_INFO_ARCH_TRICORE;
2259 info->value->u.tricore.PC = env->PC;
2260 #elif defined(TARGET_S390X)
2261 info->value->arch = CPU_INFO_ARCH_S390;
2262 info->value->u.s390.cpu_state = env->cpu_state;
2263 #elif defined(TARGET_RISCV)
2264 info->value->arch = CPU_INFO_ARCH_RISCV;
2265 info->value->u.riscv.pc = env->pc;
2266 #else
2267 info->value->arch = CPU_INFO_ARCH_OTHER;
2268 #endif
2269 info->value->has_props = !!mc->cpu_index_to_instance_props;
2270 if (info->value->has_props) {
2271 CpuInstanceProperties *props;
2272 props = g_malloc0(sizeof(*props));
2273 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2274 info->value->props = props;
2275 }
2276
2277 /* XXX: waiting for the qapi to support GSList */
2278 if (!cur_item) {
2279 head = cur_item = info;
2280 } else {
2281 cur_item->next = info;
2282 cur_item = info;
2283 }
2284 }
2285
2286 return head;
2287 }
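/*
 * Editorial note, not part of the original file: this function backs the
 * QMP "query-cpus" command.  An abridged exchange might look like this
 * (field values invented for illustration):
 *
 *   -> { "execute": "query-cpus" }
 *   <- { "return": [ { "CPU": 0, "current": true, "halted": false,
 *                      "qom_path": "/machine/unattached/device[0]",
 *                      "thread_id": 3134, "arch": "x86",
 *                      "pc": 1048576 } ] }
 */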
2288
2289 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2290 {
2291 /*
2292 * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2293 * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2294 */
2295 switch (target) {
2296 case SYS_EMU_TARGET_I386:
2297 case SYS_EMU_TARGET_X86_64:
2298 return CPU_INFO_ARCH_X86;
2299
2300 case SYS_EMU_TARGET_PPC:
2301 case SYS_EMU_TARGET_PPC64:
2302 return CPU_INFO_ARCH_PPC;
2303
2304 case SYS_EMU_TARGET_SPARC:
2305 case SYS_EMU_TARGET_SPARC64:
2306 return CPU_INFO_ARCH_SPARC;
2307
2308 case SYS_EMU_TARGET_MIPS:
2309 case SYS_EMU_TARGET_MIPSEL:
2310 case SYS_EMU_TARGET_MIPS64:
2311 case SYS_EMU_TARGET_MIPS64EL:
2312 return CPU_INFO_ARCH_MIPS;
2313
2314 case SYS_EMU_TARGET_TRICORE:
2315 return CPU_INFO_ARCH_TRICORE;
2316
2317 case SYS_EMU_TARGET_S390X:
2318 return CPU_INFO_ARCH_S390;
2319
2320 case SYS_EMU_TARGET_RISCV32:
2321 case SYS_EMU_TARGET_RISCV64:
2322 return CPU_INFO_ARCH_RISCV;
2323
2324 default:
2325 return CPU_INFO_ARCH_OTHER;
2326 }
2327 }
2328
2329 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2330 {
2331 #ifdef TARGET_S390X
2332 S390CPU *s390_cpu = S390_CPU(cpu);
2333 CPUS390XState *env = &s390_cpu->env;
2334
2335 info->cpu_state = env->cpu_state;
2336 #else
2337 abort();
2338 #endif
2339 }
2340
2341 /*
2342 * "fast" means that we NEVER interrupt the vCPU threads to retrieve
2343 * information from KVM.
2344 */
2345 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2346 {
2347 MachineState *ms = MACHINE(qdev_get_machine());
2348 MachineClass *mc = MACHINE_GET_CLASS(ms);
2349 CpuInfoFastList *head = NULL, *cur_item = NULL;
2350 SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2351 -1, &error_abort);
2352 CPUState *cpu;
2353
2354 CPU_FOREACH(cpu) {
2355 CpuInfoFastList *info = g_malloc0(sizeof(*info));
2356 info->value = g_malloc0(sizeof(*info->value));
2357
2358 info->value->cpu_index = cpu->cpu_index;
2359 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2360 info->value->thread_id = cpu->thread_id;
2361
2362 info->value->has_props = !!mc->cpu_index_to_instance_props;
2363 if (info->value->has_props) {
2364 CpuInstanceProperties *props;
2365 props = g_malloc0(sizeof(*props));
2366 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2367 info->value->props = props;
2368 }
2369
2370 info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2371 info->value->target = target;
2372 if (target == SYS_EMU_TARGET_S390X) {
2373 cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2374 }
2375
2376 if (!cur_item) {
2377 head = cur_item = info;
2378 } else {
2379 cur_item->next = info;
2380 cur_item = info;
2381 }
2382 }
2383
2384 return head;
2385 }
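/*
 * Editorial note, not part of the original file: the corresponding
 * "query-cpus-fast" exchange, abridged (field values invented):
 *
 *   -> { "execute": "query-cpus-fast" }
 *   <- { "return": [ { "cpu-index": 0,
 *                      "qom-path": "/machine/unattached/device[0]",
 *                      "thread-id": 3134, "arch": "x86",
 *                      "target": "x86_64" } ] }
 */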
2386
2387 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2388 bool has_cpu, int64_t cpu_index, Error **errp)
2389 {
2390 FILE *f;
2391 uint32_t l;
2392 CPUState *cpu;
2393 uint8_t buf[1024];
2394 int64_t orig_addr = addr, orig_size = size;
2395
2396 if (!has_cpu) {
2397 cpu_index = 0;
2398 }
2399
2400 cpu = qemu_get_cpu(cpu_index);
2401 if (cpu == NULL) {
2402 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2403 "a CPU number");
2404 return;
2405 }
2406
2407 f = fopen(filename, "wb");
2408 if (!f) {
2409 error_setg_file_open(errp, errno, filename);
2410 return;
2411 }
2412
2413 while (size != 0) {
2414 l = sizeof(buf);
2415 if (l > size)
2416 l = size;
2417 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2418 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2419 " specified", orig_addr, orig_size);
2420 goto exit;
2421 }
2422 if (fwrite(buf, 1, l, f) != l) {
2423 error_setg(errp, QERR_IO_ERROR);
2424 goto exit;
2425 }
2426 addr += l;
2427 size -= l;
2428 }
2429
2430 exit:
2431 fclose(f);
2432 }
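/*
 * Editorial note, not part of the original file: qmp_memsave() backs the
 * QMP "memsave" command, which dumps guest *virtual* memory as seen by the
 * selected vCPU.  A possible invocation (argument names assumed from the
 * QAPI schema, values invented):
 *
 *   -> { "execute": "memsave",
 *        "arguments": { "val": 4194304, "size": 1024,
 *                       "filename": "/tmp/virtual-mem-dump" } }
 *   <- { "return": {} }
 */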
2433
2434 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2435 Error **errp)
2436 {
2437 FILE *f;
2438 uint32_t l;
2439 uint8_t buf[1024];
2440
2441 f = fopen(filename, "wb");
2442 if (!f) {
2443 error_setg_file_open(errp, errno, filename);
2444 return;
2445 }
2446
2447 while (size != 0) {
2448 l = sizeof(buf);
2449 if (l > size)
2450 l = size;
2451 cpu_physical_memory_read(addr, buf, l);
2452 if (fwrite(buf, 1, l, f) != l) {
2453 error_setg(errp, QERR_IO_ERROR);
2454 goto exit;
2455 }
2456 addr += l;
2457 size -= l;
2458 }
2459
2460 exit:
2461 fclose(f);
2462 }
2463
2464 void qmp_inject_nmi(Error **errp)
2465 {
2466 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2467 }
2468
2469 void dump_drift_info(void)
2470 {
2471 if (!use_icount) {
2472 return;
2473 }
2474
2475 qemu_printf("Host - Guest clock %"PRIi64" ms\n",
2476 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2477 if (icount_align_option) {
2478 qemu_printf("Max guest delay %"PRIi64" ms\n",
2479 -max_delay / SCALE_MS);
2480 qemu_printf("Max guest advance %"PRIi64" ms\n",
2481 max_advance / SCALE_MS);
2482 } else {
2483 qemu_printf("Max guest delay NA\n");
2484 qemu_printf("Max guest advance NA\n");
2485 }
2486 }
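/*
 * Editorial note, not part of the original file: with -icount enabled, the
 * output produced above looks roughly like (numbers invented):
 *
 *   Host - Guest clock -5 ms
 *   Max guest delay 12 ms
 *   Max guest advance 3 ms
 *
 * The two "Max guest ..." lines carry real values only when -icount was
 * given with align=on (icount_align_option); otherwise "NA" is printed.
 */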