]> git.proxmox.com Git - mirror_qemu.git/blob - cpus.c
Merge remote-tracking branch 'remotes/jnsnow/tags/bitmaps-pull-request' into staging
[mirror_qemu.git] / cpus.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "qemu/osdep.h"
26 #include "qemu/config-file.h"
27 #include "cpu.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "qemu/qemu-print.h"
35 #include "sysemu/sysemu.h"
36 #include "sysemu/block-backend.h"
37 #include "exec/gdbstub.h"
38 #include "sysemu/dma.h"
39 #include "sysemu/hw_accel.h"
40 #include "sysemu/kvm.h"
41 #include "sysemu/hax.h"
42 #include "sysemu/hvf.h"
43 #include "sysemu/whpx.h"
44 #include "exec/exec-all.h"
45
46 #include "qemu/thread.h"
47 #include "sysemu/cpus.h"
48 #include "sysemu/qtest.h"
49 #include "qemu/main-loop.h"
50 #include "qemu/option.h"
51 #include "qemu/bitmap.h"
52 #include "qemu/seqlock.h"
53 #include "tcg.h"
54 #include "hw/nmi.h"
55 #include "sysemu/replay.h"
56 #include "hw/boards.h"
57
58 #ifdef CONFIG_LINUX
59
60 #include <sys/prctl.h>
61
62 #ifndef PR_MCE_KILL
63 #define PR_MCE_KILL 33
64 #endif
65
66 #ifndef PR_MCE_KILL_SET
67 #define PR_MCE_KILL_SET 1
68 #endif
69
70 #ifndef PR_MCE_KILL_EARLY
71 #define PR_MCE_KILL_EARLY 1
72 #endif
73
74 #endif /* CONFIG_LINUX */
75
/*
 * Globally visible delay/advance bookkeeping values.
 * NOTE(review): neither is read or written in this part of the file;
 * confirm their units and users where they are referenced.
 */
int64_t max_delay;
int64_t max_advance;
78
79 /* vcpu throttling controls */
80 static QEMUTimer *throttle_timer;
81 static unsigned int throttle_percentage;
82
83 #define CPU_THROTTLE_PCT_MIN 1
84 #define CPU_THROTTLE_PCT_MAX 99
85 #define CPU_THROTTLE_TIMESLICE_NS 10000000
86
87 bool cpu_is_stopped(CPUState *cpu)
88 {
89 return cpu->stopped || !runstate_is_running();
90 }
91
92 static bool cpu_thread_is_idle(CPUState *cpu)
93 {
94 if (cpu->stop || cpu->queued_work_first) {
95 return false;
96 }
97 if (cpu_is_stopped(cpu)) {
98 return true;
99 }
100 if (!cpu->halted || cpu_has_work(cpu) ||
101 kvm_halt_in_kernel()) {
102 return false;
103 }
104 return true;
105 }
106
107 static bool all_cpu_threads_idle(void)
108 {
109 CPUState *cpu;
110
111 CPU_FOREACH(cpu) {
112 if (!cpu_thread_is_idle(cpu)) {
113 return false;
114 }
115 }
116 return true;
117 }
118
119 /***********************************************************/
120 /* guest cycle counter */
121
122 /* Protected by TimersState seqlock */
123
/* Value of the icount "sleep" option; set by configure_icount(). */
static bool icount_sleep = true;

/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
#define MAX_ICOUNT_SHIFT 10
127
128 typedef struct TimersState {
129 /* Protected by BQL. */
130 int64_t cpu_ticks_prev;
131 int64_t cpu_ticks_offset;
132
133 /* Protect fields that can be respectively read outside the
134 * BQL, and written from multiple threads.
135 */
136 QemuSeqLock vm_clock_seqlock;
137 QemuSpin vm_clock_lock;
138
139 int16_t cpu_ticks_enabled;
140
141 /* Conversion factor from emulated instructions to virtual clock ticks. */
142 int16_t icount_time_shift;
143
144 /* Compensate for varying guest execution speed. */
145 int64_t qemu_icount_bias;
146
147 int64_t vm_clock_warp_start;
148 int64_t cpu_clock_offset;
149
150 /* Only written by TCG thread */
151 int64_t qemu_icount;
152
153 /* for adjusting icount */
154 QEMUTimer *icount_rt_timer;
155 QEMUTimer *icount_vm_timer;
156 QEMUTimer *icount_warp_timer;
157 } TimersState;
158
159 static TimersState timers_state;
160 bool mttcg_enabled;
161
162 /*
163 * We default to false if we know other options have been enabled
164 * which are currently incompatible with MTTCG. Otherwise when each
165 * guest (target) has been updated to support:
166 * - atomic instructions
167 * - memory ordering primitives (barriers)
168 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
169 *
170 * Once a guest architecture has been converted to the new primitives
171 * there are two remaining limitations to check.
172 *
173 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
174 * - The host must have a stronger memory order than the guest
175 *
176 * It may be possible in future to support strong guests on weak hosts
177 * but that will require tagging all load/stores in a guest with their
178 * implicit memory order requirements which would likely slow things
179 * down a lot.
180 */
181
/*
 * Return true when every memory-ordering bit the guest requires is also
 * provided by the host.  If either default is not defined at build time the
 * answer is false.
 */
static bool check_tcg_memory_orders_compatible(void)
{
#if !defined(TCG_GUEST_DEFAULT_MO) || !defined(TCG_TARGET_DEFAULT_MO)
    return false;
#else
    return !(TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO);
#endif
}
190
191 static bool default_mttcg_enabled(void)
192 {
193 if (use_icount || TCG_OVERSIZED_GUEST) {
194 return false;
195 } else {
196 #ifdef TARGET_SUPPORTS_MTTCG
197 return check_tcg_memory_orders_compatible();
198 #else
199 return false;
200 #endif
201 }
202 }
203
204 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
205 {
206 const char *t = qemu_opt_get(opts, "thread");
207 if (t) {
208 if (strcmp(t, "multi") == 0) {
209 if (TCG_OVERSIZED_GUEST) {
210 error_setg(errp, "No MTTCG when guest word size > hosts");
211 } else if (use_icount) {
212 error_setg(errp, "No MTTCG when icount is enabled");
213 } else {
214 #ifndef TARGET_SUPPORTS_MTTCG
215 warn_report("Guest not yet converted to MTTCG - "
216 "you may get unexpected results");
217 #endif
218 if (!check_tcg_memory_orders_compatible()) {
219 warn_report("Guest expects a stronger memory ordering "
220 "than the host provides");
221 error_printf("This may cause strange/hard to debug errors\n");
222 }
223 mttcg_enabled = true;
224 }
225 } else if (strcmp(t, "single") == 0) {
226 mttcg_enabled = false;
227 } else {
228 error_setg(errp, "Invalid 'thread' setting %s", t);
229 }
230 } else {
231 mttcg_enabled = default_mttcg_enabled();
232 }
233 }
234
235 /* The current number of executed instructions is based on what we
236 * originally budgeted minus the current state of the decrementing
237 * icount counters in extra/u16.low.
238 */
239 static int64_t cpu_get_icount_executed(CPUState *cpu)
240 {
241 return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
242 }
243
244 /*
245 * Update the global shared timer_state.qemu_icount to take into
246 * account executed instructions. This is done by the TCG vCPU
247 * thread so the main-loop can see time has moved forward.
248 */
249 static void cpu_update_icount_locked(CPUState *cpu)
250 {
251 int64_t executed = cpu_get_icount_executed(cpu);
252 cpu->icount_budget -= executed;
253
254 atomic_set_i64(&timers_state.qemu_icount,
255 timers_state.qemu_icount + executed);
256 }
257
258 /*
259 * Update the global shared timer_state.qemu_icount to take into
260 * account executed instructions. This is done by the TCG vCPU
261 * thread so the main-loop can see time has moved forward.
262 */
263 void cpu_update_icount(CPUState *cpu)
264 {
265 seqlock_write_lock(&timers_state.vm_clock_seqlock,
266 &timers_state.vm_clock_lock);
267 cpu_update_icount_locked(cpu);
268 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
269 &timers_state.vm_clock_lock);
270 }
271
272 static int64_t cpu_get_icount_raw_locked(void)
273 {
274 CPUState *cpu = current_cpu;
275
276 if (cpu && cpu->running) {
277 if (!cpu->can_do_io) {
278 error_report("Bad icount read");
279 exit(1);
280 }
281 /* Take into account what has run */
282 cpu_update_icount_locked(cpu);
283 }
284 /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
285 return atomic_read_i64(&timers_state.qemu_icount);
286 }
287
288 static int64_t cpu_get_icount_locked(void)
289 {
290 int64_t icount = cpu_get_icount_raw_locked();
291 return atomic_read_i64(&timers_state.qemu_icount_bias) +
292 cpu_icount_to_ns(icount);
293 }
294
295 int64_t cpu_get_icount_raw(void)
296 {
297 int64_t icount;
298 unsigned start;
299
300 do {
301 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
302 icount = cpu_get_icount_raw_locked();
303 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
304
305 return icount;
306 }
307
308 /* Return the virtual CPU time, based on the instruction counter. */
309 int64_t cpu_get_icount(void)
310 {
311 int64_t icount;
312 unsigned start;
313
314 do {
315 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
316 icount = cpu_get_icount_locked();
317 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
318
319 return icount;
320 }
321
322 int64_t cpu_icount_to_ns(int64_t icount)
323 {
324 return icount << atomic_read(&timers_state.icount_time_shift);
325 }
326
327 static int64_t cpu_get_ticks_locked(void)
328 {
329 int64_t ticks = timers_state.cpu_ticks_offset;
330 if (timers_state.cpu_ticks_enabled) {
331 ticks += cpu_get_host_ticks();
332 }
333
334 if (timers_state.cpu_ticks_prev > ticks) {
335 /* Non increasing ticks may happen if the host uses software suspend. */
336 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
337 ticks = timers_state.cpu_ticks_prev;
338 }
339
340 timers_state.cpu_ticks_prev = ticks;
341 return ticks;
342 }
343
344 /* return the time elapsed in VM between vm_start and vm_stop. Unless
345 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
346 * counter.
347 */
348 int64_t cpu_get_ticks(void)
349 {
350 int64_t ticks;
351
352 if (use_icount) {
353 return cpu_get_icount();
354 }
355
356 qemu_spin_lock(&timers_state.vm_clock_lock);
357 ticks = cpu_get_ticks_locked();
358 qemu_spin_unlock(&timers_state.vm_clock_lock);
359 return ticks;
360 }
361
362 static int64_t cpu_get_clock_locked(void)
363 {
364 int64_t time;
365
366 time = timers_state.cpu_clock_offset;
367 if (timers_state.cpu_ticks_enabled) {
368 time += get_clock();
369 }
370
371 return time;
372 }
373
374 /* Return the monotonic time elapsed in VM, i.e.,
375 * the time between vm_start and vm_stop
376 */
377 int64_t cpu_get_clock(void)
378 {
379 int64_t ti;
380 unsigned start;
381
382 do {
383 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
384 ti = cpu_get_clock_locked();
385 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
386
387 return ti;
388 }
389
390 /* enable cpu_get_ticks()
391 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
392 */
393 void cpu_enable_ticks(void)
394 {
395 seqlock_write_lock(&timers_state.vm_clock_seqlock,
396 &timers_state.vm_clock_lock);
397 if (!timers_state.cpu_ticks_enabled) {
398 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
399 timers_state.cpu_clock_offset -= get_clock();
400 timers_state.cpu_ticks_enabled = 1;
401 }
402 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
403 &timers_state.vm_clock_lock);
404 }
405
406 /* disable cpu_get_ticks() : the clock is stopped. You must not call
407 * cpu_get_ticks() after that.
408 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
409 */
410 void cpu_disable_ticks(void)
411 {
412 seqlock_write_lock(&timers_state.vm_clock_seqlock,
413 &timers_state.vm_clock_lock);
414 if (timers_state.cpu_ticks_enabled) {
415 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
416 timers_state.cpu_clock_offset = cpu_get_clock_locked();
417 timers_state.cpu_ticks_enabled = 0;
418 }
419 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
420 &timers_state.vm_clock_lock);
421 }
422
/*
 * Correlation between real and virtual time is always going to be fairly
 * approximate, so ignore small variation.  When the guest is idle, real and
 * virtual time will be aligned in the IO wait loop.
 */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
428
429 static void icount_adjust(void)
430 {
431 int64_t cur_time;
432 int64_t cur_icount;
433 int64_t delta;
434
435 /* Protected by TimersState mutex. */
436 static int64_t last_delta;
437
438 /* If the VM is not running, then do nothing. */
439 if (!runstate_is_running()) {
440 return;
441 }
442
443 seqlock_write_lock(&timers_state.vm_clock_seqlock,
444 &timers_state.vm_clock_lock);
445 cur_time = cpu_get_clock_locked();
446 cur_icount = cpu_get_icount_locked();
447
448 delta = cur_icount - cur_time;
449 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
450 if (delta > 0
451 && last_delta + ICOUNT_WOBBLE < delta * 2
452 && timers_state.icount_time_shift > 0) {
453 /* The guest is getting too far ahead. Slow time down. */
454 atomic_set(&timers_state.icount_time_shift,
455 timers_state.icount_time_shift - 1);
456 }
457 if (delta < 0
458 && last_delta - ICOUNT_WOBBLE > delta * 2
459 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
460 /* The guest is getting too far behind. Speed time up. */
461 atomic_set(&timers_state.icount_time_shift,
462 timers_state.icount_time_shift + 1);
463 }
464 last_delta = delta;
465 atomic_set_i64(&timers_state.qemu_icount_bias,
466 cur_icount - (timers_state.qemu_icount
467 << timers_state.icount_time_shift));
468 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
469 &timers_state.vm_clock_lock);
470 }
471
472 static void icount_adjust_rt(void *opaque)
473 {
474 timer_mod(timers_state.icount_rt_timer,
475 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
476 icount_adjust();
477 }
478
479 static void icount_adjust_vm(void *opaque)
480 {
481 timer_mod(timers_state.icount_vm_timer,
482 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
483 NANOSECONDS_PER_SECOND / 10);
484 icount_adjust();
485 }
486
487 static int64_t qemu_icount_round(int64_t count)
488 {
489 int shift = atomic_read(&timers_state.icount_time_shift);
490 return (count + (1 << shift) - 1) >> shift;
491 }
492
493 static void icount_warp_rt(void)
494 {
495 unsigned seq;
496 int64_t warp_start;
497
498 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
499 * changes from -1 to another value, so the race here is okay.
500 */
501 do {
502 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
503 warp_start = timers_state.vm_clock_warp_start;
504 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
505
506 if (warp_start == -1) {
507 return;
508 }
509
510 seqlock_write_lock(&timers_state.vm_clock_seqlock,
511 &timers_state.vm_clock_lock);
512 if (runstate_is_running()) {
513 int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
514 cpu_get_clock_locked());
515 int64_t warp_delta;
516
517 warp_delta = clock - timers_state.vm_clock_warp_start;
518 if (use_icount == 2) {
519 /*
520 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
521 * far ahead of real time.
522 */
523 int64_t cur_icount = cpu_get_icount_locked();
524 int64_t delta = clock - cur_icount;
525 warp_delta = MIN(warp_delta, delta);
526 }
527 atomic_set_i64(&timers_state.qemu_icount_bias,
528 timers_state.qemu_icount_bias + warp_delta);
529 }
530 timers_state.vm_clock_warp_start = -1;
531 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
532 &timers_state.vm_clock_lock);
533
534 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
535 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
536 }
537 }
538
/*
 * Callback of icount_warp_timer; @opaque is unused.
 *
 * No need for a checkpoint because the timer already synchronizes
 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 */
static void icount_timer_cb(void *opaque)
{
    icount_warp_rt();
}
546
547 void qtest_clock_warp(int64_t dest)
548 {
549 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
550 AioContext *aio_context;
551 assert(qtest_enabled());
552 aio_context = qemu_get_aio_context();
553 while (clock < dest) {
554 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
555 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
556
557 seqlock_write_lock(&timers_state.vm_clock_seqlock,
558 &timers_state.vm_clock_lock);
559 atomic_set_i64(&timers_state.qemu_icount_bias,
560 timers_state.qemu_icount_bias + warp);
561 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
562 &timers_state.vm_clock_lock);
563
564 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
565 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
566 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
567 }
568 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
569 }
570
571 void qemu_start_warp_timer(void)
572 {
573 int64_t clock;
574 int64_t deadline;
575
576 if (!use_icount) {
577 return;
578 }
579
580 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
581 * do not fire, so computing the deadline does not make sense.
582 */
583 if (!runstate_is_running()) {
584 return;
585 }
586
587 if (replay_mode != REPLAY_MODE_PLAY) {
588 if (!all_cpu_threads_idle()) {
589 return;
590 }
591
592 if (qtest_enabled()) {
593 /* When testing, qtest commands advance icount. */
594 return;
595 }
596
597 replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
598 } else {
599 /* warp clock deterministically in record/replay mode */
600 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
601 /* vCPU is sleeping and warp can't be started.
602 It is probably a race condition: notification sent
603 to vCPU was processed in advance and vCPU went to sleep.
604 Therefore we have to wake it up for doing someting. */
605 if (replay_has_checkpoint()) {
606 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
607 }
608 return;
609 }
610 }
611
612 /* We want to use the earliest deadline from ALL vm_clocks */
613 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
614 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
615 if (deadline < 0) {
616 static bool notified;
617 if (!icount_sleep && !notified) {
618 warn_report("icount sleep disabled and no active timers");
619 notified = true;
620 }
621 return;
622 }
623
624 if (deadline > 0) {
625 /*
626 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
627 * sleep. Otherwise, the CPU might be waiting for a future timer
628 * interrupt to wake it up, but the interrupt never comes because
629 * the vCPU isn't running any insns and thus doesn't advance the
630 * QEMU_CLOCK_VIRTUAL.
631 */
632 if (!icount_sleep) {
633 /*
634 * We never let VCPUs sleep in no sleep icount mode.
635 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
636 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
637 * It is useful when we want a deterministic execution time,
638 * isolated from host latencies.
639 */
640 seqlock_write_lock(&timers_state.vm_clock_seqlock,
641 &timers_state.vm_clock_lock);
642 atomic_set_i64(&timers_state.qemu_icount_bias,
643 timers_state.qemu_icount_bias + deadline);
644 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
645 &timers_state.vm_clock_lock);
646 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
647 } else {
648 /*
649 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
650 * "real" time, (related to the time left until the next event) has
651 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
652 * This avoids that the warps are visible externally; for example,
653 * you will not be sending network packets continuously instead of
654 * every 100ms.
655 */
656 seqlock_write_lock(&timers_state.vm_clock_seqlock,
657 &timers_state.vm_clock_lock);
658 if (timers_state.vm_clock_warp_start == -1
659 || timers_state.vm_clock_warp_start > clock) {
660 timers_state.vm_clock_warp_start = clock;
661 }
662 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
663 &timers_state.vm_clock_lock);
664 timer_mod_anticipate(timers_state.icount_warp_timer,
665 clock + deadline);
666 }
667 } else if (deadline == 0) {
668 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
669 }
670 }
671
672 static void qemu_account_warp_timer(void)
673 {
674 if (!use_icount || !icount_sleep) {
675 return;
676 }
677
678 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
679 * do not fire, so computing the deadline does not make sense.
680 */
681 if (!runstate_is_running()) {
682 return;
683 }
684
685 /* warp clock deterministically in record/replay mode */
686 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
687 return;
688 }
689
690 timer_del(timers_state.icount_warp_timer);
691 icount_warp_rt();
692 }
693
694 static bool icount_state_needed(void *opaque)
695 {
696 return use_icount;
697 }
698
699 static bool warp_timer_state_needed(void *opaque)
700 {
701 TimersState *s = opaque;
702 return s->icount_warp_timer != NULL;
703 }
704
705 static bool adjust_timers_state_needed(void *opaque)
706 {
707 TimersState *s = opaque;
708 return s->icount_rt_timer != NULL;
709 }
710
711 /*
712 * Subsection for warp timer migration is optional, because may not be created
713 */
714 static const VMStateDescription icount_vmstate_warp_timer = {
715 .name = "timer/icount/warp_timer",
716 .version_id = 1,
717 .minimum_version_id = 1,
718 .needed = warp_timer_state_needed,
719 .fields = (VMStateField[]) {
720 VMSTATE_INT64(vm_clock_warp_start, TimersState),
721 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
722 VMSTATE_END_OF_LIST()
723 }
724 };
725
726 static const VMStateDescription icount_vmstate_adjust_timers = {
727 .name = "timer/icount/timers",
728 .version_id = 1,
729 .minimum_version_id = 1,
730 .needed = adjust_timers_state_needed,
731 .fields = (VMStateField[]) {
732 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
733 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
734 VMSTATE_END_OF_LIST()
735 }
736 };
737
738 /*
739 * This is a subsection for icount migration.
740 */
741 static const VMStateDescription icount_vmstate_timers = {
742 .name = "timer/icount",
743 .version_id = 1,
744 .minimum_version_id = 1,
745 .needed = icount_state_needed,
746 .fields = (VMStateField[]) {
747 VMSTATE_INT64(qemu_icount_bias, TimersState),
748 VMSTATE_INT64(qemu_icount, TimersState),
749 VMSTATE_END_OF_LIST()
750 },
751 .subsections = (const VMStateDescription*[]) {
752 &icount_vmstate_warp_timer,
753 &icount_vmstate_adjust_timers,
754 NULL
755 }
756 };
757
758 static const VMStateDescription vmstate_timers = {
759 .name = "timer",
760 .version_id = 2,
761 .minimum_version_id = 1,
762 .fields = (VMStateField[]) {
763 VMSTATE_INT64(cpu_ticks_offset, TimersState),
764 VMSTATE_UNUSED(8),
765 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
766 VMSTATE_END_OF_LIST()
767 },
768 .subsections = (const VMStateDescription*[]) {
769 &icount_vmstate_timers,
770 NULL
771 }
772 };
773
774 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
775 {
776 double pct;
777 double throttle_ratio;
778 long sleeptime_ns;
779
780 if (!cpu_throttle_get_percentage()) {
781 return;
782 }
783
784 pct = (double)cpu_throttle_get_percentage()/100;
785 throttle_ratio = pct / (1 - pct);
786 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
787
788 qemu_mutex_unlock_iothread();
789 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
790 qemu_mutex_lock_iothread();
791 atomic_set(&cpu->throttle_thread_scheduled, 0);
792 }
793
794 static void cpu_throttle_timer_tick(void *opaque)
795 {
796 CPUState *cpu;
797 double pct;
798
799 /* Stop the timer if needed */
800 if (!cpu_throttle_get_percentage()) {
801 return;
802 }
803 CPU_FOREACH(cpu) {
804 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
805 async_run_on_cpu(cpu, cpu_throttle_thread,
806 RUN_ON_CPU_NULL);
807 }
808 }
809
810 pct = (double)cpu_throttle_get_percentage()/100;
811 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
812 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
813 }
814
815 void cpu_throttle_set(int new_throttle_pct)
816 {
817 /* Ensure throttle percentage is within valid range */
818 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
819 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
820
821 atomic_set(&throttle_percentage, new_throttle_pct);
822
823 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
824 CPU_THROTTLE_TIMESLICE_NS);
825 }
826
827 void cpu_throttle_stop(void)
828 {
829 atomic_set(&throttle_percentage, 0);
830 }
831
/* Return true while a non-zero throttle percentage is set. */
bool cpu_throttle_active(void)
{
    int pct = cpu_throttle_get_percentage();

    return pct != 0;
}
836
837 int cpu_throttle_get_percentage(void)
838 {
839 return atomic_read(&throttle_percentage);
840 }
841
842 void cpu_ticks_init(void)
843 {
844 seqlock_init(&timers_state.vm_clock_seqlock);
845 qemu_spin_init(&timers_state.vm_clock_lock);
846 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
847 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
848 cpu_throttle_timer_tick, NULL);
849 }
850
851 void configure_icount(QemuOpts *opts, Error **errp)
852 {
853 const char *option;
854 char *rem_str = NULL;
855
856 option = qemu_opt_get(opts, "shift");
857 if (!option) {
858 if (qemu_opt_get(opts, "align") != NULL) {
859 error_setg(errp, "Please specify shift option when using align");
860 }
861 return;
862 }
863
864 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
865 if (icount_sleep) {
866 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
867 icount_timer_cb, NULL);
868 }
869
870 icount_align_option = qemu_opt_get_bool(opts, "align", false);
871
872 if (icount_align_option && !icount_sleep) {
873 error_setg(errp, "align=on and sleep=off are incompatible");
874 }
875 if (strcmp(option, "auto") != 0) {
876 errno = 0;
877 timers_state.icount_time_shift = strtol(option, &rem_str, 0);
878 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
879 error_setg(errp, "icount: Invalid shift value");
880 }
881 use_icount = 1;
882 return;
883 } else if (icount_align_option) {
884 error_setg(errp, "shift=auto and align=on are incompatible");
885 } else if (!icount_sleep) {
886 error_setg(errp, "shift=auto and sleep=off are incompatible");
887 }
888
889 use_icount = 2;
890
891 /* 125MIPS seems a reasonable initial guess at the guest speed.
892 It will be corrected fairly quickly anyway. */
893 timers_state.icount_time_shift = 3;
894
895 /* Have both realtime and virtual time triggers for speed adjustment.
896 The realtime trigger catches emulated time passing too slowly,
897 the virtual time trigger catches emulated time passing too fast.
898 Realtime triggers occur even when idle, so use them less frequently
899 than VM triggers. */
900 timers_state.vm_clock_warp_start = -1;
901 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
902 icount_adjust_rt, NULL);
903 timer_mod(timers_state.icount_rt_timer,
904 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
905 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
906 icount_adjust_vm, NULL);
907 timer_mod(timers_state.icount_vm_timer,
908 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
909 NANOSECONDS_PER_SECOND / 10);
910 }
911
912 /***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force the current vCPU to exit so the next vCPU can
 * get scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */
923
924 static QEMUTimer *tcg_kick_vcpu_timer;
925 static CPUState *tcg_current_rr_cpu;
926
927 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
928
929 static inline int64_t qemu_tcg_next_kick(void)
930 {
931 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
932 }
933
934 /* Kick the currently round-robin scheduled vCPU */
935 static void qemu_cpu_kick_rr_cpu(void)
936 {
937 CPUState *cpu;
938 do {
939 cpu = atomic_mb_read(&tcg_current_rr_cpu);
940 if (cpu) {
941 cpu_exit(cpu);
942 }
943 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
944 }
945
946 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
947 {
948 }
949
950 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
951 {
952 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
953 qemu_notify_event();
954 return;
955 }
956
957 if (qemu_in_vcpu_thread()) {
958 /* A CPU is currently running; kick it back out to the
959 * tcg_cpu_exec() loop so it will recalculate its
960 * icount deadline immediately.
961 */
962 qemu_cpu_kick(current_cpu);
963 } else if (first_cpu) {
964 /* qemu_cpu_kick is not enough to kick a halted CPU out of
965 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
966 * causes cpu_thread_is_idle to return false. This way,
967 * handle_icount_deadline can run.
968 * If we have no CPUs at all for some reason, we don't
969 * need to do anything.
970 */
971 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
972 }
973 }
974
975 static void kick_tcg_thread(void *opaque)
976 {
977 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
978 qemu_cpu_kick_rr_cpu();
979 }
980
981 static void start_tcg_kick_timer(void)
982 {
983 assert(!mttcg_enabled);
984 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
985 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
986 kick_tcg_thread, NULL);
987 }
988 if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
989 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
990 }
991 }
992
993 static void stop_tcg_kick_timer(void)
994 {
995 assert(!mttcg_enabled);
996 if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
997 timer_del(tcg_kick_vcpu_timer);
998 }
999 }
1000
1001 /***********************************************************/
1002 void hw_error(const char *fmt, ...)
1003 {
1004 va_list ap;
1005 CPUState *cpu;
1006
1007 va_start(ap, fmt);
1008 fprintf(stderr, "qemu: hardware error: ");
1009 vfprintf(stderr, fmt, ap);
1010 fprintf(stderr, "\n");
1011 CPU_FOREACH(cpu) {
1012 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1013 cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1014 }
1015 va_end(ap);
1016 abort();
1017 }
1018
1019 void cpu_synchronize_all_states(void)
1020 {
1021 CPUState *cpu;
1022
1023 CPU_FOREACH(cpu) {
1024 cpu_synchronize_state(cpu);
1025 /* TODO: move to cpu_synchronize_state() */
1026 if (hvf_enabled()) {
1027 hvf_cpu_synchronize_state(cpu);
1028 }
1029 }
1030 }
1031
1032 void cpu_synchronize_all_post_reset(void)
1033 {
1034 CPUState *cpu;
1035
1036 CPU_FOREACH(cpu) {
1037 cpu_synchronize_post_reset(cpu);
1038 /* TODO: move to cpu_synchronize_post_reset() */
1039 if (hvf_enabled()) {
1040 hvf_cpu_synchronize_post_reset(cpu);
1041 }
1042 }
1043 }
1044
1045 void cpu_synchronize_all_post_init(void)
1046 {
1047 CPUState *cpu;
1048
1049 CPU_FOREACH(cpu) {
1050 cpu_synchronize_post_init(cpu);
1051 /* TODO: move to cpu_synchronize_post_init() */
1052 if (hvf_enabled()) {
1053 hvf_cpu_synchronize_post_init(cpu);
1054 }
1055 }
1056 }
1057
1058 void cpu_synchronize_all_pre_loadvm(void)
1059 {
1060 CPUState *cpu;
1061
1062 CPU_FOREACH(cpu) {
1063 cpu_synchronize_pre_loadvm(cpu);
1064 }
1065 }
1066
1067 static int do_vm_stop(RunState state, bool send_stop)
1068 {
1069 int ret = 0;
1070
1071 if (runstate_is_running()) {
1072 cpu_disable_ticks();
1073 pause_all_vcpus();
1074 runstate_set(state);
1075 vm_state_notify(0, state);
1076 if (send_stop) {
1077 qapi_event_send_stop();
1078 }
1079 }
1080
1081 bdrv_drain_all();
1082 replay_disable_events();
1083 ret = bdrv_flush_all();
1084
1085 return ret;
1086 }
1087
1088 /* Special vm_stop() variant for terminating the process. Historically clients
1089 * did not expect a QMP STOP event and so we need to retain compatibility.
1090 */
1091 int vm_shutdown(void)
1092 {
1093 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1094 }
1095
1096 static bool cpu_can_run(CPUState *cpu)
1097 {
1098 if (cpu->stop) {
1099 return false;
1100 }
1101 if (cpu_is_stopped(cpu)) {
1102 return false;
1103 }
1104 return true;
1105 }
1106
1107 static void cpu_handle_guest_debug(CPUState *cpu)
1108 {
1109 gdb_set_stop_cpu(cpu);
1110 qemu_system_debug_request();
1111 cpu->stopped = true;
1112 }
1113
1114 #ifdef CONFIG_LINUX
1115 static void sigbus_reraise(void)
1116 {
1117 sigset_t set;
1118 struct sigaction action;
1119
1120 memset(&action, 0, sizeof(action));
1121 action.sa_handler = SIG_DFL;
1122 if (!sigaction(SIGBUS, &action, NULL)) {
1123 raise(SIGBUS);
1124 sigemptyset(&set);
1125 sigaddset(&set, SIGBUS);
1126 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1127 }
1128 perror("Failed to re-raise SIGBUS!\n");
1129 abort();
1130 }
1131
1132 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1133 {
1134 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1135 sigbus_reraise();
1136 }
1137
1138 if (current_cpu) {
1139 /* Called asynchronously in VCPU thread. */
1140 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1141 sigbus_reraise();
1142 }
1143 } else {
1144 /* Called synchronously (via signalfd) in main thread. */
1145 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1146 sigbus_reraise();
1147 }
1148 }
1149 }
1150
1151 static void qemu_init_sigbus(void)
1152 {
1153 struct sigaction action;
1154
1155 memset(&action, 0, sizeof(action));
1156 action.sa_flags = SA_SIGINFO;
1157 action.sa_sigaction = sigbus_handler;
1158 sigaction(SIGBUS, &action, NULL);
1159
1160 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1161 }
1162 #else /* !CONFIG_LINUX */
/* Variant for hosts without CONFIG_LINUX: no SIGBUS handler is installed. */
static void qemu_init_sigbus(void)
{
    /* Nothing to do. */
}
1166 #endif /* !CONFIG_LINUX */
1167
/*
 * The mutex taken by qemu_mutex_lock_iothread_impl() and released by
 * qemu_mutex_unlock_iothread(); it is also the mutex paired with the
 * condition variables below and with each CPU's halt_cond.
 */
static QemuMutex qemu_global_mutex;

/* The thread that called qemu_init_cpu_loop(). */
static QemuThread io_thread;

/*
 * cpu creation: signalled by the vCPU thread functions whenever they change
 * cpu->created; qemu_init_vcpu() waits on it until the CPU is created.
 */
static QemuCond qemu_cpu_cond;
/*
 * vCPU pause: broadcast by qemu_cpu_stop() when a vCPU has stopped;
 * pause_all_vcpus() waits on it until all vCPUs are paused.
 */
static QemuCond qemu_pause_cond;
1176
1177 void qemu_init_cpu_loop(void)
1178 {
1179 qemu_init_sigbus();
1180 qemu_cond_init(&qemu_cpu_cond);
1181 qemu_cond_init(&qemu_pause_cond);
1182 qemu_mutex_init(&qemu_global_mutex);
1183
1184 qemu_thread_get_self(&io_thread);
1185 }
1186
1187 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1188 {
1189 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1190 }
1191
1192 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1193 {
1194 if (kvm_destroy_vcpu(cpu) < 0) {
1195 error_report("kvm_destroy_vcpu failed");
1196 exit(EXIT_FAILURE);
1197 }
1198 }
1199
1200 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1201 {
1202 }
1203
1204 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1205 {
1206 g_assert(qemu_cpu_is_self(cpu));
1207 cpu->stop = false;
1208 cpu->stopped = true;
1209 if (exit) {
1210 cpu_exit(cpu);
1211 }
1212 qemu_cond_broadcast(&qemu_pause_cond);
1213 }
1214
1215 static void qemu_wait_io_event_common(CPUState *cpu)
1216 {
1217 atomic_mb_set(&cpu->thread_kicked, false);
1218 if (cpu->stop) {
1219 qemu_cpu_stop(cpu, false);
1220 }
1221 process_queued_cpu_work(cpu);
1222 }
1223
1224 static void qemu_tcg_rr_wait_io_event(void)
1225 {
1226 CPUState *cpu;
1227
1228 while (all_cpu_threads_idle()) {
1229 stop_tcg_kick_timer();
1230 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1231 }
1232
1233 start_tcg_kick_timer();
1234
1235 CPU_FOREACH(cpu) {
1236 qemu_wait_io_event_common(cpu);
1237 }
1238 }
1239
1240 static void qemu_wait_io_event(CPUState *cpu)
1241 {
1242 while (cpu_thread_is_idle(cpu)) {
1243 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1244 }
1245
1246 #ifdef _WIN32
1247 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1248 if (!tcg_enabled()) {
1249 SleepEx(0, TRUE);
1250 }
1251 #endif
1252 qemu_wait_io_event_common(cpu);
1253 }
1254
1255 static void *qemu_kvm_cpu_thread_fn(void *arg)
1256 {
1257 CPUState *cpu = arg;
1258 int r;
1259
1260 rcu_register_thread();
1261
1262 qemu_mutex_lock_iothread();
1263 qemu_thread_get_self(cpu->thread);
1264 cpu->thread_id = qemu_get_thread_id();
1265 cpu->can_do_io = 1;
1266 current_cpu = cpu;
1267
1268 r = kvm_init_vcpu(cpu);
1269 if (r < 0) {
1270 error_report("kvm_init_vcpu failed: %s", strerror(-r));
1271 exit(1);
1272 }
1273
1274 kvm_init_cpu_signals(cpu);
1275
1276 /* signal CPU creation */
1277 cpu->created = true;
1278 qemu_cond_signal(&qemu_cpu_cond);
1279
1280 do {
1281 if (cpu_can_run(cpu)) {
1282 r = kvm_cpu_exec(cpu);
1283 if (r == EXCP_DEBUG) {
1284 cpu_handle_guest_debug(cpu);
1285 }
1286 }
1287 qemu_wait_io_event(cpu);
1288 } while (!cpu->unplug || cpu_can_run(cpu));
1289
1290 qemu_kvm_destroy_vcpu(cpu);
1291 cpu->created = false;
1292 qemu_cond_signal(&qemu_cpu_cond);
1293 qemu_mutex_unlock_iothread();
1294 rcu_unregister_thread();
1295 return NULL;
1296 }
1297
1298 static void *qemu_dummy_cpu_thread_fn(void *arg)
1299 {
1300 #ifdef _WIN32
1301 error_report("qtest is not supported under Windows");
1302 exit(1);
1303 #else
1304 CPUState *cpu = arg;
1305 sigset_t waitset;
1306 int r;
1307
1308 rcu_register_thread();
1309
1310 qemu_mutex_lock_iothread();
1311 qemu_thread_get_self(cpu->thread);
1312 cpu->thread_id = qemu_get_thread_id();
1313 cpu->can_do_io = 1;
1314 current_cpu = cpu;
1315
1316 sigemptyset(&waitset);
1317 sigaddset(&waitset, SIG_IPI);
1318
1319 /* signal CPU creation */
1320 cpu->created = true;
1321 qemu_cond_signal(&qemu_cpu_cond);
1322
1323 do {
1324 qemu_mutex_unlock_iothread();
1325 do {
1326 int sig;
1327 r = sigwait(&waitset, &sig);
1328 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1329 if (r == -1) {
1330 perror("sigwait");
1331 exit(1);
1332 }
1333 qemu_mutex_lock_iothread();
1334 qemu_wait_io_event(cpu);
1335 } while (!cpu->unplug);
1336
1337 qemu_mutex_unlock_iothread();
1338 rcu_unregister_thread();
1339 return NULL;
1340 #endif
1341 }
1342
1343 static int64_t tcg_get_icount_limit(void)
1344 {
1345 int64_t deadline;
1346
1347 if (replay_mode != REPLAY_MODE_PLAY) {
1348 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1349
1350 /* Maintain prior (possibly buggy) behaviour where if no deadline
1351 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1352 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1353 * nanoseconds.
1354 */
1355 if ((deadline < 0) || (deadline > INT32_MAX)) {
1356 deadline = INT32_MAX;
1357 }
1358
1359 return qemu_icount_round(deadline);
1360 } else {
1361 return replay_get_instructions();
1362 }
1363 }
1364
1365 static void handle_icount_deadline(void)
1366 {
1367 assert(qemu_in_vcpu_thread());
1368 if (use_icount) {
1369 int64_t deadline =
1370 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1371
1372 if (deadline == 0) {
1373 /* Wake up other AioContexts. */
1374 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1375 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1376 }
1377 }
1378 }
1379
1380 static void prepare_icount_for_run(CPUState *cpu)
1381 {
1382 if (use_icount) {
1383 int insns_left;
1384
1385 /* These should always be cleared by process_icount_data after
1386 * each vCPU execution. However u16.high can be raised
1387 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1388 */
1389 g_assert(cpu->icount_decr.u16.low == 0);
1390 g_assert(cpu->icount_extra == 0);
1391
1392 cpu->icount_budget = tcg_get_icount_limit();
1393 insns_left = MIN(0xffff, cpu->icount_budget);
1394 cpu->icount_decr.u16.low = insns_left;
1395 cpu->icount_extra = cpu->icount_budget - insns_left;
1396
1397 replay_mutex_lock();
1398 }
1399 }
1400
1401 static void process_icount_data(CPUState *cpu)
1402 {
1403 if (use_icount) {
1404 /* Account for executed instructions */
1405 cpu_update_icount(cpu);
1406
1407 /* Reset the counters */
1408 cpu->icount_decr.u16.low = 0;
1409 cpu->icount_extra = 0;
1410 cpu->icount_budget = 0;
1411
1412 replay_account_executed_instructions();
1413
1414 replay_mutex_unlock();
1415 }
1416 }
1417
1418
1419 static int tcg_cpu_exec(CPUState *cpu)
1420 {
1421 int ret;
1422 #ifdef CONFIG_PROFILER
1423 int64_t ti;
1424 #endif
1425
1426 assert(tcg_enabled());
1427 #ifdef CONFIG_PROFILER
1428 ti = profile_getclock();
1429 #endif
1430 cpu_exec_start(cpu);
1431 ret = cpu_exec(cpu);
1432 cpu_exec_end(cpu);
1433 #ifdef CONFIG_PROFILER
1434 atomic_set(&tcg_ctx->prof.cpu_exec_time,
1435 tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1436 #endif
1437 return ret;
1438 }
1439
1440 /* Destroy any remaining vCPUs which have been unplugged and have
1441 * finished running
1442 */
1443 static void deal_with_unplugged_cpus(void)
1444 {
1445 CPUState *cpu;
1446
1447 CPU_FOREACH(cpu) {
1448 if (cpu->unplug && !cpu_can_run(cpu)) {
1449 qemu_tcg_destroy_vcpu(cpu);
1450 cpu->created = false;
1451 qemu_cond_signal(&qemu_cpu_cond);
1452 break;
1453 }
1454 }
1455 }
1456
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 *
 * @arg is the CPUState the thread was created for; after start-up the
 * local "cpu" variable is reused as the round-robin cursor over all CPUs.
 * The main loop never terminates, so the statements after it are not
 * reached.
 */

static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    /* Tell the creator waiting on qemu_cpu_cond that the CPU exists. */
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    /* Start the round-robin at the first CPU. */
    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /*
         * The iothread lock is dropped before taking the replay mutex and
         * re-taken afterwards, so the replay mutex is never acquired while
         * the iothread lock is held here.
         */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
        qemu_account_warp_timer();

        /* Run the timers here. This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* Wrap around once the end of the CPU list has been reached. */
        if (!cpu) {
            cpu = first_cpu;
        }

        /*
         * Run CPUs in turn until the list is exhausted, or the current CPU
         * has queued work or a pending exit request.
         */
        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            /* The virtual clock is disabled while SSTEP_NOTIMER is set. */
            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code runs without the iothread lock. */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* Step past an unplugged CPU before leaving the loop. */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->queued_work_first && !cpu->exit_request) */

        /* Does not need atomic_mb_set because a spurious wakeup is okay. */
        atomic_set(&tcg_current_rr_cpu, NULL);

        /* Acknowledge the exit request that ended the inner loop, if any. */
        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    rcu_unregister_thread();
    return NULL;
}
1578
1579 static void *qemu_hax_cpu_thread_fn(void *arg)
1580 {
1581 CPUState *cpu = arg;
1582 int r;
1583
1584 rcu_register_thread();
1585 qemu_mutex_lock_iothread();
1586 qemu_thread_get_self(cpu->thread);
1587
1588 cpu->thread_id = qemu_get_thread_id();
1589 cpu->created = true;
1590 cpu->halted = 0;
1591 current_cpu = cpu;
1592
1593 hax_init_vcpu(cpu);
1594 qemu_cond_signal(&qemu_cpu_cond);
1595
1596 do {
1597 if (cpu_can_run(cpu)) {
1598 r = hax_smp_cpu_exec(cpu);
1599 if (r == EXCP_DEBUG) {
1600 cpu_handle_guest_debug(cpu);
1601 }
1602 }
1603
1604 qemu_wait_io_event(cpu);
1605 } while (!cpu->unplug || cpu_can_run(cpu));
1606 rcu_unregister_thread();
1607 return NULL;
1608 }
1609
1610 /* The HVF-specific vCPU thread function. This one should only run when the host
1611 * CPU supports the VMX "unrestricted guest" feature. */
1612 static void *qemu_hvf_cpu_thread_fn(void *arg)
1613 {
1614 CPUState *cpu = arg;
1615
1616 int r;
1617
1618 assert(hvf_enabled());
1619
1620 rcu_register_thread();
1621
1622 qemu_mutex_lock_iothread();
1623 qemu_thread_get_self(cpu->thread);
1624
1625 cpu->thread_id = qemu_get_thread_id();
1626 cpu->can_do_io = 1;
1627 current_cpu = cpu;
1628
1629 hvf_init_vcpu(cpu);
1630
1631 /* signal CPU creation */
1632 cpu->created = true;
1633 qemu_cond_signal(&qemu_cpu_cond);
1634
1635 do {
1636 if (cpu_can_run(cpu)) {
1637 r = hvf_vcpu_exec(cpu);
1638 if (r == EXCP_DEBUG) {
1639 cpu_handle_guest_debug(cpu);
1640 }
1641 }
1642 qemu_wait_io_event(cpu);
1643 } while (!cpu->unplug || cpu_can_run(cpu));
1644
1645 hvf_vcpu_destroy(cpu);
1646 cpu->created = false;
1647 qemu_cond_signal(&qemu_cpu_cond);
1648 qemu_mutex_unlock_iothread();
1649 rcu_unregister_thread();
1650 return NULL;
1651 }
1652
1653 static void *qemu_whpx_cpu_thread_fn(void *arg)
1654 {
1655 CPUState *cpu = arg;
1656 int r;
1657
1658 rcu_register_thread();
1659
1660 qemu_mutex_lock_iothread();
1661 qemu_thread_get_self(cpu->thread);
1662 cpu->thread_id = qemu_get_thread_id();
1663 current_cpu = cpu;
1664
1665 r = whpx_init_vcpu(cpu);
1666 if (r < 0) {
1667 fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1668 exit(1);
1669 }
1670
1671 /* signal CPU creation */
1672 cpu->created = true;
1673 qemu_cond_signal(&qemu_cpu_cond);
1674
1675 do {
1676 if (cpu_can_run(cpu)) {
1677 r = whpx_vcpu_exec(cpu);
1678 if (r == EXCP_DEBUG) {
1679 cpu_handle_guest_debug(cpu);
1680 }
1681 }
1682 while (cpu_thread_is_idle(cpu)) {
1683 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1684 }
1685 qemu_wait_io_event_common(cpu);
1686 } while (!cpu->unplug || cpu_can_run(cpu));
1687
1688 whpx_destroy_vcpu(cpu);
1689 cpu->created = false;
1690 qemu_cond_signal(&qemu_cpu_cond);
1691 qemu_mutex_unlock_iothread();
1692 rcu_unregister_thread();
1693 return NULL;
1694 }
1695
1696 #ifdef _WIN32
1697 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1698 {
1699 }
1700 #endif
1701
1702 /* Multi-threaded TCG
1703 *
1704 * In the multi-threaded case each vCPU has its own thread. The TLS
1705 * variable current_cpu can be used deep in the code to find the
1706 * current CPUState for a given thread.
1707 */
1708
1709 static void *qemu_tcg_cpu_thread_fn(void *arg)
1710 {
1711 CPUState *cpu = arg;
1712
1713 assert(tcg_enabled());
1714 g_assert(!use_icount);
1715
1716 rcu_register_thread();
1717 tcg_register_thread();
1718
1719 qemu_mutex_lock_iothread();
1720 qemu_thread_get_self(cpu->thread);
1721
1722 cpu->thread_id = qemu_get_thread_id();
1723 cpu->created = true;
1724 cpu->can_do_io = 1;
1725 current_cpu = cpu;
1726 qemu_cond_signal(&qemu_cpu_cond);
1727
1728 /* process any pending work */
1729 cpu->exit_request = 1;
1730
1731 do {
1732 if (cpu_can_run(cpu)) {
1733 int r;
1734 qemu_mutex_unlock_iothread();
1735 r = tcg_cpu_exec(cpu);
1736 qemu_mutex_lock_iothread();
1737 switch (r) {
1738 case EXCP_DEBUG:
1739 cpu_handle_guest_debug(cpu);
1740 break;
1741 case EXCP_HALTED:
1742 /* during start-up the vCPU is reset and the thread is
1743 * kicked several times. If we don't ensure we go back
1744 * to sleep in the halted state we won't cleanly
1745 * start-up when the vCPU is enabled.
1746 *
1747 * cpu->halted should ensure we sleep in wait_io_event
1748 */
1749 g_assert(cpu->halted);
1750 break;
1751 case EXCP_ATOMIC:
1752 qemu_mutex_unlock_iothread();
1753 cpu_exec_step_atomic(cpu);
1754 qemu_mutex_lock_iothread();
1755 default:
1756 /* Ignore everything else? */
1757 break;
1758 }
1759 }
1760
1761 atomic_mb_set(&cpu->exit_request, 0);
1762 qemu_wait_io_event(cpu);
1763 } while (!cpu->unplug || cpu_can_run(cpu));
1764
1765 qemu_tcg_destroy_vcpu(cpu);
1766 cpu->created = false;
1767 qemu_cond_signal(&qemu_cpu_cond);
1768 qemu_mutex_unlock_iothread();
1769 rcu_unregister_thread();
1770 return NULL;
1771 }
1772
1773 static void qemu_cpu_kick_thread(CPUState *cpu)
1774 {
1775 #ifndef _WIN32
1776 int err;
1777
1778 if (cpu->thread_kicked) {
1779 return;
1780 }
1781 cpu->thread_kicked = true;
1782 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1783 if (err && err != ESRCH) {
1784 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1785 exit(1);
1786 }
1787 #else /* _WIN32 */
1788 if (!qemu_cpu_is_self(cpu)) {
1789 if (whpx_enabled()) {
1790 whpx_vcpu_kick(cpu);
1791 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1792 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1793 __func__, GetLastError());
1794 exit(1);
1795 }
1796 }
1797 #endif
1798 }
1799
1800 void qemu_cpu_kick(CPUState *cpu)
1801 {
1802 qemu_cond_broadcast(cpu->halt_cond);
1803 if (tcg_enabled()) {
1804 cpu_exit(cpu);
1805 /* NOP unless doing single-thread RR */
1806 qemu_cpu_kick_rr_cpu();
1807 } else {
1808 if (hax_enabled()) {
1809 /*
1810 * FIXME: race condition with the exit_request check in
1811 * hax_vcpu_hax_exec
1812 */
1813 cpu->exit_request = 1;
1814 }
1815 qemu_cpu_kick_thread(cpu);
1816 }
1817 }
1818
1819 void qemu_cpu_kick_self(void)
1820 {
1821 assert(current_cpu);
1822 qemu_cpu_kick_thread(current_cpu);
1823 }
1824
1825 bool qemu_cpu_is_self(CPUState *cpu)
1826 {
1827 return qemu_thread_is_self(cpu->thread);
1828 }
1829
1830 bool qemu_in_vcpu_thread(void)
1831 {
1832 return current_cpu && qemu_cpu_is_self(current_cpu);
1833 }
1834
/*
 * Per-thread flag: true while the calling thread holds the iothread lock.
 * It is set by qemu_mutex_lock_iothread_impl() and cleared by
 * qemu_mutex_unlock_iothread().  Static storage starts out false.
 */
static __thread bool iothread_locked;

/* Return true if the calling thread currently holds the iothread lock. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1841
1842 /*
1843 * The BQL is taken from so many places that it is worth profiling the
1844 * callers directly, instead of funneling them all through a single function.
1845 */
1846 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1847 {
1848 QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1849
1850 g_assert(!qemu_mutex_iothread_locked());
1851 bql_lock(&qemu_global_mutex, file, line);
1852 iothread_locked = true;
1853 }
1854
1855 void qemu_mutex_unlock_iothread(void)
1856 {
1857 g_assert(qemu_mutex_iothread_locked());
1858 iothread_locked = false;
1859 qemu_mutex_unlock(&qemu_global_mutex);
1860 }
1861
1862 static bool all_vcpus_paused(void)
1863 {
1864 CPUState *cpu;
1865
1866 CPU_FOREACH(cpu) {
1867 if (!cpu->stopped) {
1868 return false;
1869 }
1870 }
1871
1872 return true;
1873 }
1874
1875 void pause_all_vcpus(void)
1876 {
1877 CPUState *cpu;
1878
1879 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1880 CPU_FOREACH(cpu) {
1881 if (qemu_cpu_is_self(cpu)) {
1882 qemu_cpu_stop(cpu, true);
1883 } else {
1884 cpu->stop = true;
1885 qemu_cpu_kick(cpu);
1886 }
1887 }
1888
1889 /* We need to drop the replay_lock so any vCPU threads woken up
1890 * can finish their replay tasks
1891 */
1892 replay_mutex_unlock();
1893
1894 while (!all_vcpus_paused()) {
1895 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1896 CPU_FOREACH(cpu) {
1897 qemu_cpu_kick(cpu);
1898 }
1899 }
1900
1901 qemu_mutex_unlock_iothread();
1902 replay_mutex_lock();
1903 qemu_mutex_lock_iothread();
1904 }
1905
1906 void cpu_resume(CPUState *cpu)
1907 {
1908 cpu->stop = false;
1909 cpu->stopped = false;
1910 qemu_cpu_kick(cpu);
1911 }
1912
1913 void resume_all_vcpus(void)
1914 {
1915 CPUState *cpu;
1916
1917 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1918 CPU_FOREACH(cpu) {
1919 cpu_resume(cpu);
1920 }
1921 }
1922
1923 void cpu_remove_sync(CPUState *cpu)
1924 {
1925 cpu->stop = true;
1926 cpu->unplug = true;
1927 qemu_cpu_kick(cpu);
1928 qemu_mutex_unlock_iothread();
1929 qemu_thread_join(cpu->thread);
1930 qemu_mutex_lock_iothread();
1931 }
1932
/*
 * Size in bytes (including the terminating NUL) of the temporary buffers
 * used for forming a vCPU thread name; the names are written with
 * snprintf(), so longer names are truncated to fit.
 */
#define VCPU_THREAD_NAME_SIZE 16
1935
1936 static void qemu_tcg_init_vcpu(CPUState *cpu)
1937 {
1938 char thread_name[VCPU_THREAD_NAME_SIZE];
1939 static QemuCond *single_tcg_halt_cond;
1940 static QemuThread *single_tcg_cpu_thread;
1941 static int tcg_region_inited;
1942
1943 assert(tcg_enabled());
1944 /*
1945 * Initialize TCG regions--once. Now is a good time, because:
1946 * (1) TCG's init context, prologue and target globals have been set up.
1947 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1948 * -accel flag is processed, so the check doesn't work then).
1949 */
1950 if (!tcg_region_inited) {
1951 tcg_region_inited = 1;
1952 tcg_region_init();
1953 }
1954
1955 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1956 cpu->thread = g_malloc0(sizeof(QemuThread));
1957 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1958 qemu_cond_init(cpu->halt_cond);
1959
1960 if (qemu_tcg_mttcg_enabled()) {
1961 /* create a thread per vCPU with TCG (MTTCG) */
1962 parallel_cpus = true;
1963 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1964 cpu->cpu_index);
1965
1966 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1967 cpu, QEMU_THREAD_JOINABLE);
1968
1969 } else {
1970 /* share a single thread for all cpus with TCG */
1971 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1972 qemu_thread_create(cpu->thread, thread_name,
1973 qemu_tcg_rr_cpu_thread_fn,
1974 cpu, QEMU_THREAD_JOINABLE);
1975
1976 single_tcg_halt_cond = cpu->halt_cond;
1977 single_tcg_cpu_thread = cpu->thread;
1978 }
1979 #ifdef _WIN32
1980 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1981 #endif
1982 } else {
1983 /* For non-MTTCG cases we share the thread */
1984 cpu->thread = single_tcg_cpu_thread;
1985 cpu->halt_cond = single_tcg_halt_cond;
1986 cpu->thread_id = first_cpu->thread_id;
1987 cpu->can_do_io = 1;
1988 cpu->created = true;
1989 }
1990 }
1991
1992 static void qemu_hax_start_vcpu(CPUState *cpu)
1993 {
1994 char thread_name[VCPU_THREAD_NAME_SIZE];
1995
1996 cpu->thread = g_malloc0(sizeof(QemuThread));
1997 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1998 qemu_cond_init(cpu->halt_cond);
1999
2000 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2001 cpu->cpu_index);
2002 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2003 cpu, QEMU_THREAD_JOINABLE);
2004 #ifdef _WIN32
2005 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2006 #endif
2007 }
2008
2009 static void qemu_kvm_start_vcpu(CPUState *cpu)
2010 {
2011 char thread_name[VCPU_THREAD_NAME_SIZE];
2012
2013 cpu->thread = g_malloc0(sizeof(QemuThread));
2014 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2015 qemu_cond_init(cpu->halt_cond);
2016 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2017 cpu->cpu_index);
2018 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2019 cpu, QEMU_THREAD_JOINABLE);
2020 }
2021
2022 static void qemu_hvf_start_vcpu(CPUState *cpu)
2023 {
2024 char thread_name[VCPU_THREAD_NAME_SIZE];
2025
2026 /* HVF currently does not support TCG, and only runs in
2027 * unrestricted-guest mode. */
2028 assert(hvf_enabled());
2029
2030 cpu->thread = g_malloc0(sizeof(QemuThread));
2031 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2032 qemu_cond_init(cpu->halt_cond);
2033
2034 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2035 cpu->cpu_index);
2036 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2037 cpu, QEMU_THREAD_JOINABLE);
2038 }
2039
2040 static void qemu_whpx_start_vcpu(CPUState *cpu)
2041 {
2042 char thread_name[VCPU_THREAD_NAME_SIZE];
2043
2044 cpu->thread = g_malloc0(sizeof(QemuThread));
2045 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2046 qemu_cond_init(cpu->halt_cond);
2047 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2048 cpu->cpu_index);
2049 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2050 cpu, QEMU_THREAD_JOINABLE);
2051 #ifdef _WIN32
2052 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2053 #endif
2054 }
2055
2056 static void qemu_dummy_start_vcpu(CPUState *cpu)
2057 {
2058 char thread_name[VCPU_THREAD_NAME_SIZE];
2059
2060 cpu->thread = g_malloc0(sizeof(QemuThread));
2061 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2062 qemu_cond_init(cpu->halt_cond);
2063 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2064 cpu->cpu_index);
2065 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2066 QEMU_THREAD_JOINABLE);
2067 }
2068
2069 void qemu_init_vcpu(CPUState *cpu)
2070 {
2071 cpu->nr_cores = smp_cores;
2072 cpu->nr_threads = smp_threads;
2073 cpu->stopped = true;
2074
2075 if (!cpu->as) {
2076 /* If the target cpu hasn't set up any address spaces itself,
2077 * give it the default one.
2078 */
2079 cpu->num_ases = 1;
2080 cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2081 }
2082
2083 if (kvm_enabled()) {
2084 qemu_kvm_start_vcpu(cpu);
2085 } else if (hax_enabled()) {
2086 qemu_hax_start_vcpu(cpu);
2087 } else if (hvf_enabled()) {
2088 qemu_hvf_start_vcpu(cpu);
2089 } else if (tcg_enabled()) {
2090 qemu_tcg_init_vcpu(cpu);
2091 } else if (whpx_enabled()) {
2092 qemu_whpx_start_vcpu(cpu);
2093 } else {
2094 qemu_dummy_start_vcpu(cpu);
2095 }
2096
2097 while (!cpu->created) {
2098 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2099 }
2100 }
2101
2102 void cpu_stop_current(void)
2103 {
2104 if (current_cpu) {
2105 current_cpu->stop = true;
2106 cpu_exit(current_cpu);
2107 }
2108 }
2109
2110 int vm_stop(RunState state)
2111 {
2112 if (qemu_in_vcpu_thread()) {
2113 qemu_system_vmstop_request_prepare();
2114 qemu_system_vmstop_request(state);
2115 /*
2116 * FIXME: should not return to device code in case
2117 * vm_stop() has been requested.
2118 */
2119 cpu_stop_current();
2120 return 0;
2121 }
2122
2123 return do_vm_stop(state, true);
2124 }
2125
2126 /**
2127 * Prepare for (re)starting the VM.
2128 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2129 * running or in case of an error condition), 0 otherwise.
2130 */
2131 int vm_prepare_start(void)
2132 {
2133 RunState requested;
2134
2135 qemu_vmstop_requested(&requested);
2136 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2137 return -1;
2138 }
2139
2140 /* Ensure that a STOP/RESUME pair of events is emitted if a
2141 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2142 * example, according to documentation is always followed by
2143 * the STOP event.
2144 */
2145 if (runstate_is_running()) {
2146 qapi_event_send_stop();
2147 qapi_event_send_resume();
2148 return -1;
2149 }
2150
2151 /* We are sending this now, but the CPUs will be resumed shortly later */
2152 qapi_event_send_resume();
2153
2154 replay_enable_events();
2155 cpu_enable_ticks();
2156 runstate_set(RUN_STATE_RUNNING);
2157 vm_state_notify(1, RUN_STATE_RUNNING);
2158 return 0;
2159 }
2160
/* Start the VM: resume all vCPUs if vm_prepare_start() succeeds (returns 0). */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
2167
2168 /* does a state transition even if the VM is already stopped,
2169 current state is forgotten forever */
2170 int vm_stop_force_state(RunState state)
2171 {
2172 if (runstate_is_running()) {
2173 return vm_stop(state);
2174 } else {
2175 runstate_set(state);
2176
2177 bdrv_drain_all();
2178 /* Make sure to return an error if the flush in a previous vm_stop()
2179 * failed. */
2180 return bdrv_flush_all();
2181 }
2182 }
2183
/*
 * Print the list of CPU models by calling the target's cpu_list() when the
 * target defines that macro; otherwise do nothing.  @optarg is unused.
 */
void list_cpus(const char *optarg)
{
#if defined(cpu_list)
    cpu_list();
#else
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#endif
}
2191
/*
 * QMP handler building one CpuInfo entry per CPU, in CPU list order.
 *
 * Each CPU's state is synchronised with cpu_synchronize_state() before its
 * fields are read.  Besides the generic fields, the entry carries
 * per-target data (e.g. the program counter) chosen at compile time by the
 * TARGET_* ladders below; targets not listed report CPU_INFO_ARCH_OTHER.
 *
 * @errp is not used by this function.
 * Returns the head of the newly allocated list (NULL if there are no CPUs).
 */
CpuInfoList *qmp_query_cpus(Error **errp)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    MachineClass *mc = MACHINE_GET_CLASS(ms);
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
        /* Obtain the target-specific register state, where one is known. */
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_RISCV)
        RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
        CPURISCVState *env = &riscv_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
#elif defined(TARGET_TRICORE)
        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
        CPUTriCoreState *env = &tricore_cpu->env;
#elif defined(TARGET_S390X)
        S390CPU *s390_cpu = S390_CPU(cpu);
        CPUS390XState *env = &s390_cpu->env;
#endif

        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        /* "current" is reported for the first CPU only. */
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
        info->value->thread_id = cpu->thread_id;
        /* Fill in the architecture tag and its per-target fields. */
#if defined(TARGET_I386)
        info->value->arch = CPU_INFO_ARCH_X86;
        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->arch = CPU_INFO_ARCH_PPC;
        info->value->u.ppc.nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->arch = CPU_INFO_ARCH_SPARC;
        info->value->u.q_sparc.pc = env->pc;
        info->value->u.q_sparc.npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->arch = CPU_INFO_ARCH_MIPS;
        info->value->u.q_mips.PC = env->active_tc.PC;
#elif defined(TARGET_TRICORE)
        info->value->arch = CPU_INFO_ARCH_TRICORE;
        info->value->u.tricore.PC = env->PC;
#elif defined(TARGET_S390X)
        info->value->arch = CPU_INFO_ARCH_S390;
        info->value->u.s390.cpu_state = env->cpu_state;
#elif defined(TARGET_RISCV)
        info->value->arch = CPU_INFO_ARCH_RISCV;
        info->value->u.riscv.pc = env->pc;
#else
        info->value->arch = CPU_INFO_ARCH_OTHER;
#endif
        /* Instance properties exist only if the machine class has the hook. */
        info->value->has_props = !!mc->cpu_index_to_instance_props;
        if (info->value->has_props) {
            CpuInstanceProperties *props;
            props = g_malloc0(sizeof(*props));
            *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
            info->value->props = props;
        }

        /* XXX: waiting for the qapi to support GSList */
        /* Append to the tail so the list keeps CPU iteration order. */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}
2277
2278 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2279 {
2280 /*
2281 * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2282 * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2283 */
2284 switch (target) {
2285 case SYS_EMU_TARGET_I386:
2286 case SYS_EMU_TARGET_X86_64:
2287 return CPU_INFO_ARCH_X86;
2288
2289 case SYS_EMU_TARGET_PPC:
2290 case SYS_EMU_TARGET_PPC64:
2291 return CPU_INFO_ARCH_PPC;
2292
2293 case SYS_EMU_TARGET_SPARC:
2294 case SYS_EMU_TARGET_SPARC64:
2295 return CPU_INFO_ARCH_SPARC;
2296
2297 case SYS_EMU_TARGET_MIPS:
2298 case SYS_EMU_TARGET_MIPSEL:
2299 case SYS_EMU_TARGET_MIPS64:
2300 case SYS_EMU_TARGET_MIPS64EL:
2301 return CPU_INFO_ARCH_MIPS;
2302
2303 case SYS_EMU_TARGET_TRICORE:
2304 return CPU_INFO_ARCH_TRICORE;
2305
2306 case SYS_EMU_TARGET_S390X:
2307 return CPU_INFO_ARCH_S390;
2308
2309 case SYS_EMU_TARGET_RISCV32:
2310 case SYS_EMU_TARGET_RISCV64:
2311 return CPU_INFO_ARCH_RISCV;
2312
2313 default:
2314 return CPU_INFO_ARCH_OTHER;
2315 }
2316 }
2317
2318 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2319 {
2320 #ifdef TARGET_S390X
2321 S390CPU *s390_cpu = S390_CPU(cpu);
2322 CPUS390XState *env = &s390_cpu->env;
2323
2324 info->cpu_state = env->cpu_state;
2325 #else
2326 abort();
2327 #endif
2328 }
2329
2330 /*
2331 * fast means: we NEVER interrupt vCPU threads to retrieve
2332 * information from KVM.
2333 */
2334 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2335 {
2336 MachineState *ms = MACHINE(qdev_get_machine());
2337 MachineClass *mc = MACHINE_GET_CLASS(ms);
2338 CpuInfoFastList *head = NULL, *cur_item = NULL;
2339 SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2340 -1, &error_abort);
2341 CPUState *cpu;
2342
2343 CPU_FOREACH(cpu) {
2344 CpuInfoFastList *info = g_malloc0(sizeof(*info));
2345 info->value = g_malloc0(sizeof(*info->value));
2346
2347 info->value->cpu_index = cpu->cpu_index;
2348 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2349 info->value->thread_id = cpu->thread_id;
2350
2351 info->value->has_props = !!mc->cpu_index_to_instance_props;
2352 if (info->value->has_props) {
2353 CpuInstanceProperties *props;
2354 props = g_malloc0(sizeof(*props));
2355 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2356 info->value->props = props;
2357 }
2358
2359 info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2360 info->value->target = target;
2361 if (target == SYS_EMU_TARGET_S390X) {
2362 cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2363 }
2364
2365 if (!cur_item) {
2366 head = cur_item = info;
2367 } else {
2368 cur_item->next = info;
2369 cur_item = info;
2370 }
2371 }
2372
2373 return head;
2374 }
2375
2376 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2377 bool has_cpu, int64_t cpu_index, Error **errp)
2378 {
2379 FILE *f;
2380 uint32_t l;
2381 CPUState *cpu;
2382 uint8_t buf[1024];
2383 int64_t orig_addr = addr, orig_size = size;
2384
2385 if (!has_cpu) {
2386 cpu_index = 0;
2387 }
2388
2389 cpu = qemu_get_cpu(cpu_index);
2390 if (cpu == NULL) {
2391 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2392 "a CPU number");
2393 return;
2394 }
2395
2396 f = fopen(filename, "wb");
2397 if (!f) {
2398 error_setg_file_open(errp, errno, filename);
2399 return;
2400 }
2401
2402 while (size != 0) {
2403 l = sizeof(buf);
2404 if (l > size)
2405 l = size;
2406 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2407 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2408 " specified", orig_addr, orig_size);
2409 goto exit;
2410 }
2411 if (fwrite(buf, 1, l, f) != l) {
2412 error_setg(errp, QERR_IO_ERROR);
2413 goto exit;
2414 }
2415 addr += l;
2416 size -= l;
2417 }
2418
2419 exit:
2420 fclose(f);
2421 }
2422
2423 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2424 Error **errp)
2425 {
2426 FILE *f;
2427 uint32_t l;
2428 uint8_t buf[1024];
2429
2430 f = fopen(filename, "wb");
2431 if (!f) {
2432 error_setg_file_open(errp, errno, filename);
2433 return;
2434 }
2435
2436 while (size != 0) {
2437 l = sizeof(buf);
2438 if (l > size)
2439 l = size;
2440 cpu_physical_memory_read(addr, buf, l);
2441 if (fwrite(buf, 1, l, f) != l) {
2442 error_setg(errp, QERR_IO_ERROR);
2443 goto exit;
2444 }
2445 addr += l;
2446 size -= l;
2447 }
2448
2449 exit:
2450 fclose(f);
2451 }
2452
2453 void qmp_inject_nmi(Error **errp)
2454 {
2455 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2456 }
2457
2458 void dump_drift_info(void)
2459 {
2460 if (!use_icount) {
2461 return;
2462 }
2463
2464 qemu_printf("Host - Guest clock %"PRIi64" ms\n",
2465 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2466 if (icount_align_option) {
2467 qemu_printf("Max guest delay %"PRIi64" ms\n",
2468 -max_delay / SCALE_MS);
2469 qemu_printf("Max guest advance %"PRIi64" ms\n",
2470 max_advance / SCALE_MS);
2471 } else {
2472 qemu_printf("Max guest delay NA\n");
2473 qemu_printf("Max guest advance NA\n");
2474 }
2475 }