cpus.c (mirror_qemu.git), at commit "cpu: Move icount_decr to CPUNegativeOffsetState"
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "qemu/osdep.h"
26 #include "qemu/config-file.h"
27 #include "cpu.h"
28 #include "monitor/monitor.h"
29 #include "qapi/error.h"
30 #include "qapi/qapi-commands-misc.h"
31 #include "qapi/qapi-events-run-state.h"
32 #include "qapi/qmp/qerror.h"
33 #include "qemu/error-report.h"
34 #include "qemu/qemu-print.h"
35 #include "sysemu/sysemu.h"
36 #include "sysemu/block-backend.h"
37 #include "exec/gdbstub.h"
38 #include "sysemu/dma.h"
39 #include "sysemu/hw_accel.h"
40 #include "sysemu/kvm.h"
41 #include "sysemu/hax.h"
42 #include "sysemu/hvf.h"
43 #include "sysemu/whpx.h"
44 #include "exec/exec-all.h"
45
46 #include "qemu/thread.h"
47 #include "sysemu/cpus.h"
48 #include "sysemu/qtest.h"
49 #include "qemu/main-loop.h"
50 #include "qemu/option.h"
51 #include "qemu/bitmap.h"
52 #include "qemu/seqlock.h"
53 #include "qemu/guest-random.h"
54 #include "tcg.h"
55 #include "hw/nmi.h"
56 #include "sysemu/replay.h"
57 #include "hw/boards.h"
58
59 #ifdef CONFIG_LINUX
60
61 #include <sys/prctl.h>
62
63 #ifndef PR_MCE_KILL
64 #define PR_MCE_KILL 33
65 #endif
66
67 #ifndef PR_MCE_KILL_SET
68 #define PR_MCE_KILL_SET 1
69 #endif
70
71 #ifndef PR_MCE_KILL_EARLY
72 #define PR_MCE_KILL_EARLY 1
73 #endif
74
75 #endif /* CONFIG_LINUX */
76
77 int64_t max_delay;
78 int64_t max_advance;
79
80 /* vcpu throttling controls */
81 static QEMUTimer *throttle_timer;
82 static unsigned int throttle_percentage;
83
84 #define CPU_THROTTLE_PCT_MIN 1
85 #define CPU_THROTTLE_PCT_MAX 99
86 #define CPU_THROTTLE_TIMESLICE_NS 10000000
87
88 bool cpu_is_stopped(CPUState *cpu)
89 {
90 return cpu->stopped || !runstate_is_running();
91 }
92
93 static bool cpu_thread_is_idle(CPUState *cpu)
94 {
95 if (cpu->stop || cpu->queued_work_first) {
96 return false;
97 }
98 if (cpu_is_stopped(cpu)) {
99 return true;
100 }
101 if (!cpu->halted || cpu_has_work(cpu) ||
102 kvm_halt_in_kernel()) {
103 return false;
104 }
105 return true;
106 }
107
108 static bool all_cpu_threads_idle(void)
109 {
110 CPUState *cpu;
111
112 CPU_FOREACH(cpu) {
113 if (!cpu_thread_is_idle(cpu)) {
114 return false;
115 }
116 }
117 return true;
118 }
119
120 /***********************************************************/
121 /* guest cycle counter */
122
123 /* Protected by TimersState seqlock */
124
125 static bool icount_sleep = true;
126 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
127 #define MAX_ICOUNT_SHIFT 10
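/*
 * cpu_icount_to_ns() below converts instructions to time by shifting left by
 * icount_time_shift, so a shift of 10 models 2^10 = 1024 ns per instruction,
 * i.e. roughly 1 MIPS; larger shifts would model an even slower guest.
 */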
128
129 typedef struct TimersState {
130 /* Protected by BQL. */
131 int64_t cpu_ticks_prev;
132 int64_t cpu_ticks_offset;
133
134 /* Protect fields that can be respectively read outside the
135 * BQL, and written from multiple threads.
136 */
137 QemuSeqLock vm_clock_seqlock;
138 QemuSpin vm_clock_lock;
139
140 int16_t cpu_ticks_enabled;
141
142 /* Conversion factor from emulated instructions to virtual clock ticks. */
143 int16_t icount_time_shift;
144
145 /* Compensate for varying guest execution speed. */
146 int64_t qemu_icount_bias;
147
148 int64_t vm_clock_warp_start;
149 int64_t cpu_clock_offset;
150
151 /* Only written by TCG thread */
152 int64_t qemu_icount;
153
154 /* for adjusting icount */
155 QEMUTimer *icount_rt_timer;
156 QEMUTimer *icount_vm_timer;
157 QEMUTimer *icount_warp_timer;
158 } TimersState;
159
160 static TimersState timers_state;
161 bool mttcg_enabled;
162
163 /*
164 * We default to false if we know other options have been enabled
165 * which are currently incompatible with MTTCG. Otherwise when each
166 * guest (target) has been updated to support:
167 * - atomic instructions
168 * - memory ordering primitives (barriers)
169 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
170 *
171 * Once a guest architecture has been converted to the new primitives
172 * there are two remaining limitations to check.
173 *
174 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
175 * - The host must have a stronger memory order than the guest
176 *
177 * It may be possible in future to support strong guests on weak hosts
178 * but that will require tagging all load/stores in a guest with their
179 * implicit memory order requirements which would likely slow things
180 * down a lot.
181 */
182
183 static bool check_tcg_memory_orders_compatible(void)
184 {
185 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
186 return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
187 #else
188 return false;
189 #endif
190 }
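/*
 * The check above succeeds when every ordering bit the guest relies on by
 * default (TCG_GUEST_DEFAULT_MO) is also guaranteed by the TCG backend on
 * this host (TCG_TARGET_DEFAULT_MO), i.e. the host memory model is at least
 * as strong as the guest's.
 */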
191
192 static bool default_mttcg_enabled(void)
193 {
194 if (use_icount || TCG_OVERSIZED_GUEST) {
195 return false;
196 } else {
197 #ifdef TARGET_SUPPORTS_MTTCG
198 return check_tcg_memory_orders_compatible();
199 #else
200 return false;
201 #endif
202 }
203 }
204
205 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
206 {
207 const char *t = qemu_opt_get(opts, "thread");
208 if (t) {
209 if (strcmp(t, "multi") == 0) {
210 if (TCG_OVERSIZED_GUEST) {
211 error_setg(errp, "No MTTCG when guest word size > hosts");
212 } else if (use_icount) {
213 error_setg(errp, "No MTTCG when icount is enabled");
214 } else {
215 #ifndef TARGET_SUPPORTS_MTTCG
216 warn_report("Guest not yet converted to MTTCG - "
217 "you may get unexpected results");
218 #endif
219 if (!check_tcg_memory_orders_compatible()) {
220 warn_report("Guest expects a stronger memory ordering "
221 "than the host provides");
222 error_printf("This may cause strange/hard to debug errors\n");
223 }
224 mttcg_enabled = true;
225 }
226 } else if (strcmp(t, "single") == 0) {
227 mttcg_enabled = false;
228 } else {
229 error_setg(errp, "Invalid 'thread' setting %s", t);
230 }
231 } else {
232 mttcg_enabled = default_mttcg_enabled();
233 }
234 }
235
236 /* The current number of executed instructions is based on what we
237 * originally budgeted minus the current state of the decrementing
238 * icount counters in extra/u16.low.
239 */
240 static int64_t cpu_get_icount_executed(CPUState *cpu)
241 {
242 return (cpu->icount_budget -
243 (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
244 }
245
246 /*
247 * Update the global shared timer_state.qemu_icount to take into
248 * account executed instructions. This is done by the TCG vCPU
249 * thread so the main-loop can see time has moved forward.
250 */
251 static void cpu_update_icount_locked(CPUState *cpu)
252 {
253 int64_t executed = cpu_get_icount_executed(cpu);
254 cpu->icount_budget -= executed;
255
256 atomic_set_i64(&timers_state.qemu_icount,
257 timers_state.qemu_icount + executed);
258 }
259
260 /*
261 * Update the global shared timer_state.qemu_icount to take into
262 * account executed instructions. This is done by the TCG vCPU
263 * thread so the main-loop can see time has moved forward.
264 */
265 void cpu_update_icount(CPUState *cpu)
266 {
267 seqlock_write_lock(&timers_state.vm_clock_seqlock,
268 &timers_state.vm_clock_lock);
269 cpu_update_icount_locked(cpu);
270 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
271 &timers_state.vm_clock_lock);
272 }
273
274 static int64_t cpu_get_icount_raw_locked(void)
275 {
276 CPUState *cpu = current_cpu;
277
278 if (cpu && cpu->running) {
279 if (!cpu->can_do_io) {
280 error_report("Bad icount read");
281 exit(1);
282 }
283 /* Take into account what has run */
284 cpu_update_icount_locked(cpu);
285 }
286 /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
287 return atomic_read_i64(&timers_state.qemu_icount);
288 }
289
290 static int64_t cpu_get_icount_locked(void)
291 {
292 int64_t icount = cpu_get_icount_raw_locked();
293 return atomic_read_i64(&timers_state.qemu_icount_bias) +
294 cpu_icount_to_ns(icount);
295 }
296
297 int64_t cpu_get_icount_raw(void)
298 {
299 int64_t icount;
300 unsigned start;
301
302 do {
303 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
304 icount = cpu_get_icount_raw_locked();
305 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
306
307 return icount;
308 }
309
310 /* Return the virtual CPU time, based on the instruction counter. */
311 int64_t cpu_get_icount(void)
312 {
313 int64_t icount;
314 unsigned start;
315
316 do {
317 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
318 icount = cpu_get_icount_locked();
319 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
320
321 return icount;
322 }
323
324 int64_t cpu_icount_to_ns(int64_t icount)
325 {
326 return icount << atomic_read(&timers_state.icount_time_shift);
327 }
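/*
 * Rough worked example: with the default shift of 3 set in configure_icount(),
 * each instruction accounts for 2^3 = 8 ns of QEMU_CLOCK_VIRTUAL time, i.e. a
 * nominal guest speed of 125 MIPS.
 */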
328
329 static int64_t cpu_get_ticks_locked(void)
330 {
331 int64_t ticks = timers_state.cpu_ticks_offset;
332 if (timers_state.cpu_ticks_enabled) {
333 ticks += cpu_get_host_ticks();
334 }
335
336 if (timers_state.cpu_ticks_prev > ticks) {
337 /* Non-increasing ticks may happen if the host uses software suspend. */
338 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
339 ticks = timers_state.cpu_ticks_prev;
340 }
341
342 timers_state.cpu_ticks_prev = ticks;
343 return ticks;
344 }
345
346 /* Return the time elapsed in the VM between vm_start and vm_stop. Unless
347 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
348 * counter.
349 */
350 int64_t cpu_get_ticks(void)
351 {
352 int64_t ticks;
353
354 if (use_icount) {
355 return cpu_get_icount();
356 }
357
358 qemu_spin_lock(&timers_state.vm_clock_lock);
359 ticks = cpu_get_ticks_locked();
360 qemu_spin_unlock(&timers_state.vm_clock_lock);
361 return ticks;
362 }
363
364 static int64_t cpu_get_clock_locked(void)
365 {
366 int64_t time;
367
368 time = timers_state.cpu_clock_offset;
369 if (timers_state.cpu_ticks_enabled) {
370 time += get_clock();
371 }
372
373 return time;
374 }
375
376 /* Return the monotonic time elapsed in VM, i.e.,
377 * the time between vm_start and vm_stop
378 */
379 int64_t cpu_get_clock(void)
380 {
381 int64_t ti;
382 unsigned start;
383
384 do {
385 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
386 ti = cpu_get_clock_locked();
387 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
388
389 return ti;
390 }
391
392 /* enable cpu_get_ticks()
393 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
394 */
395 void cpu_enable_ticks(void)
396 {
397 seqlock_write_lock(&timers_state.vm_clock_seqlock,
398 &timers_state.vm_clock_lock);
399 if (!timers_state.cpu_ticks_enabled) {
400 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
401 timers_state.cpu_clock_offset -= get_clock();
402 timers_state.cpu_ticks_enabled = 1;
403 }
404 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
405 &timers_state.vm_clock_lock);
406 }
407
408 /* disable cpu_get_ticks(): the clock is stopped. You must not call
409 * cpu_get_ticks() after that.
410 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
411 */
412 void cpu_disable_ticks(void)
413 {
414 seqlock_write_lock(&timers_state.vm_clock_seqlock,
415 &timers_state.vm_clock_lock);
416 if (timers_state.cpu_ticks_enabled) {
417 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
418 timers_state.cpu_clock_offset = cpu_get_clock_locked();
419 timers_state.cpu_ticks_enabled = 0;
420 }
421 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
422 &timers_state.vm_clock_lock);
423 }
424
425 /* Correlation between real and virtual time is always going to be
426 fairly approximate, so ignore small variation.
427 When the guest is idle real and virtual time will be aligned in
428 the IO wait loop. */
429 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
430
431 static void icount_adjust(void)
432 {
433 int64_t cur_time;
434 int64_t cur_icount;
435 int64_t delta;
436
437 /* Protected by TimersState mutex. */
438 static int64_t last_delta;
439
440 /* If the VM is not running, then do nothing. */
441 if (!runstate_is_running()) {
442 return;
443 }
444
445 seqlock_write_lock(&timers_state.vm_clock_seqlock,
446 &timers_state.vm_clock_lock);
447 cur_time = cpu_get_clock_locked();
448 cur_icount = cpu_get_icount_locked();
449
450 delta = cur_icount - cur_time;
451 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
452 if (delta > 0
453 && last_delta + ICOUNT_WOBBLE < delta * 2
454 && timers_state.icount_time_shift > 0) {
455 /* The guest is getting too far ahead. Slow time down. */
456 atomic_set(&timers_state.icount_time_shift,
457 timers_state.icount_time_shift - 1);
458 }
459 if (delta < 0
460 && last_delta - ICOUNT_WOBBLE > delta * 2
461 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
462 /* The guest is getting too far behind. Speed time up. */
463 atomic_set(&timers_state.icount_time_shift,
464 timers_state.icount_time_shift + 1);
465 }
466 last_delta = delta;
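    /*
     * Virtual time is qemu_icount_bias + (qemu_icount << icount_time_shift)
     * (see cpu_get_icount_locked()), so after a shift change the bias is
     * recomputed below to keep the current virtual time continuous.
     */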
467 atomic_set_i64(&timers_state.qemu_icount_bias,
468 cur_icount - (timers_state.qemu_icount
469 << timers_state.icount_time_shift));
470 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
471 &timers_state.vm_clock_lock);
472 }
473
474 static void icount_adjust_rt(void *opaque)
475 {
476 timer_mod(timers_state.icount_rt_timer,
477 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
478 icount_adjust();
479 }
480
481 static void icount_adjust_vm(void *opaque)
482 {
483 timer_mod(timers_state.icount_vm_timer,
484 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
485 NANOSECONDS_PER_SECOND / 10);
486 icount_adjust();
487 }
488
489 static int64_t qemu_icount_round(int64_t count)
490 {
491 int shift = atomic_read(&timers_state.icount_time_shift);
492 return (count + (1 << shift) - 1) >> shift;
493 }
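/*
 * qemu_icount_round() turns a nanosecond deadline into an instruction count,
 * rounding up so that the budget always covers the whole deadline; e.g. with
 * shift = 3 a 100 ns deadline becomes (100 + 7) >> 3 = 13 instructions.
 */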
494
495 static void icount_warp_rt(void)
496 {
497 unsigned seq;
498 int64_t warp_start;
499
500 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
501 * changes from -1 to another value, so the race here is okay.
502 */
503 do {
504 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
505 warp_start = timers_state.vm_clock_warp_start;
506 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
507
508 if (warp_start == -1) {
509 return;
510 }
511
512 seqlock_write_lock(&timers_state.vm_clock_seqlock,
513 &timers_state.vm_clock_lock);
514 if (runstate_is_running()) {
515 int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
516 cpu_get_clock_locked());
517 int64_t warp_delta;
518
519 warp_delta = clock - timers_state.vm_clock_warp_start;
520 if (use_icount == 2) {
521 /*
522 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
523 * far ahead of real time.
524 */
525 int64_t cur_icount = cpu_get_icount_locked();
526 int64_t delta = clock - cur_icount;
527 warp_delta = MIN(warp_delta, delta);
528 }
529 atomic_set_i64(&timers_state.qemu_icount_bias,
530 timers_state.qemu_icount_bias + warp_delta);
531 }
532 timers_state.vm_clock_warp_start = -1;
533 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
534 &timers_state.vm_clock_lock);
535
536 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
537 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
538 }
539 }
540
541 static void icount_timer_cb(void *opaque)
542 {
543 /* No need for a checkpoint because the timer already synchronizes
544 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
545 */
546 icount_warp_rt();
547 }
548
549 void qtest_clock_warp(int64_t dest)
550 {
551 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
552 AioContext *aio_context;
553 assert(qtest_enabled());
554 aio_context = qemu_get_aio_context();
555 while (clock < dest) {
556 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
557 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
558
559 seqlock_write_lock(&timers_state.vm_clock_seqlock,
560 &timers_state.vm_clock_lock);
561 atomic_set_i64(&timers_state.qemu_icount_bias,
562 timers_state.qemu_icount_bias + warp);
563 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
564 &timers_state.vm_clock_lock);
565
566 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
567 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
568 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
569 }
570 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
571 }
572
573 void qemu_start_warp_timer(void)
574 {
575 int64_t clock;
576 int64_t deadline;
577
578 if (!use_icount) {
579 return;
580 }
581
582 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
583 * do not fire, so computing the deadline does not make sense.
584 */
585 if (!runstate_is_running()) {
586 return;
587 }
588
589 if (replay_mode != REPLAY_MODE_PLAY) {
590 if (!all_cpu_threads_idle()) {
591 return;
592 }
593
594 if (qtest_enabled()) {
595 /* When testing, qtest commands advance icount. */
596 return;
597 }
598
599 replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
600 } else {
601 /* warp clock deterministically in record/replay mode */
602 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
603                 /* The vCPU is sleeping and the warp can't be started.
604                    It is probably a race condition: the notification sent
605                    to the vCPU was processed in advance and the vCPU went to sleep.
606                    Therefore we have to wake it up to do something. */
607 if (replay_has_checkpoint()) {
608 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
609 }
610 return;
611 }
612 }
613
614 /* We want to use the earliest deadline from ALL vm_clocks */
615 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
616 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
617 if (deadline < 0) {
618 static bool notified;
619 if (!icount_sleep && !notified) {
620 warn_report("icount sleep disabled and no active timers");
621 notified = true;
622 }
623 return;
624 }
625
626 if (deadline > 0) {
627 /*
628 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
629 * sleep. Otherwise, the CPU might be waiting for a future timer
630 * interrupt to wake it up, but the interrupt never comes because
631 * the vCPU isn't running any insns and thus doesn't advance the
632 * QEMU_CLOCK_VIRTUAL.
633 */
634 if (!icount_sleep) {
635 /*
636 * We never let VCPUs sleep in no sleep icount mode.
637 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
638 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
639 * It is useful when we want a deterministic execution time,
640 * isolated from host latencies.
641 */
642 seqlock_write_lock(&timers_state.vm_clock_seqlock,
643 &timers_state.vm_clock_lock);
644 atomic_set_i64(&timers_state.qemu_icount_bias,
645 timers_state.qemu_icount_bias + deadline);
646 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
647 &timers_state.vm_clock_lock);
648 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
649 } else {
650 /*
651              * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
652              * "real" time (related to the time left until the next event) has
653              * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
654              * This prevents the warps from being visible externally; for example,
655              * you will not be sending network packets continuously instead of
656              * every 100ms.
657 */
658 seqlock_write_lock(&timers_state.vm_clock_seqlock,
659 &timers_state.vm_clock_lock);
660 if (timers_state.vm_clock_warp_start == -1
661 || timers_state.vm_clock_warp_start > clock) {
662 timers_state.vm_clock_warp_start = clock;
663 }
664 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
665 &timers_state.vm_clock_lock);
666 timer_mod_anticipate(timers_state.icount_warp_timer,
667 clock + deadline);
668 }
669 } else if (deadline == 0) {
670 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
671 }
672 }
673
674 static void qemu_account_warp_timer(void)
675 {
676 if (!use_icount || !icount_sleep) {
677 return;
678 }
679
680 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
681 * do not fire, so computing the deadline does not make sense.
682 */
683 if (!runstate_is_running()) {
684 return;
685 }
686
687 /* warp clock deterministically in record/replay mode */
688 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
689 return;
690 }
691
692 timer_del(timers_state.icount_warp_timer);
693 icount_warp_rt();
694 }
695
696 static bool icount_state_needed(void *opaque)
697 {
698 return use_icount;
699 }
700
701 static bool warp_timer_state_needed(void *opaque)
702 {
703 TimersState *s = opaque;
704 return s->icount_warp_timer != NULL;
705 }
706
707 static bool adjust_timers_state_needed(void *opaque)
708 {
709 TimersState *s = opaque;
710 return s->icount_rt_timer != NULL;
711 }
712
713 /*
714  * Subsection for warp timer migration is optional, because it may not be created
715 */
716 static const VMStateDescription icount_vmstate_warp_timer = {
717 .name = "timer/icount/warp_timer",
718 .version_id = 1,
719 .minimum_version_id = 1,
720 .needed = warp_timer_state_needed,
721 .fields = (VMStateField[]) {
722 VMSTATE_INT64(vm_clock_warp_start, TimersState),
723 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
724 VMSTATE_END_OF_LIST()
725 }
726 };
727
728 static const VMStateDescription icount_vmstate_adjust_timers = {
729 .name = "timer/icount/timers",
730 .version_id = 1,
731 .minimum_version_id = 1,
732 .needed = adjust_timers_state_needed,
733 .fields = (VMStateField[]) {
734 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
735 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
736 VMSTATE_END_OF_LIST()
737 }
738 };
739
740 /*
741 * This is a subsection for icount migration.
742 */
743 static const VMStateDescription icount_vmstate_timers = {
744 .name = "timer/icount",
745 .version_id = 1,
746 .minimum_version_id = 1,
747 .needed = icount_state_needed,
748 .fields = (VMStateField[]) {
749 VMSTATE_INT64(qemu_icount_bias, TimersState),
750 VMSTATE_INT64(qemu_icount, TimersState),
751 VMSTATE_END_OF_LIST()
752 },
753 .subsections = (const VMStateDescription*[]) {
754 &icount_vmstate_warp_timer,
755 &icount_vmstate_adjust_timers,
756 NULL
757 }
758 };
759
760 static const VMStateDescription vmstate_timers = {
761 .name = "timer",
762 .version_id = 2,
763 .minimum_version_id = 1,
764 .fields = (VMStateField[]) {
765 VMSTATE_INT64(cpu_ticks_offset, TimersState),
766 VMSTATE_UNUSED(8),
767 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
768 VMSTATE_END_OF_LIST()
769 },
770 .subsections = (const VMStateDescription*[]) {
771 &icount_vmstate_timers,
772 NULL
773 }
774 };
775
776 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
777 {
778 double pct;
779 double throttle_ratio;
780 long sleeptime_ns;
781
782 if (!cpu_throttle_get_percentage()) {
783 return;
784 }
785
786 pct = (double)cpu_throttle_get_percentage()/100;
787 throttle_ratio = pct / (1 - pct);
788 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
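    /*
     * Rough worked example: at 50% throttle pct = 0.5 and throttle_ratio = 1,
     * so the vCPU sleeps one 10 ms timeslice for every 10 ms it runs; at 99%
     * it sleeps roughly 990 ms per timeslice.
     */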
789
790 qemu_mutex_unlock_iothread();
791 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
792 qemu_mutex_lock_iothread();
793 atomic_set(&cpu->throttle_thread_scheduled, 0);
794 }
795
796 static void cpu_throttle_timer_tick(void *opaque)
797 {
798 CPUState *cpu;
799 double pct;
800
801 /* Stop the timer if needed */
802 if (!cpu_throttle_get_percentage()) {
803 return;
804 }
805 CPU_FOREACH(cpu) {
806 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
807 async_run_on_cpu(cpu, cpu_throttle_thread,
808 RUN_ON_CPU_NULL);
809 }
810 }
811
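    /*
     * The period below is stretched to CPU_THROTTLE_TIMESLICE_NS / (1 - pct),
     * which leaves exactly one timeslice of run time between sleeps; e.g. at
     * 50% the timer fires every 20 ms and the vCPU sleeps for 10 ms of each
     * period.
     */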
812 pct = (double)cpu_throttle_get_percentage()/100;
813 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
814 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
815 }
816
817 void cpu_throttle_set(int new_throttle_pct)
818 {
819 /* Ensure throttle percentage is within valid range */
820 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
821 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
822
823 atomic_set(&throttle_percentage, new_throttle_pct);
824
825 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
826 CPU_THROTTLE_TIMESLICE_NS);
827 }
828
829 void cpu_throttle_stop(void)
830 {
831 atomic_set(&throttle_percentage, 0);
832 }
833
834 bool cpu_throttle_active(void)
835 {
836 return (cpu_throttle_get_percentage() != 0);
837 }
838
839 int cpu_throttle_get_percentage(void)
840 {
841 return atomic_read(&throttle_percentage);
842 }
843
844 void cpu_ticks_init(void)
845 {
846 seqlock_init(&timers_state.vm_clock_seqlock);
847 qemu_spin_init(&timers_state.vm_clock_lock);
848 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
849 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
850 cpu_throttle_timer_tick, NULL);
851 }
852
853 void configure_icount(QemuOpts *opts, Error **errp)
854 {
855 const char *option;
856 char *rem_str = NULL;
857
858 option = qemu_opt_get(opts, "shift");
859 if (!option) {
860 if (qemu_opt_get(opts, "align") != NULL) {
861 error_setg(errp, "Please specify shift option when using align");
862 }
863 return;
864 }
865
866 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
867 if (icount_sleep) {
868 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
869 icount_timer_cb, NULL);
870 }
871
872 icount_align_option = qemu_opt_get_bool(opts, "align", false);
873
874 if (icount_align_option && !icount_sleep) {
875 error_setg(errp, "align=on and sleep=off are incompatible");
876 }
877 if (strcmp(option, "auto") != 0) {
878 errno = 0;
879 timers_state.icount_time_shift = strtol(option, &rem_str, 0);
880 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
881 error_setg(errp, "icount: Invalid shift value");
882 }
883 use_icount = 1;
884 return;
885 } else if (icount_align_option) {
886 error_setg(errp, "shift=auto and align=on are incompatible");
887 } else if (!icount_sleep) {
888 error_setg(errp, "shift=auto and sleep=off are incompatible");
889 }
890
891 use_icount = 2;
892
893 /* 125MIPS seems a reasonable initial guess at the guest speed.
894 It will be corrected fairly quickly anyway. */
895 timers_state.icount_time_shift = 3;
896
897 /* Have both realtime and virtual time triggers for speed adjustment.
898 The realtime trigger catches emulated time passing too slowly,
899 the virtual time trigger catches emulated time passing too fast.
900 Realtime triggers occur even when idle, so use them less frequently
901 than VM triggers. */
902 timers_state.vm_clock_warp_start = -1;
903 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
904 icount_adjust_rt, NULL);
905 timer_mod(timers_state.icount_rt_timer,
906 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
907 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
908 icount_adjust_vm, NULL);
909 timer_mod(timers_state.icount_vm_timer,
910 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
911 NANOSECONDS_PER_SECOND / 10);
912 }
913
914 /***********************************************************/
915 /* TCG vCPU kick timer
916 *
917 * The kick timer is responsible for moving single threaded vCPU
918 * emulation on to the next vCPU. If more than one vCPU is running a
919  * timer event will force a cpu->exit so the next vCPU can get
920 * scheduled.
921 *
922  * The timer is removed while all vCPUs are idle and restarted once
923  * any of them stops being idle.
924 */
925
926 static QEMUTimer *tcg_kick_vcpu_timer;
927 static CPUState *tcg_current_rr_cpu;
928
929 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
930
931 static inline int64_t qemu_tcg_next_kick(void)
932 {
933 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
934 }
935
936 /* Kick the currently round-robin scheduled vCPU */
937 static void qemu_cpu_kick_rr_cpu(void)
938 {
939 CPUState *cpu;
940 do {
941 cpu = atomic_mb_read(&tcg_current_rr_cpu);
942 if (cpu) {
943 cpu_exit(cpu);
944 }
945 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
946 }
947
948 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
949 {
950 }
951
952 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
953 {
954 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
955 qemu_notify_event();
956 return;
957 }
958
959 if (qemu_in_vcpu_thread()) {
960 /* A CPU is currently running; kick it back out to the
961 * tcg_cpu_exec() loop so it will recalculate its
962 * icount deadline immediately.
963 */
964 qemu_cpu_kick(current_cpu);
965 } else if (first_cpu) {
966 /* qemu_cpu_kick is not enough to kick a halted CPU out of
967 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
968 * causes cpu_thread_is_idle to return false. This way,
969 * handle_icount_deadline can run.
970 * If we have no CPUs at all for some reason, we don't
971 * need to do anything.
972 */
973 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
974 }
975 }
976
977 static void kick_tcg_thread(void *opaque)
978 {
979 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
980 qemu_cpu_kick_rr_cpu();
981 }
982
983 static void start_tcg_kick_timer(void)
984 {
985 assert(!mttcg_enabled);
986 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
987 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
988 kick_tcg_thread, NULL);
989 }
990 if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
991 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
992 }
993 }
994
995 static void stop_tcg_kick_timer(void)
996 {
997 assert(!mttcg_enabled);
998 if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
999 timer_del(tcg_kick_vcpu_timer);
1000 }
1001 }
1002
1003 /***********************************************************/
1004 void hw_error(const char *fmt, ...)
1005 {
1006 va_list ap;
1007 CPUState *cpu;
1008
1009 va_start(ap, fmt);
1010 fprintf(stderr, "qemu: hardware error: ");
1011 vfprintf(stderr, fmt, ap);
1012 fprintf(stderr, "\n");
1013 CPU_FOREACH(cpu) {
1014 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1015 cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1016 }
1017 va_end(ap);
1018 abort();
1019 }
1020
1021 void cpu_synchronize_all_states(void)
1022 {
1023 CPUState *cpu;
1024
1025 CPU_FOREACH(cpu) {
1026 cpu_synchronize_state(cpu);
1027 /* TODO: move to cpu_synchronize_state() */
1028 if (hvf_enabled()) {
1029 hvf_cpu_synchronize_state(cpu);
1030 }
1031 }
1032 }
1033
1034 void cpu_synchronize_all_post_reset(void)
1035 {
1036 CPUState *cpu;
1037
1038 CPU_FOREACH(cpu) {
1039 cpu_synchronize_post_reset(cpu);
1040 /* TODO: move to cpu_synchronize_post_reset() */
1041 if (hvf_enabled()) {
1042 hvf_cpu_synchronize_post_reset(cpu);
1043 }
1044 }
1045 }
1046
1047 void cpu_synchronize_all_post_init(void)
1048 {
1049 CPUState *cpu;
1050
1051 CPU_FOREACH(cpu) {
1052 cpu_synchronize_post_init(cpu);
1053 /* TODO: move to cpu_synchronize_post_init() */
1054 if (hvf_enabled()) {
1055 hvf_cpu_synchronize_post_init(cpu);
1056 }
1057 }
1058 }
1059
1060 void cpu_synchronize_all_pre_loadvm(void)
1061 {
1062 CPUState *cpu;
1063
1064 CPU_FOREACH(cpu) {
1065 cpu_synchronize_pre_loadvm(cpu);
1066 }
1067 }
1068
1069 static int do_vm_stop(RunState state, bool send_stop)
1070 {
1071 int ret = 0;
1072
1073 if (runstate_is_running()) {
1074 cpu_disable_ticks();
1075 pause_all_vcpus();
1076 runstate_set(state);
1077 vm_state_notify(0, state);
1078 if (send_stop) {
1079 qapi_event_send_stop();
1080 }
1081 }
1082
1083 bdrv_drain_all();
1084 replay_disable_events();
1085 ret = bdrv_flush_all();
1086
1087 return ret;
1088 }
1089
1090 /* Special vm_stop() variant for terminating the process. Historically clients
1091 * did not expect a QMP STOP event and so we need to retain compatibility.
1092 */
1093 int vm_shutdown(void)
1094 {
1095 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1096 }
1097
1098 static bool cpu_can_run(CPUState *cpu)
1099 {
1100 if (cpu->stop) {
1101 return false;
1102 }
1103 if (cpu_is_stopped(cpu)) {
1104 return false;
1105 }
1106 return true;
1107 }
1108
1109 static void cpu_handle_guest_debug(CPUState *cpu)
1110 {
1111 gdb_set_stop_cpu(cpu);
1112 qemu_system_debug_request();
1113 cpu->stopped = true;
1114 }
1115
1116 #ifdef CONFIG_LINUX
1117 static void sigbus_reraise(void)
1118 {
1119 sigset_t set;
1120 struct sigaction action;
1121
1122 memset(&action, 0, sizeof(action));
1123 action.sa_handler = SIG_DFL;
1124 if (!sigaction(SIGBUS, &action, NULL)) {
1125 raise(SIGBUS);
1126 sigemptyset(&set);
1127 sigaddset(&set, SIGBUS);
1128 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1129 }
1130 perror("Failed to re-raise SIGBUS!\n");
1131 abort();
1132 }
1133
1134 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1135 {
1136 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1137 sigbus_reraise();
1138 }
1139
1140 if (current_cpu) {
1141 /* Called asynchronously in VCPU thread. */
1142 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1143 sigbus_reraise();
1144 }
1145 } else {
1146 /* Called synchronously (via signalfd) in main thread. */
1147 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1148 sigbus_reraise();
1149 }
1150 }
1151 }
1152
1153 static void qemu_init_sigbus(void)
1154 {
1155 struct sigaction action;
1156
1157 memset(&action, 0, sizeof(action));
1158 action.sa_flags = SA_SIGINFO;
1159 action.sa_sigaction = sigbus_handler;
1160 sigaction(SIGBUS, &action, NULL);
1161
1162 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1163 }
1164 #else /* !CONFIG_LINUX */
1165 static void qemu_init_sigbus(void)
1166 {
1167 }
1168 #endif /* !CONFIG_LINUX */
1169
1170 static QemuMutex qemu_global_mutex;
1171
1172 static QemuThread io_thread;
1173
1174 /* cpu creation */
1175 static QemuCond qemu_cpu_cond;
1176 /* system init */
1177 static QemuCond qemu_pause_cond;
1178
1179 void qemu_init_cpu_loop(void)
1180 {
1181 qemu_init_sigbus();
1182 qemu_cond_init(&qemu_cpu_cond);
1183 qemu_cond_init(&qemu_pause_cond);
1184 qemu_mutex_init(&qemu_global_mutex);
1185
1186 qemu_thread_get_self(&io_thread);
1187 }
1188
1189 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1190 {
1191 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1192 }
1193
1194 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1195 {
1196 if (kvm_destroy_vcpu(cpu) < 0) {
1197 error_report("kvm_destroy_vcpu failed");
1198 exit(EXIT_FAILURE);
1199 }
1200 }
1201
1202 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1203 {
1204 }
1205
1206 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1207 {
1208 g_assert(qemu_cpu_is_self(cpu));
1209 cpu->stop = false;
1210 cpu->stopped = true;
1211 if (exit) {
1212 cpu_exit(cpu);
1213 }
1214 qemu_cond_broadcast(&qemu_pause_cond);
1215 }
1216
1217 static void qemu_wait_io_event_common(CPUState *cpu)
1218 {
1219 atomic_mb_set(&cpu->thread_kicked, false);
1220 if (cpu->stop) {
1221 qemu_cpu_stop(cpu, false);
1222 }
1223 process_queued_cpu_work(cpu);
1224 }
1225
1226 static void qemu_tcg_rr_wait_io_event(void)
1227 {
1228 CPUState *cpu;
1229
1230 while (all_cpu_threads_idle()) {
1231 stop_tcg_kick_timer();
1232 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1233 }
1234
1235 start_tcg_kick_timer();
1236
1237 CPU_FOREACH(cpu) {
1238 qemu_wait_io_event_common(cpu);
1239 }
1240 }
1241
1242 static void qemu_wait_io_event(CPUState *cpu)
1243 {
1244 while (cpu_thread_is_idle(cpu)) {
1245 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1246 }
1247
1248 #ifdef _WIN32
1249 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1250 if (!tcg_enabled()) {
1251 SleepEx(0, TRUE);
1252 }
1253 #endif
1254 qemu_wait_io_event_common(cpu);
1255 }
1256
1257 static void *qemu_kvm_cpu_thread_fn(void *arg)
1258 {
1259 CPUState *cpu = arg;
1260 int r;
1261
1262 rcu_register_thread();
1263
1264 qemu_mutex_lock_iothread();
1265 qemu_thread_get_self(cpu->thread);
1266 cpu->thread_id = qemu_get_thread_id();
1267 cpu->can_do_io = 1;
1268 current_cpu = cpu;
1269
1270 r = kvm_init_vcpu(cpu);
1271 if (r < 0) {
1272 error_report("kvm_init_vcpu failed: %s", strerror(-r));
1273 exit(1);
1274 }
1275
1276 kvm_init_cpu_signals(cpu);
1277
1278 /* signal CPU creation */
1279 cpu->created = true;
1280 qemu_cond_signal(&qemu_cpu_cond);
1281 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1282
1283 do {
1284 if (cpu_can_run(cpu)) {
1285 r = kvm_cpu_exec(cpu);
1286 if (r == EXCP_DEBUG) {
1287 cpu_handle_guest_debug(cpu);
1288 }
1289 }
1290 qemu_wait_io_event(cpu);
1291 } while (!cpu->unplug || cpu_can_run(cpu));
1292
1293 qemu_kvm_destroy_vcpu(cpu);
1294 cpu->created = false;
1295 qemu_cond_signal(&qemu_cpu_cond);
1296 qemu_mutex_unlock_iothread();
1297 rcu_unregister_thread();
1298 return NULL;
1299 }
1300
1301 static void *qemu_dummy_cpu_thread_fn(void *arg)
1302 {
1303 #ifdef _WIN32
1304 error_report("qtest is not supported under Windows");
1305 exit(1);
1306 #else
1307 CPUState *cpu = arg;
1308 sigset_t waitset;
1309 int r;
1310
1311 rcu_register_thread();
1312
1313 qemu_mutex_lock_iothread();
1314 qemu_thread_get_self(cpu->thread);
1315 cpu->thread_id = qemu_get_thread_id();
1316 cpu->can_do_io = 1;
1317 current_cpu = cpu;
1318
1319 sigemptyset(&waitset);
1320 sigaddset(&waitset, SIG_IPI);
1321
1322 /* signal CPU creation */
1323 cpu->created = true;
1324 qemu_cond_signal(&qemu_cpu_cond);
1325 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1326
1327 do {
1328 qemu_mutex_unlock_iothread();
1329 do {
1330 int sig;
1331 r = sigwait(&waitset, &sig);
1332 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1333 if (r == -1) {
1334 perror("sigwait");
1335 exit(1);
1336 }
1337 qemu_mutex_lock_iothread();
1338 qemu_wait_io_event(cpu);
1339 } while (!cpu->unplug);
1340
1341 qemu_mutex_unlock_iothread();
1342 rcu_unregister_thread();
1343 return NULL;
1344 #endif
1345 }
1346
1347 static int64_t tcg_get_icount_limit(void)
1348 {
1349 int64_t deadline;
1350
1351 if (replay_mode != REPLAY_MODE_PLAY) {
1352 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1353
1354 /* Maintain prior (possibly buggy) behaviour where if no deadline
1355 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1356 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1357 * nanoseconds.
1358 */
1359 if ((deadline < 0) || (deadline > INT32_MAX)) {
1360 deadline = INT32_MAX;
1361 }
1362
1363 return qemu_icount_round(deadline);
1364 } else {
1365 return replay_get_instructions();
1366 }
1367 }
1368
1369 static void handle_icount_deadline(void)
1370 {
1371 assert(qemu_in_vcpu_thread());
1372 if (use_icount) {
1373 int64_t deadline =
1374 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1375
1376 if (deadline == 0) {
1377 /* Wake up other AioContexts. */
1378 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1379 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1380 }
1381 }
1382 }
1383
1384 static void prepare_icount_for_run(CPUState *cpu)
1385 {
1386 if (use_icount) {
1387 int insns_left;
1388
1389 /* These should always be cleared by process_icount_data after
1390          * each vCPU execution. However, u16.high can be raised
1391          * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt.
1392 */
1393 g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1394 g_assert(cpu->icount_extra == 0);
1395
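        /*
         * The budget is split between the 16-bit decrementer consumed by the
         * translated code (at most 0xffff instructions per slice) and
         * icount_extra for the remainder; e.g. a budget of 100000 becomes
         * u16.low = 65535 and icount_extra = 34465.
         */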
1396 cpu->icount_budget = tcg_get_icount_limit();
1397 insns_left = MIN(0xffff, cpu->icount_budget);
1398 cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1399 cpu->icount_extra = cpu->icount_budget - insns_left;
1400
1401 replay_mutex_lock();
1402 }
1403 }
1404
1405 static void process_icount_data(CPUState *cpu)
1406 {
1407 if (use_icount) {
1408 /* Account for executed instructions */
1409 cpu_update_icount(cpu);
1410
1411 /* Reset the counters */
1412 cpu_neg(cpu)->icount_decr.u16.low = 0;
1413 cpu->icount_extra = 0;
1414 cpu->icount_budget = 0;
1415
1416 replay_account_executed_instructions();
1417
1418 replay_mutex_unlock();
1419 }
1420 }
1421
1422
1423 static int tcg_cpu_exec(CPUState *cpu)
1424 {
1425 int ret;
1426 #ifdef CONFIG_PROFILER
1427 int64_t ti;
1428 #endif
1429
1430 assert(tcg_enabled());
1431 #ifdef CONFIG_PROFILER
1432 ti = profile_getclock();
1433 #endif
1434 cpu_exec_start(cpu);
1435 ret = cpu_exec(cpu);
1436 cpu_exec_end(cpu);
1437 #ifdef CONFIG_PROFILER
1438 atomic_set(&tcg_ctx->prof.cpu_exec_time,
1439 tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1440 #endif
1441 return ret;
1442 }
1443
1444 /* Destroy any remaining vCPUs which have been unplugged and have
1445 * finished running
1446 */
1447 static void deal_with_unplugged_cpus(void)
1448 {
1449 CPUState *cpu;
1450
1451 CPU_FOREACH(cpu) {
1452 if (cpu->unplug && !cpu_can_run(cpu)) {
1453 qemu_tcg_destroy_vcpu(cpu);
1454 cpu->created = false;
1455 qemu_cond_signal(&qemu_cpu_cond);
1456 break;
1457 }
1458 }
1459 }
1460
1461 /* Single-threaded TCG
1462 *
1463 * In the single-threaded case each vCPU is simulated in turn. If
1464 * there is more than a single vCPU we create a simple timer to kick
1465 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1466 * This is done explicitly rather than relying on side-effects
1467 * elsewhere.
1468 */
1469
1470 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1471 {
1472 CPUState *cpu = arg;
1473
1474 assert(tcg_enabled());
1475 rcu_register_thread();
1476 tcg_register_thread();
1477
1478 qemu_mutex_lock_iothread();
1479 qemu_thread_get_self(cpu->thread);
1480
1481 cpu->thread_id = qemu_get_thread_id();
1482 cpu->created = true;
1483 cpu->can_do_io = 1;
1484 qemu_cond_signal(&qemu_cpu_cond);
1485 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1486
1487 /* wait for initial kick-off after machine start */
1488 while (first_cpu->stopped) {
1489 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1490
1491 /* process any pending work */
1492 CPU_FOREACH(cpu) {
1493 current_cpu = cpu;
1494 qemu_wait_io_event_common(cpu);
1495 }
1496 }
1497
1498 start_tcg_kick_timer();
1499
1500 cpu = first_cpu;
1501
1502 /* process any pending work */
1503 cpu->exit_request = 1;
1504
1505 while (1) {
1506 qemu_mutex_unlock_iothread();
1507 replay_mutex_lock();
1508 qemu_mutex_lock_iothread();
1509 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1510 qemu_account_warp_timer();
1511
1512 /* Run the timers here. This is much more efficient than
1513 * waking up the I/O thread and waiting for completion.
1514 */
1515 handle_icount_deadline();
1516
1517 replay_mutex_unlock();
1518
1519 if (!cpu) {
1520 cpu = first_cpu;
1521 }
1522
1523 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1524
1525 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1526 current_cpu = cpu;
1527
1528 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1529 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1530
1531 if (cpu_can_run(cpu)) {
1532 int r;
1533
1534 qemu_mutex_unlock_iothread();
1535 prepare_icount_for_run(cpu);
1536
1537 r = tcg_cpu_exec(cpu);
1538
1539 process_icount_data(cpu);
1540 qemu_mutex_lock_iothread();
1541
1542 if (r == EXCP_DEBUG) {
1543 cpu_handle_guest_debug(cpu);
1544 break;
1545 } else if (r == EXCP_ATOMIC) {
1546 qemu_mutex_unlock_iothread();
1547 cpu_exec_step_atomic(cpu);
1548 qemu_mutex_lock_iothread();
1549 break;
1550 }
1551 } else if (cpu->stop) {
1552 if (cpu->unplug) {
1553 cpu = CPU_NEXT(cpu);
1554 }
1555 break;
1556 }
1557
1558 cpu = CPU_NEXT(cpu);
1559 } /* while (cpu && !cpu->exit_request).. */
1560
1561 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1562 atomic_set(&tcg_current_rr_cpu, NULL);
1563
1564 if (cpu && cpu->exit_request) {
1565 atomic_mb_set(&cpu->exit_request, 0);
1566 }
1567
1568 if (use_icount && all_cpu_threads_idle()) {
1569 /*
1570 * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1571 * in the main_loop, wake it up in order to start the warp timer.
1572 */
1573 qemu_notify_event();
1574 }
1575
1576 qemu_tcg_rr_wait_io_event();
1577 deal_with_unplugged_cpus();
1578 }
1579
1580 rcu_unregister_thread();
1581 return NULL;
1582 }
1583
1584 static void *qemu_hax_cpu_thread_fn(void *arg)
1585 {
1586 CPUState *cpu = arg;
1587 int r;
1588
1589 rcu_register_thread();
1590 qemu_mutex_lock_iothread();
1591 qemu_thread_get_self(cpu->thread);
1592
1593 cpu->thread_id = qemu_get_thread_id();
1594 cpu->created = true;
1595 cpu->halted = 0;
1596 current_cpu = cpu;
1597
1598 hax_init_vcpu(cpu);
1599 qemu_cond_signal(&qemu_cpu_cond);
1600 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1601
1602 do {
1603 if (cpu_can_run(cpu)) {
1604 r = hax_smp_cpu_exec(cpu);
1605 if (r == EXCP_DEBUG) {
1606 cpu_handle_guest_debug(cpu);
1607 }
1608 }
1609
1610 qemu_wait_io_event(cpu);
1611 } while (!cpu->unplug || cpu_can_run(cpu));
1612 rcu_unregister_thread();
1613 return NULL;
1614 }
1615
1616 /* The HVF-specific vCPU thread function. This one should only run when the host
1617 * CPU supports the VMX "unrestricted guest" feature. */
1618 static void *qemu_hvf_cpu_thread_fn(void *arg)
1619 {
1620 CPUState *cpu = arg;
1621
1622 int r;
1623
1624 assert(hvf_enabled());
1625
1626 rcu_register_thread();
1627
1628 qemu_mutex_lock_iothread();
1629 qemu_thread_get_self(cpu->thread);
1630
1631 cpu->thread_id = qemu_get_thread_id();
1632 cpu->can_do_io = 1;
1633 current_cpu = cpu;
1634
1635 hvf_init_vcpu(cpu);
1636
1637 /* signal CPU creation */
1638 cpu->created = true;
1639 qemu_cond_signal(&qemu_cpu_cond);
1640 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1641
1642 do {
1643 if (cpu_can_run(cpu)) {
1644 r = hvf_vcpu_exec(cpu);
1645 if (r == EXCP_DEBUG) {
1646 cpu_handle_guest_debug(cpu);
1647 }
1648 }
1649 qemu_wait_io_event(cpu);
1650 } while (!cpu->unplug || cpu_can_run(cpu));
1651
1652 hvf_vcpu_destroy(cpu);
1653 cpu->created = false;
1654 qemu_cond_signal(&qemu_cpu_cond);
1655 qemu_mutex_unlock_iothread();
1656 rcu_unregister_thread();
1657 return NULL;
1658 }
1659
1660 static void *qemu_whpx_cpu_thread_fn(void *arg)
1661 {
1662 CPUState *cpu = arg;
1663 int r;
1664
1665 rcu_register_thread();
1666
1667 qemu_mutex_lock_iothread();
1668 qemu_thread_get_self(cpu->thread);
1669 cpu->thread_id = qemu_get_thread_id();
1670 current_cpu = cpu;
1671
1672 r = whpx_init_vcpu(cpu);
1673 if (r < 0) {
1674 fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1675 exit(1);
1676 }
1677
1678 /* signal CPU creation */
1679 cpu->created = true;
1680 qemu_cond_signal(&qemu_cpu_cond);
1681 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1682
1683 do {
1684 if (cpu_can_run(cpu)) {
1685 r = whpx_vcpu_exec(cpu);
1686 if (r == EXCP_DEBUG) {
1687 cpu_handle_guest_debug(cpu);
1688 }
1689 }
1690 while (cpu_thread_is_idle(cpu)) {
1691 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1692 }
1693 qemu_wait_io_event_common(cpu);
1694 } while (!cpu->unplug || cpu_can_run(cpu));
1695
1696 whpx_destroy_vcpu(cpu);
1697 cpu->created = false;
1698 qemu_cond_signal(&qemu_cpu_cond);
1699 qemu_mutex_unlock_iothread();
1700 rcu_unregister_thread();
1701 return NULL;
1702 }
1703
1704 #ifdef _WIN32
1705 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1706 {
1707 }
1708 #endif
1709
1710 /* Multi-threaded TCG
1711 *
1712 * In the multi-threaded case each vCPU has its own thread. The TLS
1713 * variable current_cpu can be used deep in the code to find the
1714 * current CPUState for a given thread.
1715 */
1716
1717 static void *qemu_tcg_cpu_thread_fn(void *arg)
1718 {
1719 CPUState *cpu = arg;
1720
1721 assert(tcg_enabled());
1722 g_assert(!use_icount);
1723
1724 rcu_register_thread();
1725 tcg_register_thread();
1726
1727 qemu_mutex_lock_iothread();
1728 qemu_thread_get_self(cpu->thread);
1729
1730 cpu->thread_id = qemu_get_thread_id();
1731 cpu->created = true;
1732 cpu->can_do_io = 1;
1733 current_cpu = cpu;
1734 qemu_cond_signal(&qemu_cpu_cond);
1735 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1736
1737 /* process any pending work */
1738 cpu->exit_request = 1;
1739
1740 do {
1741 if (cpu_can_run(cpu)) {
1742 int r;
1743 qemu_mutex_unlock_iothread();
1744 r = tcg_cpu_exec(cpu);
1745 qemu_mutex_lock_iothread();
1746 switch (r) {
1747 case EXCP_DEBUG:
1748 cpu_handle_guest_debug(cpu);
1749 break;
1750 case EXCP_HALTED:
1751 /* during start-up the vCPU is reset and the thread is
1752 * kicked several times. If we don't ensure we go back
1753 * to sleep in the halted state we won't cleanly
1754                  * start up when the vCPU is enabled.
1755 *
1756 * cpu->halted should ensure we sleep in wait_io_event
1757 */
1758 g_assert(cpu->halted);
1759 break;
1760 case EXCP_ATOMIC:
1761 qemu_mutex_unlock_iothread();
1762 cpu_exec_step_atomic(cpu);
1763 qemu_mutex_lock_iothread();
1764 default:
1765 /* Ignore everything else? */
1766 break;
1767 }
1768 }
1769
1770 atomic_mb_set(&cpu->exit_request, 0);
1771 qemu_wait_io_event(cpu);
1772 } while (!cpu->unplug || cpu_can_run(cpu));
1773
1774 qemu_tcg_destroy_vcpu(cpu);
1775 cpu->created = false;
1776 qemu_cond_signal(&qemu_cpu_cond);
1777 qemu_mutex_unlock_iothread();
1778 rcu_unregister_thread();
1779 return NULL;
1780 }
1781
1782 static void qemu_cpu_kick_thread(CPUState *cpu)
1783 {
1784 #ifndef _WIN32
1785 int err;
1786
1787 if (cpu->thread_kicked) {
1788 return;
1789 }
1790 cpu->thread_kicked = true;
1791 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1792 if (err && err != ESRCH) {
1793 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1794 exit(1);
1795 }
1796 #else /* _WIN32 */
1797 if (!qemu_cpu_is_self(cpu)) {
1798 if (whpx_enabled()) {
1799 whpx_vcpu_kick(cpu);
1800 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1801 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1802 __func__, GetLastError());
1803 exit(1);
1804 }
1805 }
1806 #endif
1807 }
1808
1809 void qemu_cpu_kick(CPUState *cpu)
1810 {
1811 qemu_cond_broadcast(cpu->halt_cond);
1812 if (tcg_enabled()) {
1813 cpu_exit(cpu);
1814 /* NOP unless doing single-thread RR */
1815 qemu_cpu_kick_rr_cpu();
1816 } else {
1817 if (hax_enabled()) {
1818 /*
1819 * FIXME: race condition with the exit_request check in
1820 * hax_vcpu_hax_exec
1821 */
1822 cpu->exit_request = 1;
1823 }
1824 qemu_cpu_kick_thread(cpu);
1825 }
1826 }
1827
1828 void qemu_cpu_kick_self(void)
1829 {
1830 assert(current_cpu);
1831 qemu_cpu_kick_thread(current_cpu);
1832 }
1833
1834 bool qemu_cpu_is_self(CPUState *cpu)
1835 {
1836 return qemu_thread_is_self(cpu->thread);
1837 }
1838
1839 bool qemu_in_vcpu_thread(void)
1840 {
1841 return current_cpu && qemu_cpu_is_self(current_cpu);
1842 }
1843
1844 static __thread bool iothread_locked = false;
1845
1846 bool qemu_mutex_iothread_locked(void)
1847 {
1848 return iothread_locked;
1849 }
1850
1851 /*
1852 * The BQL is taken from so many places that it is worth profiling the
1853 * callers directly, instead of funneling them all through a single function.
1854 */
1855 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1856 {
1857 QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1858
1859 g_assert(!qemu_mutex_iothread_locked());
1860 bql_lock(&qemu_global_mutex, file, line);
1861 iothread_locked = true;
1862 }
1863
1864 void qemu_mutex_unlock_iothread(void)
1865 {
1866 g_assert(qemu_mutex_iothread_locked());
1867 iothread_locked = false;
1868 qemu_mutex_unlock(&qemu_global_mutex);
1869 }
1870
1871 static bool all_vcpus_paused(void)
1872 {
1873 CPUState *cpu;
1874
1875 CPU_FOREACH(cpu) {
1876 if (!cpu->stopped) {
1877 return false;
1878 }
1879 }
1880
1881 return true;
1882 }
1883
1884 void pause_all_vcpus(void)
1885 {
1886 CPUState *cpu;
1887
1888 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1889 CPU_FOREACH(cpu) {
1890 if (qemu_cpu_is_self(cpu)) {
1891 qemu_cpu_stop(cpu, true);
1892 } else {
1893 cpu->stop = true;
1894 qemu_cpu_kick(cpu);
1895 }
1896 }
1897
1898 /* We need to drop the replay_lock so any vCPU threads woken up
1899 * can finish their replay tasks
1900 */
1901 replay_mutex_unlock();
1902
1903 while (!all_vcpus_paused()) {
1904 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1905 CPU_FOREACH(cpu) {
1906 qemu_cpu_kick(cpu);
1907 }
1908 }
1909
1910 qemu_mutex_unlock_iothread();
1911 replay_mutex_lock();
1912 qemu_mutex_lock_iothread();
1913 }
1914
1915 void cpu_resume(CPUState *cpu)
1916 {
1917 cpu->stop = false;
1918 cpu->stopped = false;
1919 qemu_cpu_kick(cpu);
1920 }
1921
1922 void resume_all_vcpus(void)
1923 {
1924 CPUState *cpu;
1925
1926 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1927 CPU_FOREACH(cpu) {
1928 cpu_resume(cpu);
1929 }
1930 }
1931
1932 void cpu_remove_sync(CPUState *cpu)
1933 {
1934 cpu->stop = true;
1935 cpu->unplug = true;
1936 qemu_cpu_kick(cpu);
1937 qemu_mutex_unlock_iothread();
1938 qemu_thread_join(cpu->thread);
1939 qemu_mutex_lock_iothread();
1940 }
1941
1942 /* Size of the temporary buffers used to form a vCPU thread name */
1943 #define VCPU_THREAD_NAME_SIZE 16
1944
1945 static void qemu_tcg_init_vcpu(CPUState *cpu)
1946 {
1947 char thread_name[VCPU_THREAD_NAME_SIZE];
1948 static QemuCond *single_tcg_halt_cond;
1949 static QemuThread *single_tcg_cpu_thread;
1950 static int tcg_region_inited;
1951
1952 assert(tcg_enabled());
1953 /*
1954 * Initialize TCG regions--once. Now is a good time, because:
1955 * (1) TCG's init context, prologue and target globals have been set up.
1956 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1957 * -accel flag is processed, so the check doesn't work then).
1958 */
1959 if (!tcg_region_inited) {
1960 tcg_region_inited = 1;
1961 tcg_region_init();
1962 }
1963
1964 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1965 cpu->thread = g_malloc0(sizeof(QemuThread));
1966 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1967 qemu_cond_init(cpu->halt_cond);
1968
1969 if (qemu_tcg_mttcg_enabled()) {
1970 /* create a thread per vCPU with TCG (MTTCG) */
1971 parallel_cpus = true;
1972 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1973 cpu->cpu_index);
1974
1975 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1976 cpu, QEMU_THREAD_JOINABLE);
1977
1978 } else {
1979 /* share a single thread for all cpus with TCG */
1980 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1981 qemu_thread_create(cpu->thread, thread_name,
1982 qemu_tcg_rr_cpu_thread_fn,
1983 cpu, QEMU_THREAD_JOINABLE);
1984
1985 single_tcg_halt_cond = cpu->halt_cond;
1986 single_tcg_cpu_thread = cpu->thread;
1987 }
1988 #ifdef _WIN32
1989 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1990 #endif
1991 } else {
1992 /* For non-MTTCG cases we share the thread */
1993 cpu->thread = single_tcg_cpu_thread;
1994 cpu->halt_cond = single_tcg_halt_cond;
1995 cpu->thread_id = first_cpu->thread_id;
1996 cpu->can_do_io = 1;
1997 cpu->created = true;
1998 }
1999 }
2000
2001 static void qemu_hax_start_vcpu(CPUState *cpu)
2002 {
2003 char thread_name[VCPU_THREAD_NAME_SIZE];
2004
2005 cpu->thread = g_malloc0(sizeof(QemuThread));
2006 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2007 qemu_cond_init(cpu->halt_cond);
2008
2009 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2010 cpu->cpu_index);
2011 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2012 cpu, QEMU_THREAD_JOINABLE);
2013 #ifdef _WIN32
2014 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2015 #endif
2016 }
2017
2018 static void qemu_kvm_start_vcpu(CPUState *cpu)
2019 {
2020 char thread_name[VCPU_THREAD_NAME_SIZE];
2021
2022 cpu->thread = g_malloc0(sizeof(QemuThread));
2023 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2024 qemu_cond_init(cpu->halt_cond);
2025 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2026 cpu->cpu_index);
2027 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2028 cpu, QEMU_THREAD_JOINABLE);
2029 }
2030
2031 static void qemu_hvf_start_vcpu(CPUState *cpu)
2032 {
2033 char thread_name[VCPU_THREAD_NAME_SIZE];
2034
2035 /* HVF currently does not support TCG, and only runs in
2036 * unrestricted-guest mode. */
2037 assert(hvf_enabled());
2038
2039 cpu->thread = g_malloc0(sizeof(QemuThread));
2040 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2041 qemu_cond_init(cpu->halt_cond);
2042
2043 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2044 cpu->cpu_index);
2045 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2046 cpu, QEMU_THREAD_JOINABLE);
2047 }
2048
2049 static void qemu_whpx_start_vcpu(CPUState *cpu)
2050 {
2051 char thread_name[VCPU_THREAD_NAME_SIZE];
2052
2053 cpu->thread = g_malloc0(sizeof(QemuThread));
2054 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2055 qemu_cond_init(cpu->halt_cond);
2056 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2057 cpu->cpu_index);
2058 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2059 cpu, QEMU_THREAD_JOINABLE);
2060 #ifdef _WIN32
2061 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2062 #endif
2063 }
2064
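/*
 * Start a placeholder vCPU thread for configurations where no
 * accelerator executes guest code (e.g. qtest); it only waits for and
 * processes queued CPU work.
 */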
2065 static void qemu_dummy_start_vcpu(CPUState *cpu)
2066 {
2067 char thread_name[VCPU_THREAD_NAME_SIZE];
2068
2069 cpu->thread = g_malloc0(sizeof(QemuThread));
2070 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2071 qemu_cond_init(cpu->halt_cond);
2072 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2073 cpu->cpu_index);
2074 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2075 QEMU_THREAD_JOINABLE);
2076 }
2077
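/*
 * Common vCPU bring-up: record the SMP topology on the CPU, mark it
 * stopped, give it a default address space if the target did not set
 * one up, start the accelerator-specific vCPU thread, and wait until
 * that thread reports the vCPU as created.
 */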
2078 void qemu_init_vcpu(CPUState *cpu)
2079 {
2080 cpu->nr_cores = smp_cores;
2081 cpu->nr_threads = smp_threads;
2082 cpu->stopped = true;
2083 cpu->random_seed = qemu_guest_random_seed_thread_part1();
2084
2085 if (!cpu->as) {
2086 /* If the target cpu hasn't set up any address spaces itself,
2087 * give it the default one.
2088 */
2089 cpu->num_ases = 1;
2090 cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2091 }
2092
2093 if (kvm_enabled()) {
2094 qemu_kvm_start_vcpu(cpu);
2095 } else if (hax_enabled()) {
2096 qemu_hax_start_vcpu(cpu);
2097 } else if (hvf_enabled()) {
2098 qemu_hvf_start_vcpu(cpu);
2099 } else if (tcg_enabled()) {
2100 qemu_tcg_init_vcpu(cpu);
2101 } else if (whpx_enabled()) {
2102 qemu_whpx_start_vcpu(cpu);
2103 } else {
2104 qemu_dummy_start_vcpu(cpu);
2105 }
2106
2107 while (!cpu->created) {
2108 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2109 }
2110 }
2111
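/* Request that the vCPU running on this thread (if any) stop, and kick
 * it out of its execution loop. */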
2112 void cpu_stop_current(void)
2113 {
2114 if (current_cpu) {
2115 current_cpu->stop = true;
2116 cpu_exit(current_cpu);
2117 }
2118 }
2119
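/*
 * Stop the VM.  From a vCPU thread the stop is deferred: a vmstop
 * request is queued for the main loop and only the current CPU is
 * stopped here; from other threads the VM is stopped synchronously
 * via do_vm_stop().
 */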
2120 int vm_stop(RunState state)
2121 {
2122 if (qemu_in_vcpu_thread()) {
2123 qemu_system_vmstop_request_prepare();
2124 qemu_system_vmstop_request(state);
2125 /*
2126 * FIXME: should not return to device code in case
2127 * vm_stop() has been requested.
2128 */
2129 cpu_stop_current();
2130 return 0;
2131 }
2132
2133 return do_vm_stop(state, true);
2134 }
2135
2136 /**
2137 * Prepare for (re)starting the VM.
2138 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2139 * running or in case of an error condition), 0 otherwise.
2140 */
2141 int vm_prepare_start(void)
2142 {
2143 RunState requested;
2144
2145 qemu_vmstop_requested(&requested);
2146 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2147 return -1;
2148 }
2149
2150 /* Ensure that a STOP/RESUME pair of events is emitted if a
2151 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2152 * example, is documented to always be followed by the STOP
2153 * event.
2154 */
2155 if (runstate_is_running()) {
2156 qapi_event_send_stop();
2157 qapi_event_send_resume();
2158 return -1;
2159 }
2160
2161 /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2162 qapi_event_send_resume();
2163
2164 replay_enable_events();
2165 cpu_enable_ticks();
2166 runstate_set(RUN_STATE_RUNNING);
2167 vm_state_notify(1, RUN_STATE_RUNNING);
2168 return 0;
2169 }
2170
2171 void vm_start(void)
2172 {
2173 if (!vm_prepare_start()) {
2174 resume_all_vcpus();
2175 }
2176 }
2177
2178 /* Does a state transition even if the VM is already stopped;
2179 the current state is forgotten forever. */
2180 int vm_stop_force_state(RunState state)
2181 {
2182 if (runstate_is_running()) {
2183 return vm_stop(state);
2184 } else {
2185 runstate_set(state);
2186
2187 bdrv_drain_all();
2188 /* Make sure to return an error if the flush in a previous vm_stop()
2189 * failed. */
2190 return bdrv_flush_all();
2191 }
2192 }
2193
2194 void list_cpus(const char *optarg)
2195 {
2196 /* XXX: implement xxx_cpu_list for targets that still lack it */
2197 #if defined(cpu_list)
2198 cpu_list();
2199 #endif
2200 }
2201
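/*
 * QMP 'query-cpus'.  Unlike qmp_query_cpus_fast() below, this
 * synchronizes each vCPU's register state first, which may interrupt
 * running vCPU threads.
 */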
2202 CpuInfoList *qmp_query_cpus(Error **errp)
2203 {
2204 MachineState *ms = MACHINE(qdev_get_machine());
2205 MachineClass *mc = MACHINE_GET_CLASS(ms);
2206 CpuInfoList *head = NULL, *cur_item = NULL;
2207 CPUState *cpu;
2208
2209 CPU_FOREACH(cpu) {
2210 CpuInfoList *info;
2211 #if defined(TARGET_I386)
2212 X86CPU *x86_cpu = X86_CPU(cpu);
2213 CPUX86State *env = &x86_cpu->env;
2214 #elif defined(TARGET_PPC)
2215 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2216 CPUPPCState *env = &ppc_cpu->env;
2217 #elif defined(TARGET_SPARC)
2218 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2219 CPUSPARCState *env = &sparc_cpu->env;
2220 #elif defined(TARGET_RISCV)
2221 RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2222 CPURISCVState *env = &riscv_cpu->env;
2223 #elif defined(TARGET_MIPS)
2224 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2225 CPUMIPSState *env = &mips_cpu->env;
2226 #elif defined(TARGET_TRICORE)
2227 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2228 CPUTriCoreState *env = &tricore_cpu->env;
2229 #elif defined(TARGET_S390X)
2230 S390CPU *s390_cpu = S390_CPU(cpu);
2231 CPUS390XState *env = &s390_cpu->env;
2232 #endif
2233
2234 cpu_synchronize_state(cpu);
2235
2236 info = g_malloc0(sizeof(*info));
2237 info->value = g_malloc0(sizeof(*info->value));
2238 info->value->CPU = cpu->cpu_index;
2239 info->value->current = (cpu == first_cpu);
2240 info->value->halted = cpu->halted;
2241 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2242 info->value->thread_id = cpu->thread_id;
2243 #if defined(TARGET_I386)
2244 info->value->arch = CPU_INFO_ARCH_X86;
2245 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2246 #elif defined(TARGET_PPC)
2247 info->value->arch = CPU_INFO_ARCH_PPC;
2248 info->value->u.ppc.nip = env->nip;
2249 #elif defined(TARGET_SPARC)
2250 info->value->arch = CPU_INFO_ARCH_SPARC;
2251 info->value->u.q_sparc.pc = env->pc;
2252 info->value->u.q_sparc.npc = env->npc;
2253 #elif defined(TARGET_MIPS)
2254 info->value->arch = CPU_INFO_ARCH_MIPS;
2255 info->value->u.q_mips.PC = env->active_tc.PC;
2256 #elif defined(TARGET_TRICORE)
2257 info->value->arch = CPU_INFO_ARCH_TRICORE;
2258 info->value->u.tricore.PC = env->PC;
2259 #elif defined(TARGET_S390X)
2260 info->value->arch = CPU_INFO_ARCH_S390;
2261 info->value->u.s390.cpu_state = env->cpu_state;
2262 #elif defined(TARGET_RISCV)
2263 info->value->arch = CPU_INFO_ARCH_RISCV;
2264 info->value->u.riscv.pc = env->pc;
2265 #else
2266 info->value->arch = CPU_INFO_ARCH_OTHER;
2267 #endif
2268 info->value->has_props = !!mc->cpu_index_to_instance_props;
2269 if (info->value->has_props) {
2270 CpuInstanceProperties *props;
2271 props = g_malloc0(sizeof(*props));
2272 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2273 info->value->props = props;
2274 }
2275
2276 /* XXX: waiting for the qapi to support GSList */
2277 if (!cur_item) {
2278 head = cur_item = info;
2279 } else {
2280 cur_item->next = info;
2281 cur_item = info;
2282 }
2283 }
2284
2285 return head;
2286 }
2287
2288 static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2289 {
2290 /*
2291 * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2292 * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2293 */
2294 switch (target) {
2295 case SYS_EMU_TARGET_I386:
2296 case SYS_EMU_TARGET_X86_64:
2297 return CPU_INFO_ARCH_X86;
2298
2299 case SYS_EMU_TARGET_PPC:
2300 case SYS_EMU_TARGET_PPC64:
2301 return CPU_INFO_ARCH_PPC;
2302
2303 case SYS_EMU_TARGET_SPARC:
2304 case SYS_EMU_TARGET_SPARC64:
2305 return CPU_INFO_ARCH_SPARC;
2306
2307 case SYS_EMU_TARGET_MIPS:
2308 case SYS_EMU_TARGET_MIPSEL:
2309 case SYS_EMU_TARGET_MIPS64:
2310 case SYS_EMU_TARGET_MIPS64EL:
2311 return CPU_INFO_ARCH_MIPS;
2312
2313 case SYS_EMU_TARGET_TRICORE:
2314 return CPU_INFO_ARCH_TRICORE;
2315
2316 case SYS_EMU_TARGET_S390X:
2317 return CPU_INFO_ARCH_S390;
2318
2319 case SYS_EMU_TARGET_RISCV32:
2320 case SYS_EMU_TARGET_RISCV64:
2321 return CPU_INFO_ARCH_RISCV;
2322
2323 default:
2324 return CPU_INFO_ARCH_OTHER;
2325 }
2326 }
2327
2328 static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2329 {
2330 #ifdef TARGET_S390X
2331 S390CPU *s390_cpu = S390_CPU(cpu);
2332 CPUS390XState *env = &s390_cpu->env;
2333
2334 info->cpu_state = env->cpu_state;
2335 #else
2336 abort();
2337 #endif
2338 }
2339
2340 /*
2341 * fast means: we NEVER interrupt vCPU threads to retrieve
2342 * information from KVM.
2343 */
2344 CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2345 {
2346 MachineState *ms = MACHINE(qdev_get_machine());
2347 MachineClass *mc = MACHINE_GET_CLASS(ms);
2348 CpuInfoFastList *head = NULL, *cur_item = NULL;
2349 SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2350 -1, &error_abort);
2351 CPUState *cpu;
2352
2353 CPU_FOREACH(cpu) {
2354 CpuInfoFastList *info = g_malloc0(sizeof(*info));
2355 info->value = g_malloc0(sizeof(*info->value));
2356
2357 info->value->cpu_index = cpu->cpu_index;
2358 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2359 info->value->thread_id = cpu->thread_id;
2360
2361 info->value->has_props = !!mc->cpu_index_to_instance_props;
2362 if (info->value->has_props) {
2363 CpuInstanceProperties *props;
2364 props = g_malloc0(sizeof(*props));
2365 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2366 info->value->props = props;
2367 }
2368
2369 info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2370 info->value->target = target;
2371 if (target == SYS_EMU_TARGET_S390X) {
2372 cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2373 }
2374
2375 if (!cur_item) {
2376 head = cur_item = info;
2377 } else {
2378 cur_item->next = info;
2379 cur_item = info;
2380 }
2381 }
2382
2383 return head;
2384 }
2385
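/*
 * QMP 'memsave': write @size bytes of guest virtual memory starting at
 * @addr, as translated by the selected CPU (default: CPU 0), to
 * @filename.
 */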
2386 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2387 bool has_cpu, int64_t cpu_index, Error **errp)
2388 {
2389 FILE *f;
2390 uint32_t l;
2391 CPUState *cpu;
2392 uint8_t buf[1024];
2393 int64_t orig_addr = addr, orig_size = size;
2394
2395 if (!has_cpu) {
2396 cpu_index = 0;
2397 }
2398
2399 cpu = qemu_get_cpu(cpu_index);
2400 if (cpu == NULL) {
2401 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2402 "a CPU number");
2403 return;
2404 }
2405
2406 f = fopen(filename, "wb");
2407 if (!f) {
2408 error_setg_file_open(errp, errno, filename);
2409 return;
2410 }
2411
2412 while (size != 0) {
2413 l = sizeof(buf);
2414 if (l > size) {
2415 l = size;
}
2416 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2417 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2418 " specified", orig_addr, orig_size);
2419 goto exit;
2420 }
2421 if (fwrite(buf, 1, l, f) != l) {
2422 error_setg(errp, QERR_IO_ERROR);
2423 goto exit;
2424 }
2425 addr += l;
2426 size -= l;
2427 }
2428
2429 exit:
2430 fclose(f);
2431 }
2432
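/*
 * QMP 'pmemsave': write @size bytes of guest physical memory starting
 * at @addr to @filename.
 */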
2433 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2434 Error **errp)
2435 {
2436 FILE *f;
2437 uint32_t l;
2438 uint8_t buf[1024];
2439
2440 f = fopen(filename, "wb");
2441 if (!f) {
2442 error_setg_file_open(errp, errno, filename);
2443 return;
2444 }
2445
2446 while (size != 0) {
2447 l = sizeof(buf);
2448 if (l > size) {
2449 l = size;
}
2450 cpu_physical_memory_read(addr, buf, l);
2451 if (fwrite(buf, 1, l, f) != l) {
2452 error_setg(errp, QERR_IO_ERROR);
2453 goto exit;
2454 }
2455 addr += l;
2456 size -= l;
2457 }
2458
2459 exit:
2460 fclose(f);
2461 }
2462
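/* QMP 'inject-nmi': forward an NMI request to the machine's NMI
 * handler, passing the monitor's currently selected CPU index. */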
2463 void qmp_inject_nmi(Error **errp)
2464 {
2465 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2466 }
2467
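/*
 * Print icount drift statistics: the host vs. guest clock difference,
 * plus the maximum observed guest delay/advance when the -icount align
 * option is enabled.  Does nothing unless icount mode is in use.
 */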
2468 void dump_drift_info(void)
2469 {
2470 if (!use_icount) {
2471 return;
2472 }
2473
2474 qemu_printf("Host - Guest clock %"PRIi64" ms\n",
2475 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2476 if (icount_align_option) {
2477 qemu_printf("Max guest delay %"PRIi64" ms\n",
2478 -max_delay / SCALE_MS);
2479 qemu_printf("Max guest advance %"PRIi64" ms\n",
2480 max_advance / SCALE_MS);
2481 } else {
2482 qemu_printf("Max guest delay NA\n");
2483 qemu_printf("Max guest advance NA\n");
2484 }
2485 }