]> git.proxmox.com Git - mirror_qemu.git/blob - cpus.c
Merge remote-tracking branch 'remotes/stsquad/tags/pull-testing-next-251019-3' into...
[mirror_qemu.git] / cpus.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "qemu/osdep.h"
26 #include "qemu-common.h"
27 #include "qemu/config-file.h"
28 #include "migration/vmstate.h"
29 #include "monitor/monitor.h"
30 #include "qapi/error.h"
31 #include "qapi/qapi-commands-misc.h"
32 #include "qapi/qapi-events-run-state.h"
33 #include "qapi/qmp/qerror.h"
34 #include "qemu/error-report.h"
35 #include "qemu/qemu-print.h"
36 #include "sysemu/tcg.h"
37 #include "sysemu/block-backend.h"
38 #include "exec/gdbstub.h"
39 #include "sysemu/dma.h"
40 #include "sysemu/hw_accel.h"
41 #include "sysemu/kvm.h"
42 #include "sysemu/hax.h"
43 #include "sysemu/hvf.h"
44 #include "sysemu/whpx.h"
45 #include "exec/exec-all.h"
46
47 #include "qemu/thread.h"
48 #include "sysemu/cpus.h"
49 #include "sysemu/qtest.h"
50 #include "qemu/main-loop.h"
51 #include "qemu/option.h"
52 #include "qemu/bitmap.h"
53 #include "qemu/seqlock.h"
54 #include "qemu/guest-random.h"
55 #include "tcg.h"
56 #include "hw/nmi.h"
57 #include "sysemu/replay.h"
58 #include "sysemu/runstate.h"
59 #include "hw/boards.h"
60 #include "hw/hw.h"
61
62 #ifdef CONFIG_LINUX
63
64 #include <sys/prctl.h>
65
66 #ifndef PR_MCE_KILL
67 #define PR_MCE_KILL 33
68 #endif
69
70 #ifndef PR_MCE_KILL_SET
71 #define PR_MCE_KILL_SET 1
72 #endif
73
74 #ifndef PR_MCE_KILL_EARLY
75 #define PR_MCE_KILL_EARLY 1
76 #endif
77
78 #endif /* CONFIG_LINUX */
79
80 static QemuMutex qemu_global_mutex;
81
82 int64_t max_delay;
83 int64_t max_advance;
84
85 /* vcpu throttling controls */
86 static QEMUTimer *throttle_timer;
87 static unsigned int throttle_percentage;
88
89 #define CPU_THROTTLE_PCT_MIN 1
90 #define CPU_THROTTLE_PCT_MAX 99
91 #define CPU_THROTTLE_TIMESLICE_NS 10000000
92
93 bool cpu_is_stopped(CPUState *cpu)
94 {
95 return cpu->stopped || !runstate_is_running();
96 }
97
98 static bool cpu_thread_is_idle(CPUState *cpu)
99 {
100 if (cpu->stop || cpu->queued_work_first) {
101 return false;
102 }
103 if (cpu_is_stopped(cpu)) {
104 return true;
105 }
106 if (!cpu->halted || cpu_has_work(cpu) ||
107 kvm_halt_in_kernel()) {
108 return false;
109 }
110 return true;
111 }
112
113 static bool all_cpu_threads_idle(void)
114 {
115 CPUState *cpu;
116
117 CPU_FOREACH(cpu) {
118 if (!cpu_thread_is_idle(cpu)) {
119 return false;
120 }
121 }
122 return true;
123 }
124
125 /***********************************************************/
126 /* guest cycle counter */
127
128 /* Protected by TimersState seqlock */
129
130 static bool icount_sleep = true;
131 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
132 #define MAX_ICOUNT_SHIFT 10
133
134 typedef struct TimersState {
135 /* Protected by BQL. */
136 int64_t cpu_ticks_prev;
137 int64_t cpu_ticks_offset;
138
139 /* Protect fields that can be respectively read outside the
140 * BQL, and written from multiple threads.
141 */
142 QemuSeqLock vm_clock_seqlock;
143 QemuSpin vm_clock_lock;
144
145 int16_t cpu_ticks_enabled;
146
147 /* Conversion factor from emulated instructions to virtual clock ticks. */
148 int16_t icount_time_shift;
149
150 /* Compensate for varying guest execution speed. */
151 int64_t qemu_icount_bias;
152
153 int64_t vm_clock_warp_start;
154 int64_t cpu_clock_offset;
155
156 /* Only written by TCG thread */
157 int64_t qemu_icount;
158
159 /* for adjusting icount */
160 QEMUTimer *icount_rt_timer;
161 QEMUTimer *icount_vm_timer;
162 QEMUTimer *icount_warp_timer;
163 } TimersState;
164
165 static TimersState timers_state;
166 bool mttcg_enabled;
167
168 /*
169 * We default to false if we know other options have been enabled
170 * which are currently incompatible with MTTCG. Otherwise when each
171 * guest (target) has been updated to support:
172 * - atomic instructions
173 * - memory ordering primitives (barriers)
174 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
175 *
176 * Once a guest architecture has been converted to the new primitives
177 * there are two remaining limitations to check.
178 *
179 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
180 * - The host must have a stronger memory order than the guest
181 *
182 * It may be possible in future to support strong guests on weak hosts
183 * but that will require tagging all load/stores in a guest with their
184 * implicit memory order requirements which would likely slow things
185 * down a lot.
186 */
187
/*
 * Return true when every ordering bit in TCG_GUEST_DEFAULT_MO is also set
 * in TCG_TARGET_DEFAULT_MO.  If either macro is missing the answer is
 * false.
 */
static bool check_tcg_memory_orders_compatible(void)
{
    bool compatible = false;

#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
    compatible = (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
#endif
    return compatible;
}
196
197 static bool default_mttcg_enabled(void)
198 {
199 if (use_icount || TCG_OVERSIZED_GUEST) {
200 return false;
201 } else {
202 #ifdef TARGET_SUPPORTS_MTTCG
203 return check_tcg_memory_orders_compatible();
204 #else
205 return false;
206 #endif
207 }
208 }
209
210 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
211 {
212 const char *t = qemu_opt_get(opts, "thread");
213 if (t) {
214 if (strcmp(t, "multi") == 0) {
215 if (TCG_OVERSIZED_GUEST) {
216 error_setg(errp, "No MTTCG when guest word size > hosts");
217 } else if (use_icount) {
218 error_setg(errp, "No MTTCG when icount is enabled");
219 } else {
220 #ifndef TARGET_SUPPORTS_MTTCG
221 warn_report("Guest not yet converted to MTTCG - "
222 "you may get unexpected results");
223 #endif
224 if (!check_tcg_memory_orders_compatible()) {
225 warn_report("Guest expects a stronger memory ordering "
226 "than the host provides");
227 error_printf("This may cause strange/hard to debug errors\n");
228 }
229 mttcg_enabled = true;
230 }
231 } else if (strcmp(t, "single") == 0) {
232 mttcg_enabled = false;
233 } else {
234 error_setg(errp, "Invalid 'thread' setting %s", t);
235 }
236 } else {
237 mttcg_enabled = default_mttcg_enabled();
238 }
239 }
240
241 /* The current number of executed instructions is based on what we
242 * originally budgeted minus the current state of the decrementing
243 * icount counters in extra/u16.low.
244 */
245 static int64_t cpu_get_icount_executed(CPUState *cpu)
246 {
247 return (cpu->icount_budget -
248 (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
249 }
250
251 /*
252 * Update the global shared timer_state.qemu_icount to take into
253 * account executed instructions. This is done by the TCG vCPU
254 * thread so the main-loop can see time has moved forward.
255 */
256 static void cpu_update_icount_locked(CPUState *cpu)
257 {
258 int64_t executed = cpu_get_icount_executed(cpu);
259 cpu->icount_budget -= executed;
260
261 atomic_set_i64(&timers_state.qemu_icount,
262 timers_state.qemu_icount + executed);
263 }
264
265 /*
266 * Update the global shared timer_state.qemu_icount to take into
267 * account executed instructions. This is done by the TCG vCPU
268 * thread so the main-loop can see time has moved forward.
269 */
270 void cpu_update_icount(CPUState *cpu)
271 {
272 seqlock_write_lock(&timers_state.vm_clock_seqlock,
273 &timers_state.vm_clock_lock);
274 cpu_update_icount_locked(cpu);
275 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
276 &timers_state.vm_clock_lock);
277 }
278
279 static int64_t cpu_get_icount_raw_locked(void)
280 {
281 CPUState *cpu = current_cpu;
282
283 if (cpu && cpu->running) {
284 if (!cpu->can_do_io) {
285 error_report("Bad icount read");
286 exit(1);
287 }
288 /* Take into account what has run */
289 cpu_update_icount_locked(cpu);
290 }
291 /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
292 return atomic_read_i64(&timers_state.qemu_icount);
293 }
294
295 static int64_t cpu_get_icount_locked(void)
296 {
297 int64_t icount = cpu_get_icount_raw_locked();
298 return atomic_read_i64(&timers_state.qemu_icount_bias) +
299 cpu_icount_to_ns(icount);
300 }
301
302 int64_t cpu_get_icount_raw(void)
303 {
304 int64_t icount;
305 unsigned start;
306
307 do {
308 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
309 icount = cpu_get_icount_raw_locked();
310 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
311
312 return icount;
313 }
314
315 /* Return the virtual CPU time, based on the instruction counter. */
316 int64_t cpu_get_icount(void)
317 {
318 int64_t icount;
319 unsigned start;
320
321 do {
322 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
323 icount = cpu_get_icount_locked();
324 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
325
326 return icount;
327 }
328
329 int64_t cpu_icount_to_ns(int64_t icount)
330 {
331 return icount << atomic_read(&timers_state.icount_time_shift);
332 }
333
334 static int64_t cpu_get_ticks_locked(void)
335 {
336 int64_t ticks = timers_state.cpu_ticks_offset;
337 if (timers_state.cpu_ticks_enabled) {
338 ticks += cpu_get_host_ticks();
339 }
340
341 if (timers_state.cpu_ticks_prev > ticks) {
342 /* Non increasing ticks may happen if the host uses software suspend. */
343 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
344 ticks = timers_state.cpu_ticks_prev;
345 }
346
347 timers_state.cpu_ticks_prev = ticks;
348 return ticks;
349 }
350
351 /* return the time elapsed in VM between vm_start and vm_stop. Unless
352 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
353 * counter.
354 */
355 int64_t cpu_get_ticks(void)
356 {
357 int64_t ticks;
358
359 if (use_icount) {
360 return cpu_get_icount();
361 }
362
363 qemu_spin_lock(&timers_state.vm_clock_lock);
364 ticks = cpu_get_ticks_locked();
365 qemu_spin_unlock(&timers_state.vm_clock_lock);
366 return ticks;
367 }
368
369 static int64_t cpu_get_clock_locked(void)
370 {
371 int64_t time;
372
373 time = timers_state.cpu_clock_offset;
374 if (timers_state.cpu_ticks_enabled) {
375 time += get_clock();
376 }
377
378 return time;
379 }
380
381 /* Return the monotonic time elapsed in VM, i.e.,
382 * the time between vm_start and vm_stop
383 */
384 int64_t cpu_get_clock(void)
385 {
386 int64_t ti;
387 unsigned start;
388
389 do {
390 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
391 ti = cpu_get_clock_locked();
392 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
393
394 return ti;
395 }
396
397 /* enable cpu_get_ticks()
398 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
399 */
400 void cpu_enable_ticks(void)
401 {
402 seqlock_write_lock(&timers_state.vm_clock_seqlock,
403 &timers_state.vm_clock_lock);
404 if (!timers_state.cpu_ticks_enabled) {
405 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
406 timers_state.cpu_clock_offset -= get_clock();
407 timers_state.cpu_ticks_enabled = 1;
408 }
409 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
410 &timers_state.vm_clock_lock);
411 }
412
413 /* disable cpu_get_ticks() : the clock is stopped. You must not call
414 * cpu_get_ticks() after that.
415 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
416 */
417 void cpu_disable_ticks(void)
418 {
419 seqlock_write_lock(&timers_state.vm_clock_seqlock,
420 &timers_state.vm_clock_lock);
421 if (timers_state.cpu_ticks_enabled) {
422 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
423 timers_state.cpu_clock_offset = cpu_get_clock_locked();
424 timers_state.cpu_ticks_enabled = 0;
425 }
426 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
427 &timers_state.vm_clock_lock);
428 }
429
430 /* Correlation between real and virtual time is always going to be
431 fairly approximate, so ignore small variation.
432 When the guest is idle real and virtual time will be aligned in
433 the IO wait loop. */
434 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
435
436 static void icount_adjust(void)
437 {
438 int64_t cur_time;
439 int64_t cur_icount;
440 int64_t delta;
441
442 /* Protected by TimersState mutex. */
443 static int64_t last_delta;
444
445 /* If the VM is not running, then do nothing. */
446 if (!runstate_is_running()) {
447 return;
448 }
449
450 seqlock_write_lock(&timers_state.vm_clock_seqlock,
451 &timers_state.vm_clock_lock);
452 cur_time = cpu_get_clock_locked();
453 cur_icount = cpu_get_icount_locked();
454
455 delta = cur_icount - cur_time;
456 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
457 if (delta > 0
458 && last_delta + ICOUNT_WOBBLE < delta * 2
459 && timers_state.icount_time_shift > 0) {
460 /* The guest is getting too far ahead. Slow time down. */
461 atomic_set(&timers_state.icount_time_shift,
462 timers_state.icount_time_shift - 1);
463 }
464 if (delta < 0
465 && last_delta - ICOUNT_WOBBLE > delta * 2
466 && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
467 /* The guest is getting too far behind. Speed time up. */
468 atomic_set(&timers_state.icount_time_shift,
469 timers_state.icount_time_shift + 1);
470 }
471 last_delta = delta;
472 atomic_set_i64(&timers_state.qemu_icount_bias,
473 cur_icount - (timers_state.qemu_icount
474 << timers_state.icount_time_shift));
475 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
476 &timers_state.vm_clock_lock);
477 }
478
479 static void icount_adjust_rt(void *opaque)
480 {
481 timer_mod(timers_state.icount_rt_timer,
482 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
483 icount_adjust();
484 }
485
486 static void icount_adjust_vm(void *opaque)
487 {
488 timer_mod(timers_state.icount_vm_timer,
489 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
490 NANOSECONDS_PER_SECOND / 10);
491 icount_adjust();
492 }
493
494 static int64_t qemu_icount_round(int64_t count)
495 {
496 int shift = atomic_read(&timers_state.icount_time_shift);
497 return (count + (1 << shift) - 1) >> shift;
498 }
499
500 static void icount_warp_rt(void)
501 {
502 unsigned seq;
503 int64_t warp_start;
504
505 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
506 * changes from -1 to another value, so the race here is okay.
507 */
508 do {
509 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
510 warp_start = timers_state.vm_clock_warp_start;
511 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
512
513 if (warp_start == -1) {
514 return;
515 }
516
517 seqlock_write_lock(&timers_state.vm_clock_seqlock,
518 &timers_state.vm_clock_lock);
519 if (runstate_is_running()) {
520 int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
521 cpu_get_clock_locked());
522 int64_t warp_delta;
523
524 warp_delta = clock - timers_state.vm_clock_warp_start;
525 if (use_icount == 2) {
526 /*
527 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
528 * far ahead of real time.
529 */
530 int64_t cur_icount = cpu_get_icount_locked();
531 int64_t delta = clock - cur_icount;
532 warp_delta = MIN(warp_delta, delta);
533 }
534 atomic_set_i64(&timers_state.qemu_icount_bias,
535 timers_state.qemu_icount_bias + warp_delta);
536 }
537 timers_state.vm_clock_warp_start = -1;
538 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
539 &timers_state.vm_clock_lock);
540
541 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
542 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
543 }
544 }
545
/*
 * Callback of icount_warp_timer; simply performs the warp.  @opaque is
 * unused.
 */
static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}
553
554 void qtest_clock_warp(int64_t dest)
555 {
556 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
557 AioContext *aio_context;
558 assert(qtest_enabled());
559 aio_context = qemu_get_aio_context();
560 while (clock < dest) {
561 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
562 QEMU_TIMER_ATTR_ALL);
563 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
564
565 seqlock_write_lock(&timers_state.vm_clock_seqlock,
566 &timers_state.vm_clock_lock);
567 atomic_set_i64(&timers_state.qemu_icount_bias,
568 timers_state.qemu_icount_bias + warp);
569 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
570 &timers_state.vm_clock_lock);
571
572 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
573 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
574 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
575 }
576 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
577 }
578
579 void qemu_start_warp_timer(void)
580 {
581 int64_t clock;
582 int64_t deadline;
583
584 if (!use_icount) {
585 return;
586 }
587
588 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
589 * do not fire, so computing the deadline does not make sense.
590 */
591 if (!runstate_is_running()) {
592 return;
593 }
594
595 if (replay_mode != REPLAY_MODE_PLAY) {
596 if (!all_cpu_threads_idle()) {
597 return;
598 }
599
600 if (qtest_enabled()) {
601 /* When testing, qtest commands advance icount. */
602 return;
603 }
604
605 replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
606 } else {
607 /* warp clock deterministically in record/replay mode */
608 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
609 /* vCPU is sleeping and warp can't be started.
610 It is probably a race condition: notification sent
611 to vCPU was processed in advance and vCPU went to sleep.
612 Therefore we have to wake it up for doing someting. */
613 if (replay_has_checkpoint()) {
614 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
615 }
616 return;
617 }
618 }
619
620 /* We want to use the earliest deadline from ALL vm_clocks */
621 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
622 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
623 ~QEMU_TIMER_ATTR_EXTERNAL);
624 if (deadline < 0) {
625 static bool notified;
626 if (!icount_sleep && !notified) {
627 warn_report("icount sleep disabled and no active timers");
628 notified = true;
629 }
630 return;
631 }
632
633 if (deadline > 0) {
634 /*
635 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
636 * sleep. Otherwise, the CPU might be waiting for a future timer
637 * interrupt to wake it up, but the interrupt never comes because
638 * the vCPU isn't running any insns and thus doesn't advance the
639 * QEMU_CLOCK_VIRTUAL.
640 */
641 if (!icount_sleep) {
642 /*
643 * We never let VCPUs sleep in no sleep icount mode.
644 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
645 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
646 * It is useful when we want a deterministic execution time,
647 * isolated from host latencies.
648 */
649 seqlock_write_lock(&timers_state.vm_clock_seqlock,
650 &timers_state.vm_clock_lock);
651 atomic_set_i64(&timers_state.qemu_icount_bias,
652 timers_state.qemu_icount_bias + deadline);
653 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
654 &timers_state.vm_clock_lock);
655 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
656 } else {
657 /*
658 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
659 * "real" time, (related to the time left until the next event) has
660 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
661 * This avoids that the warps are visible externally; for example,
662 * you will not be sending network packets continuously instead of
663 * every 100ms.
664 */
665 seqlock_write_lock(&timers_state.vm_clock_seqlock,
666 &timers_state.vm_clock_lock);
667 if (timers_state.vm_clock_warp_start == -1
668 || timers_state.vm_clock_warp_start > clock) {
669 timers_state.vm_clock_warp_start = clock;
670 }
671 seqlock_write_unlock(&timers_state.vm_clock_seqlock,
672 &timers_state.vm_clock_lock);
673 timer_mod_anticipate(timers_state.icount_warp_timer,
674 clock + deadline);
675 }
676 } else if (deadline == 0) {
677 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
678 }
679 }
680
681 static void qemu_account_warp_timer(void)
682 {
683 if (!use_icount || !icount_sleep) {
684 return;
685 }
686
687 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
688 * do not fire, so computing the deadline does not make sense.
689 */
690 if (!runstate_is_running()) {
691 return;
692 }
693
694 /* warp clock deterministically in record/replay mode */
695 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
696 return;
697 }
698
699 timer_del(timers_state.icount_warp_timer);
700 icount_warp_rt();
701 }
702
703 static bool icount_state_needed(void *opaque)
704 {
705 return use_icount;
706 }
707
708 static bool warp_timer_state_needed(void *opaque)
709 {
710 TimersState *s = opaque;
711 return s->icount_warp_timer != NULL;
712 }
713
714 static bool adjust_timers_state_needed(void *opaque)
715 {
716 TimersState *s = opaque;
717 return s->icount_rt_timer != NULL;
718 }
719
720 /*
721 * Subsection for warp timer migration is optional, because may not be created
722 */
723 static const VMStateDescription icount_vmstate_warp_timer = {
724 .name = "timer/icount/warp_timer",
725 .version_id = 1,
726 .minimum_version_id = 1,
727 .needed = warp_timer_state_needed,
728 .fields = (VMStateField[]) {
729 VMSTATE_INT64(vm_clock_warp_start, TimersState),
730 VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
731 VMSTATE_END_OF_LIST()
732 }
733 };
734
735 static const VMStateDescription icount_vmstate_adjust_timers = {
736 .name = "timer/icount/timers",
737 .version_id = 1,
738 .minimum_version_id = 1,
739 .needed = adjust_timers_state_needed,
740 .fields = (VMStateField[]) {
741 VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
742 VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
743 VMSTATE_END_OF_LIST()
744 }
745 };
746
747 /*
748 * This is a subsection for icount migration.
749 */
750 static const VMStateDescription icount_vmstate_timers = {
751 .name = "timer/icount",
752 .version_id = 1,
753 .minimum_version_id = 1,
754 .needed = icount_state_needed,
755 .fields = (VMStateField[]) {
756 VMSTATE_INT64(qemu_icount_bias, TimersState),
757 VMSTATE_INT64(qemu_icount, TimersState),
758 VMSTATE_END_OF_LIST()
759 },
760 .subsections = (const VMStateDescription*[]) {
761 &icount_vmstate_warp_timer,
762 &icount_vmstate_adjust_timers,
763 NULL
764 }
765 };
766
767 static const VMStateDescription vmstate_timers = {
768 .name = "timer",
769 .version_id = 2,
770 .minimum_version_id = 1,
771 .fields = (VMStateField[]) {
772 VMSTATE_INT64(cpu_ticks_offset, TimersState),
773 VMSTATE_UNUSED(8),
774 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
775 VMSTATE_END_OF_LIST()
776 },
777 .subsections = (const VMStateDescription*[]) {
778 &icount_vmstate_timers,
779 NULL
780 }
781 };
782
783 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
784 {
785 double pct;
786 double throttle_ratio;
787 int64_t sleeptime_ns, endtime_ns;
788
789 if (!cpu_throttle_get_percentage()) {
790 return;
791 }
792
793 pct = (double)cpu_throttle_get_percentage()/100;
794 throttle_ratio = pct / (1 - pct);
795 /* Add 1ns to fix double's rounding error (like 0.9999999...) */
796 sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
797 endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
798 while (sleeptime_ns > 0 && !cpu->stop) {
799 if (sleeptime_ns > SCALE_MS) {
800 qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
801 sleeptime_ns / SCALE_MS);
802 } else {
803 qemu_mutex_unlock_iothread();
804 g_usleep(sleeptime_ns / SCALE_US);
805 qemu_mutex_lock_iothread();
806 }
807 sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
808 }
809 atomic_set(&cpu->throttle_thread_scheduled, 0);
810 }
811
812 static void cpu_throttle_timer_tick(void *opaque)
813 {
814 CPUState *cpu;
815 double pct;
816
817 /* Stop the timer if needed */
818 if (!cpu_throttle_get_percentage()) {
819 return;
820 }
821 CPU_FOREACH(cpu) {
822 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
823 async_run_on_cpu(cpu, cpu_throttle_thread,
824 RUN_ON_CPU_NULL);
825 }
826 }
827
828 pct = (double)cpu_throttle_get_percentage()/100;
829 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
830 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
831 }
832
833 void cpu_throttle_set(int new_throttle_pct)
834 {
835 /* Ensure throttle percentage is within valid range */
836 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
837 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
838
839 atomic_set(&throttle_percentage, new_throttle_pct);
840
841 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
842 CPU_THROTTLE_TIMESLICE_NS);
843 }
844
845 void cpu_throttle_stop(void)
846 {
847 atomic_set(&throttle_percentage, 0);
848 }
849
/* Throttling is active whenever the percentage is non-zero. */
bool cpu_throttle_active(void)
{
    return cpu_throttle_get_percentage() ? true : false;
}
854
855 int cpu_throttle_get_percentage(void)
856 {
857 return atomic_read(&throttle_percentage);
858 }
859
860 void cpu_ticks_init(void)
861 {
862 seqlock_init(&timers_state.vm_clock_seqlock);
863 qemu_spin_init(&timers_state.vm_clock_lock);
864 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
865 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
866 cpu_throttle_timer_tick, NULL);
867 }
868
869 void configure_icount(QemuOpts *opts, Error **errp)
870 {
871 const char *option;
872 char *rem_str = NULL;
873
874 option = qemu_opt_get(opts, "shift");
875 if (!option) {
876 if (qemu_opt_get(opts, "align") != NULL) {
877 error_setg(errp, "Please specify shift option when using align");
878 }
879 return;
880 }
881
882 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
883 if (icount_sleep) {
884 timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
885 icount_timer_cb, NULL);
886 }
887
888 icount_align_option = qemu_opt_get_bool(opts, "align", false);
889
890 if (icount_align_option && !icount_sleep) {
891 error_setg(errp, "align=on and sleep=off are incompatible");
892 }
893 if (strcmp(option, "auto") != 0) {
894 errno = 0;
895 timers_state.icount_time_shift = strtol(option, &rem_str, 0);
896 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
897 error_setg(errp, "icount: Invalid shift value");
898 }
899 use_icount = 1;
900 return;
901 } else if (icount_align_option) {
902 error_setg(errp, "shift=auto and align=on are incompatible");
903 } else if (!icount_sleep) {
904 error_setg(errp, "shift=auto and sleep=off are incompatible");
905 }
906
907 use_icount = 2;
908
909 /* 125MIPS seems a reasonable initial guess at the guest speed.
910 It will be corrected fairly quickly anyway. */
911 timers_state.icount_time_shift = 3;
912
913 /* Have both realtime and virtual time triggers for speed adjustment.
914 The realtime trigger catches emulated time passing too slowly,
915 the virtual time trigger catches emulated time passing too fast.
916 Realtime triggers occur even when idle, so use them less frequently
917 than VM triggers. */
918 timers_state.vm_clock_warp_start = -1;
919 timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
920 icount_adjust_rt, NULL);
921 timer_mod(timers_state.icount_rt_timer,
922 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
923 timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
924 icount_adjust_vm, NULL);
925 timer_mod(timers_state.icount_vm_timer,
926 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
927 NANOSECONDS_PER_SECOND / 10);
928 }
929
930 /***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */
941
942 static QEMUTimer *tcg_kick_vcpu_timer;
943 static CPUState *tcg_current_rr_cpu;
944
945 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
946
947 static inline int64_t qemu_tcg_next_kick(void)
948 {
949 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
950 }
951
952 /* Kick the currently round-robin scheduled vCPU to next */
953 static void qemu_cpu_kick_rr_next_cpu(void)
954 {
955 CPUState *cpu;
956 do {
957 cpu = atomic_mb_read(&tcg_current_rr_cpu);
958 if (cpu) {
959 cpu_exit(cpu);
960 }
961 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
962 }
963
964 /* Kick all RR vCPUs */
965 static void qemu_cpu_kick_rr_cpus(void)
966 {
967 CPUState *cpu;
968
969 CPU_FOREACH(cpu) {
970 cpu_exit(cpu);
971 };
972 }
973
974 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
975 {
976 }
977
978 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
979 {
980 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
981 qemu_notify_event();
982 return;
983 }
984
985 if (qemu_in_vcpu_thread()) {
986 /* A CPU is currently running; kick it back out to the
987 * tcg_cpu_exec() loop so it will recalculate its
988 * icount deadline immediately.
989 */
990 qemu_cpu_kick(current_cpu);
991 } else if (first_cpu) {
992 /* qemu_cpu_kick is not enough to kick a halted CPU out of
993 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
994 * causes cpu_thread_is_idle to return false. This way,
995 * handle_icount_deadline can run.
996 * If we have no CPUs at all for some reason, we don't
997 * need to do anything.
998 */
999 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
1000 }
1001 }
1002
1003 static void kick_tcg_thread(void *opaque)
1004 {
1005 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
1006 qemu_cpu_kick_rr_next_cpu();
1007 }
1008
1009 static void start_tcg_kick_timer(void)
1010 {
1011 assert(!mttcg_enabled);
1012 if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
1013 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
1014 kick_tcg_thread, NULL);
1015 }
1016 if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
1017 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
1018 }
1019 }
1020
1021 static void stop_tcg_kick_timer(void)
1022 {
1023 assert(!mttcg_enabled);
1024 if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
1025 timer_del(tcg_kick_vcpu_timer);
1026 }
1027 }
1028
1029 /***********************************************************/
1030 void hw_error(const char *fmt, ...)
1031 {
1032 va_list ap;
1033 CPUState *cpu;
1034
1035 va_start(ap, fmt);
1036 fprintf(stderr, "qemu: hardware error: ");
1037 vfprintf(stderr, fmt, ap);
1038 fprintf(stderr, "\n");
1039 CPU_FOREACH(cpu) {
1040 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1041 cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1042 }
1043 va_end(ap);
1044 abort();
1045 }
1046
1047 void cpu_synchronize_all_states(void)
1048 {
1049 CPUState *cpu;
1050
1051 CPU_FOREACH(cpu) {
1052 cpu_synchronize_state(cpu);
1053 /* TODO: move to cpu_synchronize_state() */
1054 if (hvf_enabled()) {
1055 hvf_cpu_synchronize_state(cpu);
1056 }
1057 }
1058 }
1059
1060 void cpu_synchronize_all_post_reset(void)
1061 {
1062 CPUState *cpu;
1063
1064 CPU_FOREACH(cpu) {
1065 cpu_synchronize_post_reset(cpu);
1066 /* TODO: move to cpu_synchronize_post_reset() */
1067 if (hvf_enabled()) {
1068 hvf_cpu_synchronize_post_reset(cpu);
1069 }
1070 }
1071 }
1072
1073 void cpu_synchronize_all_post_init(void)
1074 {
1075 CPUState *cpu;
1076
1077 CPU_FOREACH(cpu) {
1078 cpu_synchronize_post_init(cpu);
1079 /* TODO: move to cpu_synchronize_post_init() */
1080 if (hvf_enabled()) {
1081 hvf_cpu_synchronize_post_init(cpu);
1082 }
1083 }
1084 }
1085
1086 void cpu_synchronize_all_pre_loadvm(void)
1087 {
1088 CPUState *cpu;
1089
1090 CPU_FOREACH(cpu) {
1091 cpu_synchronize_pre_loadvm(cpu);
1092 }
1093 }
1094
1095 static int do_vm_stop(RunState state, bool send_stop)
1096 {
1097 int ret = 0;
1098
1099 if (runstate_is_running()) {
1100 cpu_disable_ticks();
1101 pause_all_vcpus();
1102 runstate_set(state);
1103 vm_state_notify(0, state);
1104 if (send_stop) {
1105 qapi_event_send_stop();
1106 }
1107 }
1108
1109 bdrv_drain_all();
1110 ret = bdrv_flush_all();
1111
1112 return ret;
1113 }
1114
1115 /* Special vm_stop() variant for terminating the process. Historically clients
1116 * did not expect a QMP STOP event and so we need to retain compatibility.
1117 */
1118 int vm_shutdown(void)
1119 {
1120 return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1121 }
1122
1123 static bool cpu_can_run(CPUState *cpu)
1124 {
1125 if (cpu->stop) {
1126 return false;
1127 }
1128 if (cpu_is_stopped(cpu)) {
1129 return false;
1130 }
1131 return true;
1132 }
1133
1134 static void cpu_handle_guest_debug(CPUState *cpu)
1135 {
1136 gdb_set_stop_cpu(cpu);
1137 qemu_system_debug_request();
1138 cpu->stopped = true;
1139 }
1140
1141 #ifdef CONFIG_LINUX
1142 static void sigbus_reraise(void)
1143 {
1144 sigset_t set;
1145 struct sigaction action;
1146
1147 memset(&action, 0, sizeof(action));
1148 action.sa_handler = SIG_DFL;
1149 if (!sigaction(SIGBUS, &action, NULL)) {
1150 raise(SIGBUS);
1151 sigemptyset(&set);
1152 sigaddset(&set, SIGBUS);
1153 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1154 }
1155 perror("Failed to re-raise SIGBUS!\n");
1156 abort();
1157 }
1158
1159 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1160 {
1161 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1162 sigbus_reraise();
1163 }
1164
1165 if (current_cpu) {
1166 /* Called asynchronously in VCPU thread. */
1167 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1168 sigbus_reraise();
1169 }
1170 } else {
1171 /* Called synchronously (via signalfd) in main thread. */
1172 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1173 sigbus_reraise();
1174 }
1175 }
1176 }
1177
1178 static void qemu_init_sigbus(void)
1179 {
1180 struct sigaction action;
1181
1182 memset(&action, 0, sizeof(action));
1183 action.sa_flags = SA_SIGINFO;
1184 action.sa_sigaction = sigbus_handler;
1185 sigaction(SIGBUS, &action, NULL);
1186
1187 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1188 }
1189 #else /* !CONFIG_LINUX */
/* Non-Linux build: no SIGBUS handling is set up. */
static void qemu_init_sigbus(void)
{
    /* Intentionally empty. */
}
1193 #endif /* !CONFIG_LINUX */
1194
/* Thread handle filled in by qemu_init_cpu_loop() via qemu_thread_get_self(). */
static QemuThread io_thread;

/*
 * cpu creation: signalled by the vCPU thread functions whenever they change
 * cpu->created; qemu_init_vcpu() waits on it until the vCPU is created.
 */
static QemuCond qemu_cpu_cond;
/*
 * system init; also the condition broadcast by qemu_cpu_stop() and waited
 * on by pause_all_vcpus() until all vCPUs report stopped.
 */
static QemuCond qemu_pause_cond;
1201
1202 void qemu_init_cpu_loop(void)
1203 {
1204 qemu_init_sigbus();
1205 qemu_cond_init(&qemu_cpu_cond);
1206 qemu_cond_init(&qemu_pause_cond);
1207 qemu_mutex_init(&qemu_global_mutex);
1208
1209 qemu_thread_get_self(&io_thread);
1210 }
1211
1212 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1213 {
1214 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1215 }
1216
1217 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1218 {
1219 if (kvm_destroy_vcpu(cpu) < 0) {
1220 error_report("kvm_destroy_vcpu failed");
1221 exit(EXIT_FAILURE);
1222 }
1223 }
1224
1225 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1226 {
1227 }
1228
1229 static void qemu_cpu_stop(CPUState *cpu, bool exit)
1230 {
1231 g_assert(qemu_cpu_is_self(cpu));
1232 cpu->stop = false;
1233 cpu->stopped = true;
1234 if (exit) {
1235 cpu_exit(cpu);
1236 }
1237 qemu_cond_broadcast(&qemu_pause_cond);
1238 }
1239
1240 static void qemu_wait_io_event_common(CPUState *cpu)
1241 {
1242 atomic_mb_set(&cpu->thread_kicked, false);
1243 if (cpu->stop) {
1244 qemu_cpu_stop(cpu, false);
1245 }
1246 process_queued_cpu_work(cpu);
1247 }
1248
1249 static void qemu_tcg_rr_wait_io_event(void)
1250 {
1251 CPUState *cpu;
1252
1253 while (all_cpu_threads_idle()) {
1254 stop_tcg_kick_timer();
1255 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1256 }
1257
1258 start_tcg_kick_timer();
1259
1260 CPU_FOREACH(cpu) {
1261 qemu_wait_io_event_common(cpu);
1262 }
1263 }
1264
1265 static void qemu_wait_io_event(CPUState *cpu)
1266 {
1267 while (cpu_thread_is_idle(cpu)) {
1268 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1269 }
1270
1271 #ifdef _WIN32
1272 /* Eat dummy APC queued by qemu_cpu_kick_thread. */
1273 if (!tcg_enabled()) {
1274 SleepEx(0, TRUE);
1275 }
1276 #endif
1277 qemu_wait_io_event_common(cpu);
1278 }
1279
1280 static void *qemu_kvm_cpu_thread_fn(void *arg)
1281 {
1282 CPUState *cpu = arg;
1283 int r;
1284
1285 rcu_register_thread();
1286
1287 qemu_mutex_lock_iothread();
1288 qemu_thread_get_self(cpu->thread);
1289 cpu->thread_id = qemu_get_thread_id();
1290 cpu->can_do_io = 1;
1291 current_cpu = cpu;
1292
1293 r = kvm_init_vcpu(cpu);
1294 if (r < 0) {
1295 error_report("kvm_init_vcpu failed: %s", strerror(-r));
1296 exit(1);
1297 }
1298
1299 kvm_init_cpu_signals(cpu);
1300
1301 /* signal CPU creation */
1302 cpu->created = true;
1303 qemu_cond_signal(&qemu_cpu_cond);
1304 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1305
1306 do {
1307 if (cpu_can_run(cpu)) {
1308 r = kvm_cpu_exec(cpu);
1309 if (r == EXCP_DEBUG) {
1310 cpu_handle_guest_debug(cpu);
1311 }
1312 }
1313 qemu_wait_io_event(cpu);
1314 } while (!cpu->unplug || cpu_can_run(cpu));
1315
1316 qemu_kvm_destroy_vcpu(cpu);
1317 cpu->created = false;
1318 qemu_cond_signal(&qemu_cpu_cond);
1319 qemu_mutex_unlock_iothread();
1320 rcu_unregister_thread();
1321 return NULL;
1322 }
1323
1324 static void *qemu_dummy_cpu_thread_fn(void *arg)
1325 {
1326 #ifdef _WIN32
1327 error_report("qtest is not supported under Windows");
1328 exit(1);
1329 #else
1330 CPUState *cpu = arg;
1331 sigset_t waitset;
1332 int r;
1333
1334 rcu_register_thread();
1335
1336 qemu_mutex_lock_iothread();
1337 qemu_thread_get_self(cpu->thread);
1338 cpu->thread_id = qemu_get_thread_id();
1339 cpu->can_do_io = 1;
1340 current_cpu = cpu;
1341
1342 sigemptyset(&waitset);
1343 sigaddset(&waitset, SIG_IPI);
1344
1345 /* signal CPU creation */
1346 cpu->created = true;
1347 qemu_cond_signal(&qemu_cpu_cond);
1348 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1349
1350 do {
1351 qemu_mutex_unlock_iothread();
1352 do {
1353 int sig;
1354 r = sigwait(&waitset, &sig);
1355 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1356 if (r == -1) {
1357 perror("sigwait");
1358 exit(1);
1359 }
1360 qemu_mutex_lock_iothread();
1361 qemu_wait_io_event(cpu);
1362 } while (!cpu->unplug);
1363
1364 qemu_mutex_unlock_iothread();
1365 rcu_unregister_thread();
1366 return NULL;
1367 #endif
1368 }
1369
1370 static int64_t tcg_get_icount_limit(void)
1371 {
1372 int64_t deadline;
1373
1374 if (replay_mode != REPLAY_MODE_PLAY) {
1375 /*
1376 * Include all the timers, because they may need an attention.
1377 * Too long CPU execution may create unnecessary delay in UI.
1378 */
1379 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1380 QEMU_TIMER_ATTR_ALL);
1381
1382 /* Maintain prior (possibly buggy) behaviour where if no deadline
1383 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1384 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1385 * nanoseconds.
1386 */
1387 if ((deadline < 0) || (deadline > INT32_MAX)) {
1388 deadline = INT32_MAX;
1389 }
1390
1391 return qemu_icount_round(deadline);
1392 } else {
1393 return replay_get_instructions();
1394 }
1395 }
1396
1397 static void handle_icount_deadline(void)
1398 {
1399 assert(qemu_in_vcpu_thread());
1400 if (use_icount) {
1401 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1402 QEMU_TIMER_ATTR_ALL);
1403
1404 if (deadline == 0) {
1405 /* Wake up other AioContexts. */
1406 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1407 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1408 }
1409 }
1410 }
1411
1412 static void prepare_icount_for_run(CPUState *cpu)
1413 {
1414 if (use_icount) {
1415 int insns_left;
1416
1417 /* These should always be cleared by process_icount_data after
1418 * each vCPU execution. However u16.high can be raised
1419 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1420 */
1421 g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1422 g_assert(cpu->icount_extra == 0);
1423
1424 cpu->icount_budget = tcg_get_icount_limit();
1425 insns_left = MIN(0xffff, cpu->icount_budget);
1426 cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1427 cpu->icount_extra = cpu->icount_budget - insns_left;
1428
1429 replay_mutex_lock();
1430 }
1431 }
1432
1433 static void process_icount_data(CPUState *cpu)
1434 {
1435 if (use_icount) {
1436 /* Account for executed instructions */
1437 cpu_update_icount(cpu);
1438
1439 /* Reset the counters */
1440 cpu_neg(cpu)->icount_decr.u16.low = 0;
1441 cpu->icount_extra = 0;
1442 cpu->icount_budget = 0;
1443
1444 replay_account_executed_instructions();
1445
1446 replay_mutex_unlock();
1447 }
1448 }
1449
1450
1451 static int tcg_cpu_exec(CPUState *cpu)
1452 {
1453 int ret;
1454 #ifdef CONFIG_PROFILER
1455 int64_t ti;
1456 #endif
1457
1458 assert(tcg_enabled());
1459 #ifdef CONFIG_PROFILER
1460 ti = profile_getclock();
1461 #endif
1462 cpu_exec_start(cpu);
1463 ret = cpu_exec(cpu);
1464 cpu_exec_end(cpu);
1465 #ifdef CONFIG_PROFILER
1466 atomic_set(&tcg_ctx->prof.cpu_exec_time,
1467 tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1468 #endif
1469 return ret;
1470 }
1471
1472 /* Destroy any remaining vCPUs which have been unplugged and have
1473 * finished running
1474 */
1475 static void deal_with_unplugged_cpus(void)
1476 {
1477 CPUState *cpu;
1478
1479 CPU_FOREACH(cpu) {
1480 if (cpu->unplug && !cpu_can_run(cpu)) {
1481 qemu_tcg_destroy_vcpu(cpu);
1482 cpu->created = false;
1483 qemu_cond_signal(&qemu_cpu_cond);
1484 break;
1485 }
1486 }
1487 }
1488
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 */

/*
 * Thread function of the single round-robin TCG thread.
 *
 * @arg: the CPUState the thread was created for. The local 'cpu' is reused
 *       afterwards as the cursor walking the CPU list.
 *
 * The main loop below is a while (1) with no exit, so the trailing
 * rcu_unregister_thread()/return are not reached.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    /* Wake qemu_init_vcpu(), which waits for cpu->created. */
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    /* Start the round-robin walk at the head of the CPU list. */
    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /*
         * Drop the iothread lock before taking the replay mutex, then
         * retake the iothread lock, so the replay mutex is acquired first.
         */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
        qemu_account_warp_timer();

        /* Run the timers here. This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* The previous pass ran off the end of the list: wrap around. */
        if (!cpu) {
            cpu = first_cpu;
        }

        /*
         * Run CPUs in turn until one has queued work or an exit request,
         * or the end of the list is reached.
         */
        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            /* The virtual clock is disabled while SSTEP_NOTIMER is set. */
            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code runs without the iothread lock held. */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* Skip past a CPU that is being unplugged. */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay. */
        atomic_set(&tcg_current_rr_cpu, NULL);

        /* Acknowledge the exit request that ended the inner loop, if any. */
        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    rcu_unregister_thread();
    return NULL;
}
1611
1612 static void *qemu_hax_cpu_thread_fn(void *arg)
1613 {
1614 CPUState *cpu = arg;
1615 int r;
1616
1617 rcu_register_thread();
1618 qemu_mutex_lock_iothread();
1619 qemu_thread_get_self(cpu->thread);
1620
1621 cpu->thread_id = qemu_get_thread_id();
1622 cpu->created = true;
1623 current_cpu = cpu;
1624
1625 hax_init_vcpu(cpu);
1626 qemu_cond_signal(&qemu_cpu_cond);
1627 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1628
1629 do {
1630 if (cpu_can_run(cpu)) {
1631 r = hax_smp_cpu_exec(cpu);
1632 if (r == EXCP_DEBUG) {
1633 cpu_handle_guest_debug(cpu);
1634 }
1635 }
1636
1637 qemu_wait_io_event(cpu);
1638 } while (!cpu->unplug || cpu_can_run(cpu));
1639 rcu_unregister_thread();
1640 return NULL;
1641 }
1642
1643 /* The HVF-specific vCPU thread function. This one should only run when the host
1644 * CPU supports the VMX "unrestricted guest" feature. */
1645 static void *qemu_hvf_cpu_thread_fn(void *arg)
1646 {
1647 CPUState *cpu = arg;
1648
1649 int r;
1650
1651 assert(hvf_enabled());
1652
1653 rcu_register_thread();
1654
1655 qemu_mutex_lock_iothread();
1656 qemu_thread_get_self(cpu->thread);
1657
1658 cpu->thread_id = qemu_get_thread_id();
1659 cpu->can_do_io = 1;
1660 current_cpu = cpu;
1661
1662 hvf_init_vcpu(cpu);
1663
1664 /* signal CPU creation */
1665 cpu->created = true;
1666 qemu_cond_signal(&qemu_cpu_cond);
1667 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1668
1669 do {
1670 if (cpu_can_run(cpu)) {
1671 r = hvf_vcpu_exec(cpu);
1672 if (r == EXCP_DEBUG) {
1673 cpu_handle_guest_debug(cpu);
1674 }
1675 }
1676 qemu_wait_io_event(cpu);
1677 } while (!cpu->unplug || cpu_can_run(cpu));
1678
1679 hvf_vcpu_destroy(cpu);
1680 cpu->created = false;
1681 qemu_cond_signal(&qemu_cpu_cond);
1682 qemu_mutex_unlock_iothread();
1683 rcu_unregister_thread();
1684 return NULL;
1685 }
1686
1687 static void *qemu_whpx_cpu_thread_fn(void *arg)
1688 {
1689 CPUState *cpu = arg;
1690 int r;
1691
1692 rcu_register_thread();
1693
1694 qemu_mutex_lock_iothread();
1695 qemu_thread_get_self(cpu->thread);
1696 cpu->thread_id = qemu_get_thread_id();
1697 current_cpu = cpu;
1698
1699 r = whpx_init_vcpu(cpu);
1700 if (r < 0) {
1701 fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1702 exit(1);
1703 }
1704
1705 /* signal CPU creation */
1706 cpu->created = true;
1707 qemu_cond_signal(&qemu_cpu_cond);
1708 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1709
1710 do {
1711 if (cpu_can_run(cpu)) {
1712 r = whpx_vcpu_exec(cpu);
1713 if (r == EXCP_DEBUG) {
1714 cpu_handle_guest_debug(cpu);
1715 }
1716 }
1717 while (cpu_thread_is_idle(cpu)) {
1718 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1719 }
1720 qemu_wait_io_event_common(cpu);
1721 } while (!cpu->unplug || cpu_can_run(cpu));
1722
1723 whpx_destroy_vcpu(cpu);
1724 cpu->created = false;
1725 qemu_cond_signal(&qemu_cpu_cond);
1726 qemu_mutex_unlock_iothread();
1727 rcu_unregister_thread();
1728 return NULL;
1729 }
1730
#ifdef _WIN32
/*
 * No-op APC routine queued by qemu_cpu_kick_thread() with QueueUserAPC();
 * the callback itself has nothing to do.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
    /* Intentionally empty. */
}
#endif
1736
1737 /* Multi-threaded TCG
1738 *
1739 * In the multi-threaded case each vCPU has its own thread. The TLS
1740 * variable current_cpu can be used deep in the code to find the
1741 * current CPUState for a given thread.
1742 */
1743
1744 static void *qemu_tcg_cpu_thread_fn(void *arg)
1745 {
1746 CPUState *cpu = arg;
1747
1748 assert(tcg_enabled());
1749 g_assert(!use_icount);
1750
1751 rcu_register_thread();
1752 tcg_register_thread();
1753
1754 qemu_mutex_lock_iothread();
1755 qemu_thread_get_self(cpu->thread);
1756
1757 cpu->thread_id = qemu_get_thread_id();
1758 cpu->created = true;
1759 cpu->can_do_io = 1;
1760 current_cpu = cpu;
1761 qemu_cond_signal(&qemu_cpu_cond);
1762 qemu_guest_random_seed_thread_part2(cpu->random_seed);
1763
1764 /* process any pending work */
1765 cpu->exit_request = 1;
1766
1767 do {
1768 if (cpu_can_run(cpu)) {
1769 int r;
1770 qemu_mutex_unlock_iothread();
1771 r = tcg_cpu_exec(cpu);
1772 qemu_mutex_lock_iothread();
1773 switch (r) {
1774 case EXCP_DEBUG:
1775 cpu_handle_guest_debug(cpu);
1776 break;
1777 case EXCP_HALTED:
1778 /* during start-up the vCPU is reset and the thread is
1779 * kicked several times. If we don't ensure we go back
1780 * to sleep in the halted state we won't cleanly
1781 * start-up when the vCPU is enabled.
1782 *
1783 * cpu->halted should ensure we sleep in wait_io_event
1784 */
1785 g_assert(cpu->halted);
1786 break;
1787 case EXCP_ATOMIC:
1788 qemu_mutex_unlock_iothread();
1789 cpu_exec_step_atomic(cpu);
1790 qemu_mutex_lock_iothread();
1791 default:
1792 /* Ignore everything else? */
1793 break;
1794 }
1795 }
1796
1797 atomic_mb_set(&cpu->exit_request, 0);
1798 qemu_wait_io_event(cpu);
1799 } while (!cpu->unplug || cpu_can_run(cpu));
1800
1801 qemu_tcg_destroy_vcpu(cpu);
1802 cpu->created = false;
1803 qemu_cond_signal(&qemu_cpu_cond);
1804 qemu_mutex_unlock_iothread();
1805 rcu_unregister_thread();
1806 return NULL;
1807 }
1808
1809 static void qemu_cpu_kick_thread(CPUState *cpu)
1810 {
1811 #ifndef _WIN32
1812 int err;
1813
1814 if (cpu->thread_kicked) {
1815 return;
1816 }
1817 cpu->thread_kicked = true;
1818 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1819 if (err && err != ESRCH) {
1820 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1821 exit(1);
1822 }
1823 #else /* _WIN32 */
1824 if (!qemu_cpu_is_self(cpu)) {
1825 if (whpx_enabled()) {
1826 whpx_vcpu_kick(cpu);
1827 } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1828 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1829 __func__, GetLastError());
1830 exit(1);
1831 }
1832 }
1833 #endif
1834 }
1835
1836 void qemu_cpu_kick(CPUState *cpu)
1837 {
1838 qemu_cond_broadcast(cpu->halt_cond);
1839 if (tcg_enabled()) {
1840 if (qemu_tcg_mttcg_enabled()) {
1841 cpu_exit(cpu);
1842 } else {
1843 qemu_cpu_kick_rr_cpus();
1844 }
1845 } else {
1846 if (hax_enabled()) {
1847 /*
1848 * FIXME: race condition with the exit_request check in
1849 * hax_vcpu_hax_exec
1850 */
1851 cpu->exit_request = 1;
1852 }
1853 qemu_cpu_kick_thread(cpu);
1854 }
1855 }
1856
1857 void qemu_cpu_kick_self(void)
1858 {
1859 assert(current_cpu);
1860 qemu_cpu_kick_thread(current_cpu);
1861 }
1862
1863 bool qemu_cpu_is_self(CPUState *cpu)
1864 {
1865 return qemu_thread_is_self(cpu->thread);
1866 }
1867
1868 bool qemu_in_vcpu_thread(void)
1869 {
1870 return current_cpu && qemu_cpu_is_self(current_cpu);
1871 }
1872
/*
 * Per-thread flag: set by qemu_mutex_lock_iothread_impl() and cleared by
 * qemu_mutex_unlock_iothread().
 */
static __thread bool iothread_locked = false;

/* Return the calling thread's iothread_locked flag. */
bool qemu_mutex_iothread_locked(void)
{
    bool held = iothread_locked;

    return held;
}
1879
1880 /*
1881 * The BQL is taken from so many places that it is worth profiling the
1882 * callers directly, instead of funneling them all through a single function.
1883 */
1884 void qemu_mutex_lock_iothread_impl(const char *file, int line)
1885 {
1886 QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1887
1888 g_assert(!qemu_mutex_iothread_locked());
1889 bql_lock(&qemu_global_mutex, file, line);
1890 iothread_locked = true;
1891 }
1892
1893 void qemu_mutex_unlock_iothread(void)
1894 {
1895 g_assert(qemu_mutex_iothread_locked());
1896 iothread_locked = false;
1897 qemu_mutex_unlock(&qemu_global_mutex);
1898 }
1899
1900 static bool all_vcpus_paused(void)
1901 {
1902 CPUState *cpu;
1903
1904 CPU_FOREACH(cpu) {
1905 if (!cpu->stopped) {
1906 return false;
1907 }
1908 }
1909
1910 return true;
1911 }
1912
/*
 * Stop every vCPU and wait until all of them report 'stopped'.
 *
 * The virtual clock is disabled first. A CPU whose thread is the caller is
 * stopped directly; every other CPU gets a stop request and a kick. The
 * function then waits on qemu_pause_cond, re-kicking all CPUs after each
 * wakeup, until all_vcpus_paused() holds.
 *
 * The replay mutex is released while waiting and re-acquired at the end.
 * NOTE(review): the unconditional replay_mutex_unlock() implies the caller
 * holds the replay mutex on entry - confirm against the callers.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            /* We are this CPU's thread: stop it in place. */
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /*
     * Release the iothread lock before retaking the replay mutex, then
     * take the iothread lock again, so the replay mutex is acquired first.
     */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1943
1944 void cpu_resume(CPUState *cpu)
1945 {
1946 cpu->stop = false;
1947 cpu->stopped = false;
1948 qemu_cpu_kick(cpu);
1949 }
1950
1951 void resume_all_vcpus(void)
1952 {
1953 CPUState *cpu;
1954
1955 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1956 CPU_FOREACH(cpu) {
1957 cpu_resume(cpu);
1958 }
1959 }
1960
1961 void cpu_remove_sync(CPUState *cpu)
1962 {
1963 cpu->stop = true;
1964 cpu->unplug = true;
1965 qemu_cpu_kick(cpu);
1966 qemu_mutex_unlock_iothread();
1967 qemu_thread_join(cpu->thread);
1968 qemu_mutex_lock_iothread();
1969 }
1970
1971 /* For temporary buffers for forming a name */
1972 #define VCPU_THREAD_NAME_SIZE 16
1973
1974 static void qemu_tcg_init_vcpu(CPUState *cpu)
1975 {
1976 char thread_name[VCPU_THREAD_NAME_SIZE];
1977 static QemuCond *single_tcg_halt_cond;
1978 static QemuThread *single_tcg_cpu_thread;
1979 static int tcg_region_inited;
1980
1981 assert(tcg_enabled());
1982 /*
1983 * Initialize TCG regions--once. Now is a good time, because:
1984 * (1) TCG's init context, prologue and target globals have been set up.
1985 * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1986 * -accel flag is processed, so the check doesn't work then).
1987 */
1988 if (!tcg_region_inited) {
1989 tcg_region_inited = 1;
1990 tcg_region_init();
1991 }
1992
1993 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1994 cpu->thread = g_malloc0(sizeof(QemuThread));
1995 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1996 qemu_cond_init(cpu->halt_cond);
1997
1998 if (qemu_tcg_mttcg_enabled()) {
1999 /* create a thread per vCPU with TCG (MTTCG) */
2000 parallel_cpus = true;
2001 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
2002 cpu->cpu_index);
2003
2004 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
2005 cpu, QEMU_THREAD_JOINABLE);
2006
2007 } else {
2008 /* share a single thread for all cpus with TCG */
2009 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
2010 qemu_thread_create(cpu->thread, thread_name,
2011 qemu_tcg_rr_cpu_thread_fn,
2012 cpu, QEMU_THREAD_JOINABLE);
2013
2014 single_tcg_halt_cond = cpu->halt_cond;
2015 single_tcg_cpu_thread = cpu->thread;
2016 }
2017 #ifdef _WIN32
2018 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2019 #endif
2020 } else {
2021 /* For non-MTTCG cases we share the thread */
2022 cpu->thread = single_tcg_cpu_thread;
2023 cpu->halt_cond = single_tcg_halt_cond;
2024 cpu->thread_id = first_cpu->thread_id;
2025 cpu->can_do_io = 1;
2026 cpu->created = true;
2027 }
2028 }
2029
2030 static void qemu_hax_start_vcpu(CPUState *cpu)
2031 {
2032 char thread_name[VCPU_THREAD_NAME_SIZE];
2033
2034 cpu->thread = g_malloc0(sizeof(QemuThread));
2035 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2036 qemu_cond_init(cpu->halt_cond);
2037
2038 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2039 cpu->cpu_index);
2040 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2041 cpu, QEMU_THREAD_JOINABLE);
2042 #ifdef _WIN32
2043 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2044 #endif
2045 }
2046
2047 static void qemu_kvm_start_vcpu(CPUState *cpu)
2048 {
2049 char thread_name[VCPU_THREAD_NAME_SIZE];
2050
2051 cpu->thread = g_malloc0(sizeof(QemuThread));
2052 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2053 qemu_cond_init(cpu->halt_cond);
2054 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2055 cpu->cpu_index);
2056 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2057 cpu, QEMU_THREAD_JOINABLE);
2058 }
2059
2060 static void qemu_hvf_start_vcpu(CPUState *cpu)
2061 {
2062 char thread_name[VCPU_THREAD_NAME_SIZE];
2063
2064 /* HVF currently does not support TCG, and only runs in
2065 * unrestricted-guest mode. */
2066 assert(hvf_enabled());
2067
2068 cpu->thread = g_malloc0(sizeof(QemuThread));
2069 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2070 qemu_cond_init(cpu->halt_cond);
2071
2072 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2073 cpu->cpu_index);
2074 qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2075 cpu, QEMU_THREAD_JOINABLE);
2076 }
2077
2078 static void qemu_whpx_start_vcpu(CPUState *cpu)
2079 {
2080 char thread_name[VCPU_THREAD_NAME_SIZE];
2081
2082 cpu->thread = g_malloc0(sizeof(QemuThread));
2083 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2084 qemu_cond_init(cpu->halt_cond);
2085 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2086 cpu->cpu_index);
2087 qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2088 cpu, QEMU_THREAD_JOINABLE);
2089 #ifdef _WIN32
2090 cpu->hThread = qemu_thread_get_handle(cpu->thread);
2091 #endif
2092 }
2093
2094 static void qemu_dummy_start_vcpu(CPUState *cpu)
2095 {
2096 char thread_name[VCPU_THREAD_NAME_SIZE];
2097
2098 cpu->thread = g_malloc0(sizeof(QemuThread));
2099 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2100 qemu_cond_init(cpu->halt_cond);
2101 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2102 cpu->cpu_index);
2103 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2104 QEMU_THREAD_JOINABLE);
2105 }
2106
2107 void qemu_init_vcpu(CPUState *cpu)
2108 {
2109 MachineState *ms = MACHINE(qdev_get_machine());
2110
2111 cpu->nr_cores = ms->smp.cores;
2112 cpu->nr_threads = ms->smp.threads;
2113 cpu->stopped = true;
2114 cpu->random_seed = qemu_guest_random_seed_thread_part1();
2115
2116 if (!cpu->as) {
2117 /* If the target cpu hasn't set up any address spaces itself,
2118 * give it the default one.
2119 */
2120 cpu->num_ases = 1;
2121 cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2122 }
2123
2124 if (kvm_enabled()) {
2125 qemu_kvm_start_vcpu(cpu);
2126 } else if (hax_enabled()) {
2127 qemu_hax_start_vcpu(cpu);
2128 } else if (hvf_enabled()) {
2129 qemu_hvf_start_vcpu(cpu);
2130 } else if (tcg_enabled()) {
2131 qemu_tcg_init_vcpu(cpu);
2132 } else if (whpx_enabled()) {
2133 qemu_whpx_start_vcpu(cpu);
2134 } else {
2135 qemu_dummy_start_vcpu(cpu);
2136 }
2137
2138 while (!cpu->created) {
2139 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2140 }
2141 }
2142
2143 void cpu_stop_current(void)
2144 {
2145 if (current_cpu) {
2146 current_cpu->stop = true;
2147 cpu_exit(current_cpu);
2148 }
2149 }
2150
2151 int vm_stop(RunState state)
2152 {
2153 if (qemu_in_vcpu_thread()) {
2154 qemu_system_vmstop_request_prepare();
2155 qemu_system_vmstop_request(state);
2156 /*
2157 * FIXME: should not return to device code in case
2158 * vm_stop() has been requested.
2159 */
2160 cpu_stop_current();
2161 return 0;
2162 }
2163
2164 return do_vm_stop(state, true);
2165 }
2166
2167 /**
2168 * Prepare for (re)starting the VM.
2169 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2170 * running or in case of an error condition), 0 otherwise.
2171 */
2172 int vm_prepare_start(void)
2173 {
2174 RunState requested;
2175
2176 qemu_vmstop_requested(&requested);
2177 if (runstate_is_running() && requested == RUN_STATE__MAX) {
2178 return -1;
2179 }
2180
2181 /* Ensure that a STOP/RESUME pair of events is emitted if a
2182 * vmstop request was pending. The BLOCK_IO_ERROR event, for
2183 * example, according to documentation is always followed by
2184 * the STOP event.
2185 */
2186 if (runstate_is_running()) {
2187 qapi_event_send_stop();
2188 qapi_event_send_resume();
2189 return -1;
2190 }
2191
2192 /* We are sending this now, but the CPUs will be resumed shortly later */
2193 qapi_event_send_resume();
2194
2195 cpu_enable_ticks();
2196 runstate_set(RUN_STATE_RUNNING);
2197 vm_state_notify(1, RUN_STATE_RUNNING);
2198 return 0;
2199 }
2200
/* Start the VM: resume all vCPUs if vm_prepare_start() returns 0. */
void vm_start(void)
{
    if (vm_prepare_start() != 0) {
        return;
    }
    resume_all_vcpus();
}
2207
2208 /* does a state transition even if the VM is already stopped,
2209 current state is forgotten forever */
2210 int vm_stop_force_state(RunState state)
2211 {
2212 if (runstate_is_running()) {
2213 return vm_stop(state);
2214 } else {
2215 runstate_set(state);
2216
2217 bdrv_drain_all();
2218 /* Make sure to return an error if the flush in a previous vm_stop()
2219 * failed. */
2220 return bdrv_flush_all();
2221 }
2222 }
2223
/*
 * List the available CPU models by invoking the target's cpu_list macro,
 * when the target defines one.
 *
 * @optarg: not used by this function.
 */
void list_cpus(const char *optarg)
{
#if defined(cpu_list)
    cpu_list();
#endif
    /* XXX: implement xxx_cpu_list for targets that still miss it */
}
2231
2232 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2233 bool has_cpu, int64_t cpu_index, Error **errp)
2234 {
2235 FILE *f;
2236 uint32_t l;
2237 CPUState *cpu;
2238 uint8_t buf[1024];
2239 int64_t orig_addr = addr, orig_size = size;
2240
2241 if (!has_cpu) {
2242 cpu_index = 0;
2243 }
2244
2245 cpu = qemu_get_cpu(cpu_index);
2246 if (cpu == NULL) {
2247 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2248 "a CPU number");
2249 return;
2250 }
2251
2252 f = fopen(filename, "wb");
2253 if (!f) {
2254 error_setg_file_open(errp, errno, filename);
2255 return;
2256 }
2257
2258 while (size != 0) {
2259 l = sizeof(buf);
2260 if (l > size)
2261 l = size;
2262 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2263 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2264 " specified", orig_addr, orig_size);
2265 goto exit;
2266 }
2267 if (fwrite(buf, 1, l, f) != l) {
2268 error_setg(errp, QERR_IO_ERROR);
2269 goto exit;
2270 }
2271 addr += l;
2272 size -= l;
2273 }
2274
2275 exit:
2276 fclose(f);
2277 }
2278
2279 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2280 Error **errp)
2281 {
2282 FILE *f;
2283 uint32_t l;
2284 uint8_t buf[1024];
2285
2286 f = fopen(filename, "wb");
2287 if (!f) {
2288 error_setg_file_open(errp, errno, filename);
2289 return;
2290 }
2291
2292 while (size != 0) {
2293 l = sizeof(buf);
2294 if (l > size)
2295 l = size;
2296 cpu_physical_memory_read(addr, buf, l);
2297 if (fwrite(buf, 1, l, f) != l) {
2298 error_setg(errp, QERR_IO_ERROR);
2299 goto exit;
2300 }
2301 addr += l;
2302 size -= l;
2303 }
2304
2305 exit:
2306 fclose(f);
2307 }
2308
2309 void qmp_inject_nmi(Error **errp)
2310 {
2311 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2312 }
2313
2314 void dump_drift_info(void)
2315 {
2316 if (!use_icount) {
2317 return;
2318 }
2319
2320 qemu_printf("Host - Guest clock %"PRIi64" ms\n",
2321 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2322 if (icount_align_option) {
2323 qemu_printf("Max guest delay %"PRIi64" ms\n",
2324 -max_delay / SCALE_MS);
2325 qemu_printf("Max guest advance %"PRIi64" ms\n",
2326 max_advance / SCALE_MS);
2327 } else {
2328 qemu_printf("Max guest delay NA\n");
2329 qemu_printf("Max guest advance NA\n");
2330 }
2331 }