/*
 * cpus.c (QEMU) - as of commit "tcg: add kick timer for single-threaded
 * vCPU emulation"
 */

/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Needed early for CONFIG_BSD etc. */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qemu/config-file.h"
#include "cpu.h"
#include "monitor/monitor.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "exec/gdbstub.h"
#include "sysemu/dma.h"
#include "sysemu/hw_accel.h"
#include "sysemu/kvm.h"
#include "sysemu/hax.h"
#include "qmp-commands.h"
#include "exec/exec-all.h"

#include "qemu/thread.h"
#include "sysemu/cpus.h"
#include "sysemu/qtest.h"
#include "qemu/main-loop.h"
#include "qemu/bitmap.h"
#include "qemu/seqlock.h"
#include "tcg.h"
#include "qapi-event.h"
#include "hw/nmi.h"
#include "sysemu/replay.h"

#ifndef _WIN32
#include "qemu/compatfd.h"
#endif

#ifdef CONFIG_LINUX

#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */

int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
static QEMUTimer *throttle_timer;
static unsigned int throttle_percentage;

#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
#define CPU_THROTTLE_TIMESLICE_NS 10000000

bool cpu_is_stopped(CPUState *cpu)
{
    return cpu->stopped || !runstate_is_running();
}

static bool cpu_thread_is_idle(CPUState *cpu)
{
    if (cpu->stop || cpu->queued_work_first) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return true;
    }
    if (!cpu->halted || cpu_has_work(cpu) ||
        kvm_halt_in_kernel()) {
        return false;
    }
    return true;
}

static bool all_cpu_threads_idle(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu_thread_is_idle(cpu)) {
            return false;
        }
    }
    return true;
}

/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

static bool icount_sleep = true;
static int64_t vm_clock_warp_start = -1;
/* Conversion factor from emulated instructions to virtual clock ticks. */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
#define MAX_ICOUNT_SHIFT 10

static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;

typedef struct TimersState {
    /* Protected by BQL. */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read out of BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;

    /* Compensate for varying guest execution speed. */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
} TimersState;

static TimersState timers_state;
bool mttcg_enabled;

/*
 * We default to false if we know other options have been enabled
 * which are currently incompatible with MTTCG. Otherwise, once a
 * guest (target) has been updated to support:
 *   - atomic instructions
 *   - memory ordering primitives (barriers)
 * it can set the appropriate CONFIG flags in ${target}-softmmu.mak.
 *
 * Once a guest architecture has been converted to the new primitives
 * there are two remaining limitations to check:
 *
 * - The guest can't be oversized (e.g. a 64-bit guest on a 32-bit host)
 * - The host must have at least as strong a memory order as the guest
 *
 * It may be possible in future to support strong guests on weak hosts
 * but that will require tagging all loads/stores in a guest with their
 * implicit memory order requirements, which would likely slow things
 * down a lot.
 */

static bool check_tcg_memory_orders_compatible(void)
{
#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
    return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
#else
    return false;
#endif
}

static bool default_mttcg_enabled(void)
{
    QemuOpts *icount_opts = qemu_find_opts_singleton("icount");
    const char *rr = qemu_opt_get(icount_opts, "rr");

    if (rr || TCG_OVERSIZED_GUEST) {
        return false;
    } else {
#ifdef TARGET_SUPPORTS_MTTCG
        return check_tcg_memory_orders_compatible();
#else
        return false;
#endif
    }
}

void qemu_tcg_configure(QemuOpts *opts, Error **errp)
{
    const char *t = qemu_opt_get(opts, "thread");
    if (t) {
        if (strcmp(t, "multi") == 0) {
            if (TCG_OVERSIZED_GUEST) {
                error_setg(errp, "No MTTCG when guest word size > host's");
            } else {
                if (!check_tcg_memory_orders_compatible()) {
                    error_report("Guest expects a stronger memory ordering "
                                 "than the host provides");
                    error_printf("This may cause strange/hard to debug errors");
                }
                mttcg_enabled = true;
            }
        } else if (strcmp(t, "single") == 0) {
            mttcg_enabled = false;
        } else {
            error_setg(errp, "Invalid 'thread' setting %s", t);
        }
    } else {
        mttcg_enabled = default_mttcg_enabled();
    }
}
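
/* Usage sketch: in QEMU releases of this era the "tcg" option group above
 * is typically reached from the command line as
 *     -accel tcg,thread=single|multi
 * The exact spelling is wired up in vl.c, so treat it as an assumption here.
 */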

int64_t cpu_get_icount_raw(void)
{
    int64_t icount;
    CPUState *cpu = current_cpu;

    icount = timers_state.qemu_icount;
    if (cpu) {
        if (!cpu->can_do_io) {
            fprintf(stderr, "Bad icount read\n");
            exit(1);
        }
        icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
    }
    return icount;
}

/* Return the virtual CPU time, based on the instruction counter. */
static int64_t cpu_get_icount_locked(void)
{
    int64_t icount = cpu_get_icount_raw();
    return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
}

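/* Lock-free reader: take a snapshot under the vm_clock_seqlock and retry
 * if a writer raced with the read.  This is the usual seqlock read
 * pattern; the writers in this file run under the BQL.
 */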
int64_t cpu_get_icount(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}

int64_t cpu_icount_to_ns(int64_t icount)
{
    return icount << icount_time_shift;
}
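
/* Worked example: with icount_time_shift == 3, each guest instruction
 * accounts for 2^3 = 8 ns of QEMU_CLOCK_VIRTUAL, i.e. 125 MIPS; at
 * MAX_ICOUNT_SHIFT (10) an instruction costs 1024 ns, roughly the
 * 1 MIPS minimum mentioned above.
 */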

/* return the time elapsed in VM between vm_start and vm_stop.  Unless
 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 * counter.
 *
 * Caller must hold the BQL
 */
int64_t cpu_get_ticks(void)
{
    int64_t ticks;

    if (use_icount) {
        return cpu_get_icount();
    }

    ticks = timers_state.cpu_ticks_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += cpu_get_host_ticks();
    }

    if (timers_state.cpu_ticks_prev > ticks) {
        /* Note: non-increasing ticks may happen if the host uses
           software suspend. */
        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        ticks = timers_state.cpu_ticks_prev;
    }

    timers_state.cpu_ticks_prev = ticks;
    return ticks;
}

static int64_t cpu_get_clock_locked(void)
{
    int64_t time;

    time = timers_state.cpu_clock_offset;
    if (timers_state.cpu_ticks_enabled) {
        time += get_clock();
    }

    return time;
}

/* Return the monotonic time elapsed in VM, i.e.,
 * the time between vm_start and vm_stop
 */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = cpu_get_clock_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return ti;
}

/* enable cpu_get_ticks()
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* disable cpu_get_ticks(): the clock is stopped.  You must not call
 * cpu_get_ticks() after that.
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop. */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)

static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex. */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing. */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down. */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up. */
        icount_time_shift++;
    }
    last_delta = delta;
    timers_state.qemu_icount_bias = cur_icount
                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

static void icount_adjust_rt(void *opaque)
{
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_adjust();
}

static void icount_adjust_vm(void *opaque)
{
    timer_mod(icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              NANOSECONDS_PER_SECOND / 10);
    icount_adjust();
}

static int64_t qemu_icount_round(int64_t count)
{
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
}
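
/* Worked example: with icount_time_shift == 3, a 1000 ns deadline rounds
 * up to (1000 + 7) >> 3 = 125 instructions of execution budget.
 */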

static void icount_warp_rt(void)
{
    unsigned seq;
    int64_t warp_start;

    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    do {
        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        warp_start = vm_clock_warp_start;
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));

    if (warp_start == -1) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
                                     cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        timers_state.qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;
    seqlock_write_end(&timers_state.vm_clock_seqlock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}

void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    AioContext *aio_context;
    assert(qtest_enabled());
    aio_context = qemu_get_aio_context();
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);

        seqlock_write_begin(&timers_state.vm_clock_seqlock);
        timers_state.qemu_icount_bias += warp;
        seqlock_write_end(&timers_state.vm_clock_seqlock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}

void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
        return;
    }

    if (!all_cpu_threads_idle()) {
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount. */
        return;
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        static bool notified;
        if (!icount_sleep && !notified) {
            error_report("WARNING: icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * It is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            timers_state.qemu_icount_bias += deadline;
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time (related to the time left until the next event) has
             * passed.  The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This keeps the warps from being visible externally; for example,
             * you will not be sending network packets continuously instead of
             * every 100ms.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
                vm_clock_warp_start = clock;
            }
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            timer_mod_anticipate(icount_warp_timer, clock + deadline);
        }
    } else if (deadline == 0) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void qemu_account_warp_timer(void)
{
    if (!use_icount || !icount_sleep) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
        return;
    }

    timer_del(icount_warp_timer);
    icount_warp_rt();
}

static bool icount_state_needed(void *opaque)
{
    return use_icount;
}

/*
 * This is a subsection for icount migration.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};

static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
{
    double pct;
    double throttle_ratio;
    long sleeptime_ns;

    if (!cpu_throttle_get_percentage()) {
        return;
    }

    pct = (double)cpu_throttle_get_percentage() / 100;
    throttle_ratio = pct / (1 - pct);
    sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);

    qemu_mutex_unlock_iothread();
    atomic_set(&cpu->throttle_thread_scheduled, 0);
    g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
    qemu_mutex_lock_iothread();
}
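
/* Worked example of the throttle math: at 75% throttle, pct = 0.75 and
 * throttle_ratio = 0.75 / 0.25 = 3, so the vCPU sleeps 3 * 10 ms = 30 ms
 * per timeslice.  The tick below then re-arms itself every
 * CPU_THROTTLE_TIMESLICE_NS / (1 - pct) = 40 ms, i.e. a 25% duty cycle.
 */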

static void cpu_throttle_timer_tick(void *opaque)
{
    CPUState *cpu;
    double pct;

    /* Stop the timer if needed */
    if (!cpu_throttle_get_percentage()) {
        return;
    }
    CPU_FOREACH(cpu) {
        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
            async_run_on_cpu(cpu, cpu_throttle_thread,
                             RUN_ON_CPU_NULL);
        }
    }

    pct = (double)cpu_throttle_get_percentage() / 100;
    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                              CPU_THROTTLE_TIMESLICE_NS / (1 - pct));
}

void cpu_throttle_set(int new_throttle_pct)
{
    /* Ensure throttle percentage is within valid range */
    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);

    atomic_set(&throttle_percentage, new_throttle_pct);

    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                              CPU_THROTTLE_TIMESLICE_NS);
}

void cpu_throttle_stop(void)
{
    atomic_set(&throttle_percentage, 0);
}

bool cpu_throttle_active(void)
{
    return (cpu_throttle_get_percentage() != 0);
}

int cpu_throttle_get_percentage(void)
{
    return atomic_read(&throttle_percentage);
}

void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                  cpu_throttle_timer_tick, NULL);
}

void configure_icount(QemuOpts *opts, Error **errp)
{
    const char *option;
    char *rem_str = NULL;

    option = qemu_opt_get(opts, "shift");
    if (!option) {
        if (qemu_opt_get(opts, "align") != NULL) {
            error_setg(errp, "Please specify shift option when using align");
        }
        return;
    }

    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                         icount_timer_cb, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);

    if (icount_align_option && !icount_sleep) {
        error_setg(errp, "align=on and sleep=off are incompatible");
    }
    if (strcmp(option, "auto") != 0) {
        errno = 0;
        icount_time_shift = strtol(option, &rem_str, 0);
        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
            error_setg(errp, "icount: Invalid shift value");
        }
        use_icount = 1;
        return;
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
    } else if (!icount_sleep) {
        error_setg(errp, "shift=auto and sleep=off are incompatible");
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway. */
    icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers. */
    icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
                                   icount_adjust_rt, NULL);
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                   icount_adjust_vm, NULL);
    timer_mod(icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              NANOSECONDS_PER_SECOND / 10);
}
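
/* Usage sketch (options as parsed above; the command-line spelling is an
 * assumption, see vl.c for the actual wiring):
 *     -icount shift=7            fixed rate: 2^7 ns of virtual time per insn
 *     -icount shift=auto         adaptive rate, tuned by the two timers above
 *     -icount shift=7,sleep=off  never idle; the virtual clock jumps straight
 *                                to the next deadline
 * align=on requires a fixed shift and sleep=on, as the checks above enforce.
 */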

/***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single-threaded vCPU
 * emulation on to the next vCPU.  If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed while all vCPUs are idle and restarted once
 * any vCPU becomes active again.
 */

static QEMUTimer *tcg_kick_vcpu_timer;

static void qemu_cpu_kick_no_halt(void);

#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)

static inline int64_t qemu_tcg_next_kick(void)
{
    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
}

static void kick_tcg_thread(void *opaque)
{
    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    qemu_cpu_kick_no_halt();
}

static void start_tcg_kick_timer(void)
{
    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                           kick_tcg_thread, NULL);
        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    }
}

static void stop_tcg_kick_timer(void)
{
    if (tcg_kick_vcpu_timer) {
        timer_del(tcg_kick_vcpu_timer);
        tcg_kick_vcpu_timer = NULL;
    }
}
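
/* Lifecycle sketch: the timer is only created when there is more than one
 * vCPU (the CPU_NEXT(first_cpu) check above).  qemu_tcg_wait_io_event()
 * stops it while all vCPUs are idle and re-arms it when work resumes; the
 * TCG thread also arms it once the machine starts.  Each firing re-arms
 * the timer TCG_KICK_PERIOD ns ahead and kicks the currently running vCPU.
 */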


/***********************************************************/
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    CPU_FOREACH(cpu) {
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}

void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_state(cpu);
    }
}

void cpu_synchronize_all_post_reset(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_reset(cpu);
    }
}

void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_init(cpu);
    }
}

static int do_vm_stop(RunState state)
{
    int ret = 0;

    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        qapi_event_send_stop(&error_abort);
    }

    bdrv_drain_all();
    replay_disable_events();
    ret = bdrv_flush_all();

    return ret;
}

static bool cpu_can_run(CPUState *cpu)
{
    if (cpu->stop) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return false;
    }
    return true;
}

static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}

#ifdef CONFIG_LINUX
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}

static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code,
                      (void *)(intptr_t)siginfo->ssi_addr)) {
        sigbus_reraise();
    }
}

static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
                sigbus_reraise();
            }
            break;
        default:
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
}

#else /* !CONFIG_LINUX */

static void qemu_init_sigbus(void)
{
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
}
#endif /* !CONFIG_LINUX */

#ifndef _WIN32
static void dummy_signal(int sig)
{
}

static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);
    r = kvm_set_signal_mask(cpu, &set);
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

#else /* _WIN32 */
static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    abort();
}
#endif /* _WIN32 */

static QemuMutex qemu_global_mutex;
static QemuCond qemu_io_proceeded_cond;
static unsigned iothread_requesting_mutex;

static QemuThread io_thread;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;

void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}

void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}

static void qemu_kvm_destroy_vcpu(CPUState *cpu)
{
    if (kvm_destroy_vcpu(cpu) < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}

static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}

static void qemu_wait_io_event_common(CPUState *cpu)
{
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        qemu_cond_broadcast(&qemu_pause_cond);
    }
    process_queued_cpu_work(cpu);
    cpu->thread_kicked = false;
}

static void qemu_tcg_wait_io_event(CPUState *cpu)
{
    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    while (iothread_requesting_mutex) {
        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
    }

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}

static void qemu_kvm_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    qemu_kvm_eat_signals(cpu);
    qemu_wait_io_event_common(cpu);
}

static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    qemu_kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    return NULL;
}

static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    fprintf(stderr, "qtest is not supported under Windows\n");
    exit(1);
#else
    CPUState *cpu = arg;
    sigset_t waitset;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    current_cpu = cpu;
    while (1) {
        current_cpu = NULL;
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        current_cpu = cpu;
        qemu_wait_io_event_common(cpu);
    }

    return NULL;
#endif
}

static int64_t tcg_get_icount_limit(void)
{
    int64_t deadline;

    if (replay_mode != REPLAY_MODE_PLAY) {
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        /* Maintain prior (possibly buggy) behaviour where if no deadline
         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
         * nanoseconds.
         */
        if ((deadline < 0) || (deadline > INT32_MAX)) {
            deadline = INT32_MAX;
        }

        return qemu_icount_round(deadline);
    } else {
        return replay_get_instructions();
    }
}

static void handle_icount_deadline(void)
{
    if (use_icount) {
        int64_t deadline =
            qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        if (deadline == 0) {
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        }
    }
}

static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    if (use_icount) {
        int64_t count;
        int decr;
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                     + cpu->icount_extra);
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        count = tcg_get_icount_limit();
        timers_state.qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        cpu->icount_decr.u16.low = decr;
        cpu->icount_extra = count;
    }
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    tcg_time += profile_getclock() - ti;
#endif
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag. */
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                     + cpu->icount_extra);
        cpu->icount_decr.u32 = 0;
        cpu->icount_extra = 0;
        replay_account_executed_instructions();
    }
    return ret;
}
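
/* Worked example of the budget split above: tcg_get_icount_limit() might
 * return count = 100000; the 16-bit decrementer gets 0xffff (65535) and
 * icount_extra keeps the remaining 34465, handed out as the translated
 * code exhausts the decrementer.
 */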

/* Destroy any remaining vCPUs which have been unplugged and have
 * finished running
 */
static void deal_with_unplugged_cpus(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (cpu->unplug && !cpu_can_run(cpu)) {
            qemu_tcg_destroy_vcpu(cpu);
            cpu->created = false;
            qemu_cond_signal(&qemu_cpu_cond);
            break;
        }
    }
}

/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn.  If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 */

static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
        cpu->can_do_io = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    /* process any pending work */
    atomic_mb_set(&exit_request, 1);

    cpu = first_cpu;

    while (1) {
        /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
        qemu_account_warp_timer();

        if (!cpu) {
            cpu = first_cpu;
        }

        for (; cpu != NULL && !exit_request; cpu = CPU_NEXT(cpu)) {

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;
                r = tcg_cpu_exec(cpu);
                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                }
            } else if (cpu->stop || cpu->stopped) {
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

        } /* for cpu.. */

        /* Pairs with smp_wmb in qemu_cpu_kick. */
        atomic_mb_set(&exit_request, 0);

        handle_icount_deadline();

        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
        deal_with_unplugged_cpus();
    }

    return NULL;
}

static void *qemu_hax_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;
    qemu_thread_get_self(cpu->thread);
    qemu_mutex_lock(&qemu_global_mutex);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->halted = 0;
    current_cpu = cpu;

    hax_init_vcpu(cpu);
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        if (cpu_can_run(cpu)) {
            r = hax_smp_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }

        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
#ifdef _WIN32
        SleepEx(0, TRUE);
#endif
        qemu_wait_io_event_common(cpu);
    }
    return NULL;
}

#ifdef _WIN32
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
#endif

static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    if (cpu->thread_kicked) {
        return;
    }
    cpu->thread_kicked = true;
    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    if (!qemu_cpu_is_self(cpu)) {
        if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
            fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
                    __func__, GetLastError());
            exit(1);
        }
    }
#endif
}

static void qemu_cpu_kick_no_halt(void)
{
    CPUState *cpu;
    /* Ensure whatever caused the exit has reached the CPU threads before
     * writing exit_request.
     */
    atomic_mb_set(&exit_request, 1);
    cpu = atomic_mb_read(&tcg_current_cpu);
    if (cpu) {
        cpu_exit(cpu);
    }
}

void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (tcg_enabled()) {
        qemu_cpu_kick_no_halt();
    } else {
        if (hax_enabled()) {
            /*
             * FIXME: race condition with the exit_request check in
             * hax_vcpu_hax_exec
             */
            cpu->exit_request = 1;
        }
        qemu_cpu_kick_thread(cpu);
    }
}

void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}

bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}

bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}

static __thread bool iothread_locked = false;

bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}

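/* BQL acquisition: the fast path simply takes the mutex.  On the slow
 * path (a TCG vCPU thread may hold the lock while executing translated
 * code) we try the lock first and, on contention, kick the vCPU out of
 * its execution loop so the lock is released promptly;
 * iothread_requesting_mutex plus qemu_io_proceeded_cond stop the vCPU
 * thread from immediately re-acquiring it.
 */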
void qemu_mutex_lock_iothread(void)
{
    atomic_inc(&iothread_requesting_mutex);
    /* In the simple case there is no need to bump the VCPU thread out of
     * TCG code execution.
     */
    if (!tcg_enabled() || qemu_in_vcpu_thread() ||
        !first_cpu || !first_cpu->created) {
        qemu_mutex_lock(&qemu_global_mutex);
        atomic_dec(&iothread_requesting_mutex);
    } else {
        if (qemu_mutex_trylock(&qemu_global_mutex)) {
            qemu_cpu_kick_no_halt();
            qemu_mutex_lock(&qemu_global_mutex);
        }
        atomic_dec(&iothread_requesting_mutex);
        qemu_cond_broadcast(&qemu_io_proceeded_cond);
    }
    iothread_locked = true;
}

void qemu_mutex_unlock_iothread(void)
{
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}

static bool all_vcpus_paused(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu->stopped) {
            return false;
        }
    }

    return true;
}

void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        cpu->stop = true;
        qemu_cpu_kick(cpu);
    }

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
        if (!kvm_enabled()) {
            CPU_FOREACH(cpu) {
                cpu->stop = false;
                cpu->stopped = true;
            }
            return;
        }
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }
}

void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}

void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}

void cpu_remove(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
}

void cpu_remove_sync(CPUState *cpu)
{
    cpu_remove(cpu);
    while (cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

/* For temporary buffers for forming a name */
#define VCPU_THREAD_NAME_SIZE 16

static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *tcg_halt_cond;
    static QemuThread *tcg_cpu_thread;

    /* share a single thread for all cpus with TCG */
    if (!tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);
        tcg_halt_cond = cpu->halt_cond;
        snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);
        qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                           cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
        tcg_cpu_thread = cpu->thread;
    } else {
        cpu->thread = tcg_cpu_thread;
        cpu->halt_cond = tcg_halt_cond;
    }
}

static void qemu_hax_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);

    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
    cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

static void qemu_kvm_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

static void qemu_dummy_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
                       QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

void qemu_init_vcpu(CPUState *cpu)
{
    cpu->nr_cores = smp_cores;
    cpu->nr_threads = smp_threads;
    cpu->stopped = true;

    if (!cpu->as) {
        /* If the target cpu hasn't set up any address spaces itself,
         * give it the default one.
         */
        AddressSpace *as = address_space_init_shareable(cpu->memory,
                                                        "cpu-memory");
        cpu->num_ases = 1;
        cpu_address_space_init(cpu, as, 0);
    }

    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
    } else if (hax_enabled()) {
        qemu_hax_start_vcpu(cpu);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }
}

void cpu_stop_current(void)
{
    if (current_cpu) {
        current_cpu->stop = false;
        current_cpu->stopped = true;
        cpu_exit(current_cpu);
        qemu_cond_broadcast(&qemu_pause_cond);
    }
}

int vm_stop(RunState state)
{
    if (qemu_in_vcpu_thread()) {
        qemu_system_vmstop_request_prepare();
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return 0;
    }

    return do_vm_stop(state);
}

/**
 * Prepare for (re)starting the VM.
 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
 * running or in case of an error condition), 0 otherwise.
 */
int vm_prepare_start(void)
{
    RunState requested;
    int res = 0;

    qemu_vmstop_requested(&requested);
    if (runstate_is_running() && requested == RUN_STATE__MAX) {
        return -1;
    }

    /* Ensure that a STOP/RESUME pair of events is emitted if a
     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
     * example, according to documentation is always followed by
     * the STOP event.
     */
    if (runstate_is_running()) {
        qapi_event_send_stop(&error_abort);
        res = -1;
    } else {
        replay_enable_events();
        cpu_enable_ticks();
        runstate_set(RUN_STATE_RUNNING);
        vm_state_notify(1, RUN_STATE_RUNNING);
    }

    /* We are sending this now, but the CPUs will be resumed shortly afterwards */
    qapi_event_send_resume(&error_abort);
    return res;
}

void vm_start(void)
{
    if (!vm_prepare_start()) {
        resume_all_vcpus();
    }
}

/* Does a state transition even if the VM is already stopped;
   the current state is forgotten forever. */
int vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        return vm_stop(state);
    } else {
        runstate_set(state);

        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
        return bdrv_flush_all();
    }
}

void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still lack it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}

CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
#elif defined(TARGET_TRICORE)
        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
        CPUTriCoreState *env = &tricore_cpu->env;
#endif

        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->arch = CPU_INFO_ARCH_X86;
        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->arch = CPU_INFO_ARCH_PPC;
        info->value->u.ppc.nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->arch = CPU_INFO_ARCH_SPARC;
        info->value->u.q_sparc.pc = env->pc;
        info->value->u.q_sparc.npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->arch = CPU_INFO_ARCH_MIPS;
        info->value->u.q_mips.PC = env->active_tc.PC;
#elif defined(TARGET_TRICORE)
        info->value->arch = CPU_INFO_ARCH_TRICORE;
        info->value->u.tricore.PC = env->PC;
#else
        info->value->arch = CPU_INFO_ARCH_OTHER;
#endif

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

void qmp_memsave(int64_t addr, int64_t size, const char *filename,
                 bool has_cpu, int64_t cpu_index, Error **errp)
{
    FILE *f;
    uint32_t l;
    CPUState *cpu;
    uint8_t buf[1024];
    int64_t orig_addr = addr, orig_size = size;

    if (!has_cpu) {
        cpu_index = 0;
    }

    cpu = qemu_get_cpu(cpu_index);
    if (cpu == NULL) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
                   "a CPU number");
        return;
    }

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
                             " specified", orig_addr, orig_size);
            goto exit;
        }
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
                  Error **errp)
{
    FILE *f;
    uint32_t l;
    uint8_t buf[1024];

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        cpu_physical_memory_read(addr, buf, l);
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}

void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
{
    if (!use_icount) {
        return;
    }

    cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
                (cpu_get_clock() - cpu_get_icount()) / SCALE_MS);
    if (icount_align_option) {
        cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n",
                    -max_delay / SCALE_MS);
        cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n",
                    max_advance / SCALE_MS);
    } else {
        cpu_fprintf(f, "Max guest delay NA\n");
        cpu_fprintf(f, "Max guest advance NA\n");
    }
}