/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Needed early for CONFIG_BSD etc. */
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "qemu/config-file.h"
#include "cpu.h"
#include "monitor/monitor.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "exec/gdbstub.h"
#include "sysemu/dma.h"
#include "sysemu/hw_accel.h"
#include "sysemu/kvm.h"
#include "sysemu/hax.h"
#include "qmp-commands.h"
#include "exec/exec-all.h"

#include "qemu/thread.h"
#include "sysemu/cpus.h"
#include "sysemu/qtest.h"
#include "qemu/main-loop.h"
#include "qemu/bitmap.h"
#include "qemu/seqlock.h"
#include "tcg.h"
#include "qapi-event.h"
#include "hw/nmi.h"
#include "sysemu/replay.h"

#ifndef _WIN32
#include "qemu/compatfd.h"
#endif

#ifdef CONFIG_LINUX

#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */

int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
static QEMUTimer *throttle_timer;
static unsigned int throttle_percentage;

#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
#define CPU_THROTTLE_TIMESLICE_NS 10000000

bool cpu_is_stopped(CPUState *cpu)
{
    return cpu->stopped || !runstate_is_running();
}

static bool cpu_thread_is_idle(CPUState *cpu)
{
    if (cpu->stop || cpu->queued_work_first) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return true;
    }
    if (!cpu->halted || cpu_has_work(cpu) ||
        kvm_halt_in_kernel()) {
        return false;
    }
    return true;
}

static bool all_cpu_threads_idle(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu_thread_is_idle(cpu)) {
            return false;
        }
    }
    return true;
}

/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

static bool icount_sleep = true;
static int64_t vm_clock_warp_start = -1;
/* Conversion factor from emulated instructions to virtual clock ticks. */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
#define MAX_ICOUNT_SHIFT 10

static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;

typedef struct TimersState {
    /* Protected by BQL. */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read outside of the BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;

    /* Compensate for varying guest execution speed. */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
} TimersState;

static TimersState timers_state;
bool mttcg_enabled;

/*
 * We default to false if we know other options have been enabled
 * which are currently incompatible with MTTCG. Otherwise, once each
 * guest (target) has been updated to support:
 *   - atomic instructions
 *   - memory ordering primitives (barriers)
 * it can set the appropriate CONFIG flags in ${target}-softmmu.mak.
 *
 * Once a guest architecture has been converted to the new primitives
 * there are two remaining limitations to check:
 *
 *   - The guest can't be oversized (e.g. a 64-bit guest on a 32-bit host)
 *   - The host must have a stronger memory order than the guest
 *
 * It may be possible in future to support strong guests on weak hosts
 * but that will require tagging all load/stores in a guest with their
 * implicit memory order requirements, which would likely slow things
 * down a lot.
 */

static bool check_tcg_memory_orders_compatible(void)
{
#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
    return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
#else
    return false;
#endif
}
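
/*
 * Illustrative note (not from the original source): both macros are
 * bitmasks of memory-ordering guarantees. The check above passes only
 * when every ordering bit the guest relies on is also provided by the
 * host. For example, an x86 guest (strong ordering) on a typical ARM
 * host (weak ordering) leaves guest bits unmatched, so the result is
 * false and MTTCG stays off by default.
 */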

static bool default_mttcg_enabled(void)
{
    QemuOpts *icount_opts = qemu_find_opts_singleton("icount");
    const char *rr = qemu_opt_get(icount_opts, "rr");

    if (rr || TCG_OVERSIZED_GUEST) {
        return false;
    } else {
#ifdef TARGET_SUPPORTS_MTTCG
        return check_tcg_memory_orders_compatible();
#else
        return false;
#endif
    }
}

void qemu_tcg_configure(QemuOpts *opts, Error **errp)
{
    const char *t = qemu_opt_get(opts, "thread");
    if (t) {
        if (strcmp(t, "multi") == 0) {
            if (TCG_OVERSIZED_GUEST) {
                error_setg(errp, "No MTTCG when guest word size > host's");
            } else {
                if (!check_tcg_memory_orders_compatible()) {
                    error_report("Guest expects a stronger memory ordering "
                                 "than the host provides");
                    error_printf("This may cause strange/hard to debug errors\n");
                }
                mttcg_enabled = true;
            }
        } else if (strcmp(t, "single") == 0) {
            mttcg_enabled = false;
        } else {
            error_setg(errp, "Invalid 'thread' setting %s", t);
        }
    } else {
        mttcg_enabled = default_mttcg_enabled();
    }
}
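
/*
 * Illustrative usage (not from the original source): this function is
 * driven by the "thread" suboption of the TCG accelerator, e.g.
 *
 *   qemu-system-foo -accel tcg,thread=multi ...
 *   qemu-system-foo -accel tcg,thread=single ...
 *
 * ("qemu-system-foo" is a placeholder binary name.)  Anything other
 * than "multi" or "single" is rejected with "Invalid 'thread' setting";
 * with no explicit setting, default_mttcg_enabled() decides.
 */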

int64_t cpu_get_icount_raw(void)
{
    int64_t icount;
    CPUState *cpu = current_cpu;

    icount = timers_state.qemu_icount;
    if (cpu) {
        if (!cpu->can_do_io) {
            fprintf(stderr, "Bad icount read\n");
            exit(1);
        }
        icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
    }
    return icount;
}

/* Return the virtual CPU time, based on the instruction counter.  */
static int64_t cpu_get_icount_locked(void)
{
    int64_t icount = cpu_get_icount_raw();
    return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
}

int64_t cpu_get_icount(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}
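
/*
 * Illustrative note (not from the original source): this is the classic
 * seqlock reader pattern.  The loop re-reads whenever a writer raced
 * with us (seqlock_read_retry() returns true), so lock-free readers
 * always see a consistent (bias, icount) pair.  Writers serialize on
 * the BQL, per the comment in TimersState.
 */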

int64_t cpu_icount_to_ns(int64_t icount)
{
    return icount << icount_time_shift;
}
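
/*
 * Worked example (not from the original source): each executed
 * instruction advances the virtual clock by 2^icount_time_shift ns.
 * With icount_time_shift == 3 an instruction costs 8 ns, i.e. the
 * guest appears to run at 1e9 / 8 = 125 million instructions per
 * second, matching the "125MIPS" default chosen in configure_icount().
 */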

/* Return the time elapsed in VM between vm_start and vm_stop.  Unless
 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 * counter.
 *
 * Caller must hold the BQL
 */
int64_t cpu_get_ticks(void)
{
    int64_t ticks;

    if (use_icount) {
        return cpu_get_icount();
    }

    ticks = timers_state.cpu_ticks_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += cpu_get_host_ticks();
    }

    if (timers_state.cpu_ticks_prev > ticks) {
        /* Note: non-increasing ticks may happen if the host uses
           software suspend */
        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        ticks = timers_state.cpu_ticks_prev;
    }

    timers_state.cpu_ticks_prev = ticks;
    return ticks;
}

static int64_t cpu_get_clock_locked(void)
{
    int64_t time;

    time = timers_state.cpu_clock_offset;
    if (timers_state.cpu_ticks_enabled) {
        time += get_clock();
    }

    return time;
}

/* Return the monotonic time elapsed in VM, i.e.,
 * the time between vm_start and vm_stop
 */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = cpu_get_clock_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return ti;
}

/* enable cpu_get_ticks()
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* disable cpu_get_ticks() : the clock is stopped. You must not call
 * cpu_get_ticks() after that.
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle, real and virtual time will be aligned in
   the IO wait loop.  */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)

static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex.  */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        icount_time_shift++;
    }
    last_delta = delta;
    timers_state.qemu_icount_bias = cur_icount
                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}

static void icount_adjust_rt(void *opaque)
{
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_adjust();
}

static void icount_adjust_vm(void *opaque)
{
    timer_mod(icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              NANOSECONDS_PER_SECOND / 10);
    icount_adjust();
}

static int64_t qemu_icount_round(int64_t count)
{
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
}
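
/*
 * Worked example (not from the original source): qemu_icount_round()
 * converts a nanosecond budget into a whole number of instructions,
 * rounding up.  With icount_time_shift == 3, a 20 ns deadline yields
 * (20 + 7) >> 3 == 3 instructions, since 2 instructions would only
 * cover 16 ns of virtual time.
 */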

static void icount_warp_rt(void)
{
    unsigned seq;
    int64_t warp_start;

    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    do {
        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        warp_start = vm_clock_warp_start;
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));

    if (warp_start == -1) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
                                     cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        timers_state.qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;
    seqlock_write_end(&timers_state.vm_clock_seqlock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}

void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    AioContext *aio_context;
    assert(qtest_enabled());
    aio_context = qemu_get_aio_context();
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);

        seqlock_write_begin(&timers_state.vm_clock_seqlock);
        timers_state.qemu_icount_bias += warp;
        seqlock_write_end(&timers_state.vm_clock_seqlock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}

void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
        return;
    }

    if (!all_cpu_threads_idle()) {
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount.  */
        return;
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        static bool notified;
        if (!icount_sleep && !notified) {
            error_report("WARNING: icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no-sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * It is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            timers_state.qemu_icount_bias += deadline;
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time (related to the time left until the next event) has
             * passed.  The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This keeps the warps from being visible externally; for example,
             * you will not be sending network packets continuously instead of
             * every 100ms.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
                vm_clock_warp_start = clock;
            }
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            timer_mod_anticipate(icount_warp_timer, clock + deadline);
        }
    } else if (deadline == 0) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}
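
/*
 * Summary (not from the original source): once all vCPUs are idle, the
 * deadline above is handled in one of three ways:
 *   deadline < 0  -> no QEMU_CLOCK_VIRTUAL timer pending; nothing to warp
 *                    (warn once if sleep is disabled).
 *   deadline > 0  -> sleep=off: jump qemu_icount_bias forward immediately;
 *                    sleep=on: arm icount_warp_timer so the jump happens
 *                    only after comparable real time has passed.
 *   deadline == 0 -> a timer is already due; just notify the clock.
 */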

static void qemu_account_warp_timer(void)
{
    if (!use_icount || !icount_sleep) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
        return;
    }

    timer_del(icount_warp_timer);
    icount_warp_rt();
}

static bool icount_state_needed(void *opaque)
{
    return use_icount;
}

/*
 * This is a subsection for icount migration.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};

static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
{
    double pct;
    double throttle_ratio;
    long sleeptime_ns;

    if (!cpu_throttle_get_percentage()) {
        return;
    }

    pct = (double)cpu_throttle_get_percentage() / 100;
    throttle_ratio = pct / (1 - pct);
    sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);

    qemu_mutex_unlock_iothread();
    atomic_set(&cpu->throttle_thread_scheduled, 0);
    g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
    qemu_mutex_lock_iothread();
}
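
/*
 * Worked example (not from the original source): at 75% throttle,
 * pct = 0.75 and throttle_ratio = 0.75 / 0.25 = 3, so each vCPU sleeps
 * 3 * 10 ms = 30 ms per tick.  The timer below re-arms every
 * 10 ms / (1 - pct) = 40 ms, giving 10 ms of run time out of every
 * 40 ms, i.e. the requested 75% reduction.
 */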

static void cpu_throttle_timer_tick(void *opaque)
{
    CPUState *cpu;
    double pct;

    /* Stop the timer if needed */
    if (!cpu_throttle_get_percentage()) {
        return;
    }
    CPU_FOREACH(cpu) {
        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
            async_run_on_cpu(cpu, cpu_throttle_thread,
                             RUN_ON_CPU_NULL);
        }
    }

    pct = (double)cpu_throttle_get_percentage() / 100;
    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                              CPU_THROTTLE_TIMESLICE_NS / (1 - pct));
}

void cpu_throttle_set(int new_throttle_pct)
{
    /* Ensure throttle percentage is within valid range */
    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);

    atomic_set(&throttle_percentage, new_throttle_pct);

    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                              CPU_THROTTLE_TIMESLICE_NS);
}

void cpu_throttle_stop(void)
{
    atomic_set(&throttle_percentage, 0);
}

bool cpu_throttle_active(void)
{
    return (cpu_throttle_get_percentage() != 0);
}

int cpu_throttle_get_percentage(void)
{
    return atomic_read(&throttle_percentage);
}

void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                  cpu_throttle_timer_tick, NULL);
}

void configure_icount(QemuOpts *opts, Error **errp)
{
    const char *option;
    char *rem_str = NULL;

    option = qemu_opt_get(opts, "shift");
    if (!option) {
        if (qemu_opt_get(opts, "align") != NULL) {
            error_setg(errp, "Please specify shift option when using align");
        }
        return;
    }

    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                         icount_timer_cb, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);

    if (icount_align_option && !icount_sleep) {
        error_setg(errp, "align=on and sleep=off are incompatible");
    }
    if (strcmp(option, "auto") != 0) {
        errno = 0;
        icount_time_shift = strtol(option, &rem_str, 0);
        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
            error_setg(errp, "icount: Invalid shift value");
        }
        use_icount = 1;
        return;
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
    } else if (!icount_sleep) {
        error_setg(errp, "shift=auto and sleep=off are incompatible");
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway.  */
    icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers.  */
    icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
                                   icount_adjust_rt, NULL);
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                   icount_adjust_vm, NULL);
    timer_mod(icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              NANOSECONDS_PER_SECOND / 10);
}
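
/*
 * Illustrative usage (not from the original source): the options parsed
 * above correspond to the -icount command line switch, e.g.
 *
 *   qemu-system-foo -icount shift=7 ...            # fixed 128 ns/insn
 *   qemu-system-foo -icount shift=auto,sleep=on    # adaptive (use_icount == 2)
 *
 * "shift=auto" enables the self-tuning mode driven by icount_adjust(),
 * while a numeric shift fixes the instruction-to-ns conversion factor.
 */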

/***********************************************************/
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    CPU_FOREACH(cpu) {
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}

void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_state(cpu);
    }
}

void cpu_synchronize_all_post_reset(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_reset(cpu);
    }
}

void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_init(cpu);
    }
}

static int do_vm_stop(RunState state)
{
    int ret = 0;

    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        qapi_event_send_stop(&error_abort);
    }

    bdrv_drain_all();
    replay_disable_events();
    ret = bdrv_flush_all();

    return ret;
}

static bool cpu_can_run(CPUState *cpu)
{
    if (cpu->stop) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return false;
    }
    return true;
}

static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}

#ifdef CONFIG_LINUX
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!");
    abort();
}

static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code,
                      (void *)(intptr_t)siginfo->ssi_addr)) {
        sigbus_reraise();
    }
}

static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
                sigbus_reraise();
            }
            break;
        default:
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
}

#else /* !CONFIG_LINUX */

static void qemu_init_sigbus(void)
{
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
}
#endif /* !CONFIG_LINUX */

#ifndef _WIN32
static void dummy_signal(int sig)
{
}

static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);
    r = kvm_set_signal_mask(cpu, &set);
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

#else /* _WIN32 */
static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    abort();
}
#endif /* _WIN32 */

static QemuMutex qemu_global_mutex;
static QemuCond qemu_io_proceeded_cond;
static unsigned iothread_requesting_mutex;

static QemuThread io_thread;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;

void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}

void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}

static void qemu_kvm_destroy_vcpu(CPUState *cpu)
{
    if (kvm_destroy_vcpu(cpu) < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}

static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}

static void qemu_wait_io_event_common(CPUState *cpu)
{
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        qemu_cond_broadcast(&qemu_pause_cond);
    }
    process_queued_cpu_work(cpu);
    cpu->thread_kicked = false;
}

static void qemu_tcg_wait_io_event(CPUState *cpu)
{
    while (all_cpu_threads_idle()) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    while (iothread_requesting_mutex) {
        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
    }

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}

static void qemu_kvm_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    qemu_kvm_eat_signals(cpu);
    qemu_wait_io_event_common(cpu);
}

static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    qemu_kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    return NULL;
}

static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    fprintf(stderr, "qtest is not supported under Windows\n");
    exit(1);
#else
    CPUState *cpu = arg;
    sigset_t waitset;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    current_cpu = cpu;
    while (1) {
        current_cpu = NULL;
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        current_cpu = cpu;
        qemu_wait_io_event_common(cpu);
    }

    return NULL;
#endif
}

static int64_t tcg_get_icount_limit(void)
{
    int64_t deadline;

    if (replay_mode != REPLAY_MODE_PLAY) {
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        /* Maintain prior (possibly buggy) behaviour where if no deadline
         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
         * nanoseconds.
         */
        if ((deadline < 0) || (deadline > INT32_MAX)) {
            deadline = INT32_MAX;
        }

        return qemu_icount_round(deadline);
    } else {
        return replay_get_instructions();
    }
}

static void handle_icount_deadline(void)
{
    if (use_icount) {
        int64_t deadline =
            qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        if (deadline == 0) {
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        }
    }
}

static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    if (use_icount) {
        int64_t count;
        int decr;
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                     + cpu->icount_extra);
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        count = tcg_get_icount_limit();
        timers_state.qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        cpu->icount_decr.u16.low = decr;
        cpu->icount_extra = count;
    }
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    tcg_time += profile_getclock() - ti;
#endif
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag.  */
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                     + cpu->icount_extra);
        cpu->icount_decr.u32 = 0;
        cpu->icount_extra = 0;
        replay_account_executed_instructions();
    }
    return ret;
}
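
/*
 * Worked example (not from the original source): the instruction budget
 * is split between the 16-bit icount_decr.u16.low field, which the
 * translated code decrements directly, and icount_extra for the rest.
 * A limit of 100000 instructions becomes decr = 0xffff (65535) and
 * icount_extra = 34465; the execution loop inside cpu_exec() then
 * refills u16.low from icount_extra as it drains.
 */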

/* Destroy any remaining vCPUs which have been unplugged and have
 * finished running
 */
static void deal_with_unplugged_cpus(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (cpu->unplug && !cpu_can_run(cpu)) {
            qemu_tcg_destroy_vcpu(cpu);
            cpu->created = false;
            qemu_cond_signal(&qemu_cpu_cond);
            break;
        }
    }
}

static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
        cpu->can_do_io = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            qemu_wait_io_event_common(cpu);
        }
    }

    /* process any pending work */
    atomic_mb_set(&exit_request, 1);

    cpu = first_cpu;

    while (1) {
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        if (!cpu) {
            cpu = first_cpu;
        }

        for (; cpu != NULL && !exit_request; cpu = CPU_NEXT(cpu)) {

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;
                r = tcg_cpu_exec(cpu);
                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                }
            } else if (cpu->stop || cpu->stopped) {
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

        } /* for cpu.. */

        /* Pairs with smp_wmb in qemu_cpu_kick.  */
        atomic_mb_set(&exit_request, 0);

        handle_icount_deadline();

        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
        deal_with_unplugged_cpus();
    }

    return NULL;
}
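
/*
 * Note (not from the original source): this is the single-threaded TCG
 * scheduler.  One host thread round-robins every vCPU via CPU_NEXT(),
 * runs each until it exits or exit_request is raised, then services
 * I/O events and unplug requests before starting the next pass.  MTTCG,
 * once enabled, is intended to replace this with one such loop per vCPU
 * thread.
 */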

static void *qemu_hax_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;
    qemu_thread_get_self(cpu->thread);
    qemu_mutex_lock(&qemu_global_mutex);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->halted = 0;
    current_cpu = cpu;

    hax_init_vcpu(cpu);
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        if (cpu_can_run(cpu)) {
            r = hax_smp_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }

        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
#ifdef _WIN32
        SleepEx(0, TRUE);
#endif
        qemu_wait_io_event_common(cpu);
    }
    return NULL;
}

#ifdef _WIN32
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
#endif

static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    if (cpu->thread_kicked) {
        return;
    }
    cpu->thread_kicked = true;
    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    if (!qemu_cpu_is_self(cpu)) {
        if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
            fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
                    __func__, GetLastError());
            exit(1);
        }
    }
#endif
}

static void qemu_cpu_kick_no_halt(void)
{
    CPUState *cpu;
    /* Ensure whatever caused the exit has reached the CPU threads before
     * writing exit_request.
     */
    atomic_mb_set(&exit_request, 1);
    cpu = atomic_mb_read(&tcg_current_cpu);
    if (cpu) {
        cpu_exit(cpu);
    }
}

void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (tcg_enabled()) {
        qemu_cpu_kick_no_halt();
    } else {
        if (hax_enabled()) {
            /*
             * FIXME: race condition with the exit_request check in
             * hax_vcpu_hax_exec
             */
            cpu->exit_request = 1;
        }
        qemu_cpu_kick_thread(cpu);
    }
}

void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}

bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}

bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}

static __thread bool iothread_locked = false;

bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}

void qemu_mutex_lock_iothread(void)
{
    atomic_inc(&iothread_requesting_mutex);
    /* In the simple case there is no need to bump the VCPU thread out of
     * TCG code execution.
     */
    if (!tcg_enabled() || qemu_in_vcpu_thread() ||
        !first_cpu || !first_cpu->created) {
        qemu_mutex_lock(&qemu_global_mutex);
        atomic_dec(&iothread_requesting_mutex);
    } else {
        if (qemu_mutex_trylock(&qemu_global_mutex)) {
            qemu_cpu_kick_no_halt();
            qemu_mutex_lock(&qemu_global_mutex);
        }
        atomic_dec(&iothread_requesting_mutex);
        qemu_cond_broadcast(&qemu_io_proceeded_cond);
    }
    iothread_locked = true;
}

void qemu_mutex_unlock_iothread(void)
{
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
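
/*
 * Note (not from the original source): the trylock dance above exists
 * because the TCG thread holds qemu_global_mutex (the BQL) while it
 * executes translated code.  If the lock is contended, the caller kicks
 * the TCG thread out of its execution loop with qemu_cpu_kick_no_halt()
 * so the BQL is released promptly, then blocks on the lock as usual.
 */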

static bool all_vcpus_paused(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu->stopped) {
            return false;
        }
    }

    return true;
}

void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        cpu->stop = true;
        qemu_cpu_kick(cpu);
    }

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
        if (!kvm_enabled()) {
            CPU_FOREACH(cpu) {
                cpu->stop = false;
                cpu->stopped = true;
            }
            return;
        }
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }
}

void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}

void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}

void cpu_remove(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
}

void cpu_remove_sync(CPUState *cpu)
{
    cpu_remove(cpu);
    while (cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

/* Size of the temporary buffer used when forming a thread name */
#define VCPU_THREAD_NAME_SIZE 16

static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *tcg_halt_cond;
    static QemuThread *tcg_cpu_thread;

    /* share a single thread for all cpus with TCG */
    if (!tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);
        tcg_halt_cond = cpu->halt_cond;
        snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);
        qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                           cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
        tcg_cpu_thread = cpu->thread;
    } else {
        cpu->thread = tcg_cpu_thread;
        cpu->halt_cond = tcg_halt_cond;
    }
}

static void qemu_hax_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);

    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
    cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

static void qemu_kvm_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

static void qemu_dummy_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
                       QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

void qemu_init_vcpu(CPUState *cpu)
{
    cpu->nr_cores = smp_cores;
    cpu->nr_threads = smp_threads;
    cpu->stopped = true;

    if (!cpu->as) {
        /* If the target cpu hasn't set up any address spaces itself,
         * give it the default one.
         */
        AddressSpace *as = address_space_init_shareable(cpu->memory,
                                                        "cpu-memory");
        cpu->num_ases = 1;
        cpu_address_space_init(cpu, as, 0);
    }

    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
    } else if (hax_enabled()) {
        qemu_hax_start_vcpu(cpu);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }
}

void cpu_stop_current(void)
{
    if (current_cpu) {
        current_cpu->stop = false;
        current_cpu->stopped = true;
        cpu_exit(current_cpu);
        qemu_cond_broadcast(&qemu_pause_cond);
    }
}

int vm_stop(RunState state)
{
    if (qemu_in_vcpu_thread()) {
        qemu_system_vmstop_request_prepare();
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return 0;
    }

    return do_vm_stop(state);
}

/**
 * Prepare for (re)starting the VM.
 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
 * running or in case of an error condition), 0 otherwise.
 */
int vm_prepare_start(void)
{
    RunState requested;
    int res = 0;

    qemu_vmstop_requested(&requested);
    if (runstate_is_running() && requested == RUN_STATE__MAX) {
        return -1;
    }

    /* Ensure that a STOP/RESUME pair of events is emitted if a
     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
     * example, according to documentation is always followed by
     * the STOP event.
     */
    if (runstate_is_running()) {
        qapi_event_send_stop(&error_abort);
        res = -1;
    } else {
        replay_enable_events();
        cpu_enable_ticks();
        runstate_set(RUN_STATE_RUNNING);
        vm_state_notify(1, RUN_STATE_RUNNING);
    }

    /* We are sending this now, but the CPUs will be resumed shortly after */
    qapi_event_send_resume(&error_abort);
    return res;
}

void vm_start(void)
{
    if (!vm_prepare_start()) {
        resume_all_vcpus();
    }
}

/* Does a state transition even if the VM is already stopped;
   the current state is forgotten forever */
int vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        return vm_stop(state);
    } else {
        runstate_set(state);

        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
        return bdrv_flush_all();
    }
}

void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still lack it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}

CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
#elif defined(TARGET_TRICORE)
        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
        CPUTriCoreState *env = &tricore_cpu->env;
#endif

        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->arch = CPU_INFO_ARCH_X86;
        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->arch = CPU_INFO_ARCH_PPC;
        info->value->u.ppc.nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->arch = CPU_INFO_ARCH_SPARC;
        info->value->u.q_sparc.pc = env->pc;
        info->value->u.q_sparc.npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->arch = CPU_INFO_ARCH_MIPS;
        info->value->u.q_mips.PC = env->active_tc.PC;
#elif defined(TARGET_TRICORE)
        info->value->arch = CPU_INFO_ARCH_TRICORE;
        info->value->u.tricore.PC = env->PC;
#else
        info->value->arch = CPU_INFO_ARCH_OTHER;
#endif

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

void qmp_memsave(int64_t addr, int64_t size, const char *filename,
                 bool has_cpu, int64_t cpu_index, Error **errp)
{
    FILE *f;
    uint32_t l;
    CPUState *cpu;
    uint8_t buf[1024];
    int64_t orig_addr = addr, orig_size = size;

    if (!has_cpu) {
        cpu_index = 0;
    }

    cpu = qemu_get_cpu(cpu_index);
    if (cpu == NULL) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
                   "a CPU number");
        return;
    }

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
                             " specified", orig_addr, orig_size);
            goto exit;
        }
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
                  Error **errp)
{
    FILE *f;
    uint32_t l;
    uint8_t buf[1024];

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        cpu_physical_memory_read(addr, buf, l);
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}

void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
{
    if (!use_icount) {
        return;
    }

    cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
                (cpu_get_clock() - cpu_get_icount()) / SCALE_MS);
    if (icount_align_option) {
        cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay / SCALE_MS);
        cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n",
                    max_advance / SCALE_MS);
    } else {
        cpu_fprintf(f, "Max guest delay NA\n");
        cpu_fprintf(f, "Max guest advance NA\n");
    }
}