]> git.proxmox.com Git - mirror_qemu.git/blob - cpus.c
KVM: do not use sigtimedwait to catch SIGBUS
[mirror_qemu.git] / cpus.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
28 #include "qemu/config-file.h"
29 #include "cpu.h"
30 #include "monitor/monitor.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qemu/error-report.h"
33 #include "sysemu/sysemu.h"
34 #include "sysemu/block-backend.h"
35 #include "exec/gdbstub.h"
36 #include "sysemu/dma.h"
37 #include "sysemu/hw_accel.h"
38 #include "sysemu/kvm.h"
39 #include "sysemu/hax.h"
40 #include "qmp-commands.h"
41 #include "exec/exec-all.h"
42
43 #include "qemu/thread.h"
44 #include "sysemu/cpus.h"
45 #include "sysemu/qtest.h"
46 #include "qemu/main-loop.h"
47 #include "qemu/bitmap.h"
48 #include "qemu/seqlock.h"
49 #include "tcg.h"
50 #include "qapi-event.h"
51 #include "hw/nmi.h"
52 #include "sysemu/replay.h"
53
54 #ifdef CONFIG_LINUX
55
56 #include <sys/prctl.h>
57
58 #ifndef PR_MCE_KILL
59 #define PR_MCE_KILL 33
60 #endif
61
62 #ifndef PR_MCE_KILL_SET
63 #define PR_MCE_KILL_SET 1
64 #endif
65
66 #ifndef PR_MCE_KILL_EARLY
67 #define PR_MCE_KILL_EARLY 1
68 #endif
69
70 #endif /* CONFIG_LINUX */
71
72 int64_t max_delay;
73 int64_t max_advance;
74
75 /* vcpu throttling controls */
76 static QEMUTimer *throttle_timer;
77 static unsigned int throttle_percentage;
78
79 #define CPU_THROTTLE_PCT_MIN 1
80 #define CPU_THROTTLE_PCT_MAX 99
81 #define CPU_THROTTLE_TIMESLICE_NS 10000000
82
83 bool cpu_is_stopped(CPUState *cpu)
84 {
85 return cpu->stopped || !runstate_is_running();
86 }
87
88 static bool cpu_thread_is_idle(CPUState *cpu)
89 {
90 if (cpu->stop || cpu->queued_work_first) {
91 return false;
92 }
93 if (cpu_is_stopped(cpu)) {
94 return true;
95 }
96 if (!cpu->halted || cpu_has_work(cpu) ||
97 kvm_halt_in_kernel()) {
98 return false;
99 }
100 return true;
101 }
102
103 static bool all_cpu_threads_idle(void)
104 {
105 CPUState *cpu;
106
107 CPU_FOREACH(cpu) {
108 if (!cpu_thread_is_idle(cpu)) {
109 return false;
110 }
111 }
112 return true;
113 }
114
115 /***********************************************************/
116 /* guest cycle counter */
117
118 /* Protected by TimersState seqlock */
119
120 static bool icount_sleep = true;
121 static int64_t vm_clock_warp_start = -1;
122 /* Conversion factor from emulated instructions to virtual clock ticks. */
123 static int icount_time_shift;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
125 #define MAX_ICOUNT_SHIFT 10
126
127 static QEMUTimer *icount_rt_timer;
128 static QEMUTimer *icount_vm_timer;
129 static QEMUTimer *icount_warp_timer;
130
131 typedef struct TimersState {
132 /* Protected by BQL. */
133 int64_t cpu_ticks_prev;
134 int64_t cpu_ticks_offset;
135
136 /* cpu_clock_offset can be read out of BQL, so protect it with
137 * this lock.
138 */
139 QemuSeqLock vm_clock_seqlock;
140 int64_t cpu_clock_offset;
141 int32_t cpu_ticks_enabled;
142 int64_t dummy;
143
144 /* Compensate for varying guest execution speed. */
145 int64_t qemu_icount_bias;
146 /* Only written by TCG thread */
147 int64_t qemu_icount;
148 } TimersState;
149
150 static TimersState timers_state;
151 bool mttcg_enabled;
152
153 /*
154 * We default to false if we know other options have been enabled
155 * which are currently incompatible with MTTCG. Otherwise when each
156 * guest (target) has been updated to support:
157 * - atomic instructions
158 * - memory ordering primitives (barriers)
159 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
160 *
161 * Once a guest architecture has been converted to the new primitives
162 * there are two remaining limitations to check.
163 *
164 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
165 * - The host must have a stronger memory order than the guest
166 *
167 * It may be possible in future to support strong guests on weak hosts
168 * but that will require tagging all load/stores in a guest with their
169 * implicit memory order requirements which would likely slow things
170 * down a lot.
171 */
172
/* Return true when every ordering bit in TCG_GUEST_DEFAULT_MO is also set
 * in TCG_TARGET_DEFAULT_MO.  If either macro is undefined the answer is
 * false.
 */
static bool check_tcg_memory_orders_compatible(void)
{
#if !defined(TCG_GUEST_DEFAULT_MO) || !defined(TCG_TARGET_DEFAULT_MO)
    return false;
#else
    return !(TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO);
#endif
}
181
182 static bool default_mttcg_enabled(void)
183 {
184 QemuOpts *icount_opts = qemu_find_opts_singleton("icount");
185 const char *rr = qemu_opt_get(icount_opts, "rr");
186
187 if (rr || TCG_OVERSIZED_GUEST) {
188 return false;
189 } else {
190 #ifdef TARGET_SUPPORTS_MTTCG
191 return check_tcg_memory_orders_compatible();
192 #else
193 return false;
194 #endif
195 }
196 }
197
198 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
199 {
200 const char *t = qemu_opt_get(opts, "thread");
201 if (t) {
202 if (strcmp(t, "multi") == 0) {
203 if (TCG_OVERSIZED_GUEST) {
204 error_setg(errp, "No MTTCG when guest word size > hosts");
205 } else {
206 if (!check_tcg_memory_orders_compatible()) {
207 error_report("Guest expects a stronger memory ordering "
208 "than the host provides");
209 error_printf("This may cause strange/hard to debug errors");
210 }
211 mttcg_enabled = true;
212 }
213 } else if (strcmp(t, "single") == 0) {
214 mttcg_enabled = false;
215 } else {
216 error_setg(errp, "Invalid 'thread' setting %s", t);
217 }
218 } else {
219 mttcg_enabled = default_mttcg_enabled();
220 }
221 }
222
223 int64_t cpu_get_icount_raw(void)
224 {
225 int64_t icount;
226 CPUState *cpu = current_cpu;
227
228 icount = timers_state.qemu_icount;
229 if (cpu) {
230 if (!cpu->can_do_io) {
231 fprintf(stderr, "Bad icount read\n");
232 exit(1);
233 }
234 icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
235 }
236 return icount;
237 }
238
239 /* Return the virtual CPU time, based on the instruction counter. */
240 static int64_t cpu_get_icount_locked(void)
241 {
242 int64_t icount = cpu_get_icount_raw();
243 return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
244 }
245
246 int64_t cpu_get_icount(void)
247 {
248 int64_t icount;
249 unsigned start;
250
251 do {
252 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
253 icount = cpu_get_icount_locked();
254 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
255
256 return icount;
257 }
258
259 int64_t cpu_icount_to_ns(int64_t icount)
260 {
261 return icount << icount_time_shift;
262 }
263
264 /* return the time elapsed in VM between vm_start and vm_stop. Unless
265 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
266 * counter.
267 *
268 * Caller must hold the BQL
269 */
270 int64_t cpu_get_ticks(void)
271 {
272 int64_t ticks;
273
274 if (use_icount) {
275 return cpu_get_icount();
276 }
277
278 ticks = timers_state.cpu_ticks_offset;
279 if (timers_state.cpu_ticks_enabled) {
280 ticks += cpu_get_host_ticks();
281 }
282
283 if (timers_state.cpu_ticks_prev > ticks) {
284 /* Note: non increasing ticks may happen if the host uses
285 software suspend */
286 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
287 ticks = timers_state.cpu_ticks_prev;
288 }
289
290 timers_state.cpu_ticks_prev = ticks;
291 return ticks;
292 }
293
294 static int64_t cpu_get_clock_locked(void)
295 {
296 int64_t time;
297
298 time = timers_state.cpu_clock_offset;
299 if (timers_state.cpu_ticks_enabled) {
300 time += get_clock();
301 }
302
303 return time;
304 }
305
306 /* Return the monotonic time elapsed in VM, i.e.,
307 * the time between vm_start and vm_stop
308 */
309 int64_t cpu_get_clock(void)
310 {
311 int64_t ti;
312 unsigned start;
313
314 do {
315 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
316 ti = cpu_get_clock_locked();
317 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
318
319 return ti;
320 }
321
322 /* enable cpu_get_ticks()
323 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
324 */
325 void cpu_enable_ticks(void)
326 {
327 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
328 seqlock_write_begin(&timers_state.vm_clock_seqlock);
329 if (!timers_state.cpu_ticks_enabled) {
330 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
331 timers_state.cpu_clock_offset -= get_clock();
332 timers_state.cpu_ticks_enabled = 1;
333 }
334 seqlock_write_end(&timers_state.vm_clock_seqlock);
335 }
336
337 /* disable cpu_get_ticks() : the clock is stopped. You must not call
338 * cpu_get_ticks() after that.
339 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
340 */
341 void cpu_disable_ticks(void)
342 {
343 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
344 seqlock_write_begin(&timers_state.vm_clock_seqlock);
345 if (timers_state.cpu_ticks_enabled) {
346 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
347 timers_state.cpu_clock_offset = cpu_get_clock_locked();
348 timers_state.cpu_ticks_enabled = 0;
349 }
350 seqlock_write_end(&timers_state.vm_clock_seqlock);
351 }
352
353 /* Correlation between real and virtual time is always going to be
354 fairly approximate, so ignore small variation.
355 When the guest is idle real and virtual time will be aligned in
356 the IO wait loop. */
357 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
358
359 static void icount_adjust(void)
360 {
361 int64_t cur_time;
362 int64_t cur_icount;
363 int64_t delta;
364
365 /* Protected by TimersState mutex. */
366 static int64_t last_delta;
367
368 /* If the VM is not running, then do nothing. */
369 if (!runstate_is_running()) {
370 return;
371 }
372
373 seqlock_write_begin(&timers_state.vm_clock_seqlock);
374 cur_time = cpu_get_clock_locked();
375 cur_icount = cpu_get_icount_locked();
376
377 delta = cur_icount - cur_time;
378 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
379 if (delta > 0
380 && last_delta + ICOUNT_WOBBLE < delta * 2
381 && icount_time_shift > 0) {
382 /* The guest is getting too far ahead. Slow time down. */
383 icount_time_shift--;
384 }
385 if (delta < 0
386 && last_delta - ICOUNT_WOBBLE > delta * 2
387 && icount_time_shift < MAX_ICOUNT_SHIFT) {
388 /* The guest is getting too far behind. Speed time up. */
389 icount_time_shift++;
390 }
391 last_delta = delta;
392 timers_state.qemu_icount_bias = cur_icount
393 - (timers_state.qemu_icount << icount_time_shift);
394 seqlock_write_end(&timers_state.vm_clock_seqlock);
395 }
396
397 static void icount_adjust_rt(void *opaque)
398 {
399 timer_mod(icount_rt_timer,
400 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
401 icount_adjust();
402 }
403
404 static void icount_adjust_vm(void *opaque)
405 {
406 timer_mod(icount_vm_timer,
407 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
408 NANOSECONDS_PER_SECOND / 10);
409 icount_adjust();
410 }
411
412 static int64_t qemu_icount_round(int64_t count)
413 {
414 return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
415 }
416
417 static void icount_warp_rt(void)
418 {
419 unsigned seq;
420 int64_t warp_start;
421
422 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
423 * changes from -1 to another value, so the race here is okay.
424 */
425 do {
426 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
427 warp_start = vm_clock_warp_start;
428 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
429
430 if (warp_start == -1) {
431 return;
432 }
433
434 seqlock_write_begin(&timers_state.vm_clock_seqlock);
435 if (runstate_is_running()) {
436 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
437 cpu_get_clock_locked());
438 int64_t warp_delta;
439
440 warp_delta = clock - vm_clock_warp_start;
441 if (use_icount == 2) {
442 /*
443 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
444 * far ahead of real time.
445 */
446 int64_t cur_icount = cpu_get_icount_locked();
447 int64_t delta = clock - cur_icount;
448 warp_delta = MIN(warp_delta, delta);
449 }
450 timers_state.qemu_icount_bias += warp_delta;
451 }
452 vm_clock_warp_start = -1;
453 seqlock_write_end(&timers_state.vm_clock_seqlock);
454
455 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
456 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
457 }
458 }
459
/* Callback of icount_warp_timer; @opaque is unused. */
static void icount_timer_cb(void *opaque)
{
    /* The timer already synchronizes with CHECKPOINT_CLOCK_VIRTUAL_RT,
     * so no extra checkpoint is needed before warping.
     */
    icount_warp_rt();
}
467
468 void qtest_clock_warp(int64_t dest)
469 {
470 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
471 AioContext *aio_context;
472 assert(qtest_enabled());
473 aio_context = qemu_get_aio_context();
474 while (clock < dest) {
475 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
476 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
477
478 seqlock_write_begin(&timers_state.vm_clock_seqlock);
479 timers_state.qemu_icount_bias += warp;
480 seqlock_write_end(&timers_state.vm_clock_seqlock);
481
482 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
483 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
484 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
485 }
486 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
487 }
488
489 void qemu_start_warp_timer(void)
490 {
491 int64_t clock;
492 int64_t deadline;
493
494 if (!use_icount) {
495 return;
496 }
497
498 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
499 * do not fire, so computing the deadline does not make sense.
500 */
501 if (!runstate_is_running()) {
502 return;
503 }
504
505 /* warp clock deterministically in record/replay mode */
506 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
507 return;
508 }
509
510 if (!all_cpu_threads_idle()) {
511 return;
512 }
513
514 if (qtest_enabled()) {
515 /* When testing, qtest commands advance icount. */
516 return;
517 }
518
519 /* We want to use the earliest deadline from ALL vm_clocks */
520 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
521 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
522 if (deadline < 0) {
523 static bool notified;
524 if (!icount_sleep && !notified) {
525 error_report("WARNING: icount sleep disabled and no active timers");
526 notified = true;
527 }
528 return;
529 }
530
531 if (deadline > 0) {
532 /*
533 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
534 * sleep. Otherwise, the CPU might be waiting for a future timer
535 * interrupt to wake it up, but the interrupt never comes because
536 * the vCPU isn't running any insns and thus doesn't advance the
537 * QEMU_CLOCK_VIRTUAL.
538 */
539 if (!icount_sleep) {
540 /*
541 * We never let VCPUs sleep in no sleep icount mode.
542 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
543 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
544 * It is useful when we want a deterministic execution time,
545 * isolated from host latencies.
546 */
547 seqlock_write_begin(&timers_state.vm_clock_seqlock);
548 timers_state.qemu_icount_bias += deadline;
549 seqlock_write_end(&timers_state.vm_clock_seqlock);
550 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
551 } else {
552 /*
553 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
554 * "real" time, (related to the time left until the next event) has
555 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
556 * This avoids that the warps are visible externally; for example,
557 * you will not be sending network packets continuously instead of
558 * every 100ms.
559 */
560 seqlock_write_begin(&timers_state.vm_clock_seqlock);
561 if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
562 vm_clock_warp_start = clock;
563 }
564 seqlock_write_end(&timers_state.vm_clock_seqlock);
565 timer_mod_anticipate(icount_warp_timer, clock + deadline);
566 }
567 } else if (deadline == 0) {
568 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
569 }
570 }
571
572 static void qemu_account_warp_timer(void)
573 {
574 if (!use_icount || !icount_sleep) {
575 return;
576 }
577
578 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
579 * do not fire, so computing the deadline does not make sense.
580 */
581 if (!runstate_is_running()) {
582 return;
583 }
584
585 /* warp clock deterministically in record/replay mode */
586 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
587 return;
588 }
589
590 timer_del(icount_warp_timer);
591 icount_warp_rt();
592 }
593
594 static bool icount_state_needed(void *opaque)
595 {
596 return use_icount;
597 }
598
599 /*
600 * This is a subsection for icount migration.
601 */
602 static const VMStateDescription icount_vmstate_timers = {
603 .name = "timer/icount",
604 .version_id = 1,
605 .minimum_version_id = 1,
606 .needed = icount_state_needed,
607 .fields = (VMStateField[]) {
608 VMSTATE_INT64(qemu_icount_bias, TimersState),
609 VMSTATE_INT64(qemu_icount, TimersState),
610 VMSTATE_END_OF_LIST()
611 }
612 };
613
614 static const VMStateDescription vmstate_timers = {
615 .name = "timer",
616 .version_id = 2,
617 .minimum_version_id = 1,
618 .fields = (VMStateField[]) {
619 VMSTATE_INT64(cpu_ticks_offset, TimersState),
620 VMSTATE_INT64(dummy, TimersState),
621 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
622 VMSTATE_END_OF_LIST()
623 },
624 .subsections = (const VMStateDescription*[]) {
625 &icount_vmstate_timers,
626 NULL
627 }
628 };
629
630 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
631 {
632 double pct;
633 double throttle_ratio;
634 long sleeptime_ns;
635
636 if (!cpu_throttle_get_percentage()) {
637 return;
638 }
639
640 pct = (double)cpu_throttle_get_percentage()/100;
641 throttle_ratio = pct / (1 - pct);
642 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
643
644 qemu_mutex_unlock_iothread();
645 atomic_set(&cpu->throttle_thread_scheduled, 0);
646 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
647 qemu_mutex_lock_iothread();
648 }
649
650 static void cpu_throttle_timer_tick(void *opaque)
651 {
652 CPUState *cpu;
653 double pct;
654
655 /* Stop the timer if needed */
656 if (!cpu_throttle_get_percentage()) {
657 return;
658 }
659 CPU_FOREACH(cpu) {
660 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
661 async_run_on_cpu(cpu, cpu_throttle_thread,
662 RUN_ON_CPU_NULL);
663 }
664 }
665
666 pct = (double)cpu_throttle_get_percentage()/100;
667 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
668 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
669 }
670
671 void cpu_throttle_set(int new_throttle_pct)
672 {
673 /* Ensure throttle percentage is within valid range */
674 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
675 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
676
677 atomic_set(&throttle_percentage, new_throttle_pct);
678
679 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
680 CPU_THROTTLE_TIMESLICE_NS);
681 }
682
683 void cpu_throttle_stop(void)
684 {
685 atomic_set(&throttle_percentage, 0);
686 }
687
/* Return true while a non-zero throttle percentage is set. */
bool cpu_throttle_active(void)
{
    return cpu_throttle_get_percentage() ? true : false;
}
692
693 int cpu_throttle_get_percentage(void)
694 {
695 return atomic_read(&throttle_percentage);
696 }
697
698 void cpu_ticks_init(void)
699 {
700 seqlock_init(&timers_state.vm_clock_seqlock);
701 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
702 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
703 cpu_throttle_timer_tick, NULL);
704 }
705
706 void configure_icount(QemuOpts *opts, Error **errp)
707 {
708 const char *option;
709 char *rem_str = NULL;
710
711 option = qemu_opt_get(opts, "shift");
712 if (!option) {
713 if (qemu_opt_get(opts, "align") != NULL) {
714 error_setg(errp, "Please specify shift option when using align");
715 }
716 return;
717 }
718
719 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
720 if (icount_sleep) {
721 icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
722 icount_timer_cb, NULL);
723 }
724
725 icount_align_option = qemu_opt_get_bool(opts, "align", false);
726
727 if (icount_align_option && !icount_sleep) {
728 error_setg(errp, "align=on and sleep=off are incompatible");
729 }
730 if (strcmp(option, "auto") != 0) {
731 errno = 0;
732 icount_time_shift = strtol(option, &rem_str, 0);
733 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
734 error_setg(errp, "icount: Invalid shift value");
735 }
736 use_icount = 1;
737 return;
738 } else if (icount_align_option) {
739 error_setg(errp, "shift=auto and align=on are incompatible");
740 } else if (!icount_sleep) {
741 error_setg(errp, "shift=auto and sleep=off are incompatible");
742 }
743
744 use_icount = 2;
745
746 /* 125MIPS seems a reasonable initial guess at the guest speed.
747 It will be corrected fairly quickly anyway. */
748 icount_time_shift = 3;
749
750 /* Have both realtime and virtual time triggers for speed adjustment.
751 The realtime trigger catches emulated time passing too slowly,
752 the virtual time trigger catches emulated time passing too fast.
753 Realtime triggers occur even when idle, so use them less frequently
754 than VM triggers. */
755 icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
756 icount_adjust_rt, NULL);
757 timer_mod(icount_rt_timer,
758 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
759 icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
760 icount_adjust_vm, NULL);
761 timer_mod(icount_vm_timer,
762 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
763 NANOSECONDS_PER_SECOND / 10);
764 }
765
766 /***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed if all vCPUs are idle and restarted again once
 * idleness is complete.
 */
777
778 static QEMUTimer *tcg_kick_vcpu_timer;
779 static CPUState *tcg_current_rr_cpu;
780
781 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
782
783 static inline int64_t qemu_tcg_next_kick(void)
784 {
785 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
786 }
787
788 /* Kick the currently round-robin scheduled vCPU */
789 static void qemu_cpu_kick_rr_cpu(void)
790 {
791 CPUState *cpu;
792 do {
793 cpu = atomic_mb_read(&tcg_current_rr_cpu);
794 if (cpu) {
795 cpu_exit(cpu);
796 }
797 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
798 }
799
800 static void kick_tcg_thread(void *opaque)
801 {
802 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
803 qemu_cpu_kick_rr_cpu();
804 }
805
806 static void start_tcg_kick_timer(void)
807 {
808 if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
809 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
810 kick_tcg_thread, NULL);
811 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
812 }
813 }
814
815 static void stop_tcg_kick_timer(void)
816 {
817 if (tcg_kick_vcpu_timer) {
818 timer_del(tcg_kick_vcpu_timer);
819 tcg_kick_vcpu_timer = NULL;
820 }
821 }
822
823 /***********************************************************/
824 void hw_error(const char *fmt, ...)
825 {
826 va_list ap;
827 CPUState *cpu;
828
829 va_start(ap, fmt);
830 fprintf(stderr, "qemu: hardware error: ");
831 vfprintf(stderr, fmt, ap);
832 fprintf(stderr, "\n");
833 CPU_FOREACH(cpu) {
834 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
835 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
836 }
837 va_end(ap);
838 abort();
839 }
840
841 void cpu_synchronize_all_states(void)
842 {
843 CPUState *cpu;
844
845 CPU_FOREACH(cpu) {
846 cpu_synchronize_state(cpu);
847 }
848 }
849
850 void cpu_synchronize_all_post_reset(void)
851 {
852 CPUState *cpu;
853
854 CPU_FOREACH(cpu) {
855 cpu_synchronize_post_reset(cpu);
856 }
857 }
858
859 void cpu_synchronize_all_post_init(void)
860 {
861 CPUState *cpu;
862
863 CPU_FOREACH(cpu) {
864 cpu_synchronize_post_init(cpu);
865 }
866 }
867
868 static int do_vm_stop(RunState state)
869 {
870 int ret = 0;
871
872 if (runstate_is_running()) {
873 cpu_disable_ticks();
874 pause_all_vcpus();
875 runstate_set(state);
876 vm_state_notify(0, state);
877 qapi_event_send_stop(&error_abort);
878 }
879
880 bdrv_drain_all();
881 replay_disable_events();
882 ret = bdrv_flush_all();
883
884 return ret;
885 }
886
887 static bool cpu_can_run(CPUState *cpu)
888 {
889 if (cpu->stop) {
890 return false;
891 }
892 if (cpu_is_stopped(cpu)) {
893 return false;
894 }
895 return true;
896 }
897
898 static void cpu_handle_guest_debug(CPUState *cpu)
899 {
900 gdb_set_stop_cpu(cpu);
901 qemu_system_debug_request();
902 cpu->stopped = true;
903 }
904
905 #ifdef CONFIG_LINUX
906 static void sigbus_reraise(void)
907 {
908 sigset_t set;
909 struct sigaction action;
910
911 memset(&action, 0, sizeof(action));
912 action.sa_handler = SIG_DFL;
913 if (!sigaction(SIGBUS, &action, NULL)) {
914 raise(SIGBUS);
915 sigemptyset(&set);
916 sigaddset(&set, SIGBUS);
917 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
918 }
919 perror("Failed to re-raise SIGBUS!\n");
920 abort();
921 }
922
923 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
924 {
925 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
926 sigbus_reraise();
927 }
928
929 if (current_cpu) {
930 /* Called asynchronously in VCPU thread. */
931 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
932 sigbus_reraise();
933 }
934 } else {
935 /* Called synchronously (via signalfd) in main thread. */
936 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
937 sigbus_reraise();
938 }
939 }
940 }
941
942 static void qemu_init_sigbus(void)
943 {
944 struct sigaction action;
945
946 memset(&action, 0, sizeof(action));
947 action.sa_flags = SA_SIGINFO;
948 action.sa_sigaction = sigbus_handler;
949 sigaction(SIGBUS, &action, NULL);
950
951 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
952 }
953
/* Deliberately empty handler; installed for SIG_IPI by
 * qemu_kvm_init_cpu_signals().
 */
static void dummy_signal(int sig)
{
    (void)sig;
}
957
958 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
959 {
960 int r;
961 sigset_t set;
962 struct sigaction sigact;
963
964 memset(&sigact, 0, sizeof(sigact));
965 sigact.sa_handler = dummy_signal;
966 sigaction(SIG_IPI, &sigact, NULL);
967
968 pthread_sigmask(SIG_BLOCK, NULL, &set);
969 sigdelset(&set, SIGBUS);
970 pthread_sigmask(SIG_SETMASK, &set, NULL);
971 sigdelset(&set, SIG_IPI);
972 r = kvm_set_signal_mask(cpu, &set);
973 if (r) {
974 fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
975 exit(1);
976 }
977 }
978
979 static void qemu_kvm_eat_signals(CPUState *cpu)
980 {
981 struct timespec ts = { 0, 0 };
982 siginfo_t siginfo;
983 sigset_t waitset;
984 sigset_t chkset;
985 int r;
986
987 sigemptyset(&waitset);
988 sigaddset(&waitset, SIG_IPI);
989
990 do {
991 r = sigtimedwait(&waitset, &siginfo, &ts);
992 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
993 perror("sigtimedwait");
994 exit(1);
995 }
996
997 r = sigpending(&chkset);
998 if (r == -1) {
999 perror("sigpending");
1000 exit(1);
1001 }
1002 } while (sigismember(&chkset, SIG_IPI));
1003 }
1004 #else /* !CONFIG_LINUX */
1005 static void qemu_init_sigbus(void)
1006 {
1007 }
1008
1009 static void qemu_kvm_eat_signals(CPUState *cpu)
1010 {
1011 }
1012
1013 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
1014 {
1015 }
1016 #endif /* !CONFIG_LINUX */
1017
1018 static QemuMutex qemu_global_mutex;
1019
1020 static QemuThread io_thread;
1021
1022 /* cpu creation */
1023 static QemuCond qemu_cpu_cond;
1024 /* system init */
1025 static QemuCond qemu_pause_cond;
1026
1027 void qemu_init_cpu_loop(void)
1028 {
1029 qemu_init_sigbus();
1030 qemu_cond_init(&qemu_cpu_cond);
1031 qemu_cond_init(&qemu_pause_cond);
1032 qemu_mutex_init(&qemu_global_mutex);
1033
1034 qemu_thread_get_self(&io_thread);
1035 }
1036
1037 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1038 {
1039 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1040 }
1041
1042 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1043 {
1044 if (kvm_destroy_vcpu(cpu) < 0) {
1045 error_report("kvm_destroy_vcpu failed");
1046 exit(EXIT_FAILURE);
1047 }
1048 }
1049
1050 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1051 {
1052 }
1053
1054 static void qemu_wait_io_event_common(CPUState *cpu)
1055 {
1056 atomic_mb_set(&cpu->thread_kicked, false);
1057 if (cpu->stop) {
1058 cpu->stop = false;
1059 cpu->stopped = true;
1060 qemu_cond_broadcast(&qemu_pause_cond);
1061 }
1062 process_queued_cpu_work(cpu);
1063 }
1064
1065 static bool qemu_tcg_should_sleep(CPUState *cpu)
1066 {
1067 if (mttcg_enabled) {
1068 return cpu_thread_is_idle(cpu);
1069 } else {
1070 return all_cpu_threads_idle();
1071 }
1072 }
1073
1074 static void qemu_tcg_wait_io_event(CPUState *cpu)
1075 {
1076 while (qemu_tcg_should_sleep(cpu)) {
1077 stop_tcg_kick_timer();
1078 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1079 }
1080
1081 start_tcg_kick_timer();
1082
1083 qemu_wait_io_event_common(cpu);
1084 }
1085
1086 static void qemu_kvm_wait_io_event(CPUState *cpu)
1087 {
1088 while (cpu_thread_is_idle(cpu)) {
1089 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1090 }
1091
1092 qemu_kvm_eat_signals(cpu);
1093 qemu_wait_io_event_common(cpu);
1094 }
1095
1096 static void *qemu_kvm_cpu_thread_fn(void *arg)
1097 {
1098 CPUState *cpu = arg;
1099 int r;
1100
1101 rcu_register_thread();
1102
1103 qemu_mutex_lock_iothread();
1104 qemu_thread_get_self(cpu->thread);
1105 cpu->thread_id = qemu_get_thread_id();
1106 cpu->can_do_io = 1;
1107 current_cpu = cpu;
1108
1109 r = kvm_init_vcpu(cpu);
1110 if (r < 0) {
1111 fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
1112 exit(1);
1113 }
1114
1115 qemu_kvm_init_cpu_signals(cpu);
1116
1117 /* signal CPU creation */
1118 cpu->created = true;
1119 qemu_cond_signal(&qemu_cpu_cond);
1120
1121 do {
1122 if (cpu_can_run(cpu)) {
1123 r = kvm_cpu_exec(cpu);
1124 if (r == EXCP_DEBUG) {
1125 cpu_handle_guest_debug(cpu);
1126 }
1127 }
1128 qemu_kvm_wait_io_event(cpu);
1129 } while (!cpu->unplug || cpu_can_run(cpu));
1130
1131 qemu_kvm_destroy_vcpu(cpu);
1132 cpu->created = false;
1133 qemu_cond_signal(&qemu_cpu_cond);
1134 qemu_mutex_unlock_iothread();
1135 return NULL;
1136 }
1137
1138 static void *qemu_dummy_cpu_thread_fn(void *arg)
1139 {
1140 #ifdef _WIN32
1141 fprintf(stderr, "qtest is not supported under Windows\n");
1142 exit(1);
1143 #else
1144 CPUState *cpu = arg;
1145 sigset_t waitset;
1146 int r;
1147
1148 rcu_register_thread();
1149
1150 qemu_mutex_lock_iothread();
1151 qemu_thread_get_self(cpu->thread);
1152 cpu->thread_id = qemu_get_thread_id();
1153 cpu->can_do_io = 1;
1154 current_cpu = cpu;
1155
1156 sigemptyset(&waitset);
1157 sigaddset(&waitset, SIG_IPI);
1158
1159 /* signal CPU creation */
1160 cpu->created = true;
1161 qemu_cond_signal(&qemu_cpu_cond);
1162
1163 while (1) {
1164 qemu_mutex_unlock_iothread();
1165 do {
1166 int sig;
1167 r = sigwait(&waitset, &sig);
1168 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1169 if (r == -1) {
1170 perror("sigwait");
1171 exit(1);
1172 }
1173 qemu_mutex_lock_iothread();
1174 qemu_wait_io_event_common(cpu);
1175 }
1176
1177 return NULL;
1178 #endif
1179 }
1180
1181 static int64_t tcg_get_icount_limit(void)
1182 {
1183 int64_t deadline;
1184
1185 if (replay_mode != REPLAY_MODE_PLAY) {
1186 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1187
1188 /* Maintain prior (possibly buggy) behaviour where if no deadline
1189 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1190 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1191 * nanoseconds.
1192 */
1193 if ((deadline < 0) || (deadline > INT32_MAX)) {
1194 deadline = INT32_MAX;
1195 }
1196
1197 return qemu_icount_round(deadline);
1198 } else {
1199 return replay_get_instructions();
1200 }
1201 }
1202
1203 static void handle_icount_deadline(void)
1204 {
1205 if (use_icount) {
1206 int64_t deadline =
1207 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1208
1209 if (deadline == 0) {
1210 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1211 }
1212 }
1213 }
1214
1215 static int tcg_cpu_exec(CPUState *cpu)
1216 {
1217 int ret;
1218 #ifdef CONFIG_PROFILER
1219 int64_t ti;
1220 #endif
1221
1222 #ifdef CONFIG_PROFILER
1223 ti = profile_getclock();
1224 #endif
1225 if (use_icount) {
1226 int64_t count;
1227 int decr;
1228 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1229 + cpu->icount_extra);
1230 cpu->icount_decr.u16.low = 0;
1231 cpu->icount_extra = 0;
1232 count = tcg_get_icount_limit();
1233 timers_state.qemu_icount += count;
1234 decr = (count > 0xffff) ? 0xffff : count;
1235 count -= decr;
1236 cpu->icount_decr.u16.low = decr;
1237 cpu->icount_extra = count;
1238 }
1239 qemu_mutex_unlock_iothread();
1240 cpu_exec_start(cpu);
1241 ret = cpu_exec(cpu);
1242 cpu_exec_end(cpu);
1243 qemu_mutex_lock_iothread();
1244 #ifdef CONFIG_PROFILER
1245 tcg_time += profile_getclock() - ti;
1246 #endif
1247 if (use_icount) {
1248 /* Fold pending instructions back into the
1249 instruction counter, and clear the interrupt flag. */
1250 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1251 + cpu->icount_extra);
1252 cpu->icount_decr.u32 = 0;
1253 cpu->icount_extra = 0;
1254 replay_account_executed_instructions();
1255 }
1256 return ret;
1257 }
1258
1259 /* Destroy any remaining vCPUs which have been unplugged and have
1260 * finished running
1261 */
1262 static void deal_with_unplugged_cpus(void)
1263 {
1264 CPUState *cpu;
1265
1266 CPU_FOREACH(cpu) {
1267 if (cpu->unplug && !cpu_can_run(cpu)) {
1268 qemu_tcg_destroy_vcpu(cpu);
1269 cpu->created = false;
1270 qemu_cond_signal(&qemu_cpu_cond);
1271 break;
1272 }
1273 }
1274 }
1275
/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 */

/*
 * Thread entry point that executes every vCPU, round-robin, from a single
 * thread.
 *
 * @arg: the CPUState the thread was created for; its ->thread is filled in
 *       with this thread's identity.
 *
 * The main loop never exits, so the trailing "return NULL" is unreachable.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    /* This one thread backs all vCPUs, so mark each of them as created. */
    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
        cpu->can_do_io = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* The previous pass ran off the end of the list: start over. */
        if (!cpu) {
            cpu = first_cpu;
        }

        /*
         * Run vCPUs in list order until one has queued work or an exit
         * request pending, or the end of the list is reached.
         */
        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            /* The virtual clock is disabled while SSTEP_NOTIMER is set. */
            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;
                r = tcg_cpu_exec(cpu);
                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    /* The atomic step runs without the iothread lock. */
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* An unplugged CPU is skipped before leaving the loop. */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        /* Acknowledge the exit request that ended the inner loop, if any. */
        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        handle_icount_deadline();

        /* With no current CPU, wait on behalf of the first CPU in the list. */
        qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
        deal_with_unplugged_cpus();
    }

    return NULL;
}
1372
1373 static void *qemu_hax_cpu_thread_fn(void *arg)
1374 {
1375 CPUState *cpu = arg;
1376 int r;
1377 qemu_thread_get_self(cpu->thread);
1378 qemu_mutex_lock(&qemu_global_mutex);
1379
1380 cpu->thread_id = qemu_get_thread_id();
1381 cpu->created = true;
1382 cpu->halted = 0;
1383 current_cpu = cpu;
1384
1385 hax_init_vcpu(cpu);
1386 qemu_cond_signal(&qemu_cpu_cond);
1387
1388 while (1) {
1389 if (cpu_can_run(cpu)) {
1390 r = hax_smp_cpu_exec(cpu);
1391 if (r == EXCP_DEBUG) {
1392 cpu_handle_guest_debug(cpu);
1393 }
1394 }
1395
1396 while (cpu_thread_is_idle(cpu)) {
1397 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1398 }
1399 #ifdef _WIN32
1400 SleepEx(0, TRUE);
1401 #endif
1402 qemu_wait_io_event_common(cpu);
1403 }
1404 return NULL;
1405 }
1406
#ifdef _WIN32
/*
 * Deliberately empty APC routine.  qemu_cpu_kick_thread() queues it on a
 * vCPU thread with QueueUserAPC(); the callback itself does no work.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
#endif
1412
1413 /* Multi-threaded TCG
1414 *
1415 * In the multi-threaded case each vCPU has its own thread. The TLS
1416 * variable current_cpu can be used deep in the code to find the
1417 * current CPUState for a given thread.
1418 */
1419
1420 static void *qemu_tcg_cpu_thread_fn(void *arg)
1421 {
1422 CPUState *cpu = arg;
1423
1424 rcu_register_thread();
1425
1426 qemu_mutex_lock_iothread();
1427 qemu_thread_get_self(cpu->thread);
1428
1429 cpu->thread_id = qemu_get_thread_id();
1430 cpu->created = true;
1431 cpu->can_do_io = 1;
1432 current_cpu = cpu;
1433 qemu_cond_signal(&qemu_cpu_cond);
1434
1435 /* process any pending work */
1436 cpu->exit_request = 1;
1437
1438 while (1) {
1439 if (cpu_can_run(cpu)) {
1440 int r;
1441 r = tcg_cpu_exec(cpu);
1442 switch (r) {
1443 case EXCP_DEBUG:
1444 cpu_handle_guest_debug(cpu);
1445 break;
1446 case EXCP_HALTED:
1447 /* during start-up the vCPU is reset and the thread is
1448 * kicked several times. If we don't ensure we go back
1449 * to sleep in the halted state we won't cleanly
1450 * start-up when the vCPU is enabled.
1451 *
1452 * cpu->halted should ensure we sleep in wait_io_event
1453 */
1454 g_assert(cpu->halted);
1455 break;
1456 case EXCP_ATOMIC:
1457 qemu_mutex_unlock_iothread();
1458 cpu_exec_step_atomic(cpu);
1459 qemu_mutex_lock_iothread();
1460 default:
1461 /* Ignore everything else? */
1462 break;
1463 }
1464 }
1465
1466 handle_icount_deadline();
1467
1468 atomic_mb_set(&cpu->exit_request, 0);
1469 qemu_tcg_wait_io_event(cpu);
1470 }
1471
1472 return NULL;
1473 }
1474
1475 static void qemu_cpu_kick_thread(CPUState *cpu)
1476 {
1477 #ifndef _WIN32
1478 int err;
1479
1480 if (cpu->thread_kicked) {
1481 return;
1482 }
1483 cpu->thread_kicked = true;
1484 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1485 if (err) {
1486 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1487 exit(1);
1488 }
1489 #else /* _WIN32 */
1490 if (!qemu_cpu_is_self(cpu)) {
1491 if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1492 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1493 __func__, GetLastError());
1494 exit(1);
1495 }
1496 }
1497 #endif
1498 }
1499
1500 void qemu_cpu_kick(CPUState *cpu)
1501 {
1502 qemu_cond_broadcast(cpu->halt_cond);
1503 if (tcg_enabled()) {
1504 cpu_exit(cpu);
1505 /* NOP unless doing single-thread RR */
1506 qemu_cpu_kick_rr_cpu();
1507 } else {
1508 if (hax_enabled()) {
1509 /*
1510 * FIXME: race condition with the exit_request check in
1511 * hax_vcpu_hax_exec
1512 */
1513 cpu->exit_request = 1;
1514 }
1515 qemu_cpu_kick_thread(cpu);
1516 }
1517 }
1518
1519 void qemu_cpu_kick_self(void)
1520 {
1521 assert(current_cpu);
1522 qemu_cpu_kick_thread(current_cpu);
1523 }
1524
/* Return true if the calling thread is the thread recorded in @cpu->thread. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1529
1530 bool qemu_in_vcpu_thread(void)
1531 {
1532 return current_cpu && qemu_cpu_is_self(current_cpu);
1533 }
1534
/*
 * Per-thread flag maintained by qemu_mutex_lock_iothread() and
 * qemu_mutex_unlock_iothread().  Static storage starts out as false.
 */
static __thread bool iothread_locked;

/* Report the calling thread's iothread_locked flag. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1541
/*
 * Acquire qemu_global_mutex on behalf of the calling thread and record
 * that fact in the per-thread iothread_locked flag.
 *
 * Asserts that the flag is not already set for this thread.
 */
void qemu_mutex_lock_iothread(void)
{
    g_assert(!qemu_mutex_iothread_locked());
    qemu_mutex_lock(&qemu_global_mutex);
    /* Set only once the mutex is actually held. */
    iothread_locked = true;
}
1548
/*
 * Release qemu_global_mutex and clear the calling thread's
 * iothread_locked flag.
 *
 * Asserts that the flag is currently set for this thread.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    /* Cleared while the mutex is still held. */
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1555
1556 static bool all_vcpus_paused(void)
1557 {
1558 CPUState *cpu;
1559
1560 CPU_FOREACH(cpu) {
1561 if (!cpu->stopped) {
1562 return false;
1563 }
1564 }
1565
1566 return true;
1567 }
1568
1569 void pause_all_vcpus(void)
1570 {
1571 CPUState *cpu;
1572
1573 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1574 CPU_FOREACH(cpu) {
1575 cpu->stop = true;
1576 qemu_cpu_kick(cpu);
1577 }
1578
1579 if (qemu_in_vcpu_thread()) {
1580 cpu_stop_current();
1581 }
1582
1583 while (!all_vcpus_paused()) {
1584 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1585 CPU_FOREACH(cpu) {
1586 qemu_cpu_kick(cpu);
1587 }
1588 }
1589 }
1590
/*
 * Clear both the stop request and the stopped state of @cpu, then kick it
 * so that it notices the change.
 */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1597
1598 void resume_all_vcpus(void)
1599 {
1600 CPUState *cpu;
1601
1602 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1603 CPU_FOREACH(cpu) {
1604 cpu_resume(cpu);
1605 }
1606 }
1607
/*
 * Request removal of @cpu: flag it as both stopping and unplugged, then
 * kick it.  Does not wait for the vCPU to go away; see cpu_remove_sync().
 */
void cpu_remove(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
}
1614
1615 void cpu_remove_sync(CPUState *cpu)
1616 {
1617 cpu_remove(cpu);
1618 while (cpu->created) {
1619 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1620 }
1621 }
1622
/* Size in bytes of the temporary buffers used to format vCPU thread names */
#define VCPU_THREAD_NAME_SIZE 16
1625
1626 static void qemu_tcg_init_vcpu(CPUState *cpu)
1627 {
1628 char thread_name[VCPU_THREAD_NAME_SIZE];
1629 static QemuCond *single_tcg_halt_cond;
1630 static QemuThread *single_tcg_cpu_thread;
1631
1632 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1633 cpu->thread = g_malloc0(sizeof(QemuThread));
1634 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1635 qemu_cond_init(cpu->halt_cond);
1636
1637 if (qemu_tcg_mttcg_enabled()) {
1638 /* create a thread per vCPU with TCG (MTTCG) */
1639 parallel_cpus = true;
1640 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1641 cpu->cpu_index);
1642
1643 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1644 cpu, QEMU_THREAD_JOINABLE);
1645
1646 } else {
1647 /* share a single thread for all cpus with TCG */
1648 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1649 qemu_thread_create(cpu->thread, thread_name,
1650 qemu_tcg_rr_cpu_thread_fn,
1651 cpu, QEMU_THREAD_JOINABLE);
1652
1653 single_tcg_halt_cond = cpu->halt_cond;
1654 single_tcg_cpu_thread = cpu->thread;
1655 }
1656 #ifdef _WIN32
1657 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1658 #endif
1659 while (!cpu->created) {
1660 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1661 }
1662 } else {
1663 /* For non-MTTCG cases we share the thread */
1664 cpu->thread = single_tcg_cpu_thread;
1665 cpu->halt_cond = single_tcg_halt_cond;
1666 }
1667 }
1668
1669 static void qemu_hax_start_vcpu(CPUState *cpu)
1670 {
1671 char thread_name[VCPU_THREAD_NAME_SIZE];
1672
1673 cpu->thread = g_malloc0(sizeof(QemuThread));
1674 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1675 qemu_cond_init(cpu->halt_cond);
1676
1677 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1678 cpu->cpu_index);
1679 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1680 cpu, QEMU_THREAD_JOINABLE);
1681 #ifdef _WIN32
1682 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1683 #endif
1684 while (!cpu->created) {
1685 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1686 }
1687 }
1688
1689 static void qemu_kvm_start_vcpu(CPUState *cpu)
1690 {
1691 char thread_name[VCPU_THREAD_NAME_SIZE];
1692
1693 cpu->thread = g_malloc0(sizeof(QemuThread));
1694 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1695 qemu_cond_init(cpu->halt_cond);
1696 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1697 cpu->cpu_index);
1698 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1699 cpu, QEMU_THREAD_JOINABLE);
1700 while (!cpu->created) {
1701 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1702 }
1703 }
1704
1705 static void qemu_dummy_start_vcpu(CPUState *cpu)
1706 {
1707 char thread_name[VCPU_THREAD_NAME_SIZE];
1708
1709 cpu->thread = g_malloc0(sizeof(QemuThread));
1710 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1711 qemu_cond_init(cpu->halt_cond);
1712 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1713 cpu->cpu_index);
1714 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1715 QEMU_THREAD_JOINABLE);
1716 while (!cpu->created) {
1717 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1718 }
1719 }
1720
1721 void qemu_init_vcpu(CPUState *cpu)
1722 {
1723 cpu->nr_cores = smp_cores;
1724 cpu->nr_threads = smp_threads;
1725 cpu->stopped = true;
1726
1727 if (!cpu->as) {
1728 /* If the target cpu hasn't set up any address spaces itself,
1729 * give it the default one.
1730 */
1731 AddressSpace *as = address_space_init_shareable(cpu->memory,
1732 "cpu-memory");
1733 cpu->num_ases = 1;
1734 cpu_address_space_init(cpu, as, 0);
1735 }
1736
1737 if (kvm_enabled()) {
1738 qemu_kvm_start_vcpu(cpu);
1739 } else if (hax_enabled()) {
1740 qemu_hax_start_vcpu(cpu);
1741 } else if (tcg_enabled()) {
1742 qemu_tcg_init_vcpu(cpu);
1743 } else {
1744 qemu_dummy_start_vcpu(cpu);
1745 }
1746 }
1747
1748 void cpu_stop_current(void)
1749 {
1750 if (current_cpu) {
1751 current_cpu->stop = false;
1752 current_cpu->stopped = true;
1753 cpu_exit(current_cpu);
1754 qemu_cond_broadcast(&qemu_pause_cond);
1755 }
1756 }
1757
1758 int vm_stop(RunState state)
1759 {
1760 if (qemu_in_vcpu_thread()) {
1761 qemu_system_vmstop_request_prepare();
1762 qemu_system_vmstop_request(state);
1763 /*
1764 * FIXME: should not return to device code in case
1765 * vm_stop() has been requested.
1766 */
1767 cpu_stop_current();
1768 return 0;
1769 }
1770
1771 return do_vm_stop(state);
1772 }
1773
1774 /**
1775 * Prepare for (re)starting the VM.
1776 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
1777 * running or in case of an error condition), 0 otherwise.
1778 */
1779 int vm_prepare_start(void)
1780 {
1781 RunState requested;
1782 int res = 0;
1783
1784 qemu_vmstop_requested(&requested);
1785 if (runstate_is_running() && requested == RUN_STATE__MAX) {
1786 return -1;
1787 }
1788
1789 /* Ensure that a STOP/RESUME pair of events is emitted if a
1790 * vmstop request was pending. The BLOCK_IO_ERROR event, for
1791 * example, according to documentation is always followed by
1792 * the STOP event.
1793 */
1794 if (runstate_is_running()) {
1795 qapi_event_send_stop(&error_abort);
1796 res = -1;
1797 } else {
1798 replay_enable_events();
1799 cpu_enable_ticks();
1800 runstate_set(RUN_STATE_RUNNING);
1801 vm_state_notify(1, RUN_STATE_RUNNING);
1802 }
1803
1804 /* We are sending this now, but the CPUs will be resumed shortly later */
1805 qapi_event_send_resume(&error_abort);
1806 return res;
1807 }
1808
/* Start the VM: resume all vCPUs when vm_prepare_start() returns 0. */
void vm_start(void)
{
    if (vm_prepare_start() != 0) {
        return;
    }
    resume_all_vcpus();
}
1815
1816 /* does a state transition even if the VM is already stopped,
1817 current state is forgotten forever */
1818 int vm_stop_force_state(RunState state)
1819 {
1820 if (runstate_is_running()) {
1821 return vm_stop(state);
1822 } else {
1823 runstate_set(state);
1824
1825 bdrv_drain_all();
1826 /* Make sure to return an error if the flush in a previous vm_stop()
1827 * failed. */
1828 return bdrv_flush_all();
1829 }
1830 }
1831
/*
 * Print the list of available CPU models to @f using @cpu_fprintf.
 *
 * Delegates to the target's cpu_list() when the target defines that
 * macro; otherwise nothing is printed.  @optarg is not used.
 */
void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}
1839
/*
 * Build the list of per-vCPU information entries, one per CPU, in CPU list
 * order.
 *
 * @errp: not written by this function.
 *
 * Returns the head of a newly allocated CpuInfoList, or NULL when there
 * are no CPUs.
 */
CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
        /* Target-specific view of the CPU, used below to read its PC. */
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
#elif defined(TARGET_TRICORE)
        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
        CPUTriCoreState *env = &tricore_cpu->env;
#endif

        /* Synchronize the CPU state before its fields are read below. */
        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        /* "current" is reported as true for first_cpu only. */
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->arch = CPU_INFO_ARCH_X86;
        /* Reported pc is eip plus the CS segment base. */
        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->arch = CPU_INFO_ARCH_PPC;
        info->value->u.ppc.nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->arch = CPU_INFO_ARCH_SPARC;
        info->value->u.q_sparc.pc = env->pc;
        info->value->u.q_sparc.npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->arch = CPU_INFO_ARCH_MIPS;
        info->value->u.q_mips.PC = env->active_tc.PC;
#elif defined(TARGET_TRICORE)
        info->value->arch = CPU_INFO_ARCH_TRICORE;
        info->value->u.tricore.PC = env->PC;
#else
        info->value->arch = CPU_INFO_ARCH_OTHER;
#endif

        /* XXX: waiting for the qapi to support GSList */
        /* Append to the tail so that entries keep CPU list order. */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}
1904
1905 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1906 bool has_cpu, int64_t cpu_index, Error **errp)
1907 {
1908 FILE *f;
1909 uint32_t l;
1910 CPUState *cpu;
1911 uint8_t buf[1024];
1912 int64_t orig_addr = addr, orig_size = size;
1913
1914 if (!has_cpu) {
1915 cpu_index = 0;
1916 }
1917
1918 cpu = qemu_get_cpu(cpu_index);
1919 if (cpu == NULL) {
1920 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1921 "a CPU number");
1922 return;
1923 }
1924
1925 f = fopen(filename, "wb");
1926 if (!f) {
1927 error_setg_file_open(errp, errno, filename);
1928 return;
1929 }
1930
1931 while (size != 0) {
1932 l = sizeof(buf);
1933 if (l > size)
1934 l = size;
1935 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1936 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1937 " specified", orig_addr, orig_size);
1938 goto exit;
1939 }
1940 if (fwrite(buf, 1, l, f) != l) {
1941 error_setg(errp, QERR_IO_ERROR);
1942 goto exit;
1943 }
1944 addr += l;
1945 size -= l;
1946 }
1947
1948 exit:
1949 fclose(f);
1950 }
1951
1952 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1953 Error **errp)
1954 {
1955 FILE *f;
1956 uint32_t l;
1957 uint8_t buf[1024];
1958
1959 f = fopen(filename, "wb");
1960 if (!f) {
1961 error_setg_file_open(errp, errno, filename);
1962 return;
1963 }
1964
1965 while (size != 0) {
1966 l = sizeof(buf);
1967 if (l > size)
1968 l = size;
1969 cpu_physical_memory_read(addr, buf, l);
1970 if (fwrite(buf, 1, l, f) != l) {
1971 error_setg(errp, QERR_IO_ERROR);
1972 goto exit;
1973 }
1974 addr += l;
1975 size -= l;
1976 }
1977
1978 exit:
1979 fclose(f);
1980 }
1981
/*
 * Forward an NMI injection request to nmi_monitor_handle() for the CPU
 * index returned by monitor_get_cpu_index(); errors are reported through
 * @errp.
 */
void qmp_inject_nmi(Error **errp)
{
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
}
1986
1987 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1988 {
1989 if (!use_icount) {
1990 return;
1991 }
1992
1993 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
1994 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
1995 if (icount_align_option) {
1996 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
1997 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
1998 } else {
1999 cpu_fprintf(f, "Max guest delay NA\n");
2000 cpu_fprintf(f, "Max guest advance NA\n");
2001 }
2002 }