]> git.proxmox.com Git - mirror_qemu.git/blob - cpus.c
cpus: define QEMUTimerListNotifyCB for QEMU system emulation
[mirror_qemu.git] / cpus.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
28 #include "qemu/config-file.h"
29 #include "cpu.h"
30 #include "monitor/monitor.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qemu/error-report.h"
33 #include "sysemu/sysemu.h"
34 #include "sysemu/block-backend.h"
35 #include "exec/gdbstub.h"
36 #include "sysemu/dma.h"
37 #include "sysemu/hw_accel.h"
38 #include "sysemu/kvm.h"
39 #include "sysemu/hax.h"
40 #include "qmp-commands.h"
41 #include "exec/exec-all.h"
42
43 #include "qemu/thread.h"
44 #include "sysemu/cpus.h"
45 #include "sysemu/qtest.h"
46 #include "qemu/main-loop.h"
47 #include "qemu/bitmap.h"
48 #include "qemu/seqlock.h"
49 #include "tcg.h"
50 #include "qapi-event.h"
51 #include "hw/nmi.h"
52 #include "sysemu/replay.h"
53
54 #ifdef CONFIG_LINUX
55
56 #include <sys/prctl.h>
57
58 #ifndef PR_MCE_KILL
59 #define PR_MCE_KILL 33
60 #endif
61
62 #ifndef PR_MCE_KILL_SET
63 #define PR_MCE_KILL_SET 1
64 #endif
65
66 #ifndef PR_MCE_KILL_EARLY
67 #define PR_MCE_KILL_EARLY 1
68 #endif
69
70 #endif /* CONFIG_LINUX */
71
/* Global 64-bit timing values. They are only defined here; nothing in the
 * visible part of this file reads or writes them.
 * NOTE(review): presumably maintained by the icount alignment code - confirm.
 */
int64_t max_delay;
int64_t max_advance;
74
/* vcpu throttling controls */
/* Timer whose callback is cpu_throttle_timer_tick(); created in
 * cpu_ticks_init().
 */
static QEMUTimer *throttle_timer;
/* Current throttle percentage, 0 when throttling is off.  Accessed with
 * atomic_set()/atomic_read().
 */
static unsigned int throttle_percentage;

/* Range that cpu_throttle_set() clamps the requested percentage to. */
#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/* Base timeslice in nanoseconds (10 ms); scales both the vCPU sleep time
 * and the period of the throttle timer.
 */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
82
83 bool cpu_is_stopped(CPUState *cpu)
84 {
85 return cpu->stopped || !runstate_is_running();
86 }
87
88 static bool cpu_thread_is_idle(CPUState *cpu)
89 {
90 if (cpu->stop || cpu->queued_work_first) {
91 return false;
92 }
93 if (cpu_is_stopped(cpu)) {
94 return true;
95 }
96 if (!cpu->halted || cpu_has_work(cpu) ||
97 kvm_halt_in_kernel()) {
98 return false;
99 }
100 return true;
101 }
102
103 static bool all_cpu_threads_idle(void)
104 {
105 CPUState *cpu;
106
107 CPU_FOREACH(cpu) {
108 if (!cpu_thread_is_idle(cpu)) {
109 return false;
110 }
111 }
112 return true;
113 }
114
115 /***********************************************************/
116 /* guest cycle counter */
117
/* Protected by TimersState seqlock */

/* Value of the icount "sleep" option as read in configure_icount();
 * defaults to true.
 */
static bool icount_sleep = true;
/* QEMU_CLOCK_VIRTUAL_RT time at which a pending clock warp started, or -1
 * when no warp is pending.
 */
static int64_t vm_clock_warp_start = -1;
/* Conversion factor from emulated instructions to virtual clock ticks. */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
#define MAX_ICOUNT_SHIFT 10

/* Timers whose callbacks are icount_adjust_rt(), icount_adjust_vm() and
 * icount_timer_cb() respectively; all created in configure_icount().
 */
static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;
130
/* Bookkeeping for the guest tick counter, the VM clock and the instruction
 * counter.  Parts of it are migrated through vmstate_timers and its icount
 * subsection.
 */
typedef struct TimersState {
    /* Protected by BQL. */
    /* Last value returned by cpu_get_ticks(), used to keep it monotonic. */
    int64_t cpu_ticks_prev;
    /* Offset added to the host tick counter by cpu_get_ticks(). */
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read out of BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    /* Offset added to get_clock() by cpu_get_clock_locked(). */
    int64_t cpu_clock_offset;
    /* Non-zero between cpu_enable_ticks() and cpu_disable_ticks(). */
    int32_t cpu_ticks_enabled;
    /* Listed in vmstate_timers but not otherwise used in the visible code.
     * NOTE(review): presumably a placeholder kept for migration stream
     * compatibility - confirm.
     */
    int64_t dummy;

    /* Compensate for varying guest execution speed. */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
} TimersState;
149
/* The single TimersState instance; registered for migration in
 * cpu_ticks_init().
 */
static TimersState timers_state;
/* Whether multi-threaded TCG is in use; assigned by qemu_tcg_configure(). */
bool mttcg_enabled;
152
153 /*
154 * We default to false if we know other options have been enabled
155 * which are currently incompatible with MTTCG. Otherwise when each
156 * guest (target) has been updated to support:
157 * - atomic instructions
158 * - memory ordering primitives (barriers)
159 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
160 *
161 * Once a guest architecture has been converted to the new primitives
162 * there are two remaining limitations to check.
163 *
164 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
165 * - The host must have a stronger memory order than the guest
166 *
167 * It may be possible in future to support strong guests on weak hosts
168 * but that will require tagging all load/stores in a guest with their
169 * implicit memory order requirements which would likely slow things
170 * down a lot.
171 */
172
/* Report whether the host memory ordering covers everything the guest
 * expects.  When either ordering macro is missing the answer is "no".
 */
static bool check_tcg_memory_orders_compatible(void)
{
    bool compatible = false;

#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
    /* Compatible when the guest needs no ordering bit the host lacks. */
    compatible = !(TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO);
#endif
    return compatible;
}
181
182 static bool default_mttcg_enabled(void)
183 {
184 if (use_icount || TCG_OVERSIZED_GUEST) {
185 return false;
186 } else {
187 #ifdef TARGET_SUPPORTS_MTTCG
188 return check_tcg_memory_orders_compatible();
189 #else
190 return false;
191 #endif
192 }
193 }
194
195 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
196 {
197 const char *t = qemu_opt_get(opts, "thread");
198 if (t) {
199 if (strcmp(t, "multi") == 0) {
200 if (TCG_OVERSIZED_GUEST) {
201 error_setg(errp, "No MTTCG when guest word size > hosts");
202 } else if (use_icount) {
203 error_setg(errp, "No MTTCG when icount is enabled");
204 } else {
205 #ifndef TARGET_SUPPORT_MTTCG
206 error_report("Guest not yet converted to MTTCG - "
207 "you may get unexpected results");
208 #endif
209 if (!check_tcg_memory_orders_compatible()) {
210 error_report("Guest expects a stronger memory ordering "
211 "than the host provides");
212 error_printf("This may cause strange/hard to debug errors");
213 }
214 mttcg_enabled = true;
215 }
216 } else if (strcmp(t, "single") == 0) {
217 mttcg_enabled = false;
218 } else {
219 error_setg(errp, "Invalid 'thread' setting %s", t);
220 }
221 } else {
222 mttcg_enabled = default_mttcg_enabled();
223 }
224 }
225
226 int64_t cpu_get_icount_raw(void)
227 {
228 int64_t icount;
229 CPUState *cpu = current_cpu;
230
231 icount = timers_state.qemu_icount;
232 if (cpu) {
233 if (!cpu->can_do_io) {
234 fprintf(stderr, "Bad icount read\n");
235 exit(1);
236 }
237 icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
238 }
239 return icount;
240 }
241
242 /* Return the virtual CPU time, based on the instruction counter. */
243 static int64_t cpu_get_icount_locked(void)
244 {
245 int64_t icount = cpu_get_icount_raw();
246 return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
247 }
248
249 int64_t cpu_get_icount(void)
250 {
251 int64_t icount;
252 unsigned start;
253
254 do {
255 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
256 icount = cpu_get_icount_locked();
257 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
258
259 return icount;
260 }
261
262 int64_t cpu_icount_to_ns(int64_t icount)
263 {
264 return icount << icount_time_shift;
265 }
266
267 /* return the time elapsed in VM between vm_start and vm_stop. Unless
268 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
269 * counter.
270 *
271 * Caller must hold the BQL
272 */
273 int64_t cpu_get_ticks(void)
274 {
275 int64_t ticks;
276
277 if (use_icount) {
278 return cpu_get_icount();
279 }
280
281 ticks = timers_state.cpu_ticks_offset;
282 if (timers_state.cpu_ticks_enabled) {
283 ticks += cpu_get_host_ticks();
284 }
285
286 if (timers_state.cpu_ticks_prev > ticks) {
287 /* Note: non increasing ticks may happen if the host uses
288 software suspend */
289 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
290 ticks = timers_state.cpu_ticks_prev;
291 }
292
293 timers_state.cpu_ticks_prev = ticks;
294 return ticks;
295 }
296
297 static int64_t cpu_get_clock_locked(void)
298 {
299 int64_t time;
300
301 time = timers_state.cpu_clock_offset;
302 if (timers_state.cpu_ticks_enabled) {
303 time += get_clock();
304 }
305
306 return time;
307 }
308
309 /* Return the monotonic time elapsed in VM, i.e.,
310 * the time between vm_start and vm_stop
311 */
312 int64_t cpu_get_clock(void)
313 {
314 int64_t ti;
315 unsigned start;
316
317 do {
318 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
319 ti = cpu_get_clock_locked();
320 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
321
322 return ti;
323 }
324
325 /* enable cpu_get_ticks()
326 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
327 */
328 void cpu_enable_ticks(void)
329 {
330 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
331 seqlock_write_begin(&timers_state.vm_clock_seqlock);
332 if (!timers_state.cpu_ticks_enabled) {
333 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
334 timers_state.cpu_clock_offset -= get_clock();
335 timers_state.cpu_ticks_enabled = 1;
336 }
337 seqlock_write_end(&timers_state.vm_clock_seqlock);
338 }
339
340 /* disable cpu_get_ticks() : the clock is stopped. You must not call
341 * cpu_get_ticks() after that.
342 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
343 */
344 void cpu_disable_ticks(void)
345 {
346 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
347 seqlock_write_begin(&timers_state.vm_clock_seqlock);
348 if (timers_state.cpu_ticks_enabled) {
349 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
350 timers_state.cpu_clock_offset = cpu_get_clock_locked();
351 timers_state.cpu_ticks_enabled = 0;
352 }
353 seqlock_write_end(&timers_state.vm_clock_seqlock);
354 }
355
/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop. */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)

/* Adapt icount_time_shift so that icount-based virtual time tracks the VM
 * clock, then recompute qemu_icount_bias so that the virtual time stays
 * continuous across the shift change.  Does nothing while the VM is not
 * running.
 */
static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex. */
    /* Delta seen on the previous call; used to detect a growing drift. */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing. */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    /* Positive: virtual (icount) time is ahead of the VM clock. */
    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead. Slow time down. */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind. Speed time up. */
        icount_time_shift++;
    }
    last_delta = delta;
    /* Choose the bias so that bias + (qemu_icount << new shift) equals the
     * virtual time computed before the shift was changed.
     */
    timers_state.qemu_icount_bias = cur_icount
                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_end(&timers_state.vm_clock_seqlock);
}
399
400 static void icount_adjust_rt(void *opaque)
401 {
402 timer_mod(icount_rt_timer,
403 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
404 icount_adjust();
405 }
406
407 static void icount_adjust_vm(void *opaque)
408 {
409 timer_mod(icount_vm_timer,
410 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
411 NANOSECONDS_PER_SECOND / 10);
412 icount_adjust();
413 }
414
415 static int64_t qemu_icount_round(int64_t count)
416 {
417 return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
418 }
419
/* Account for a pending clock warp: add the real time that elapsed since
 * vm_clock_warp_start to qemu_icount_bias (only while the VM is running),
 * clear the pending warp, and notify QEMU_CLOCK_VIRTUAL if one of its
 * timers has expired as a result.  Returns early when no warp is pending.
 */
static void icount_warp_rt(void)
{
    unsigned seq;
    int64_t warp_start;

    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    do {
        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        warp_start = vm_clock_warp_start;
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));

    /* -1 means no warp is pending. */
    if (warp_start == -1) {
        return;
    }

    seqlock_write_begin(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
                                     cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        timers_state.qemu_icount_bias += warp_delta;
    }
    /* The warp is consumed (or dropped when the VM is not running). */
    vm_clock_warp_start = -1;
    seqlock_write_end(&timers_state.vm_clock_seqlock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}
462
/* Callback of icount_warp_timer: accounts for the pending warp.  @opaque
 * is unused.
 */
static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}
470
471 void qtest_clock_warp(int64_t dest)
472 {
473 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
474 AioContext *aio_context;
475 assert(qtest_enabled());
476 aio_context = qemu_get_aio_context();
477 while (clock < dest) {
478 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
479 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
480
481 seqlock_write_begin(&timers_state.vm_clock_seqlock);
482 timers_state.qemu_icount_bias += warp;
483 seqlock_write_end(&timers_state.vm_clock_seqlock);
484
485 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
486 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
487 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
488 }
489 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
490 }
491
/* Arrange for QEMU_CLOCK_VIRTUAL to keep advancing while all vCPUs are
 * idle in icount mode.
 *
 * Depending on icount_sleep the clock is either advanced immediately to
 * the next deadline (sleep=off) or a warp is recorded and
 * icount_warp_timer armed so the time is accounted once real time has
 * passed (sleep=on).  Does nothing without icount, while the VM is
 * stopped, when the replay checkpoint says not to proceed, while any vCPU
 * is busy, or under qtest.
 */
void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
        return;
    }

    if (!all_cpu_threads_idle()) {
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount. */
        return;
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        /* Negative deadline: warn once (sleep=off only) and give up. */
        static bool notified;
        if (!icount_sleep && !notified) {
            error_report("WARNING: icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * It is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            timers_state.qemu_icount_bias += deadline;
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time, (related to the time left until the next event) has
             * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This avoids that the warps are visible externally; for example,
             * you will not be sending network packets continuously instead of
             * every 100ms.
             */
            seqlock_write_begin(&timers_state.vm_clock_seqlock);
            /* Keep the earliest start time if a warp is already pending. */
            if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
                vm_clock_warp_start = clock;
            }
            seqlock_write_end(&timers_state.vm_clock_seqlock);
            timer_mod_anticipate(icount_warp_timer, clock + deadline);
        }
    } else if (deadline == 0) {
        /* A timer is already due: just notify the clock. */
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}
574
575 static void qemu_account_warp_timer(void)
576 {
577 if (!use_icount || !icount_sleep) {
578 return;
579 }
580
581 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
582 * do not fire, so computing the deadline does not make sense.
583 */
584 if (!runstate_is_running()) {
585 return;
586 }
587
588 /* warp clock deterministically in record/replay mode */
589 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
590 return;
591 }
592
593 timer_del(icount_warp_timer);
594 icount_warp_rt();
595 }
596
597 static bool icount_state_needed(void *opaque)
598 {
599 return use_icount;
600 }
601
/*
 * This is a subsection for icount migration.
 * It carries the icount bias and the instruction counter and is only
 * included when icount_state_needed() returns true.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
616
/* Migration description of TimersState (registered in cpu_ticks_init()).
 * cpu_clock_offset is tagged as a version 2 field; the icount state lives
 * in an optional subsection.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
632
633 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
634 {
635 double pct;
636 double throttle_ratio;
637 long sleeptime_ns;
638
639 if (!cpu_throttle_get_percentage()) {
640 return;
641 }
642
643 pct = (double)cpu_throttle_get_percentage()/100;
644 throttle_ratio = pct / (1 - pct);
645 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
646
647 qemu_mutex_unlock_iothread();
648 atomic_set(&cpu->throttle_thread_scheduled, 0);
649 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
650 qemu_mutex_lock_iothread();
651 }
652
653 static void cpu_throttle_timer_tick(void *opaque)
654 {
655 CPUState *cpu;
656 double pct;
657
658 /* Stop the timer if needed */
659 if (!cpu_throttle_get_percentage()) {
660 return;
661 }
662 CPU_FOREACH(cpu) {
663 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
664 async_run_on_cpu(cpu, cpu_throttle_thread,
665 RUN_ON_CPU_NULL);
666 }
667 }
668
669 pct = (double)cpu_throttle_get_percentage()/100;
670 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
671 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
672 }
673
674 void cpu_throttle_set(int new_throttle_pct)
675 {
676 /* Ensure throttle percentage is within valid range */
677 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
678 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
679
680 atomic_set(&throttle_percentage, new_throttle_pct);
681
682 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
683 CPU_THROTTLE_TIMESLICE_NS);
684 }
685
/* Turn throttling off by setting the percentage to 0.  The timer is not
 * deleted here; cpu_throttle_timer_tick() stops re-arming itself once it
 * sees a percentage of 0.
 */
void cpu_throttle_stop(void)
{
    atomic_set(&throttle_percentage, 0);
}
690
/* Throttling is active whenever the percentage is non-zero. */
bool cpu_throttle_active(void)
{
    int pct = cpu_throttle_get_percentage();

    return pct != 0;
}
695
/* Atomically read the current throttle percentage (0 when off). */
int cpu_throttle_get_percentage(void)
{
    return atomic_read(&throttle_percentage);
}
700
/* One-time setup of the timer state: initialise the seqlock, register
 * timers_state for migration and create (but do not arm) the throttle
 * timer.
 */
void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                           cpu_throttle_timer_tick, NULL);
}
708
709 void configure_icount(QemuOpts *opts, Error **errp)
710 {
711 const char *option;
712 char *rem_str = NULL;
713
714 option = qemu_opt_get(opts, "shift");
715 if (!option) {
716 if (qemu_opt_get(opts, "align") != NULL) {
717 error_setg(errp, "Please specify shift option when using align");
718 }
719 return;
720 }
721
722 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
723 if (icount_sleep) {
724 icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
725 icount_timer_cb, NULL);
726 }
727
728 icount_align_option = qemu_opt_get_bool(opts, "align", false);
729
730 if (icount_align_option && !icount_sleep) {
731 error_setg(errp, "align=on and sleep=off are incompatible");
732 }
733 if (strcmp(option, "auto") != 0) {
734 errno = 0;
735 icount_time_shift = strtol(option, &rem_str, 0);
736 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
737 error_setg(errp, "icount: Invalid shift value");
738 }
739 use_icount = 1;
740 return;
741 } else if (icount_align_option) {
742 error_setg(errp, "shift=auto and align=on are incompatible");
743 } else if (!icount_sleep) {
744 error_setg(errp, "shift=auto and sleep=off are incompatible");
745 }
746
747 use_icount = 2;
748
749 /* 125MIPS seems a reasonable initial guess at the guest speed.
750 It will be corrected fairly quickly anyway. */
751 icount_time_shift = 3;
752
753 /* Have both realtime and virtual time triggers for speed adjustment.
754 The realtime trigger catches emulated time passing too slowly,
755 the virtual time trigger catches emulated time passing too fast.
756 Realtime triggers occur even when idle, so use them less frequently
757 than VM triggers. */
758 icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
759 icount_adjust_rt, NULL);
760 timer_mod(icount_rt_timer,
761 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
762 icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
763 icount_adjust_vm, NULL);
764 timer_mod(icount_vm_timer,
765 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
766 NANOSECONDS_PER_SECOND / 10);
767 }
768
769 /***********************************************************/
770 /* TCG vCPU kick timer
771 *
772 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force a cpu_exit() so the next vCPU can get
775 * scheduled.
776 *
777 * The timer is removed if all vCPUs are idle and restarted again once
778 * idleness is complete.
779 */
780
/* Kick timer; NULL while no timer is active (see start_tcg_kick_timer()
 * and stop_tcg_kick_timer()).
 */
static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU that qemu_cpu_kick_rr_cpu() kicks; read with atomic_mb_read().
 * NOTE(review): presumably set by the round-robin TCG thread, which is
 * outside the visible part of the file - confirm.
 */
static CPUState *tcg_current_rr_cpu;

/* Interval between two kicks: a tenth of a second, in nanoseconds. */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
785
786 static inline int64_t qemu_tcg_next_kick(void)
787 {
788 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
789 }
790
/* Kick the currently round-robin scheduled vCPU */
static void qemu_cpu_kick_rr_cpu(void)
{
    CPUState *cpu;
    /* Repeat until tcg_current_rr_cpu is the same before and after the
     * kick, so that a vCPU switch racing with the kick is not missed.
     */
    do {
        cpu = atomic_mb_read(&tcg_current_rr_cpu);
        if (cpu) {
            cpu_exit(cpu);
        }
    } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
}
802
/* Timer-list notification callback: simply wakes the main loop through
 * qemu_notify_event().  Both @opaque and @type are ignored.
 */
void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
{
    qemu_notify_event();
}
807
808 static void kick_tcg_thread(void *opaque)
809 {
810 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
811 qemu_cpu_kick_rr_cpu();
812 }
813
814 static void start_tcg_kick_timer(void)
815 {
816 if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
817 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
818 kick_tcg_thread, NULL);
819 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
820 }
821 }
822
823 static void stop_tcg_kick_timer(void)
824 {
825 if (tcg_kick_vcpu_timer) {
826 timer_del(tcg_kick_vcpu_timer);
827 tcg_kick_vcpu_timer = NULL;
828 }
829 }
830
831 /***********************************************************/
832 void hw_error(const char *fmt, ...)
833 {
834 va_list ap;
835 CPUState *cpu;
836
837 va_start(ap, fmt);
838 fprintf(stderr, "qemu: hardware error: ");
839 vfprintf(stderr, fmt, ap);
840 fprintf(stderr, "\n");
841 CPU_FOREACH(cpu) {
842 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
843 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
844 }
845 va_end(ap);
846 abort();
847 }
848
849 void cpu_synchronize_all_states(void)
850 {
851 CPUState *cpu;
852
853 CPU_FOREACH(cpu) {
854 cpu_synchronize_state(cpu);
855 }
856 }
857
858 void cpu_synchronize_all_post_reset(void)
859 {
860 CPUState *cpu;
861
862 CPU_FOREACH(cpu) {
863 cpu_synchronize_post_reset(cpu);
864 }
865 }
866
867 void cpu_synchronize_all_post_init(void)
868 {
869 CPUState *cpu;
870
871 CPU_FOREACH(cpu) {
872 cpu_synchronize_post_init(cpu);
873 }
874 }
875
876 static int do_vm_stop(RunState state)
877 {
878 int ret = 0;
879
880 if (runstate_is_running()) {
881 cpu_disable_ticks();
882 pause_all_vcpus();
883 runstate_set(state);
884 vm_state_notify(0, state);
885 qapi_event_send_stop(&error_abort);
886 }
887
888 bdrv_drain_all();
889 replay_disable_events();
890 ret = bdrv_flush_all();
891
892 return ret;
893 }
894
895 static bool cpu_can_run(CPUState *cpu)
896 {
897 if (cpu->stop) {
898 return false;
899 }
900 if (cpu_is_stopped(cpu)) {
901 return false;
902 }
903 return true;
904 }
905
/* Handle a debug exit of @cpu: tell the gdbstub which CPU stopped,
 * request a system-level debug stop and mark the vCPU as stopped.
 */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
912
913 #ifdef CONFIG_LINUX
/* Re-deliver SIGBUS with the default disposition.  Reaching the end of
 * the function means the re-raise failed, which is reported before
 * aborting; this function therefore never returns normally.
 */
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        /* Raise first, then unblock SIGBUS for this thread.
         * NOTE(review): presumably the signal is still blocked here and
         * is delivered on the unblock - confirm.
         */
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}
930
931 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
932 {
933 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
934 sigbus_reraise();
935 }
936
937 if (current_cpu) {
938 /* Called asynchronously in VCPU thread. */
939 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
940 sigbus_reraise();
941 }
942 } else {
943 /* Called synchronously (via signalfd) in main thread. */
944 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
945 sigbus_reraise();
946 }
947 }
948 }
949
950 static void qemu_init_sigbus(void)
951 {
952 struct sigaction action;
953
954 memset(&action, 0, sizeof(action));
955 action.sa_flags = SA_SIGINFO;
956 action.sa_sigaction = sigbus_handler;
957 sigaction(SIGBUS, &action, NULL);
958
959 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
960 }
961 #else /* !CONFIG_LINUX */
/* SIGBUS handling is only set up on Linux; this is the no-op variant. */
static void qemu_init_sigbus(void)
{
}
965 #endif /* !CONFIG_LINUX */
966
/* Mutex that the vCPU threads release while waiting on their halt
 * condition and that run_on_cpu() hands to do_run_on_cpu().
 */
static QemuMutex qemu_global_mutex;

/* Thread that called qemu_init_cpu_loop(). */
static QemuThread io_thread;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;
975
/* Initialise the state shared by all vCPU threads: SIGBUS handling, the
 * two condition variables and the global mutex.  The calling thread is
 * recorded as the I/O thread.
 */
void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}
985
/* Run @func with @data on @cpu.  This is a thin wrapper that passes the
 * global mutex to do_run_on_cpu().
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
990
991 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
992 {
993 if (kvm_destroy_vcpu(cpu) < 0) {
994 error_report("kvm_destroy_vcpu failed");
995 exit(EXIT_FAILURE);
996 }
997 }
998
/* TCG counterpart of qemu_kvm_destroy_vcpu(); currently there is nothing
 * to tear down, so the body is empty.
 */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1002
/* Common tail of the per-accelerator wait functions: clear the "kicked"
 * flag, turn a pending stop request into the stopped state (waking
 * everyone waiting on qemu_pause_cond) and run the work queued for @cpu.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        qemu_cond_broadcast(&qemu_pause_cond);
    }
    process_queued_cpu_work(cpu);
}
1013
1014 static bool qemu_tcg_should_sleep(CPUState *cpu)
1015 {
1016 if (mttcg_enabled) {
1017 return cpu_thread_is_idle(cpu);
1018 } else {
1019 return all_cpu_threads_idle();
1020 }
1021 }
1022
1023 static void qemu_tcg_wait_io_event(CPUState *cpu)
1024 {
1025 while (qemu_tcg_should_sleep(cpu)) {
1026 stop_tcg_kick_timer();
1027 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1028 }
1029
1030 start_tcg_kick_timer();
1031
1032 qemu_wait_io_event_common(cpu);
1033 }
1034
1035 static void qemu_kvm_wait_io_event(CPUState *cpu)
1036 {
1037 while (cpu_thread_is_idle(cpu)) {
1038 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1039 }
1040
1041 qemu_wait_io_event_common(cpu);
1042 }
1043
/* Thread function of a KVM vCPU.
 *
 * @arg: the CPUState served by this thread
 *
 * Initialises the vCPU, announces its creation, then alternates between
 * running guest code and waiting for events until the CPU is unplugged and
 * can no longer run.  Always returns NULL.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* Unplugged and no longer runnable: tear down and signal the change
     * of cpu->created on qemu_cpu_cond.
     */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    return NULL;
}
1085
1086 static void *qemu_dummy_cpu_thread_fn(void *arg)
1087 {
1088 #ifdef _WIN32
1089 fprintf(stderr, "qtest is not supported under Windows\n");
1090 exit(1);
1091 #else
1092 CPUState *cpu = arg;
1093 sigset_t waitset;
1094 int r;
1095
1096 rcu_register_thread();
1097
1098 qemu_mutex_lock_iothread();
1099 qemu_thread_get_self(cpu->thread);
1100 cpu->thread_id = qemu_get_thread_id();
1101 cpu->can_do_io = 1;
1102 current_cpu = cpu;
1103
1104 sigemptyset(&waitset);
1105 sigaddset(&waitset, SIG_IPI);
1106
1107 /* signal CPU creation */
1108 cpu->created = true;
1109 qemu_cond_signal(&qemu_cpu_cond);
1110
1111 while (1) {
1112 qemu_mutex_unlock_iothread();
1113 do {
1114 int sig;
1115 r = sigwait(&waitset, &sig);
1116 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1117 if (r == -1) {
1118 perror("sigwait");
1119 exit(1);
1120 }
1121 qemu_mutex_lock_iothread();
1122 qemu_wait_io_event_common(cpu);
1123 }
1124
1125 return NULL;
1126 #endif
1127 }
1128
1129 static int64_t tcg_get_icount_limit(void)
1130 {
1131 int64_t deadline;
1132
1133 if (replay_mode != REPLAY_MODE_PLAY) {
1134 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1135
1136 /* Maintain prior (possibly buggy) behaviour where if no deadline
1137 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1138 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1139 * nanoseconds.
1140 */
1141 if ((deadline < 0) || (deadline > INT32_MAX)) {
1142 deadline = INT32_MAX;
1143 }
1144
1145 return qemu_icount_round(deadline);
1146 } else {
1147 return replay_get_instructions();
1148 }
1149 }
1150
1151 static void handle_icount_deadline(void)
1152 {
1153 if (use_icount) {
1154 int64_t deadline =
1155 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1156
1157 if (deadline == 0) {
1158 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1159 }
1160 }
1161 }
1162
1163 static int tcg_cpu_exec(CPUState *cpu)
1164 {
1165 int ret;
1166 #ifdef CONFIG_PROFILER
1167 int64_t ti;
1168 #endif
1169
1170 #ifdef CONFIG_PROFILER
1171 ti = profile_getclock();
1172 #endif
1173 if (use_icount) {
1174 int64_t count;
1175 int decr;
1176 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1177 + cpu->icount_extra);
1178 cpu->icount_decr.u16.low = 0;
1179 cpu->icount_extra = 0;
1180 count = tcg_get_icount_limit();
1181 timers_state.qemu_icount += count;
1182 decr = (count > 0xffff) ? 0xffff : count;
1183 count -= decr;
1184 cpu->icount_decr.u16.low = decr;
1185 cpu->icount_extra = count;
1186 }
1187 qemu_mutex_unlock_iothread();
1188 cpu_exec_start(cpu);
1189 ret = cpu_exec(cpu);
1190 cpu_exec_end(cpu);
1191 qemu_mutex_lock_iothread();
1192 #ifdef CONFIG_PROFILER
1193 tcg_time += profile_getclock() - ti;
1194 #endif
1195 if (use_icount) {
1196 /* Fold pending instructions back into the
1197 instruction counter, and clear the interrupt flag. */
1198 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1199 + cpu->icount_extra);
1200 cpu->icount_decr.u32 = 0;
1201 cpu->icount_extra = 0;
1202 replay_account_executed_instructions();
1203 }
1204 return ret;
1205 }
1206
1207 /* Destroy any remaining vCPUs which have been unplugged and have
1208 * finished running
1209 */
1210 static void deal_with_unplugged_cpus(void)
1211 {
1212 CPUState *cpu;
1213
1214 CPU_FOREACH(cpu) {
1215 if (cpu->unplug && !cpu_can_run(cpu)) {
1216 qemu_tcg_destroy_vcpu(cpu);
1217 cpu->created = false;
1218 qemu_cond_signal(&qemu_cpu_cond);
1219 break;
1220 }
1221 }
1222 }
1223
1224 /* Single-threaded TCG
1225 *
1226 * In the single-threaded case each vCPU is simulated in turn. If
1227 * there is more than a single vCPU we create a simple timer to kick
1228 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1229 * This is done explicitly rather than relying on side-effects
1230 * elsewhere.
1231 */
1232
1233 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1234 {
1235 CPUState *cpu = arg;
1236
1237 rcu_register_thread();
1238
1239 qemu_mutex_lock_iothread();
1240 qemu_thread_get_self(cpu->thread);
1241
1242 CPU_FOREACH(cpu) {
1243 cpu->thread_id = qemu_get_thread_id();
1244 cpu->created = true;
1245 cpu->can_do_io = 1;
1246 }
1247 qemu_cond_signal(&qemu_cpu_cond);
1248
1249 /* wait for initial kick-off after machine start */
1250 while (first_cpu->stopped) {
1251 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1252
1253 /* process any pending work */
1254 CPU_FOREACH(cpu) {
1255 current_cpu = cpu;
1256 qemu_wait_io_event_common(cpu);
1257 }
1258 }
1259
1260 start_tcg_kick_timer();
1261
1262 cpu = first_cpu;
1263
1264 /* process any pending work */
1265 cpu->exit_request = 1;
1266
1267 while (1) {
1268 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1269 qemu_account_warp_timer();
1270
1271 if (!cpu) {
1272 cpu = first_cpu;
1273 }
1274
1275 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1276
1277 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1278 current_cpu = cpu;
1279
1280 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1281 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1282
1283 if (cpu_can_run(cpu)) {
1284 int r;
1285 r = tcg_cpu_exec(cpu);
1286 if (r == EXCP_DEBUG) {
1287 cpu_handle_guest_debug(cpu);
1288 break;
1289 } else if (r == EXCP_ATOMIC) {
1290 qemu_mutex_unlock_iothread();
1291 cpu_exec_step_atomic(cpu);
1292 qemu_mutex_lock_iothread();
1293 break;
1294 }
1295 } else if (cpu->stop) {
1296 if (cpu->unplug) {
1297 cpu = CPU_NEXT(cpu);
1298 }
1299 break;
1300 }
1301
1302 cpu = CPU_NEXT(cpu);
1303 } /* while (cpu && !cpu->exit_request).. */
1304
1305 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1306 atomic_set(&tcg_current_rr_cpu, NULL);
1307
1308 if (cpu && cpu->exit_request) {
1309 atomic_mb_set(&cpu->exit_request, 0);
1310 }
1311
1312 handle_icount_deadline();
1313
1314 qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
1315 deal_with_unplugged_cpus();
1316 }
1317
1318 return NULL;
1319 }
1320
1321 static void *qemu_hax_cpu_thread_fn(void *arg)
1322 {
1323 CPUState *cpu = arg;
1324 int r;
1325 qemu_thread_get_self(cpu->thread);
1326 qemu_mutex_lock(&qemu_global_mutex);
1327
1328 cpu->thread_id = qemu_get_thread_id();
1329 cpu->created = true;
1330 cpu->halted = 0;
1331 current_cpu = cpu;
1332
1333 hax_init_vcpu(cpu);
1334 qemu_cond_signal(&qemu_cpu_cond);
1335
1336 while (1) {
1337 if (cpu_can_run(cpu)) {
1338 r = hax_smp_cpu_exec(cpu);
1339 if (r == EXCP_DEBUG) {
1340 cpu_handle_guest_debug(cpu);
1341 }
1342 }
1343
1344 while (cpu_thread_is_idle(cpu)) {
1345 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1346 }
1347 #ifdef _WIN32
1348 SleepEx(0, TRUE);
1349 #endif
1350 qemu_wait_io_event_common(cpu);
1351 }
1352 return NULL;
1353 }
1354
#ifdef _WIN32
/*
 * Empty APC routine. qemu_cpu_kick_thread() queues it with QueueUserAPC();
 * the callback itself deliberately does nothing.
 */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
    (void)unused;
}
#endif
1360
1361 /* Multi-threaded TCG
1362 *
1363 * In the multi-threaded case each vCPU has its own thread. The TLS
1364 * variable current_cpu can be used deep in the code to find the
1365 * current CPUState for a given thread.
1366 */
1367
1368 static void *qemu_tcg_cpu_thread_fn(void *arg)
1369 {
1370 CPUState *cpu = arg;
1371
1372 rcu_register_thread();
1373
1374 qemu_mutex_lock_iothread();
1375 qemu_thread_get_self(cpu->thread);
1376
1377 cpu->thread_id = qemu_get_thread_id();
1378 cpu->created = true;
1379 cpu->can_do_io = 1;
1380 current_cpu = cpu;
1381 qemu_cond_signal(&qemu_cpu_cond);
1382
1383 /* process any pending work */
1384 cpu->exit_request = 1;
1385
1386 while (1) {
1387 if (cpu_can_run(cpu)) {
1388 int r;
1389 r = tcg_cpu_exec(cpu);
1390 switch (r) {
1391 case EXCP_DEBUG:
1392 cpu_handle_guest_debug(cpu);
1393 break;
1394 case EXCP_HALTED:
1395 /* during start-up the vCPU is reset and the thread is
1396 * kicked several times. If we don't ensure we go back
1397 * to sleep in the halted state we won't cleanly
1398 * start-up when the vCPU is enabled.
1399 *
1400 * cpu->halted should ensure we sleep in wait_io_event
1401 */
1402 g_assert(cpu->halted);
1403 break;
1404 case EXCP_ATOMIC:
1405 qemu_mutex_unlock_iothread();
1406 cpu_exec_step_atomic(cpu);
1407 qemu_mutex_lock_iothread();
1408 default:
1409 /* Ignore everything else? */
1410 break;
1411 }
1412 }
1413
1414 handle_icount_deadline();
1415
1416 atomic_mb_set(&cpu->exit_request, 0);
1417 qemu_tcg_wait_io_event(cpu);
1418 }
1419
1420 return NULL;
1421 }
1422
1423 static void qemu_cpu_kick_thread(CPUState *cpu)
1424 {
1425 #ifndef _WIN32
1426 int err;
1427
1428 if (cpu->thread_kicked) {
1429 return;
1430 }
1431 cpu->thread_kicked = true;
1432 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1433 if (err) {
1434 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1435 exit(1);
1436 }
1437 #else /* _WIN32 */
1438 if (!qemu_cpu_is_self(cpu)) {
1439 if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1440 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1441 __func__, GetLastError());
1442 exit(1);
1443 }
1444 }
1445 #endif
1446 }
1447
1448 void qemu_cpu_kick(CPUState *cpu)
1449 {
1450 qemu_cond_broadcast(cpu->halt_cond);
1451 if (tcg_enabled()) {
1452 cpu_exit(cpu);
1453 /* NOP unless doing single-thread RR */
1454 qemu_cpu_kick_rr_cpu();
1455 } else {
1456 if (hax_enabled()) {
1457 /*
1458 * FIXME: race condition with the exit_request check in
1459 * hax_vcpu_hax_exec
1460 */
1461 cpu->exit_request = 1;
1462 }
1463 qemu_cpu_kick_thread(cpu);
1464 }
1465 }
1466
1467 void qemu_cpu_kick_self(void)
1468 {
1469 assert(current_cpu);
1470 qemu_cpu_kick_thread(current_cpu);
1471 }
1472
1473 bool qemu_cpu_is_self(CPUState *cpu)
1474 {
1475 return qemu_thread_is_self(cpu->thread);
1476 }
1477
1478 bool qemu_in_vcpu_thread(void)
1479 {
1480 return current_cpu && qemu_cpu_is_self(current_cpu);
1481 }
1482
/*
 * Per-thread record of whether this thread took the global mutex through
 * qemu_mutex_lock_iothread(). Static storage, so it starts out false.
 */
static __thread bool iothread_locked;

/* Report whether the calling thread holds the iothread lock. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1489
1490 void qemu_mutex_lock_iothread(void)
1491 {
1492 g_assert(!qemu_mutex_iothread_locked());
1493 qemu_mutex_lock(&qemu_global_mutex);
1494 iothread_locked = true;
1495 }
1496
1497 void qemu_mutex_unlock_iothread(void)
1498 {
1499 g_assert(qemu_mutex_iothread_locked());
1500 iothread_locked = false;
1501 qemu_mutex_unlock(&qemu_global_mutex);
1502 }
1503
1504 static bool all_vcpus_paused(void)
1505 {
1506 CPUState *cpu;
1507
1508 CPU_FOREACH(cpu) {
1509 if (!cpu->stopped) {
1510 return false;
1511 }
1512 }
1513
1514 return true;
1515 }
1516
1517 void pause_all_vcpus(void)
1518 {
1519 CPUState *cpu;
1520
1521 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1522 CPU_FOREACH(cpu) {
1523 cpu->stop = true;
1524 qemu_cpu_kick(cpu);
1525 }
1526
1527 if (qemu_in_vcpu_thread()) {
1528 cpu_stop_current();
1529 }
1530
1531 while (!all_vcpus_paused()) {
1532 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1533 CPU_FOREACH(cpu) {
1534 qemu_cpu_kick(cpu);
1535 }
1536 }
1537 }
1538
1539 void cpu_resume(CPUState *cpu)
1540 {
1541 cpu->stop = false;
1542 cpu->stopped = false;
1543 qemu_cpu_kick(cpu);
1544 }
1545
1546 void resume_all_vcpus(void)
1547 {
1548 CPUState *cpu;
1549
1550 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1551 CPU_FOREACH(cpu) {
1552 cpu_resume(cpu);
1553 }
1554 }
1555
1556 void cpu_remove(CPUState *cpu)
1557 {
1558 cpu->stop = true;
1559 cpu->unplug = true;
1560 qemu_cpu_kick(cpu);
1561 }
1562
1563 void cpu_remove_sync(CPUState *cpu)
1564 {
1565 cpu_remove(cpu);
1566 while (cpu->created) {
1567 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1568 }
1569 }
1570
/* Size of the temporary buffers used to build vCPU thread names */
1572 #define VCPU_THREAD_NAME_SIZE 16
1573
1574 static void qemu_tcg_init_vcpu(CPUState *cpu)
1575 {
1576 char thread_name[VCPU_THREAD_NAME_SIZE];
1577 static QemuCond *single_tcg_halt_cond;
1578 static QemuThread *single_tcg_cpu_thread;
1579
1580 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1581 cpu->thread = g_malloc0(sizeof(QemuThread));
1582 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1583 qemu_cond_init(cpu->halt_cond);
1584
1585 if (qemu_tcg_mttcg_enabled()) {
1586 /* create a thread per vCPU with TCG (MTTCG) */
1587 parallel_cpus = true;
1588 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1589 cpu->cpu_index);
1590
1591 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1592 cpu, QEMU_THREAD_JOINABLE);
1593
1594 } else {
1595 /* share a single thread for all cpus with TCG */
1596 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1597 qemu_thread_create(cpu->thread, thread_name,
1598 qemu_tcg_rr_cpu_thread_fn,
1599 cpu, QEMU_THREAD_JOINABLE);
1600
1601 single_tcg_halt_cond = cpu->halt_cond;
1602 single_tcg_cpu_thread = cpu->thread;
1603 }
1604 #ifdef _WIN32
1605 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1606 #endif
1607 while (!cpu->created) {
1608 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1609 }
1610 } else {
1611 /* For non-MTTCG cases we share the thread */
1612 cpu->thread = single_tcg_cpu_thread;
1613 cpu->halt_cond = single_tcg_halt_cond;
1614 }
1615 }
1616
1617 static void qemu_hax_start_vcpu(CPUState *cpu)
1618 {
1619 char thread_name[VCPU_THREAD_NAME_SIZE];
1620
1621 cpu->thread = g_malloc0(sizeof(QemuThread));
1622 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1623 qemu_cond_init(cpu->halt_cond);
1624
1625 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1626 cpu->cpu_index);
1627 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1628 cpu, QEMU_THREAD_JOINABLE);
1629 #ifdef _WIN32
1630 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1631 #endif
1632 while (!cpu->created) {
1633 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1634 }
1635 }
1636
1637 static void qemu_kvm_start_vcpu(CPUState *cpu)
1638 {
1639 char thread_name[VCPU_THREAD_NAME_SIZE];
1640
1641 cpu->thread = g_malloc0(sizeof(QemuThread));
1642 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1643 qemu_cond_init(cpu->halt_cond);
1644 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1645 cpu->cpu_index);
1646 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1647 cpu, QEMU_THREAD_JOINABLE);
1648 while (!cpu->created) {
1649 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1650 }
1651 }
1652
1653 static void qemu_dummy_start_vcpu(CPUState *cpu)
1654 {
1655 char thread_name[VCPU_THREAD_NAME_SIZE];
1656
1657 cpu->thread = g_malloc0(sizeof(QemuThread));
1658 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1659 qemu_cond_init(cpu->halt_cond);
1660 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1661 cpu->cpu_index);
1662 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1663 QEMU_THREAD_JOINABLE);
1664 while (!cpu->created) {
1665 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1666 }
1667 }
1668
1669 void qemu_init_vcpu(CPUState *cpu)
1670 {
1671 cpu->nr_cores = smp_cores;
1672 cpu->nr_threads = smp_threads;
1673 cpu->stopped = true;
1674
1675 if (!cpu->as) {
1676 /* If the target cpu hasn't set up any address spaces itself,
1677 * give it the default one.
1678 */
1679 AddressSpace *as = address_space_init_shareable(cpu->memory,
1680 "cpu-memory");
1681 cpu->num_ases = 1;
1682 cpu_address_space_init(cpu, as, 0);
1683 }
1684
1685 if (kvm_enabled()) {
1686 qemu_kvm_start_vcpu(cpu);
1687 } else if (hax_enabled()) {
1688 qemu_hax_start_vcpu(cpu);
1689 } else if (tcg_enabled()) {
1690 qemu_tcg_init_vcpu(cpu);
1691 } else {
1692 qemu_dummy_start_vcpu(cpu);
1693 }
1694 }
1695
1696 void cpu_stop_current(void)
1697 {
1698 if (current_cpu) {
1699 current_cpu->stop = false;
1700 current_cpu->stopped = true;
1701 cpu_exit(current_cpu);
1702 qemu_cond_broadcast(&qemu_pause_cond);
1703 }
1704 }
1705
1706 int vm_stop(RunState state)
1707 {
1708 if (qemu_in_vcpu_thread()) {
1709 qemu_system_vmstop_request_prepare();
1710 qemu_system_vmstop_request(state);
1711 /*
1712 * FIXME: should not return to device code in case
1713 * vm_stop() has been requested.
1714 */
1715 cpu_stop_current();
1716 return 0;
1717 }
1718
1719 return do_vm_stop(state);
1720 }
1721
1722 /**
1723 * Prepare for (re)starting the VM.
1724 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
1725 * running or in case of an error condition), 0 otherwise.
1726 */
1727 int vm_prepare_start(void)
1728 {
1729 RunState requested;
1730 int res = 0;
1731
1732 qemu_vmstop_requested(&requested);
1733 if (runstate_is_running() && requested == RUN_STATE__MAX) {
1734 return -1;
1735 }
1736
1737 /* Ensure that a STOP/RESUME pair of events is emitted if a
1738 * vmstop request was pending. The BLOCK_IO_ERROR event, for
1739 * example, according to documentation is always followed by
1740 * the STOP event.
1741 */
1742 if (runstate_is_running()) {
1743 qapi_event_send_stop(&error_abort);
1744 res = -1;
1745 } else {
1746 replay_enable_events();
1747 cpu_enable_ticks();
1748 runstate_set(RUN_STATE_RUNNING);
1749 vm_state_notify(1, RUN_STATE_RUNNING);
1750 }
1751
1752 /* We are sending this now, but the CPUs will be resumed shortly later */
1753 qapi_event_send_resume(&error_abort);
1754 return res;
1755 }
1756
/* Start the VM: resume all vCPUs when vm_prepare_start() returns 0. */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
1763
1764 /* does a state transition even if the VM is already stopped,
1765 current state is forgotten forever */
1766 int vm_stop_force_state(RunState state)
1767 {
1768 if (runstate_is_running()) {
1769 return vm_stop(state);
1770 } else {
1771 runstate_set(state);
1772
1773 bdrv_drain_all();
1774 /* Make sure to return an error if the flush in a previous vm_stop()
1775 * failed. */
1776 return bdrv_flush_all();
1777 }
1778 }
1779
1780 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1781 {
1782 /* XXX: implement xxx_cpu_list for targets that still miss it */
1783 #if defined(cpu_list)
1784 cpu_list(f, cpu_fprintf);
1785 #endif
1786 }
1787
1788 CpuInfoList *qmp_query_cpus(Error **errp)
1789 {
1790 CpuInfoList *head = NULL, *cur_item = NULL;
1791 CPUState *cpu;
1792
1793 CPU_FOREACH(cpu) {
1794 CpuInfoList *info;
1795 #if defined(TARGET_I386)
1796 X86CPU *x86_cpu = X86_CPU(cpu);
1797 CPUX86State *env = &x86_cpu->env;
1798 #elif defined(TARGET_PPC)
1799 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1800 CPUPPCState *env = &ppc_cpu->env;
1801 #elif defined(TARGET_SPARC)
1802 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1803 CPUSPARCState *env = &sparc_cpu->env;
1804 #elif defined(TARGET_MIPS)
1805 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1806 CPUMIPSState *env = &mips_cpu->env;
1807 #elif defined(TARGET_TRICORE)
1808 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1809 CPUTriCoreState *env = &tricore_cpu->env;
1810 #endif
1811
1812 cpu_synchronize_state(cpu);
1813
1814 info = g_malloc0(sizeof(*info));
1815 info->value = g_malloc0(sizeof(*info->value));
1816 info->value->CPU = cpu->cpu_index;
1817 info->value->current = (cpu == first_cpu);
1818 info->value->halted = cpu->halted;
1819 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1820 info->value->thread_id = cpu->thread_id;
1821 #if defined(TARGET_I386)
1822 info->value->arch = CPU_INFO_ARCH_X86;
1823 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1824 #elif defined(TARGET_PPC)
1825 info->value->arch = CPU_INFO_ARCH_PPC;
1826 info->value->u.ppc.nip = env->nip;
1827 #elif defined(TARGET_SPARC)
1828 info->value->arch = CPU_INFO_ARCH_SPARC;
1829 info->value->u.q_sparc.pc = env->pc;
1830 info->value->u.q_sparc.npc = env->npc;
1831 #elif defined(TARGET_MIPS)
1832 info->value->arch = CPU_INFO_ARCH_MIPS;
1833 info->value->u.q_mips.PC = env->active_tc.PC;
1834 #elif defined(TARGET_TRICORE)
1835 info->value->arch = CPU_INFO_ARCH_TRICORE;
1836 info->value->u.tricore.PC = env->PC;
1837 #else
1838 info->value->arch = CPU_INFO_ARCH_OTHER;
1839 #endif
1840
1841 /* XXX: waiting for the qapi to support GSList */
1842 if (!cur_item) {
1843 head = cur_item = info;
1844 } else {
1845 cur_item->next = info;
1846 cur_item = info;
1847 }
1848 }
1849
1850 return head;
1851 }
1852
1853 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1854 bool has_cpu, int64_t cpu_index, Error **errp)
1855 {
1856 FILE *f;
1857 uint32_t l;
1858 CPUState *cpu;
1859 uint8_t buf[1024];
1860 int64_t orig_addr = addr, orig_size = size;
1861
1862 if (!has_cpu) {
1863 cpu_index = 0;
1864 }
1865
1866 cpu = qemu_get_cpu(cpu_index);
1867 if (cpu == NULL) {
1868 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1869 "a CPU number");
1870 return;
1871 }
1872
1873 f = fopen(filename, "wb");
1874 if (!f) {
1875 error_setg_file_open(errp, errno, filename);
1876 return;
1877 }
1878
1879 while (size != 0) {
1880 l = sizeof(buf);
1881 if (l > size)
1882 l = size;
1883 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1884 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1885 " specified", orig_addr, orig_size);
1886 goto exit;
1887 }
1888 if (fwrite(buf, 1, l, f) != l) {
1889 error_setg(errp, QERR_IO_ERROR);
1890 goto exit;
1891 }
1892 addr += l;
1893 size -= l;
1894 }
1895
1896 exit:
1897 fclose(f);
1898 }
1899
1900 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1901 Error **errp)
1902 {
1903 FILE *f;
1904 uint32_t l;
1905 uint8_t buf[1024];
1906
1907 f = fopen(filename, "wb");
1908 if (!f) {
1909 error_setg_file_open(errp, errno, filename);
1910 return;
1911 }
1912
1913 while (size != 0) {
1914 l = sizeof(buf);
1915 if (l > size)
1916 l = size;
1917 cpu_physical_memory_read(addr, buf, l);
1918 if (fwrite(buf, 1, l, f) != l) {
1919 error_setg(errp, QERR_IO_ERROR);
1920 goto exit;
1921 }
1922 addr += l;
1923 size -= l;
1924 }
1925
1926 exit:
1927 fclose(f);
1928 }
1929
1930 void qmp_inject_nmi(Error **errp)
1931 {
1932 nmi_monitor_handle(monitor_get_cpu_index(), errp);
1933 }
1934
1935 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1936 {
1937 if (!use_icount) {
1938 return;
1939 }
1940
1941 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
1942 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
1943 if (icount_align_option) {
1944 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
1945 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
1946 } else {
1947 cpu_fprintf(f, "Max guest delay NA\n");
1948 cpu_fprintf(f, "Max guest advance NA\n");
1949 }
1950 }