[mirror_qemu.git] / cpus.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
28 #include "qemu/config-file.h"
29 #include "cpu.h"
30 #include "monitor/monitor.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qemu/error-report.h"
33 #include "sysemu/sysemu.h"
34 #include "sysemu/block-backend.h"
35 #include "exec/gdbstub.h"
36 #include "sysemu/dma.h"
37 #include "sysemu/hw_accel.h"
38 #include "sysemu/kvm.h"
39 #include "sysemu/hax.h"
40 #include "qmp-commands.h"
41 #include "exec/exec-all.h"
42
43 #include "qemu/thread.h"
44 #include "sysemu/cpus.h"
45 #include "sysemu/qtest.h"
46 #include "qemu/main-loop.h"
47 #include "qemu/bitmap.h"
48 #include "qemu/seqlock.h"
49 #include "tcg.h"
50 #include "qapi-event.h"
51 #include "hw/nmi.h"
52 #include "sysemu/replay.h"
53 #include "hw/boards.h"
54
55 #ifdef CONFIG_LINUX
56
57 #include <sys/prctl.h>
58
59 #ifndef PR_MCE_KILL
60 #define PR_MCE_KILL 33
61 #endif
62
63 #ifndef PR_MCE_KILL_SET
64 #define PR_MCE_KILL_SET 1
65 #endif
66
67 #ifndef PR_MCE_KILL_EARLY
68 #define PR_MCE_KILL_EARLY 1
69 #endif
70
71 #endif /* CONFIG_LINUX */
72
73 int64_t max_delay;
74 int64_t max_advance;
75
76 /* vcpu throttling controls */
77 static QEMUTimer *throttle_timer;
78 static unsigned int throttle_percentage;
79
80 #define CPU_THROTTLE_PCT_MIN 1
81 #define CPU_THROTTLE_PCT_MAX 99
82 #define CPU_THROTTLE_TIMESLICE_NS 10000000
83
84 bool cpu_is_stopped(CPUState *cpu)
85 {
86 return cpu->stopped || !runstate_is_running();
87 }
88
89 static bool cpu_thread_is_idle(CPUState *cpu)
90 {
91 if (cpu->stop || cpu->queued_work_first) {
92 return false;
93 }
94 if (cpu_is_stopped(cpu)) {
95 return true;
96 }
97 if (!cpu->halted || cpu_has_work(cpu) ||
98 kvm_halt_in_kernel()) {
99 return false;
100 }
101 return true;
102 }
103
104 static bool all_cpu_threads_idle(void)
105 {
106 CPUState *cpu;
107
108 CPU_FOREACH(cpu) {
109 if (!cpu_thread_is_idle(cpu)) {
110 return false;
111 }
112 }
113 return true;
114 }
115
116 /***********************************************************/
117 /* guest cycle counter */
118
119 /* Protected by TimersState seqlock */
120
121 static bool icount_sleep = true;
122 static int64_t vm_clock_warp_start = -1;
123 /* Conversion factor from emulated instructions to virtual clock ticks. */
124 static int icount_time_shift;
125 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
126 #define MAX_ICOUNT_SHIFT 10
127
128 static QEMUTimer *icount_rt_timer;
129 static QEMUTimer *icount_vm_timer;
130 static QEMUTimer *icount_warp_timer;
131
132 typedef struct TimersState {
133 /* Protected by BQL. */
134 int64_t cpu_ticks_prev;
135 int64_t cpu_ticks_offset;
136
137 /* cpu_clock_offset can be read out of BQL, so protect it with
138 * this lock.
139 */
140 QemuSeqLock vm_clock_seqlock;
141 int64_t cpu_clock_offset;
142 int32_t cpu_ticks_enabled;
143 int64_t dummy;
144
145 /* Compensate for varying guest execution speed. */
146 int64_t qemu_icount_bias;
147 /* Only written by TCG thread */
148 int64_t qemu_icount;
149 } TimersState;
150
151 static TimersState timers_state;
152 bool mttcg_enabled;
153
154 /*
155 * We default to false if we know other options have been enabled
156  * which are currently incompatible with MTTCG. Otherwise, once each
157 * guest (target) has been updated to support:
158 * - atomic instructions
159 * - memory ordering primitives (barriers)
160  * it can set the appropriate CONFIG flags in ${target}-softmmu.mak
161 *
162 * Once a guest architecture has been converted to the new primitives
163 * there are two remaining limitations to check.
164 *
165 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
166 * - The host must have a stronger memory order than the guest
167 *
168 * It may be possible in future to support strong guests on weak hosts
169 * but that will require tagging all load/stores in a guest with their
170 * implicit memory order requirements which would likely slow things
171 * down a lot.
172 */
173
174 static bool check_tcg_memory_orders_compatible(void)
175 {
176 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
177 return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
178 #else
179 return false;
180 #endif
181 }
182
183 static bool default_mttcg_enabled(void)
184 {
185 if (use_icount || TCG_OVERSIZED_GUEST) {
186 return false;
187 } else {
188 #ifdef TARGET_SUPPORTS_MTTCG
189 return check_tcg_memory_orders_compatible();
190 #else
191 return false;
192 #endif
193 }
194 }
195
196 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
197 {
198 const char *t = qemu_opt_get(opts, "thread");
199 if (t) {
200 if (strcmp(t, "multi") == 0) {
201 if (TCG_OVERSIZED_GUEST) {
202                 error_setg(errp, "No MTTCG when guest word size > host's");
203 } else if (use_icount) {
204 error_setg(errp, "No MTTCG when icount is enabled");
205 } else {
206 #ifndef TARGET_SUPPORTS_MTTCG
207 error_report("Guest not yet converted to MTTCG - "
208 "you may get unexpected results");
209 #endif
210 if (!check_tcg_memory_orders_compatible()) {
211 error_report("Guest expects a stronger memory ordering "
212 "than the host provides");
213 error_printf("This may cause strange/hard to debug errors\n");
214 }
215 mttcg_enabled = true;
216 }
217 } else if (strcmp(t, "single") == 0) {
218 mttcg_enabled = false;
219 } else {
220 error_setg(errp, "Invalid 'thread' setting %s", t);
221 }
222 } else {
223 mttcg_enabled = default_mttcg_enabled();
224 }
225 }
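/*
 * A usage sketch, assuming the -accel command-line syntax of this QEMU
 * generation (illustrative, not exhaustive): the "thread" option parsed
 * above typically arrives as
 *
 *     qemu-system-x86_64 -accel tcg,thread=multi    (request MTTCG)
 *     qemu-system-x86_64 -accel tcg,thread=single   (force round-robin TCG)
 *
 * Any other value for "thread" is rejected with "Invalid 'thread' setting".
 */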
226
227 /* The current number of executed instructions is based on what we
228 * originally budgeted minus the current state of the decrementing
229 * icount counters in extra/u16.low.
230 */
231 static int64_t cpu_get_icount_executed(CPUState *cpu)
232 {
233 return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
234 }
235
236 /*
237 * Update the global shared timer_state.qemu_icount to take into
238 * account executed instructions. This is done by the TCG vCPU
239 * thread so the main-loop can see time has moved forward.
240 */
241 void cpu_update_icount(CPUState *cpu)
242 {
243 int64_t executed = cpu_get_icount_executed(cpu);
244 cpu->icount_budget -= executed;
245
246 #ifdef CONFIG_ATOMIC64
247 atomic_set__nocheck(&timers_state.qemu_icount,
248 atomic_read__nocheck(&timers_state.qemu_icount) +
249 executed);
250 #else /* FIXME: we need 64bit atomics to do this safely */
251 timers_state.qemu_icount += executed;
252 #endif
253 }
254
255 int64_t cpu_get_icount_raw(void)
256 {
257 CPUState *cpu = current_cpu;
258
259 if (cpu && cpu->running) {
260 if (!cpu->can_do_io) {
261 fprintf(stderr, "Bad icount read\n");
262 exit(1);
263 }
264 /* Take into account what has run */
265 cpu_update_icount(cpu);
266 }
267 #ifdef CONFIG_ATOMIC64
268 return atomic_read__nocheck(&timers_state.qemu_icount);
269 #else /* FIXME: we need 64bit atomics to do this safely */
270 return timers_state.qemu_icount;
271 #endif
272 }
273
274 /* Return the virtual CPU time, based on the instruction counter. */
275 static int64_t cpu_get_icount_locked(void)
276 {
277 int64_t icount = cpu_get_icount_raw();
278 return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
279 }
280
281 int64_t cpu_get_icount(void)
282 {
283 int64_t icount;
284 unsigned start;
285
286 do {
287 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
288 icount = cpu_get_icount_locked();
289 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
290
291 return icount;
292 }
293
294 int64_t cpu_icount_to_ns(int64_t icount)
295 {
296 return icount << icount_time_shift;
297 }
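/*
 * Worked example (assuming icount_time_shift == 3, the initial value set
 * in configure_icount() for shift=auto): each emulated instruction
 * advances the virtual clock by 1 << 3 = 8 ns, i.e. roughly 125 million
 * instructions per emulated second, matching the 125MIPS guess below.
 */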
298
299 /* return the time elapsed in VM between vm_start and vm_stop. Unless
300 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
301 * counter.
302 *
303 * Caller must hold the BQL
304 */
305 int64_t cpu_get_ticks(void)
306 {
307 int64_t ticks;
308
309 if (use_icount) {
310 return cpu_get_icount();
311 }
312
313 ticks = timers_state.cpu_ticks_offset;
314 if (timers_state.cpu_ticks_enabled) {
315 ticks += cpu_get_host_ticks();
316 }
317
318 if (timers_state.cpu_ticks_prev > ticks) {
319     /* Note: non-increasing ticks may happen if the host uses
320 software suspend */
321 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
322 ticks = timers_state.cpu_ticks_prev;
323 }
324
325 timers_state.cpu_ticks_prev = ticks;
326 return ticks;
327 }
328
329 static int64_t cpu_get_clock_locked(void)
330 {
331 int64_t time;
332
333 time = timers_state.cpu_clock_offset;
334 if (timers_state.cpu_ticks_enabled) {
335 time += get_clock();
336 }
337
338 return time;
339 }
340
341 /* Return the monotonic time elapsed in VM, i.e.,
342 * the time between vm_start and vm_stop
343 */
344 int64_t cpu_get_clock(void)
345 {
346 int64_t ti;
347 unsigned start;
348
349 do {
350 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
351 ti = cpu_get_clock_locked();
352 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
353
354 return ti;
355 }
356
357 /* enable cpu_get_ticks()
358 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
359 */
360 void cpu_enable_ticks(void)
361 {
362     /* Here, the real thing protected by the seqlock is cpu_clock_offset. */
363 seqlock_write_begin(&timers_state.vm_clock_seqlock);
364 if (!timers_state.cpu_ticks_enabled) {
365 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
366 timers_state.cpu_clock_offset -= get_clock();
367 timers_state.cpu_ticks_enabled = 1;
368 }
369 seqlock_write_end(&timers_state.vm_clock_seqlock);
370 }
371
372 /* disable cpu_get_ticks(): the clock is stopped. You must not call
373 * cpu_get_ticks() after that.
374 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
375 */
376 void cpu_disable_ticks(void)
377 {
378     /* Here, the real thing protected by the seqlock is cpu_clock_offset. */
379 seqlock_write_begin(&timers_state.vm_clock_seqlock);
380 if (timers_state.cpu_ticks_enabled) {
381 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
382 timers_state.cpu_clock_offset = cpu_get_clock_locked();
383 timers_state.cpu_ticks_enabled = 0;
384 }
385 seqlock_write_end(&timers_state.vm_clock_seqlock);
386 }
387
388 /* Correlation between real and virtual time is always going to be
389 fairly approximate, so ignore small variation.
390 When the guest is idle real and virtual time will be aligned in
391 the IO wait loop. */
392 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
393
394 static void icount_adjust(void)
395 {
396 int64_t cur_time;
397 int64_t cur_icount;
398 int64_t delta;
399
400 /* Protected by TimersState mutex. */
401 static int64_t last_delta;
402
403 /* If the VM is not running, then do nothing. */
404 if (!runstate_is_running()) {
405 return;
406 }
407
408 seqlock_write_begin(&timers_state.vm_clock_seqlock);
409 cur_time = cpu_get_clock_locked();
410 cur_icount = cpu_get_icount_locked();
411
412 delta = cur_icount - cur_time;
413 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
414 if (delta > 0
415 && last_delta + ICOUNT_WOBBLE < delta * 2
416 && icount_time_shift > 0) {
417 /* The guest is getting too far ahead. Slow time down. */
418 icount_time_shift--;
419 }
420 if (delta < 0
421 && last_delta - ICOUNT_WOBBLE > delta * 2
422 && icount_time_shift < MAX_ICOUNT_SHIFT) {
423 /* The guest is getting too far behind. Speed time up. */
424 icount_time_shift++;
425 }
426 last_delta = delta;
427 timers_state.qemu_icount_bias = cur_icount
428 - (timers_state.qemu_icount << icount_time_shift);
429 seqlock_write_end(&timers_state.vm_clock_seqlock);
430 }
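/*
 * Worked example of the adjustment above: if the virtual clock
 * (cur_icount) keeps running ahead of real time by more than
 * ICOUNT_WOBBLE, icount_time_shift drops (say from 3 to 2), so each
 * instruction now advances virtual time by 4 ns instead of 8 ns and the
 * guest clock slows relative to the host; qemu_icount_bias is then
 * recomputed so the virtual clock stays continuous across the change.
 */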
431
432 static void icount_adjust_rt(void *opaque)
433 {
434 timer_mod(icount_rt_timer,
435 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
436 icount_adjust();
437 }
438
439 static void icount_adjust_vm(void *opaque)
440 {
441 timer_mod(icount_vm_timer,
442 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
443 NANOSECONDS_PER_SECOND / 10);
444 icount_adjust();
445 }
446
447 static int64_t qemu_icount_round(int64_t count)
448 {
449 return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
450 }
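/*
 * Worked example (again assuming icount_time_shift == 3): a 20 ns
 * deadline rounds up to (20 + 7) >> 3 = 3 instructions, so the budget
 * never stops short of the next QEMU_CLOCK_VIRTUAL deadline.
 */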
451
452 static void icount_warp_rt(void)
453 {
454 unsigned seq;
455 int64_t warp_start;
456
457 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
458 * changes from -1 to another value, so the race here is okay.
459 */
460 do {
461 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
462 warp_start = vm_clock_warp_start;
463 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
464
465 if (warp_start == -1) {
466 return;
467 }
468
469 seqlock_write_begin(&timers_state.vm_clock_seqlock);
470 if (runstate_is_running()) {
471 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
472 cpu_get_clock_locked());
473 int64_t warp_delta;
474
475 warp_delta = clock - vm_clock_warp_start;
476 if (use_icount == 2) {
477 /*
478 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
479 * far ahead of real time.
480 */
481 int64_t cur_icount = cpu_get_icount_locked();
482 int64_t delta = clock - cur_icount;
483 warp_delta = MIN(warp_delta, delta);
484 }
485 timers_state.qemu_icount_bias += warp_delta;
486 }
487 vm_clock_warp_start = -1;
488 seqlock_write_end(&timers_state.vm_clock_seqlock);
489
490 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
491 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
492 }
493 }
494
495 static void icount_timer_cb(void *opaque)
496 {
497 /* No need for a checkpoint because the timer already synchronizes
498 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
499 */
500 icount_warp_rt();
501 }
502
503 void qtest_clock_warp(int64_t dest)
504 {
505 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
506 AioContext *aio_context;
507 assert(qtest_enabled());
508 aio_context = qemu_get_aio_context();
509 while (clock < dest) {
510 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
511 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
512
513 seqlock_write_begin(&timers_state.vm_clock_seqlock);
514 timers_state.qemu_icount_bias += warp;
515 seqlock_write_end(&timers_state.vm_clock_seqlock);
516
517 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
518 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
519 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
520 }
521 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
522 }
523
524 void qemu_start_warp_timer(void)
525 {
526 int64_t clock;
527 int64_t deadline;
528
529 if (!use_icount) {
530 return;
531 }
532
533 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
534 * do not fire, so computing the deadline does not make sense.
535 */
536 if (!runstate_is_running()) {
537 return;
538 }
539
540 /* warp clock deterministically in record/replay mode */
541 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
542 return;
543 }
544
545 if (!all_cpu_threads_idle()) {
546 return;
547 }
548
549 if (qtest_enabled()) {
550 /* When testing, qtest commands advance icount. */
551 return;
552 }
553
554 /* We want to use the earliest deadline from ALL vm_clocks */
555 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
556 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
557 if (deadline < 0) {
558 static bool notified;
559 if (!icount_sleep && !notified) {
560 error_report("WARNING: icount sleep disabled and no active timers");
561 notified = true;
562 }
563 return;
564 }
565
566 if (deadline > 0) {
567 /*
568 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
569 * sleep. Otherwise, the CPU might be waiting for a future timer
570 * interrupt to wake it up, but the interrupt never comes because
571 * the vCPU isn't running any insns and thus doesn't advance the
572 * QEMU_CLOCK_VIRTUAL.
573 */
574 if (!icount_sleep) {
575 /*
576              * We never let VCPUs sleep in no-sleep icount mode.
577 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
578 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
579 * It is useful when we want a deterministic execution time,
580 * isolated from host latencies.
581 */
582 seqlock_write_begin(&timers_state.vm_clock_seqlock);
583 timers_state.qemu_icount_bias += deadline;
584 seqlock_write_end(&timers_state.vm_clock_seqlock);
585 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
586 } else {
587 /*
588 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
589              * "real" time (related to the time left until the next event) has
590 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
591              * This keeps the warps from being visible externally; for example,
592 * you will not be sending network packets continuously instead of
593 * every 100ms.
594 */
595 seqlock_write_begin(&timers_state.vm_clock_seqlock);
596 if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
597 vm_clock_warp_start = clock;
598 }
599 seqlock_write_end(&timers_state.vm_clock_seqlock);
600 timer_mod_anticipate(icount_warp_timer, clock + deadline);
601 }
602 } else if (deadline == 0) {
603 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
604 }
605 }
606
607 static void qemu_account_warp_timer(void)
608 {
609 if (!use_icount || !icount_sleep) {
610 return;
611 }
612
613 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
614 * do not fire, so computing the deadline does not make sense.
615 */
616 if (!runstate_is_running()) {
617 return;
618 }
619
620 /* warp clock deterministically in record/replay mode */
621 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
622 return;
623 }
624
625 timer_del(icount_warp_timer);
626 icount_warp_rt();
627 }
628
629 static bool icount_state_needed(void *opaque)
630 {
631 return use_icount;
632 }
633
634 /*
635 * This is a subsection for icount migration.
636 */
637 static const VMStateDescription icount_vmstate_timers = {
638 .name = "timer/icount",
639 .version_id = 1,
640 .minimum_version_id = 1,
641 .needed = icount_state_needed,
642 .fields = (VMStateField[]) {
643 VMSTATE_INT64(qemu_icount_bias, TimersState),
644 VMSTATE_INT64(qemu_icount, TimersState),
645 VMSTATE_END_OF_LIST()
646 }
647 };
648
649 static const VMStateDescription vmstate_timers = {
650 .name = "timer",
651 .version_id = 2,
652 .minimum_version_id = 1,
653 .fields = (VMStateField[]) {
654 VMSTATE_INT64(cpu_ticks_offset, TimersState),
655 VMSTATE_INT64(dummy, TimersState),
656 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
657 VMSTATE_END_OF_LIST()
658 },
659 .subsections = (const VMStateDescription*[]) {
660 &icount_vmstate_timers,
661 NULL
662 }
663 };
664
665 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
666 {
667 double pct;
668 double throttle_ratio;
669 long sleeptime_ns;
670
671 if (!cpu_throttle_get_percentage()) {
672 return;
673 }
674
675 pct = (double)cpu_throttle_get_percentage()/100;
676 throttle_ratio = pct / (1 - pct);
677 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
678
679 qemu_mutex_unlock_iothread();
680 atomic_set(&cpu->throttle_thread_scheduled, 0);
681 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
682 qemu_mutex_lock_iothread();
683 }
684
685 static void cpu_throttle_timer_tick(void *opaque)
686 {
687 CPUState *cpu;
688 double pct;
689
690 /* Stop the timer if needed */
691 if (!cpu_throttle_get_percentage()) {
692 return;
693 }
694 CPU_FOREACH(cpu) {
695 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
696 async_run_on_cpu(cpu, cpu_throttle_thread,
697 RUN_ON_CPU_NULL);
698 }
699 }
700
701 pct = (double)cpu_throttle_get_percentage()/100;
702 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
703 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
704 }
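/*
 * Worked example of the throttling arithmetic, assuming
 * cpu_throttle_set(50): pct = 0.5, throttle_ratio = 0.5 / (1 - 0.5) = 1,
 * so on each tick every vCPU sleeps one 10 ms timeslice
 * (CPU_THROTTLE_TIMESLICE_NS), and the timer re-arms 10 ms / (1 - 0.5) =
 * 20 ms later; the vCPUs therefore spend about half of wall-clock time
 * asleep, which is the requested 50% throttle.
 */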
705
706 void cpu_throttle_set(int new_throttle_pct)
707 {
708 /* Ensure throttle percentage is within valid range */
709 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
710 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
711
712 atomic_set(&throttle_percentage, new_throttle_pct);
713
714 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
715 CPU_THROTTLE_TIMESLICE_NS);
716 }
717
718 void cpu_throttle_stop(void)
719 {
720 atomic_set(&throttle_percentage, 0);
721 }
722
723 bool cpu_throttle_active(void)
724 {
725 return (cpu_throttle_get_percentage() != 0);
726 }
727
728 int cpu_throttle_get_percentage(void)
729 {
730 return atomic_read(&throttle_percentage);
731 }
732
733 void cpu_ticks_init(void)
734 {
735 seqlock_init(&timers_state.vm_clock_seqlock);
736 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
737 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
738 cpu_throttle_timer_tick, NULL);
739 }
740
741 void configure_icount(QemuOpts *opts, Error **errp)
742 {
743 const char *option;
744 char *rem_str = NULL;
745
746 option = qemu_opt_get(opts, "shift");
747 if (!option) {
748 if (qemu_opt_get(opts, "align") != NULL) {
749 error_setg(errp, "Please specify shift option when using align");
750 }
751 return;
752 }
753
754 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
755 if (icount_sleep) {
756 icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
757 icount_timer_cb, NULL);
758 }
759
760 icount_align_option = qemu_opt_get_bool(opts, "align", false);
761
762 if (icount_align_option && !icount_sleep) {
763 error_setg(errp, "align=on and sleep=off are incompatible");
764 }
765 if (strcmp(option, "auto") != 0) {
766 errno = 0;
767 icount_time_shift = strtol(option, &rem_str, 0);
768 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
769 error_setg(errp, "icount: Invalid shift value");
770 }
771 use_icount = 1;
772 return;
773 } else if (icount_align_option) {
774 error_setg(errp, "shift=auto and align=on are incompatible");
775 } else if (!icount_sleep) {
776 error_setg(errp, "shift=auto and sleep=off are incompatible");
777 }
778
779 use_icount = 2;
780
781 /* 125MIPS seems a reasonable initial guess at the guest speed.
782 It will be corrected fairly quickly anyway. */
783 icount_time_shift = 3;
784
785 /* Have both realtime and virtual time triggers for speed adjustment.
786 The realtime trigger catches emulated time passing too slowly,
787 the virtual time trigger catches emulated time passing too fast.
788 Realtime triggers occur even when idle, so use them less frequently
789 than VM triggers. */
790 icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
791 icount_adjust_rt, NULL);
792 timer_mod(icount_rt_timer,
793 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
794 icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
795 icount_adjust_vm, NULL);
796 timer_mod(icount_vm_timer,
797 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
798 NANOSECONDS_PER_SECOND / 10);
799 }
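/*
 * A usage sketch for the options parsed above (assuming the -icount
 * command-line syntax this function serves):
 *
 *     -icount shift=7              fixed: 2^7 ns of virtual time per insn
 *     -icount shift=7,align=on     fixed shift plus host/guest drift limits
 *     -icount shift=auto           adaptive shift (use_icount == 2)
 *
 * "align" without "shift" is rejected, and neither align=on nor sleep=off
 * can be combined with shift=auto (nor align=on with sleep=off), as
 * checked above.
 */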
800
801 /***********************************************************/
802 /* TCG vCPU kick timer
803 *
804 * The kick timer is responsible for moving single threaded vCPU
805 * emulation on to the next vCPU. If more than one vCPU is running a
806  * timer event will force a cpu->exit so the next vCPU can get
807 * scheduled.
808 *
809  * The timer is removed while all vCPUs are idle and restarted once
810  * a vCPU becomes runnable again.
811 */
812
813 static QEMUTimer *tcg_kick_vcpu_timer;
814 static CPUState *tcg_current_rr_cpu;
815
816 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
817
818 static inline int64_t qemu_tcg_next_kick(void)
819 {
820 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
821 }
822
823 /* Kick the currently round-robin scheduled vCPU */
824 static void qemu_cpu_kick_rr_cpu(void)
825 {
826 CPUState *cpu;
827 do {
828 cpu = atomic_mb_read(&tcg_current_rr_cpu);
829 if (cpu) {
830 cpu_exit(cpu);
831 }
832 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
833 }
834
835 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
836 {
837 }
838
839 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
840 {
841 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
842 qemu_notify_event();
843 return;
844 }
845
846 if (!qemu_in_vcpu_thread() && first_cpu) {
847 /* qemu_cpu_kick is not enough to kick a halted CPU out of
848 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
849 * causes cpu_thread_is_idle to return false. This way,
850 * handle_icount_deadline can run.
851 */
852 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
853 }
854 }
855
856 static void kick_tcg_thread(void *opaque)
857 {
858 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
859 qemu_cpu_kick_rr_cpu();
860 }
861
862 static void start_tcg_kick_timer(void)
863 {
864 if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
865 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
866 kick_tcg_thread, NULL);
867 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
868 }
869 }
870
871 static void stop_tcg_kick_timer(void)
872 {
873 if (tcg_kick_vcpu_timer) {
874 timer_del(tcg_kick_vcpu_timer);
875 tcg_kick_vcpu_timer = NULL;
876 }
877 }
878
879 /***********************************************************/
880 void hw_error(const char *fmt, ...)
881 {
882 va_list ap;
883 CPUState *cpu;
884
885 va_start(ap, fmt);
886 fprintf(stderr, "qemu: hardware error: ");
887 vfprintf(stderr, fmt, ap);
888 fprintf(stderr, "\n");
889 CPU_FOREACH(cpu) {
890 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
891 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
892 }
893 va_end(ap);
894 abort();
895 }
896
897 void cpu_synchronize_all_states(void)
898 {
899 CPUState *cpu;
900
901 CPU_FOREACH(cpu) {
902 cpu_synchronize_state(cpu);
903 }
904 }
905
906 void cpu_synchronize_all_post_reset(void)
907 {
908 CPUState *cpu;
909
910 CPU_FOREACH(cpu) {
911 cpu_synchronize_post_reset(cpu);
912 }
913 }
914
915 void cpu_synchronize_all_post_init(void)
916 {
917 CPUState *cpu;
918
919 CPU_FOREACH(cpu) {
920 cpu_synchronize_post_init(cpu);
921 }
922 }
923
924 static int do_vm_stop(RunState state)
925 {
926 int ret = 0;
927
928 if (runstate_is_running()) {
929 cpu_disable_ticks();
930 pause_all_vcpus();
931 runstate_set(state);
932 vm_state_notify(0, state);
933 qapi_event_send_stop(&error_abort);
934 }
935
936 bdrv_drain_all();
937 replay_disable_events();
938 ret = bdrv_flush_all();
939
940 return ret;
941 }
942
943 static bool cpu_can_run(CPUState *cpu)
944 {
945 if (cpu->stop) {
946 return false;
947 }
948 if (cpu_is_stopped(cpu)) {
949 return false;
950 }
951 return true;
952 }
953
954 static void cpu_handle_guest_debug(CPUState *cpu)
955 {
956 gdb_set_stop_cpu(cpu);
957 qemu_system_debug_request();
958 cpu->stopped = true;
959 }
960
961 #ifdef CONFIG_LINUX
962 static void sigbus_reraise(void)
963 {
964 sigset_t set;
965 struct sigaction action;
966
967 memset(&action, 0, sizeof(action));
968 action.sa_handler = SIG_DFL;
969 if (!sigaction(SIGBUS, &action, NULL)) {
970 raise(SIGBUS);
971 sigemptyset(&set);
972 sigaddset(&set, SIGBUS);
973 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
974 }
975 perror("Failed to re-raise SIGBUS!\n");
976 abort();
977 }
978
979 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
980 {
981 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
982 sigbus_reraise();
983 }
984
985 if (current_cpu) {
986 /* Called asynchronously in VCPU thread. */
987 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
988 sigbus_reraise();
989 }
990 } else {
991 /* Called synchronously (via signalfd) in main thread. */
992 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
993 sigbus_reraise();
994 }
995 }
996 }
997
998 static void qemu_init_sigbus(void)
999 {
1000 struct sigaction action;
1001
1002 memset(&action, 0, sizeof(action));
1003 action.sa_flags = SA_SIGINFO;
1004 action.sa_sigaction = sigbus_handler;
1005 sigaction(SIGBUS, &action, NULL);
1006
1007 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1008 }
1009 #else /* !CONFIG_LINUX */
1010 static void qemu_init_sigbus(void)
1011 {
1012 }
1013 #endif /* !CONFIG_LINUX */
1014
1015 static QemuMutex qemu_global_mutex;
1016
1017 static QemuThread io_thread;
1018
1019 /* cpu creation */
1020 static QemuCond qemu_cpu_cond;
1021 /* system init */
1022 static QemuCond qemu_pause_cond;
1023
1024 void qemu_init_cpu_loop(void)
1025 {
1026 qemu_init_sigbus();
1027 qemu_cond_init(&qemu_cpu_cond);
1028 qemu_cond_init(&qemu_pause_cond);
1029 qemu_mutex_init(&qemu_global_mutex);
1030
1031 qemu_thread_get_self(&io_thread);
1032 }
1033
1034 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1035 {
1036 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1037 }
1038
1039 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1040 {
1041 if (kvm_destroy_vcpu(cpu) < 0) {
1042 error_report("kvm_destroy_vcpu failed");
1043 exit(EXIT_FAILURE);
1044 }
1045 }
1046
1047 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1048 {
1049 }
1050
1051 static void qemu_wait_io_event_common(CPUState *cpu)
1052 {
1053 atomic_mb_set(&cpu->thread_kicked, false);
1054 if (cpu->stop) {
1055 cpu->stop = false;
1056 cpu->stopped = true;
1057 qemu_cond_broadcast(&qemu_pause_cond);
1058 }
1059 process_queued_cpu_work(cpu);
1060 }
1061
1062 static bool qemu_tcg_should_sleep(CPUState *cpu)
1063 {
1064 if (mttcg_enabled) {
1065 return cpu_thread_is_idle(cpu);
1066 } else {
1067 return all_cpu_threads_idle();
1068 }
1069 }
1070
1071 static void qemu_tcg_wait_io_event(CPUState *cpu)
1072 {
1073 while (qemu_tcg_should_sleep(cpu)) {
1074 stop_tcg_kick_timer();
1075 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1076 }
1077
1078 start_tcg_kick_timer();
1079
1080 qemu_wait_io_event_common(cpu);
1081 }
1082
1083 static void qemu_kvm_wait_io_event(CPUState *cpu)
1084 {
1085 while (cpu_thread_is_idle(cpu)) {
1086 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1087 }
1088
1089 qemu_wait_io_event_common(cpu);
1090 }
1091
1092 static void *qemu_kvm_cpu_thread_fn(void *arg)
1093 {
1094 CPUState *cpu = arg;
1095 int r;
1096
1097 rcu_register_thread();
1098
1099 qemu_mutex_lock_iothread();
1100 qemu_thread_get_self(cpu->thread);
1101 cpu->thread_id = qemu_get_thread_id();
1102 cpu->can_do_io = 1;
1103 current_cpu = cpu;
1104
1105 r = kvm_init_vcpu(cpu);
1106 if (r < 0) {
1107 fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
1108 exit(1);
1109 }
1110
1111 kvm_init_cpu_signals(cpu);
1112
1113 /* signal CPU creation */
1114 cpu->created = true;
1115 qemu_cond_signal(&qemu_cpu_cond);
1116
1117 do {
1118 if (cpu_can_run(cpu)) {
1119 r = kvm_cpu_exec(cpu);
1120 if (r == EXCP_DEBUG) {
1121 cpu_handle_guest_debug(cpu);
1122 }
1123 }
1124 qemu_kvm_wait_io_event(cpu);
1125 } while (!cpu->unplug || cpu_can_run(cpu));
1126
1127 qemu_kvm_destroy_vcpu(cpu);
1128 cpu->created = false;
1129 qemu_cond_signal(&qemu_cpu_cond);
1130 qemu_mutex_unlock_iothread();
1131 return NULL;
1132 }
1133
1134 static void *qemu_dummy_cpu_thread_fn(void *arg)
1135 {
1136 #ifdef _WIN32
1137 fprintf(stderr, "qtest is not supported under Windows\n");
1138 exit(1);
1139 #else
1140 CPUState *cpu = arg;
1141 sigset_t waitset;
1142 int r;
1143
1144 rcu_register_thread();
1145
1146 qemu_mutex_lock_iothread();
1147 qemu_thread_get_self(cpu->thread);
1148 cpu->thread_id = qemu_get_thread_id();
1149 cpu->can_do_io = 1;
1150 current_cpu = cpu;
1151
1152 sigemptyset(&waitset);
1153 sigaddset(&waitset, SIG_IPI);
1154
1155 /* signal CPU creation */
1156 cpu->created = true;
1157 qemu_cond_signal(&qemu_cpu_cond);
1158
1159 while (1) {
1160 qemu_mutex_unlock_iothread();
1161 do {
1162 int sig;
1163 r = sigwait(&waitset, &sig);
1164 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1165 if (r == -1) {
1166 perror("sigwait");
1167 exit(1);
1168 }
1169 qemu_mutex_lock_iothread();
1170 qemu_wait_io_event_common(cpu);
1171 }
1172
1173 return NULL;
1174 #endif
1175 }
1176
1177 static int64_t tcg_get_icount_limit(void)
1178 {
1179 int64_t deadline;
1180
1181 if (replay_mode != REPLAY_MODE_PLAY) {
1182 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1183
1184 /* Maintain prior (possibly buggy) behaviour where if no deadline
1185 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1186 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1187 * nanoseconds.
1188 */
1189 if ((deadline < 0) || (deadline > INT32_MAX)) {
1190 deadline = INT32_MAX;
1191 }
1192
1193 return qemu_icount_round(deadline);
1194 } else {
1195 return replay_get_instructions();
1196 }
1197 }
1198
1199 static void handle_icount_deadline(void)
1200 {
1201 assert(qemu_in_vcpu_thread());
1202 if (use_icount) {
1203 int64_t deadline =
1204 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1205
1206 if (deadline == 0) {
1207 /* Wake up other AioContexts. */
1208 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1209 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1210 }
1211 }
1212 }
1213
1214 static void prepare_icount_for_run(CPUState *cpu)
1215 {
1216 if (use_icount) {
1217 int insns_left;
1218
1219 /* These should always be cleared by process_icount_data after
1220          * each vCPU execution. However, u16.high can be raised
1221 * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1222 */
1223 g_assert(cpu->icount_decr.u16.low == 0);
1224 g_assert(cpu->icount_extra == 0);
1225
1226 cpu->icount_budget = tcg_get_icount_limit();
1227 insns_left = MIN(0xffff, cpu->icount_budget);
1228 cpu->icount_decr.u16.low = insns_left;
1229 cpu->icount_extra = cpu->icount_budget - insns_left;
1230 }
1231 }
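/*
 * Worked example: with an icount_budget of 100000 instructions, the
 * 16-bit decrementer receives MIN(0xffff, 100000) = 65535 and
 * icount_extra holds the remaining 34465, which the execution loop feeds
 * back into the decrementer as it drains.
 */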
1232
1233 static void process_icount_data(CPUState *cpu)
1234 {
1235 if (use_icount) {
1236 /* Account for executed instructions */
1237 cpu_update_icount(cpu);
1238
1239 /* Reset the counters */
1240 cpu->icount_decr.u16.low = 0;
1241 cpu->icount_extra = 0;
1242 cpu->icount_budget = 0;
1243
1244 replay_account_executed_instructions();
1245 }
1246 }
1247
1248
1249 static int tcg_cpu_exec(CPUState *cpu)
1250 {
1251 int ret;
1252 #ifdef CONFIG_PROFILER
1253 int64_t ti;
1254 #endif
1255
1256 #ifdef CONFIG_PROFILER
1257 ti = profile_getclock();
1258 #endif
1259 qemu_mutex_unlock_iothread();
1260 cpu_exec_start(cpu);
1261 ret = cpu_exec(cpu);
1262 cpu_exec_end(cpu);
1263 qemu_mutex_lock_iothread();
1264 #ifdef CONFIG_PROFILER
1265 tcg_time += profile_getclock() - ti;
1266 #endif
1267 return ret;
1268 }
1269
1270 /* Destroy any remaining vCPUs which have been unplugged and have
1271 * finished running
1272 */
1273 static void deal_with_unplugged_cpus(void)
1274 {
1275 CPUState *cpu;
1276
1277 CPU_FOREACH(cpu) {
1278 if (cpu->unplug && !cpu_can_run(cpu)) {
1279 qemu_tcg_destroy_vcpu(cpu);
1280 cpu->created = false;
1281 qemu_cond_signal(&qemu_cpu_cond);
1282 break;
1283 }
1284 }
1285 }
1286
1287 /* Single-threaded TCG
1288 *
1289 * In the single-threaded case each vCPU is simulated in turn. If
1290 * there is more than a single vCPU we create a simple timer to kick
1291 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1292 * This is done explicitly rather than relying on side-effects
1293 * elsewhere.
1294 */
1295
1296 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1297 {
1298 CPUState *cpu = arg;
1299
1300 rcu_register_thread();
1301
1302 qemu_mutex_lock_iothread();
1303 qemu_thread_get_self(cpu->thread);
1304
1305 CPU_FOREACH(cpu) {
1306 cpu->thread_id = qemu_get_thread_id();
1307 cpu->created = true;
1308 cpu->can_do_io = 1;
1309 }
1310 qemu_cond_signal(&qemu_cpu_cond);
1311
1312 /* wait for initial kick-off after machine start */
1313 while (first_cpu->stopped) {
1314 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1315
1316 /* process any pending work */
1317 CPU_FOREACH(cpu) {
1318 current_cpu = cpu;
1319 qemu_wait_io_event_common(cpu);
1320 }
1321 }
1322
1323 start_tcg_kick_timer();
1324
1325 cpu = first_cpu;
1326
1327 /* process any pending work */
1328 cpu->exit_request = 1;
1329
1330 while (1) {
1331 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1332 qemu_account_warp_timer();
1333
1334 /* Run the timers here. This is much more efficient than
1335 * waking up the I/O thread and waiting for completion.
1336 */
1337 handle_icount_deadline();
1338
1339 if (!cpu) {
1340 cpu = first_cpu;
1341 }
1342
1343 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1344
1345 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1346 current_cpu = cpu;
1347
1348 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1349 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1350
1351 if (cpu_can_run(cpu)) {
1352 int r;
1353
1354 prepare_icount_for_run(cpu);
1355
1356 r = tcg_cpu_exec(cpu);
1357
1358 process_icount_data(cpu);
1359
1360 if (r == EXCP_DEBUG) {
1361 cpu_handle_guest_debug(cpu);
1362 break;
1363 } else if (r == EXCP_ATOMIC) {
1364 qemu_mutex_unlock_iothread();
1365 cpu_exec_step_atomic(cpu);
1366 qemu_mutex_lock_iothread();
1367 break;
1368 }
1369 } else if (cpu->stop) {
1370 if (cpu->unplug) {
1371 cpu = CPU_NEXT(cpu);
1372 }
1373 break;
1374 }
1375
1376 cpu = CPU_NEXT(cpu);
1377 } /* while (cpu && !cpu->exit_request).. */
1378
1379 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1380 atomic_set(&tcg_current_rr_cpu, NULL);
1381
1382 if (cpu && cpu->exit_request) {
1383 atomic_mb_set(&cpu->exit_request, 0);
1384 }
1385
1386 qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
1387 deal_with_unplugged_cpus();
1388 }
1389
1390 return NULL;
1391 }
1392
1393 static void *qemu_hax_cpu_thread_fn(void *arg)
1394 {
1395 CPUState *cpu = arg;
1396 int r;
1397
1398 qemu_mutex_lock_iothread();
1399 qemu_thread_get_self(cpu->thread);
1400
1401 cpu->thread_id = qemu_get_thread_id();
1402 cpu->created = true;
1403 cpu->halted = 0;
1404 current_cpu = cpu;
1405
1406 hax_init_vcpu(cpu);
1407 qemu_cond_signal(&qemu_cpu_cond);
1408
1409 while (1) {
1410 if (cpu_can_run(cpu)) {
1411 r = hax_smp_cpu_exec(cpu);
1412 if (r == EXCP_DEBUG) {
1413 cpu_handle_guest_debug(cpu);
1414 }
1415 }
1416
1417 while (cpu_thread_is_idle(cpu)) {
1418 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1419 }
1420 #ifdef _WIN32
1421 SleepEx(0, TRUE);
1422 #endif
1423 qemu_wait_io_event_common(cpu);
1424 }
1425 return NULL;
1426 }
1427
1428 #ifdef _WIN32
1429 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1430 {
1431 }
1432 #endif
1433
1434 /* Multi-threaded TCG
1435 *
1436 * In the multi-threaded case each vCPU has its own thread. The TLS
1437 * variable current_cpu can be used deep in the code to find the
1438 * current CPUState for a given thread.
1439 */
1440
1441 static void *qemu_tcg_cpu_thread_fn(void *arg)
1442 {
1443 CPUState *cpu = arg;
1444
1445 g_assert(!use_icount);
1446
1447 rcu_register_thread();
1448
1449 qemu_mutex_lock_iothread();
1450 qemu_thread_get_self(cpu->thread);
1451
1452 cpu->thread_id = qemu_get_thread_id();
1453 cpu->created = true;
1454 cpu->can_do_io = 1;
1455 current_cpu = cpu;
1456 qemu_cond_signal(&qemu_cpu_cond);
1457
1458 /* process any pending work */
1459 cpu->exit_request = 1;
1460
1461 while (1) {
1462 if (cpu_can_run(cpu)) {
1463 int r;
1464 r = tcg_cpu_exec(cpu);
1465 switch (r) {
1466 case EXCP_DEBUG:
1467 cpu_handle_guest_debug(cpu);
1468 break;
1469 case EXCP_HALTED:
1470 /* during start-up the vCPU is reset and the thread is
1471 * kicked several times. If we don't ensure we go back
1472 * to sleep in the halted state we won't cleanly
1473                  * start up when the vCPU is enabled.
1474 *
1475 * cpu->halted should ensure we sleep in wait_io_event
1476 */
1477 g_assert(cpu->halted);
1478 break;
1479 case EXCP_ATOMIC:
1480 qemu_mutex_unlock_iothread();
1481 cpu_exec_step_atomic(cpu);
1482 qemu_mutex_lock_iothread();
1483 default:
1484 /* Ignore everything else? */
1485 break;
1486 }
1487 }
1488
1489 atomic_mb_set(&cpu->exit_request, 0);
1490 qemu_tcg_wait_io_event(cpu);
1491 }
1492
1493 return NULL;
1494 }
1495
1496 static void qemu_cpu_kick_thread(CPUState *cpu)
1497 {
1498 #ifndef _WIN32
1499 int err;
1500
1501 if (cpu->thread_kicked) {
1502 return;
1503 }
1504 cpu->thread_kicked = true;
1505 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1506 if (err) {
1507 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1508 exit(1);
1509 }
1510 #else /* _WIN32 */
1511 if (!qemu_cpu_is_self(cpu)) {
1512 if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1513 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1514 __func__, GetLastError());
1515 exit(1);
1516 }
1517 }
1518 #endif
1519 }
1520
1521 void qemu_cpu_kick(CPUState *cpu)
1522 {
1523 qemu_cond_broadcast(cpu->halt_cond);
1524 if (tcg_enabled()) {
1525 cpu_exit(cpu);
1526 /* NOP unless doing single-thread RR */
1527 qemu_cpu_kick_rr_cpu();
1528 } else {
1529 if (hax_enabled()) {
1530 /*
1531 * FIXME: race condition with the exit_request check in
1532 * hax_vcpu_hax_exec
1533 */
1534 cpu->exit_request = 1;
1535 }
1536 qemu_cpu_kick_thread(cpu);
1537 }
1538 }
1539
1540 void qemu_cpu_kick_self(void)
1541 {
1542 assert(current_cpu);
1543 qemu_cpu_kick_thread(current_cpu);
1544 }
1545
1546 bool qemu_cpu_is_self(CPUState *cpu)
1547 {
1548 return qemu_thread_is_self(cpu->thread);
1549 }
1550
1551 bool qemu_in_vcpu_thread(void)
1552 {
1553 return current_cpu && qemu_cpu_is_self(current_cpu);
1554 }
1555
1556 static __thread bool iothread_locked = false;
1557
1558 bool qemu_mutex_iothread_locked(void)
1559 {
1560 return iothread_locked;
1561 }
1562
1563 void qemu_mutex_lock_iothread(void)
1564 {
1565 g_assert(!qemu_mutex_iothread_locked());
1566 qemu_mutex_lock(&qemu_global_mutex);
1567 iothread_locked = true;
1568 }
1569
1570 void qemu_mutex_unlock_iothread(void)
1571 {
1572 g_assert(qemu_mutex_iothread_locked());
1573 iothread_locked = false;
1574 qemu_mutex_unlock(&qemu_global_mutex);
1575 }
1576
1577 static bool all_vcpus_paused(void)
1578 {
1579 CPUState *cpu;
1580
1581 CPU_FOREACH(cpu) {
1582 if (!cpu->stopped) {
1583 return false;
1584 }
1585 }
1586
1587 return true;
1588 }
1589
1590 void pause_all_vcpus(void)
1591 {
1592 CPUState *cpu;
1593
1594 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1595 CPU_FOREACH(cpu) {
1596 cpu->stop = true;
1597 qemu_cpu_kick(cpu);
1598 }
1599
1600 if (qemu_in_vcpu_thread()) {
1601 cpu_stop_current();
1602 }
1603
1604 while (!all_vcpus_paused()) {
1605 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1606 CPU_FOREACH(cpu) {
1607 qemu_cpu_kick(cpu);
1608 }
1609 }
1610 }
1611
1612 void cpu_resume(CPUState *cpu)
1613 {
1614 cpu->stop = false;
1615 cpu->stopped = false;
1616 qemu_cpu_kick(cpu);
1617 }
1618
1619 void resume_all_vcpus(void)
1620 {
1621 CPUState *cpu;
1622
1623 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1624 CPU_FOREACH(cpu) {
1625 cpu_resume(cpu);
1626 }
1627 }
1628
1629 void cpu_remove(CPUState *cpu)
1630 {
1631 cpu->stop = true;
1632 cpu->unplug = true;
1633 qemu_cpu_kick(cpu);
1634 }
1635
1636 void cpu_remove_sync(CPUState *cpu)
1637 {
1638 cpu_remove(cpu);
1639 while (cpu->created) {
1640 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1641 }
1642 }
1643
1644 /* For temporary buffers for forming a name */
1645 #define VCPU_THREAD_NAME_SIZE 16
1646
1647 static void qemu_tcg_init_vcpu(CPUState *cpu)
1648 {
1649 char thread_name[VCPU_THREAD_NAME_SIZE];
1650 static QemuCond *single_tcg_halt_cond;
1651 static QemuThread *single_tcg_cpu_thread;
1652
1653 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1654 cpu->thread = g_malloc0(sizeof(QemuThread));
1655 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1656 qemu_cond_init(cpu->halt_cond);
1657
1658 if (qemu_tcg_mttcg_enabled()) {
1659 /* create a thread per vCPU with TCG (MTTCG) */
1660 parallel_cpus = true;
1661 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1662 cpu->cpu_index);
1663
1664 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1665 cpu, QEMU_THREAD_JOINABLE);
1666
1667 } else {
1668 /* share a single thread for all cpus with TCG */
1669 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1670 qemu_thread_create(cpu->thread, thread_name,
1671 qemu_tcg_rr_cpu_thread_fn,
1672 cpu, QEMU_THREAD_JOINABLE);
1673
1674 single_tcg_halt_cond = cpu->halt_cond;
1675 single_tcg_cpu_thread = cpu->thread;
1676 }
1677 #ifdef _WIN32
1678 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1679 #endif
1680 while (!cpu->created) {
1681 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1682 }
1683 } else {
1684 /* For non-MTTCG cases we share the thread */
1685 cpu->thread = single_tcg_cpu_thread;
1686 cpu->halt_cond = single_tcg_halt_cond;
1687 }
1688 }
1689
1690 static void qemu_hax_start_vcpu(CPUState *cpu)
1691 {
1692 char thread_name[VCPU_THREAD_NAME_SIZE];
1693
1694 cpu->thread = g_malloc0(sizeof(QemuThread));
1695 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1696 qemu_cond_init(cpu->halt_cond);
1697
1698 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1699 cpu->cpu_index);
1700 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1701 cpu, QEMU_THREAD_JOINABLE);
1702 #ifdef _WIN32
1703 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1704 #endif
1705 while (!cpu->created) {
1706 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1707 }
1708 }
1709
1710 static void qemu_kvm_start_vcpu(CPUState *cpu)
1711 {
1712 char thread_name[VCPU_THREAD_NAME_SIZE];
1713
1714 cpu->thread = g_malloc0(sizeof(QemuThread));
1715 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1716 qemu_cond_init(cpu->halt_cond);
1717 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1718 cpu->cpu_index);
1719 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1720 cpu, QEMU_THREAD_JOINABLE);
1721 while (!cpu->created) {
1722 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1723 }
1724 }
1725
1726 static void qemu_dummy_start_vcpu(CPUState *cpu)
1727 {
1728 char thread_name[VCPU_THREAD_NAME_SIZE];
1729
1730 cpu->thread = g_malloc0(sizeof(QemuThread));
1731 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1732 qemu_cond_init(cpu->halt_cond);
1733 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1734 cpu->cpu_index);
1735 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1736 QEMU_THREAD_JOINABLE);
1737 while (!cpu->created) {
1738 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1739 }
1740 }
1741
1742 void qemu_init_vcpu(CPUState *cpu)
1743 {
1744 cpu->nr_cores = smp_cores;
1745 cpu->nr_threads = smp_threads;
1746 cpu->stopped = true;
1747
1748 if (!cpu->as) {
1749 /* If the target cpu hasn't set up any address spaces itself,
1750 * give it the default one.
1751 */
1752 AddressSpace *as = address_space_init_shareable(cpu->memory,
1753 "cpu-memory");
1754 cpu->num_ases = 1;
1755 cpu_address_space_init(cpu, as, 0);
1756 }
1757
1758 if (kvm_enabled()) {
1759 qemu_kvm_start_vcpu(cpu);
1760 } else if (hax_enabled()) {
1761 qemu_hax_start_vcpu(cpu);
1762 } else if (tcg_enabled()) {
1763 qemu_tcg_init_vcpu(cpu);
1764 } else {
1765 qemu_dummy_start_vcpu(cpu);
1766 }
1767 }
1768
1769 void cpu_stop_current(void)
1770 {
1771 if (current_cpu) {
1772 current_cpu->stop = false;
1773 current_cpu->stopped = true;
1774 cpu_exit(current_cpu);
1775 qemu_cond_broadcast(&qemu_pause_cond);
1776 }
1777 }
1778
1779 int vm_stop(RunState state)
1780 {
1781 if (qemu_in_vcpu_thread()) {
1782 qemu_system_vmstop_request_prepare();
1783 qemu_system_vmstop_request(state);
1784 /*
1785 * FIXME: should not return to device code in case
1786 * vm_stop() has been requested.
1787 */
1788 cpu_stop_current();
1789 return 0;
1790 }
1791
1792 return do_vm_stop(state);
1793 }
1794
1795 /**
1796 * Prepare for (re)starting the VM.
1797 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
1798 * running or in case of an error condition), 0 otherwise.
1799 */
1800 int vm_prepare_start(void)
1801 {
1802 RunState requested;
1803 int res = 0;
1804
1805 qemu_vmstop_requested(&requested);
1806 if (runstate_is_running() && requested == RUN_STATE__MAX) {
1807 return -1;
1808 }
1809
1810 /* Ensure that a STOP/RESUME pair of events is emitted if a
1811 * vmstop request was pending. The BLOCK_IO_ERROR event, for
1812 * example, according to documentation is always followed by
1813 * the STOP event.
1814 */
1815 if (runstate_is_running()) {
1816 qapi_event_send_stop(&error_abort);
1817 res = -1;
1818 } else {
1819 replay_enable_events();
1820 cpu_enable_ticks();
1821 runstate_set(RUN_STATE_RUNNING);
1822 vm_state_notify(1, RUN_STATE_RUNNING);
1823 }
1824
1825     /* We are sending this now, but the CPUs will be resumed shortly afterwards */
1826 qapi_event_send_resume(&error_abort);
1827 return res;
1828 }
1829
1830 void vm_start(void)
1831 {
1832 if (!vm_prepare_start()) {
1833 resume_all_vcpus();
1834 }
1835 }
1836
1837 /* does a state transition even if the VM is already stopped; the
1838 current state is forgotten forever */
1839 int vm_stop_force_state(RunState state)
1840 {
1841 if (runstate_is_running()) {
1842 return vm_stop(state);
1843 } else {
1844 runstate_set(state);
1845
1846 bdrv_drain_all();
1847 /* Make sure to return an error if the flush in a previous vm_stop()
1848 * failed. */
1849 return bdrv_flush_all();
1850 }
1851 }
1852
1853 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1854 {
1855 /* XXX: implement xxx_cpu_list for targets that still miss it */
1856 #if defined(cpu_list)
1857 cpu_list(f, cpu_fprintf);
1858 #endif
1859 }
1860
1861 CpuInfoList *qmp_query_cpus(Error **errp)
1862 {
1863 MachineState *ms = MACHINE(qdev_get_machine());
1864 MachineClass *mc = MACHINE_GET_CLASS(ms);
1865 CpuInfoList *head = NULL, *cur_item = NULL;
1866 CPUState *cpu;
1867
1868 CPU_FOREACH(cpu) {
1869 CpuInfoList *info;
1870 #if defined(TARGET_I386)
1871 X86CPU *x86_cpu = X86_CPU(cpu);
1872 CPUX86State *env = &x86_cpu->env;
1873 #elif defined(TARGET_PPC)
1874 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1875 CPUPPCState *env = &ppc_cpu->env;
1876 #elif defined(TARGET_SPARC)
1877 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1878 CPUSPARCState *env = &sparc_cpu->env;
1879 #elif defined(TARGET_MIPS)
1880 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1881 CPUMIPSState *env = &mips_cpu->env;
1882 #elif defined(TARGET_TRICORE)
1883 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1884 CPUTriCoreState *env = &tricore_cpu->env;
1885 #endif
1886
1887 cpu_synchronize_state(cpu);
1888
1889 info = g_malloc0(sizeof(*info));
1890 info->value = g_malloc0(sizeof(*info->value));
1891 info->value->CPU = cpu->cpu_index;
1892 info->value->current = (cpu == first_cpu);
1893 info->value->halted = cpu->halted;
1894 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1895 info->value->thread_id = cpu->thread_id;
1896 #if defined(TARGET_I386)
1897 info->value->arch = CPU_INFO_ARCH_X86;
1898 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1899 #elif defined(TARGET_PPC)
1900 info->value->arch = CPU_INFO_ARCH_PPC;
1901 info->value->u.ppc.nip = env->nip;
1902 #elif defined(TARGET_SPARC)
1903 info->value->arch = CPU_INFO_ARCH_SPARC;
1904 info->value->u.q_sparc.pc = env->pc;
1905 info->value->u.q_sparc.npc = env->npc;
1906 #elif defined(TARGET_MIPS)
1907 info->value->arch = CPU_INFO_ARCH_MIPS;
1908 info->value->u.q_mips.PC = env->active_tc.PC;
1909 #elif defined(TARGET_TRICORE)
1910 info->value->arch = CPU_INFO_ARCH_TRICORE;
1911 info->value->u.tricore.PC = env->PC;
1912 #else
1913 info->value->arch = CPU_INFO_ARCH_OTHER;
1914 #endif
1915 info->value->has_props = !!mc->cpu_index_to_instance_props;
1916 if (info->value->has_props) {
1917 CpuInstanceProperties *props;
1918 props = g_malloc0(sizeof(*props));
1919 *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
1920 info->value->props = props;
1921 }
1922
1923 /* XXX: waiting for the qapi to support GSList */
1924 if (!cur_item) {
1925 head = cur_item = info;
1926 } else {
1927 cur_item->next = info;
1928 cur_item = info;
1929 }
1930 }
1931
1932 return head;
1933 }
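/*
 * A hedged example of what the above can produce on the wire (field names
 * follow the CpuInfo/CpuInstanceProperties QAPI definitions; values are
 * illustrative only):
 *
 *     -> { "execute": "query-cpus" }
 *     <- { "return": [
 *            { "CPU": 0, "current": true, "halted": false,
 *              "qom_path": "/machine/unattached/device[0]",
 *              "thread_id": 3134, "arch": "x86", "pc": 1048576,
 *              "props": { "socket-id": 0, "core-id": 0, "thread-id": 0 } }
 *        ] }
 *
 * "props" is only present when the machine implements
 * cpu_index_to_instance_props (has_props above).
 */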
1934
1935 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1936 bool has_cpu, int64_t cpu_index, Error **errp)
1937 {
1938 FILE *f;
1939 uint32_t l;
1940 CPUState *cpu;
1941 uint8_t buf[1024];
1942 int64_t orig_addr = addr, orig_size = size;
1943
1944 if (!has_cpu) {
1945 cpu_index = 0;
1946 }
1947
1948 cpu = qemu_get_cpu(cpu_index);
1949 if (cpu == NULL) {
1950 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1951 "a CPU number");
1952 return;
1953 }
1954
1955 f = fopen(filename, "wb");
1956 if (!f) {
1957 error_setg_file_open(errp, errno, filename);
1958 return;
1959 }
1960
1961 while (size != 0) {
1962 l = sizeof(buf);
1963 if (l > size)
1964 l = size;
1965 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1966 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1967 " specified", orig_addr, orig_size);
1968 goto exit;
1969 }
1970 if (fwrite(buf, 1, l, f) != l) {
1971 error_setg(errp, QERR_IO_ERROR);
1972 goto exit;
1973 }
1974 addr += l;
1975 size -= l;
1976 }
1977
1978 exit:
1979 fclose(f);
1980 }
1981
1982 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1983 Error **errp)
1984 {
1985 FILE *f;
1986 uint32_t l;
1987 uint8_t buf[1024];
1988
1989 f = fopen(filename, "wb");
1990 if (!f) {
1991 error_setg_file_open(errp, errno, filename);
1992 return;
1993 }
1994
1995 while (size != 0) {
1996 l = sizeof(buf);
1997 if (l > size)
1998 l = size;
1999 cpu_physical_memory_read(addr, buf, l);
2000 if (fwrite(buf, 1, l, f) != l) {
2001 error_setg(errp, QERR_IO_ERROR);
2002 goto exit;
2003 }
2004 addr += l;
2005 size -= l;
2006 }
2007
2008 exit:
2009 fclose(f);
2010 }
2011
2012 void qmp_inject_nmi(Error **errp)
2013 {
2014 nmi_monitor_handle(monitor_get_cpu_index(), errp);
2015 }
2016
2017 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2018 {
2019 if (!use_icount) {
2020 return;
2021 }
2022
2023 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
2024 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2025 if (icount_align_option) {
2026 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
2027 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
2028 } else {
2029 cpu_fprintf(f, "Max guest delay NA\n");
2030 cpu_fprintf(f, "Max guest advance NA\n");
2031 }
2032 }