1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
28 #include "qemu/config-file.h"
29 #include "cpu.h"
30 #include "monitor/monitor.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qemu/error-report.h"
33 #include "sysemu/sysemu.h"
34 #include "sysemu/block-backend.h"
35 #include "exec/gdbstub.h"
36 #include "sysemu/dma.h"
37 #include "sysemu/hw_accel.h"
38 #include "sysemu/kvm.h"
39 #include "sysemu/hax.h"
40 #include "qmp-commands.h"
41 #include "exec/exec-all.h"
42
43 #include "qemu/thread.h"
44 #include "sysemu/cpus.h"
45 #include "sysemu/qtest.h"
46 #include "qemu/main-loop.h"
47 #include "qemu/bitmap.h"
48 #include "qemu/seqlock.h"
49 #include "tcg.h"
50 #include "qapi-event.h"
51 #include "hw/nmi.h"
52 #include "sysemu/replay.h"
53
54 #ifndef _WIN32
55 #include "qemu/compatfd.h"
56 #endif
57
58 #ifdef CONFIG_LINUX
59
60 #include <sys/prctl.h>
61
62 #ifndef PR_MCE_KILL
63 #define PR_MCE_KILL 33
64 #endif
65
66 #ifndef PR_MCE_KILL_SET
67 #define PR_MCE_KILL_SET 1
68 #endif
69
70 #ifndef PR_MCE_KILL_EARLY
71 #define PR_MCE_KILL_EARLY 1
72 #endif
73
74 #endif /* CONFIG_LINUX */
75
76 int64_t max_delay;
77 int64_t max_advance;
78
79 /* vcpu throttling controls */
80 static QEMUTimer *throttle_timer;
81 static unsigned int throttle_percentage;
82
83 #define CPU_THROTTLE_PCT_MIN 1
84 #define CPU_THROTTLE_PCT_MAX 99
85 #define CPU_THROTTLE_TIMESLICE_NS 10000000
86
87 bool cpu_is_stopped(CPUState *cpu)
88 {
89 return cpu->stopped || !runstate_is_running();
90 }
91
92 static bool cpu_thread_is_idle(CPUState *cpu)
93 {
94 if (cpu->stop || cpu->queued_work_first) {
95 return false;
96 }
97 if (cpu_is_stopped(cpu)) {
98 return true;
99 }
100 if (!cpu->halted || cpu_has_work(cpu) ||
101 kvm_halt_in_kernel()) {
102 return false;
103 }
104 return true;
105 }
106
107 static bool all_cpu_threads_idle(void)
108 {
109 CPUState *cpu;
110
111 CPU_FOREACH(cpu) {
112 if (!cpu_thread_is_idle(cpu)) {
113 return false;
114 }
115 }
116 return true;
117 }
118
119 /***********************************************************/
120 /* guest cycle counter */
121
122 /* Protected by TimersState seqlock */
123
124 static bool icount_sleep = true;
125 static int64_t vm_clock_warp_start = -1;
126 /* Conversion factor from emulated instructions to virtual clock ticks. */
127 static int icount_time_shift;
128 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
129 #define MAX_ICOUNT_SHIFT 10
130
131 static QEMUTimer *icount_rt_timer;
132 static QEMUTimer *icount_vm_timer;
133 static QEMUTimer *icount_warp_timer;
134
135 typedef struct TimersState {
136 /* Protected by BQL. */
137 int64_t cpu_ticks_prev;
138 int64_t cpu_ticks_offset;
139
140 /* cpu_clock_offset can be read out of BQL, so protect it with
141 * this lock.
142 */
143 QemuSeqLock vm_clock_seqlock;
144 int64_t cpu_clock_offset;
145 int32_t cpu_ticks_enabled;
146 int64_t dummy;
147
148 /* Compensate for varying guest execution speed. */
149 int64_t qemu_icount_bias;
150 /* Only written by TCG thread */
151 int64_t qemu_icount;
152 } TimersState;
153
154 static TimersState timers_state;
155 bool mttcg_enabled;
156
157 /*
158 * We default to false if we know other options have been enabled
159  * which are currently incompatible with MTTCG. Otherwise, once each
160  * guest (target) has been updated to support:
161  *   - atomic instructions
162  *   - memory ordering primitives (barriers)
163  * it can set the appropriate CONFIG flags in ${target}-softmmu.mak
164  *
165  * Once a guest architecture has been converted to the new primitives
166  * there are two remaining limitations to check:
167  *
168  * - The guest can't be oversized (e.g. a 64 bit guest on a 32 bit host)
169  * - The host must have a memory model at least as strong as the guest's
170  *
171  * It may be possible in future to support strong guests on weak hosts
172  * but that would require tagging all loads/stores in a guest with their
173  * implicit memory order requirements, which would likely slow things
174 * down a lot.
175 */
176
177 static bool check_tcg_memory_orders_compatible(void)
178 {
179 #if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
180 return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
181 #else
182 return false;
183 #endif
184 }
185
186 static bool default_mttcg_enabled(void)
187 {
188 QemuOpts *icount_opts = qemu_find_opts_singleton("icount");
189 const char *rr = qemu_opt_get(icount_opts, "rr");
190
191 if (rr || TCG_OVERSIZED_GUEST) {
192 return false;
193 } else {
194 #ifdef TARGET_SUPPORTS_MTTCG
195 return check_tcg_memory_orders_compatible();
196 #else
197 return false;
198 #endif
199 }
200 }
201
202 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
203 {
204 const char *t = qemu_opt_get(opts, "thread");
205 if (t) {
206 if (strcmp(t, "multi") == 0) {
207 if (TCG_OVERSIZED_GUEST) {
208                 error_setg(errp, "No MTTCG when guest word size > host's");
209 } else {
210 if (!check_tcg_memory_orders_compatible()) {
211 error_report("Guest expects a stronger memory ordering "
212 "than the host provides");
213 error_printf("This may cause strange/hard to debug errors");
214 }
215 mttcg_enabled = true;
216 }
217 } else if (strcmp(t, "single") == 0) {
218 mttcg_enabled = false;
219 } else {
220 error_setg(errp, "Invalid 'thread' setting %s", t);
221 }
222 } else {
223 mttcg_enabled = default_mttcg_enabled();
224 }
225 }
226
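/* Return the raw instruction counter: the count accounted so far by the
 * TCG thread, minus the instructions the current CPU has budgeted
 * (icount_decr.u16.low + icount_extra) but not yet executed. Exits with
 * "Bad icount read" if called while the CPU cannot do I/O.
 */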
227 int64_t cpu_get_icount_raw(void)
228 {
229 int64_t icount;
230 CPUState *cpu = current_cpu;
231
232 icount = timers_state.qemu_icount;
233 if (cpu) {
234 if (!cpu->can_do_io) {
235 fprintf(stderr, "Bad icount read\n");
236 exit(1);
237 }
238 icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
239 }
240 return icount;
241 }
242
243 /* Return the virtual CPU time, based on the instruction counter. */
244 static int64_t cpu_get_icount_locked(void)
245 {
246 int64_t icount = cpu_get_icount_raw();
247 return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
248 }
249
250 int64_t cpu_get_icount(void)
251 {
252 int64_t icount;
253 unsigned start;
254
255 do {
256 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
257 icount = cpu_get_icount_locked();
258 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
259
260 return icount;
261 }
262
263 int64_t cpu_icount_to_ns(int64_t icount)
264 {
265 return icount << icount_time_shift;
266 }
267
268 /* Return the time elapsed in the VM between vm_start and vm_stop.  Unless
269 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
270 * counter.
271 *
272 * Caller must hold the BQL
273 */
274 int64_t cpu_get_ticks(void)
275 {
276 int64_t ticks;
277
278 if (use_icount) {
279 return cpu_get_icount();
280 }
281
282 ticks = timers_state.cpu_ticks_offset;
283 if (timers_state.cpu_ticks_enabled) {
284 ticks += cpu_get_host_ticks();
285 }
286
287 if (timers_state.cpu_ticks_prev > ticks) {
288         /* Note: non-increasing ticks may happen if the host uses
289 software suspend */
290 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
291 ticks = timers_state.cpu_ticks_prev;
292 }
293
294 timers_state.cpu_ticks_prev = ticks;
295 return ticks;
296 }
297
298 static int64_t cpu_get_clock_locked(void)
299 {
300 int64_t time;
301
302 time = timers_state.cpu_clock_offset;
303 if (timers_state.cpu_ticks_enabled) {
304 time += get_clock();
305 }
306
307 return time;
308 }
309
310 /* Return the monotonic time elapsed in VM, i.e.,
311 * the time between vm_start and vm_stop
312 */
313 int64_t cpu_get_clock(void)
314 {
315 int64_t ti;
316 unsigned start;
317
318 do {
319 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
320 ti = cpu_get_clock_locked();
321 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
322
323 return ti;
324 }
325
326 /* enable cpu_get_ticks()
327 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
328 */
329 void cpu_enable_ticks(void)
330 {
331     /* Here, the real thing protected by the seqlock is cpu_clock_offset. */
332 seqlock_write_begin(&timers_state.vm_clock_seqlock);
333 if (!timers_state.cpu_ticks_enabled) {
334 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
335 timers_state.cpu_clock_offset -= get_clock();
336 timers_state.cpu_ticks_enabled = 1;
337 }
338 seqlock_write_end(&timers_state.vm_clock_seqlock);
339 }
340
341 /* disable cpu_get_ticks(): the clock is stopped. You must not call
342 * cpu_get_ticks() after that.
343 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
344 */
345 void cpu_disable_ticks(void)
346 {
348     /* Here, the real thing protected by the seqlock is cpu_clock_offset. */
348 seqlock_write_begin(&timers_state.vm_clock_seqlock);
349 if (timers_state.cpu_ticks_enabled) {
350 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
351 timers_state.cpu_clock_offset = cpu_get_clock_locked();
352 timers_state.cpu_ticks_enabled = 0;
353 }
354 seqlock_write_end(&timers_state.vm_clock_seqlock);
355 }
356
357 /* Correlation between real and virtual time is always going to be
358    fairly approximate, so ignore small variations.
359    When the guest is idle, real and virtual time will be aligned in
360 the IO wait loop. */
361 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
362
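/* Periodically adjust icount_time_shift so that virtual time (derived from
 * the instruction count) tracks real time: slow the virtual clock down when
 * the guest gets too far ahead and speed it up when it falls behind.
 */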
363 static void icount_adjust(void)
364 {
365 int64_t cur_time;
366 int64_t cur_icount;
367 int64_t delta;
368
369 /* Protected by TimersState mutex. */
370 static int64_t last_delta;
371
372 /* If the VM is not running, then do nothing. */
373 if (!runstate_is_running()) {
374 return;
375 }
376
377 seqlock_write_begin(&timers_state.vm_clock_seqlock);
378 cur_time = cpu_get_clock_locked();
379 cur_icount = cpu_get_icount_locked();
380
381 delta = cur_icount - cur_time;
382 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
383 if (delta > 0
384 && last_delta + ICOUNT_WOBBLE < delta * 2
385 && icount_time_shift > 0) {
386 /* The guest is getting too far ahead. Slow time down. */
387 icount_time_shift--;
388 }
389 if (delta < 0
390 && last_delta - ICOUNT_WOBBLE > delta * 2
391 && icount_time_shift < MAX_ICOUNT_SHIFT) {
392 /* The guest is getting too far behind. Speed time up. */
393 icount_time_shift++;
394 }
395 last_delta = delta;
396 timers_state.qemu_icount_bias = cur_icount
397 - (timers_state.qemu_icount << icount_time_shift);
398 seqlock_write_end(&timers_state.vm_clock_seqlock);
399 }
400
401 static void icount_adjust_rt(void *opaque)
402 {
403 timer_mod(icount_rt_timer,
404 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
405 icount_adjust();
406 }
407
408 static void icount_adjust_vm(void *opaque)
409 {
410 timer_mod(icount_vm_timer,
411 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
412 NANOSECONDS_PER_SECOND / 10);
413 icount_adjust();
414 }
415
416 static int64_t qemu_icount_round(int64_t count)
417 {
418 return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
419 }
420
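/* Apply a pending clock warp: credit the real time elapsed since
 * vm_clock_warp_start to the icount bias (limited, in adaptive mode, so
 * that QEMU_CLOCK_VIRTUAL does not get too far ahead of real time).
 */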
421 static void icount_warp_rt(void)
422 {
423 unsigned seq;
424 int64_t warp_start;
425
426 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
427 * changes from -1 to another value, so the race here is okay.
428 */
429 do {
430 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
431 warp_start = vm_clock_warp_start;
432 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
433
434 if (warp_start == -1) {
435 return;
436 }
437
438 seqlock_write_begin(&timers_state.vm_clock_seqlock);
439 if (runstate_is_running()) {
440 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
441 cpu_get_clock_locked());
442 int64_t warp_delta;
443
444 warp_delta = clock - vm_clock_warp_start;
445 if (use_icount == 2) {
446 /*
447 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
448 * far ahead of real time.
449 */
450 int64_t cur_icount = cpu_get_icount_locked();
451 int64_t delta = clock - cur_icount;
452 warp_delta = MIN(warp_delta, delta);
453 }
454 timers_state.qemu_icount_bias += warp_delta;
455 }
456 vm_clock_warp_start = -1;
457 seqlock_write_end(&timers_state.vm_clock_seqlock);
458
459 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
460 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
461 }
462 }
463
464 static void icount_timer_cb(void *opaque)
465 {
466 /* No need for a checkpoint because the timer already synchronizes
467 * with CHECKPOINT_CLOCK_VIRTUAL_RT.
468 */
469 icount_warp_rt();
470 }
471
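/* qtest only: warp QEMU_CLOCK_VIRTUAL forward to 'dest', running any timers
 * that expire along the way by repeatedly adding the distance to the next
 * deadline to the icount bias.
 */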
472 void qtest_clock_warp(int64_t dest)
473 {
474 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
475 AioContext *aio_context;
476 assert(qtest_enabled());
477 aio_context = qemu_get_aio_context();
478 while (clock < dest) {
479 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
480 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
481
482 seqlock_write_begin(&timers_state.vm_clock_seqlock);
483 timers_state.qemu_icount_bias += warp;
484 seqlock_write_end(&timers_state.vm_clock_seqlock);
485
486 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
487 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
488 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
489 }
490 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
491 }
492
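/* Start a clock warp if every vCPU is idle under icount: either advance
 * QEMU_CLOCK_VIRTUAL immediately to the next timer deadline (when
 * icount_sleep is off) or arm icount_warp_timer so the warp is applied once
 * the corresponding amount of real time has elapsed.
 */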
493 void qemu_start_warp_timer(void)
494 {
495 int64_t clock;
496 int64_t deadline;
497
498 if (!use_icount) {
499 return;
500 }
501
502 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
503 * do not fire, so computing the deadline does not make sense.
504 */
505 if (!runstate_is_running()) {
506 return;
507 }
508
509 /* warp clock deterministically in record/replay mode */
510 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
511 return;
512 }
513
514 if (!all_cpu_threads_idle()) {
515 return;
516 }
517
518 if (qtest_enabled()) {
519 /* When testing, qtest commands advance icount. */
520 return;
521 }
522
523 /* We want to use the earliest deadline from ALL vm_clocks */
524 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
525 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
526 if (deadline < 0) {
527 static bool notified;
528 if (!icount_sleep && !notified) {
529 error_report("WARNING: icount sleep disabled and no active timers");
530 notified = true;
531 }
532 return;
533 }
534
535 if (deadline > 0) {
536 /*
537 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
538 * sleep. Otherwise, the CPU might be waiting for a future timer
539 * interrupt to wake it up, but the interrupt never comes because
540 * the vCPU isn't running any insns and thus doesn't advance the
541 * QEMU_CLOCK_VIRTUAL.
542 */
543 if (!icount_sleep) {
544 /*
545              * We never let VCPUs sleep in no-sleep icount mode.
546 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
547 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
548 * It is useful when we want a deterministic execution time,
549 * isolated from host latencies.
550 */
551 seqlock_write_begin(&timers_state.vm_clock_seqlock);
552 timers_state.qemu_icount_bias += deadline;
553 seqlock_write_end(&timers_state.vm_clock_seqlock);
554 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
555 } else {
556 /*
557 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
558 * "real" time, (related to the time left until the next event) has
559 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
560              * This prevents the warps from being visible externally; for example,
561              * you will not send network packets continuously instead of
562 * every 100ms.
563 */
564 seqlock_write_begin(&timers_state.vm_clock_seqlock);
565 if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
566 vm_clock_warp_start = clock;
567 }
568 seqlock_write_end(&timers_state.vm_clock_seqlock);
569 timer_mod_anticipate(icount_warp_timer, clock + deadline);
570 }
571 } else if (deadline == 0) {
572 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
573 }
574 }
575
576 static void qemu_account_warp_timer(void)
577 {
578 if (!use_icount || !icount_sleep) {
579 return;
580 }
581
582 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
583 * do not fire, so computing the deadline does not make sense.
584 */
585 if (!runstate_is_running()) {
586 return;
587 }
588
589 /* warp clock deterministically in record/replay mode */
590 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
591 return;
592 }
593
594 timer_del(icount_warp_timer);
595 icount_warp_rt();
596 }
597
598 static bool icount_state_needed(void *opaque)
599 {
600 return use_icount;
601 }
602
603 /*
604 * This is a subsection for icount migration.
605 */
606 static const VMStateDescription icount_vmstate_timers = {
607 .name = "timer/icount",
608 .version_id = 1,
609 .minimum_version_id = 1,
610 .needed = icount_state_needed,
611 .fields = (VMStateField[]) {
612 VMSTATE_INT64(qemu_icount_bias, TimersState),
613 VMSTATE_INT64(qemu_icount, TimersState),
614 VMSTATE_END_OF_LIST()
615 }
616 };
617
618 static const VMStateDescription vmstate_timers = {
619 .name = "timer",
620 .version_id = 2,
621 .minimum_version_id = 1,
622 .fields = (VMStateField[]) {
623 VMSTATE_INT64(cpu_ticks_offset, TimersState),
624 VMSTATE_INT64(dummy, TimersState),
625 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
626 VMSTATE_END_OF_LIST()
627 },
628 .subsections = (const VMStateDescription*[]) {
629 &icount_vmstate_timers,
630 NULL
631 }
632 };
633
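/* Run on each vCPU by the throttle timer: drop the BQL and sleep for
 * throttle_ratio * CPU_THROTTLE_TIMESLICE_NS, where throttle_ratio is
 * pct / (1 - pct). Combined with the timer period set in
 * cpu_throttle_timer_tick() this keeps the vCPU asleep for roughly 'pct'
 * of wall-clock time.
 */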
634 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
635 {
636 double pct;
637 double throttle_ratio;
638 long sleeptime_ns;
639
640 if (!cpu_throttle_get_percentage()) {
641 return;
642 }
643
644 pct = (double)cpu_throttle_get_percentage()/100;
645 throttle_ratio = pct / (1 - pct);
646 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
647
648 qemu_mutex_unlock_iothread();
649 atomic_set(&cpu->throttle_thread_scheduled, 0);
650 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
651 qemu_mutex_lock_iothread();
652 }
653
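/* Timer callback: queue cpu_throttle_thread() on every vCPU that does not
 * already have a throttle sleep pending, then re-arm the timer so that one
 * timeslice of run time fits into each period.
 */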
654 static void cpu_throttle_timer_tick(void *opaque)
655 {
656 CPUState *cpu;
657 double pct;
658
659 /* Stop the timer if needed */
660 if (!cpu_throttle_get_percentage()) {
661 return;
662 }
663 CPU_FOREACH(cpu) {
664 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
665 async_run_on_cpu(cpu, cpu_throttle_thread,
666 RUN_ON_CPU_NULL);
667 }
668 }
669
670 pct = (double)cpu_throttle_get_percentage()/100;
671 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
672 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
673 }
674
675 void cpu_throttle_set(int new_throttle_pct)
676 {
677 /* Ensure throttle percentage is within valid range */
678 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
679 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
680
681 atomic_set(&throttle_percentage, new_throttle_pct);
682
683 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
684 CPU_THROTTLE_TIMESLICE_NS);
685 }
686
687 void cpu_throttle_stop(void)
688 {
689 atomic_set(&throttle_percentage, 0);
690 }
691
692 bool cpu_throttle_active(void)
693 {
694 return (cpu_throttle_get_percentage() != 0);
695 }
696
697 int cpu_throttle_get_percentage(void)
698 {
699 return atomic_read(&throttle_percentage);
700 }
701
702 void cpu_ticks_init(void)
703 {
704 seqlock_init(&timers_state.vm_clock_seqlock);
705 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
706 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
707 cpu_throttle_timer_tick, NULL);
708 }
709
710 void configure_icount(QemuOpts *opts, Error **errp)
711 {
712 const char *option;
713 char *rem_str = NULL;
714
715 option = qemu_opt_get(opts, "shift");
716 if (!option) {
717 if (qemu_opt_get(opts, "align") != NULL) {
718 error_setg(errp, "Please specify shift option when using align");
719 }
720 return;
721 }
722
723 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
724 if (icount_sleep) {
725 icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
726 icount_timer_cb, NULL);
727 }
728
729 icount_align_option = qemu_opt_get_bool(opts, "align", false);
730
731 if (icount_align_option && !icount_sleep) {
732 error_setg(errp, "align=on and sleep=off are incompatible");
733 }
734 if (strcmp(option, "auto") != 0) {
735 errno = 0;
736 icount_time_shift = strtol(option, &rem_str, 0);
737 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
738 error_setg(errp, "icount: Invalid shift value");
739 }
740 use_icount = 1;
741 return;
742 } else if (icount_align_option) {
743 error_setg(errp, "shift=auto and align=on are incompatible");
744 } else if (!icount_sleep) {
745 error_setg(errp, "shift=auto and sleep=off are incompatible");
746 }
747
748 use_icount = 2;
749
750 /* 125MIPS seems a reasonable initial guess at the guest speed.
751 It will be corrected fairly quickly anyway. */
752 icount_time_shift = 3;
753
754 /* Have both realtime and virtual time triggers for speed adjustment.
755 The realtime trigger catches emulated time passing too slowly,
756 the virtual time trigger catches emulated time passing too fast.
757 Realtime triggers occur even when idle, so use them less frequently
758 than VM triggers. */
759 icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
760 icount_adjust_rt, NULL);
761 timer_mod(icount_rt_timer,
762 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
763 icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
764 icount_adjust_vm, NULL);
765 timer_mod(icount_vm_timer,
766 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
767 NANOSECONDS_PER_SECOND / 10);
768 }
769
770 /***********************************************************/
771 /* TCG vCPU kick timer
772 *
773 * The kick timer is responsible for moving single threaded vCPU
774  * emulation on to the next vCPU. If more than one vCPU is running, a
775  * timer event will force a cpu->exit so the next vCPU can get
776  * scheduled.
777  *
778  * The timer is removed while all vCPUs are idle and restarted again
779  * once any vCPU leaves the idle state.
780 */
781
782 static QEMUTimer *tcg_kick_vcpu_timer;
783 static CPUState *tcg_current_rr_cpu;
784
785 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
786
787 static inline int64_t qemu_tcg_next_kick(void)
788 {
789 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
790 }
791
792 /* Kick the currently round-robin scheduled vCPU */
793 static void qemu_cpu_kick_rr_cpu(void)
794 {
795 CPUState *cpu;
796 do {
797 cpu = atomic_mb_read(&tcg_current_rr_cpu);
798 if (cpu) {
799 cpu_exit(cpu);
800 }
801 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
802 }
803
804 static void kick_tcg_thread(void *opaque)
805 {
806 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
807 qemu_cpu_kick_rr_cpu();
808 }
809
810 static void start_tcg_kick_timer(void)
811 {
812 if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
813 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
814 kick_tcg_thread, NULL);
815 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
816 }
817 }
818
819 static void stop_tcg_kick_timer(void)
820 {
821 if (tcg_kick_vcpu_timer) {
822 timer_del(tcg_kick_vcpu_timer);
823 tcg_kick_vcpu_timer = NULL;
824 }
825 }
826
827 /***********************************************************/
828 void hw_error(const char *fmt, ...)
829 {
830 va_list ap;
831 CPUState *cpu;
832
833 va_start(ap, fmt);
834 fprintf(stderr, "qemu: hardware error: ");
835 vfprintf(stderr, fmt, ap);
836 fprintf(stderr, "\n");
837 CPU_FOREACH(cpu) {
838 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
839 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
840 }
841 va_end(ap);
842 abort();
843 }
844
845 void cpu_synchronize_all_states(void)
846 {
847 CPUState *cpu;
848
849 CPU_FOREACH(cpu) {
850 cpu_synchronize_state(cpu);
851 }
852 }
853
854 void cpu_synchronize_all_post_reset(void)
855 {
856 CPUState *cpu;
857
858 CPU_FOREACH(cpu) {
859 cpu_synchronize_post_reset(cpu);
860 }
861 }
862
863 void cpu_synchronize_all_post_init(void)
864 {
865 CPUState *cpu;
866
867 CPU_FOREACH(cpu) {
868 cpu_synchronize_post_init(cpu);
869 }
870 }
871
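/* Stop the VM: pause every vCPU, switch to the requested run state and emit
 * the STOP event, then drain and flush all block devices. Returns the result
 * of bdrv_flush_all().
 */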
872 static int do_vm_stop(RunState state)
873 {
874 int ret = 0;
875
876 if (runstate_is_running()) {
877 cpu_disable_ticks();
878 pause_all_vcpus();
879 runstate_set(state);
880 vm_state_notify(0, state);
881 qapi_event_send_stop(&error_abort);
882 }
883
884 bdrv_drain_all();
885 replay_disable_events();
886 ret = bdrv_flush_all();
887
888 return ret;
889 }
890
891 static bool cpu_can_run(CPUState *cpu)
892 {
893 if (cpu->stop) {
894 return false;
895 }
896 if (cpu_is_stopped(cpu)) {
897 return false;
898 }
899 return true;
900 }
901
902 static void cpu_handle_guest_debug(CPUState *cpu)
903 {
904 gdb_set_stop_cpu(cpu);
905 qemu_system_debug_request();
906 cpu->stopped = true;
907 }
908
909 #ifdef CONFIG_LINUX
910 static void sigbus_reraise(void)
911 {
912 sigset_t set;
913 struct sigaction action;
914
915 memset(&action, 0, sizeof(action));
916 action.sa_handler = SIG_DFL;
917 if (!sigaction(SIGBUS, &action, NULL)) {
918 raise(SIGBUS);
919 sigemptyset(&set);
920 sigaddset(&set, SIGBUS);
921 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
922 }
923 perror("Failed to re-raise SIGBUS!\n");
924 abort();
925 }
926
927 static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
928 void *ctx)
929 {
930 if (kvm_on_sigbus(siginfo->ssi_code,
931 (void *)(intptr_t)siginfo->ssi_addr)) {
932 sigbus_reraise();
933 }
934 }
935
936 static void qemu_init_sigbus(void)
937 {
938 struct sigaction action;
939
940 memset(&action, 0, sizeof(action));
941 action.sa_flags = SA_SIGINFO;
942 action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
943 sigaction(SIGBUS, &action, NULL);
944
945 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
946 }
947
948 static void qemu_kvm_eat_signals(CPUState *cpu)
949 {
950 struct timespec ts = { 0, 0 };
951 siginfo_t siginfo;
952 sigset_t waitset;
953 sigset_t chkset;
954 int r;
955
956 sigemptyset(&waitset);
957 sigaddset(&waitset, SIG_IPI);
958 sigaddset(&waitset, SIGBUS);
959
960 do {
961 r = sigtimedwait(&waitset, &siginfo, &ts);
962 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
963 perror("sigtimedwait");
964 exit(1);
965 }
966
967 switch (r) {
968 case SIGBUS:
969 if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
970 sigbus_reraise();
971 }
972 break;
973 default:
974 break;
975 }
976
977 r = sigpending(&chkset);
978 if (r == -1) {
979 perror("sigpending");
980 exit(1);
981 }
982 } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
983 }
984
985 #else /* !CONFIG_LINUX */
986
987 static void qemu_init_sigbus(void)
988 {
989 }
990
991 static void qemu_kvm_eat_signals(CPUState *cpu)
992 {
993 }
994 #endif /* !CONFIG_LINUX */
995
996 #ifndef _WIN32
997 static void dummy_signal(int sig)
998 {
999 }
1000
1001 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
1002 {
1003 int r;
1004 sigset_t set;
1005 struct sigaction sigact;
1006
1007 memset(&sigact, 0, sizeof(sigact));
1008 sigact.sa_handler = dummy_signal;
1009 sigaction(SIG_IPI, &sigact, NULL);
1010
1011 pthread_sigmask(SIG_BLOCK, NULL, &set);
1012 sigdelset(&set, SIG_IPI);
1013 sigdelset(&set, SIGBUS);
1014 r = kvm_set_signal_mask(cpu, &set);
1015 if (r) {
1016 fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
1017 exit(1);
1018 }
1019 }
1020
1021 #else /* _WIN32 */
1022 static void qemu_kvm_init_cpu_signals(CPUState *cpu)
1023 {
1024 abort();
1025 }
1026 #endif /* _WIN32 */
1027
1028 static QemuMutex qemu_global_mutex;
1029
1030 static QemuThread io_thread;
1031
1032 /* cpu creation */
1033 static QemuCond qemu_cpu_cond;
1034 /* system init */
1035 static QemuCond qemu_pause_cond;
1036
1037 void qemu_init_cpu_loop(void)
1038 {
1039 qemu_init_sigbus();
1040 qemu_cond_init(&qemu_cpu_cond);
1041 qemu_cond_init(&qemu_pause_cond);
1042 qemu_mutex_init(&qemu_global_mutex);
1043
1044 qemu_thread_get_self(&io_thread);
1045 }
1046
1047 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1048 {
1049 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1050 }
1051
1052 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1053 {
1054 if (kvm_destroy_vcpu(cpu) < 0) {
1055 error_report("kvm_destroy_vcpu failed");
1056 exit(EXIT_FAILURE);
1057 }
1058 }
1059
1060 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1061 {
1062 }
1063
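/* Common bookkeeping after a vCPU thread wakes up (called with the BQL
 * held): clear the kick flag, complete a pending stop request, and run any
 * work items queued via run_on_cpu()/async_run_on_cpu().
 */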
1064 static void qemu_wait_io_event_common(CPUState *cpu)
1065 {
1066 atomic_mb_set(&cpu->thread_kicked, false);
1067 if (cpu->stop) {
1068 cpu->stop = false;
1069 cpu->stopped = true;
1070 qemu_cond_broadcast(&qemu_pause_cond);
1071 }
1072 process_queued_cpu_work(cpu);
1073 }
1074
1075 static bool qemu_tcg_should_sleep(CPUState *cpu)
1076 {
1077 if (mttcg_enabled) {
1078 return cpu_thread_is_idle(cpu);
1079 } else {
1080 return all_cpu_threads_idle();
1081 }
1082 }
1083
1084 static void qemu_tcg_wait_io_event(CPUState *cpu)
1085 {
1086 while (qemu_tcg_should_sleep(cpu)) {
1087 stop_tcg_kick_timer();
1088 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1089 }
1090
1091 start_tcg_kick_timer();
1092
1093 qemu_wait_io_event_common(cpu);
1094 }
1095
1096 static void qemu_kvm_wait_io_event(CPUState *cpu)
1097 {
1098 while (cpu_thread_is_idle(cpu)) {
1099 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1100 }
1101
1102 qemu_kvm_eat_signals(cpu);
1103 qemu_wait_io_event_common(cpu);
1104 }
1105
1106 static void *qemu_kvm_cpu_thread_fn(void *arg)
1107 {
1108 CPUState *cpu = arg;
1109 int r;
1110
1111 rcu_register_thread();
1112
1113 qemu_mutex_lock_iothread();
1114 qemu_thread_get_self(cpu->thread);
1115 cpu->thread_id = qemu_get_thread_id();
1116 cpu->can_do_io = 1;
1117 current_cpu = cpu;
1118
1119 r = kvm_init_vcpu(cpu);
1120 if (r < 0) {
1121 fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
1122 exit(1);
1123 }
1124
1125 qemu_kvm_init_cpu_signals(cpu);
1126
1127 /* signal CPU creation */
1128 cpu->created = true;
1129 qemu_cond_signal(&qemu_cpu_cond);
1130
1131 do {
1132 if (cpu_can_run(cpu)) {
1133 r = kvm_cpu_exec(cpu);
1134 if (r == EXCP_DEBUG) {
1135 cpu_handle_guest_debug(cpu);
1136 }
1137 }
1138 qemu_kvm_wait_io_event(cpu);
1139 } while (!cpu->unplug || cpu_can_run(cpu));
1140
1141 qemu_kvm_destroy_vcpu(cpu);
1142 cpu->created = false;
1143 qemu_cond_signal(&qemu_cpu_cond);
1144 qemu_mutex_unlock_iothread();
1145 return NULL;
1146 }
1147
1148 static void *qemu_dummy_cpu_thread_fn(void *arg)
1149 {
1150 #ifdef _WIN32
1151 fprintf(stderr, "qtest is not supported under Windows\n");
1152 exit(1);
1153 #else
1154 CPUState *cpu = arg;
1155 sigset_t waitset;
1156 int r;
1157
1158 rcu_register_thread();
1159
1160 qemu_mutex_lock_iothread();
1161 qemu_thread_get_self(cpu->thread);
1162 cpu->thread_id = qemu_get_thread_id();
1163 cpu->can_do_io = 1;
1164 current_cpu = cpu;
1165
1166 sigemptyset(&waitset);
1167 sigaddset(&waitset, SIG_IPI);
1168
1169 /* signal CPU creation */
1170 cpu->created = true;
1171 qemu_cond_signal(&qemu_cpu_cond);
1172
1173 while (1) {
1174 qemu_mutex_unlock_iothread();
1175 do {
1176 int sig;
1177 r = sigwait(&waitset, &sig);
1178 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1179 if (r == -1) {
1180 perror("sigwait");
1181 exit(1);
1182 }
1183 qemu_mutex_lock_iothread();
1184 qemu_wait_io_event_common(cpu);
1185 }
1186
1187 return NULL;
1188 #endif
1189 }
1190
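/* Return how many instructions the vCPU may execute before the next
 * QEMU_CLOCK_VIRTUAL deadline (capped at INT32_MAX ns), or, in replay mode,
 * the instruction budget taken from the replay log.
 */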
1191 static int64_t tcg_get_icount_limit(void)
1192 {
1193 int64_t deadline;
1194
1195 if (replay_mode != REPLAY_MODE_PLAY) {
1196 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1197
1198 /* Maintain prior (possibly buggy) behaviour where if no deadline
1199 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1200 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1201 * nanoseconds.
1202 */
1203 if ((deadline < 0) || (deadline > INT32_MAX)) {
1204 deadline = INT32_MAX;
1205 }
1206
1207 return qemu_icount_round(deadline);
1208 } else {
1209 return replay_get_instructions();
1210 }
1211 }
1212
1213 static void handle_icount_deadline(void)
1214 {
1215 if (use_icount) {
1216 int64_t deadline =
1217 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1218
1219 if (deadline == 0) {
1220 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1221 }
1222 }
1223 }
1224
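/* Execute guest code on one vCPU without the BQL. In icount mode the
 * instruction budget from tcg_get_icount_limit() is split between the
 * 16-bit icount_decr.u16.low field (decremented by generated code) and
 * icount_extra; whatever remains unexecuted is folded back into
 * timers_state.qemu_icount afterwards.
 */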
1225 static int tcg_cpu_exec(CPUState *cpu)
1226 {
1227 int ret;
1228 #ifdef CONFIG_PROFILER
1229 int64_t ti;
1230 #endif
1231
1232 #ifdef CONFIG_PROFILER
1233 ti = profile_getclock();
1234 #endif
1235 if (use_icount) {
1236 int64_t count;
1237 int decr;
1238 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1239 + cpu->icount_extra);
1240 cpu->icount_decr.u16.low = 0;
1241 cpu->icount_extra = 0;
1242 count = tcg_get_icount_limit();
1243 timers_state.qemu_icount += count;
1244 decr = (count > 0xffff) ? 0xffff : count;
1245 count -= decr;
1246 cpu->icount_decr.u16.low = decr;
1247 cpu->icount_extra = count;
1248 }
1249 qemu_mutex_unlock_iothread();
1250 cpu_exec_start(cpu);
1251 ret = cpu_exec(cpu);
1252 cpu_exec_end(cpu);
1253 qemu_mutex_lock_iothread();
1254 #ifdef CONFIG_PROFILER
1255 tcg_time += profile_getclock() - ti;
1256 #endif
1257 if (use_icount) {
1258 /* Fold pending instructions back into the
1259 instruction counter, and clear the interrupt flag. */
1260 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1261 + cpu->icount_extra);
1262 cpu->icount_decr.u32 = 0;
1263 cpu->icount_extra = 0;
1264 replay_account_executed_instructions();
1265 }
1266 return ret;
1267 }
1268
1269 /* Destroy any remaining vCPUs which have been unplugged and have
1270 * finished running
1271 */
1272 static void deal_with_unplugged_cpus(void)
1273 {
1274 CPUState *cpu;
1275
1276 CPU_FOREACH(cpu) {
1277 if (cpu->unplug && !cpu_can_run(cpu)) {
1278 qemu_tcg_destroy_vcpu(cpu);
1279 cpu->created = false;
1280 qemu_cond_signal(&qemu_cpu_cond);
1281 break;
1282 }
1283 }
1284 }
1285
1286 /* Single-threaded TCG
1287 *
1288 * In the single-threaded case each vCPU is simulated in turn. If
1289 * there is more than a single vCPU we create a simple timer to kick
1290 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1291 * This is done explicitly rather than relying on side-effects
1292 * elsewhere.
1293 */
1294
1295 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1296 {
1297 CPUState *cpu = arg;
1298
1299 rcu_register_thread();
1300
1301 qemu_mutex_lock_iothread();
1302 qemu_thread_get_self(cpu->thread);
1303
1304 CPU_FOREACH(cpu) {
1305 cpu->thread_id = qemu_get_thread_id();
1306 cpu->created = true;
1307 cpu->can_do_io = 1;
1308 }
1309 qemu_cond_signal(&qemu_cpu_cond);
1310
1311 /* wait for initial kick-off after machine start */
1312 while (first_cpu->stopped) {
1313 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1314
1315 /* process any pending work */
1316 CPU_FOREACH(cpu) {
1317 current_cpu = cpu;
1318 qemu_wait_io_event_common(cpu);
1319 }
1320 }
1321
1322 start_tcg_kick_timer();
1323
1324 cpu = first_cpu;
1325
1326 /* process any pending work */
1327 cpu->exit_request = 1;
1328
1329 while (1) {
1330 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1331 qemu_account_warp_timer();
1332
1333 if (!cpu) {
1334 cpu = first_cpu;
1335 }
1336
1337 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1338
1339 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1340 current_cpu = cpu;
1341
1342 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1343 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1344
1345 if (cpu_can_run(cpu)) {
1346 int r;
1347 r = tcg_cpu_exec(cpu);
1348 if (r == EXCP_DEBUG) {
1349 cpu_handle_guest_debug(cpu);
1350 break;
1351 } else if (r == EXCP_ATOMIC) {
1352 qemu_mutex_unlock_iothread();
1353 cpu_exec_step_atomic(cpu);
1354 qemu_mutex_lock_iothread();
1355 break;
1356 }
1357 } else if (cpu->stop) {
1358 if (cpu->unplug) {
1359 cpu = CPU_NEXT(cpu);
1360 }
1361 break;
1362 }
1363
1364 cpu = CPU_NEXT(cpu);
1365 } /* while (cpu && !cpu->exit_request).. */
1366
1367 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1368 atomic_set(&tcg_current_rr_cpu, NULL);
1369
1370 if (cpu && cpu->exit_request) {
1371 atomic_mb_set(&cpu->exit_request, 0);
1372 }
1373
1374 handle_icount_deadline();
1375
1376 qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
1377 deal_with_unplugged_cpus();
1378 }
1379
1380 return NULL;
1381 }
1382
1383 static void *qemu_hax_cpu_thread_fn(void *arg)
1384 {
1385 CPUState *cpu = arg;
1386 int r;
1387 qemu_thread_get_self(cpu->thread);
1388 qemu_mutex_lock(&qemu_global_mutex);
1389
1390 cpu->thread_id = qemu_get_thread_id();
1391 cpu->created = true;
1392 cpu->halted = 0;
1393 current_cpu = cpu;
1394
1395 hax_init_vcpu(cpu);
1396 qemu_cond_signal(&qemu_cpu_cond);
1397
1398 while (1) {
1399 if (cpu_can_run(cpu)) {
1400 r = hax_smp_cpu_exec(cpu);
1401 if (r == EXCP_DEBUG) {
1402 cpu_handle_guest_debug(cpu);
1403 }
1404 }
1405
1406 while (cpu_thread_is_idle(cpu)) {
1407 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1408 }
1409 #ifdef _WIN32
1410 SleepEx(0, TRUE);
1411 #endif
1412 qemu_wait_io_event_common(cpu);
1413 }
1414 return NULL;
1415 }
1416
1417 #ifdef _WIN32
1418 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1419 {
1420 }
1421 #endif
1422
1423 /* Multi-threaded TCG
1424 *
1425 * In the multi-threaded case each vCPU has its own thread. The TLS
1426 * variable current_cpu can be used deep in the code to find the
1427 * current CPUState for a given thread.
1428 */
1429
1430 static void *qemu_tcg_cpu_thread_fn(void *arg)
1431 {
1432 CPUState *cpu = arg;
1433
1434 rcu_register_thread();
1435
1436 qemu_mutex_lock_iothread();
1437 qemu_thread_get_self(cpu->thread);
1438
1439 cpu->thread_id = qemu_get_thread_id();
1440 cpu->created = true;
1441 cpu->can_do_io = 1;
1442 current_cpu = cpu;
1443 qemu_cond_signal(&qemu_cpu_cond);
1444
1445 /* process any pending work */
1446 cpu->exit_request = 1;
1447
1448 while (1) {
1449 if (cpu_can_run(cpu)) {
1450 int r;
1451 r = tcg_cpu_exec(cpu);
1452 switch (r) {
1453 case EXCP_DEBUG:
1454 cpu_handle_guest_debug(cpu);
1455 break;
1456 case EXCP_HALTED:
1457 /* during start-up the vCPU is reset and the thread is
1458 * kicked several times. If we don't ensure we go back
1459 * to sleep in the halted state we won't cleanly
1460                  * start up when the vCPU is enabled.
1461 *
1462 * cpu->halted should ensure we sleep in wait_io_event
1463 */
1464 g_assert(cpu->halted);
1465 break;
1466 case EXCP_ATOMIC:
1467 qemu_mutex_unlock_iothread();
1468 cpu_exec_step_atomic(cpu);
1469 qemu_mutex_lock_iothread();
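/* fall through to the default case */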
1470 default:
1471 /* Ignore everything else? */
1472 break;
1473 }
1474 }
1475
1476 handle_icount_deadline();
1477
1478 atomic_mb_set(&cpu->exit_request, 0);
1479 qemu_tcg_wait_io_event(cpu);
1480 }
1481
1482 return NULL;
1483 }
1484
1485 static void qemu_cpu_kick_thread(CPUState *cpu)
1486 {
1487 #ifndef _WIN32
1488 int err;
1489
1490 if (cpu->thread_kicked) {
1491 return;
1492 }
1493 cpu->thread_kicked = true;
1494 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1495 if (err) {
1496 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1497 exit(1);
1498 }
1499 #else /* _WIN32 */
1500 if (!qemu_cpu_is_self(cpu)) {
1501 if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1502 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1503 __func__, GetLastError());
1504 exit(1);
1505 }
1506 }
1507 #endif
1508 }
1509
1510 void qemu_cpu_kick(CPUState *cpu)
1511 {
1512 qemu_cond_broadcast(cpu->halt_cond);
1513 if (tcg_enabled()) {
1514 cpu_exit(cpu);
1515 /* NOP unless doing single-thread RR */
1516 qemu_cpu_kick_rr_cpu();
1517 } else {
1518 if (hax_enabled()) {
1519 /*
1520 * FIXME: race condition with the exit_request check in
1521 * hax_vcpu_hax_exec
1522 */
1523 cpu->exit_request = 1;
1524 }
1525 qemu_cpu_kick_thread(cpu);
1526 }
1527 }
1528
1529 void qemu_cpu_kick_self(void)
1530 {
1531 assert(current_cpu);
1532 qemu_cpu_kick_thread(current_cpu);
1533 }
1534
1535 bool qemu_cpu_is_self(CPUState *cpu)
1536 {
1537 return qemu_thread_is_self(cpu->thread);
1538 }
1539
1540 bool qemu_in_vcpu_thread(void)
1541 {
1542 return current_cpu && qemu_cpu_is_self(current_cpu);
1543 }
1544
1545 static __thread bool iothread_locked = false;
1546
1547 bool qemu_mutex_iothread_locked(void)
1548 {
1549 return iothread_locked;
1550 }
1551
1552 void qemu_mutex_lock_iothread(void)
1553 {
1554 g_assert(!qemu_mutex_iothread_locked());
1555 qemu_mutex_lock(&qemu_global_mutex);
1556 iothread_locked = true;
1557 }
1558
1559 void qemu_mutex_unlock_iothread(void)
1560 {
1561 g_assert(qemu_mutex_iothread_locked());
1562 iothread_locked = false;
1563 qemu_mutex_unlock(&qemu_global_mutex);
1564 }
1565
1566 static bool all_vcpus_paused(void)
1567 {
1568 CPUState *cpu;
1569
1570 CPU_FOREACH(cpu) {
1571 if (!cpu->stopped) {
1572 return false;
1573 }
1574 }
1575
1576 return true;
1577 }
1578
1579 void pause_all_vcpus(void)
1580 {
1581 CPUState *cpu;
1582
1583 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1584 CPU_FOREACH(cpu) {
1585 cpu->stop = true;
1586 qemu_cpu_kick(cpu);
1587 }
1588
1589 if (qemu_in_vcpu_thread()) {
1590 cpu_stop_current();
1591 }
1592
1593 while (!all_vcpus_paused()) {
1594 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1595 CPU_FOREACH(cpu) {
1596 qemu_cpu_kick(cpu);
1597 }
1598 }
1599 }
1600
1601 void cpu_resume(CPUState *cpu)
1602 {
1603 cpu->stop = false;
1604 cpu->stopped = false;
1605 qemu_cpu_kick(cpu);
1606 }
1607
1608 void resume_all_vcpus(void)
1609 {
1610 CPUState *cpu;
1611
1612 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1613 CPU_FOREACH(cpu) {
1614 cpu_resume(cpu);
1615 }
1616 }
1617
1618 void cpu_remove(CPUState *cpu)
1619 {
1620 cpu->stop = true;
1621 cpu->unplug = true;
1622 qemu_cpu_kick(cpu);
1623 }
1624
1625 void cpu_remove_sync(CPUState *cpu)
1626 {
1627 cpu_remove(cpu);
1628 while (cpu->created) {
1629 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1630 }
1631 }
1632
1633 /* Size of the temporary buffers used when forming a vCPU thread name */
1634 #define VCPU_THREAD_NAME_SIZE 16
1635
1636 static void qemu_tcg_init_vcpu(CPUState *cpu)
1637 {
1638 char thread_name[VCPU_THREAD_NAME_SIZE];
1639 static QemuCond *single_tcg_halt_cond;
1640 static QemuThread *single_tcg_cpu_thread;
1641
1642 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1643 cpu->thread = g_malloc0(sizeof(QemuThread));
1644 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1645 qemu_cond_init(cpu->halt_cond);
1646
1647 if (qemu_tcg_mttcg_enabled()) {
1648 /* create a thread per vCPU with TCG (MTTCG) */
1649 parallel_cpus = true;
1650 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1651 cpu->cpu_index);
1652
1653 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1654 cpu, QEMU_THREAD_JOINABLE);
1655
1656 } else {
1657 /* share a single thread for all cpus with TCG */
1658 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1659 qemu_thread_create(cpu->thread, thread_name,
1660 qemu_tcg_rr_cpu_thread_fn,
1661 cpu, QEMU_THREAD_JOINABLE);
1662
1663 single_tcg_halt_cond = cpu->halt_cond;
1664 single_tcg_cpu_thread = cpu->thread;
1665 }
1666 #ifdef _WIN32
1667 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1668 #endif
1669 while (!cpu->created) {
1670 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1671 }
1672 } else {
1673 /* For non-MTTCG cases we share the thread */
1674 cpu->thread = single_tcg_cpu_thread;
1675 cpu->halt_cond = single_tcg_halt_cond;
1676 }
1677 }
1678
1679 static void qemu_hax_start_vcpu(CPUState *cpu)
1680 {
1681 char thread_name[VCPU_THREAD_NAME_SIZE];
1682
1683 cpu->thread = g_malloc0(sizeof(QemuThread));
1684 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1685 qemu_cond_init(cpu->halt_cond);
1686
1687 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1688 cpu->cpu_index);
1689 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1690 cpu, QEMU_THREAD_JOINABLE);
1691 #ifdef _WIN32
1692 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1693 #endif
1694 while (!cpu->created) {
1695 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1696 }
1697 }
1698
1699 static void qemu_kvm_start_vcpu(CPUState *cpu)
1700 {
1701 char thread_name[VCPU_THREAD_NAME_SIZE];
1702
1703 cpu->thread = g_malloc0(sizeof(QemuThread));
1704 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1705 qemu_cond_init(cpu->halt_cond);
1706 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1707 cpu->cpu_index);
1708 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1709 cpu, QEMU_THREAD_JOINABLE);
1710 while (!cpu->created) {
1711 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1712 }
1713 }
1714
1715 static void qemu_dummy_start_vcpu(CPUState *cpu)
1716 {
1717 char thread_name[VCPU_THREAD_NAME_SIZE];
1718
1719 cpu->thread = g_malloc0(sizeof(QemuThread));
1720 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1721 qemu_cond_init(cpu->halt_cond);
1722 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1723 cpu->cpu_index);
1724 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1725 QEMU_THREAD_JOINABLE);
1726 while (!cpu->created) {
1727 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1728 }
1729 }
1730
1731 void qemu_init_vcpu(CPUState *cpu)
1732 {
1733 cpu->nr_cores = smp_cores;
1734 cpu->nr_threads = smp_threads;
1735 cpu->stopped = true;
1736
1737 if (!cpu->as) {
1738 /* If the target cpu hasn't set up any address spaces itself,
1739 * give it the default one.
1740 */
1741 AddressSpace *as = address_space_init_shareable(cpu->memory,
1742 "cpu-memory");
1743 cpu->num_ases = 1;
1744 cpu_address_space_init(cpu, as, 0);
1745 }
1746
1747 if (kvm_enabled()) {
1748 qemu_kvm_start_vcpu(cpu);
1749 } else if (hax_enabled()) {
1750 qemu_hax_start_vcpu(cpu);
1751 } else if (tcg_enabled()) {
1752 qemu_tcg_init_vcpu(cpu);
1753 } else {
1754 qemu_dummy_start_vcpu(cpu);
1755 }
1756 }
1757
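/* Mark the calling vCPU as stopped (clearing any pending stop request),
 * request an exit from its execution loop and wake anyone waiting in
 * pause_all_vcpus().
 */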
1758 void cpu_stop_current(void)
1759 {
1760 if (current_cpu) {
1761 current_cpu->stop = false;
1762 current_cpu->stopped = true;
1763 cpu_exit(current_cpu);
1764 qemu_cond_broadcast(&qemu_pause_cond);
1765 }
1766 }
1767
1768 int vm_stop(RunState state)
1769 {
1770 if (qemu_in_vcpu_thread()) {
1771 qemu_system_vmstop_request_prepare();
1772 qemu_system_vmstop_request(state);
1773 /*
1774 * FIXME: should not return to device code in case
1775 * vm_stop() has been requested.
1776 */
1777 cpu_stop_current();
1778 return 0;
1779 }
1780
1781 return do_vm_stop(state);
1782 }
1783
1784 /**
1785 * Prepare for (re)starting the VM.
1786 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
1787 * running or in case of an error condition), 0 otherwise.
1788 */
1789 int vm_prepare_start(void)
1790 {
1791 RunState requested;
1792 int res = 0;
1793
1794 qemu_vmstop_requested(&requested);
1795 if (runstate_is_running() && requested == RUN_STATE__MAX) {
1796 return -1;
1797 }
1798
1799 /* Ensure that a STOP/RESUME pair of events is emitted if a
1800 * vmstop request was pending. The BLOCK_IO_ERROR event, for
1801      * example, is documented to always be followed by
1802 * the STOP event.
1803 */
1804 if (runstate_is_running()) {
1805 qapi_event_send_stop(&error_abort);
1806 res = -1;
1807 } else {
1808 replay_enable_events();
1809 cpu_enable_ticks();
1810 runstate_set(RUN_STATE_RUNNING);
1811 vm_state_notify(1, RUN_STATE_RUNNING);
1812 }
1813
1814     /* We are sending this now, but the CPUs will be resumed shortly afterwards */
1815 qapi_event_send_resume(&error_abort);
1816 return res;
1817 }
1818
1819 void vm_start(void)
1820 {
1821 if (!vm_prepare_start()) {
1822 resume_all_vcpus();
1823 }
1824 }
1825
1826 /* Does a state transition even if the VM is already stopped;
1827    the current state is forgotten forever. */
1828 int vm_stop_force_state(RunState state)
1829 {
1830 if (runstate_is_running()) {
1831 return vm_stop(state);
1832 } else {
1833 runstate_set(state);
1834
1835 bdrv_drain_all();
1836 /* Make sure to return an error if the flush in a previous vm_stop()
1837 * failed. */
1838 return bdrv_flush_all();
1839 }
1840 }
1841
1842 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1843 {
1844     /* XXX: implement xxx_cpu_list for targets that are still missing it */
1845 #if defined(cpu_list)
1846 cpu_list(f, cpu_fprintf);
1847 #endif
1848 }
1849
1850 CpuInfoList *qmp_query_cpus(Error **errp)
1851 {
1852 CpuInfoList *head = NULL, *cur_item = NULL;
1853 CPUState *cpu;
1854
1855 CPU_FOREACH(cpu) {
1856 CpuInfoList *info;
1857 #if defined(TARGET_I386)
1858 X86CPU *x86_cpu = X86_CPU(cpu);
1859 CPUX86State *env = &x86_cpu->env;
1860 #elif defined(TARGET_PPC)
1861 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1862 CPUPPCState *env = &ppc_cpu->env;
1863 #elif defined(TARGET_SPARC)
1864 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1865 CPUSPARCState *env = &sparc_cpu->env;
1866 #elif defined(TARGET_MIPS)
1867 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1868 CPUMIPSState *env = &mips_cpu->env;
1869 #elif defined(TARGET_TRICORE)
1870 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1871 CPUTriCoreState *env = &tricore_cpu->env;
1872 #endif
1873
1874 cpu_synchronize_state(cpu);
1875
1876 info = g_malloc0(sizeof(*info));
1877 info->value = g_malloc0(sizeof(*info->value));
1878 info->value->CPU = cpu->cpu_index;
1879 info->value->current = (cpu == first_cpu);
1880 info->value->halted = cpu->halted;
1881 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1882 info->value->thread_id = cpu->thread_id;
1883 #if defined(TARGET_I386)
1884 info->value->arch = CPU_INFO_ARCH_X86;
1885 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1886 #elif defined(TARGET_PPC)
1887 info->value->arch = CPU_INFO_ARCH_PPC;
1888 info->value->u.ppc.nip = env->nip;
1889 #elif defined(TARGET_SPARC)
1890 info->value->arch = CPU_INFO_ARCH_SPARC;
1891 info->value->u.q_sparc.pc = env->pc;
1892 info->value->u.q_sparc.npc = env->npc;
1893 #elif defined(TARGET_MIPS)
1894 info->value->arch = CPU_INFO_ARCH_MIPS;
1895 info->value->u.q_mips.PC = env->active_tc.PC;
1896 #elif defined(TARGET_TRICORE)
1897 info->value->arch = CPU_INFO_ARCH_TRICORE;
1898 info->value->u.tricore.PC = env->PC;
1899 #else
1900 info->value->arch = CPU_INFO_ARCH_OTHER;
1901 #endif
1902
1903 /* XXX: waiting for the qapi to support GSList */
1904 if (!cur_item) {
1905 head = cur_item = info;
1906 } else {
1907 cur_item->next = info;
1908 cur_item = info;
1909 }
1910 }
1911
1912 return head;
1913 }
1914
1915 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1916 bool has_cpu, int64_t cpu_index, Error **errp)
1917 {
1918 FILE *f;
1919 uint32_t l;
1920 CPUState *cpu;
1921 uint8_t buf[1024];
1922 int64_t orig_addr = addr, orig_size = size;
1923
1924 if (!has_cpu) {
1925 cpu_index = 0;
1926 }
1927
1928 cpu = qemu_get_cpu(cpu_index);
1929 if (cpu == NULL) {
1930 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1931 "a CPU number");
1932 return;
1933 }
1934
1935 f = fopen(filename, "wb");
1936 if (!f) {
1937 error_setg_file_open(errp, errno, filename);
1938 return;
1939 }
1940
1941 while (size != 0) {
1942 l = sizeof(buf);
1943 if (l > size)
1944 l = size;
1945 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1946 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1947 " specified", orig_addr, orig_size);
1948 goto exit;
1949 }
1950 if (fwrite(buf, 1, l, f) != l) {
1951 error_setg(errp, QERR_IO_ERROR);
1952 goto exit;
1953 }
1954 addr += l;
1955 size -= l;
1956 }
1957
1958 exit:
1959 fclose(f);
1960 }
1961
1962 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1963 Error **errp)
1964 {
1965 FILE *f;
1966 uint32_t l;
1967 uint8_t buf[1024];
1968
1969 f = fopen(filename, "wb");
1970 if (!f) {
1971 error_setg_file_open(errp, errno, filename);
1972 return;
1973 }
1974
1975 while (size != 0) {
1976 l = sizeof(buf);
1977 if (l > size)
1978 l = size;
1979 cpu_physical_memory_read(addr, buf, l);
1980 if (fwrite(buf, 1, l, f) != l) {
1981 error_setg(errp, QERR_IO_ERROR);
1982 goto exit;
1983 }
1984 addr += l;
1985 size -= l;
1986 }
1987
1988 exit:
1989 fclose(f);
1990 }
1991
1992 void qmp_inject_nmi(Error **errp)
1993 {
1994 nmi_monitor_handle(monitor_get_cpu_index(), errp);
1995 }
1996
1997 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1998 {
1999 if (!use_icount) {
2000 return;
2001 }
2002
2003 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
2004 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2005 if (icount_align_option) {
2006 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
2007 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
2008 } else {
2009 cpu_fprintf(f, "Max guest delay NA\n");
2010 cpu_fprintf(f, "Max guest advance NA\n");
2011 }
2012 }