]> git.proxmox.com Git - mirror_qemu.git/blob - cpus.c
Merge remote-tracking branch 'remotes/maxreitz/tags/pull-block-2017-04-03' into staging
[mirror_qemu.git] / cpus.c
1 /*
2 * QEMU System Emulator
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 /* Needed early for CONFIG_BSD etc. */
26 #include "qemu/osdep.h"
27 #include "qemu-common.h"
28 #include "qemu/config-file.h"
29 #include "cpu.h"
30 #include "monitor/monitor.h"
31 #include "qapi/qmp/qerror.h"
32 #include "qemu/error-report.h"
33 #include "sysemu/sysemu.h"
34 #include "sysemu/block-backend.h"
35 #include "exec/gdbstub.h"
36 #include "sysemu/dma.h"
37 #include "sysemu/hw_accel.h"
38 #include "sysemu/kvm.h"
39 #include "sysemu/hax.h"
40 #include "qmp-commands.h"
41 #include "exec/exec-all.h"
42
43 #include "qemu/thread.h"
44 #include "sysemu/cpus.h"
45 #include "sysemu/qtest.h"
46 #include "qemu/main-loop.h"
47 #include "qemu/bitmap.h"
48 #include "qemu/seqlock.h"
49 #include "tcg.h"
50 #include "qapi-event.h"
51 #include "hw/nmi.h"
52 #include "sysemu/replay.h"
53
54 #ifdef CONFIG_LINUX
55
56 #include <sys/prctl.h>
57
58 #ifndef PR_MCE_KILL
59 #define PR_MCE_KILL 33
60 #endif
61
62 #ifndef PR_MCE_KILL_SET
63 #define PR_MCE_KILL_SET 1
64 #endif
65
66 #ifndef PR_MCE_KILL_EARLY
67 #define PR_MCE_KILL_EARLY 1
68 #endif
69
70 #endif /* CONFIG_LINUX */
71
72 int64_t max_delay;
73 int64_t max_advance;
74
75 /* vcpu throttling controls */
/* Timer created in cpu_ticks_init(); its callback is cpu_throttle_timer_tick(). */
76 static QEMUTimer *throttle_timer;
/* Accessed with atomic_set()/atomic_read(); 0 means throttling is inactive
 * (see cpu_throttle_active() and cpu_throttle_stop()). */
77 static unsigned int throttle_percentage;
78
/* cpu_throttle_set() clamps the requested percentage into [MIN, MAX]. */
79 #define CPU_THROTTLE_PCT_MIN 1
80 #define CPU_THROTTLE_PCT_MAX 99
/* Base timeslice in nanoseconds (10000000 ns = 10 ms). */
81 #define CPU_THROTTLE_TIMESLICE_NS 10000000
82
83 bool cpu_is_stopped(CPUState *cpu)
84 {
85 return cpu->stopped || !runstate_is_running();
86 }
87
88 static bool cpu_thread_is_idle(CPUState *cpu)
89 {
90 if (cpu->stop || cpu->queued_work_first) {
91 return false;
92 }
93 if (cpu_is_stopped(cpu)) {
94 return true;
95 }
96 if (!cpu->halted || cpu_has_work(cpu) ||
97 kvm_halt_in_kernel()) {
98 return false;
99 }
100 return true;
101 }
102
103 static bool all_cpu_threads_idle(void)
104 {
105 CPUState *cpu;
106
107 CPU_FOREACH(cpu) {
108 if (!cpu_thread_is_idle(cpu)) {
109 return false;
110 }
111 }
112 return true;
113 }
114
115 /***********************************************************/
116 /* guest cycle counter */
117
118 /* Protected by TimersState seqlock */
119
/* Set from the "sleep" option in configure_icount(); defaults to true. */
120 static bool icount_sleep = true;
/* -1 means no warp is pending: icount_warp_rt() returns early on -1 and
 * resets the variable to -1 when it is done. */
121 static int64_t vm_clock_warp_start = -1;
122 /* Conversion factor from emulated instructions to virtual clock ticks. */
123 static int icount_time_shift;
124 /* Arbitrarily pick 1MIPS as the minimum allowable speed. */
125 #define MAX_ICOUNT_SHIFT 10
126
/* icount_rt_timer and icount_vm_timer are created in configure_icount() for
 * shift=auto and re-armed by icount_adjust_rt()/icount_adjust_vm();
 * icount_warp_timer is created there only when icount_sleep is true. */
127 static QEMUTimer *icount_rt_timer;
128 static QEMUTimer *icount_vm_timer;
129 static QEMUTimer *icount_warp_timer;
130
/* Bookkeeping for the guest-visible clocks: host-tick and monotonic-clock
 * offsets, plus the icount instruction counter and its bias. */
131 typedef struct TimersState {
132 /* Protected by BQL. */
133 int64_t cpu_ticks_prev;
134 int64_t cpu_ticks_offset;
135
136 /* cpu_clock_offset can be read out of BQL, so protect it with
137 * this lock.
138 */
139 QemuSeqLock vm_clock_seqlock;
140 int64_t cpu_clock_offset;
/* Non-zero while ticks are enabled; toggled by cpu_enable_ticks() and
 * cpu_disable_ticks(). */
141 int32_t cpu_ticks_enabled;
/* Serialized by vmstate_timers between cpu_ticks_offset and
 * cpu_clock_offset; not otherwise referenced in the visible code. */
142 int64_t dummy;
143
144 /* Compensate for varying guest execution speed. */
145 int64_t qemu_icount_bias;
146 /* Only written by TCG thread */
147 int64_t qemu_icount;
148 } TimersState;
149
150 static TimersState timers_state;
151 bool mttcg_enabled;
152
153 /*
154 * We default to false if we know other options have been enabled
155 * which are currently incompatible with MTTCG. Otherwise when each
156 * guest (target) has been updated to support:
157 * - atomic instructions
158 * - memory ordering primitives (barriers)
159 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
160 *
161 * Once a guest architecture has been converted to the new primitives
162 * there are two remaining limitations to check.
163 *
164 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
165 * - The host must have a stronger memory order than the guest
166 *
167 * It may be possible in future to support strong guests on weak hosts
168 * but that will require tagging all load/stores in a guest with their
169 * implicit memory order requirements which would likely slow things
170 * down a lot.
171 */
172
/* Check that the guest needs no memory-ordering bit the host lacks.
 * Reports false when either default-ordering macro is undefined.
 */
static bool check_tcg_memory_orders_compatible(void)
{
    bool compatible = false;

#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
    compatible = (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
#endif
    return compatible;
}
181
182 static bool default_mttcg_enabled(void)
183 {
184 if (use_icount || TCG_OVERSIZED_GUEST) {
185 return false;
186 } else {
187 #ifdef TARGET_SUPPORTS_MTTCG
188 return check_tcg_memory_orders_compatible();
189 #else
190 return false;
191 #endif
192 }
193 }
194
195 void qemu_tcg_configure(QemuOpts *opts, Error **errp)
196 {
197 const char *t = qemu_opt_get(opts, "thread");
198 if (t) {
199 if (strcmp(t, "multi") == 0) {
200 if (TCG_OVERSIZED_GUEST) {
201 error_setg(errp, "No MTTCG when guest word size > hosts");
202 } else if (use_icount) {
203 error_setg(errp, "No MTTCG when icount is enabled");
204 } else {
205 #ifndef TARGET_SUPPORT_MTTCG
206 error_report("Guest not yet converted to MTTCG - "
207 "you may get unexpected results");
208 #endif
209 if (!check_tcg_memory_orders_compatible()) {
210 error_report("Guest expects a stronger memory ordering "
211 "than the host provides");
212 error_printf("This may cause strange/hard to debug errors\n");
213 }
214 mttcg_enabled = true;
215 }
216 } else if (strcmp(t, "single") == 0) {
217 mttcg_enabled = false;
218 } else {
219 error_setg(errp, "Invalid 'thread' setting %s", t);
220 }
221 } else {
222 mttcg_enabled = default_mttcg_enabled();
223 }
224 }
225
226 int64_t cpu_get_icount_raw(void)
227 {
228 int64_t icount;
229 CPUState *cpu = current_cpu;
230
231 icount = timers_state.qemu_icount;
232 if (cpu) {
233 if (!cpu->can_do_io) {
234 fprintf(stderr, "Bad icount read\n");
235 exit(1);
236 }
237 icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
238 }
239 return icount;
240 }
241
242 /* Return the virtual CPU time, based on the instruction counter. */
243 static int64_t cpu_get_icount_locked(void)
244 {
245 int64_t icount = cpu_get_icount_raw();
246 return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
247 }
248
249 int64_t cpu_get_icount(void)
250 {
251 int64_t icount;
252 unsigned start;
253
254 do {
255 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
256 icount = cpu_get_icount_locked();
257 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
258
259 return icount;
260 }
261
262 int64_t cpu_icount_to_ns(int64_t icount)
263 {
264 return icount << icount_time_shift;
265 }
266
267 /* return the time elapsed in VM between vm_start and vm_stop. Unless
268 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
269 * counter.
270 *
271 * Caller must hold the BQL
272 */
273 int64_t cpu_get_ticks(void)
274 {
275 int64_t ticks;
276
277 if (use_icount) {
278 return cpu_get_icount();
279 }
280
281 ticks = timers_state.cpu_ticks_offset;
282 if (timers_state.cpu_ticks_enabled) {
283 ticks += cpu_get_host_ticks();
284 }
285
286 if (timers_state.cpu_ticks_prev > ticks) {
287 /* Note: non increasing ticks may happen if the host uses
288 software suspend */
289 timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
290 ticks = timers_state.cpu_ticks_prev;
291 }
292
293 timers_state.cpu_ticks_prev = ticks;
294 return ticks;
295 }
296
297 static int64_t cpu_get_clock_locked(void)
298 {
299 int64_t time;
300
301 time = timers_state.cpu_clock_offset;
302 if (timers_state.cpu_ticks_enabled) {
303 time += get_clock();
304 }
305
306 return time;
307 }
308
309 /* Return the monotonic time elapsed in VM, i.e.,
310 * the time between vm_start and vm_stop
311 */
312 int64_t cpu_get_clock(void)
313 {
314 int64_t ti;
315 unsigned start;
316
317 do {
318 start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
319 ti = cpu_get_clock_locked();
320 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
321
322 return ti;
323 }
324
325 /* enable cpu_get_ticks()
326 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
327 */
328 void cpu_enable_ticks(void)
329 {
330 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
331 seqlock_write_begin(&timers_state.vm_clock_seqlock);
332 if (!timers_state.cpu_ticks_enabled) {
333 timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
334 timers_state.cpu_clock_offset -= get_clock();
335 timers_state.cpu_ticks_enabled = 1;
336 }
337 seqlock_write_end(&timers_state.vm_clock_seqlock);
338 }
339
340 /* disable cpu_get_ticks() : the clock is stopped. You must not call
341 * cpu_get_ticks() after that.
342 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
343 */
344 void cpu_disable_ticks(void)
345 {
346 /* Here, the really thing protected by seqlock is cpu_clock_offset. */
347 seqlock_write_begin(&timers_state.vm_clock_seqlock);
348 if (timers_state.cpu_ticks_enabled) {
349 timers_state.cpu_ticks_offset += cpu_get_host_ticks();
350 timers_state.cpu_clock_offset = cpu_get_clock_locked();
351 timers_state.cpu_ticks_enabled = 0;
352 }
353 seqlock_write_end(&timers_state.vm_clock_seqlock);
354 }
355
356 /* Correlation between real and virtual time is always going to be
357 fairly approximate, so ignore small variation.
358 When the guest is idle real and virtual time will be aligned in
359 the IO wait loop. */
360 #define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
361
/* Adaptive icount: compare icount-derived time with the VM clock and nudge
 * icount_time_shift by at most one step in either direction, then recompute
 * qemu_icount_bias.  Does nothing while the VM is not running.
 */
362 static void icount_adjust(void)
363 {
364 int64_t cur_time;
365 int64_t cur_icount;
366 int64_t delta;
367
/* last_delta is only read and written inside the seqlock write section below. */
368 /* Protected by TimersState mutex. */
369 static int64_t last_delta;
370
371 /* If the VM is not running, then do nothing. */
372 if (!runstate_is_running()) {
373 return;
374 }
375
376 seqlock_write_begin(&timers_state.vm_clock_seqlock);
377 cur_time = cpu_get_clock_locked();
378 cur_icount = cpu_get_icount_locked();
379
/* Positive delta: icount time is ahead of the VM clock; negative: behind. */
380 delta = cur_icount - cur_time;
381 /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
382 if (delta > 0
383 && last_delta + ICOUNT_WOBBLE < delta * 2
384 && icount_time_shift > 0) {
385 /* The guest is getting too far ahead. Slow time down. */
386 icount_time_shift--;
387 }
388 if (delta < 0
389 && last_delta - ICOUNT_WOBBLE > delta * 2
390 && icount_time_shift < MAX_ICOUNT_SHIFT) {
391 /* The guest is getting too far behind. Speed time up. */
392 icount_time_shift++;
393 }
394 last_delta = delta;
/* Re-derive the bias from cur_icount using the (possibly changed) shift. */
395 timers_state.qemu_icount_bias = cur_icount
396 - (timers_state.qemu_icount << icount_time_shift);
397 seqlock_write_end(&timers_state.vm_clock_seqlock);
398 }
399
400 static void icount_adjust_rt(void *opaque)
401 {
402 timer_mod(icount_rt_timer,
403 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
404 icount_adjust();
405 }
406
407 static void icount_adjust_vm(void *opaque)
408 {
409 timer_mod(icount_vm_timer,
410 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
411 NANOSECONDS_PER_SECOND / 10);
412 icount_adjust();
413 }
414
415 static int64_t qemu_icount_round(int64_t count)
416 {
417 return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
418 }
419
/* Account for a pending clock warp: if vm_clock_warp_start is set, add the
 * time elapsed since then to qemu_icount_bias (only while the VM runs),
 * clear the warp start, and notify QEMU_CLOCK_VIRTUAL if a timer expired.
 */
420 static void icount_warp_rt(void)
421 {
422 unsigned seq;
423 int64_t warp_start;
424
425 /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
426 * changes from -1 to another value, so the race here is okay.
427 */
428 do {
429 seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
430 warp_start = vm_clock_warp_start;
431 } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
432
/* -1 means no warp is pending. */
433 if (warp_start == -1) {
434 return;
435 }
436
437 seqlock_write_begin(&timers_state.vm_clock_seqlock);
438 if (runstate_is_running()) {
439 int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
440 cpu_get_clock_locked());
441 int64_t warp_delta;
442
443 warp_delta = clock - vm_clock_warp_start;
/* use_icount == 2 is the shift=auto mode set up by configure_icount(). */
444 if (use_icount == 2) {
445 /*
446 * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
447 * far ahead of real time.
448 */
449 int64_t cur_icount = cpu_get_icount_locked();
450 int64_t delta = clock - cur_icount;
451 warp_delta = MIN(warp_delta, delta);
452 }
453 timers_state.qemu_icount_bias += warp_delta;
454 }
/* The warp is consumed even when the VM was not running. */
455 vm_clock_warp_start = -1;
456 seqlock_write_end(&timers_state.vm_clock_seqlock);
457
458 if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
459 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
460 }
461 }
462
/* Callback of icount_warp_timer: simply accounts the pending warp. */
static void icount_timer_cb(void *opaque)
{
    /* The timer already synchronizes with CHECKPOINT_CLOCK_VIRTUAL_RT,
     * so no replay checkpoint is needed here.
     */
    icount_warp_rt();
}
470
471 void qtest_clock_warp(int64_t dest)
472 {
473 int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
474 AioContext *aio_context;
475 assert(qtest_enabled());
476 aio_context = qemu_get_aio_context();
477 while (clock < dest) {
478 int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
479 int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
480
481 seqlock_write_begin(&timers_state.vm_clock_seqlock);
482 timers_state.qemu_icount_bias += warp;
483 seqlock_write_end(&timers_state.vm_clock_seqlock);
484
485 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
486 timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
487 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
488 }
489 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
490 }
491
/* In icount mode, when every vCPU thread is idle, make sure
 * QEMU_CLOCK_VIRTUAL still reaches its next deadline: either jump the icount
 * bias straight to the deadline (sleep disabled) or record the warp start
 * and arm icount_warp_timer (sleep enabled).  No-op without icount, while
 * the VM is stopped, when the replay checkpoint declines, or under qtest.
 */
492 void qemu_start_warp_timer(void)
493 {
494 int64_t clock;
495 int64_t deadline;
496
497 if (!use_icount) {
498 return;
499 }
500
501 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
502 * do not fire, so computing the deadline does not make sense.
503 */
504 if (!runstate_is_running()) {
505 return;
506 }
507
508 /* warp clock deterministically in record/replay mode */
509 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
510 return;
511 }
512
513 if (!all_cpu_threads_idle()) {
514 return;
515 }
516
517 if (qtest_enabled()) {
518 /* When testing, qtest commands advance icount. */
519 return;
520 }
521
522 /* We want to use the earliest deadline from ALL vm_clocks */
523 clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
524 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
/* A negative deadline is treated as "no active timer"; warn once if sleep is off. */
525 if (deadline < 0) {
526 static bool notified;
527 if (!icount_sleep && !notified) {
528 error_report("WARNING: icount sleep disabled and no active timers");
529 notified = true;
530 }
531 return;
532 }
533
534 if (deadline > 0) {
535 /*
536 * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
537 * sleep. Otherwise, the CPU might be waiting for a future timer
538 * interrupt to wake it up, but the interrupt never comes because
539 * the vCPU isn't running any insns and thus doesn't advance the
540 * QEMU_CLOCK_VIRTUAL.
541 */
542 if (!icount_sleep) {
543 /*
544 * We never let VCPUs sleep in no sleep icount mode.
545 * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
546 * to the next QEMU_CLOCK_VIRTUAL event and notify it.
547 * It is useful when we want a deterministic execution time,
548 * isolated from host latencies.
549 */
550 seqlock_write_begin(&timers_state.vm_clock_seqlock);
551 timers_state.qemu_icount_bias += deadline;
552 seqlock_write_end(&timers_state.vm_clock_seqlock);
553 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
554 } else {
555 /*
556 * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
557 * "real" time, (related to the time left until the next event) has
558 * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
559 * This avoids that the warps are visible externally; for example,
560 * you will not be sending network packets continuously instead of
561 * every 100ms.
562 */
563 seqlock_write_begin(&timers_state.vm_clock_seqlock);
/* Keep the earliest warp start if one is already recorded. */
564 if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
565 vm_clock_warp_start = clock;
566 }
567 seqlock_write_end(&timers_state.vm_clock_seqlock);
568 timer_mod_anticipate(icount_warp_timer, clock + deadline);
569 }
570 } else if (deadline == 0) {
/* A timer is already due: just notify the clock. */
571 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
572 }
573 }
574
575 static void qemu_account_warp_timer(void)
576 {
577 if (!use_icount || !icount_sleep) {
578 return;
579 }
580
581 /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
582 * do not fire, so computing the deadline does not make sense.
583 */
584 if (!runstate_is_running()) {
585 return;
586 }
587
588 /* warp clock deterministically in record/replay mode */
589 if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
590 return;
591 }
592
593 timer_del(icount_warp_timer);
594 icount_warp_rt();
595 }
596
597 static bool icount_state_needed(void *opaque)
598 {
599 return use_icount;
600 }
601
602 /*
603 * This is a subsection for icount migration.
604 */
/* Carries qemu_icount_bias and qemu_icount from TimersState; whether it is
 * needed is decided by icount_state_needed(), i.e. by use_icount. */
605 static const VMStateDescription icount_vmstate_timers = {
606 .name = "timer/icount",
607 .version_id = 1,
608 .minimum_version_id = 1,
609 .needed = icount_state_needed,
610 .fields = (VMStateField[]) {
611 VMSTATE_INT64(qemu_icount_bias, TimersState),
612 VMSTATE_INT64(qemu_icount, TimersState),
613 VMSTATE_END_OF_LIST()
614 }
615 };
616
/* VMState description for TimersState, registered in cpu_ticks_init().
 * cpu_clock_offset is declared with version 2 (VMSTATE_INT64_V); the icount
 * fields travel in the "timer/icount" subsection. */
617 static const VMStateDescription vmstate_timers = {
618 .name = "timer",
619 .version_id = 2,
620 .minimum_version_id = 1,
621 .fields = (VMStateField[]) {
622 VMSTATE_INT64(cpu_ticks_offset, TimersState),
623 VMSTATE_INT64(dummy, TimersState),
624 VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
625 VMSTATE_END_OF_LIST()
626 },
627 .subsections = (const VMStateDescription*[]) {
628 &icount_vmstate_timers,
629 NULL
630 }
631 };
632
633 static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
634 {
635 double pct;
636 double throttle_ratio;
637 long sleeptime_ns;
638
639 if (!cpu_throttle_get_percentage()) {
640 return;
641 }
642
643 pct = (double)cpu_throttle_get_percentage()/100;
644 throttle_ratio = pct / (1 - pct);
645 sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
646
647 qemu_mutex_unlock_iothread();
648 atomic_set(&cpu->throttle_thread_scheduled, 0);
649 g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
650 qemu_mutex_lock_iothread();
651 }
652
653 static void cpu_throttle_timer_tick(void *opaque)
654 {
655 CPUState *cpu;
656 double pct;
657
658 /* Stop the timer if needed */
659 if (!cpu_throttle_get_percentage()) {
660 return;
661 }
662 CPU_FOREACH(cpu) {
663 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
664 async_run_on_cpu(cpu, cpu_throttle_thread,
665 RUN_ON_CPU_NULL);
666 }
667 }
668
669 pct = (double)cpu_throttle_get_percentage()/100;
670 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
671 CPU_THROTTLE_TIMESLICE_NS / (1-pct));
672 }
673
674 void cpu_throttle_set(int new_throttle_pct)
675 {
676 /* Ensure throttle percentage is within valid range */
677 new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
678 new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
679
680 atomic_set(&throttle_percentage, new_throttle_pct);
681
682 timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
683 CPU_THROTTLE_TIMESLICE_NS);
684 }
685
686 void cpu_throttle_stop(void)
687 {
688 atomic_set(&throttle_percentage, 0);
689 }
690
/* True while a non-zero throttle percentage is set. */
bool cpu_throttle_active(void)
{
    int pct = cpu_throttle_get_percentage();

    return pct != 0;
}
695
696 int cpu_throttle_get_percentage(void)
697 {
698 return atomic_read(&throttle_percentage);
699 }
700
701 void cpu_ticks_init(void)
702 {
703 seqlock_init(&timers_state.vm_clock_seqlock);
704 vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
705 throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
706 cpu_throttle_timer_tick, NULL);
707 }
708
/* Configure instruction counting from @opts ("shift", "align", "sleep").
 *
 * Without a "shift" option nothing is enabled (and "align" alone is an
 * error).  A numeric shift sets use_icount = 1 with that fixed shift;
 * shift=auto sets use_icount = 2, starts from a shift of 3 and arms the
 * realtime and virtual-time adjustment timers.
 *
 * NOTE(review): most error_setg() calls below are not followed by a return,
 * so execution continues after an error is set; presumably callers pass an
 * errp that aborts or exits on the first error - confirm.
 */
709 void configure_icount(QemuOpts *opts, Error **errp)
710 {
711 const char *option;
712 char *rem_str = NULL;
713
714 option = qemu_opt_get(opts, "shift");
715 if (!option) {
716 if (qemu_opt_get(opts, "align") != NULL) {
717 error_setg(errp, "Please specify shift option when using align");
718 }
719 return;
720 }
721
722 icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
/* The warp timer is only needed when vCPUs are allowed to sleep. */
723 if (icount_sleep) {
724 icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
725 icount_timer_cb, NULL);
726 }
727
728 icount_align_option = qemu_opt_get_bool(opts, "align", false);
729
730 if (icount_align_option && !icount_sleep) {
731 error_setg(errp, "align=on and sleep=off are incompatible");
732 }
/* Fixed shift: parse it and finish; no adjustment timers are created. */
733 if (strcmp(option, "auto") != 0) {
734 errno = 0;
/* NOTE(review): the parsed value is stored without a range check. */
735 icount_time_shift = strtol(option, &rem_str, 0);
736 if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
737 error_setg(errp, "icount: Invalid shift value");
738 }
739 use_icount = 1;
740 return;
741 } else if (icount_align_option) {
742 error_setg(errp, "shift=auto and align=on are incompatible");
743 } else if (!icount_sleep) {
744 error_setg(errp, "shift=auto and sleep=off are incompatible");
745 }
746
747 use_icount = 2;
748
749 /* 125MIPS seems a reasonable initial guess at the guest speed.
750 It will be corrected fairly quickly anyway. */
751 icount_time_shift = 3;
752
753 /* Have both realtime and virtual time triggers for speed adjustment.
754 The realtime trigger catches emulated time passing too slowly,
755 the virtual time trigger catches emulated time passing too fast.
756 Realtime triggers occur even when idle, so use them less frequently
757 than VM triggers. */
758 icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
759 icount_adjust_rt, NULL);
760 timer_mod(icount_rt_timer,
761 qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
762 icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
763 icount_adjust_vm, NULL);
764 timer_mod(icount_vm_timer,
765 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
766 NANOSECONDS_PER_SECOND / 10);
767 }
768
769 /***********************************************************/
770 /* TCG vCPU kick timer
771 *
772 * The kick timer is responsible for moving single threaded vCPU
773 * emulation on to the next vCPU. If more than one vCPU is running a
774 * timer event will force a cpu->exit so the next vCPU can get
775 * scheduled.
776 *
777 * The timer is removed if all vCPUs are idle and restarted again once
778 * idleness is complete.
779 */
780
/* Created by start_tcg_kick_timer(); NULL while no kick timer exists. */
781 static QEMUTimer *tcg_kick_vcpu_timer;
/* Read with atomic_mb_read() by qemu_cpu_kick_rr_cpu(); may be NULL. */
782 static CPUState *tcg_current_rr_cpu;
783
/* Interval between kicks: one tenth of a second, in nanoseconds. */
784 #define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
785
786 static inline int64_t qemu_tcg_next_kick(void)
787 {
788 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
789 }
790
791 /* Kick the currently round-robin scheduled vCPU */
792 static void qemu_cpu_kick_rr_cpu(void)
793 {
794 CPUState *cpu;
795 do {
796 cpu = atomic_mb_read(&tcg_current_rr_cpu);
797 if (cpu) {
798 cpu_exit(cpu);
799 }
800 } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
801 }
802
803 static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
804 {
805 }
806
807 void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
808 {
809 if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
810 qemu_notify_event();
811 return;
812 }
813
814 if (!qemu_in_vcpu_thread() && first_cpu) {
815 /* qemu_cpu_kick is not enough to kick a halted CPU out of
816 * qemu_tcg_wait_io_event. async_run_on_cpu, instead,
817 * causes cpu_thread_is_idle to return false. This way,
818 * handle_icount_deadline can run.
819 */
820 async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
821 }
822 }
823
824 static void kick_tcg_thread(void *opaque)
825 {
826 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
827 qemu_cpu_kick_rr_cpu();
828 }
829
830 static void start_tcg_kick_timer(void)
831 {
832 if (!mttcg_enabled && !tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
833 tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
834 kick_tcg_thread, NULL);
835 timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
836 }
837 }
838
839 static void stop_tcg_kick_timer(void)
840 {
841 if (tcg_kick_vcpu_timer) {
842 timer_del(tcg_kick_vcpu_timer);
843 tcg_kick_vcpu_timer = NULL;
844 }
845 }
846
847 /***********************************************************/
848 void hw_error(const char *fmt, ...)
849 {
850 va_list ap;
851 CPUState *cpu;
852
853 va_start(ap, fmt);
854 fprintf(stderr, "qemu: hardware error: ");
855 vfprintf(stderr, fmt, ap);
856 fprintf(stderr, "\n");
857 CPU_FOREACH(cpu) {
858 fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
859 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
860 }
861 va_end(ap);
862 abort();
863 }
864
865 void cpu_synchronize_all_states(void)
866 {
867 CPUState *cpu;
868
869 CPU_FOREACH(cpu) {
870 cpu_synchronize_state(cpu);
871 }
872 }
873
874 void cpu_synchronize_all_post_reset(void)
875 {
876 CPUState *cpu;
877
878 CPU_FOREACH(cpu) {
879 cpu_synchronize_post_reset(cpu);
880 }
881 }
882
883 void cpu_synchronize_all_post_init(void)
884 {
885 CPUState *cpu;
886
887 CPU_FOREACH(cpu) {
888 cpu_synchronize_post_init(cpu);
889 }
890 }
891
892 static int do_vm_stop(RunState state)
893 {
894 int ret = 0;
895
896 if (runstate_is_running()) {
897 cpu_disable_ticks();
898 pause_all_vcpus();
899 runstate_set(state);
900 vm_state_notify(0, state);
901 qapi_event_send_stop(&error_abort);
902 }
903
904 bdrv_drain_all();
905 replay_disable_events();
906 ret = bdrv_flush_all();
907
908 return ret;
909 }
910
911 static bool cpu_can_run(CPUState *cpu)
912 {
913 if (cpu->stop) {
914 return false;
915 }
916 if (cpu_is_stopped(cpu)) {
917 return false;
918 }
919 return true;
920 }
921
922 static void cpu_handle_guest_debug(CPUState *cpu)
923 {
924 gdb_set_stop_cpu(cpu);
925 qemu_system_debug_request();
926 cpu->stopped = true;
927 }
928
929 #ifdef CONFIG_LINUX
930 static void sigbus_reraise(void)
931 {
932 sigset_t set;
933 struct sigaction action;
934
935 memset(&action, 0, sizeof(action));
936 action.sa_handler = SIG_DFL;
937 if (!sigaction(SIGBUS, &action, NULL)) {
938 raise(SIGBUS);
939 sigemptyset(&set);
940 sigaddset(&set, SIGBUS);
941 pthread_sigmask(SIG_UNBLOCK, &set, NULL);
942 }
943 perror("Failed to re-raise SIGBUS!\n");
944 abort();
945 }
946
947 static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
948 {
949 if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
950 sigbus_reraise();
951 }
952
953 if (current_cpu) {
954 /* Called asynchronously in VCPU thread. */
955 if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
956 sigbus_reraise();
957 }
958 } else {
959 /* Called synchronously (via signalfd) in main thread. */
960 if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
961 sigbus_reraise();
962 }
963 }
964 }
965
966 static void qemu_init_sigbus(void)
967 {
968 struct sigaction action;
969
970 memset(&action, 0, sizeof(action));
971 action.sa_flags = SA_SIGINFO;
972 action.sa_sigaction = sigbus_handler;
973 sigaction(SIGBUS, &action, NULL);
974
975 prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
976 }
977 #else /* !CONFIG_LINUX */
/* Non-Linux build: no SIGBUS handling is installed. */
static void qemu_init_sigbus(void)
{
    /* nothing to do */
}
981 #endif /* !CONFIG_LINUX */
982
/* Mutex passed to qemu_cond_wait() by the vCPU wait loops and to
 * do_run_on_cpu() by run_on_cpu(). */
983 static QemuMutex qemu_global_mutex;
984
/* Set to the thread that calls qemu_init_cpu_loop(). */
985 static QemuThread io_thread;
986
987 /* cpu creation */
988 static QemuCond qemu_cpu_cond;
/* Also broadcast by qemu_wait_io_event_common() when a vCPU turns a stop
 * request into the stopped state. */
989 /* system init */
990 static QemuCond qemu_pause_cond;
991
992 void qemu_init_cpu_loop(void)
993 {
994 qemu_init_sigbus();
995 qemu_cond_init(&qemu_cpu_cond);
996 qemu_cond_init(&qemu_pause_cond);
997 qemu_mutex_init(&qemu_global_mutex);
998
999 qemu_thread_get_self(&io_thread);
1000 }
1001
1002 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1003 {
1004 do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1005 }
1006
1007 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1008 {
1009 if (kvm_destroy_vcpu(cpu) < 0) {
1010 error_report("kvm_destroy_vcpu failed");
1011 exit(EXIT_FAILURE);
1012 }
1013 }
1014
1015 static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1016 {
1017 }
1018
1019 static void qemu_wait_io_event_common(CPUState *cpu)
1020 {
1021 atomic_mb_set(&cpu->thread_kicked, false);
1022 if (cpu->stop) {
1023 cpu->stop = false;
1024 cpu->stopped = true;
1025 qemu_cond_broadcast(&qemu_pause_cond);
1026 }
1027 process_queued_cpu_work(cpu);
1028 }
1029
1030 static bool qemu_tcg_should_sleep(CPUState *cpu)
1031 {
1032 if (mttcg_enabled) {
1033 return cpu_thread_is_idle(cpu);
1034 } else {
1035 return all_cpu_threads_idle();
1036 }
1037 }
1038
1039 static void qemu_tcg_wait_io_event(CPUState *cpu)
1040 {
1041 while (qemu_tcg_should_sleep(cpu)) {
1042 stop_tcg_kick_timer();
1043 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1044 }
1045
1046 start_tcg_kick_timer();
1047
1048 qemu_wait_io_event_common(cpu);
1049 }
1050
1051 static void qemu_kvm_wait_io_event(CPUState *cpu)
1052 {
1053 while (cpu_thread_is_idle(cpu)) {
1054 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1055 }
1056
1057 qemu_wait_io_event_common(cpu);
1058 }
1059
/* Thread body for a KVM vCPU.  @arg is the CPUState.  Initialises the KVM
 * vCPU (exiting the process on failure), signals creation, then alternates
 * between kvm_cpu_exec() and waiting for events until the vCPU is flagged
 * for unplug and can no longer run.  Always returns NULL.
 */
1060 static void *qemu_kvm_cpu_thread_fn(void *arg)
1061 {
1062 CPUState *cpu = arg;
1063 int r;
1064
1065 rcu_register_thread();
1066
/* The iothread lock is held from here until just before the thread returns. */
1067 qemu_mutex_lock_iothread();
1068 qemu_thread_get_self(cpu->thread);
1069 cpu->thread_id = qemu_get_thread_id();
1070 cpu->can_do_io = 1;
1071 current_cpu = cpu;
1072
1073 r = kvm_init_vcpu(cpu);
1074 if (r < 0) {
1075 fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
1076 exit(1);
1077 }
1078
1079 kvm_init_cpu_signals(cpu);
1080
1081 /* signal CPU creation */
1082 cpu->created = true;
1083 qemu_cond_signal(&qemu_cpu_cond);
1084
1085 do {
1086 if (cpu_can_run(cpu)) {
1087 r = kvm_cpu_exec(cpu);
1088 if (r == EXCP_DEBUG) {
1089 cpu_handle_guest_debug(cpu);
1090 }
1091 }
1092 qemu_kvm_wait_io_event(cpu);
/* Leave the loop only once unplug is requested and the vCPU cannot run. */
1093 } while (!cpu->unplug || cpu_can_run(cpu));
1094
1095 qemu_kvm_destroy_vcpu(cpu);
/* Signal qemu_cpu_cond again so a waiter can observe created == false. */
1096 cpu->created = false;
1097 qemu_cond_signal(&qemu_cpu_cond);
1098 qemu_mutex_unlock_iothread();
1099 return NULL;
1100 }
1101
1102 static void *qemu_dummy_cpu_thread_fn(void *arg)
1103 {
1104 #ifdef _WIN32
1105 fprintf(stderr, "qtest is not supported under Windows\n");
1106 exit(1);
1107 #else
1108 CPUState *cpu = arg;
1109 sigset_t waitset;
1110 int r;
1111
1112 rcu_register_thread();
1113
1114 qemu_mutex_lock_iothread();
1115 qemu_thread_get_self(cpu->thread);
1116 cpu->thread_id = qemu_get_thread_id();
1117 cpu->can_do_io = 1;
1118 current_cpu = cpu;
1119
1120 sigemptyset(&waitset);
1121 sigaddset(&waitset, SIG_IPI);
1122
1123 /* signal CPU creation */
1124 cpu->created = true;
1125 qemu_cond_signal(&qemu_cpu_cond);
1126
1127 while (1) {
1128 qemu_mutex_unlock_iothread();
1129 do {
1130 int sig;
1131 r = sigwait(&waitset, &sig);
1132 } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1133 if (r == -1) {
1134 perror("sigwait");
1135 exit(1);
1136 }
1137 qemu_mutex_lock_iothread();
1138 qemu_wait_io_event_common(cpu);
1139 }
1140
1141 return NULL;
1142 #endif
1143 }
1144
1145 static int64_t tcg_get_icount_limit(void)
1146 {
1147 int64_t deadline;
1148
1149 if (replay_mode != REPLAY_MODE_PLAY) {
1150 deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1151
1152 /* Maintain prior (possibly buggy) behaviour where if no deadline
1153 * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1154 * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1155 * nanoseconds.
1156 */
1157 if ((deadline < 0) || (deadline > INT32_MAX)) {
1158 deadline = INT32_MAX;
1159 }
1160
1161 return qemu_icount_round(deadline);
1162 } else {
1163 return replay_get_instructions();
1164 }
1165 }
1166
1167 static void handle_icount_deadline(void)
1168 {
1169 assert(qemu_in_vcpu_thread());
1170 if (use_icount) {
1171 int64_t deadline =
1172 qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1173
1174 if (deadline == 0) {
1175 /* Wake up other AioContexts. */
1176 qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1177 qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1178 }
1179 }
1180 }
1181
1182 static int tcg_cpu_exec(CPUState *cpu)
1183 {
1184 int ret;
1185 #ifdef CONFIG_PROFILER
1186 int64_t ti;
1187 #endif
1188
1189 #ifdef CONFIG_PROFILER
1190 ti = profile_getclock();
1191 #endif
1192 if (use_icount) {
1193 int64_t count;
1194 int decr;
1195 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1196 + cpu->icount_extra);
1197 cpu->icount_decr.u16.low = 0;
1198 cpu->icount_extra = 0;
1199 count = tcg_get_icount_limit();
1200 timers_state.qemu_icount += count;
1201 decr = (count > 0xffff) ? 0xffff : count;
1202 count -= decr;
1203 cpu->icount_decr.u16.low = decr;
1204 cpu->icount_extra = count;
1205 }
1206 qemu_mutex_unlock_iothread();
1207 cpu_exec_start(cpu);
1208 ret = cpu_exec(cpu);
1209 cpu_exec_end(cpu);
1210 qemu_mutex_lock_iothread();
1211 #ifdef CONFIG_PROFILER
1212 tcg_time += profile_getclock() - ti;
1213 #endif
1214 if (use_icount) {
1215 /* Fold pending instructions back into the
1216 instruction counter, and clear the interrupt flag. */
1217 timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1218 + cpu->icount_extra);
1219 cpu->icount_decr.u32 = 0;
1220 cpu->icount_extra = 0;
1221 replay_account_executed_instructions();
1222 }
1223 return ret;
1224 }
1225
1226 /* Destroy any remaining vCPUs which have been unplugged and have
1227 * finished running
1228 */
1229 static void deal_with_unplugged_cpus(void)
1230 {
1231 CPUState *cpu;
1232
1233 CPU_FOREACH(cpu) {
1234 if (cpu->unplug && !cpu_can_run(cpu)) {
1235 qemu_tcg_destroy_vcpu(cpu);
1236 cpu->created = false;
1237 qemu_cond_signal(&qemu_cpu_cond);
1238 break;
1239 }
1240 }
1241 }
1242
1243 /* Single-threaded TCG
1244 *
1245 * In the single-threaded case each vCPU is simulated in turn. If
1246 * there is more than a single vCPU we create a simple timer to kick
1247 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1248 * This is done explicitly rather than relying on side-effects
1249 * elsewhere.
1250 */
1251
1252 static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1253 {
1254 CPUState *cpu = arg;
1255
1256 rcu_register_thread();
1257
1258 qemu_mutex_lock_iothread();
1259 qemu_thread_get_self(cpu->thread);
1260
1261 CPU_FOREACH(cpu) {
1262 cpu->thread_id = qemu_get_thread_id();
1263 cpu->created = true;
1264 cpu->can_do_io = 1;
1265 }
1266 qemu_cond_signal(&qemu_cpu_cond);
1267
1268 /* wait for initial kick-off after machine start */
1269 while (first_cpu->stopped) {
1270 qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1271
1272 /* process any pending work */
1273 CPU_FOREACH(cpu) {
1274 current_cpu = cpu;
1275 qemu_wait_io_event_common(cpu);
1276 }
1277 }
1278
1279 start_tcg_kick_timer();
1280
1281 cpu = first_cpu;
1282
1283 /* process any pending work */
1284 cpu->exit_request = 1;
1285
1286 while (1) {
1287 /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
1288 qemu_account_warp_timer();
1289
1290 /* Run the timers here. This is much more efficient than
1291 * waking up the I/O thread and waiting for completion.
1292 */
1293 handle_icount_deadline();
1294
1295 if (!cpu) {
1296 cpu = first_cpu;
1297 }
1298
1299 while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1300
1301 atomic_mb_set(&tcg_current_rr_cpu, cpu);
1302 current_cpu = cpu;
1303
1304 qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1305 (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1306
1307 if (cpu_can_run(cpu)) {
1308 int r;
1309 r = tcg_cpu_exec(cpu);
1310 if (r == EXCP_DEBUG) {
1311 cpu_handle_guest_debug(cpu);
1312 break;
1313 } else if (r == EXCP_ATOMIC) {
1314 qemu_mutex_unlock_iothread();
1315 cpu_exec_step_atomic(cpu);
1316 qemu_mutex_lock_iothread();
1317 break;
1318 }
1319 } else if (cpu->stop) {
1320 if (cpu->unplug) {
1321 cpu = CPU_NEXT(cpu);
1322 }
1323 break;
1324 }
1325
1326 cpu = CPU_NEXT(cpu);
1327 } /* while (cpu && !cpu->exit_request).. */
1328
1329 /* Does not need atomic_mb_set because a spurious wakeup is okay. */
1330 atomic_set(&tcg_current_rr_cpu, NULL);
1331
1332 if (cpu && cpu->exit_request) {
1333 atomic_mb_set(&cpu->exit_request, 0);
1334 }
1335
1336 qemu_tcg_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
1337 deal_with_unplugged_cpus();
1338 }
1339
1340 return NULL;
1341 }
1342
1343 static void *qemu_hax_cpu_thread_fn(void *arg)
1344 {
1345 CPUState *cpu = arg;
1346 int r;
1347
1348 qemu_mutex_lock_iothread();
1349 qemu_thread_get_self(cpu->thread);
1350
1351 cpu->thread_id = qemu_get_thread_id();
1352 cpu->created = true;
1353 cpu->halted = 0;
1354 current_cpu = cpu;
1355
1356 hax_init_vcpu(cpu);
1357 qemu_cond_signal(&qemu_cpu_cond);
1358
1359 while (1) {
1360 if (cpu_can_run(cpu)) {
1361 r = hax_smp_cpu_exec(cpu);
1362 if (r == EXCP_DEBUG) {
1363 cpu_handle_guest_debug(cpu);
1364 }
1365 }
1366
1367 while (cpu_thread_is_idle(cpu)) {
1368 qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1369 }
1370 #ifdef _WIN32
1371 SleepEx(0, TRUE);
1372 #endif
1373 qemu_wait_io_event_common(cpu);
1374 }
1375 return NULL;
1376 }
1377
1378 #ifdef _WIN32
1379 static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1380 {
1381 }
1382 #endif
1383
1384 /* Multi-threaded TCG
1385 *
1386 * In the multi-threaded case each vCPU has its own thread. The TLS
1387 * variable current_cpu can be used deep in the code to find the
1388 * current CPUState for a given thread.
1389 */
1390
1391 static void *qemu_tcg_cpu_thread_fn(void *arg)
1392 {
1393 CPUState *cpu = arg;
1394
1395 rcu_register_thread();
1396
1397 qemu_mutex_lock_iothread();
1398 qemu_thread_get_self(cpu->thread);
1399
1400 cpu->thread_id = qemu_get_thread_id();
1401 cpu->created = true;
1402 cpu->can_do_io = 1;
1403 current_cpu = cpu;
1404 qemu_cond_signal(&qemu_cpu_cond);
1405
1406 /* process any pending work */
1407 cpu->exit_request = 1;
1408
1409 while (1) {
1410 if (cpu_can_run(cpu)) {
1411 int r;
1412 r = tcg_cpu_exec(cpu);
1413 switch (r) {
1414 case EXCP_DEBUG:
1415 cpu_handle_guest_debug(cpu);
1416 break;
1417 case EXCP_HALTED:
1418 /* during start-up the vCPU is reset and the thread is
1419 * kicked several times. If we don't ensure we go back
1420 * to sleep in the halted state we won't cleanly
1421 * start-up when the vCPU is enabled.
1422 *
1423 * cpu->halted should ensure we sleep in wait_io_event
1424 */
1425 g_assert(cpu->halted);
1426 break;
1427 case EXCP_ATOMIC:
1428 qemu_mutex_unlock_iothread();
1429 cpu_exec_step_atomic(cpu);
1430 qemu_mutex_lock_iothread();
1431 default:
1432 /* Ignore everything else? */
1433 break;
1434 }
1435 }
1436
1437 handle_icount_deadline();
1438
1439 atomic_mb_set(&cpu->exit_request, 0);
1440 qemu_tcg_wait_io_event(cpu);
1441 }
1442
1443 return NULL;
1444 }
1445
1446 static void qemu_cpu_kick_thread(CPUState *cpu)
1447 {
1448 #ifndef _WIN32
1449 int err;
1450
1451 if (cpu->thread_kicked) {
1452 return;
1453 }
1454 cpu->thread_kicked = true;
1455 err = pthread_kill(cpu->thread->thread, SIG_IPI);
1456 if (err) {
1457 fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1458 exit(1);
1459 }
1460 #else /* _WIN32 */
1461 if (!qemu_cpu_is_self(cpu)) {
1462 if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1463 fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1464 __func__, GetLastError());
1465 exit(1);
1466 }
1467 }
1468 #endif
1469 }
1470
1471 void qemu_cpu_kick(CPUState *cpu)
1472 {
1473 qemu_cond_broadcast(cpu->halt_cond);
1474 if (tcg_enabled()) {
1475 cpu_exit(cpu);
1476 /* NOP unless doing single-thread RR */
1477 qemu_cpu_kick_rr_cpu();
1478 } else {
1479 if (hax_enabled()) {
1480 /*
1481 * FIXME: race condition with the exit_request check in
1482 * hax_vcpu_hax_exec
1483 */
1484 cpu->exit_request = 1;
1485 }
1486 qemu_cpu_kick_thread(cpu);
1487 }
1488 }
1489
1490 void qemu_cpu_kick_self(void)
1491 {
1492 assert(current_cpu);
1493 qemu_cpu_kick_thread(current_cpu);
1494 }
1495
1496 bool qemu_cpu_is_self(CPUState *cpu)
1497 {
1498 return qemu_thread_is_self(cpu->thread);
1499 }
1500
1501 bool qemu_in_vcpu_thread(void)
1502 {
1503 return current_cpu && qemu_cpu_is_self(current_cpu);
1504 }
1505
/*
 * Per-thread flag, maintained by qemu_mutex_lock_iothread() and
 * qemu_mutex_unlock_iothread().  Static storage starts out false.
 */
static __thread bool iothread_locked;

/* Return the calling thread's iothread_locked flag. */
bool qemu_mutex_iothread_locked(void)
{
    bool held = iothread_locked;

    return held;
}
1512
1513 void qemu_mutex_lock_iothread(void)
1514 {
1515 g_assert(!qemu_mutex_iothread_locked());
1516 qemu_mutex_lock(&qemu_global_mutex);
1517 iothread_locked = true;
1518 }
1519
1520 void qemu_mutex_unlock_iothread(void)
1521 {
1522 g_assert(qemu_mutex_iothread_locked());
1523 iothread_locked = false;
1524 qemu_mutex_unlock(&qemu_global_mutex);
1525 }
1526
1527 static bool all_vcpus_paused(void)
1528 {
1529 CPUState *cpu;
1530
1531 CPU_FOREACH(cpu) {
1532 if (!cpu->stopped) {
1533 return false;
1534 }
1535 }
1536
1537 return true;
1538 }
1539
1540 void pause_all_vcpus(void)
1541 {
1542 CPUState *cpu;
1543
1544 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1545 CPU_FOREACH(cpu) {
1546 cpu->stop = true;
1547 qemu_cpu_kick(cpu);
1548 }
1549
1550 if (qemu_in_vcpu_thread()) {
1551 cpu_stop_current();
1552 }
1553
1554 while (!all_vcpus_paused()) {
1555 qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1556 CPU_FOREACH(cpu) {
1557 qemu_cpu_kick(cpu);
1558 }
1559 }
1560 }
1561
1562 void cpu_resume(CPUState *cpu)
1563 {
1564 cpu->stop = false;
1565 cpu->stopped = false;
1566 qemu_cpu_kick(cpu);
1567 }
1568
1569 void resume_all_vcpus(void)
1570 {
1571 CPUState *cpu;
1572
1573 qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1574 CPU_FOREACH(cpu) {
1575 cpu_resume(cpu);
1576 }
1577 }
1578
1579 void cpu_remove(CPUState *cpu)
1580 {
1581 cpu->stop = true;
1582 cpu->unplug = true;
1583 qemu_cpu_kick(cpu);
1584 }
1585
1586 void cpu_remove_sync(CPUState *cpu)
1587 {
1588 cpu_remove(cpu);
1589 while (cpu->created) {
1590 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1591 }
1592 }
1593
1594 /* For temporary buffers for forming a name */
1595 #define VCPU_THREAD_NAME_SIZE 16
1596
1597 static void qemu_tcg_init_vcpu(CPUState *cpu)
1598 {
1599 char thread_name[VCPU_THREAD_NAME_SIZE];
1600 static QemuCond *single_tcg_halt_cond;
1601 static QemuThread *single_tcg_cpu_thread;
1602
1603 if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1604 cpu->thread = g_malloc0(sizeof(QemuThread));
1605 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1606 qemu_cond_init(cpu->halt_cond);
1607
1608 if (qemu_tcg_mttcg_enabled()) {
1609 /* create a thread per vCPU with TCG (MTTCG) */
1610 parallel_cpus = true;
1611 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1612 cpu->cpu_index);
1613
1614 qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1615 cpu, QEMU_THREAD_JOINABLE);
1616
1617 } else {
1618 /* share a single thread for all cpus with TCG */
1619 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1620 qemu_thread_create(cpu->thread, thread_name,
1621 qemu_tcg_rr_cpu_thread_fn,
1622 cpu, QEMU_THREAD_JOINABLE);
1623
1624 single_tcg_halt_cond = cpu->halt_cond;
1625 single_tcg_cpu_thread = cpu->thread;
1626 }
1627 #ifdef _WIN32
1628 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1629 #endif
1630 while (!cpu->created) {
1631 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1632 }
1633 } else {
1634 /* For non-MTTCG cases we share the thread */
1635 cpu->thread = single_tcg_cpu_thread;
1636 cpu->halt_cond = single_tcg_halt_cond;
1637 }
1638 }
1639
1640 static void qemu_hax_start_vcpu(CPUState *cpu)
1641 {
1642 char thread_name[VCPU_THREAD_NAME_SIZE];
1643
1644 cpu->thread = g_malloc0(sizeof(QemuThread));
1645 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1646 qemu_cond_init(cpu->halt_cond);
1647
1648 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1649 cpu->cpu_index);
1650 qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1651 cpu, QEMU_THREAD_JOINABLE);
1652 #ifdef _WIN32
1653 cpu->hThread = qemu_thread_get_handle(cpu->thread);
1654 #endif
1655 while (!cpu->created) {
1656 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1657 }
1658 }
1659
1660 static void qemu_kvm_start_vcpu(CPUState *cpu)
1661 {
1662 char thread_name[VCPU_THREAD_NAME_SIZE];
1663
1664 cpu->thread = g_malloc0(sizeof(QemuThread));
1665 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1666 qemu_cond_init(cpu->halt_cond);
1667 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1668 cpu->cpu_index);
1669 qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1670 cpu, QEMU_THREAD_JOINABLE);
1671 while (!cpu->created) {
1672 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1673 }
1674 }
1675
1676 static void qemu_dummy_start_vcpu(CPUState *cpu)
1677 {
1678 char thread_name[VCPU_THREAD_NAME_SIZE];
1679
1680 cpu->thread = g_malloc0(sizeof(QemuThread));
1681 cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1682 qemu_cond_init(cpu->halt_cond);
1683 snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1684 cpu->cpu_index);
1685 qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1686 QEMU_THREAD_JOINABLE);
1687 while (!cpu->created) {
1688 qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1689 }
1690 }
1691
1692 void qemu_init_vcpu(CPUState *cpu)
1693 {
1694 cpu->nr_cores = smp_cores;
1695 cpu->nr_threads = smp_threads;
1696 cpu->stopped = true;
1697
1698 if (!cpu->as) {
1699 /* If the target cpu hasn't set up any address spaces itself,
1700 * give it the default one.
1701 */
1702 AddressSpace *as = address_space_init_shareable(cpu->memory,
1703 "cpu-memory");
1704 cpu->num_ases = 1;
1705 cpu_address_space_init(cpu, as, 0);
1706 }
1707
1708 if (kvm_enabled()) {
1709 qemu_kvm_start_vcpu(cpu);
1710 } else if (hax_enabled()) {
1711 qemu_hax_start_vcpu(cpu);
1712 } else if (tcg_enabled()) {
1713 qemu_tcg_init_vcpu(cpu);
1714 } else {
1715 qemu_dummy_start_vcpu(cpu);
1716 }
1717 }
1718
1719 void cpu_stop_current(void)
1720 {
1721 if (current_cpu) {
1722 current_cpu->stop = false;
1723 current_cpu->stopped = true;
1724 cpu_exit(current_cpu);
1725 qemu_cond_broadcast(&qemu_pause_cond);
1726 }
1727 }
1728
1729 int vm_stop(RunState state)
1730 {
1731 if (qemu_in_vcpu_thread()) {
1732 qemu_system_vmstop_request_prepare();
1733 qemu_system_vmstop_request(state);
1734 /*
1735 * FIXME: should not return to device code in case
1736 * vm_stop() has been requested.
1737 */
1738 cpu_stop_current();
1739 return 0;
1740 }
1741
1742 return do_vm_stop(state);
1743 }
1744
1745 /**
1746 * Prepare for (re)starting the VM.
1747 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
1748 * running or in case of an error condition), 0 otherwise.
1749 */
1750 int vm_prepare_start(void)
1751 {
1752 RunState requested;
1753 int res = 0;
1754
1755 qemu_vmstop_requested(&requested);
1756 if (runstate_is_running() && requested == RUN_STATE__MAX) {
1757 return -1;
1758 }
1759
1760 /* Ensure that a STOP/RESUME pair of events is emitted if a
1761 * vmstop request was pending. The BLOCK_IO_ERROR event, for
1762 * example, according to documentation is always followed by
1763 * the STOP event.
1764 */
1765 if (runstate_is_running()) {
1766 qapi_event_send_stop(&error_abort);
1767 res = -1;
1768 } else {
1769 replay_enable_events();
1770 cpu_enable_ticks();
1771 runstate_set(RUN_STATE_RUNNING);
1772 vm_state_notify(1, RUN_STATE_RUNNING);
1773 }
1774
1775 /* We are sending this now, but the CPUs will be resumed shortly later */
1776 qapi_event_send_resume(&error_abort);
1777 return res;
1778 }
1779
/* Resume every vCPU, but only when vm_prepare_start() returns 0. */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
1786
1787 /* does a state transition even if the VM is already stopped,
1788 current state is forgotten forever */
1789 int vm_stop_force_state(RunState state)
1790 {
1791 if (runstate_is_running()) {
1792 return vm_stop(state);
1793 } else {
1794 runstate_set(state);
1795
1796 bdrv_drain_all();
1797 /* Make sure to return an error if the flush in a previous vm_stop()
1798 * failed. */
1799 return bdrv_flush_all();
1800 }
1801 }
1802
1803 void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1804 {
1805 /* XXX: implement xxx_cpu_list for targets that still miss it */
1806 #if defined(cpu_list)
1807 cpu_list(f, cpu_fprintf);
1808 #endif
1809 }
1810
1811 CpuInfoList *qmp_query_cpus(Error **errp)
1812 {
1813 CpuInfoList *head = NULL, *cur_item = NULL;
1814 CPUState *cpu;
1815
1816 CPU_FOREACH(cpu) {
1817 CpuInfoList *info;
1818 #if defined(TARGET_I386)
1819 X86CPU *x86_cpu = X86_CPU(cpu);
1820 CPUX86State *env = &x86_cpu->env;
1821 #elif defined(TARGET_PPC)
1822 PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1823 CPUPPCState *env = &ppc_cpu->env;
1824 #elif defined(TARGET_SPARC)
1825 SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1826 CPUSPARCState *env = &sparc_cpu->env;
1827 #elif defined(TARGET_MIPS)
1828 MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1829 CPUMIPSState *env = &mips_cpu->env;
1830 #elif defined(TARGET_TRICORE)
1831 TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1832 CPUTriCoreState *env = &tricore_cpu->env;
1833 #endif
1834
1835 cpu_synchronize_state(cpu);
1836
1837 info = g_malloc0(sizeof(*info));
1838 info->value = g_malloc0(sizeof(*info->value));
1839 info->value->CPU = cpu->cpu_index;
1840 info->value->current = (cpu == first_cpu);
1841 info->value->halted = cpu->halted;
1842 info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1843 info->value->thread_id = cpu->thread_id;
1844 #if defined(TARGET_I386)
1845 info->value->arch = CPU_INFO_ARCH_X86;
1846 info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1847 #elif defined(TARGET_PPC)
1848 info->value->arch = CPU_INFO_ARCH_PPC;
1849 info->value->u.ppc.nip = env->nip;
1850 #elif defined(TARGET_SPARC)
1851 info->value->arch = CPU_INFO_ARCH_SPARC;
1852 info->value->u.q_sparc.pc = env->pc;
1853 info->value->u.q_sparc.npc = env->npc;
1854 #elif defined(TARGET_MIPS)
1855 info->value->arch = CPU_INFO_ARCH_MIPS;
1856 info->value->u.q_mips.PC = env->active_tc.PC;
1857 #elif defined(TARGET_TRICORE)
1858 info->value->arch = CPU_INFO_ARCH_TRICORE;
1859 info->value->u.tricore.PC = env->PC;
1860 #else
1861 info->value->arch = CPU_INFO_ARCH_OTHER;
1862 #endif
1863
1864 /* XXX: waiting for the qapi to support GSList */
1865 if (!cur_item) {
1866 head = cur_item = info;
1867 } else {
1868 cur_item->next = info;
1869 cur_item = info;
1870 }
1871 }
1872
1873 return head;
1874 }
1875
1876 void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1877 bool has_cpu, int64_t cpu_index, Error **errp)
1878 {
1879 FILE *f;
1880 uint32_t l;
1881 CPUState *cpu;
1882 uint8_t buf[1024];
1883 int64_t orig_addr = addr, orig_size = size;
1884
1885 if (!has_cpu) {
1886 cpu_index = 0;
1887 }
1888
1889 cpu = qemu_get_cpu(cpu_index);
1890 if (cpu == NULL) {
1891 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1892 "a CPU number");
1893 return;
1894 }
1895
1896 f = fopen(filename, "wb");
1897 if (!f) {
1898 error_setg_file_open(errp, errno, filename);
1899 return;
1900 }
1901
1902 while (size != 0) {
1903 l = sizeof(buf);
1904 if (l > size)
1905 l = size;
1906 if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1907 error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1908 " specified", orig_addr, orig_size);
1909 goto exit;
1910 }
1911 if (fwrite(buf, 1, l, f) != l) {
1912 error_setg(errp, QERR_IO_ERROR);
1913 goto exit;
1914 }
1915 addr += l;
1916 size -= l;
1917 }
1918
1919 exit:
1920 fclose(f);
1921 }
1922
1923 void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1924 Error **errp)
1925 {
1926 FILE *f;
1927 uint32_t l;
1928 uint8_t buf[1024];
1929
1930 f = fopen(filename, "wb");
1931 if (!f) {
1932 error_setg_file_open(errp, errno, filename);
1933 return;
1934 }
1935
1936 while (size != 0) {
1937 l = sizeof(buf);
1938 if (l > size)
1939 l = size;
1940 cpu_physical_memory_read(addr, buf, l);
1941 if (fwrite(buf, 1, l, f) != l) {
1942 error_setg(errp, QERR_IO_ERROR);
1943 goto exit;
1944 }
1945 addr += l;
1946 size -= l;
1947 }
1948
1949 exit:
1950 fclose(f);
1951 }
1952
1953 void qmp_inject_nmi(Error **errp)
1954 {
1955 nmi_monitor_handle(monitor_get_cpu_index(), errp);
1956 }
1957
1958 void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1959 {
1960 if (!use_icount) {
1961 return;
1962 }
1963
1964 cpu_fprintf(f, "Host - Guest clock %"PRIi64" ms\n",
1965 (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
1966 if (icount_align_option) {
1967 cpu_fprintf(f, "Max guest delay %"PRIi64" ms\n", -max_delay/SCALE_MS);
1968 cpu_fprintf(f, "Max guest advance %"PRIi64" ms\n", max_advance/SCALE_MS);
1969 } else {
1970 cpu_fprintf(f, "Max guest delay NA\n");
1971 cpu_fprintf(f, "Max guest advance NA\n");
1972 }
1973 }