icount: reorganize icount_warp_rt
[qemu.git] / cpus.c
/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Needed early for CONFIG_BSD etc. */
#include "config-host.h"

#include "monitor/monitor.h"
#include "sysemu/sysemu.h"
#include "exec/gdbstub.h"
#include "sysemu/dma.h"
#include "sysemu/kvm.h"
#include "qmp-commands.h"

#include "qemu/thread.h"
#include "sysemu/cpus.h"
#include "sysemu/qtest.h"
#include "qemu/main-loop.h"
#include "qemu/bitmap.h"
#include "qemu/seqlock.h"

#ifndef _WIN32
#include "qemu/compatfd.h"
#endif

#ifdef CONFIG_LINUX

#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */

static CPUState *next_cpu;

bool cpu_is_stopped(CPUState *cpu)
{
    return cpu->stopped || !runstate_is_running();
}

static bool cpu_thread_is_idle(CPUState *cpu)
{
    if (cpu->stop || cpu->queued_work_first) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return true;
    }
    if (!cpu->halted || qemu_cpu_has_work(cpu) ||
        kvm_halt_in_kernel()) {
        return false;
    }
    return true;
}

static bool all_cpu_threads_idle(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu_thread_is_idle(cpu)) {
            return false;
        }
    }
    return true;
}

/***********************************************************/
/* guest cycle counter */

/* Conversion factor from emulated instructions to virtual clock ticks. */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
#define MAX_ICOUNT_SHIFT 10
/* Compensate for varying guest execution speed. */
static int64_t qemu_icount_bias;
static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;
static int64_t vm_clock_warp_start;
static int64_t qemu_icount;

typedef struct TimersState {
    /* Protected by BQL. */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read out of BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;
} TimersState;

static TimersState timers_state;

/* Return the virtual CPU time, based on the instruction counter. */
int64_t cpu_get_icount(void)
{
    int64_t icount;
    CPUState *cpu = current_cpu;

    icount = qemu_icount;
    if (cpu) {
        CPUArchState *env = cpu->env_ptr;
        if (!can_do_io(env)) {
            fprintf(stderr, "Bad clock read\n");
        }
        icount -= (env->icount_decr.u16.low + env->icount_extra);
    }
    return qemu_icount_bias + (icount << icount_time_shift);
}

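/* Illustrative arithmetic (added commentary, not part of the original
 * source): with icount_time_shift == 3, each completed guest instruction
 * accounts for 1 << 3 == 8 ns of QEMU_CLOCK_VIRTUAL time, i.e. a nominal
 * speed of 1e9 / 8 = 125 MIPS.  A guest that has retired 1000 instructions
 * with qemu_icount_bias == 0 therefore reads cpu_get_icount() == 8000 ns.
 */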
/* return the host CPU cycle counter and handle stop/restart */
/* Caller must hold the BQL */
int64_t cpu_get_ticks(void)
{
    if (use_icount) {
        return cpu_get_icount();
    }
    if (!timers_state.cpu_ticks_enabled) {
        return timers_state.cpu_ticks_offset;
    } else {
        int64_t ticks;
        ticks = cpu_get_real_ticks();
        if (timers_state.cpu_ticks_prev > ticks) {
            /* Note: non-increasing ticks may happen if the host uses
               software suspend. */
            timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        }
        timers_state.cpu_ticks_prev = ticks;
        return ticks + timers_state.cpu_ticks_offset;
    }
}

static int64_t cpu_get_clock_locked(void)
{
    int64_t ti;

    if (!timers_state.cpu_ticks_enabled) {
        ti = timers_state.cpu_clock_offset;
    } else {
        ti = get_clock();
        ti += timers_state.cpu_clock_offset;
    }

    return ti;
}

/* return the host CPU monotonic timer and handle stop/restart */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = cpu_get_clock_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return ti;
}

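/* Note on the pattern above (added commentary): readers of cpu_get_clock()
 * never block.  seqlock_read_begin() samples the sequence counter and
 * seqlock_read_retry() re-checks it; if a writer (always under the BQL)
 * updated cpu_clock_offset in between, the read is simply redone.
 */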
/* enable cpu_get_ticks()
 * Caller must hold the BQL, which serves as the mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
}

/* disable cpu_get_ticks(): the clock is stopped.  You must not call
 * cpu_get_ticks() after that.
 * Caller must hold the BQL, which serves as the mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset = cpu_get_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
}

/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop. */
#define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)

static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;
    static int64_t last_delta;

    /* If the VM is not running, then do nothing. */
    if (!runstate_is_running()) {
        return;
    }

    cur_time = cpu_get_clock();
    cur_icount = cpu_get_icount();

    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down. */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up. */
        icount_time_shift++;
    }
    last_delta = delta;
    qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift);
}

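/* Worked example (added commentary): if virtual time has run well ahead of
 * real time (delta > 0, beyond the wobble allowance), icount_time_shift
 * drops, say from 3 to 2, so each instruction now adds 4 ns instead of
 * 8 ns and real time can catch up.  The bias is recomputed at the same
 * moment so that cpu_get_icount() stays continuous across the change.
 */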
static void icount_adjust_rt(void *opaque)
{
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000);
    icount_adjust();
}

static void icount_adjust_vm(void *opaque)
{
    timer_mod(icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              get_ticks_per_sec() / 10);
    icount_adjust();
}

static int64_t qemu_icount_round(int64_t count)
{
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
}

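/* Example (added commentary): qemu_icount_round() is a ceiling division by
 * 2^icount_time_shift, converting a nanosecond deadline into a whole number
 * of instructions.  With icount_time_shift == 3, a 20 ns deadline becomes
 * (20 + 7) >> 3 == 3 instructions, so the virtual clock overshoots the
 * deadline by less than one instruction's worth of time.
 */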
static void icount_warp_rt(void *opaque)
{
    if (vm_clock_warp_start == -1) {
        return;
    }

    if (runstate_is_running()) {
        int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_time = cpu_get_clock();
            int64_t cur_icount = cpu_get_icount();
            int64_t delta = cur_time - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

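/* Added commentary: "warping" means crediting idle wall-clock time to the
 * virtual clock in one jump by bumping qemu_icount_bias.  In adaptive mode
 * (use_icount == 2) the jump is clamped so that virtual time only catches
 * up to real time, never overtakes it; with a fixed shift (use_icount == 1)
 * the whole sleep is credited.
 */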
void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    assert(qtest_enabled());
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        int64_t warp = MIN(dest - clock, deadline);
        qemu_icount_bias += warp;
        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}

void qemu_clock_warp(QEMUClockType type)
{
    int64_t deadline;

    /*
     * There are too many global variables to make the "warp" behavior
     * applicable to other clocks.  But a clock argument removes the
     * need for if statements all over the place.
     */
    if (type != QEMU_CLOCK_VIRTUAL || !use_icount) {
        return;
    }

    /*
     * If the CPUs have been sleeping, advance QEMU_CLOCK_VIRTUAL timer now.
     * This ensures that the deadline for the timer is computed correctly
     * below.  This also makes sure that the insn counter is synchronized
     * before the CPU starts running, in case the CPU is woken by an event
     * other than the earliest QEMU_CLOCK_VIRTUAL timer.
     */
    icount_warp_rt(NULL);
    if (!all_cpu_threads_idle() || !qemu_clock_has_timers(QEMU_CLOCK_VIRTUAL)) {
        timer_del(icount_warp_timer);
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount. */
        return;
    }

    vm_clock_warp_start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    /* We want to use the earliest deadline from ALL vm_clocks */
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

    /* Maintain prior (possibly buggy) behaviour where if no deadline
     * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
     * INT32_MAX nanoseconds ahead, we still use INT32_MAX
     * nanoseconds.
     */
    if ((deadline < 0) || (deadline > INT32_MAX)) {
        deadline = INT32_MAX;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes
         * to sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance
         * QEMU_CLOCK_VIRTUAL.
         *
         * An extreme solution for this problem would be to never let VCPUs
         * sleep in icount mode if there is a pending QEMU_CLOCK_VIRTUAL
         * timer; rather time could just advance to the next QEMU_CLOCK_VIRTUAL
         * event.  Instead, we do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL
         * after some "real" time (related to the time left until the next
         * event) has passed.  The QEMU_CLOCK_REALTIME timer will do this.
         * This keeps the warps from being visible externally; for example,
         * you will not end up sending network packets continuously instead
         * of every 100 ms.
         */
        timer_mod(icount_warp_timer, vm_clock_warp_start + deadline);
    } else if (deadline == 0) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .minimum_version_id_old = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    }
};

void configure_icount(const char *option)
{
    seqlock_init(&timers_state.vm_clock_seqlock, NULL);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    if (!option) {
        return;
    }

    icount_warp_timer = timer_new_ns(QEMU_CLOCK_REALTIME,
                                     icount_warp_rt, NULL);
    if (strcmp(option, "auto") != 0) {
        icount_time_shift = strtol(option, NULL, 0);
        use_icount = 1;
        return;
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway. */
    icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers. */
    icount_rt_timer = timer_new_ms(QEMU_CLOCK_REALTIME,
                                   icount_adjust_rt, NULL);
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000);
    icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                   icount_adjust_vm, NULL);
    timer_mod(icount_vm_timer,
              qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
              get_ticks_per_sec() / 10);
}

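/* Usage sketch (added commentary, matching the option parsing above):
 * "-icount 3" selects a fixed rate of 2^3 = 8 ns per instruction
 * (use_icount == 1), while "-icount auto" starts from the same shift but
 * lets icount_adjust_rt()/icount_adjust_vm() retune it at runtime
 * (use_icount == 2).
 */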
/***********************************************************/
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    CPU_FOREACH(cpu) {
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}

void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_state(cpu);
    }
}

void cpu_synchronize_all_post_reset(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_reset(cpu);
    }
}

void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_init(cpu);
    }
}

static int do_vm_stop(RunState state)
{
    int ret = 0;

    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        monitor_protocol_event(QEVENT_STOP, NULL);
    }

    bdrv_drain_all();
    ret = bdrv_flush_all();

    return ret;
}

static bool cpu_can_run(CPUState *cpu)
{
    if (cpu->stop) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return false;
    }
    return true;
}

static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}

static void cpu_signal(int sig)
{
    if (current_cpu) {
        cpu_exit(current_cpu);
    }
    exit_request = 1;
}

#ifdef CONFIG_LINUX
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        sigprocmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}

static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code,
                      (void *)(intptr_t)siginfo->ssi_addr)) {
        sigbus_reraise();
    }
}

static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
                sigbus_reraise();
            }
            break;
        default:
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
}

#else /* !CONFIG_LINUX */

static void qemu_init_sigbus(void)
{
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
}
#endif /* !CONFIG_LINUX */

#ifndef _WIN32
static void dummy_signal(int sig)
{
}

static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);
    r = kvm_set_signal_mask(cpu, &set);
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

static void qemu_tcg_init_cpu_signals(void)
{
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = cpu_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    sigemptyset(&set);
    sigaddset(&set, SIG_IPI);
    pthread_sigmask(SIG_UNBLOCK, &set, NULL);
}

#else /* _WIN32 */
static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    abort();
}

static void qemu_tcg_init_cpu_signals(void)
{
}
#endif /* _WIN32 */

static QemuMutex qemu_global_mutex;
static QemuCond qemu_io_proceeded_cond;
static bool iothread_requesting_mutex;

static QemuThread io_thread;

static QemuThread *tcg_cpu_thread;
static QemuCond *tcg_halt_cond;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;
static QemuCond qemu_work_cond;

void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_cond_init(&qemu_work_cond);
    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}

void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
{
    struct qemu_work_item wi;

    if (qemu_cpu_is_self(cpu)) {
        func(data);
        return;
    }

    wi.func = func;
    wi.data = data;
    wi.free = false;
    if (cpu->queued_work_first == NULL) {
        cpu->queued_work_first = &wi;
    } else {
        cpu->queued_work_last->next = &wi;
    }
    cpu->queued_work_last = &wi;
    wi.next = NULL;
    wi.done = false;

    qemu_cpu_kick(cpu);
    while (!wi.done) {
        CPUState *self_cpu = current_cpu;

        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
        current_cpu = self_cpu;
    }
}

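/* Added commentary: run_on_cpu() is synchronous, so the work item can live
 * on the caller's stack; the caller sleeps on qemu_work_cond (dropping the
 * BQL) until the target vCPU thread has run func.  async_run_on_cpu()
 * below is the fire-and-forget variant: the item is heap-allocated,
 * wi->free is set, and flush_queued_work() frees it after running it.
 */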
void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
{
    struct qemu_work_item *wi;

    if (qemu_cpu_is_self(cpu)) {
        func(data);
        return;
    }

    wi = g_malloc0(sizeof(struct qemu_work_item));
    wi->func = func;
    wi->data = data;
    wi->free = true;
    if (cpu->queued_work_first == NULL) {
        cpu->queued_work_first = wi;
    } else {
        cpu->queued_work_last->next = wi;
    }
    cpu->queued_work_last = wi;
    wi->next = NULL;
    wi->done = false;

    qemu_cpu_kick(cpu);
}

static void flush_queued_work(CPUState *cpu)
{
    struct qemu_work_item *wi;

    if (cpu->queued_work_first == NULL) {
        return;
    }

    while ((wi = cpu->queued_work_first)) {
        cpu->queued_work_first = wi->next;
        wi->func(wi->data);
        wi->done = true;
        if (wi->free) {
            g_free(wi);
        }
    }
    cpu->queued_work_last = NULL;
    qemu_cond_broadcast(&qemu_work_cond);
}

static void qemu_wait_io_event_common(CPUState *cpu)
{
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        qemu_cond_signal(&qemu_pause_cond);
    }
    flush_queued_work(cpu);
    cpu->thread_kicked = false;
}

static void qemu_tcg_wait_io_event(void)
{
    CPUState *cpu;

    while (all_cpu_threads_idle()) {
        /* Start accounting real time to the virtual clock if the CPUs
           are idle. */
        qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
        qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
    }

    while (iothread_requesting_mutex) {
        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
    }

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}

static void qemu_kvm_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    qemu_kvm_eat_signals(cpu);
    qemu_wait_io_event_common(cpu);
}

static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    qemu_mutex_lock(&qemu_global_mutex);
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    qemu_kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    }

    return NULL;
}

static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    fprintf(stderr, "qtest is not supported under Windows\n");
    exit(1);
#else
    CPUState *cpu = arg;
    sigset_t waitset;
    int r;

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    current_cpu = cpu;
    while (1) {
        current_cpu = NULL;
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        current_cpu = cpu;
        qemu_wait_io_event_common(cpu);
    }

    return NULL;
#endif
}

static void tcg_exec_all(void);

static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    qemu_tcg_init_cpu_signals();
    qemu_thread_get_self(cpu->thread);

    qemu_mutex_lock(&qemu_global_mutex);
    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (QTAILQ_FIRST(&cpus)->stopped) {
        qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            qemu_wait_io_event_common(cpu);
        }
    }

    while (1) {
        tcg_exec_all();

        if (use_icount) {
            int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

            if (deadline == 0) {
                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
            }
        }
        qemu_tcg_wait_io_event();
    }

    return NULL;
}

static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    if (!qemu_cpu_is_self(cpu)) {
        CONTEXT tcgContext;

        if (SuspendThread(cpu->hThread) == (DWORD)-1) {
            fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
                    GetLastError());
            exit(1);
        }

        /* On multi-core systems, we are not sure that the thread is actually
         * suspended until we can get the context.
         */
        tcgContext.ContextFlags = CONTEXT_CONTROL;
        /* GetThreadContext() returns nonzero on success, so retry while it
         * fails (the original looped on success, which would never
         * terminate once the context was readable). */
        while (GetThreadContext(cpu->hThread, &tcgContext) == 0) {
            continue;
        }

        cpu_signal(0);

        if (ResumeThread(cpu->hThread) == (DWORD)-1) {
            fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
                    GetLastError());
            exit(1);
        }
    }
#endif
}

void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (!tcg_enabled() && !cpu->thread_kicked) {
        qemu_cpu_kick_thread(cpu);
        cpu->thread_kicked = true;
    }
}

void qemu_cpu_kick_self(void)
{
#ifndef _WIN32
    assert(current_cpu);

    if (!current_cpu->thread_kicked) {
        qemu_cpu_kick_thread(current_cpu);
        current_cpu->thread_kicked = true;
    }
#else
    abort();
#endif
}

bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}

static bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}

void qemu_mutex_lock_iothread(void)
{
    if (!tcg_enabled()) {
        qemu_mutex_lock(&qemu_global_mutex);
    } else {
        iothread_requesting_mutex = true;
        if (qemu_mutex_trylock(&qemu_global_mutex)) {
            qemu_cpu_kick_thread(first_cpu);
            qemu_mutex_lock(&qemu_global_mutex);
        }
        iothread_requesting_mutex = false;
        qemu_cond_broadcast(&qemu_io_proceeded_cond);
    }
}

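/* Added commentary: under TCG the vCPU thread can hold the global mutex
 * (the BQL) for long execution bursts.  If trylock fails, the iothread
 * kicks the vCPU so it exits the execution loop and releases the lock;
 * iothread_requesting_mutex plus qemu_io_proceeded_cond then keep the TCG
 * thread from immediately grabbing the lock back (see
 * qemu_tcg_wait_io_event() above).
 */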
void qemu_mutex_unlock_iothread(void)
{
    qemu_mutex_unlock(&qemu_global_mutex);
}

static int all_vcpus_paused(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu->stopped) {
            return 0;
        }
    }

    return 1;
}

void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        cpu->stop = true;
        qemu_cpu_kick(cpu);
    }

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
        if (!kvm_enabled()) {
            CPU_FOREACH(cpu) {
                cpu->stop = false;
                cpu->stopped = true;
            }
            return;
        }
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }
}

void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}

void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}

static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    /* share a single thread for all cpus with TCG */
    if (!tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);
        tcg_halt_cond = cpu->halt_cond;
        qemu_thread_create(cpu->thread, qemu_tcg_cpu_thread_fn, cpu,
                           QEMU_THREAD_JOINABLE);
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
        tcg_cpu_thread = cpu->thread;
    } else {
        cpu->thread = tcg_cpu_thread;
        cpu->halt_cond = tcg_halt_cond;
    }
}

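/* Added commentary: TCG multiplexes every vCPU onto one host thread and
 * round-robins them in tcg_exec_all() below, which is why all vCPUs share
 * tcg_cpu_thread and tcg_halt_cond.  KVM and the dummy (qtest) backend
 * instead create one host thread per vCPU.
 */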
static void qemu_kvm_start_vcpu(CPUState *cpu)
{
    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    qemu_thread_create(cpu->thread, qemu_kvm_cpu_thread_fn, cpu,
                       QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

static void qemu_dummy_start_vcpu(CPUState *cpu)
{
    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    qemu_thread_create(cpu->thread, qemu_dummy_cpu_thread_fn, cpu,
                       QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

void qemu_init_vcpu(CPUState *cpu)
{
    cpu->nr_cores = smp_cores;
    cpu->nr_threads = smp_threads;
    cpu->stopped = true;
    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }
}

void cpu_stop_current(void)
{
    if (current_cpu) {
        current_cpu->stop = false;
        current_cpu->stopped = true;
        cpu_exit(current_cpu);
        qemu_cond_signal(&qemu_pause_cond);
    }
}

int vm_stop(RunState state)
{
    if (qemu_in_vcpu_thread()) {
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return 0;
    }

    return do_vm_stop(state);
}

/* does a state transition even if the VM is already stopped,
   current state is forgotten forever */
int vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        return vm_stop(state);
    } else {
        runstate_set(state);
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
        return bdrv_flush_all();
    }
}

static int tcg_cpu_exec(CPUArchState *env)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    if (use_icount) {
        int64_t count;
        int64_t deadline;
        int decr;
        qemu_icount -= (env->icount_decr.u16.low + env->icount_extra);
        env->icount_decr.u16.low = 0;
        env->icount_extra = 0;
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        /* Maintain prior (possibly buggy) behaviour where if no deadline
         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
         * nanoseconds.
         */
        if ((deadline < 0) || (deadline > INT32_MAX)) {
            deadline = INT32_MAX;
        }

        count = qemu_icount_round(deadline);
        qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        env->icount_decr.u16.low = decr;
        env->icount_extra = count;
    }
    ret = cpu_exec(env);
#ifdef CONFIG_PROFILER
    qemu_time += profile_getclock() - ti;
#endif
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag. */
        qemu_icount -= (env->icount_decr.u16.low
                        + env->icount_extra);
        env->icount_decr.u32 = 0;
        env->icount_extra = 0;
    }
    return ret;
}

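/* Worked example (added commentary): with icount_time_shift == 3 and a
 * 1 ms deadline, count = qemu_icount_round(1000000) == 125000 instructions.
 * Only the low 16 bits fit in icount_decr.u16.low, so decr == 0xffff
 * (65535) and env->icount_extra holds the remaining 59465; the execution
 * loop refills the 16-bit budget from icount_extra as it drains.
 */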
static void tcg_exec_all(void)
{
    int r;

    /* Account partial waits to QEMU_CLOCK_VIRTUAL. */
    qemu_clock_warp(QEMU_CLOCK_VIRTUAL);

    if (next_cpu == NULL) {
        next_cpu = first_cpu;
    }
    for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) {
        CPUState *cpu = next_cpu;
        CPUArchState *env = cpu->env_ptr;

        qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                          (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

        if (cpu_can_run(cpu)) {
            r = tcg_cpu_exec(env);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
                break;
            }
        } else if (cpu->stop || cpu->stopped) {
            break;
        }
    }
    exit_request = 0;
}

void set_numa_modes(void)
{
    CPUState *cpu;
    int i;

    CPU_FOREACH(cpu) {
        for (i = 0; i < nb_numa_nodes; i++) {
            if (test_bit(cpu->cpu_index, node_cpumask[i])) {
                cpu->numa_node = i;
            }
        }
    }
}

void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}

CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
#endif

        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->has_pc = true;
        info->value->pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->has_nip = true;
        info->value->nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->has_pc = true;
        info->value->pc = env->pc;
        info->value->has_npc = true;
        info->value->npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->has_PC = true;
        info->value->PC = env->active_tc.PC;
#endif

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

void qmp_memsave(int64_t addr, int64_t size, const char *filename,
                 bool has_cpu, int64_t cpu_index, Error **errp)
{
    FILE *f;
    uint32_t l;
    CPUState *cpu;
    uint8_t buf[1024];

    if (!has_cpu) {
        cpu_index = 0;
    }

    cpu = qemu_get_cpu(cpu_index);
    if (cpu == NULL) {
        error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
                  "a CPU number");
        return;
    }

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        cpu_memory_rw_debug(cpu, addr, buf, l, 0);
        if (fwrite(buf, 1, l, f) != l) {
            error_set(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
                  Error **errp)
{
    FILE *f;
    uint32_t l;
    uint8_t buf[1024];

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        cpu_physical_memory_rw(addr, buf, l, 0);
        if (fwrite(buf, 1, l, f) != l) {
            error_set(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_inject_nmi(Error **errp)
{
#if defined(TARGET_I386)
    CPUState *cs;

    CPU_FOREACH(cs) {
        X86CPU *cpu = X86_CPU(cs);
        CPUX86State *env = &cpu->env;

        if (!env->apic_state) {
            cpu_interrupt(cs, CPU_INTERRUPT_NMI);
        } else {
            apic_deliver_nmi(env->apic_state);
        }
    }
#elif defined(TARGET_S390X)
    CPUState *cs;
    S390CPU *cpu;

    CPU_FOREACH(cs) {
        cpu = S390_CPU(cs);
        if (cpu->env.cpu_num == monitor_get_cpu_index()) {
            if (s390_cpu_restart(S390_CPU(cs)) == -1) {
                error_set(errp, QERR_UNSUPPORTED);
                return;
            }
            break;
        }
    }
#else
    error_set(errp, QERR_UNSUPPORTED);
#endif
}