/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Needed early for CONFIG_BSD etc. */
#include "config-host.h"

#include "monitor.h"
#include "sysemu.h"
#include "gdbstub.h"
#include "dma.h"
#include "kvm.h"

#include "qemu-thread.h"
#include "cpus.h"

#ifndef _WIN32
#include "compatfd.h"
#endif

#ifdef SIGRTMIN
#define SIG_IPI (SIGRTMIN+4)
#else
#define SIG_IPI SIGUSR1
#endif

#ifdef CONFIG_LINUX

#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */

static CPUState *next_cpu;

/***********************************************************/
/* guest cycle counter */

/* Conversion factor from emulated instructions to virtual clock ticks. */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed. */
#define MAX_ICOUNT_SHIFT 10
/* Compensate for varying guest execution speed. */
static int64_t qemu_icount_bias;
static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;
static int64_t vm_clock_warp_start;
static int64_t qemu_icount;

typedef struct TimersState {
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;
} TimersState;

TimersState timers_state;

/* Return the virtual CPU time, based on the instruction counter. */
int64_t cpu_get_icount(void)
{
    int64_t icount;
    CPUState *env = cpu_single_env;

    icount = qemu_icount;
    if (env) {
        if (!can_do_io(env)) {
            fprintf(stderr, "Bad clock read\n");
        }
        icount -= (env->icount_decr.u16.low + env->icount_extra);
    }
    return qemu_icount_bias + (icount << icount_time_shift);
}

/* return the host CPU cycle counter and handle stop/restart */
int64_t cpu_get_ticks(void)
{
    if (use_icount) {
        return cpu_get_icount();
    }
    if (!timers_state.cpu_ticks_enabled) {
        return timers_state.cpu_ticks_offset;
    } else {
        int64_t ticks;
        ticks = cpu_get_real_ticks();
        if (timers_state.cpu_ticks_prev > ticks) {
            /* Note: non increasing ticks may happen if the host uses
               software suspend */
            timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        }
        timers_state.cpu_ticks_prev = ticks;
        return ticks + timers_state.cpu_ticks_offset;
    }
}

/* return the host CPU monotonic timer and handle stop/restart */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    if (!timers_state.cpu_ticks_enabled) {
        return timers_state.cpu_clock_offset;
    } else {
        ti = get_clock();
        return ti + timers_state.cpu_clock_offset;
    }
}

/* enable cpu_get_ticks() */
void cpu_enable_ticks(void)
{
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
}

/* disable cpu_get_ticks() : the clock is stopped. You must not call
   cpu_get_ticks() after that. */
void cpu_disable_ticks(void)
{
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset = cpu_get_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock();
        timers_state.cpu_ticks_enabled = 0;
    }
}

/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop. */
#define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)

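/* Adjust icount_time_shift so that virtual time, as seen through
 * cpu_get_icount(), tracks real time: slow the virtual clock down when
 * the guest runs ahead of cpu_get_clock(), speed it up when it falls
 * behind. */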
static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;
    static int64_t last_delta;
    /* If the VM is not running, then do nothing. */
    if (!runstate_is_running()) {
        return;
    }
    cur_time = cpu_get_clock();
    cur_icount = qemu_get_clock_ns(vm_clock);
    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead. Slow time down. */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind. Speed time up. */
        icount_time_shift++;
    }
    last_delta = delta;
    qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift);
}

static void icount_adjust_rt(void *opaque)
{
    qemu_mod_timer(icount_rt_timer,
                   qemu_get_clock_ms(rt_clock) + 1000);
    icount_adjust();
}

static void icount_adjust_vm(void *opaque)
{
    qemu_mod_timer(icount_vm_timer,
                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
    icount_adjust();
}

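/* Convert a vm_clock deadline in nanoseconds into the number of guest
 * instructions that may run before it expires, rounding up. */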
static int64_t qemu_icount_round(int64_t count)
{
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
}

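/* Catch vm_clock up with real time: fold the real time elapsed since
 * vm_clock_warp_start into qemu_icount_bias, capped in adaptive mode so
 * that vm_clock does not run ahead of cpu_get_clock(). */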
static void icount_warp_rt(void *opaque)
{
    if (vm_clock_warp_start == -1) {
        return;
    }

    if (runstate_is_running()) {
        int64_t clock = qemu_get_clock_ns(rt_clock);
        int64_t warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 1) {
            qemu_icount_bias += warp_delta;
        } else {
            /*
             * In adaptive mode, do not let the vm_clock run too
             * far ahead of real time.
             */
            int64_t cur_time = cpu_get_clock();
            int64_t cur_icount = qemu_get_clock_ns(vm_clock);
            int64_t delta = cur_time - cur_icount;
            qemu_icount_bias += MIN(warp_delta, delta);
        }
        if (qemu_clock_expired(vm_clock)) {
            qemu_notify_event();
        }
    }
    vm_clock_warp_start = -1;
}

void qemu_clock_warp(QEMUClock *clock)
{
    int64_t deadline;

    /*
     * There are too many global variables to make the "warp" behavior
     * applicable to other clocks. But a clock argument removes the
     * need for if statements all over the place.
     */
    if (clock != vm_clock || !use_icount) {
        return;
    }

    /*
     * If the CPUs have been sleeping, advance the vm_clock timer now. This
     * ensures that the deadline for the timer is computed correctly below.
     * This also makes sure that the insn counter is synchronized before the
     * CPU starts running, in case the CPU is woken by an event other than
     * the earliest vm_clock timer.
     */
    icount_warp_rt(NULL);
    if (!all_cpu_threads_idle() || !qemu_clock_has_timers(vm_clock)) {
        qemu_del_timer(icount_warp_timer);
        return;
    }

    vm_clock_warp_start = qemu_get_clock_ns(rt_clock);
    deadline = qemu_clock_deadline(vm_clock);
    if (deadline > 0) {
        /*
         * Ensure the vm_clock proceeds even when the virtual CPU goes to
         * sleep. Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * vm_clock.
         *
         * An extreme solution for this problem would be to never let VCPUs
         * sleep in icount mode if there is a pending vm_clock timer; rather,
         * time could just advance to the next vm_clock event. Instead, we
         * do stop VCPUs and only advance vm_clock after some "real" time
         * (related to the time left until the next event) has passed. The
         * rt_clock timer below does this. This avoids making the warps too
         * visible externally---for example, you will not be sending network
         * packets continuously instead of every 100ms.
         */
        qemu_mod_timer(icount_warp_timer, vm_clock_warp_start + deadline);
    } else {
        qemu_notify_event();
    }
}

static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .minimum_version_id_old = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    }
};

void configure_icount(const char *option)
{
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    if (!option) {
        return;
    }

    icount_warp_timer = qemu_new_timer_ns(rt_clock, icount_warp_rt, NULL);
    if (strcmp(option, "auto") != 0) {
        icount_time_shift = strtol(option, NULL, 0);
        use_icount = 1;
        return;
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway. */
    icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers. */
    icount_rt_timer = qemu_new_timer_ms(rt_clock, icount_adjust_rt, NULL);
    qemu_mod_timer(icount_rt_timer,
                   qemu_get_clock_ms(rt_clock) + 1000);
    icount_vm_timer = qemu_new_timer_ns(vm_clock, icount_adjust_vm, NULL);
    qemu_mod_timer(icount_vm_timer,
                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
}

/***********************************************************/
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *env;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        fprintf(stderr, "CPU #%d:\n", env->cpu_index);
#ifdef TARGET_I386
        cpu_dump_state(env, stderr, fprintf, X86_DUMP_FPU);
#else
        cpu_dump_state(env, stderr, fprintf, 0);
#endif
    }
    va_end(ap);
    abort();
}

void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
        cpu_synchronize_state(cpu);
    }
}

void cpu_synchronize_all_post_reset(void)
{
    CPUState *cpu;

    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
        cpu_synchronize_post_reset(cpu);
    }
}

void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
        cpu_synchronize_post_init(cpu);
    }
}

int cpu_is_stopped(CPUState *env)
{
    return !runstate_is_running() || env->stopped;
}

static void do_vm_stop(RunState state)
{
    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        qemu_aio_flush();
        bdrv_flush_all();
        monitor_protocol_event(QEVENT_STOP, NULL);
    }
}

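/* Return 1 if env may execute guest code: no stop request, not stopped,
 * and the VM as a whole is running. */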
static int cpu_can_run(CPUState *env)
{
    if (env->stop) {
        return 0;
    }
    if (env->stopped || !runstate_is_running()) {
        return 0;
    }
    return 1;
}

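/* A vCPU thread is idle when the vCPU is halted with no pending work or
 * stop request. With KVM's in-kernel irqchip the halt state lives inside
 * the kernel, so the thread is never treated as idle here. */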
static bool cpu_thread_is_idle(CPUState *env)
{
    if (env->stop || env->queued_work_first) {
        return false;
    }
    if (env->stopped || !runstate_is_running()) {
        return true;
    }
    if (!env->halted || qemu_cpu_has_work(env) ||
        (kvm_enabled() && kvm_irqchip_in_kernel())) {
        return false;
    }
    return true;
}

bool all_cpu_threads_idle(void)
{
    CPUState *env;

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        if (!cpu_thread_is_idle(env)) {
            return false;
        }
    }
    return true;
}

static void cpu_handle_guest_debug(CPUState *env)
{
    gdb_set_stop_cpu(env);
    qemu_system_debug_request();
    env->stopped = 1;
}

static void cpu_signal(int sig)
{
    if (cpu_single_env) {
        cpu_exit(cpu_single_env);
    }
    exit_request = 1;
}

#ifdef CONFIG_LINUX
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        sigprocmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}

static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code,
                      (void *)(intptr_t)siginfo->ssi_addr)) {
        sigbus_reraise();
    }
}

static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}

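/* Drain any pending SIG_IPI and SIGBUS without blocking, forwarding
 * SIGBUS machine-check reports for this vCPU to KVM. */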
static void qemu_kvm_eat_signals(CPUState *env)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr)) {
                sigbus_reraise();
            }
            break;
        default:
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
}

#else /* !CONFIG_LINUX */

static void qemu_init_sigbus(void)
{
}

static void qemu_kvm_eat_signals(CPUState *env)
{
}
#endif /* !CONFIG_LINUX */

#ifndef _WIN32
static int io_thread_fd = -1;

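/* Wake the main loop by writing to the notification pipe (or eventfd). */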
static void qemu_event_increment(void)
{
    /* Write 8 bytes to be compatible with eventfd. */
    static const uint64_t val = 1;
    ssize_t ret;

    if (io_thread_fd == -1) {
        return;
    }
    do {
        ret = write(io_thread_fd, &val, sizeof(val));
    } while (ret < 0 && errno == EINTR);

    /* EAGAIN is fine, a read must be pending. */
    if (ret < 0 && errno != EAGAIN) {
        fprintf(stderr, "qemu_event_increment: write() failed: %s\n",
                strerror(errno));
        exit(1);
    }
}

static void qemu_event_read(void *opaque)
{
    int fd = (intptr_t)opaque;
    ssize_t len;
    char buffer[512];

    /* Drain the notify pipe. For eventfd, only 8 bytes will be read. */
    do {
        len = read(fd, buffer, sizeof(buffer));
    } while ((len == -1 && errno == EINTR) || len == sizeof(buffer));
}

static int qemu_event_init(void)
{
    int err;
    int fds[2];

    err = qemu_eventfd(fds);
    if (err == -1) {
        return -errno;
    }
    err = fcntl_setfl(fds[0], O_NONBLOCK);
    if (err < 0) {
        goto fail;
    }
    err = fcntl_setfl(fds[1], O_NONBLOCK);
    if (err < 0) {
        goto fail;
    }
    qemu_set_fd_handler2(fds[0], NULL, qemu_event_read, NULL,
                         (void *)(intptr_t)fds[0]);

    io_thread_fd = fds[1];
    return 0;

fail:
    close(fds[0]);
    close(fds[1]);
    return err;
}

static void dummy_signal(int sig)
{
}

/* If we have signalfd, we mask out the signals we want to handle and then
 * use signalfd to listen for them. We rely on whatever the current signal
 * handler is to dispatch the signals when we receive them.
 */
static void sigfd_handler(void *opaque)
{
    int fd = (intptr_t)opaque;
    struct qemu_signalfd_siginfo info;
    struct sigaction action;
    ssize_t len;

    while (1) {
        do {
            len = read(fd, &info, sizeof(info));
        } while (len == -1 && errno == EINTR);

        if (len == -1 && errno == EAGAIN) {
            break;
        }

        if (len != sizeof(info)) {
            printf("read from sigfd returned %zd: %m\n", len);
            return;
        }

        sigaction(info.ssi_signo, NULL, &action);
        if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
            action.sa_sigaction(info.ssi_signo,
                                (siginfo_t *)&info, NULL);
        } else if (action.sa_handler) {
            action.sa_handler(info.ssi_signo);
        }
    }
}

static int qemu_signal_init(void)
{
    int sigfd;
    sigset_t set;

    /*
     * SIG_IPI must be blocked in the main thread and must not be caught
     * by sigwait() in the signal thread. Otherwise, the cpu thread will
     * not catch it reliably.
     */
    sigemptyset(&set);
    sigaddset(&set, SIG_IPI);
    pthread_sigmask(SIG_BLOCK, &set, NULL);

    sigemptyset(&set);
    sigaddset(&set, SIGIO);
    sigaddset(&set, SIGALRM);
    sigaddset(&set, SIGBUS);
    pthread_sigmask(SIG_BLOCK, &set, NULL);

    sigfd = qemu_signalfd(&set);
    if (sigfd == -1) {
        fprintf(stderr, "failed to create signalfd\n");
        return -errno;
    }

    fcntl_setfl(sigfd, O_NONBLOCK);

    qemu_set_fd_handler2(sigfd, NULL, sigfd_handler, NULL,
                         (void *)(intptr_t)sigfd);

    return 0;
}

static void qemu_kvm_init_cpu_signals(CPUState *env)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);
    r = kvm_set_signal_mask(env, &set);
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

static void qemu_tcg_init_cpu_signals(void)
{
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = cpu_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    sigemptyset(&set);
    sigaddset(&set, SIG_IPI);
    pthread_sigmask(SIG_UNBLOCK, &set, NULL);
}

#else /* _WIN32 */

HANDLE qemu_event_handle;

static void dummy_event_handler(void *opaque)
{
}

static int qemu_event_init(void)
{
    qemu_event_handle = CreateEvent(NULL, FALSE, FALSE, NULL);
    if (!qemu_event_handle) {
        fprintf(stderr, "Failed CreateEvent: %ld\n", GetLastError());
        return -1;
    }
    qemu_add_wait_object(qemu_event_handle, dummy_event_handler, NULL);
    return 0;
}

static void qemu_event_increment(void)
{
    if (!SetEvent(qemu_event_handle)) {
        fprintf(stderr, "qemu_event_increment: SetEvent failed: %ld\n",
                GetLastError());
        exit(1);
    }
}

static int qemu_signal_init(void)
{
    return 0;
}

static void qemu_kvm_init_cpu_signals(CPUState *env)
{
    abort();
}

static void qemu_tcg_init_cpu_signals(void)
{
}
#endif /* _WIN32 */

QemuMutex qemu_global_mutex;
static QemuCond qemu_io_proceeded_cond;
static bool iothread_requesting_mutex;

static QemuThread io_thread;

static QemuThread *tcg_cpu_thread;
static QemuCond *tcg_halt_cond;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;
static QemuCond qemu_work_cond;

int qemu_init_main_loop(void)
{
    int ret;

    qemu_init_sigbus();

    ret = qemu_signal_init();
    if (ret) {
        return ret;
    }

    /* Note eventfd must be drained before signalfd handlers run */
    ret = qemu_event_init();
    if (ret) {
        return ret;
    }

    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_cond_init(&qemu_work_cond);
    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);
    qemu_mutex_lock(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);

    return 0;
}

void qemu_main_loop_start(void)
{
    resume_all_vcpus();
}

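/* Run func(data) on env's vCPU thread and wait until it has completed.
 * The work item lives on the caller's stack; the target thread pops it
 * in flush_queued_work() and broadcasts qemu_work_cond when done. */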
void run_on_cpu(CPUState *env, void (*func)(void *data), void *data)
{
    struct qemu_work_item wi;

    if (qemu_cpu_is_self(env)) {
        func(data);
        return;
    }

    wi.func = func;
    wi.data = data;
    if (!env->queued_work_first) {
        env->queued_work_first = &wi;
    } else {
        env->queued_work_last->next = &wi;
    }
    env->queued_work_last = &wi;
    wi.next = NULL;
    wi.done = false;

    qemu_cpu_kick(env);
    while (!wi.done) {
        CPUState *self_env = cpu_single_env;

        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
        cpu_single_env = self_env;
    }
}

static void flush_queued_work(CPUState *env)
{
    struct qemu_work_item *wi;

    if (!env->queued_work_first) {
        return;
    }

    while ((wi = env->queued_work_first)) {
        env->queued_work_first = wi->next;
        wi->func(wi->data);
        wi->done = true;
    }
    env->queued_work_last = NULL;
    qemu_cond_broadcast(&qemu_work_cond);
}

static void qemu_wait_io_event_common(CPUState *env)
{
    if (env->stop) {
        env->stop = 0;
        env->stopped = 1;
        qemu_cond_signal(&qemu_pause_cond);
    }
    flush_queued_work(env);
    env->thread_kicked = false;
}

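/* Sleep the single TCG thread while every vCPU is idle, and let the
 * iothread take the global mutex when it has requested it. */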
static void qemu_tcg_wait_io_event(void)
{
    CPUState *env;

    while (all_cpu_threads_idle()) {
        /* Start accounting real time to the virtual clock if the CPUs
           are idle. */
        qemu_clock_warp(vm_clock);
        qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
    }

    while (iothread_requesting_mutex) {
        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        qemu_wait_io_event_common(env);
    }
}

static void qemu_kvm_wait_io_event(CPUState *env)
{
    while (cpu_thread_is_idle(env)) {
        qemu_cond_wait(env->halt_cond, &qemu_global_mutex);
    }

    qemu_kvm_eat_signals(env);
    qemu_wait_io_event_common(env);
}

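/* Per-vCPU thread body for KVM: create the kernel vCPU, set up its
 * signal mask, then loop between kvm_cpu_exec() and idle waiting. */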
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *env = arg;
    int r;

    qemu_mutex_lock(&qemu_global_mutex);
    qemu_thread_get_self(env->thread);
    env->thread_id = qemu_get_thread_id();

    r = kvm_init_vcpu(env);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    qemu_kvm_init_cpu_signals(env);

    /* signal CPU creation */
    env->created = 1;
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        if (cpu_can_run(env)) {
            r = kvm_cpu_exec(env);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(env);
            }
        }
        qemu_kvm_wait_io_event(env);
    }

    return NULL;
}

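/* Thread body for TCG: a single thread runs all vCPUs, multiplexed
 * round robin by cpu_exec_all(). */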
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *env = arg;

    qemu_tcg_init_cpu_signals();
    qemu_thread_get_self(env->thread);

    /* signal CPU creation */
    qemu_mutex_lock(&qemu_global_mutex);
    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        env->thread_id = qemu_get_thread_id();
        env->created = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
    }

    while (1) {
        cpu_exec_all();
        if (use_icount && qemu_clock_deadline(vm_clock) <= 0) {
            qemu_notify_event();
        }
        qemu_tcg_wait_io_event();
    }

    return NULL;
}

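/* Force a vCPU thread out of guest code: SIG_IPI on POSIX hosts; on
 * Windows, suspend the thread, raise the exit request, and resume it. */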
static void qemu_cpu_kick_thread(CPUState *env)
{
#ifndef _WIN32
    int err;

    err = pthread_kill(env->thread->thread, SIG_IPI);
    if (err) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    if (!qemu_cpu_is_self(env)) {
        SuspendThread(env->thread->thread);
        cpu_signal(0);
        ResumeThread(env->thread->thread);
    }
#endif
}

void qemu_cpu_kick(void *_env)
{
    CPUState *env = _env;

    qemu_cond_broadcast(env->halt_cond);
    if (kvm_enabled() && !env->thread_kicked) {
        qemu_cpu_kick_thread(env);
        env->thread_kicked = true;
    }
}

void qemu_cpu_kick_self(void)
{
#ifndef _WIN32
    assert(cpu_single_env);

    if (!cpu_single_env->thread_kicked) {
        qemu_cpu_kick_thread(cpu_single_env);
        cpu_single_env->thread_kicked = true;
    }
#else
    abort();
#endif
}

int qemu_cpu_is_self(void *_env)
{
    CPUState *env = _env;

    return qemu_thread_is_self(env->thread);
}

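/* Take the global mutex from the iothread. Under TCG, kick the vCPU
 * thread first so it drops the mutex instead of holding it for a whole
 * execution slice; iothread_requesting_mutex keeps the TCG thread from
 * re-acquiring it immediately. */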
void qemu_mutex_lock_iothread(void)
{
    if (kvm_enabled()) {
        qemu_mutex_lock(&qemu_global_mutex);
    } else {
        iothread_requesting_mutex = true;
        if (qemu_mutex_trylock(&qemu_global_mutex)) {
            qemu_cpu_kick_thread(first_cpu);
            qemu_mutex_lock(&qemu_global_mutex);
        }
        iothread_requesting_mutex = false;
        qemu_cond_broadcast(&qemu_io_proceeded_cond);
    }
}

void qemu_mutex_unlock_iothread(void)
{
    qemu_mutex_unlock(&qemu_global_mutex);
}

static int all_vcpus_paused(void)
{
    CPUState *penv = first_cpu;

    while (penv) {
        if (!penv->stopped) {
            return 0;
        }
        penv = (CPUState *)penv->next_cpu;
    }

    return 1;
}

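/* Request every vCPU to stop, then wait (releasing the global mutex in
 * qemu_cond_wait) until all of them have reported themselves stopped. */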
void pause_all_vcpus(void)
{
    CPUState *penv = first_cpu;

    qemu_clock_enable(vm_clock, false);
    while (penv) {
        penv->stop = 1;
        qemu_cpu_kick(penv);
        penv = (CPUState *)penv->next_cpu;
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        penv = first_cpu;
        while (penv) {
            qemu_cpu_kick(penv);
            penv = (CPUState *)penv->next_cpu;
        }
    }
}

void resume_all_vcpus(void)
{
    CPUState *penv = first_cpu;

    while (penv) {
        penv->stop = 0;
        penv->stopped = 0;
        qemu_cpu_kick(penv);
        penv = (CPUState *)penv->next_cpu;
    }
}

static void qemu_tcg_init_vcpu(void *_env)
{
    CPUState *env = _env;

    /* share a single thread for all cpus with TCG */
    if (!tcg_cpu_thread) {
        env->thread = g_malloc0(sizeof(QemuThread));
        env->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(env->halt_cond);
        tcg_halt_cond = env->halt_cond;
        qemu_thread_create(env->thread, qemu_tcg_cpu_thread_fn, env);
        while (env->created == 0) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
        tcg_cpu_thread = env->thread;
    } else {
        env->thread = tcg_cpu_thread;
        env->halt_cond = tcg_halt_cond;
    }
}

static void qemu_kvm_start_vcpu(CPUState *env)
{
    env->thread = g_malloc0(sizeof(QemuThread));
    env->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(env->halt_cond);
    qemu_thread_create(env->thread, qemu_kvm_cpu_thread_fn, env);
    while (env->created == 0) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

void qemu_init_vcpu(void *_env)
{
    CPUState *env = _env;

    env->nr_cores = smp_cores;
    env->nr_threads = smp_threads;
    env->stopped = 1;
    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(env);
    } else {
        qemu_tcg_init_vcpu(env);
    }
}

void qemu_notify_event(void)
{
    qemu_event_increment();
}

void cpu_stop_current(void)
{
    if (cpu_single_env) {
        cpu_single_env->stop = 0;
        cpu_single_env->stopped = 1;
        cpu_exit(cpu_single_env);
        qemu_cond_signal(&qemu_pause_cond);
    }
}

void vm_stop(RunState state)
{
    if (!qemu_thread_is_self(&io_thread)) {
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return;
    }
    do_vm_stop(state);
}

/* does a state transition even if the VM is already stopped,
   current state is forgotten forever */
void vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        vm_stop(state);
    } else {
        runstate_set(state);
    }
}

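/* Execute guest code on one vCPU. In icount mode, budget the number of
 * instructions until the next vm_clock deadline, split between the
 * 16-bit icount_decr counter and icount_extra, and fold any unexecuted
 * instructions back afterwards. */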
static int tcg_cpu_exec(CPUState *env)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    if (use_icount) {
        int64_t count;
        int decr;
        qemu_icount -= (env->icount_decr.u16.low + env->icount_extra);
        env->icount_decr.u16.low = 0;
        env->icount_extra = 0;
        count = qemu_icount_round(qemu_clock_deadline(vm_clock));
        qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        env->icount_decr.u16.low = decr;
        env->icount_extra = count;
    }
    ret = cpu_exec(env);
#ifdef CONFIG_PROFILER
    qemu_time += profile_getclock() - ti;
#endif
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag. */
        qemu_icount -= (env->icount_decr.u16.low
                        + env->icount_extra);
        env->icount_decr.u32 = 0;
        env->icount_extra = 0;
    }
    return ret;
}

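/* Round-robin over all vCPUs, running each in turn until a stop or exit
 * is requested; returns true while any vCPU still has work to do. */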
bool cpu_exec_all(void)
{
    int r;

    /* Account partial waits to the vm_clock. */
    qemu_clock_warp(vm_clock);

    if (next_cpu == NULL) {
        next_cpu = first_cpu;
    }
    for (; next_cpu != NULL && !exit_request; next_cpu = next_cpu->next_cpu) {
        CPUState *env = next_cpu;

        qemu_clock_enable(vm_clock,
                          (env->singlestep_enabled & SSTEP_NOTIMER) == 0);

        if (cpu_can_run(env)) {
            if (kvm_enabled()) {
                r = kvm_cpu_exec(env);
                qemu_kvm_eat_signals(env);
            } else {
                r = tcg_cpu_exec(env);
            }
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(env);
                break;
            }
        } else if (env->stop || env->stopped) {
            break;
        }
    }
    exit_request = 0;
    return !all_cpu_threads_idle();
}

void set_numa_modes(void)
{
    CPUState *env;
    int i;

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        for (i = 0; i < nb_numa_nodes; i++) {
            if (node_cpumask[i] & (1 << env->cpu_index)) {
                env->numa_node = i;
            }
        }
    }
}

void set_cpu_log(const char *optarg)
{
    int mask;
    const CPULogItem *item;

    mask = cpu_str_to_log_mask(optarg);
    if (!mask) {
        printf("Log items (comma separated):\n");
        for (item = cpu_log_items; item->mask != 0; item++) {
            printf("%-10s %s\n", item->name, item->help);
        }
        exit(1);
    }
    cpu_set_log(mask);
}

void set_cpu_log_filename(const char *optarg)
{
    cpu_set_log_filename(optarg);
}

void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list_id)
    cpu_list_id(f, cpu_fprintf, optarg);
#elif defined(cpu_list)
    cpu_list(f, cpu_fprintf); /* deprecated */
#endif
}