util/oslib-posix.c

   1 /*
   2  * os-posix-lib.c
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2010 Red Hat, Inc.
   6  *
   7  * QEMU library functions on POSIX which are shared between QEMU and
   8  * the QEMU tools.
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include <termios.h>
  31
  32 #include <glib/gprintf.h>
  33
  34 #include "sysemu/sysemu.h"
  35 #include "trace.h"
  36 #include "qapi/error.h"
  37 #include "qemu/error-report.h"
  38 #include "qemu/madvise.h"
  39 #include "qemu/sockets.h"
  40 #include "qemu/thread.h"
  41 #include <libgen.h>
  42 #include "qemu/cutils.h"
  43 #include "qemu/compiler.h"
  44 #include "qemu/units.h"
  45
  46 #ifdef CONFIG_LINUX
  47 #include <sys/syscall.h>
  48 #endif
  49
  50 #ifdef __FreeBSD__
  51 #include <sys/sysctl.h>
  52 #include <sys/user.h>
  53 #include <sys/thr.h>
  54 #include <libutil.h>
  55 #endif
  56
  57 #ifdef __NetBSD__
  58 #include <sys/sysctl.h>
  59 #include <lwp.h>
  60 #endif
  61
  62 #ifdef __APPLE__
  63 #include <mach-o/dyld.h>
  64 #endif
  65
  66 #ifdef __HAIKU__
  67 #include <kernel/image.h>
  68 #endif
  69
  70 #include "qemu/mmap-alloc.h"
  71
  72 #ifdef CONFIG_DEBUG_STACK_USAGE
  73 #include "qemu/error-report.h"
  74 #endif
  75
  76 #define MAX_MEM_PREALLOC_THREAD_COUNT 16
  77
  78 struct MemsetThread;
  79
  80 typedef struct MemsetContext {
  81     bool all_threads_created;
  82     bool any_thread_failed;
  83     struct MemsetThread *threads;
  84     int num_threads;
  85 } MemsetContext;
  86
  87 struct MemsetThread {
  88     char *addr;
  89     size_t numpages;
  90     size_t hpagesize;
  91     QemuThread pgthread;
  92     sigjmp_buf env;
  93     MemsetContext *context;
  94 };
  95 typedef struct MemsetThread MemsetThread;
  96
  97 /* used by sigbus_handler() */
  98 static MemsetContext *sigbus_memset_context;
  99 struct sigaction sigbus_oldact;
 100 static QemuMutex sigbus_mutex;
 101
 102 static QemuMutex page_mutex;
 103 static QemuCond page_cond;
 104
 105 int qemu_get_thread_id(void)
 106 {
 107 #if defined(__linux__)
 108     return syscall(SYS_gettid);
 109 #elif defined(__FreeBSD__)
 110     /* thread id is up to INT_MAX */
 111     long tid;
 112     thr_self(&tid);
 113     return (int)tid;
 114 #elif defined(__NetBSD__)
 115     return _lwp_self();
 116 #elif defined(__OpenBSD__)
 117     return getthrid();
 118 #else
 119     return getpid();
 120 #endif
 121 }
 122
 123 int qemu_daemon(int nochdir, int noclose)
 124 {
 125     return daemon(nochdir, noclose);
 126 }
 127
 128 bool qemu_write_pidfile(const char *path, Error **errp)
 129 {
 130     int fd;
 131     char pidstr[32];
 132
 133     while (1) {
 134         struct stat a, b;
 135         struct flock lock = {
 136             .l_type = F_WRLCK,
 137             .l_whence = SEEK_SET,
 138             .l_len = 0,
 139         };
 140
 141         fd = qemu_create(path, O_WRONLY, S_IRUSR | S_IWUSR, errp);
 142         if (fd == -1) {
 143             return false;
 144         }
 145
 146         if (fstat(fd, &b) < 0) {
 147             error_setg_errno(errp, errno, "Cannot stat file");
 148             goto fail_close;
 149         }
 150
 151         if (fcntl(fd, F_SETLK, &lock)) {
 152             error_setg_errno(errp, errno, "Cannot lock pid file");
 153             goto fail_close;
 154         }
 155
 156         /*
 157          * Now make sure the path we locked is the same one that now
 158          * exists on the filesystem.
 159          */
 160         if (stat(path, &a) < 0) {
 161             /*
 162              * PID file disappeared, someone else must be racing with
 163              * us, so try again.
 164              */
 165             close(fd);
 166             continue;
 167         }
 168
 169         if (a.st_ino == b.st_ino) {
 170             break;
 171         }
 172
 173         /*
 174          * PID file was recreated, someone else must be racing with
 175          * us, so try again.
 176          */
 177         close(fd);
 178     }
 179
 180     if (ftruncate(fd, 0) < 0) {
 181         error_setg_errno(errp, errno, "Failed to truncate pid file");
 182         goto fail_unlink;
 183     }
 184
 185     snprintf(pidstr, sizeof(pidstr), FMT_pid "\n", getpid());
 186     if (qemu_write_full(fd, pidstr, strlen(pidstr)) != strlen(pidstr)) {
 187         error_setg(errp, "Failed to write pid file");
 188         goto fail_unlink;
 189     }
 190
 191     return true;
 192
 193 fail_unlink:
 194     unlink(path);
 195 fail_close:
 196     close(fd);
 197     return false;
 198 }
 199
 200 /* alloc shared memory pages */
 201 void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared,
 202                           bool noreserve)
 203 {
 204     const uint32_t qemu_map_flags = (shared ? QEMU_MAP_SHARED : 0) |
 205                                     (noreserve ? QEMU_MAP_NORESERVE : 0);
 206     size_t align = QEMU_VMALLOC_ALIGN;
 207     void *ptr = qemu_ram_mmap(-1, size, align, qemu_map_flags, 0);
 208
 209     if (ptr == MAP_FAILED) {
 210         return NULL;
 211     }
 212
 213     if (alignment) {
 214         *alignment = align;
 215     }
 216
 217     trace_qemu_anon_ram_alloc(size, ptr);
 218     return ptr;
 219 }
 220
 221 void qemu_anon_ram_free(void *ptr, size_t size)
 222 {
 223     trace_qemu_anon_ram_free(ptr, size);
 224     qemu_ram_munmap(-1, ptr, size);
 225 }
 226
 227 void qemu_set_block(int fd)
 228 {
 229     int f;
 230     f = fcntl(fd, F_GETFL);
 231     assert(f != -1);
 232     f = fcntl(fd, F_SETFL, f & ~O_NONBLOCK);
 233     assert(f != -1);
 234 }
 235
 236 int qemu_try_set_nonblock(int fd)
 237 {
 238     int f;
 239     f = fcntl(fd, F_GETFL);
 240     if (f == -1) {
 241         return -errno;
 242     }
 243     if (fcntl(fd, F_SETFL, f | O_NONBLOCK) == -1) {
 244         return -errno;
 245     }
 246     return 0;
 247 }
 248
 249 void qemu_set_nonblock(int fd)
 250 {
 251     int f;
 252     f = qemu_try_set_nonblock(fd);
 253     assert(f == 0);
 254 }
 255
 256 int socket_set_fast_reuse(int fd)
 257 {
 258     int val = 1, ret;
 259
 260     ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
 261                      (const char *)&val, sizeof(val));
 262
 263     assert(ret == 0);
 264
 265     return ret;
 266 }
 267
 268 void qemu_set_cloexec(int fd)
 269 {
 270     int f;
 271     f = fcntl(fd, F_GETFD);
 272     assert(f != -1);
 273     f = fcntl(fd, F_SETFD, f | FD_CLOEXEC);
 274     assert(f != -1);
 275 }
 276
 277 /*
 278  * Creates a pipe with FD_CLOEXEC set on both file descriptors
 279  */
 280 int qemu_pipe(int pipefd[2])
 281 {
 282     int ret;
 283
 284 #ifdef CONFIG_PIPE2
 285     ret = pipe2(pipefd, O_CLOEXEC);
 286     if (ret != -1 || errno != ENOSYS) {
 287         return ret;
 288     }
 289 #endif
 290     ret = pipe(pipefd);
 291     if (ret == 0) {
 292         qemu_set_cloexec(pipefd[0]);
 293         qemu_set_cloexec(pipefd[1]);
 294     }
 295
 296     return ret;
 297 }
 298
 299 char *
 300 qemu_get_local_state_dir(void)
 301 {
 302     return get_relocated_path(CONFIG_QEMU_LOCALSTATEDIR);
 303 }
 304
 305 void qemu_set_tty_echo(int fd, bool echo)
 306 {
 307     struct termios tty;
 308
 309     tcgetattr(fd, &tty);
 310
 311     if (echo) {
 312         tty.c_lflag |= ECHO | ECHONL | ICANON | IEXTEN;
 313     } else {
 314         tty.c_lflag &= ~(ECHO | ECHONL | ICANON | IEXTEN);
 315     }
 316
 317     tcsetattr(fd, TCSANOW, &tty);
 318 }
 319
 320 static const char *exec_dir;
 321
 322 void qemu_init_exec_dir(const char *argv0)
 323 {
 324     char *p = NULL;
 325     char buf[PATH_MAX];
 326
 327     if (exec_dir) {
 328         return;
 329     }
 330
 331 #if defined(__linux__)
 332     {
 333         int len;
 334         len = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
 335         if (len > 0) {
 336             buf[len] = 0;
 337             p = buf;
 338         }
 339     }
 340 #elif defined(__FreeBSD__) \
 341       || (defined(__NetBSD__) && defined(KERN_PROC_PATHNAME))
 342     {
 343 #if defined(__FreeBSD__)
 344         static int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
 345 #else
 346         static int mib[4] = {CTL_KERN, KERN_PROC_ARGS, -1, KERN_PROC_PATHNAME};
 347 #endif
 348         size_t len = sizeof(buf) - 1;
 349
 350         *buf = '\0';
 351         if (!sysctl(mib, ARRAY_SIZE(mib), buf, &len, NULL, 0) &&
 352             *buf) {
 353             buf[sizeof(buf) - 1] = '\0';
 354             p = buf;
 355         }
 356     }
 357 #elif defined(__APPLE__)
 358     {
 359         char fpath[PATH_MAX];
 360         uint32_t len = sizeof(fpath);
 361         if (_NSGetExecutablePath(fpath, &len) == 0) {
 362             p = realpath(fpath, buf);
 363             if (!p) {
 364                 return;
 365             }
 366         }
 367     }
 368 #elif defined(__HAIKU__)
 369     {
 370         image_info ii;
 371         int32_t c = 0;
 372
 373         *buf = '\0';
 374         while (get_next_image_info(0, &c, &ii) == B_OK) {
 375             if (ii.type == B_APP_IMAGE) {
 376                 strncpy(buf, ii.name, sizeof(buf));
 377                 buf[sizeof(buf) - 1] = 0;
 378                 p = buf;
 379                 break;
 380             }
 381         }
 382     }
 383 #endif
 384     /* If we don't have any way of figuring out the actual executable
 385        location then try argv[0].  */
 386     if (!p && argv0) {
 387         p = realpath(argv0, buf);
 388     }
 389     if (p) {
 390         exec_dir = g_path_get_dirname(p);
 391     } else {
 392         exec_dir = CONFIG_BINDIR;
 393     }
 394 }
 395
 396 const char *qemu_get_exec_dir(void)
 397 {
 398     return exec_dir;
 399 }
 400
 401 #ifdef CONFIG_LINUX
 402 static void sigbus_handler(int signal, siginfo_t *siginfo, void *ctx)
 403 #else /* CONFIG_LINUX */
 404 static void sigbus_handler(int signal)
 405 #endif /* CONFIG_LINUX */
 406 {
 407     int i;
 408
 409     if (sigbus_memset_context) {
 410         for (i = 0; i < sigbus_memset_context->num_threads; i++) {
 411             MemsetThread *thread = &sigbus_memset_context->threads[i];
 412
 413             if (qemu_thread_is_self(&thread->pgthread)) {
 414                 siglongjmp(thread->env, 1);
 415             }
 416         }
 417     }
 418
 419 #ifdef CONFIG_LINUX
 420     /*
 421      * We assume that the MCE SIGBUS handler could have been registered. We
 422      * should never receive BUS_MCEERR_AO on any of our threads, but only on
 423      * the main thread registered for PR_MCE_KILL_EARLY. Further, we should not
 424      * receive BUS_MCEERR_AR triggered by action of other threads on one of
 425      * our threads. So, no need to check for unrelated SIGBUS when seeing one
 426      * for our threads.
 427      *
 428      * We will forward to the MCE handler, which will either handle the SIGBUS
 429      * or reinstall the default SIGBUS handler and reraise the SIGBUS. The
 430      * default SIGBUS handler will crash the process, so we don't care.
 431      */
 432     if (sigbus_oldact.sa_flags & SA_SIGINFO) {
 433         sigbus_oldact.sa_sigaction(signal, siginfo, ctx);
 434         return;
 435     }
 436 #endif /* CONFIG_LINUX */
 437     warn_report("os_mem_prealloc: unrelated SIGBUS detected and ignored");
 438 }
 439
 440 static void *do_touch_pages(void *arg)
 441 {
 442     MemsetThread *memset_args = (MemsetThread *)arg;
 443     sigset_t set, oldset;
 444     int ret = 0;
 445
 446     /*
 447      * On Linux, the page faults from the loop below can cause mmap_sem
 448      * contention with allocation of the thread stacks.  Do not start
 449      * clearing until all threads have been created.
 450      */
 451     qemu_mutex_lock(&page_mutex);
 452     while (!memset_args->context->all_threads_created) {
 453         qemu_cond_wait(&page_cond, &page_mutex);
 454     }
 455     qemu_mutex_unlock(&page_mutex);
 456
 457     /* unblock SIGBUS */
 458     sigemptyset(&set);
 459     sigaddset(&set, SIGBUS);
 460     pthread_sigmask(SIG_UNBLOCK, &set, &oldset);
 461
 462     if (sigsetjmp(memset_args->env, 1)) {
 463         ret = -EFAULT;
 464     } else {
 465         char *addr = memset_args->addr;
 466         size_t numpages = memset_args->numpages;
 467         size_t hpagesize = memset_args->hpagesize;
 468         size_t i;
 469         for (i = 0; i < numpages; i++) {
 470             /*
 471              * Read & write back the same value, so we don't
 472              * corrupt existing user/app data that might be
 473              * stored.
 474              *
 475              * 'volatile' to stop compiler optimizing this away
 476              * to a no-op
 477              */
 478             *(volatile char *)addr = *addr;
 479             addr += hpagesize;
 480         }
 481     }
 482     pthread_sigmask(SIG_SETMASK, &oldset, NULL);
 483     return (void *)(uintptr_t)ret;
 484 }
 485
 486 static void *do_madv_populate_write_pages(void *arg)
 487 {
 488     MemsetThread *memset_args = (MemsetThread *)arg;
 489     const size_t size = memset_args->numpages * memset_args->hpagesize;
 490     char * const addr = memset_args->addr;
 491     int ret = 0;
 492
 493     /* See do_touch_pages(). */
 494     qemu_mutex_lock(&page_mutex);
 495     while (!memset_args->context->all_threads_created) {
 496         qemu_cond_wait(&page_cond, &page_mutex);
 497     }
 498     qemu_mutex_unlock(&page_mutex);
 499
 500     if (size && qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE)) {
 501         ret = -errno;
 502     }
 503     return (void *)(uintptr_t)ret;
 504 }
 505
 506 static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
 507                                          int smp_cpus)
 508 {
 509     long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
 510     int ret = 1;
 511
 512     if (host_procs > 0) {
 513         ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), smp_cpus);
 514     }
 515
 516     /* Especially with gigantic pages, don't create more threads than pages. */
 517     ret = MIN(ret, numpages);
 518     /* Don't start threads to prealloc comparatively little memory. */
 519     ret = MIN(ret, MAX(1, hpagesize * numpages / (64 * MiB)));
 520
 521     /* In case sysconf() fails, we fall back to single threaded */
 522     return ret;
 523 }
 524
 525 static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
 526                            int smp_cpus, bool use_madv_populate_write)
 527 {
 528     static gsize initialized = 0;
 529     MemsetContext context = {
 530         .num_threads = get_memset_num_threads(hpagesize, numpages, smp_cpus),
 531     };
 532     size_t numpages_per_thread, leftover;
 533     void *(*touch_fn)(void *);
 534     int ret = 0, i = 0;
 535     char *addr = area;
 536
 537     if (g_once_init_enter(&initialized)) {
 538         qemu_mutex_init(&page_mutex);
 539         qemu_cond_init(&page_cond);
 540         g_once_init_leave(&initialized, 1);
 541     }
 542
 543     if (use_madv_populate_write) {
 544         /* Avoid creating a single thread for MADV_POPULATE_WRITE */
 545         if (context.num_threads == 1) {
 546             if (qemu_madvise(area, hpagesize * numpages,
 547                              QEMU_MADV_POPULATE_WRITE)) {
 548                 return -errno;
 549             }
 550             return 0;
 551         }
 552         touch_fn = do_madv_populate_write_pages;
 553     } else {
 554         touch_fn = do_touch_pages;
 555     }
 556
 557     context.threads = g_new0(MemsetThread, context.num_threads);
 558     numpages_per_thread = numpages / context.num_threads;
 559     leftover = numpages % context.num_threads;
 560     for (i = 0; i < context.num_threads; i++) {
 561         context.threads[i].addr = addr;
 562         context.threads[i].numpages = numpages_per_thread + (i < leftover);
 563         context.threads[i].hpagesize = hpagesize;
 564         context.threads[i].context = &context;
 565         qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
 566                            touch_fn, &context.threads[i],
 567                            QEMU_THREAD_JOINABLE);
 568         addr += context.threads[i].numpages * hpagesize;
 569     }
 570
 571     if (!use_madv_populate_write) {
 572         sigbus_memset_context = &context;
 573     }
 574
 575     qemu_mutex_lock(&page_mutex);
 576     context.all_threads_created = true;
 577     qemu_cond_broadcast(&page_cond);
 578     qemu_mutex_unlock(&page_mutex);
 579
 580     for (i = 0; i < context.num_threads; i++) {
 581         int tmp = (uintptr_t)qemu_thread_join(&context.threads[i].pgthread);
 582
 583         if (tmp) {
 584             ret = tmp;
 585         }
 586     }
 587
 588     if (!use_madv_populate_write) {
 589         sigbus_memset_context = NULL;
 590     }
 591     g_free(context.threads);
 592
 593     return ret;
 594 }
 595
 596 static bool madv_populate_write_possible(char *area, size_t pagesize)
 597 {
 598     return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) ||
 599            errno != EINVAL;
 600 }
 601
 602 void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
 603                      Error **errp)
 604 {
 605     static gsize initialized;
 606     int ret;
 607     size_t hpagesize = qemu_fd_getpagesize(fd);
 608     size_t numpages = DIV_ROUND_UP(memory, hpagesize);
 609     bool use_madv_populate_write;
 610     struct sigaction act;
 611
 612     /*
 613      * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
 614      * some special mappings, such as mapping /dev/mem.
 615      */
 616     use_madv_populate_write = madv_populate_write_possible(area, hpagesize);
 617
 618     if (!use_madv_populate_write) {
 619         if (g_once_init_enter(&initialized)) {
 620             qemu_mutex_init(&sigbus_mutex);
 621             g_once_init_leave(&initialized, 1);
 622         }
 623
 624         qemu_mutex_lock(&sigbus_mutex);
 625         memset(&act, 0, sizeof(act));
 626 #ifdef CONFIG_LINUX
 627         act.sa_sigaction = &sigbus_handler;
 628         act.sa_flags = SA_SIGINFO;
 629 #else /* CONFIG_LINUX */
 630         act.sa_handler = &sigbus_handler;
 631         act.sa_flags = 0;
 632 #endif /* CONFIG_LINUX */
 633
 634         ret = sigaction(SIGBUS, &act, &sigbus_oldact);
 635         if (ret) {
 636             qemu_mutex_unlock(&sigbus_mutex);
 637             error_setg_errno(errp, errno,
 638                 "os_mem_prealloc: failed to install signal handler");
 639             return;
 640         }
 641     }
 642
 643     /* touch pages simultaneously */
 644     ret = touch_all_pages(area, hpagesize, numpages, smp_cpus,
 645                           use_madv_populate_write);
 646     if (ret) {
 647         error_setg_errno(errp, -ret,
 648                          "os_mem_prealloc: preallocating memory failed");
 649     }
 650
 651     if (!use_madv_populate_write) {
 652         ret = sigaction(SIGBUS, &sigbus_oldact, NULL);
 653         if (ret) {
 654             /* Terminate QEMU since it can't recover from error */
 655             perror("os_mem_prealloc: failed to reinstall signal handler");
 656             exit(1);
 657         }
 658         qemu_mutex_unlock(&sigbus_mutex);
 659     }
 660 }
 661
 662 char *qemu_get_pid_name(pid_t pid)
 663 {
 664     char *name = NULL;
 665
 666 #if defined(__FreeBSD__)
 667     /* BSDs don't have /proc, but they provide a nice substitute */
 668     struct kinfo_proc *proc = kinfo_getproc(pid);
 669
 670     if (proc) {
 671         name = g_strdup(proc->ki_comm);
 672         free(proc);
 673     }
 674 #else
 675     /* Assume a system with reasonable procfs */
 676     char *pid_path;
 677     size_t len;
 678
 679     pid_path = g_strdup_printf("/proc/%d/cmdline", pid);
 680     g_file_get_contents(pid_path, &name, &len, NULL);
 681     g_free(pid_path);
 682 #endif
 683
 684     return name;
 685 }
 686
 687
 688 pid_t qemu_fork(Error **errp)
 689 {
 690     sigset_t oldmask, newmask;
 691     struct sigaction sig_action;
 692     int saved_errno;
 693     pid_t pid;
 694
 695     /*
 696      * Need to block signals now, so that child process can safely
 697      * kill off caller's signal handlers without a race.
 698      */
 699     sigfillset(&newmask);
 700     if (pthread_sigmask(SIG_SETMASK, &newmask, &oldmask) != 0) {
 701         error_setg_errno(errp, errno,
 702                          "cannot block signals");
 703         return -1;
 704     }
 705
 706     pid = fork();
 707     saved_errno = errno;
 708
 709     if (pid < 0) {
 710         /* attempt to restore signal mask, but ignore failure, to
 711          * avoid obscuring the fork failure */
 712         (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
 713         error_setg_errno(errp, saved_errno,
 714                          "cannot fork child process");
 715         errno = saved_errno;
 716         return -1;
 717     } else if (pid) {
 718         /* parent process */
 719
 720         /* Restore our original signal mask now that the child is
 721          * safely running. Only documented failures are EFAULT (not
 722          * possible, since we are using just-grabbed mask) or EINVAL
 723          * (not possible, since we are using correct arguments).  */
 724         (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
 725     } else {
 726         /* child process */
 727         size_t i;
 728
 729         /* Clear out all signal handlers from parent so nothing
 730          * unexpected can happen in our child once we unblock
 731          * signals */
 732         sig_action.sa_handler = SIG_DFL;
 733         sig_action.sa_flags = 0;
 734         sigemptyset(&sig_action.sa_mask);
 735
 736         for (i = 1; i < NSIG; i++) {
 737             /* Only possible errors are EFAULT or EINVAL The former
 738              * won't happen, the latter we expect, so no need to check
 739              * return value */
 740             (void)sigaction(i, &sig_action, NULL);
 741         }
 742
 743         /* Unmask all signals in child, since we've no idea what the
 744          * caller's done with their signal mask and don't want to
 745          * propagate that to children */
 746         sigemptyset(&newmask);
 747         if (pthread_sigmask(SIG_SETMASK, &newmask, NULL) != 0) {
 748             Error *local_err = NULL;
 749             error_setg_errno(&local_err, errno,
 750                              "cannot unblock signals");
 751             error_report_err(local_err);
 752             _exit(1);
 753         }
 754     }
 755     return pid;
 756 }
 757
 758 void *qemu_alloc_stack(size_t *sz)
 759 {
 760     void *ptr, *guardpage;
 761     int flags;
 762 #ifdef CONFIG_DEBUG_STACK_USAGE
 763     void *ptr2;
 764 #endif
 765     size_t pagesz = qemu_real_host_page_size();
 766 #ifdef _SC_THREAD_STACK_MIN
 767     /* avoid stacks smaller than _SC_THREAD_STACK_MIN */
 768     long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN);
 769     *sz = MAX(MAX(min_stack_sz, 0), *sz);
 770 #endif
 771     /* adjust stack size to a multiple of the page size */
 772     *sz = ROUND_UP(*sz, pagesz);
 773     /* allocate one extra page for the guard page */
 774     *sz += pagesz;
 775
 776     flags = MAP_PRIVATE | MAP_ANONYMOUS;
 777 #if defined(MAP_STACK) && defined(__OpenBSD__)
 778     /* Only enable MAP_STACK on OpenBSD. Other OS's such as
 779      * Linux/FreeBSD/NetBSD have a flag with the same name
 780      * but have differing functionality. OpenBSD will SEGV
 781      * if it spots execution with a stack pointer pointing
 782      * at memory that was not allocated with MAP_STACK.
 783      */
 784     flags |= MAP_STACK;
 785 #endif
 786
 787     ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE, flags, -1, 0);
 788     if (ptr == MAP_FAILED) {
 789         perror("failed to allocate memory for stack");
 790         abort();
 791     }
 792
 793 #if defined(HOST_IA64)
 794     /* separate register stack */
 795     guardpage = ptr + (((*sz - pagesz) / 2) & ~pagesz);
 796 #elif defined(HOST_HPPA)
 797     /* stack grows up */
 798     guardpage = ptr + *sz - pagesz;
 799 #else
 800     /* stack grows down */
 801     guardpage = ptr;
 802 #endif
 803     if (mprotect(guardpage, pagesz, PROT_NONE) != 0) {
 804         perror("failed to set up stack guard page");
 805         abort();
 806     }
 807
 808 #ifdef CONFIG_DEBUG_STACK_USAGE
 809     for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) {
 810         *(uint32_t *)ptr2 = 0xdeadbeaf;
 811     }
 812 #endif
 813
 814     return ptr;
 815 }
 816
 817 #ifdef CONFIG_DEBUG_STACK_USAGE
 818 static __thread unsigned int max_stack_usage;
 819 #endif
 820
 821 void qemu_free_stack(void *stack, size_t sz)
 822 {
 823 #ifdef CONFIG_DEBUG_STACK_USAGE
 824     unsigned int usage;
 825     void *ptr;
 826
 827     for (ptr = stack + qemu_real_host_page_size(); ptr < stack + sz;
 828          ptr += sizeof(uint32_t)) {
 829         if (*(uint32_t *)ptr != 0xdeadbeaf) {
 830             break;
 831         }
 832     }
 833     usage = sz - (uintptr_t) (ptr - stack);
 834     if (usage > max_stack_usage) {
 835         error_report("thread %d max stack usage increased from %u to %u",
 836                      qemu_get_thread_id(), max_stack_usage, usage);
 837         max_stack_usage = usage;
 838     }
 839 #endif
 840
 841     munmap(stack, sz);
 842 }
 843
 844 /*
 845  * Disable CFI checks.
 846  * We are going to call a signal hander directly. Such handler may or may not
 847  * have been defined in our binary, so there's no guarantee that the pointer
 848  * used to set the handler is a cfi-valid pointer. Since the handlers are
 849  * stored in kernel memory, changing the handler to an attacker-defined
 850  * function requires being able to call a sigaction() syscall,
 851  * which is not as easy as overwriting a pointer in memory.
 852  */
 853 QEMU_DISABLE_CFI
 854 void sigaction_invoke(struct sigaction *action,
 855                       struct qemu_signalfd_siginfo *info)
 856 {
 857     siginfo_t si = {};
 858     si.si_signo = info->ssi_signo;
 859     si.si_errno = info->ssi_errno;
 860     si.si_code = info->ssi_code;
 861
 862     /* Convert the minimal set of fields defined by POSIX.
 863      * Positive si_code values are reserved for kernel-generated
 864      * signals, where the valid siginfo fields are determined by
 865      * the signal number.  But according to POSIX, it is unspecified
 866      * whether SI_USER and SI_QUEUE have values less than or equal to
 867      * zero.
 868      */
 869     if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE ||
 870         info->ssi_code <= 0) {
 871         /* SIGTERM, etc.  */
 872         si.si_pid = info->ssi_pid;
 873         si.si_uid = info->ssi_uid;
 874     } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE ||
 875                info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) {
 876         si.si_addr = (void *)(uintptr_t)info->ssi_addr;
 877     } else if (info->ssi_signo == SIGCHLD) {
 878         si.si_pid = info->ssi_pid;
 879         si.si_status = info->ssi_status;
 880         si.si_uid = info->ssi_uid;
 881     }
 882     action->sa_sigaction(info->ssi_signo, &si, NULL);
 883 }
 884
 885 size_t qemu_get_host_physmem(void)
 886 {
 887 #ifdef _SC_PHYS_PAGES
 888     long pages = sysconf(_SC_PHYS_PAGES);
 889     if (pages > 0) {
 890         if (pages > SIZE_MAX / qemu_real_host_page_size()) {
 891             return SIZE_MAX;
 892         } else {
 893             return pages * qemu_real_host_page_size();
 894         }
 895     }
 896 #endif
 897     return 0;
 898 }
 899
 900 /* Sets a specific flag */
 901 int fcntl_setfl(int fd, int flag)
 902 {
 903     int flags;
 904
 905     flags = fcntl(fd, F_GETFL);
 906     if (flags == -1) {
 907         return -errno;
 908     }
 909     if (fcntl(fd, F_SETFL, flags | flag) == -1) {
 910         return -errno;
 911     }
 912     return 0;
 913 }
 914
 915 int qemu_msync(void *addr, size_t length, int fd)
 916 {
 917     size_t align_mask = ~(qemu_real_host_page_size() - 1);
 918
 919     /**
 920      * There are no strict reqs as per the length of mapping
 921      * to be synced. Still the length needs to follow the address
 922      * alignment changes. Additionally - round the size to the multiple
 923      * of PAGE_SIZE
 924      */
 925     length += ((uintptr_t)addr & (qemu_real_host_page_size() - 1));
 926     length = (length + ~align_mask) & align_mask;
 927
 928     addr = (void *)((uintptr_t)addr & align_mask);
 929
 930     return msync(addr, length, MS_SYNC);
 931 }