]> git.proxmox.com Git - mirror_qemu.git/blame - util/oslib-posix.c
util/oslib-posix: Don't create too many threads with small memory or little pages
[mirror_qemu.git] / util / oslib-posix.c
CommitLineData
c1b0b93b
JS
1/*
2 * os-posix-lib.c
3 *
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2010 Red Hat, Inc.
6 *
7 * QEMU library functions on POSIX which are shared between QEMU and
8 * the QEMU tools.
9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
16 *
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
27 */
28
aafd7584 29#include "qemu/osdep.h"
13401ba0 30#include <termios.h>
13401ba0 31
e2ea3515
LE
32#include <glib/gprintf.h>
33
a8d25326 34#include "qemu-common.h"
9c17d615 35#include "sysemu/sysemu.h"
c1b0b93b 36#include "trace.h"
da34e65c 37#include "qapi/error.h"
1de7afc9 38#include "qemu/sockets.h"
db725815 39#include "qemu/thread.h"
10f5bff6 40#include <libgen.h>
f348b6d1 41#include "qemu/cutils.h"
c905a368 42#include "qemu/compiler.h"
89aec641 43#include "qemu/units.h"
c1b0b93b 44
cbcfa041
PB
45#ifdef CONFIG_LINUX
46#include <sys/syscall.h>
47#endif
cbcfa041 48
41975b26
AF
49#ifdef __FreeBSD__
50#include <sys/sysctl.h>
a7764f15 51#include <sys/user.h>
9548a891 52#include <sys/thr.h>
7dc9ae43 53#include <libutil.h>
41975b26
AF
54#endif
55
094611b4
KR
56#ifdef __NetBSD__
57#include <sys/sysctl.h>
9548a891 58#include <lwp.h>
094611b4
KR
59#endif
60
2032e243
DC
61#ifdef __APPLE__
62#include <mach-o/dyld.h>
63#endif
64
2b9b9e70
DC
65#ifdef __HAIKU__
66#include <kernel/image.h>
67#endif
68
a9c94277 69#include "qemu/mmap-alloc.h"
794e8f30 70
7d992e4d
PL
71#ifdef CONFIG_DEBUG_STACK_USAGE
72#include "qemu/error-report.h"
73#endif
74
/* Upper bound on the number of worker threads used to preallocate memory. */
#define MAX_MEM_PREALLOC_THREAD_COUNT 16
1e356fc1 76
dba50678
DH
struct MemsetThread;

/*
 * State shared by all worker threads of one touch_all_pages() run.
 */
typedef struct MemsetContext {
    /* Set under page_mutex once every worker thread has been created. */
    bool all_threads_created;
    /* NOTE(review): not referenced by the code visible in this file. */
    bool any_thread_failed;
    /* Array of num_threads per-worker descriptors. */
    struct MemsetThread *threads;
    int num_threads;
} MemsetContext;
85
1e356fc1
JK
86struct MemsetThread {
87 char *addr;
e947d47d
SW
88 size_t numpages;
89 size_t hpagesize;
1e356fc1
JK
90 QemuThread pgthread;
91 sigjmp_buf env;
dba50678 92 MemsetContext *context;
1e356fc1
JK
93};
94typedef struct MemsetThread MemsetThread;
95
dba50678
DH
96/* used by sigbus_handler() */
97static MemsetContext *sigbus_memset_context;
1e356fc1 98
037fb5eb 99static QemuMutex page_mutex;
100static QemuCond page_cond;
037fb5eb 101
cbcfa041
PB
/*
 * Return an identifier for the calling thread.
 *
 * Linux, FreeBSD, NetBSD and OpenBSD use their native per-thread id;
 * every other host falls back to the process id.
 */
int qemu_get_thread_id(void)
{
    int id;

#if defined(__linux__)
    id = syscall(SYS_gettid);
#elif defined(__FreeBSD__)
    /* thread id is up to INT_MAX */
    long freebsd_tid;

    thr_self(&freebsd_tid);
    id = (int)freebsd_tid;
#elif defined(__NetBSD__)
    id = _lwp_self();
#elif defined(__OpenBSD__)
    id = getthrid();
#else
    id = getpid();
#endif
    return id;
}
f97742d0
AR
119
/*
 * Thin wrapper around daemon(); both arguments and the result are passed
 * through unchanged.
 */
int qemu_daemon(int nochdir, int noclose)
{
    int rc = daemon(nochdir, noclose);

    return rc;
}
124
9e6bdef2
MAL
125bool qemu_write_pidfile(const char *path, Error **errp)
126{
127 int fd;
128 char pidstr[32];
129
130 while (1) {
131 struct stat a, b;
35f7f3fb
MAL
132 struct flock lock = {
133 .l_type = F_WRLCK,
134 .l_whence = SEEK_SET,
135 .l_len = 0,
136 };
9e6bdef2 137
448058aa 138 fd = qemu_open_old(path, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR);
9e6bdef2
MAL
139 if (fd == -1) {
140 error_setg_errno(errp, errno, "Cannot open pid file");
141 return false;
142 }
143
144 if (fstat(fd, &b) < 0) {
145 error_setg_errno(errp, errno, "Cannot stat file");
146 goto fail_close;
147 }
148
35f7f3fb 149 if (fcntl(fd, F_SETLK, &lock)) {
9e6bdef2
MAL
150 error_setg_errno(errp, errno, "Cannot lock pid file");
151 goto fail_close;
152 }
153
154 /*
155 * Now make sure the path we locked is the same one that now
156 * exists on the filesystem.
157 */
158 if (stat(path, &a) < 0) {
159 /*
160 * PID file disappeared, someone else must be racing with
161 * us, so try again.
162 */
163 close(fd);
164 continue;
165 }
166
167 if (a.st_ino == b.st_ino) {
168 break;
169 }
170
171 /*
172 * PID file was recreated, someone else must be racing with
173 * us, so try again.
174 */
175 close(fd);
176 }
177
178 if (ftruncate(fd, 0) < 0) {
179 error_setg_errno(errp, errno, "Failed to truncate pid file");
180 goto fail_unlink;
181 }
182
183 snprintf(pidstr, sizeof(pidstr), FMT_pid "\n", getpid());
184 if (write(fd, pidstr, strlen(pidstr)) != strlen(pidstr)) {
185 error_setg(errp, "Failed to write pid file");
186 goto fail_unlink;
187 }
188
189 return true;
190
191fail_unlink:
192 unlink(path);
193fail_close:
194 close(fd);
195 return false;
196}
197
/*
 * Pass a non-NULL @ptr straight through. For NULL, print the current errno
 * description to stderr and abort().
 */
void *qemu_oom_check(void *ptr)
{
    if (ptr != NULL) {
        return ptr;
    }

    fprintf(stderr, "Failed to allocate memory: %s\n", strerror(errno));
    abort();
}
c1b0b93b 206
/*
 * Allocate @size bytes aligned to @alignment.
 *
 * Alignments below sizeof(void *) are raised to sizeof(void *); anything
 * else must be a power of two (asserted). Returns NULL on failure. In the
 * posix_memalign() build, errno is set from its return code.
 */
void *qemu_try_memalign(size_t alignment, size_t size)
{
    void *ptr;

    if (alignment >= sizeof(void *)) {
        g_assert(is_power_of_2(alignment));
    } else {
        alignment = sizeof(void *);
    }

#if defined(CONFIG_POSIX_MEMALIGN)
    int err = posix_memalign(&ptr, alignment, size);

    if (err != 0) {
        /* posix_memalign() reports failure via its return value */
        errno = err;
        ptr = NULL;
    }
#elif defined(CONFIG_BSD)
    ptr = valloc(size);
#else
    ptr = memalign(alignment, size);
#endif
    trace_qemu_memalign(alignment, size, ptr);
    return ptr;
}
232
7d2a35cc
KW
/*
 * Like qemu_try_memalign(), but the result is run through qemu_oom_check(),
 * which aborts on a NULL pointer.
 */
void *qemu_memalign(size_t alignment, size_t size)
{
    void *ptr = qemu_try_memalign(alignment, size);

    return qemu_oom_check(ptr);
}
237
c1b0b93b 238/* alloc shared memory pages */
8dbe22c6
DH
239void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared,
240 bool noreserve)
c1b0b93b 241{
8dbe22c6
DH
242 const uint32_t qemu_map_flags = (shared ? QEMU_MAP_SHARED : 0) |
243 (noreserve ? QEMU_MAP_NORESERVE : 0);
36b58628 244 size_t align = QEMU_VMALLOC_ALIGN;
b444f5c0 245 void *ptr = qemu_ram_mmap(-1, size, align, qemu_map_flags, 0);
36b58628 246
7dda5dc8 247 if (ptr == MAP_FAILED) {
39228250 248 return NULL;
c2a8238a 249 }
c2a8238a 250
a2b257d6
IM
251 if (alignment) {
252 *alignment = align;
253 }
c2dfc5ba 254
6eebf958 255 trace_qemu_anon_ram_alloc(size, ptr);
c7f4111a 256 return ptr;
c1b0b93b
JS
257}
258
/* Emit the qemu_vfree trace event for @ptr, then release it with free(). */
void qemu_vfree(void *ptr)
{
    trace_qemu_vfree(ptr);
    free(ptr);
}
9549e764 264
e7a09b92
PB
/*
 * Emit the qemu_anon_ram_free trace event, then hand @ptr / @size to
 * qemu_ram_munmap() with -1 as its first argument.
 */
void qemu_anon_ram_free(void *ptr, size_t size)
{
    trace_qemu_anon_ram_free(ptr, size);
    qemu_ram_munmap(-1, ptr, size);
}
270
/*
 * Clear O_NONBLOCK on @fd. Both fcntl() calls are asserted to succeed.
 */
void qemu_set_block(int fd)
{
    int flags = fcntl(fd, F_GETFL);
    int rc;

    assert(flags != -1);
    rc = fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);
    assert(rc != -1);
}
279
/*
 * Set O_NONBLOCK on @fd.
 *
 * Returns 0 on success, or -errno if either fcntl() call fails.
 */
int qemu_try_set_nonblock(int fd)
{
    int flags = fcntl(fd, F_GETFL);

    if (flags != -1 && fcntl(fd, F_SETFL, flags | O_NONBLOCK) != -1) {
        return 0;
    }
    return -errno;
}
292
/* Set O_NONBLOCK on @fd via qemu_try_set_nonblock(); success is asserted. */
void qemu_set_nonblock(int fd)
{
    int rc = qemu_try_set_nonblock(fd);

    assert(rc == 0);
}
299
606600a1
SO
/*
 * Enable SO_REUSEADDR on socket @fd. Success is asserted, and the
 * setsockopt() result is also returned.
 */
int socket_set_fast_reuse(int fd)
{
    const int enable = 1;
    int ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
                         (const char *)&enable, sizeof(enable));

    assert(ret == 0);

    return ret;
}
311
9549e764
JS
/*
 * Set FD_CLOEXEC on @fd, keeping its other descriptor flags. Both fcntl()
 * calls are asserted to succeed.
 */
void qemu_set_cloexec(int fd)
{
    int fdflags = fcntl(fd, F_GETFD);
    int rc;

    assert(fdflags != -1);
    rc = fcntl(fd, F_SETFD, fdflags | FD_CLOEXEC);
    assert(rc != -1);
}
70e72ce4
JS
320
/*
 * Create a pipe with FD_CLOEXEC set on both file descriptors.
 *
 * With CONFIG_PIPE2, pipe2(O_CLOEXEC) is tried first and its result is
 * returned unless it failed with ENOSYS. Otherwise plain pipe() is used and
 * the flag is applied to each end afterwards.
 *
 * Returns the result of the underlying pipe2()/pipe() call.
 */
int qemu_pipe(int pipefd[2])
{
    int ret;

#ifdef CONFIG_PIPE2
    ret = pipe2(pipefd, O_CLOEXEC);
    if (!(ret == -1 && errno == ENOSYS)) {
        return ret;
    }
#endif
    ret = pipe(pipefd);
    if (ret != 0) {
        return ret;
    }

    qemu_set_cloexec(pipefd[0]);
    qemu_set_cloexec(pipefd[1]);
    return 0;
}
38671423 342
e2ea3515
LE
343char *
344qemu_get_local_state_pathname(const char *relative_pathname)
345{
fcb4f59c
PB
346 g_autofree char *dir = g_strdup_printf("%s/%s",
347 CONFIG_QEMU_LOCALSTATEDIR,
348 relative_pathname);
349 return get_relocated_path(dir);
e2ea3515 350}
13401ba0
SH
351
/*
 * Switch terminal echo and canonical input processing on or off for @fd.
 *
 * @echo true sets ECHO, ECHONL, ICANON and IEXTEN in the local mode flags;
 * false clears them. The change is applied immediately (TCSANOW).
 *
 * If the current attributes cannot be read (for example @fd is not a
 * terminal), nothing is changed. Previously the tcgetattr() result was
 * ignored, so an uninitialized struct termios could be handed to
 * tcsetattr().
 */
void qemu_set_tty_echo(int fd, bool echo)
{
    const tcflag_t echo_flags = ECHO | ECHONL | ICANON | IEXTEN;
    struct termios tty;

    if (tcgetattr(fd, &tty) < 0) {
        return;
    }

    if (echo) {
        tty.c_lflag |= echo_flags;
    } else {
        tty.c_lflag &= ~echo_flags;
    }

    tcsetattr(fd, TCSANOW, &tty);
}
10f5bff6 366
/* Set once by qemu_init_exec_dir(); returned by qemu_get_exec_dir(). */
static const char *exec_dir;
10f5bff6
FZ
368
369void qemu_init_exec_dir(const char *argv0)
370{
10f5bff6
FZ
371 char *p = NULL;
372 char buf[PATH_MAX];
373
9386a4a7 374 if (exec_dir) {
a4c13869
PB
375 return;
376 }
10f5bff6
FZ
377
378#if defined(__linux__)
379 {
380 int len;
381 len = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
382 if (len > 0) {
383 buf[len] = 0;
384 p = buf;
385 }
386 }
094611b4
KR
387#elif defined(__FreeBSD__) \
388 || (defined(__NetBSD__) && defined(KERN_PROC_PATHNAME))
10f5bff6 389 {
094611b4 390#if defined(__FreeBSD__)
10f5bff6 391 static int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
094611b4
KR
392#else
393 static int mib[4] = {CTL_KERN, KERN_PROC_ARGS, -1, KERN_PROC_PATHNAME};
394#endif
10f5bff6
FZ
395 size_t len = sizeof(buf) - 1;
396
397 *buf = '\0';
398 if (!sysctl(mib, ARRAY_SIZE(mib), buf, &len, NULL, 0) &&
399 *buf) {
400 buf[sizeof(buf) - 1] = '\0';
401 p = buf;
402 }
403 }
2032e243
DC
404#elif defined(__APPLE__)
405 {
406 char fpath[PATH_MAX];
407 uint32_t len = sizeof(fpath);
408 if (_NSGetExecutablePath(fpath, &len) == 0) {
409 p = realpath(fpath, buf);
410 if (!p) {
411 return;
412 }
413 }
414 }
2b9b9e70
DC
415#elif defined(__HAIKU__)
416 {
417 image_info ii;
418 int32_t c = 0;
419
420 *buf = '\0';
421 while (get_next_image_info(0, &c, &ii) == B_OK) {
422 if (ii.type == B_APP_IMAGE) {
423 strncpy(buf, ii.name, sizeof(buf));
424 buf[sizeof(buf) - 1] = 0;
425 p = buf;
426 break;
427 }
428 }
429 }
10f5bff6
FZ
430#endif
431 /* If we don't have any way of figuring out the actual executable
432 location then try argv[0]. */
9386a4a7 433 if (!p && argv0) {
10f5bff6 434 p = realpath(argv0, buf);
10f5bff6 435 }
9386a4a7
PB
436 if (p) {
437 exec_dir = g_path_get_dirname(p);
438 } else {
439 exec_dir = CONFIG_BINDIR;
440 }
10f5bff6
FZ
441}
442
a4c13869 443const char *qemu_get_exec_dir(void)
10f5bff6 444{
a4c13869 445 return exec_dir;
10f5bff6 446}
38183310 447
38183310
PB
448static void sigbus_handler(int signal)
449{
1e356fc1 450 int i;
dba50678
DH
451
452 if (sigbus_memset_context) {
453 for (i = 0; i < sigbus_memset_context->num_threads; i++) {
454 MemsetThread *thread = &sigbus_memset_context->threads[i];
455
456 if (qemu_thread_is_self(&thread->pgthread)) {
457 siglongjmp(thread->env, 1);
1e356fc1
JK
458 }
459 }
460 }
38183310
PB
461}
462
1e356fc1
JK
463static void *do_touch_pages(void *arg)
464{
465 MemsetThread *memset_args = (MemsetThread *)arg;
1e356fc1 466 sigset_t set, oldset;
6c427ab9 467 int ret = 0;
1e356fc1 468
037fb5eb 469 /*
470 * On Linux, the page faults from the loop below can cause mmap_sem
471 * contention with allocation of the thread stacks. Do not start
472 * clearing until all threads have been created.
473 */
474 qemu_mutex_lock(&page_mutex);
dba50678 475 while (!memset_args->context->all_threads_created) {
037fb5eb 476 qemu_cond_wait(&page_cond, &page_mutex);
477 }
478 qemu_mutex_unlock(&page_mutex);
479
1e356fc1
JK
480 /* unblock SIGBUS */
481 sigemptyset(&set);
482 sigaddset(&set, SIGBUS);
483 pthread_sigmask(SIG_UNBLOCK, &set, &oldset);
484
485 if (sigsetjmp(memset_args->env, 1)) {
6c427ab9 486 ret = -EFAULT;
1e356fc1 487 } else {
e947d47d
SW
488 char *addr = memset_args->addr;
489 size_t numpages = memset_args->numpages;
490 size_t hpagesize = memset_args->hpagesize;
491 size_t i;
1e356fc1 492 for (i = 0; i < numpages; i++) {
9dc44aa5
DB
493 /*
494 * Read & write back the same value, so we don't
495 * corrupt existing user/app data that might be
496 * stored.
497 *
498 * 'volatile' to stop compiler optimizing this away
499 * to a no-op
9dc44aa5
DB
500 */
501 *(volatile char *)addr = *addr;
1e356fc1
JK
502 addr += hpagesize;
503 }
504 }
505 pthread_sigmask(SIG_SETMASK, &oldset, NULL);
6c427ab9 506 return (void *)(uintptr_t)ret;
1e356fc1
JK
507}
508
a384bfa3
DH
509static void *do_madv_populate_write_pages(void *arg)
510{
511 MemsetThread *memset_args = (MemsetThread *)arg;
512 const size_t size = memset_args->numpages * memset_args->hpagesize;
513 char * const addr = memset_args->addr;
514 int ret = 0;
515
516 /* See do_touch_pages(). */
517 qemu_mutex_lock(&page_mutex);
dba50678 518 while (!memset_args->context->all_threads_created) {
a384bfa3
DH
519 qemu_cond_wait(&page_cond, &page_mutex);
520 }
521 qemu_mutex_unlock(&page_mutex);
522
523 if (size && qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE)) {
524 ret = -errno;
525 }
526 return (void *)(uintptr_t)ret;
527}
528
89aec641
DH
529static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
530 int smp_cpus)
dfd0dcc7
JK
531{
532 long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
533 int ret = 1;
534
535 if (host_procs > 0) {
536 ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), smp_cpus);
537 }
89aec641
DH
538
539 /* Especially with gigantic pages, don't create more threads than pages. */
540 ret = MIN(ret, numpages);
541 /* Don't start threads to prealloc comparatively little memory. */
542 ret = MIN(ret, MAX(1, hpagesize * numpages / (64 * MiB)));
543
dfd0dcc7
JK
544 /* In case sysconf() fails, we fall back to single threaded */
545 return ret;
546}
547
6c427ab9 548static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
a384bfa3 549 int smp_cpus, bool use_madv_populate_write)
1e356fc1 550{
78b3f67a 551 static gsize initialized = 0;
dba50678 552 MemsetContext context = {
89aec641 553 .num_threads = get_memset_num_threads(hpagesize, numpages, smp_cpus),
dba50678 554 };
037fb5eb 555 size_t numpages_per_thread, leftover;
a384bfa3 556 void *(*touch_fn)(void *);
6c427ab9 557 int ret = 0, i = 0;
1e356fc1 558 char *addr = area;
1e356fc1 559
78b3f67a
PB
560 if (g_once_init_enter(&initialized)) {
561 qemu_mutex_init(&page_mutex);
562 qemu_cond_init(&page_cond);
563 g_once_init_leave(&initialized, 1);
564 }
565
a384bfa3
DH
566 if (use_madv_populate_write) {
567 touch_fn = do_madv_populate_write_pages;
568 } else {
569 touch_fn = do_touch_pages;
570 }
571
dba50678
DH
572 context.threads = g_new0(MemsetThread, context.num_threads);
573 numpages_per_thread = numpages / context.num_threads;
574 leftover = numpages % context.num_threads;
575 for (i = 0; i < context.num_threads; i++) {
576 context.threads[i].addr = addr;
577 context.threads[i].numpages = numpages_per_thread + (i < leftover);
578 context.threads[i].hpagesize = hpagesize;
579 context.threads[i].context = &context;
580 qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
581 touch_fn, &context.threads[i],
1e356fc1 582 QEMU_THREAD_JOINABLE);
dba50678
DH
583 addr += context.threads[i].numpages * hpagesize;
584 }
585
586 if (!use_madv_populate_write) {
587 sigbus_memset_context = &context;
1e356fc1 588 }
278fb162
B
589
590 qemu_mutex_lock(&page_mutex);
dba50678 591 context.all_threads_created = true;
037fb5eb 592 qemu_cond_broadcast(&page_cond);
278fb162 593 qemu_mutex_unlock(&page_mutex);
037fb5eb 594
dba50678
DH
595 for (i = 0; i < context.num_threads; i++) {
596 int tmp = (uintptr_t)qemu_thread_join(&context.threads[i].pgthread);
6c427ab9
DH
597
598 if (tmp) {
599 ret = tmp;
600 }
1e356fc1 601 }
dba50678
DH
602
603 if (!use_madv_populate_write) {
604 sigbus_memset_context = NULL;
605 }
606 g_free(context.threads);
1e356fc1 607
6c427ab9 608 return ret;
1e356fc1
JK
609}
610
a384bfa3
DH
611static bool madv_populate_write_possible(char *area, size_t pagesize)
612{
613 return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) ||
614 errno != EINVAL;
615}
616
1e356fc1
JK
617void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
618 Error **errp)
38183310 619{
b7bf8f56 620 int ret;
38183310 621 struct sigaction act, oldact;
1e356fc1
JK
622 size_t hpagesize = qemu_fd_getpagesize(fd);
623 size_t numpages = DIV_ROUND_UP(memory, hpagesize);
a384bfa3 624 bool use_madv_populate_write;
38183310 625
a384bfa3
DH
626 /*
627 * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
628 * some special mappings, such as mapping /dev/mem.
629 */
630 use_madv_populate_write = madv_populate_write_possible(area, hpagesize);
631
632 if (!use_madv_populate_write) {
633 memset(&act, 0, sizeof(act));
634 act.sa_handler = &sigbus_handler;
635 act.sa_flags = 0;
636
637 ret = sigaction(SIGBUS, &act, &oldact);
638 if (ret) {
639 error_setg_errno(errp, errno,
640 "os_mem_prealloc: failed to install signal handler");
641 return;
642 }
38183310
PB
643 }
644
1e356fc1 645 /* touch pages simultaneously */
a384bfa3
DH
646 ret = touch_all_pages(area, hpagesize, numpages, smp_cpus,
647 use_madv_populate_write);
6c427ab9
DH
648 if (ret) {
649 error_setg_errno(errp, -ret,
650 "os_mem_prealloc: preallocating memory failed");
056b68af 651 }
38183310 652
a384bfa3
DH
653 if (!use_madv_populate_write) {
654 ret = sigaction(SIGBUS, &oldact, NULL);
655 if (ret) {
656 /* Terminate QEMU since it can't recover from error */
657 perror("os_mem_prealloc: failed to reinstall signal handler");
658 exit(1);
659 }
b7bf8f56 660 }
38183310 661}
d57e4e48 662
7dc9ae43
MP
663char *qemu_get_pid_name(pid_t pid)
664{
665 char *name = NULL;
666
667#if defined(__FreeBSD__)
668 /* BSDs don't have /proc, but they provide a nice substitute */
669 struct kinfo_proc *proc = kinfo_getproc(pid);
670
671 if (proc) {
672 name = g_strdup(proc->ki_comm);
673 free(proc);
674 }
675#else
676 /* Assume a system with reasonable procfs */
677 char *pid_path;
678 size_t len;
679
680 pid_path = g_strdup_printf("/proc/%d/cmdline", pid);
681 g_file_get_contents(pid_path, &name, &len, NULL);
682 g_free(pid_path);
683#endif
684
685 return name;
686}
687
688
57cb38b3
DB
689pid_t qemu_fork(Error **errp)
690{
691 sigset_t oldmask, newmask;
692 struct sigaction sig_action;
693 int saved_errno;
694 pid_t pid;
695
696 /*
697 * Need to block signals now, so that child process can safely
698 * kill off caller's signal handlers without a race.
699 */
700 sigfillset(&newmask);
701 if (pthread_sigmask(SIG_SETMASK, &newmask, &oldmask) != 0) {
702 error_setg_errno(errp, errno,
703 "cannot block signals");
704 return -1;
705 }
706
707 pid = fork();
708 saved_errno = errno;
709
710 if (pid < 0) {
711 /* attempt to restore signal mask, but ignore failure, to
712 * avoid obscuring the fork failure */
713 (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
714 error_setg_errno(errp, saved_errno,
715 "cannot fork child process");
716 errno = saved_errno;
717 return -1;
718 } else if (pid) {
719 /* parent process */
720
721 /* Restore our original signal mask now that the child is
722 * safely running. Only documented failures are EFAULT (not
723 * possible, since we are using just-grabbed mask) or EINVAL
724 * (not possible, since we are using correct arguments). */
725 (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
726 } else {
727 /* child process */
728 size_t i;
729
730 /* Clear out all signal handlers from parent so nothing
731 * unexpected can happen in our child once we unblock
732 * signals */
733 sig_action.sa_handler = SIG_DFL;
734 sig_action.sa_flags = 0;
735 sigemptyset(&sig_action.sa_mask);
736
737 for (i = 1; i < NSIG; i++) {
738 /* Only possible errors are EFAULT or EINVAL The former
739 * won't happen, the latter we expect, so no need to check
740 * return value */
741 (void)sigaction(i, &sig_action, NULL);
742 }
743
744 /* Unmask all signals in child, since we've no idea what the
745 * caller's done with their signal mask and don't want to
746 * propagate that to children */
747 sigemptyset(&newmask);
748 if (pthread_sigmask(SIG_SETMASK, &newmask, NULL) != 0) {
749 Error *local_err = NULL;
750 error_setg_errno(&local_err, errno,
751 "cannot unblock signals");
752 error_report_err(local_err);
753 _exit(1);
754 }
755 }
756 return pid;
757}
8737d9e0
PL
758
759void *qemu_alloc_stack(size_t *sz)
760{
761 void *ptr, *guardpage;
fc3d1bad 762 int flags;
7d992e4d
PL
763#ifdef CONFIG_DEBUG_STACK_USAGE
764 void *ptr2;
765#endif
038adc2f 766 size_t pagesz = qemu_real_host_page_size;
8737d9e0
PL
767#ifdef _SC_THREAD_STACK_MIN
768 /* avoid stacks smaller than _SC_THREAD_STACK_MIN */
769 long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN);
770 *sz = MAX(MAX(min_stack_sz, 0), *sz);
771#endif
772 /* adjust stack size to a multiple of the page size */
773 *sz = ROUND_UP(*sz, pagesz);
774 /* allocate one extra page for the guard page */
775 *sz += pagesz;
776
fc3d1bad
BS
777 flags = MAP_PRIVATE | MAP_ANONYMOUS;
778#if defined(MAP_STACK) && defined(__OpenBSD__)
779 /* Only enable MAP_STACK on OpenBSD. Other OS's such as
780 * Linux/FreeBSD/NetBSD have a flag with the same name
781 * but have differing functionality. OpenBSD will SEGV
782 * if it spots execution with a stack pointer pointing
783 * at memory that was not allocated with MAP_STACK.
784 */
785 flags |= MAP_STACK;
786#endif
787
788 ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE, flags, -1, 0);
8737d9e0 789 if (ptr == MAP_FAILED) {
e916a6e8 790 perror("failed to allocate memory for stack");
8737d9e0
PL
791 abort();
792 }
793
794#if defined(HOST_IA64)
795 /* separate register stack */
796 guardpage = ptr + (((*sz - pagesz) / 2) & ~pagesz);
797#elif defined(HOST_HPPA)
798 /* stack grows up */
799 guardpage = ptr + *sz - pagesz;
800#else
801 /* stack grows down */
802 guardpage = ptr;
803#endif
804 if (mprotect(guardpage, pagesz, PROT_NONE) != 0) {
e916a6e8 805 perror("failed to set up stack guard page");
8737d9e0
PL
806 abort();
807 }
808
7d992e4d
PL
809#ifdef CONFIG_DEBUG_STACK_USAGE
810 for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) {
811 *(uint32_t *)ptr2 = 0xdeadbeaf;
812 }
813#endif
814
8737d9e0
PL
815 return ptr;
816}
817
7d992e4d
PL
#ifdef CONFIG_DEBUG_STACK_USAGE
/* Per-thread largest stack usage reported so far by qemu_free_stack(). */
static __thread unsigned int max_stack_usage;
#endif
821
8737d9e0
PL
/*
 * Unmap a stack of @sz bytes at @stack.
 *
 * With CONFIG_DEBUG_STACK_USAGE, the area after the first page is scanned
 * for the first word that no longer holds the 0xdeadbeaf fill pattern.
 * A message is reported whenever the resulting usage figure exceeds the
 * calling thread's previous maximum.
 */
void qemu_free_stack(void *stack, size_t sz)
{
#ifdef CONFIG_DEBUG_STACK_USAGE
    void *probe = stack + qemu_real_host_page_size;
    unsigned int usage;

    while (probe < stack + sz && *(uint32_t *)probe == 0xdeadbeaf) {
        probe += sizeof(uint32_t);
    }
    usage = sz - (uintptr_t) (probe - stack);
    if (usage > max_stack_usage) {
        error_report("thread %d max stack usage increased from %u to %u",
                     qemu_get_thread_id(), max_stack_usage, usage);
        max_stack_usage = usage;
    }
#endif

    munmap(stack, sz);
}
d98d4072 844
c905a368
DB
845/*
846 * Disable CFI checks.
847 * We are going to call a signal hander directly. Such handler may or may not
848 * have been defined in our binary, so there's no guarantee that the pointer
849 * used to set the handler is a cfi-valid pointer. Since the handlers are
850 * stored in kernel memory, changing the handler to an attacker-defined
851 * function requires being able to call a sigaction() syscall,
852 * which is not as easy as overwriting a pointer in memory.
853 */
854QEMU_DISABLE_CFI
d98d4072
PB
855void sigaction_invoke(struct sigaction *action,
856 struct qemu_signalfd_siginfo *info)
857{
02ffa034 858 siginfo_t si = {};
d98d4072
PB
859 si.si_signo = info->ssi_signo;
860 si.si_errno = info->ssi_errno;
861 si.si_code = info->ssi_code;
862
863 /* Convert the minimal set of fields defined by POSIX.
864 * Positive si_code values are reserved for kernel-generated
865 * signals, where the valid siginfo fields are determined by
866 * the signal number. But according to POSIX, it is unspecified
867 * whether SI_USER and SI_QUEUE have values less than or equal to
868 * zero.
869 */
870 if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE ||
871 info->ssi_code <= 0) {
872 /* SIGTERM, etc. */
873 si.si_pid = info->ssi_pid;
874 si.si_uid = info->ssi_uid;
875 } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE ||
876 info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) {
877 si.si_addr = (void *)(uintptr_t)info->ssi_addr;
878 } else if (info->ssi_signo == SIGCHLD) {
879 si.si_pid = info->ssi_pid;
880 si.si_status = info->ssi_status;
881 si.si_uid = info->ssi_uid;
d98d4072
PB
882 }
883 action->sa_sigaction(info->ssi_signo, &si, NULL);
884}
e47f4765
MP
885
/* Fallback definition for hosts whose headers do not provide HOST_NAME_MAX. */
#if !defined(HOST_NAME_MAX)
# if defined(_POSIX_HOST_NAME_MAX)
#  define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
# else
#  define HOST_NAME_MAX 255
# endif
#endif
893
894char *qemu_get_host_name(Error **errp)
895{
896 long len = -1;
897 g_autofree char *hostname = NULL;
898
899#ifdef _SC_HOST_NAME_MAX
900 len = sysconf(_SC_HOST_NAME_MAX);
901#endif /* _SC_HOST_NAME_MAX */
902
903 if (len < 0) {
904 len = HOST_NAME_MAX;
905 }
906
907 /* Unfortunately, gethostname() below does not guarantee a
908 * NULL terminated string. Therefore, allocate one byte more
909 * to be sure. */
910 hostname = g_new0(char, len + 1);
911
912 if (gethostname(hostname, len) < 0) {
913 error_setg_errno(errp, errno,
914 "cannot get hostname");
915 return NULL;
916 }
917
918 return g_steal_pointer(&hostname);
919}
ad06ef0e
AB
920
921size_t qemu_get_host_physmem(void)
922{
923#ifdef _SC_PHYS_PAGES
924 long pages = sysconf(_SC_PHYS_PAGES);
925 if (pages > 0) {
926 if (pages > SIZE_MAX / qemu_real_host_page_size) {
927 return SIZE_MAX;
928 } else {
929 return pages * qemu_real_host_page_size;
930 }
931 }
932#endif
933 return 0;
934}