/*
 * NOTE(review): the git-blame page header and commit-hash fragments that
 * were scraped along with this file have been removed. Origin: QEMU
 * util/qsp.c (mirror_qemu.git), commit "qsp: add sort_by option to
 * qsp_report".
 */
/*
 * qsp.c - QEMU Synchronization Profiler
 *
 * Copyright (C) 2018, Emilio G. Cota <cota@braap.org>
 *
 * License: GNU GPL, version 2 or later.
 *   See the COPYING file in the top-level directory.
 *
 * QSP profiles the time spent in synchronization primitives, which can
 * help diagnose performance problems, e.g. scalability issues when
 * contention is high.
 *
 * The primitives currently supported are mutexes, recursive mutexes and
 * condition variables. Note that not all related functions are intercepted;
 * instead we profile only those functions that can have a performance impact,
 * either due to blocking (e.g. cond_wait, mutex_lock) or cache line
 * contention (e.g. mutex_lock, mutex_trylock).
 *
 * QSP's design focuses on speed and scalability. This is achieved
 * by having threads do their profiling entirely on thread-local data.
 * The appropriate thread-local data is found via a QHT, i.e. a concurrent hash
 * table. To aggregate data in order to generate a report, we iterate over
 * all entries in the hash table. Depending on the number of threads and
 * synchronization objects this might be expensive, but note that it is
 * very rarely called -- reports are generated only when requested by users.
 *
 * Reports are generated as a table where each row represents a call site. A
 * call site is the triplet formed by the __file__ and __LINE__ of the caller
 * as well as the address of the "object" (i.e. mutex, rec. mutex or condvar)
 * being operated on. Focusing on call sites instead of just on objects might
 * seem puzzling. However, it is a sensible choice since otherwise dealing with
 * dynamically-allocated objects becomes difficult (e.g. what to do when an
 * object is destroyed, or reused?). Furthermore, the call site info is of most
 * importance, since it is callers, and not objects, what cause wait time.
 *
 * Alternative designs considered:
 *
 * - Use an off-the-shelf profiler such as mutrace. This is not a viable option
 *   for us because QEMU has __malloc_hook set (by one of the libraries it
 *   uses); leaving this hook unset is required to avoid deadlock in mutrace.
 *
 * - Use a glib HT for each thread, protecting each HT with its own lock.
 *   This isn't simpler than the current design, and is 10% slower in the
 *   atomic_add-bench microbenchmark (-m option).
 *
 * - For reports, just use a binary tree as we aggregate data, instead of having
 *   an intermediate hash table. This would simplify the code only slightly, but
 *   would perform badly if there were many threads and objects to track.
 *
 * Related Work:
 * - Lennart Poettering's mutrace: http://0pointer.de/blog/projects/mutrace.html
 * - Lozi, David, Thomas, Lawall and Muller. "Remote Core Locking: Migrating
 *   Critical-Section Execution to Improve the Performance of Multithreaded
 *   Applications", USENIX ATC'12.
 */
#include "qemu/osdep.h"
#include "qemu/thread.h"
#include "qemu/timer.h"
#include "qemu/qht.h"
#include "exec/tb-hash-xx.h"

/* Kinds of synchronization primitives that QSP profiles */
enum QSPType {
    QSP_MUTEX,
    QSP_REC_MUTEX,
    QSP_CONDVAR,
};

/*
 * A call site: the (file, line) location of a profiled call, plus the
 * address of the object (mutex, rec. mutex or condvar) being operated on.
 */
struct QSPCallSite {
    const void *obj;   /* synchronization object's address */
    const char *file; /* i.e. __FILE__; shortened later */
    int line;          /* i.e. __LINE__ of the profiled call */
    enum QSPType type; /* which primitive is used at this site */
};
typedef struct QSPCallSite QSPCallSite;

/*
 * Per-thread profiling record for one call site: total wait time (ns) and
 * the number of successful acquisitions observed by that thread.
 */
struct QSPEntry {
    void *thread_ptr;            /* &qsp_thread of the owning thread; unique per thread */
    const QSPCallSite *callsite; /* interned pointer into qsp_callsite_ht */
    uint64_t n_acqs;             /* successful acquisitions recorded */
    uint64_t ns;                 /* cumulative time spent in the profiled call */
#ifndef CONFIG_ATOMIC64
    /*
     * If we cannot update the counts atomically, then use a seqlock.
     * We don't need an associated lock because the updates are thread-local.
     */
    QemuSeqLock sequence;
#endif
};
typedef struct QSPEntry QSPEntry;

/* initial sizing for hash tables */
#define QSP_INITIAL_SIZE 64

/* If this file is moved, QSP_REL_PATH should be updated accordingly */
#define QSP_REL_PATH "util/qsp.c"

/* this file's full path. Used to present all call sites with relative paths */
static size_t qsp_qemu_path_len;

/* the address of qsp_thread gives us a unique 'thread ID' */
static __thread int qsp_thread;

/*
 * Call sites are the same for all threads, so we track them in a separate hash
 * table to save memory.
 */
static struct qht qsp_callsite_ht;

/* global hash table of per-thread QSPEntry records */
static struct qht qsp_ht;
/* one-time initialization state; see qsp_init() / qsp_init__slowpath() */
static bool qsp_initialized, qsp_initializing;

/* printable name for each QSPType value, used when generating reports */
static const char * const qsp_typenames[] = {
    [QSP_MUTEX]     = "mutex",
    [QSP_REC_MUTEX] = "rec_mutex",
    [QSP_CONDVAR]   = "condvar",
};

/*
 * Hot-swappable function pointers. They default to the uninstrumented
 * *_impl functions; qsp_enable() redirects them to the qsp_* wrappers
 * and qsp_disable() restores them.
 */
QemuMutexLockFunc qemu_mutex_lock_func = qemu_mutex_lock_impl;
QemuMutexTrylockFunc qemu_mutex_trylock_func = qemu_mutex_trylock_impl;
QemuRecMutexLockFunc qemu_rec_mutex_lock_func = qemu_rec_mutex_lock_impl;
QemuRecMutexTrylockFunc qemu_rec_mutex_trylock_func =
    qemu_rec_mutex_trylock_impl;
QemuCondWaitFunc qemu_cond_wait_func = qemu_cond_wait_impl;

125/*
126 * It pays off to _not_ hash callsite->file; hashing a string is slow, and
127 * without it we still get a pretty unique hash.
128 */
129static inline
130uint32_t do_qsp_callsite_hash(const QSPCallSite *callsite, uint64_t a)
131{
132 uint64_t b = (uint64_t)(uintptr_t)callsite->obj;
133 uint32_t e = callsite->line;
134 uint32_t f = callsite->type;
135
136 return tb_hash_func7(a, b, e, f, 0);
137}
138
139static inline
140uint32_t qsp_callsite_hash(const QSPCallSite *callsite)
141{
142 return do_qsp_callsite_hash(callsite, 0);
143}
144
145static inline uint32_t do_qsp_entry_hash(const QSPEntry *entry, uint64_t a)
146{
147 return do_qsp_callsite_hash(entry->callsite, a);
148}
149
150static uint32_t qsp_entry_hash(const QSPEntry *entry)
151{
152 return do_qsp_entry_hash(entry, (uint64_t)(uintptr_t)entry->thread_ptr);
153}
154
155static uint32_t qsp_entry_no_thread_hash(const QSPEntry *entry)
156{
157 return do_qsp_entry_hash(entry, 0);
158}
159
160static bool qsp_callsite_cmp(const void *ap, const void *bp)
161{
162 const QSPCallSite *a = ap;
163 const QSPCallSite *b = bp;
164
165 return a == b ||
166 (a->obj == b->obj &&
167 a->line == b->line &&
168 a->type == b->type &&
169 (a->file == b->file || !strcmp(a->file, b->file)));
170}
171
172static bool qsp_entry_no_thread_cmp(const void *ap, const void *bp)
173{
174 const QSPEntry *a = ap;
175 const QSPEntry *b = bp;
176
177 return qsp_callsite_cmp(a->callsite, b->callsite);
178}
179
180static bool qsp_entry_cmp(const void *ap, const void *bp)
181{
182 const QSPEntry *a = ap;
183 const QSPEntry *b = bp;
184
185 return a->thread_ptr == b->thread_ptr &&
186 qsp_callsite_cmp(a->callsite, b->callsite);
187}
188
/*
 * Normally we'd call this from a constructor function, but we want it to work
 * via libutil as well.
 */
static void qsp_do_init(void)
{
    /* make sure this file's path in the tree is up to date with QSP_REL_PATH */
    g_assert(strstr(__FILE__, QSP_REL_PATH));
    /* length of the prefix to strip from call-site paths; see qsp_at() */
    qsp_qemu_path_len = strlen(__FILE__) - strlen(QSP_REL_PATH);

    /* raw mutexes: these tables are accessed from the profiled lock wrappers */
    qht_init(&qsp_ht, qsp_entry_cmp, QSP_INITIAL_SIZE,
             QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
    qht_init(&qsp_callsite_ht, qsp_callsite_cmp, QSP_INITIAL_SIZE,
             QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
}

/*
 * One-time initialization: the thread that wins the cmpxchg on
 * qsp_initializing runs qsp_do_init() and then publishes qsp_initialized;
 * all other racing threads spin until the winner finishes.
 */
static __attribute__((noinline)) void qsp_init__slowpath(void)
{
    if (atomic_cmpxchg(&qsp_initializing, false, true) == false) {
        qsp_do_init();
        atomic_set(&qsp_initialized, true);
    } else {
        /* lost the race: wait for the initializing thread to finish */
        while (!atomic_read(&qsp_initialized)) {
            cpu_relax();
        }
    }
}

217/* qsp_init() must be called from _all_ exported functions */
218static inline void qsp_init(void)
219{
220 if (likely(atomic_read(&qsp_initialized))) {
221 return;
222 }
223 qsp_init__slowpath();
224}
225
226static QSPCallSite *qsp_callsite_find(const QSPCallSite *orig)
227{
228 QSPCallSite *callsite;
229 uint32_t hash;
230
231 hash = qsp_callsite_hash(orig);
232 callsite = qht_lookup(&qsp_callsite_ht, orig, hash);
233 if (callsite == NULL) {
234 void *existing = NULL;
235
236 callsite = g_new(QSPCallSite, 1);
237 memcpy(callsite, orig, sizeof(*callsite));
238 qht_insert(&qsp_callsite_ht, callsite, hash, &existing);
239 if (unlikely(existing)) {
240 g_free(callsite);
241 callsite = existing;
242 }
243 }
244 return callsite;
245}
246
247static QSPEntry *
248qsp_entry_create(struct qht *ht, const QSPEntry *entry, uint32_t hash)
249{
250 QSPEntry *e;
251 void *existing = NULL;
252
253 e = g_new0(QSPEntry, 1);
254 e->thread_ptr = entry->thread_ptr;
255 e->callsite = qsp_callsite_find(entry->callsite);
256
257 qht_insert(ht, e, hash, &existing);
258 if (unlikely(existing)) {
259 g_free(e);
260 e = existing;
261 }
262 return e;
263}
264
265static QSPEntry *
266qsp_entry_find(struct qht *ht, const QSPEntry *entry, uint32_t hash)
267{
268 QSPEntry *e;
269
270 e = qht_lookup(ht, entry, hash);
271 if (e == NULL) {
272 e = qsp_entry_create(ht, entry, hash);
273 }
274 return e;
275}
276
277/*
278 * Note: Entries are never removed, so callers do not have to be in an RCU
279 * read-side critical section.
280 */
281static QSPEntry *qsp_entry_get(const void *obj, const char *file, int line,
282 enum QSPType type)
283{
284 QSPCallSite callsite = {
285 .obj = obj,
286 .file = file,
287 .line = line,
288 .type = type,
289 };
290 QSPEntry orig;
291 uint32_t hash;
292
293 qsp_init();
294
295 orig.thread_ptr = &qsp_thread;
296 orig.callsite = &callsite;
297
298 hash = qsp_entry_hash(&orig);
299 return qsp_entry_find(&qsp_ht, &orig, hash);
300}
301
/*
 * Accumulate @from's counters into @to.
 * @from is in the global hash table; read it atomically if the host
 * supports it, otherwise use the seqlock.
 */
static void qsp_entry_aggregate(QSPEntry *to, const QSPEntry *from)
{
#ifdef CONFIG_ATOMIC64
    to->ns += atomic_read__nocheck(&from->ns);
    to->n_acqs += atomic_read__nocheck(&from->n_acqs);
#else
    unsigned int version;
    uint64_t ns, n_acqs;

    /* retry until we read a consistent (ns, n_acqs) pair */
    do {
        version = seqlock_read_begin(&from->sequence);
        ns = atomic_read__nocheck(&from->ns);
        n_acqs = atomic_read__nocheck(&from->n_acqs);
    } while (seqlock_read_retry(&from->sequence, version));

    to->ns += ns;
    to->n_acqs += n_acqs;
#endif
}

/*
 * Record @delta ns of wait time into @e; bump n_acqs only if @acq, i.e.
 * only when the profiled call actually acquired the primitive.
 *
 * @e is in the global hash table; it is only written to by the current thread,
 * so we write to it atomically (as in "write once") to prevent torn reads.
 * If the host doesn't support u64 atomics, use the seqlock.
 */
static inline void do_qsp_entry_record(QSPEntry *e, int64_t delta, bool acq)
{
#ifndef CONFIG_ATOMIC64
    seqlock_write_begin(&e->sequence);
#endif
    atomic_set__nocheck(&e->ns, e->ns + delta);
    if (acq) {
        atomic_set__nocheck(&e->n_acqs, e->n_acqs + 1);
    }
#ifndef CONFIG_ATOMIC64
    seqlock_write_end(&e->sequence);
#endif
}

/* record @delta for a call that is known to have acquired the primitive */
static inline void qsp_entry_record(QSPEntry *e, int64_t delta)
{
    do_qsp_entry_record(e, delta, true);
}

/*
 * QSP_GEN_VOID: generate a profiling wrapper func_ around the
 * void-returning impl_: time the call and record the elapsed time as a
 * successful acquisition at the call site.
 */
#define QSP_GEN_VOID(type_, qsp_t_, func_, impl_)                       \
    static void func_(type_ *obj, const char *file, int line)           \
    {                                                                   \
        QSPEntry *e;                                                    \
        int64_t t0, t1;                                                 \
                                                                        \
        t0 = get_clock();                                               \
        impl_(obj, file, line);                                         \
        t1 = get_clock();                                               \
                                                                        \
        e = qsp_entry_get(obj, file, line, qsp_t_);                     \
        qsp_entry_record(e, t1 - t0);                                   \
    }

/*
 * QSP_GEN_RET1: like QSP_GEN_VOID, but for impl_ functions that return an
 * int error code (e.g. trylock): count an acquisition only on success
 * (err == 0), while always recording the elapsed time.
 */
#define QSP_GEN_RET1(type_, qsp_t_, func_, impl_)                       \
    static int func_(type_ *obj, const char *file, int line)            \
    {                                                                   \
        QSPEntry *e;                                                    \
        int64_t t0, t1;                                                 \
        int err;                                                        \
                                                                        \
        t0 = get_clock();                                               \
        err = impl_(obj, file, line);                                   \
        t1 = get_clock();                                               \
                                                                        \
        e = qsp_entry_get(obj, file, line, qsp_t_);                     \
        do_qsp_entry_record(e, t1 - t0, !err);                          \
        return err;                                                     \
    }

/* instantiate the profiled wrappers installed by qsp_enable() */
QSP_GEN_VOID(QemuMutex, QSP_MUTEX, qsp_mutex_lock, qemu_mutex_lock_impl)
QSP_GEN_RET1(QemuMutex, QSP_MUTEX, qsp_mutex_trylock, qemu_mutex_trylock_impl)

QSP_GEN_VOID(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_lock,
             qemu_rec_mutex_lock_impl)
QSP_GEN_RET1(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_trylock,
             qemu_rec_mutex_trylock_impl)

#undef QSP_GEN_RET1
#undef QSP_GEN_VOID

391static void
392qsp_cond_wait(QemuCond *cond, QemuMutex *mutex, const char *file, int line)
393{
394 QSPEntry *e;
395 int64_t t0, t1;
396
397 t0 = get_clock();
398 qemu_cond_wait_impl(cond, mutex, file, line);
399 t1 = get_clock();
400
401 e = qsp_entry_get(cond, file, line, QSP_CONDVAR);
402 qsp_entry_record(e, t1 - t0);
403}
404
/* profiling is active iff the mutex-lock pointer targets the qsp wrapper */
bool qsp_is_enabled(void)
{
    return atomic_read(&qemu_mutex_lock_func) == qsp_mutex_lock;
}

/* start profiling: redirect the hot-swappable pointers to the qsp wrappers */
void qsp_enable(void)
{
    atomic_set(&qemu_mutex_lock_func, qsp_mutex_lock);
    atomic_set(&qemu_mutex_trylock_func, qsp_mutex_trylock);
    atomic_set(&qemu_rec_mutex_lock_func, qsp_rec_mutex_lock);
    atomic_set(&qemu_rec_mutex_trylock_func, qsp_rec_mutex_trylock);
    atomic_set(&qemu_cond_wait_func, qsp_cond_wait);
}

/* stop profiling: restore the uninstrumented implementations */
void qsp_disable(void)
{
    atomic_set(&qemu_mutex_lock_func, qemu_mutex_lock_impl);
    atomic_set(&qemu_mutex_trylock_func, qemu_mutex_trylock_impl);
    atomic_set(&qemu_rec_mutex_lock_func, qemu_rec_mutex_lock_impl);
    atomic_set(&qemu_rec_mutex_trylock_func, qemu_rec_mutex_trylock_impl);
    atomic_set(&qemu_cond_wait_func, qemu_cond_wait_impl);
}

428static gint qsp_tree_cmp(gconstpointer ap, gconstpointer bp, gpointer up)
429{
430 const QSPEntry *a = ap;
431 const QSPEntry *b = bp;
0a22777c 432 enum QSPSortBy sort_by = *(enum QSPSortBy *)up;
fe9959a2
EC
433 const QSPCallSite *ca;
434 const QSPCallSite *cb;
435
0a22777c
EC
436 switch (sort_by) {
437 case QSP_SORT_BY_TOTAL_WAIT_TIME:
438 if (a->ns > b->ns) {
439 return -1;
440 } else if (a->ns < b->ns) {
441 return 1;
442 }
443 break;
444 case QSP_SORT_BY_AVG_WAIT_TIME:
445 {
446 double avg_a = a->n_acqs ? a->ns / a->n_acqs : 0;
447 double avg_b = b->n_acqs ? b->ns / b->n_acqs : 0;
448
449 if (avg_a > avg_b) {
450 return -1;
451 } else if (avg_a < avg_b) {
452 return 1;
453 }
454 break;
fe9959a2 455 }
0a22777c
EC
456 default:
457 g_assert_not_reached();
458 }
459
fe9959a2
EC
460 ca = a->callsite;
461 cb = b->callsite;
462 /* Break the tie with the object's address */
463 if (ca->obj < cb->obj) {
464 return -1;
465 } else if (ca->obj > cb->obj) {
466 return 1;
467 } else {
468 int cmp;
469
470 /* same obj. Break the tie with the callsite's file */
471 cmp = strcmp(ca->file, cb->file);
472 if (cmp) {
473 return cmp;
474 }
475 /* same callsite file. Break the tie with the callsite's line */
476 g_assert(ca->line != cb->line);
477 if (ca->line < cb->line) {
478 return -1;
479 } else if (ca->line > cb->line) {
480 return 1;
481 } else {
482 /* break the tie with the callsite's type */
483 return cb->type - ca->type;
484 }
485 }
486}
487
488static void qsp_sort(struct qht *ht, void *p, uint32_t h, void *userp)
489{
490 QSPEntry *e = p;
491 GTree *tree = userp;
492
493 g_tree_insert(tree, e, NULL);
494}
495
496static void qsp_aggregate(struct qht *global_ht, void *p, uint32_t h, void *up)
497{
498 struct qht *ht = up;
499 const QSPEntry *e = p;
500 QSPEntry *agg;
501 uint32_t hash;
502
503 hash = qsp_entry_no_thread_hash(e);
504 agg = qsp_entry_find(ht, e, hash);
505 qsp_entry_aggregate(agg, e);
506}
507
508static void qsp_mktree(GTree *tree)
509{
510 struct qht ht;
511
512 /* Aggregate all results from the global hash table into a local one */
513 qht_init(&ht, qsp_entry_no_thread_cmp, QSP_INITIAL_SIZE,
514 QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
515 qht_iter(&qsp_ht, qsp_aggregate, &ht);
516
517 /* sort the hash table elements by using a tree */
518 qht_iter(&ht, qsp_sort, tree);
519
520 /* free the hash table, but keep the elements (those are in the tree now) */
521 qht_destroy(&ht);
522}
523
524/* free string with g_free */
525static char *qsp_at(const QSPCallSite *callsite)
526{
527 GString *s = g_string_new(NULL);
528 const char *shortened;
529
530 /* remove the absolute path to qemu */
531 if (unlikely(strlen(callsite->file) < qsp_qemu_path_len)) {
532 shortened = callsite->file;
533 } else {
534 shortened = callsite->file + qsp_qemu_path_len;
535 }
536 g_string_append_printf(s, "%s:%u", shortened, callsite->line);
537 return g_string_free(s, FALSE);
538}
539
/* one report row: aggregated numbers for a single call site */
struct QSPReportEntry {
    const void *obj;        /* synchronization object's address */
    char *callsite_at;      /* "file:line" string; owned, freed by report_destroy() */
    const char *typename;   /* static string from qsp_typenames[] */
    double time_s;          /* total wait time, in seconds */
    double ns_avg;          /* average wait time per acquisition, in ns */
    uint64_t n_acqs;        /* number of acquisitions */
};
typedef struct QSPReportEntry QSPReportEntry;

/* a report: a fixed-capacity array of rows, filled in sorted order */
struct QSPReport {
    QSPReportEntry *entries;
    size_t n_entries;       /* rows filled in so far */
    size_t max_n_entries;   /* capacity of @entries */
};
typedef struct QSPReport QSPReport;

557static gboolean qsp_tree_report(gpointer key, gpointer value, gpointer udata)
558{
559 const QSPEntry *e = key;
560 QSPReport *report = udata;
561 QSPReportEntry *entry;
562
563 if (report->n_entries == report->max_n_entries) {
564 return TRUE;
565 }
566 entry = &report->entries[report->n_entries];
567 report->n_entries++;
568
569 entry->obj = e->callsite->obj;
570 entry->callsite_at = qsp_at(e->callsite);
571 entry->typename = qsp_typenames[e->callsite->type];
572 entry->time_s = e->ns * 1e-9;
573 entry->n_acqs = e->n_acqs;
574 entry->ns_avg = e->n_acqs ? e->ns / e->n_acqs : 0;
575 return FALSE;
576}
577
578static void
579pr_report(const QSPReport *rep, FILE *f, fprintf_function pr)
580{
581 char *dashes;
582 size_t max_len = 0;
583 int callsite_len = 0;
584 int callsite_rspace;
585 int n_dashes;
586 size_t i;
587
588 /* find out the maximum length of all 'callsite' fields */
589 for (i = 0; i < rep->n_entries; i++) {
590 const QSPReportEntry *e = &rep->entries[i];
591 size_t len = strlen(e->callsite_at);
592
593 if (len > max_len) {
594 max_len = len;
595 }
596 }
597
598 callsite_len = MAX(max_len, strlen("Call site"));
599 /* white space to leave to the right of "Call site" */
600 callsite_rspace = callsite_len - strlen("Call site");
601
602 pr(f, "Type Object Call site%*s Wait Time (s) "
603 " Count Average (us)\n", callsite_rspace, "");
604
605 /* build a horizontal rule with dashes */
606 n_dashes = 79 + callsite_rspace;
607 dashes = g_malloc(n_dashes + 1);
608 memset(dashes, '-', n_dashes);
609 dashes[n_dashes] = '\0';
610 pr(f, "%s\n", dashes);
611
612 for (i = 0; i < rep->n_entries; i++) {
613 const QSPReportEntry *e = &rep->entries[i];
614
615 pr(f, "%-9s %14p %s%*s %13.5f %12" PRIu64 " %12.2f\n", e->typename,
616 e->obj, e->callsite_at, callsite_len - (int)strlen(e->callsite_at),
617 "", e->time_s, e->n_acqs, e->ns_avg * 1e-3);
618 }
619
620 pr(f, "%s\n", dashes);
621 g_free(dashes);
622}
623
624static void report_destroy(QSPReport *rep)
625{
626 size_t i;
627
628 for (i = 0; i < rep->n_entries; i++) {
629 QSPReportEntry *e = &rep->entries[i];
630
631 g_free(e->callsite_at);
632 }
633 g_free(rep->entries);
634}
635
0a22777c
EC
636void qsp_report(FILE *f, fprintf_function cpu_fprintf, size_t max,
637 enum QSPSortBy sort_by)
fe9959a2 638{
0a22777c 639 GTree *tree = g_tree_new_full(qsp_tree_cmp, &sort_by, g_free, NULL);
fe9959a2
EC
640 QSPReport rep;
641
642 qsp_init();
643
644 rep.entries = g_new0(QSPReportEntry, max);
645 rep.n_entries = 0;
646 rep.max_n_entries = max;
647
648 qsp_mktree(tree);
649 g_tree_foreach(tree, qsp_tree_report, &rep);
650 g_tree_destroy(tree);
651
652 pr_report(&rep, f, cpu_fprintf);
653 report_destroy(&rep);
654}