#include "qemu/osdep.h"
#include "trace.h"
-#include "qemu-common.h"
#include "qemu/thread.h"
#include "qemu/atomic.h"
-#include "qemu/coroutine.h"
#include "qemu/coroutine_int.h"
+#include "qemu/coroutine-tls.h"
+#include "qemu/cutils.h"
#include "block/aio.h"
enum {
- POOL_BATCH_SIZE = 64,
+ COROUTINE_POOL_BATCH_MAX_SIZE = 128,
};
-/** Free list to speed up creation */
-static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
-static unsigned int release_pool_size;
-static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
-static __thread unsigned int alloc_pool_size;
-static __thread Notifier coroutine_pool_cleanup_notifier;
+/*
+ * Coroutine creation and deletion are expensive, so a pool of unused coroutines
+ * is kept as a cache. When the pool has coroutines available, they are
+ * recycled instead of creating new ones from scratch. Coroutines are added to
+ * the pool upon termination.
+ *
+ * The pool is global but each thread maintains a small local pool to avoid
+ * global pool contention. Threads fetch and return batches of coroutines from
+ * the global pool to maintain their local pool. The local pool holds up to two
+ * batches whereas the maximum size of the global pool is controlled by the
+ * qemu_coroutine_inc_pool_size() API.
+ *
+ * .-----------------------------------.
+ * | Batch 1 | Batch 2 | Batch 3 | ... | global_pool
+ * `-----------------------------------'
+ *
+ * .-------------------.
+ * | Batch 1 | Batch 2 | per-thread local_pool (maximum 2 batches)
+ * `-------------------'
+ */
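+
+/*
+ * Fast-path summary of the code below: qemu_coroutine_create() first tries
+ * coroutine_pool_get_local(); on a miss, coroutine_pool_refill_local() moves
+ * one batch from the global pool into the local pool. Terminated coroutines
+ * come back through coroutine_pool_put(), which evicts a full batch to the
+ * global pool once the local pool reaches its two-batch capacity.
+ */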
+typedef struct CoroutinePoolBatch {
+ /* Batches are kept in a list */
+ QSLIST_ENTRY(CoroutinePoolBatch) next;
+
+ /* This batch holds up to @COROUTINE_POOL_BATCH_MAX_SIZE coroutines */
+ QSLIST_HEAD(, Coroutine) list;
+ unsigned int size;
+} CoroutinePoolBatch;
+
+typedef QSLIST_HEAD(, CoroutinePoolBatch) CoroutinePool;
+
+/* Host operating system limit on number of pooled coroutines */
+static unsigned int global_pool_hard_max_size;
+static QemuMutex global_pool_lock; /* protects the following variables */
+static CoroutinePool global_pool = QSLIST_HEAD_INITIALIZER(global_pool);
+static unsigned int global_pool_size;
+static unsigned int global_pool_max_size = COROUTINE_POOL_BATCH_MAX_SIZE;
+
+QEMU_DEFINE_STATIC_CO_TLS(CoroutinePool, local_pool);
+QEMU_DEFINE_STATIC_CO_TLS(Notifier, local_pool_cleanup_notifier);
+
+static CoroutinePoolBatch *coroutine_pool_batch_new(void)
+{
+ CoroutinePoolBatch *batch = g_new(CoroutinePoolBatch, 1);
+
+ QSLIST_INIT(&batch->list);
+ batch->size = 0;
+ return batch;
+}
+
-static void coroutine_pool_cleanup(Notifier *n, void *value)
+static void coroutine_pool_batch_delete(CoroutinePoolBatch *batch)
{
Coroutine *co;
Coroutine *tmp;
- QSLIST_FOREACH_SAFE(co, &alloc_pool, pool_next, tmp) {
- QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
+ QSLIST_FOREACH_SAFE(co, &batch->list, pool_next, tmp) {
+ QSLIST_REMOVE_HEAD(&batch->list, pool_next);
qemu_coroutine_delete(co);
}
+ g_free(batch);
}
-Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
+static void local_pool_cleanup(Notifier *n, void *value)
{
- Coroutine *co = NULL;
+ CoroutinePool *local_pool = get_ptr_local_pool();
+ CoroutinePoolBatch *batch;
+ CoroutinePoolBatch *tmp;
+
+ QSLIST_FOREACH_SAFE(batch, local_pool, next, tmp) {
+ QSLIST_REMOVE_HEAD(local_pool, next);
+ coroutine_pool_batch_delete(batch);
+ }
+}
+
+/* Ensure the atexit notifier is registered */
+static void local_pool_cleanup_init_once(void)
+{
+ Notifier *notifier = get_ptr_local_pool_cleanup_notifier();
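+    /*
+     * The notifier lives in thread-local storage, so this unlocked check is
+     * safe: notifier->notify doubles as the "already registered" flag and
+     * only the current thread ever accesses it.
+     */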
+ if (!notifier->notify) {
+ notifier->notify = local_pool_cleanup;
+ qemu_thread_atexit_add(notifier);
+ }
+}
- if (CONFIG_COROUTINE_POOL) {
- co = QSLIST_FIRST(&alloc_pool);
- if (!co) {
- if (release_pool_size > POOL_BATCH_SIZE) {
- /* Slow path; a good place to register the destructor, too. */
- if (!coroutine_pool_cleanup_notifier.notify) {
- coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup;
- qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier);
- }
-
- /* This is not exact; there could be a little skew between
- * release_pool_size and the actual size of release_pool. But
- * it is just a heuristic, it does not need to be perfect.
- */
- alloc_pool_size = atomic_xchg(&release_pool_size, 0);
- QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
- co = QSLIST_FIRST(&alloc_pool);
- }
+/* Helper to get the next unused coroutine from the local pool */
+static Coroutine *coroutine_pool_get_local(void)
+{
+ CoroutinePool *local_pool = get_ptr_local_pool();
+ CoroutinePoolBatch *batch = QSLIST_FIRST(local_pool);
+ Coroutine *co;
+
+ if (unlikely(!batch)) {
+ return NULL;
+ }
+
+ co = QSLIST_FIRST(&batch->list);
+ QSLIST_REMOVE_HEAD(&batch->list, pool_next);
+ batch->size--;
+
+ if (batch->size == 0) {
+ QSLIST_REMOVE_HEAD(local_pool, next);
+ coroutine_pool_batch_delete(batch);
+ }
+ return co;
+}
+
+/* Get the next batch from the global pool */
+static void coroutine_pool_refill_local(void)
+{
+ CoroutinePool *local_pool = get_ptr_local_pool();
+ CoroutinePoolBatch *batch;
+
+ WITH_QEMU_LOCK_GUARD(&global_pool_lock) {
+ batch = QSLIST_FIRST(&global_pool);
+
+ if (batch) {
+ QSLIST_REMOVE_HEAD(&global_pool, next);
+ global_pool_size -= batch->size;
}
- if (co) {
- QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
- alloc_pool_size--;
+ }
+
+ if (batch) {
+ QSLIST_INSERT_HEAD(local_pool, batch, next);
+ local_pool_cleanup_init_once();
+ }
+}
+
+/* Add a batch of coroutines to the global pool */
+static void coroutine_pool_put_global(CoroutinePoolBatch *batch)
+{
+ WITH_QEMU_LOCK_GUARD(&global_pool_lock) {
+ unsigned int max = MIN(global_pool_max_size,
+ global_pool_hard_max_size);
+
+ if (global_pool_size < max) {
+ QSLIST_INSERT_HEAD(&global_pool, batch, next);
+
+ /* Overshooting the max pool size is allowed */
+ global_pool_size += batch->size;
+ return;
}
}
+ /* The global pool was full, so throw away this batch */
+ coroutine_pool_batch_delete(batch);
+}
+
+/* Get the next unused coroutine from the pool or return NULL */
+static Coroutine *coroutine_pool_get(void)
+{
+ Coroutine *co;
+
+ co = coroutine_pool_get_local();
+ if (!co) {
+ coroutine_pool_refill_local();
+ co = coroutine_pool_get_local();
+ }
+ return co;
+}
+
+static void coroutine_pool_put(Coroutine *co)
+{
+ CoroutinePool *local_pool = get_ptr_local_pool();
+ CoroutinePoolBatch *batch = QSLIST_FIRST(local_pool);
+
+ if (unlikely(!batch)) {
+ batch = coroutine_pool_batch_new();
+ QSLIST_INSERT_HEAD(local_pool, batch, next);
+ local_pool_cleanup_init_once();
+ }
+
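+    /*
+     * The head batch fills first. Once it is full, evict it to the global
+     * pool if a second batch already exists (the local pool is then at its
+     * two-batch capacity) and start a fresh head batch.
+     */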
+ if (unlikely(batch->size >= COROUTINE_POOL_BATCH_MAX_SIZE)) {
+ CoroutinePoolBatch *next = QSLIST_NEXT(batch, next);
+
+ /* Is the local pool full? */
+ if (next) {
+ QSLIST_REMOVE_HEAD(local_pool, next);
+ coroutine_pool_put_global(batch);
+ }
+
+ batch = coroutine_pool_batch_new();
+ QSLIST_INSERT_HEAD(local_pool, batch, next);
+ }
+
+ QSLIST_INSERT_HEAD(&batch->list, co, pool_next);
+ batch->size++;
+}
+
+Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
+{
+ Coroutine *co = NULL;
+
+ if (IS_ENABLED(CONFIG_COROUTINE_POOL)) {
+ co = coroutine_pool_get();
+ }
+
if (!co) {
co = qemu_coroutine_new();
}
    co->entry = entry;
    co->entry_arg = opaque;
    QSIMPLEQ_INIT(&co->co_queue_wakeup);
    return co;
}

static void coroutine_delete(Coroutine *co)
{
co->caller = NULL;
- if (CONFIG_COROUTINE_POOL) {
- if (release_pool_size < POOL_BATCH_SIZE * 2) {
- QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
- atomic_inc(&release_pool_size);
- return;
- }
- if (alloc_pool_size < POOL_BATCH_SIZE) {
- QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
- alloc_pool_size++;
- return;
- }
+ if (IS_ENABLED(CONFIG_COROUTINE_POOL)) {
+ coroutine_pool_put(co);
+ } else {
+ qemu_coroutine_delete(co);
}
-
- qemu_coroutine_delete(co);
}
-void qemu_coroutine_enter(Coroutine *co)
+void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co)
{
- Coroutine *self = qemu_coroutine_self();
- CoroutineAction ret;
+ QSIMPLEQ_HEAD(, Coroutine) pending = QSIMPLEQ_HEAD_INITIALIZER(pending);
+ Coroutine *from = qemu_coroutine_self();
- trace_qemu_coroutine_enter(self, co, co->entry_arg);
+ QSIMPLEQ_INSERT_TAIL(&pending, co, co_queue_next);
- if (co->caller) {
- fprintf(stderr, "Co-routine re-entered recursively\n");
- abort();
- }
+ /* Run co and any queued coroutines */
+ while (!QSIMPLEQ_EMPTY(&pending)) {
+ Coroutine *to = QSIMPLEQ_FIRST(&pending);
+ CoroutineAction ret;
- co->caller = self;
- co->ctx = qemu_get_current_aio_context();
+ /*
+ * Read to before to->scheduled; pairs with qatomic_cmpxchg in
+ * qemu_co_sleep(), aio_co_schedule() etc.
+ */
+ smp_read_barrier_depends();
- /* Store co->ctx before anything that stores co. Matches
- * barrier in aio_co_wake and qemu_co_mutex_wake.
- */
- smp_wmb();
+ const char *scheduled = qatomic_read(&to->scheduled);
- ret = qemu_coroutine_switch(self, co, COROUTINE_ENTER);
+ QSIMPLEQ_REMOVE_HEAD(&pending, co_queue_next);
- qemu_co_queue_run_restart(co);
+ trace_qemu_aio_coroutine_enter(ctx, from, to, to->entry_arg);
- switch (ret) {
- case COROUTINE_YIELD:
- return;
- case COROUTINE_TERMINATE:
- assert(!co->locks_held);
- trace_qemu_coroutine_terminate(co);
- coroutine_delete(co);
- return;
- default:
- abort();
+ /* if the Coroutine has already been scheduled, entering it again will
+ * cause us to enter it twice, potentially even after the coroutine has
+ * been deleted */
+ if (scheduled) {
+ fprintf(stderr,
+ "%s: Co-routine was already scheduled in '%s'\n",
+ __func__, scheduled);
+ abort();
+ }
+
+ if (to->caller) {
+ fprintf(stderr, "Co-routine re-entered recursively\n");
+ abort();
+ }
+
+ to->caller = from;
+ to->ctx = ctx;
+
+ /* Store to->ctx before anything that stores to. Matches
+ * barrier in aio_co_wake and qemu_co_mutex_wake.
+ */
+ smp_wmb();
+
+ ret = qemu_coroutine_switch(from, to, COROUTINE_ENTER);
+
+ /* Queued coroutines are run depth-first; previously pending coroutines
+ * run after those queued more recently.
+ */
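+        /*
+         * Illustrative trace (not part of the original comment): if A wakes B
+         * and then C, and B wakes D, the coroutines run in the order
+         * A, B, D, C.
+         */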
+ QSIMPLEQ_PREPEND(&pending, &to->co_queue_wakeup);
+
+ switch (ret) {
+ case COROUTINE_YIELD:
+ break;
+ case COROUTINE_TERMINATE:
+ assert(!to->locks_held);
+ trace_qemu_coroutine_terminate(to);
+ coroutine_delete(to);
+ break;
+ default:
+ abort();
+ }
}
}
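
+/*
+ * Typical usage, as an illustrative sketch (my_coroutine_fn and opaque are
+ * hypothetical, not defined in this file):
+ *
+ *     Coroutine *co = qemu_coroutine_create(my_coroutine_fn, opaque);
+ *     qemu_coroutine_enter(co);
+ */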
+void qemu_coroutine_enter(Coroutine *co)
+{
+ qemu_aio_coroutine_enter(qemu_get_current_aio_context(), co);
+}
+
void qemu_coroutine_enter_if_inactive(Coroutine *co)
{
if (!qemu_coroutine_entered(co)) {
        qemu_coroutine_enter(co);
    }
}

bool qemu_coroutine_entered(Coroutine *co)
{
return co->caller;
}
+
+AioContext *qemu_coroutine_get_aio_context(Coroutine *co)
+{
+ return co->ctx;
+}
+
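+/*
+ * Illustrative usage (hypothetical caller, not part of this patch): a device
+ * that keeps many requests in flight can reserve pool space up front, e.g.
+ *
+ *     qemu_coroutine_inc_pool_size(num_queues * queue_size);
+ *
+ * and drop the reservation with qemu_coroutine_dec_pool_size() when the
+ * device is torn down.
+ */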
+void qemu_coroutine_inc_pool_size(unsigned int additional_pool_size)
+{
+ QEMU_LOCK_GUARD(&global_pool_lock);
+ global_pool_max_size += additional_pool_size;
+}
+
+void qemu_coroutine_dec_pool_size(unsigned int removing_pool_size)
+{
+ QEMU_LOCK_GUARD(&global_pool_lock);
+ global_pool_max_size -= removing_pool_size;
+}
+
+static unsigned int get_global_pool_hard_max_size(void)
+{
+#ifdef __linux__
+ g_autofree char *contents = NULL;
+ int max_map_count;
+
+ /*
+ * Linux processes can have up to max_map_count virtual memory areas
+     * (VMAs). mmap(2), mprotect(2), etc. fail with ENOMEM beyond this limit. We
+ * must limit the coroutine pool to a safe size to avoid running out of
+ * VMAs.
+ */
+ if (g_file_get_contents("/proc/sys/vm/max_map_count", &contents, NULL,
+ NULL) &&
+ qemu_strtoi(contents, NULL, 10, &max_map_count) == 0) {
+ /*
+ * This is an upper bound that avoids exceeding max_map_count. Leave a
+ * fixed amount for non-coroutine users like library dependencies,
+ * vhost-user, etc. Each coroutine takes up 2 VMAs so halve the
+ * remaining amount.
+ */
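+        /*
+         * Worked example, assuming the common Linux default
+         * vm.max_map_count = 65530: (65530 - 5000) / 2 = 30265 pooled
+         * coroutines at most.
+         */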
+ if (max_map_count > 5000) {
+ return (max_map_count - 5000) / 2;
+ } else {
+ /* Disable the global pool but threads still have local pools */
+ return 0;
+ }
+ }
+#endif
+
+ return UINT_MAX;
+}
+
+static void __attribute__((constructor)) qemu_coroutine_init(void)
+{
+ qemu_mutex_init(&global_pool_lock);
+ global_pool_hard_max_size = get_global_pool_hard_max_size();
+}