]> git.proxmox.com Git - pve-qemu.git/blob - debian/patches/pve/0037-PVE-Backup-Don-t-block-on-finishing-and-cleanup-crea.patch
fa4f2ce84bfcba8304f11339ed5bfc484a41e3f1
[pve-qemu.git] / debian / patches / pve / 0037-PVE-Backup-Don-t-block-on-finishing-and-cleanup-crea.patch
1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2 From: Stefan Reiter <s.reiter@proxmox.com>
3 Date: Mon, 28 Sep 2020 13:40:51 +0200
4 Subject: [PATCH] PVE-Backup: Don't block on finishing and cleanup
5 create_backup_jobs
6
7 proxmox_backup_co_finish is already async, but previously we would wait
8 for the coroutine using block_on_coroutine_fn(). Avoid this by
9 scheduling pvebackup_co_complete_stream (and thus pvebackup_co_cleanup)
10 as a real coroutine when calling from pvebackup_complete_cb. This is ok,
11 since complete_stream uses the backup_mutex internally to synchronize,
12 and other streams can happily continue writing in the meantime anyway.
13
14 To accommodate, backup_mutex is converted to a CoMutex. This means
15 converting every user to a coroutine. This is not just useful here, but
16 will come in handy once this series[0] is merged, and QMP calls can be
17 yield-able coroutines too. Then we can also finally get rid of
18 block_on_coroutine_fn.
19
20 Cases of aio_context_acquire/release from within what is now a coroutine
21 are changed to aio_co_reschedule_self, which works since a running
22 coroutine always holds the aio lock for the context it is running in.
23
24 job_cancel_sync is called from a BH since it can't be run from a
25 coroutine (uses AIO_WAIT_WHILE internally).
26
27 Same thing for create_backup_jobs, which is converted to a BH too.
28
29 To communicate the finishing state, a new property is introduced to
30 query-backup: 'finishing'. A new state is explicitly not used, since
31 that would break compatibility with older qemu-server versions.
32
33 Also fix create_backup_jobs:
34
35 No more weird bool returns, just the standard "errp" format used
36 everywhere else too. With this, if backup_job_create fails, the error
37 message is actually returned over QMP and can be shown to the user.
38
39 To facilitate correct cleanup on such an error, we call
40 create_backup_jobs as a bottom half directly from pvebackup_co_prepare.
41 This additionally allows us to actually hold the backup_mutex during
42 operation.
43
44 Also add a job_cancel_sync before job_unref, since a job must be in
45 STATUS_NULL to be deleted by unref, which could trigger an assert
46 before.
47
48 [0] https://lists.gnu.org/archive/html/qemu-devel/2020-09/msg03515.html
49
50 Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
51 ---
52 pve-backup.c | 217 ++++++++++++++++++++++++++++---------------
53 qapi/block-core.json | 5 +-
54 2 files changed, 144 insertions(+), 78 deletions(-)
55
56 diff --git a/pve-backup.c b/pve-backup.c
57 index 2e628d68e4..9c20ef3a5e 100644
58 --- a/pve-backup.c
59 +++ b/pve-backup.c
60 @@ -33,7 +33,9 @@ const char *PBS_BITMAP_NAME = "pbs-incremental-dirty-bitmap";
61
62 static struct PVEBackupState {
63 struct {
64 - // Everithing accessed from qmp_backup_query command is protected using lock
65 + // Everything accessed from qmp_backup_query command is protected using
66 + // this lock. Do NOT hold this lock for long times, as it is sometimes
67 + // acquired from coroutines, and thus any wait time may block the guest.
68 QemuMutex lock;
69 Error *error;
70 time_t start_time;
71 @@ -47,20 +49,22 @@ static struct PVEBackupState {
72 size_t reused;
73 size_t zero_bytes;
74 GList *bitmap_list;
75 + bool finishing;
76 + bool starting;
77 } stat;
78 int64_t speed;
79 VmaWriter *vmaw;
80 ProxmoxBackupHandle *pbs;
81 GList *di_list;
82 JobTxn *txn;
83 - QemuMutex backup_mutex;
84 + CoMutex backup_mutex;
85 CoMutex dump_callback_mutex;
86 } backup_state;
87
88 static void pvebackup_init(void)
89 {
90 qemu_mutex_init(&backup_state.stat.lock);
91 - qemu_mutex_init(&backup_state.backup_mutex);
92 + qemu_co_mutex_init(&backup_state.backup_mutex);
93 qemu_co_mutex_init(&backup_state.dump_callback_mutex);
94 }
95
96 @@ -72,6 +76,7 @@ typedef struct PVEBackupDevInfo {
97 size_t size;
98 uint64_t block_size;
99 uint8_t dev_id;
100 + int completed_ret; // INT_MAX if not completed
101 char targetfile[PATH_MAX];
102 BdrvDirtyBitmap *bitmap;
103 BlockDriverState *target;
104 @@ -227,12 +232,12 @@ pvebackup_co_dump_vma_cb(
105 }
106
107 // assumes the caller holds backup_mutex
108 -static void coroutine_fn pvebackup_co_cleanup(void *unused)
109 +static void coroutine_fn pvebackup_co_cleanup(void)
110 {
111 assert(qemu_in_coroutine());
112
113 qemu_mutex_lock(&backup_state.stat.lock);
114 - backup_state.stat.end_time = time(NULL);
115 + backup_state.stat.finishing = true;
116 qemu_mutex_unlock(&backup_state.stat.lock);
117
118 if (backup_state.vmaw) {
119 @@ -261,35 +266,29 @@ static void coroutine_fn pvebackup_co_cleanup(void *unused)
120
121 g_list_free(backup_state.di_list);
122 backup_state.di_list = NULL;
123 +
124 + qemu_mutex_lock(&backup_state.stat.lock);
125 + backup_state.stat.end_time = time(NULL);
126 + backup_state.stat.finishing = false;
127 + qemu_mutex_unlock(&backup_state.stat.lock);
128 }
129
130 -// assumes the caller holds backup_mutex
131 -static void coroutine_fn pvebackup_complete_stream(void *opaque)
132 +static void coroutine_fn pvebackup_co_complete_stream(void *opaque)
133 {
134 PVEBackupDevInfo *di = opaque;
135 + int ret = di->completed_ret;
136
137 - bool error_or_canceled = pvebackup_error_or_canceled();
138 -
139 - if (backup_state.vmaw) {
140 - vma_writer_close_stream(backup_state.vmaw, di->dev_id);
141 + qemu_mutex_lock(&backup_state.stat.lock);
142 + bool starting = backup_state.stat.starting;
143 + qemu_mutex_unlock(&backup_state.stat.lock);
144 + if (starting) {
145 + /* in 'starting' state, no tasks have been run yet, meaning we can (and
146 + * must) skip all cleanup, as we don't know what has and hasn't been
147 + * initialized yet. */
148 + return;
149 }
150
151 - if (backup_state.pbs && !error_or_canceled) {
152 - Error *local_err = NULL;
153 - proxmox_backup_co_close_image(backup_state.pbs, di->dev_id, &local_err);
154 - if (local_err != NULL) {
155 - pvebackup_propagate_error(local_err);
156 - }
157 - }
158 -}
159 -
160 -static void pvebackup_complete_cb(void *opaque, int ret)
161 -{
162 - assert(!qemu_in_coroutine());
163 -
164 - PVEBackupDevInfo *di = opaque;
165 -
166 - qemu_mutex_lock(&backup_state.backup_mutex);
167 + qemu_co_mutex_lock(&backup_state.backup_mutex);
168
169 if (ret < 0) {
170 Error *local_err = NULL;
171 @@ -301,7 +300,19 @@ static void pvebackup_complete_cb(void *opaque, int ret)
172
173 assert(di->target == NULL);
174
175 - block_on_coroutine_fn(pvebackup_complete_stream, di);
176 + bool error_or_canceled = pvebackup_error_or_canceled();
177 +
178 + if (backup_state.vmaw) {
179 + vma_writer_close_stream(backup_state.vmaw, di->dev_id);
180 + }
181 +
182 + if (backup_state.pbs && !error_or_canceled) {
183 + Error *local_err = NULL;
184 + proxmox_backup_co_close_image(backup_state.pbs, di->dev_id, &local_err);
185 + if (local_err != NULL) {
186 + pvebackup_propagate_error(local_err);
187 + }
188 + }
189
190 // remove self from job list
191 backup_state.di_list = g_list_remove(backup_state.di_list, di);
192 @@ -310,21 +321,49 @@ static void pvebackup_complete_cb(void *opaque, int ret)
193
194 /* call cleanup if we're the last job */
195 if (!g_list_first(backup_state.di_list)) {
196 - block_on_coroutine_fn(pvebackup_co_cleanup, NULL);
197 + pvebackup_co_cleanup();
198 }
199
200 - qemu_mutex_unlock(&backup_state.backup_mutex);
201 + qemu_co_mutex_unlock(&backup_state.backup_mutex);
202 }
203
204 -static void pvebackup_cancel(void)
205 +static void pvebackup_complete_cb(void *opaque, int ret)
206 {
207 - assert(!qemu_in_coroutine());
208 + PVEBackupDevInfo *di = opaque;
209 + di->completed_ret = ret;
210 +
211 + /*
212 + * Schedule stream cleanup in async coroutine. close_image and finish might
213 + * take a while, so we can't block on them here. This way it also doesn't
214 + * matter if we're already running in a coroutine or not.
215 + * Note: di is a pointer to an entry in the global backup_state struct, so
216 + * it stays valid.
217 + */
218 + Coroutine *co = qemu_coroutine_create(pvebackup_co_complete_stream, di);
219 + aio_co_enter(qemu_get_aio_context(), co);
220 +}
221 +
222 +/*
223 + * job_cancel(_sync) does not like to be called from coroutines, so defer to
224 + * main loop processing via a bottom half.
225 + */
226 +static void job_cancel_bh(void *opaque) {
227 + CoCtxData *data = (CoCtxData*)opaque;
228 + Job *job = (Job*)data->data;
229 + AioContext *job_ctx = job->aio_context;
230 + aio_context_acquire(job_ctx);
231 + job_cancel_sync(job);
232 + aio_context_release(job_ctx);
233 + aio_co_enter(data->ctx, data->co);
234 +}
235
236 +static void coroutine_fn pvebackup_co_cancel(void *opaque)
237 +{
238 Error *cancel_err = NULL;
239 error_setg(&cancel_err, "backup canceled");
240 pvebackup_propagate_error(cancel_err);
241
242 - qemu_mutex_lock(&backup_state.backup_mutex);
243 + qemu_co_mutex_lock(&backup_state.backup_mutex);
244
245 if (backup_state.vmaw) {
246 /* make sure vma writer does not block anymore */
247 @@ -342,27 +381,22 @@ static void pvebackup_cancel(void)
248 ((PVEBackupDevInfo *)bdi->data)->job :
249 NULL;
250
251 - /* ref the job before releasing the mutex, just to be safe */
252 if (cancel_job) {
253 - job_ref(&cancel_job->job);
254 + CoCtxData data = {
255 + .ctx = qemu_get_current_aio_context(),
256 + .co = qemu_coroutine_self(),
257 + .data = &cancel_job->job,
258 + };
259 + aio_bh_schedule_oneshot(data.ctx, job_cancel_bh, &data);
260 + qemu_coroutine_yield();
261 }
262
263 - /* job_cancel_sync may enter the job, so we need to release the
264 - * backup_mutex to avoid deadlock */
265 - qemu_mutex_unlock(&backup_state.backup_mutex);
266 -
267 - if (cancel_job) {
268 - AioContext *aio_context = cancel_job->job.aio_context;
269 - aio_context_acquire(aio_context);
270 - job_cancel_sync(&cancel_job->job);
271 - job_unref(&cancel_job->job);
272 - aio_context_release(aio_context);
273 - }
274 + qemu_co_mutex_unlock(&backup_state.backup_mutex);
275 }
276
277 void qmp_backup_cancel(Error **errp)
278 {
279 - pvebackup_cancel();
280 + block_on_coroutine_fn(pvebackup_co_cancel, NULL);
281 }
282
283 // assumes the caller holds backup_mutex
284 @@ -415,10 +449,18 @@ static int coroutine_fn pvebackup_co_add_config(
285 goto out;
286 }
287
288 -static bool create_backup_jobs(void) {
289 +/*
290 + * backup_job_create can *not* be run from a coroutine (and requires an
291 + * acquired AioContext), so this can't either.
292 + * The caller is responsible that backup_mutex is held nonetheless.
293 + */
294 +static void create_backup_jobs_bh(void *opaque) {
295
296 assert(!qemu_in_coroutine());
297
298 + CoCtxData *data = (CoCtxData*)opaque;
299 + Error **errp = (Error**)data->data;
300 +
301 Error *local_err = NULL;
302
303 /* create job transaction to synchronize bitmap commit and cancel all
304 @@ -454,24 +496,19 @@ static bool create_backup_jobs(void) {
305
306 aio_context_release(aio_context);
307
308 - if (!job || local_err != NULL) {
309 - Error *create_job_err = NULL;
310 - error_setg(&create_job_err, "backup_job_create failed: %s",
311 - local_err ? error_get_pretty(local_err) : "null");
312 + di->job = job;
313
314 - pvebackup_propagate_error(create_job_err);
315 + if (!job || local_err) {
316 + error_setg(errp, "backup_job_create failed: %s",
317 + local_err ? error_get_pretty(local_err) : "null");
318 break;
319 }
320
321 - di->job = job;
322 -
323 bdrv_unref(di->target);
324 di->target = NULL;
325 }
326
327 - bool errors = pvebackup_error_or_canceled();
328 -
329 - if (errors) {
330 + if (*errp) {
331 l = backup_state.di_list;
332 while (l) {
333 PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data;
334 @@ -483,12 +520,17 @@ static bool create_backup_jobs(void) {
335 }
336
337 if (di->job) {
338 + AioContext *ctx = di->job->job.aio_context;
339 + aio_context_acquire(ctx);
340 + job_cancel_sync(&di->job->job);
341 job_unref(&di->job->job);
342 + aio_context_release(ctx);
343 }
344 }
345 }
346
347 - return errors;
348 + /* return */
349 + aio_co_enter(data->ctx, data->co);
350 }
351
352 typedef struct QmpBackupTask {
353 @@ -525,11 +567,12 @@ typedef struct QmpBackupTask {
354 UuidInfo *result;
355 } QmpBackupTask;
356
357 -// assumes the caller holds backup_mutex
358 static void coroutine_fn pvebackup_co_prepare(void *opaque)
359 {
360 assert(qemu_in_coroutine());
361
362 + qemu_co_mutex_lock(&backup_state.backup_mutex);
363 +
364 QmpBackupTask *task = opaque;
365
366 task->result = NULL; // just to be sure
367 @@ -550,8 +593,9 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
368 const char *firewall_name = "qemu-server.fw";
369
370 if (backup_state.di_list) {
371 - error_set(task->errp, ERROR_CLASS_GENERIC_ERROR,
372 + error_set(task->errp, ERROR_CLASS_GENERIC_ERROR,
373 "previous backup not finished");
374 + qemu_co_mutex_unlock(&backup_state.backup_mutex);
375 return;
376 }
377
378 @@ -618,6 +662,8 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
379 }
380 di->size = size;
381 total += size;
382 +
383 + di->completed_ret = INT_MAX;
384 }
385
386 uuid_generate(uuid);
387 @@ -849,6 +895,8 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
388 backup_state.stat.dirty = total - backup_state.stat.reused;
389 backup_state.stat.transferred = 0;
390 backup_state.stat.zero_bytes = 0;
391 + backup_state.stat.finishing = false;
392 + backup_state.stat.starting = true;
393
394 qemu_mutex_unlock(&backup_state.stat.lock);
395
396 @@ -863,6 +911,33 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
397 uuid_info->UUID = uuid_str;
398
399 task->result = uuid_info;
400 +
401 + /* Run create_backup_jobs_bh outside of coroutine (in BH) but keep
402 + * backup_mutex locked. This is fine, a CoMutex can be held across yield
403 + * points, and we'll release it as soon as the BH reschedules us.
404 + */
405 + CoCtxData waker = {
406 + .co = qemu_coroutine_self(),
407 + .ctx = qemu_get_current_aio_context(),
408 + .data = &local_err,
409 + };
410 + aio_bh_schedule_oneshot(waker.ctx, create_backup_jobs_bh, &waker);
411 + qemu_coroutine_yield();
412 +
413 + if (local_err) {
414 + error_propagate(task->errp, local_err);
415 + goto err;
416 + }
417 +
418 + qemu_co_mutex_unlock(&backup_state.backup_mutex);
419 +
420 + qemu_mutex_lock(&backup_state.stat.lock);
421 + backup_state.stat.starting = false;
422 + qemu_mutex_unlock(&backup_state.stat.lock);
423 +
424 + /* start the first job in the transaction */
425 + job_txn_start_seq(backup_state.txn);
426 +
427 return;
428
429 err_mutex:
430 @@ -885,6 +960,7 @@ err:
431 g_free(di);
432 }
433 g_list_free(di_list);
434 + backup_state.di_list = NULL;
435
436 if (devs) {
437 g_strfreev(devs);
438 @@ -905,6 +981,8 @@ err:
439 }
440
441 task->result = NULL;
442 +
443 + qemu_co_mutex_unlock(&backup_state.backup_mutex);
444 return;
445 }
446
447 @@ -958,24 +1036,8 @@ UuidInfo *qmp_backup(
448 .errp = errp,
449 };
450
451 - qemu_mutex_lock(&backup_state.backup_mutex);
452 -
453 block_on_coroutine_fn(pvebackup_co_prepare, &task);
454
455 - if (*errp == NULL) {
456 - bool errors = create_backup_jobs();
457 - qemu_mutex_unlock(&backup_state.backup_mutex);
458 -
459 - if (!errors) {
460 - /* start the first job in the transaction
461 - * note: this might directly enter the job, so we need to do this
462 - * after unlocking the backup_mutex */
463 - job_txn_start_seq(backup_state.txn);
464 - }
465 - } else {
466 - qemu_mutex_unlock(&backup_state.backup_mutex);
467 - }
468 -
469 return task.result;
470 }
471
472 @@ -1027,6 +1089,7 @@ BackupStatus *qmp_query_backup(Error **errp)
473 info->transferred = backup_state.stat.transferred;
474 info->has_reused = true;
475 info->reused = backup_state.stat.reused;
476 + info->finishing = backup_state.stat.finishing;
477
478 qemu_mutex_unlock(&backup_state.stat.lock);
479
480 diff --git a/qapi/block-core.json b/qapi/block-core.json
481 index 170c13984d..a0d1d278e9 100644
482 --- a/qapi/block-core.json
483 +++ b/qapi/block-core.json
484 @@ -729,12 +729,15 @@
485 #
486 # @uuid: uuid for this backup job
487 #
488 +# @finishing: if status='active' and finishing=true, then the backup process is
489 +# waiting for the target to finish.
490 +#
491 ##
492 { 'struct': 'BackupStatus',
493 'data': {'*status': 'str', '*errmsg': 'str', '*total': 'int', '*dirty': 'int',
494 '*transferred': 'int', '*zero-bytes': 'int', '*reused': 'int',
495 '*start-time': 'int', '*end-time': 'int',
496 - '*backup-file': 'str', '*uuid': 'str' } }
497 + '*backup-file': 'str', '*uuid': 'str', 'finishing': 'bool' } }
498
499 ##
500 # @BackupFormat: