From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Stefan Reiter <s.reiter@proxmox.com>
Date: Mon, 28 Sep 2020 13:40:51 +0200
Subject: [PATCH] PVE-Backup: Don't block on finishing and cleanup
 create_backup_jobs

proxmox_backup_co_finish is already async, but previously we would wait
for the coroutine using block_on_coroutine_fn(). Avoid this by
scheduling pvebackup_co_complete_stream (and thus pvebackup_co_cleanup)
as a real coroutine when calling from pvebackup_complete_cb. This is ok,
since complete_stream uses the backup_mutex internally to synchronize,
and other streams can happily continue writing in the meantime anyway.
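
For reference, the general shape of the pattern, as a minimal sketch
(example_complete_cb is a stand-in name; the real hook is
pvebackup_complete_cb in the diff below):

    /* Sketch: from a non-coroutine completion callback, create the
     * cleanup coroutine and enter it in the main AioContext; we return
     * immediately instead of blocking on the cleanup work. */
    static void example_complete_cb(void *opaque, int ret)
    {
        ((PVEBackupDevInfo *)opaque)->completed_ret = ret;
        Coroutine *co = qemu_coroutine_create(pvebackup_co_complete_stream, opaque);
        aio_co_enter(qemu_get_aio_context(), co);
    }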

To accommodate this, backup_mutex is converted to a CoMutex. This means
converting every user to a coroutine. This is not just useful here, but
will come in handy once this series[0] is merged, and QMP calls can be
yield-able coroutines too. Then we can also finally get rid of
block_on_coroutine_fn.
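
A CoMutex blocks only the current coroutine (by yielding) instead of the
whole thread, which is why every lock site must itself be coroutine
context. A minimal sketch of the converted locking pattern
(touch_backup_state is a made-up name):

    static CoMutex backup_mutex; /* init once: qemu_co_mutex_init(&backup_mutex) */

    static void coroutine_fn touch_backup_state(void)
    {
        qemu_co_mutex_lock(&backup_mutex); /* yields instead of blocking */
        /* ... access shared backup state ... */
        qemu_co_mutex_unlock(&backup_mutex);
    }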

Cases of aio_context_acquire/release from within what is now a coroutine
are changed to aio_co_reschedule_self, which works since a running
coroutine always holds the aio lock for the context it is running in.
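
Sketched below, with do_work/target_ctx as hypothetical stand-ins; the
point is that being rescheduled onto a context replaces acquiring its
lock:

    /* before: */
    aio_context_acquire(target_ctx);
    do_work();
    aio_context_release(target_ctx);

    /* after, from coroutine context: */
    AioContext *old_ctx = qemu_get_current_aio_context();
    aio_co_reschedule_self(target_ctx); /* now running in target_ctx */
    do_work();
    aio_co_reschedule_self(old_ctx);    /* hop back */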

job_cancel_sync is called from a BH since it can't be run from a
coroutine (it uses AIO_WAIT_WHILE internally).

The same applies to create_backup_jobs, which is converted to a BH too.
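
Both use the same bounce pattern, roughly (some_bh_func and payload are
placeholders; CoCtxData is the helper struct used in the diff below):

    /* coroutine side: schedule the BH, then sleep until it wakes us */
    CoCtxData data = {
        .ctx = qemu_get_current_aio_context(),
        .co = qemu_coroutine_self(),
        .data = payload, /* e.g. the Job to cancel */
    };
    aio_bh_schedule_oneshot(data.ctx, some_bh_func, &data);
    qemu_coroutine_yield();

    /* BH side: do the blocking (AIO_WAIT_WHILE-safe) work, then
     * re-enter the waiting coroutine: aio_co_enter(data->ctx, data->co); */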

To communicate the finishing state, a new property is introduced to
query-backup: 'finishing'. A new state is explicitly not used, since
that would break compatibility with older qemu-server versions.
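
An illustrative QMP exchange (values made up):

    -> { "execute": "query-backup" }
    <- { "return": { "status": "active", "finishing": true, ... } }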

Also fix create_backup_jobs:

No more weird bool returns, just the standard "errp" format used
everywhere else too. With this, if backup_job_create fails, the error
message is actually returned over QMP and can be shown to the user.
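
That is, the usual QEMU errp convention, sketched here with a
hypothetical helper (do_step, something_failed and reason are made up):

    static void do_step(Error **errp)
    {
        if (something_failed) {
            error_setg(errp, "backup_job_create failed: %s", reason);
        }
    }

    /* caller: */
    Error *local_err = NULL;
    do_step(&local_err);
    if (local_err) {
        error_propagate(errp, local_err); /* surfaces over QMP */
        return;
    }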

To facilitate correct cleanup on such an error, we call
create_backup_jobs as a bottom half directly from pvebackup_co_prepare.
This additionally allows us to actually hold the backup_mutex during
operation.

Also add a job_cancel_sync before job_unref, since a job must be in
STATUS_NULL to be deleted by unref; previously this could trigger an
assert.

[0] https://lists.gnu.org/archive/html/qemu-devel/2020-09/msg03515.html

Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
[add new force parameter to job_cancel_sync calls]
Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
---
 pve-backup.c         | 217 ++++++++++++++++++++++++++++---------------
 qapi/block-core.json |   5 +-
 2 files changed, 144 insertions(+), 78 deletions(-)

diff --git a/pve-backup.c b/pve-backup.c
index 63c686463f..6f05796fad 100644
--- a/pve-backup.c
+++ b/pve-backup.c
@@ -33,7 +33,9 @@ const char *PBS_BITMAP_NAME = "pbs-incremental-dirty-bitmap";
 
 static struct PVEBackupState {
     struct {
-        // Everithing accessed from qmp_backup_query command is protected using lock
+        // Everything accessed from qmp_backup_query command is protected using
+        // this lock. Do NOT hold this lock for long times, as it is sometimes
+        // acquired from coroutines, and thus any wait time may block the guest.
         QemuMutex lock;
         Error *error;
         time_t start_time;
@@ -47,20 +49,22 @@ static struct PVEBackupState {
         size_t reused;
         size_t zero_bytes;
         GList *bitmap_list;
+        bool finishing;
+        bool starting;
     } stat;
     int64_t speed;
     VmaWriter *vmaw;
     ProxmoxBackupHandle *pbs;
     GList *di_list;
     JobTxn *txn;
-    QemuMutex backup_mutex;
+    CoMutex backup_mutex;
     CoMutex dump_callback_mutex;
 } backup_state;
 
 static void pvebackup_init(void)
 {
     qemu_mutex_init(&backup_state.stat.lock);
-    qemu_mutex_init(&backup_state.backup_mutex);
+    qemu_co_mutex_init(&backup_state.backup_mutex);
     qemu_co_mutex_init(&backup_state.dump_callback_mutex);
 }
 
@@ -72,6 +76,7 @@ typedef struct PVEBackupDevInfo {
     size_t size;
     uint64_t block_size;
     uint8_t dev_id;
+    int completed_ret; // INT_MAX if not completed
     char targetfile[PATH_MAX];
    BdrvDirtyBitmap *bitmap;
     BlockDriverState *target;
@@ -227,12 +232,12 @@ pvebackup_co_dump_vma_cb(
 }
 
 // assumes the caller holds backup_mutex
-static void coroutine_fn pvebackup_co_cleanup(void *unused)
+static void coroutine_fn pvebackup_co_cleanup(void)
 {
     assert(qemu_in_coroutine());
 
     qemu_mutex_lock(&backup_state.stat.lock);
-    backup_state.stat.end_time = time(NULL);
+    backup_state.stat.finishing = true;
     qemu_mutex_unlock(&backup_state.stat.lock);
 
     if (backup_state.vmaw) {
@@ -261,35 +266,29 @@ static void coroutine_fn pvebackup_co_cleanup(void *unused)
 
     g_list_free(backup_state.di_list);
     backup_state.di_list = NULL;
+
+    qemu_mutex_lock(&backup_state.stat.lock);
+    backup_state.stat.end_time = time(NULL);
+    backup_state.stat.finishing = false;
+    qemu_mutex_unlock(&backup_state.stat.lock);
 }
 
-// assumes the caller holds backup_mutex
-static void coroutine_fn pvebackup_complete_stream(void *opaque)
+static void coroutine_fn pvebackup_co_complete_stream(void *opaque)
 {
     PVEBackupDevInfo *di = opaque;
+    int ret = di->completed_ret;
 
-    bool error_or_canceled = pvebackup_error_or_canceled();
-
-    if (backup_state.vmaw) {
-        vma_writer_close_stream(backup_state.vmaw, di->dev_id);
+    qemu_mutex_lock(&backup_state.stat.lock);
+    bool starting = backup_state.stat.starting;
+    qemu_mutex_unlock(&backup_state.stat.lock);
+    if (starting) {
+        /* in 'starting' state, no tasks have been run yet, meaning we can (and
+         * must) skip all cleanup, as we don't know what has and hasn't been
+         * initialized yet. */
+        return;
     }
 
-    if (backup_state.pbs && !error_or_canceled) {
-        Error *local_err = NULL;
-        proxmox_backup_co_close_image(backup_state.pbs, di->dev_id, &local_err);
-        if (local_err != NULL) {
-            pvebackup_propagate_error(local_err);
-        }
-    }
-}
-
-static void pvebackup_complete_cb(void *opaque, int ret)
-{
-    assert(!qemu_in_coroutine());
-
-    PVEBackupDevInfo *di = opaque;
-
-    qemu_mutex_lock(&backup_state.backup_mutex);
+    qemu_co_mutex_lock(&backup_state.backup_mutex);
 
     if (ret < 0) {
         Error *local_err = NULL;
@@ -301,7 +300,19 @@ static void pvebackup_complete_cb(void *opaque, int ret)
 
     assert(di->target == NULL);
 
-    block_on_coroutine_fn(pvebackup_complete_stream, di);
+    bool error_or_canceled = pvebackup_error_or_canceled();
+
+    if (backup_state.vmaw) {
+        vma_writer_close_stream(backup_state.vmaw, di->dev_id);
+    }
+
+    if (backup_state.pbs && !error_or_canceled) {
+        Error *local_err = NULL;
+        proxmox_backup_co_close_image(backup_state.pbs, di->dev_id, &local_err);
+        if (local_err != NULL) {
+            pvebackup_propagate_error(local_err);
+        }
+    }
 
     // remove self from job list
     backup_state.di_list = g_list_remove(backup_state.di_list, di);
@@ -310,21 +321,49 @@ static void pvebackup_complete_cb(void *opaque, int ret)
 
     /* call cleanup if we're the last job */
     if (!g_list_first(backup_state.di_list)) {
-        block_on_coroutine_fn(pvebackup_co_cleanup, NULL);
+        pvebackup_co_cleanup();
     }
 
-    qemu_mutex_unlock(&backup_state.backup_mutex);
+    qemu_co_mutex_unlock(&backup_state.backup_mutex);
 }
 
-static void pvebackup_cancel(void)
+static void pvebackup_complete_cb(void *opaque, int ret)
 {
-    assert(!qemu_in_coroutine());
+    PVEBackupDevInfo *di = opaque;
+    di->completed_ret = ret;
+
+    /*
+     * Schedule stream cleanup in async coroutine. close_image and finish might
+     * take a while, so we can't block on them here. This way it also doesn't
+     * matter if we're already running in a coroutine or not.
+     * Note: di is a pointer to an entry in the global backup_state struct, so
+     * it stays valid.
+     */
+    Coroutine *co = qemu_coroutine_create(pvebackup_co_complete_stream, di);
+    aio_co_enter(qemu_get_aio_context(), co);
+}
+
+/*
+ * job_cancel(_sync) does not like to be called from coroutines, so defer to
+ * main loop processing via a bottom half.
+ */
+static void job_cancel_bh(void *opaque) {
+    CoCtxData *data = (CoCtxData*)opaque;
+    Job *job = (Job*)data->data;
+    AioContext *job_ctx = job->aio_context;
+    aio_context_acquire(job_ctx);
+    job_cancel_sync(job, true);
+    aio_context_release(job_ctx);
+    aio_co_enter(data->ctx, data->co);
+}
 
+static void coroutine_fn pvebackup_co_cancel(void *opaque)
+{
     Error *cancel_err = NULL;
     error_setg(&cancel_err, "backup canceled");
     pvebackup_propagate_error(cancel_err);
 
-    qemu_mutex_lock(&backup_state.backup_mutex);
+    qemu_co_mutex_lock(&backup_state.backup_mutex);
 
     if (backup_state.vmaw) {
         /* make sure vma writer does not block anymore */
@@ -342,27 +381,22 @@ static void pvebackup_cancel(void)
         ((PVEBackupDevInfo *)bdi->data)->job :
         NULL;
 
-    /* ref the job before releasing the mutex, just to be safe */
     if (cancel_job) {
-        job_ref(&cancel_job->job);
+        CoCtxData data = {
+            .ctx = qemu_get_current_aio_context(),
+            .co = qemu_coroutine_self(),
+            .data = &cancel_job->job,
+        };
+        aio_bh_schedule_oneshot(data.ctx, job_cancel_bh, &data);
+        qemu_coroutine_yield();
     }
 
-    /* job_cancel_sync may enter the job, so we need to release the
-     * backup_mutex to avoid deadlock */
-    qemu_mutex_unlock(&backup_state.backup_mutex);
-
-    if (cancel_job) {
-        AioContext *aio_context = cancel_job->job.aio_context;
-        aio_context_acquire(aio_context);
-        job_cancel_sync(&cancel_job->job, true);
-        job_unref(&cancel_job->job);
-        aio_context_release(aio_context);
-    }
+    qemu_co_mutex_unlock(&backup_state.backup_mutex);
 }
 
 void qmp_backup_cancel(Error **errp)
 {
-    pvebackup_cancel();
+    block_on_coroutine_fn(pvebackup_co_cancel, NULL);
 }
 
 // assumes the caller holds backup_mutex
@@ -415,10 +449,18 @@ static int coroutine_fn pvebackup_co_add_config(
     goto out;
 }
 
-static bool create_backup_jobs(void) {
+/*
+ * backup_job_create can *not* be run from a coroutine (and requires an
+ * acquired AioContext), so this can't either.
+ * The caller is responsible that backup_mutex is held nonetheless.
+ */
+static void create_backup_jobs_bh(void *opaque) {
 
     assert(!qemu_in_coroutine());
 
+    CoCtxData *data = (CoCtxData*)opaque;
+    Error **errp = (Error**)data->data;
+
     Error *local_err = NULL;
 
     /* create job transaction to synchronize bitmap commit and cancel all
@@ -454,24 +496,19 @@ static bool create_backup_jobs(void) {
 
         aio_context_release(aio_context);
 
-        if (!job || local_err != NULL) {
-            Error *create_job_err = NULL;
-            error_setg(&create_job_err, "backup_job_create failed: %s",
-                       local_err ? error_get_pretty(local_err) : "null");
+        di->job = job;
 
-            pvebackup_propagate_error(create_job_err);
+        if (!job || local_err) {
+            error_setg(errp, "backup_job_create failed: %s",
+                       local_err ? error_get_pretty(local_err) : "null");
             break;
         }
 
-        di->job = job;
-
         bdrv_unref(di->target);
         di->target = NULL;
     }
 
-    bool errors = pvebackup_error_or_canceled();
-
-    if (errors) {
+    if (*errp) {
         l = backup_state.di_list;
         while (l) {
             PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data;
@@ -483,12 +520,17 @@ static bool create_backup_jobs(void) {
             }
 
             if (di->job) {
+                AioContext *ctx = di->job->job.aio_context;
+                aio_context_acquire(ctx);
+                job_cancel_sync(&di->job->job, true);
                 job_unref(&di->job->job);
+                aio_context_release(ctx);
             }
         }
     }
 
-    return errors;
+    /* return */
+    aio_co_enter(data->ctx, data->co);
 }
 
 typedef struct QmpBackupTask {
@@ -525,11 +567,12 @@ typedef struct QmpBackupTask {
     UuidInfo *result;
 } QmpBackupTask;
 
-// assumes the caller holds backup_mutex
 static void coroutine_fn pvebackup_co_prepare(void *opaque)
 {
     assert(qemu_in_coroutine());
 
+    qemu_co_mutex_lock(&backup_state.backup_mutex);
+
     QmpBackupTask *task = opaque;
 
     task->result = NULL; // just to be sure
@@ -550,8 +593,9 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
     const char *firewall_name = "qemu-server.fw";
 
     if (backup_state.di_list) {
-        error_set(task->errp, ERROR_CLASS_GENERIC_ERROR,
+        error_set(task->errp, ERROR_CLASS_GENERIC_ERROR,
                   "previous backup not finished");
+        qemu_co_mutex_unlock(&backup_state.backup_mutex);
         return;
     }
 
@@ -618,6 +662,8 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
         }
         di->size = size;
         total += size;
+
+        di->completed_ret = INT_MAX;
     }
 
     uuid_generate(uuid);
@@ -849,6 +895,8 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
     backup_state.stat.dirty = total - backup_state.stat.reused;
     backup_state.stat.transferred = 0;
     backup_state.stat.zero_bytes = 0;
+    backup_state.stat.finishing = false;
+    backup_state.stat.starting = true;
 
     qemu_mutex_unlock(&backup_state.stat.lock);
 
@@ -863,6 +911,33 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque)
     uuid_info->UUID = uuid_str;
 
     task->result = uuid_info;
+
+    /* Run create_backup_jobs_bh outside of coroutine (in BH) but keep
+     * backup_mutex locked. This is fine, a CoMutex can be held across yield
+     * points, and we'll release it as soon as the BH reschedules us.
+     */
+    CoCtxData waker = {
+        .co = qemu_coroutine_self(),
+        .ctx = qemu_get_current_aio_context(),
+        .data = &local_err,
+    };
+    aio_bh_schedule_oneshot(waker.ctx, create_backup_jobs_bh, &waker);
+    qemu_coroutine_yield();
+
+    if (local_err) {
+        error_propagate(task->errp, local_err);
+        goto err;
+    }
+
+    qemu_co_mutex_unlock(&backup_state.backup_mutex);
+
+    qemu_mutex_lock(&backup_state.stat.lock);
+    backup_state.stat.starting = false;
+    qemu_mutex_unlock(&backup_state.stat.lock);
+
+    /* start the first job in the transaction */
+    job_txn_start_seq(backup_state.txn);
+
     return;
 
 err_mutex:
@@ -885,6 +960,7 @@ err:
         g_free(di);
     }
     g_list_free(di_list);
+    backup_state.di_list = NULL;
 
     if (devs) {
         g_strfreev(devs);
@@ -905,6 +981,8 @@ err:
     }
 
     task->result = NULL;
+
+    qemu_co_mutex_unlock(&backup_state.backup_mutex);
     return;
 }
 
@@ -958,24 +1036,8 @@ UuidInfo *qmp_backup(
         .errp = errp,
     };
 
-    qemu_mutex_lock(&backup_state.backup_mutex);
-
     block_on_coroutine_fn(pvebackup_co_prepare, &task);
 
-    if (*errp == NULL) {
-        bool errors = create_backup_jobs();
-        qemu_mutex_unlock(&backup_state.backup_mutex);
-
-        if (!errors) {
-            /* start the first job in the transaction
-             * note: this might directly enter the job, so we need to do this
-             * after unlocking the backup_mutex */
-            job_txn_start_seq(backup_state.txn);
-        }
-    } else {
-        qemu_mutex_unlock(&backup_state.backup_mutex);
-    }
-
     return task.result;
 }
 
@@ -1027,6 +1089,7 @@ BackupStatus *qmp_query_backup(Error **errp)
     info->transferred = backup_state.stat.transferred;
     info->has_reused = true;
     info->reused = backup_state.stat.reused;
+    info->finishing = backup_state.stat.finishing;
 
     qemu_mutex_unlock(&backup_state.stat.lock);
 
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 8833060385..6a67adf923 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -774,12 +774,15 @@
 #
 # @uuid: uuid for this backup job
 #
+# @finishing: if status='active' and finishing=true, then the backup process is
+#             waiting for the target to finish.
+#
 ##
 { 'struct': 'BackupStatus',
   'data': {'*status': 'str', '*errmsg': 'str', '*total': 'int', '*dirty': 'int',
            '*transferred': 'int', '*zero-bytes': 'int', '*reused': 'int',
            '*start-time': 'int', '*end-time': 'int',
-           '*backup-file': 'str', '*uuid': 'str' } }
+           '*backup-file': 'str', '*uuid': 'str', 'finishing': 'bool' } }
 
 ##
 # @BackupFormat: