]>
Commit | Line | Data |
---|---|---|
7f4326d1 WB |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: Fabian Ebner <f.ebner@proxmox.com> | |
3 | Date: Wed, 25 May 2022 13:59:39 +0200 | |
4 | Subject: [PATCH] PVE-Backup: avoid segfault issues upon backup-cancel | |
5 | ||
6 | When canceling a backup in PVE via a signal it's easy to run into a | |
7 | situation where the job is already failing when the backup_cancel QMP | |
8 | command comes in. With a bit of unlucky timing on top, it can happen | |
9 | that job_exit() runs between scheduling of job_cancel_bh() and | |
10 | execution of job_cancel_bh(). But job_cancel_sync() does not expect | |
11 | that the job is already finalized (in fact, the job might've been | |
12 | freed already, but even if it isn't, job_cancel_sync() would try to | |
13 | deref job->txn which would be NULL at that point). | |
14 | ||
15 | It is not possible to simply use the job_cancel() (which is advertised | |
16 | as being async but isn't in all cases) in qmp_backup_cancel() for the | |
17 | same reason job_cancel_sync() cannot be used. Namely, because it can | |
18 | invoke job_finish_sync() (which uses AIO_WAIT_WHILE and thus hangs if | |
19 | called from a coroutine). This happens when there's multiple jobs in | |
20 | the transaction and job->deferred_to_main_loop is true (is set before | |
21 | scheduling job_exit()) or if the job was not started yet. | |
22 | ||
23 | Fix the issue by selecting the job to cancel in job_cancel_bh() itself | |
24 | using the first job that's not completed yet. This is not necessarily | |
25 | the first job in the list, because pvebackup_co_complete_stream() | |
26 | might not yet have removed a completed job when job_cancel_bh() runs. | |
27 | ||
28 | An alternative would be to continue using only the first job and | |
29 | checking against JOB_STATUS_CONCLUDED or JOB_STATUS_NULL to decide if | |
30 | it's still necessary and possible to cancel, but the approach with | |
31 | using the first non-completed job seemed more robust. | |
32 | ||
33 | Signed-off-by: Fabian Ebner <f.ebner@proxmox.com> | |
34 | Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com> | |
35 | --- | |
36 | pve-backup.c | 61 +++++++++++++++++++++++++++++++++------------------- | |
37 | 1 file changed, 39 insertions(+), 22 deletions(-) | |
38 | ||
39 | diff --git a/pve-backup.c b/pve-backup.c | |
4fd0fa7f | 40 | index 0c34428713..2e22030eec 100644 |
7f4326d1 WB |
41 | --- a/pve-backup.c |
42 | +++ b/pve-backup.c | |
4fd0fa7f | 43 | @@ -355,15 +355,42 @@ static void pvebackup_complete_cb(void *opaque, int ret) |
7f4326d1 WB |
44 | |
45 | /* | |
46 | * job_cancel(_sync) does not like to be called from coroutines, so defer to | |
47 | - * main loop processing via a bottom half. | |
48 | + * main loop processing via a bottom half. Assumes that caller holds | |
49 | + * backup_mutex. | |
50 | */ | |
51 | static void job_cancel_bh(void *opaque) { | |
52 | CoCtxData *data = (CoCtxData*)opaque; | |
53 | - Job *job = (Job*)data->data; | |
54 | - AioContext *job_ctx = job->aio_context; | |
55 | - aio_context_acquire(job_ctx); | |
56 | - job_cancel_sync(job, true); | |
57 | - aio_context_release(job_ctx); | |
58 | + | |
59 | + /* | |
60 | + * Be careful to pick a valid job to cancel: | |
61 | + * 1. job_cancel_sync() does not expect the job to be finalized already. | |
62 | + * 2. job_exit() might run between scheduling and running job_cancel_bh() | |
63 | + * and pvebackup_co_complete_stream() might not have removed the job from | |
64 | + * the list yet (in fact, cannot, because it waits for the backup_mutex). | |
65 | + * Requiring !job_is_completed() ensures that no finalized job is picked. | |
66 | + */ | |
67 | + GList *bdi = g_list_first(backup_state.di_list); | |
68 | + while (bdi) { | |
69 | + if (bdi->data) { | |
70 | + BlockJob *bj = ((PVEBackupDevInfo *)bdi->data)->job; | |
71 | + if (bj) { | |
72 | + Job *job = &bj->job; | |
73 | + if (!job_is_completed(job)) { | |
74 | + AioContext *job_ctx = job->aio_context; | |
75 | + aio_context_acquire(job_ctx); | |
76 | + job_cancel_sync(job, true); | |
77 | + aio_context_release(job_ctx); | |
78 | + /* | |
79 | + * It's enough to cancel one job in the transaction, the | |
80 | + * rest will follow automatically. | |
81 | + */ | |
82 | + break; | |
83 | + } | |
84 | + } | |
85 | + } | |
86 | + bdi = g_list_next(bdi); | |
87 | + } | |
88 | + | |
89 | aio_co_enter(data->ctx, data->co); | |
90 | } | |
91 | ||
4fd0fa7f | 92 | @@ -384,22 +411,12 @@ void coroutine_fn qmp_backup_cancel(Error **errp) |
7f4326d1 WB |
93 | proxmox_backup_abort(backup_state.pbs, "backup canceled"); |
94 | } | |
95 | ||
96 | - /* it's enough to cancel one job in the transaction, the rest will follow | |
97 | - * automatically */ | |
98 | - GList *bdi = g_list_first(backup_state.di_list); | |
99 | - BlockJob *cancel_job = bdi && bdi->data ? | |
100 | - ((PVEBackupDevInfo *)bdi->data)->job : | |
101 | - NULL; | |
102 | - | |
103 | - if (cancel_job) { | |
104 | - CoCtxData data = { | |
105 | - .ctx = qemu_get_current_aio_context(), | |
106 | - .co = qemu_coroutine_self(), | |
107 | - .data = &cancel_job->job, | |
108 | - }; | |
109 | - aio_bh_schedule_oneshot(data.ctx, job_cancel_bh, &data); | |
110 | - qemu_coroutine_yield(); | |
111 | - } | |
112 | + CoCtxData data = { | |
113 | + .ctx = qemu_get_current_aio_context(), | |
114 | + .co = qemu_coroutine_self(), | |
115 | + }; | |
116 | + aio_bh_schedule_oneshot(data.ctx, job_cancel_bh, &data); | |
117 | + qemu_coroutine_yield(); | |
118 | ||
119 | qemu_co_mutex_unlock(&backup_state.backup_mutex); | |
120 | } |