From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Fabian Ebner <f.ebner@proxmox.com>
Date: Wed, 25 May 2022 13:59:39 +0200
Subject: [PATCH] PVE-Backup: avoid segfault issues upon backup-cancel

When canceling a backup in PVE via a signal, it's easy to run into a
situation where the job is already failing when the backup_cancel QMP
command comes in. With a bit of unlucky timing on top, it can happen
that job_exit() runs between the scheduling of job_cancel_bh() and its
execution. But job_cancel_sync() does not expect that the job is
already finalized (in fact, the job might've been freed already, but
even if it isn't, job_cancel_sync() would try to deref job->txn, which
would be NULL at that point).

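Roughly, the problematic interleaving looks like this (illustrative
sketch only; the exact call chains differ between QEMU versions):

    coroutine (qmp_backup_cancel)     main loop
    -----------------------------     ---------
    aio_bh_schedule_oneshot(ctx,
        job_cancel_bh, &data);
    qemu_coroutine_yield();
                                      job_exit()
                                        -> job finalized (or even freed),
                                           job->txn now NULL
                                      job_cancel_bh()
                                        -> job_cancel_sync(job, true)
                                           derefs job->txn -> segfault
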
It is not possible to simply use job_cancel() (which is advertised as
being async but isn't in all cases) in qmp_backup_cancel() for the
same reason job_cancel_sync() cannot be used: it can invoke
job_finish_sync() (which uses AIO_WAIT_WHILE and thus hangs if called
from a coroutine). This happens when there are multiple jobs in the
transaction and job->deferred_to_main_loop is true (which is set
before scheduling job_exit()) or if the job was not started yet.

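The bottom-half deferral pattern the fix builds on looks roughly like
this (minimal sketch; my_cancel_bh is a placeholder name, CoCtxData is
the small helper struct from pve-backup.c that carries the AioContext
and the coroutine to resume):

    static void my_cancel_bh(void *opaque)
    {
        CoCtxData *data = (CoCtxData*)opaque;
        /* ... work that must not run in coroutine context ... */
        aio_co_enter(data->ctx, data->co); /* wake the yielded coroutine */
    }

    /* in the coroutine: */
    CoCtxData data = {
        .ctx = qemu_get_current_aio_context(),
        .co = qemu_coroutine_self(),
    };
    aio_bh_schedule_oneshot(data.ctx, my_cancel_bh, &data);
    qemu_coroutine_yield(); /* resumed by my_cancel_bh() */
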
Fix the issue by selecting the job to cancel in job_cancel_bh() itself,
using the first job that's not completed yet. This is not necessarily
the first job in the list, because pvebackup_co_complete_stream()
might not yet have removed a completed job when job_cancel_bh() runs.

An alternative would be to continue using only the first job and
checking against JOB_STATUS_CONCLUDED or JOB_STATUS_NULL to decide if
it's still necessary and possible to cancel, but the approach with
using the first non-completed job seemed more robust.

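For reference, that rejected alternative would have amounted to a
status check on the first job only, roughly (hypothetical sketch, not
part of the patch):

    WITH_JOB_LOCK_GUARD() {
        JobStatus status = job->status;
        /* CONCLUDED/NULL means already finalized, nothing to cancel. */
        if (status != JOB_STATUS_CONCLUDED && status != JOB_STATUS_NULL) {
            job_cancel_sync_locked(job, true);
        }
    }
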
Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
[FE: adapt for new job lock mechanism replacing AioContext locks]
Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
---
 pve-backup.c | 57 ++++++++++++++++++++++++++++++++++------------------
 1 file changed, 38 insertions(+), 19 deletions(-)

diff --git a/pve-backup.c b/pve-backup.c
index 0cf30e1ced..4067018dbe 100644
--- a/pve-backup.c
+++ b/pve-backup.c
@@ -354,12 +354,41 @@ static void pvebackup_complete_cb(void *opaque, int ret)
 
 /*
  * job_cancel(_sync) does not like to be called from coroutines, so defer to
- * main loop processing via a bottom half.
+ * main loop processing via a bottom half. Assumes that caller holds
+ * backup_mutex.
  */
 static void job_cancel_bh(void *opaque) {
     CoCtxData *data = (CoCtxData*)opaque;
-    Job *job = (Job*)data->data;
-    job_cancel_sync(job, true);
+
+    /*
+     * Be careful to pick a valid job to cancel:
+     * 1. job_cancel_sync() does not expect the job to be finalized already.
+     * 2. job_exit() might run between scheduling and running job_cancel_bh()
+     * and pvebackup_co_complete_stream() might not have removed the job from
+     * the list yet (in fact, cannot, because it waits for the backup_mutex).
+     * Requiring !job_is_completed() ensures that no finalized job is picked.
+     */
+    GList *bdi = g_list_first(backup_state.di_list);
+    while (bdi) {
+        if (bdi->data) {
+            BlockJob *bj = ((PVEBackupDevInfo *)bdi->data)->job;
+            if (bj) {
+                Job *job = &bj->job;
+                WITH_JOB_LOCK_GUARD() {
+                    if (!job_is_completed_locked(job)) {
+                        job_cancel_sync_locked(job, true);
+                        /*
+                         * It's enough to cancel one job in the transaction, the
+                         * rest will follow automatically.
+                         */
+                        break;
+                    }
+                }
+            }
+        }
+        bdi = g_list_next(bdi);
+    }
+
     aio_co_enter(data->ctx, data->co);
 }
 
@@ -380,22 +409,12 @@ void coroutine_fn qmp_backup_cancel(Error **errp)
         proxmox_backup_abort(backup_state.pbs, "backup canceled");
     }
 
-    /* it's enough to cancel one job in the transaction, the rest will follow
-     * automatically */
-    GList *bdi = g_list_first(backup_state.di_list);
-    BlockJob *cancel_job = bdi && bdi->data ?
-        ((PVEBackupDevInfo *)bdi->data)->job :
-        NULL;
-
-    if (cancel_job) {
-        CoCtxData data = {
-            .ctx = qemu_get_current_aio_context(),
-            .co = qemu_coroutine_self(),
-            .data = &cancel_job->job,
-        };
-        aio_bh_schedule_oneshot(data.ctx, job_cancel_bh, &data);
-        qemu_coroutine_yield();
-    }
+    CoCtxData data = {
+        .ctx = qemu_get_current_aio_context(),
+        .co = qemu_coroutine_self(),
+    };
+    aio_bh_schedule_oneshot(data.ctx, job_cancel_bh, &data);
+    qemu_coroutine_yield();
 
     qemu_co_mutex_unlock(&backup_state.backup_mutex);
 }