]>
Commit | Line | Data |
---|---|---|
72ae34ec SR |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: Stefan Reiter <s.reiter@proxmox.com> | |
3 | Date: Thu, 22 Oct 2020 17:01:07 +0200 | |
4 | Subject: [PATCH] PVE: fix and clean up error handling for create_backup_jobs | |
5 | ||
6 | No more weird bool returns, just the standard "errp" format used | |
7 | everywhere else too. With this, if backup_job_create fails, the error | |
8 | message is actually returned over QMP and can be shown to the user. | |
9 | ||
10 | To facilitate correct cleanup on such an error, we call | |
11 | create_backup_jobs as a bottom half directly from pvebackup_co_prepare. | |
12 | This additionally allows us to actually hold the backup_mutex during | |
13 | operation. | |
14 | ||
15 | Also add a job_cancel_sync before job_unref, since a job must be in | |
16 | STATUS_NULL to be deleted by unref, which could trigger an assert | |
17 | before. | |
18 | ||
19 | Signed-off-by: Stefan Reiter <s.reiter@proxmox.com> | |
20 | --- | |
21 | pve-backup.c | 79 +++++++++++++++++++++++++++++++++++----------------- | |
22 | 1 file changed, 54 insertions(+), 25 deletions(-) | |
23 | ||
24 | diff --git a/pve-backup.c b/pve-backup.c | |
25 | index 0466145bec..1a2647e7a5 100644 | |
26 | --- a/pve-backup.c | |
27 | +++ b/pve-backup.c | |
28 | @@ -50,6 +50,7 @@ static struct PVEBackupState { | |
29 | size_t zero_bytes; | |
30 | GList *bitmap_list; | |
31 | bool finishing; | |
32 | + bool starting; | |
33 | } stat; | |
34 | int64_t speed; | |
35 | VmaWriter *vmaw; | |
36 | @@ -277,6 +278,16 @@ static void coroutine_fn pvebackup_co_complete_stream(void *opaque) | |
37 | PVEBackupDevInfo *di = opaque; | |
38 | int ret = di->completed_ret; | |
39 | ||
40 | + qemu_mutex_lock(&backup_state.stat.lock); | |
41 | + bool starting = backup_state.stat.starting; | |
42 | + qemu_mutex_unlock(&backup_state.stat.lock); | |
43 | + if (starting) { | |
44 | + /* in 'starting' state, no tasks have been run yet, meaning we can (and | |
45 | + * must) skip all cleanup, as we don't know what has and hasn't been | |
46 | + * initialized yet. */ | |
47 | + return; | |
48 | + } | |
49 | + | |
50 | qemu_co_mutex_lock(&backup_state.backup_mutex); | |
51 | ||
52 | if (ret < 0) { | |
53 | @@ -441,15 +452,15 @@ static int coroutine_fn pvebackup_co_add_config( | |
54 | /* | |
55 | * backup_job_create can *not* be run from a coroutine (and requires an | |
56 | * acquired AioContext), so this can't either. | |
57 | - * This does imply that this function cannot run with backup_mutex acquired. | |
58 | - * That is ok because it is only ever called between setting up the backup_state | |
59 | - * struct and starting the jobs, and from within a QMP call. This means that no | |
60 | - * other QMP call can interrupt, and no background job is running yet. | |
61 | + * The caller is responsible that backup_mutex is held nonetheless. | |
62 | */ | |
63 | -static bool create_backup_jobs(void) { | |
64 | +static void create_backup_jobs_bh(void *opaque) { | |
65 | ||
66 | assert(!qemu_in_coroutine()); | |
67 | ||
68 | + CoCtxData *data = (CoCtxData*)opaque; | |
69 | + Error **errp = (Error**)data->data; | |
70 | + | |
71 | Error *local_err = NULL; | |
72 | ||
73 | /* create job transaction to synchronize bitmap commit and cancel all | |
74 | @@ -483,24 +494,19 @@ static bool create_backup_jobs(void) { | |
75 | ||
76 | aio_context_release(aio_context); | |
77 | ||
78 | - if (!job || local_err != NULL) { | |
79 | - Error *create_job_err = NULL; | |
80 | - error_setg(&create_job_err, "backup_job_create failed: %s", | |
81 | - local_err ? error_get_pretty(local_err) : "null"); | |
82 | + di->job = job; | |
83 | ||
84 | - pvebackup_propagate_error(create_job_err); | |
85 | + if (!job || local_err) { | |
86 | + error_setg(errp, "backup_job_create failed: %s", | |
87 | + local_err ? error_get_pretty(local_err) : "null"); | |
88 | break; | |
89 | } | |
90 | ||
91 | - di->job = job; | |
92 | - | |
93 | bdrv_unref(di->target); | |
94 | di->target = NULL; | |
95 | } | |
96 | ||
97 | - bool errors = pvebackup_error_or_canceled(); | |
98 | - | |
99 | - if (errors) { | |
100 | + if (*errp) { | |
101 | l = backup_state.di_list; | |
102 | while (l) { | |
103 | PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data; | |
104 | @@ -512,12 +518,17 @@ static bool create_backup_jobs(void) { | |
105 | } | |
106 | ||
107 | if (di->job) { | |
108 | + AioContext *ctx = di->job->job.aio_context; | |
109 | + aio_context_acquire(ctx); | |
110 | + job_cancel_sync(&di->job->job); | |
111 | job_unref(&di->job->job); | |
112 | + aio_context_release(ctx); | |
113 | } | |
114 | } | |
115 | } | |
116 | ||
117 | - return errors; | |
118 | + /* return */ | |
119 | + aio_co_enter(data->ctx, data->co); | |
120 | } | |
121 | ||
122 | typedef struct QmpBackupTask { | |
123 | @@ -883,6 +894,7 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque) | |
124 | backup_state.stat.transferred = 0; | |
125 | backup_state.stat.zero_bytes = 0; | |
126 | backup_state.stat.finishing = false; | |
127 | + backup_state.stat.starting = true; | |
128 | ||
129 | qemu_mutex_unlock(&backup_state.stat.lock); | |
130 | ||
131 | @@ -898,7 +910,32 @@ static void coroutine_fn pvebackup_co_prepare(void *opaque) | |
132 | ||
133 | task->result = uuid_info; | |
134 | ||
135 | + /* Run create_backup_jobs_bh outside of coroutine (in BH) but keep | |
136 | + * backup_mutex locked. This is fine, a CoMutex can be held across yield | |
137 | + * points, and we'll release it as soon as the BH reschedules us. | |
138 | + */ | |
139 | + CoCtxData waker = { | |
140 | + .co = qemu_coroutine_self(), | |
141 | + .ctx = qemu_get_current_aio_context(), | |
142 | + .data = &local_err, | |
143 | + }; | |
144 | + aio_bh_schedule_oneshot(waker.ctx, create_backup_jobs_bh, &waker); | |
145 | + qemu_coroutine_yield(); | |
146 | + | |
147 | + if (local_err) { | |
148 | + error_propagate(task->errp, local_err); | |
149 | + goto err; | |
150 | + } | |
151 | + | |
152 | qemu_co_mutex_unlock(&backup_state.backup_mutex); | |
153 | + | |
154 | + qemu_mutex_lock(&backup_state.stat.lock); | |
155 | + backup_state.stat.starting = false; | |
156 | + qemu_mutex_unlock(&backup_state.stat.lock); | |
157 | + | |
158 | + /* start the first job in the transaction */ | |
159 | + job_txn_start_seq(backup_state.txn); | |
160 | + | |
161 | return; | |
162 | ||
163 | err_mutex: | |
164 | @@ -921,6 +958,7 @@ err: | |
165 | g_free(di); | |
166 | } | |
167 | g_list_free(di_list); | |
168 | + backup_state.di_list = NULL; | |
169 | ||
170 | if (devs) { | |
171 | g_strfreev(devs); | |
172 | @@ -998,15 +1036,6 @@ UuidInfo *qmp_backup( | |
173 | ||
174 | block_on_coroutine_fn(pvebackup_co_prepare, &task); | |
175 | ||
176 | - if (*errp == NULL) { | |
177 | - bool errors = create_backup_jobs(); | |
178 | - | |
179 | - if (!errors) { | |
180 | - // start the first job in the transaction | |
181 | - job_txn_start_seq(backup_state.txn); | |
182 | - } | |
183 | - } | |
184 | - | |
185 | return task.result; | |
186 | } | |
187 |