]>
Commit | Line | Data |
---|---|---|
a010b409 SI |
1 | From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
2 | From: Prakash Surya <prakash.surya@delphix.com> | |
3 | Date: Mon, 8 Jan 2018 13:45:53 -0800 | |
4 | Subject: [PATCH] OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue | |
5 | ||
6 | PROBLEM | |
7 | ======= | |
8 | ||
9 | When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible | |
10 | for either `ERESTART` or `EIO` to be returned. | |
11 | ||
12 | If `ERESTART` is returned, this will cause an assertion to fail directly | |
13 | in `zil_lwb_write_issue`, where the code assumes the return value is | |
14 | `EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the | |
15 | SPA is suspended when `dmu_tx_assign` is called, and most often occurs | |
16 | when running `zloop`. | |
17 | ||
18 | If `EIO` is returned, this can cause assertions to fail elsewhere in the | |
19 | ZIL code. For example, `zil_commit_waiter_timeout` contains the | |
20 | following logic: | |
21 | ||
22 | lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); | |
23 | ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); | |
24 | ||
25 | In this case, if `dmu_tx_assign` returned `EIO` from within | |
26 | `zil_lwb_write_issue`, the `lwb` variable passed in will not be issued | |
27 | to disk. Thus, its `lwb_state` field will remain `LWB_STATE_OPENED` and | |
28 | this assertion will fail. `zil_commit_waiter_timeout` assumes that after | |
29 | it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and | |
30 | doesn't handle the case where this is not true; i.e. it doesn't handle | |
31 | the case where `dmu_tx_assign` returns `EIO`. | |
32 | ||
33 | SOLUTION | |
34 | ======== | |
35 | ||
36 | This change modifies the `dmu_tx_assign` function such that `txg_how` is | |
37 | a bitmask, rather than of the `txg_how_t` enum type. Now, the previous | |
38 | `TXG_WAITED` semantics can be used via `TXG_NOTHROTTLE`, along with | |
39 | specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics. | |
40 | ||
41 | Previously, when `TXG_WAITED` was specified, `TXG_NOWAIT` semantics was | |
42 | automatically invoked. This was not ideal when using `TXG_WAITED` within | |
43 | `zil_lwb_write_issue`, leading to the problem described above. Rather, we | |
44 | want to achieve the semantics of `TXG_WAIT`, while also preventing the | |
45 | `tx` from being penalized via the dirty delay throttling. | |
46 | ||
47 | With this change, `zil_lwb_write_issue` can achieve the semantics that | |
48 | it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to | |
49 | `dmu_tx_assign`. | |
50 | ||
51 | Further, consumers of `dmu_tx_assign` wishing to achieve the old | |
52 | `TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`. | |
53 | ||
54 | Authored by: Prakash Surya <prakash.surya@delphix.com> | |
55 | Approved by: Robert Mustacchi <rm@joyent.com> | |
56 | Reviewed by: Matt Ahrens <mahrens@delphix.com> | |
57 | Reviewed by: Andriy Gapon <avg@FreeBSD.org> | |
58 | Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> | |
59 | ||
60 | Porting Notes: | |
61 | - Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE` | |
62 | ||
63 | OpenZFS-issue: https://www.illumos.org/issues/8997 | |
64 | OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9 | |
65 | Closes #7084 | |
66 | ||
67 | Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com> | |
68 | --- | |
69 | include/sys/dmu.h | 15 +++++++------ | |
70 | include/sys/dmu_tx.h | 8 +++---- | |
71 | module/zfs/dmu_tx.c | 57 ++++++++++++++++++++++++++------------------------ | |
72 | module/zfs/zfs_vnops.c | 21 ++++++++++--------- | |
73 | module/zfs/zil.c | 10 ++++++++- | |
74 | 5 files changed, 63 insertions(+), 48 deletions(-) | |
75 | ||
76 | diff --git a/include/sys/dmu.h b/include/sys/dmu.h | |
77 | index 755a9056..5b355afb 100644 | |
78 | --- a/include/sys/dmu.h | |
79 | +++ b/include/sys/dmu.h | |
80 | @@ -227,11 +227,14 @@ typedef enum dmu_object_type { | |
81 | DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE), | |
82 | } dmu_object_type_t; | |
83 | ||
84 | -typedef enum txg_how { | |
85 | - TXG_WAIT = 1, | |
86 | - TXG_NOWAIT, | |
87 | - TXG_WAITED, | |
88 | -} txg_how_t; | |
89 | +/* | |
90 | + * These flags are intended to be used to specify the "txg_how" | |
91 | + * parameter when calling the dmu_tx_assign() function. See the comment | |
92 | + * above dmu_tx_assign() for more details on the meaning of these flags. | |
93 | + */ | |
94 | +#define TXG_NOWAIT (0ULL) | |
95 | +#define TXG_WAIT (1ULL<<0) | |
96 | +#define TXG_NOTHROTTLE (1ULL<<1) | |
97 | ||
98 | void byteswap_uint64_array(void *buf, size_t size); | |
99 | void byteswap_uint32_array(void *buf, size_t size); | |
100 | @@ -694,7 +697,7 @@ void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); | |
101 | void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); | |
102 | void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); | |
103 | void dmu_tx_abort(dmu_tx_t *tx); | |
104 | -int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how); | |
105 | +int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); | |
106 | void dmu_tx_wait(dmu_tx_t *tx); | |
107 | void dmu_tx_commit(dmu_tx_t *tx); | |
108 | void dmu_tx_mark_netfree(dmu_tx_t *tx); | |
109 | diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h | |
110 | index d82a7931..74b7e111 100644 | |
111 | --- a/include/sys/dmu_tx.h | |
112 | +++ b/include/sys/dmu_tx.h | |
113 | @@ -67,9 +67,6 @@ struct dmu_tx { | |
114 | /* placeholder for syncing context, doesn't need specific holds */ | |
115 | boolean_t tx_anyobj; | |
116 | ||
117 | - /* has this transaction already been delayed? */ | |
118 | - boolean_t tx_waited; | |
119 | - | |
120 | /* transaction is marked as being a "net free" of space */ | |
121 | boolean_t tx_netfree; | |
122 | ||
123 | @@ -79,6 +76,9 @@ struct dmu_tx { | |
124 | /* need to wait for sufficient dirty space */ | |
125 | boolean_t tx_wait_dirty; | |
126 | ||
127 | + /* has this transaction already been delayed? */ | |
128 | + boolean_t tx_dirty_delayed; | |
129 | + | |
130 | int tx_err; | |
131 | }; | |
132 | ||
133 | @@ -138,7 +138,7 @@ extern dmu_tx_stats_t dmu_tx_stats; | |
134 | * These routines are defined in dmu.h, and are called by the user. | |
135 | */ | |
136 | dmu_tx_t *dmu_tx_create(objset_t *dd); | |
137 | -int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how); | |
138 | +int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); | |
139 | void dmu_tx_commit(dmu_tx_t *tx); | |
140 | void dmu_tx_abort(dmu_tx_t *tx); | |
141 | uint64_t dmu_tx_get_txg(dmu_tx_t *tx); | |
142 | diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c | |
143 | index c3cc03a6..6ebff267 100644 | |
144 | --- a/module/zfs/dmu_tx.c | |
145 | +++ b/module/zfs/dmu_tx.c | |
146 | @@ -854,7 +854,7 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) | |
147 | * decreasing performance. | |
148 | */ | |
149 | static int | |
150 | -dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) | |
151 | +dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) | |
152 | { | |
153 | spa_t *spa = tx->tx_pool->dp_spa; | |
154 | ||
155 | @@ -878,13 +878,13 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) | |
156 | * of the failuremode setting. | |
157 | */ | |
158 | if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && | |
159 | - txg_how != TXG_WAIT) | |
160 | + !(txg_how & TXG_WAIT)) | |
161 | return (SET_ERROR(EIO)); | |
162 | ||
163 | return (SET_ERROR(ERESTART)); | |
164 | } | |
165 | ||
166 | - if (!tx->tx_waited && | |
167 | + if (!tx->tx_dirty_delayed && | |
168 | dsl_pool_need_dirty_delay(tx->tx_pool)) { | |
169 | tx->tx_wait_dirty = B_TRUE; | |
170 | DMU_TX_STAT_BUMP(dmu_tx_dirty_delay); | |
171 | @@ -976,41 +976,44 @@ dmu_tx_unassign(dmu_tx_t *tx) | |
172 | } | |
173 | ||
174 | /* | |
175 | - * Assign tx to a transaction group. txg_how can be one of: | |
176 | + * Assign tx to a transaction group; txg_how is a bitmask: | |
177 | * | |
178 | - * (1) TXG_WAIT. If the current open txg is full, waits until there's | |
179 | - * a new one. This should be used when you're not holding locks. | |
180 | - * It will only fail if we're truly out of space (or over quota). | |
181 | + * If TXG_WAIT is set and the currently open txg is full, this function | |
182 | + * will wait until there's a new txg. This should be used when no locks | |
183 | + * are being held. With this bit set, this function will only fail if | |
184 | + * we're truly out of space (or over quota). | |
185 | * | |
186 | - * (2) TXG_NOWAIT. If we can't assign into the current open txg without | |
187 | - * blocking, returns immediately with ERESTART. This should be used | |
188 | - * whenever you're holding locks. On an ERESTART error, the caller | |
189 | - * should drop locks, do a dmu_tx_wait(tx), and try again. | |
190 | + * If TXG_WAIT is *not* set and we can't assign into the currently open | |
191 | + * txg without blocking, this function will return immediately with | |
192 | + * ERESTART. This should be used whenever locks are being held. On an | |
193 | + * ERESTART error, the caller should drop all locks, call dmu_tx_wait(), | |
194 | + * and try again. | |
195 | * | |
196 | - * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait() | |
197 | - * has already been called on behalf of this operation (though | |
198 | - * most likely on a different tx). | |
199 | + * If TXG_NOTHROTTLE is set, this indicates that this tx should not be | |
200 | + * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for | |
201 | + * details on the throttle). This is used by the VFS operations, after | |
202 | + * they have already called dmu_tx_wait() (though most likely on a | |
203 | + * different tx). | |
204 | */ | |
205 | int | |
206 | -dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) | |
207 | +dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) | |
208 | { | |
209 | int err; | |
210 | ||
211 | ASSERT(tx->tx_txg == 0); | |
212 | - ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT || | |
213 | - txg_how == TXG_WAITED); | |
214 | + ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE)); | |
215 | ASSERT(!dsl_pool_sync_context(tx->tx_pool)); | |
216 | ||
217 | - if (txg_how == TXG_WAITED) | |
218 | - tx->tx_waited = B_TRUE; | |
219 | - | |
220 | /* If we might wait, we must not hold the config lock. */ | |
221 | - ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool)); | |
222 | + IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool)); | |
223 | + | |
224 | + if ((txg_how & TXG_NOTHROTTLE)) | |
225 | + tx->tx_dirty_delayed = B_TRUE; | |
226 | ||
227 | while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { | |
228 | dmu_tx_unassign(tx); | |
229 | ||
230 | - if (err != ERESTART || txg_how != TXG_WAIT) | |
231 | + if (err != ERESTART || !(txg_how & TXG_WAIT)) | |
232 | return (err); | |
233 | ||
234 | dmu_tx_wait(tx); | |
235 | @@ -1054,12 +1057,12 @@ dmu_tx_wait(dmu_tx_t *tx) | |
236 | tx->tx_wait_dirty = B_FALSE; | |
237 | ||
238 | /* | |
239 | - * Note: setting tx_waited only has effect if the caller | |
240 | - * used TX_WAIT. Otherwise they are going to destroy | |
241 | - * this tx and try again. The common case, zfs_write(), | |
242 | - * uses TX_WAIT. | |
243 | + * Note: setting tx_dirty_delayed only has effect if the | |
244 | + * caller used TX_WAIT. Otherwise they are going to | |
245 | + * destroy this tx and try again. The common case, | |
246 | + * zfs_write(), uses TX_WAIT. | |
247 | */ | |
248 | - tx->tx_waited = B_TRUE; | |
249 | + tx->tx_dirty_delayed = B_TRUE; | |
250 | } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { | |
251 | /* | |
252 | * If the pool is suspended we need to wait until it | |
253 | diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c | |
254 | index 34ea751c..4805f897 100644 | |
255 | --- a/module/zfs/zfs_vnops.c | |
256 | +++ b/module/zfs/zfs_vnops.c | |
257 | @@ -129,7 +129,7 @@ | |
258 | * | |
259 | * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, | |
260 | * then drop all locks, call dmu_tx_wait(), and try again. On subsequent | |
261 | - * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT, | |
262 | + * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, | |
263 | * to indicate that this operation has already called dmu_tx_wait(). | |
264 | * This will ensure that we don't retry forever, waiting a short bit | |
265 | * each time. | |
266 | @@ -154,7 +154,7 @@ | |
267 | * rw_enter(...); // grab any other locks you need | |
268 | * tx = dmu_tx_create(...); // get DMU tx | |
269 | * dmu_tx_hold_*(); // hold each object you might modify | |
270 | - * error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); | |
271 | + * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); | |
272 | * if (error) { | |
273 | * rw_exit(...); // drop locks | |
274 | * zfs_dirent_unlock(dl); // unlock directory entry | |
275 | @@ -1427,7 +1427,8 @@ top: | |
276 | dmu_tx_hold_write(tx, DMU_NEW_OBJECT, | |
277 | 0, acl_ids.z_aclp->z_acl_bytes); | |
278 | } | |
279 | - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); | |
280 | + error = dmu_tx_assign(tx, | |
281 | + (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); | |
282 | if (error) { | |
283 | zfs_dirent_unlock(dl); | |
284 | if (error == ERESTART) { | |
285 | @@ -1602,7 +1603,7 @@ top: | |
286 | dmu_tx_hold_write(tx, DMU_NEW_OBJECT, | |
287 | 0, acl_ids.z_aclp->z_acl_bytes); | |
288 | } | |
289 | - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); | |
290 | + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); | |
291 | if (error) { | |
292 | if (error == ERESTART) { | |
293 | waited = B_TRUE; | |
294 | @@ -1775,7 +1776,7 @@ top: | |
295 | */ | |
296 | dmu_tx_mark_netfree(tx); | |
297 | ||
298 | - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); | |
299 | + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); | |
300 | if (error) { | |
301 | zfs_dirent_unlock(dl); | |
302 | if (error == ERESTART) { | |
303 | @@ -2017,7 +2018,7 @@ top: | |
304 | dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + | |
305 | ZFS_SA_BASE_ATTR_SIZE); | |
306 | ||
307 | - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); | |
308 | + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); | |
309 | if (error) { | |
310 | zfs_dirent_unlock(dl); | |
311 | if (error == ERESTART) { | |
312 | @@ -2156,7 +2157,7 @@ top: | |
313 | zfs_sa_upgrade_txholds(tx, zp); | |
314 | zfs_sa_upgrade_txholds(tx, dzp); | |
315 | dmu_tx_mark_netfree(tx); | |
316 | - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); | |
317 | + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); | |
318 | if (error) { | |
319 | rw_exit(&zp->z_parent_lock); | |
320 | rw_exit(&zp->z_name_lock); | |
321 | @@ -3623,7 +3624,7 @@ top: | |
322 | ||
323 | zfs_sa_upgrade_txholds(tx, szp); | |
324 | dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); | |
325 | - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); | |
326 | + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); | |
327 | if (error) { | |
328 | if (zl != NULL) | |
329 | zfs_rename_unlock(&zl); | |
330 | @@ -3815,7 +3816,7 @@ top: | |
331 | } | |
332 | if (fuid_dirtied) | |
333 | zfs_fuid_txhold(zfsvfs, tx); | |
334 | - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); | |
335 | + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); | |
336 | if (error) { | |
337 | zfs_dirent_unlock(dl); | |
338 | if (error == ERESTART) { | |
339 | @@ -4041,7 +4042,7 @@ top: | |
340 | ||
341 | zfs_sa_upgrade_txholds(tx, szp); | |
342 | zfs_sa_upgrade_txholds(tx, dzp); | |
343 | - error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT); | |
344 | + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); | |
345 | if (error) { | |
346 | zfs_dirent_unlock(dl); | |
347 | if (error == ERESTART) { | |
348 | diff --git a/module/zfs/zil.c b/module/zfs/zil.c | |
349 | index 645b1d4d..a2bbdcb9 100644 | |
350 | --- a/module/zfs/zil.c | |
351 | +++ b/module/zfs/zil.c | |
352 | @@ -1009,7 +1009,15 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) | |
353 | * to clean up in the event of allocation failure or I/O failure. | |
354 | */ | |
355 | tx = dmu_tx_create(zilog->zl_os); | |
356 | - VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); | |
357 | + | |
358 | + /* | |
359 | + * Since we are not going to create any new dirty data, and we | |
360 | + * can even help with clearing the existing dirty data, we | |
361 | + * should not be subject to the dirty data based delays. We | |
362 | + * use TXG_NOTHROTTLE to bypass the delay mechanism. | |
363 | + */ | |
364 | + VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); | |
365 | + | |
366 | dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); | |
367 | txg = dmu_tx_get_txg(tx); | |
368 |