From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Prakash Surya <prakash.surya@delphix.com>
Date: Mon, 8 Jan 2018 13:45:53 -0800
Subject: [PATCH] OpenZFS 8997 - ztest assertion failure in zil_lwb_write_issue

PROBLEM
=======

When `dmu_tx_assign` is called from `zil_lwb_write_issue`, it's possible
for either `ERESTART` or `EIO` to be returned.

If `ERESTART` is returned, this will cause an assertion to fail directly
in `zil_lwb_write_issue`, where the code assumes the return value is
`EIO` if `dmu_tx_assign` returns a non-zero value. This can occur if the
SPA is suspended when `dmu_tx_assign` is called, and most often occurs
when running `zloop`.

If `EIO` is returned, this can cause assertions to fail elsewhere in the
ZIL code. For example, `zil_commit_waiter_timeout` contains the
following logic:

    lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
    ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);

In this case, if `dmu_tx_assign` returned `EIO` from within
`zil_lwb_write_issue`, the `lwb` variable passed in will not be issued
to disk. Thus, its `lwb_state` field will remain `LWB_STATE_OPENED` and
this assertion will fail. `zil_commit_waiter_timeout` assumes that after
it calls `zil_lwb_write_issue`, the `lwb` will be issued to disk, and
doesn't handle the case where this is not true; i.e., it doesn't handle
the case where `dmu_tx_assign` returns `EIO`.
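
For illustration, the failing pattern described above has this shape (a
paraphrased sketch of the failure mode, not the exact upstream code):

    error = dmu_tx_assign(tx, TXG_WAITED);
    if (error != 0) {
            /* trips whenever ERESTART is returned instead of EIO */
            ASSERT3S(error, ==, EIO);
            ...
    }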

SOLUTION
========

This change modifies the `dmu_tx_assign` function such that `txg_how`
is a bitmask rather than a value of the `txg_how_t` enum type. Now, the
previous `TXG_WAITED` semantics can be obtained via `TXG_NOTHROTTLE`,
along with specifying either `TXG_NOWAIT` or `TXG_WAIT` semantics.

Previously, specifying `TXG_WAITED` automatically implied `TXG_NOWAIT`
semantics. This was not ideal when using `TXG_WAITED` within
`zil_lwb_write_issue`, leading to the problem described above. Rather,
we want to achieve the semantics of `TXG_WAIT`, while also preventing
the `tx` from being penalized via the dirty delay throttling.

With this change, `zil_lwb_write_issue` can achieve the semantics that
it requires by passing in the value `TXG_WAIT | TXG_NOTHROTTLE` to
`dmu_tx_assign`.

Further, consumers of `dmu_tx_assign` wishing to achieve the old
`TXG_WAITED` semantics can pass in the value `TXG_NOWAIT | TXG_NOTHROTTLE`.
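
As an illustrative sketch of the new interface (mirroring the call
sites changed in the diff below), a caller holding locks that may have
already waited once now passes:

    error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);

while the ZIL, which must wait for a txg but must not be throttled,
passes:

    VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));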

Authored by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>

Porting Notes:
- Additionally updated `zfs_tmpfile` to use `TXG_NOTHROTTLE`

OpenZFS-issue: https://www.illumos.org/issues/8997
OpenZFS-commit: https://github.com/openzfs/openzfs/commit/19ea6cb0f9
Closes #7084

Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
---
 include/sys/dmu.h      | 15 +++++++------
 include/sys/dmu_tx.h   |  8 +++----
 module/zfs/dmu_tx.c    | 57 ++++++++++++++++++++++++++------------------
 module/zfs/zfs_vnops.c | 21 ++++++++++---------
 module/zfs/zil.c       | 10 ++++++++-
 5 files changed, 63 insertions(+), 48 deletions(-)

diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 755a9056..5b355afb 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -227,11 +227,14 @@ typedef enum dmu_object_type {
 	DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
 } dmu_object_type_t;
 
-typedef enum txg_how {
-	TXG_WAIT = 1,
-	TXG_NOWAIT,
-	TXG_WAITED,
-} txg_how_t;
+/*
+ * These flags are intended to be used to specify the "txg_how"
+ * parameter when calling the dmu_tx_assign() function. See the comment
+ * above dmu_tx_assign() for more details on the meaning of these flags.
+ */
+#define	TXG_NOWAIT	(0ULL)
+#define	TXG_WAIT	(1ULL<<0)
+#define	TXG_NOTHROTTLE	(1ULL<<1)
 
 void byteswap_uint64_array(void *buf, size_t size);
 void byteswap_uint32_array(void *buf, size_t size);
@@ -694,7 +697,7 @@ void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
 void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
 void dmu_tx_abort(dmu_tx_t *tx);
-int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
 void dmu_tx_wait(dmu_tx_t *tx);
 void dmu_tx_commit(dmu_tx_t *tx);
 void dmu_tx_mark_netfree(dmu_tx_t *tx);
diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h
index d82a7931..74b7e111 100644
--- a/include/sys/dmu_tx.h
+++ b/include/sys/dmu_tx.h
@@ -67,9 +67,6 @@ struct dmu_tx {
 	/* placeholder for syncing context, doesn't need specific holds */
 	boolean_t tx_anyobj;
 
-	/* has this transaction already been delayed? */
-	boolean_t tx_waited;
-
 	/* transaction is marked as being a "net free" of space */
 	boolean_t tx_netfree;
 
@@ -79,6 +76,9 @@ struct dmu_tx {
 	/* need to wait for sufficient dirty space */
 	boolean_t tx_wait_dirty;
 
+	/* has this transaction already been delayed? */
+	boolean_t tx_dirty_delayed;
+
 	int tx_err;
 };
 
@@ -138,7 +138,7 @@ extern dmu_tx_stats_t dmu_tx_stats;
  * These routines are defined in dmu.h, and are called by the user.
  */
 dmu_tx_t *dmu_tx_create(objset_t *dd);
-int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
 void dmu_tx_commit(dmu_tx_t *tx);
 void dmu_tx_abort(dmu_tx_t *tx);
 uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c
index c3cc03a6..6ebff267 100644
--- a/module/zfs/dmu_tx.c
+++ b/module/zfs/dmu_tx.c
@@ -854,7 +854,7 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
  * decreasing performance.
  */
 static int
-dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
+dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
 {
 	spa_t *spa = tx->tx_pool->dp_spa;
 
@@ -878,13 +878,13 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 		 * of the failuremode setting.
 		 */
 		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
-		    txg_how != TXG_WAIT)
+		    !(txg_how & TXG_WAIT))
 			return (SET_ERROR(EIO));
 
 		return (SET_ERROR(ERESTART));
 	}
 
-	if (!tx->tx_waited &&
+	if (!tx->tx_dirty_delayed &&
 	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
 		tx->tx_wait_dirty = B_TRUE;
 		DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
@@ -976,41 +976,44 @@ dmu_tx_unassign(dmu_tx_t *tx)
 }
 
 /*
- * Assign tx to a transaction group. txg_how can be one of:
+ * Assign tx to a transaction group; txg_how is a bitmask:
  *
- * (1)	TXG_WAIT. If the current open txg is full, waits until there's
- *	a new one. This should be used when you're not holding locks.
- *	It will only fail if we're truly out of space (or over quota).
+ * If TXG_WAIT is set and the currently open txg is full, this function
+ * will wait until there's a new txg. This should be used when no locks
+ * are being held. With this bit set, this function will only fail if
+ * we're truly out of space (or over quota).
 *
- * (2)	TXG_NOWAIT. If we can't assign into the current open txg without
- *	blocking, returns immediately with ERESTART. This should be used
- *	whenever you're holding locks. On an ERESTART error, the caller
- *	should drop locks, do a dmu_tx_wait(tx), and try again.
+ * If TXG_WAIT is *not* set and we can't assign into the currently open
+ * txg without blocking, this function will return immediately with
+ * ERESTART. This should be used whenever locks are being held. On an
+ * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
+ * and try again.
 *
- * (3)	TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait()
- *	has already been called on behalf of this operation (though
- *	most likely on a different tx).
+ * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
+ * delayed due to the ZFS Write Throttle (see comments in dsl_pool.c for
+ * details on the throttle). This is used by the VFS operations, after
+ * they have already called dmu_tx_wait() (though most likely on a
+ * different tx).
  */
 int
-dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
+dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
 {
 	int err;
 
 	ASSERT(tx->tx_txg == 0);
-	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
-	    txg_how == TXG_WAITED);
+	ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
 	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
 
-	if (txg_how == TXG_WAITED)
-		tx->tx_waited = B_TRUE;
-
 	/* If we might wait, we must not hold the config lock. */
-	ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
+	IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
+
+	if ((txg_how & TXG_NOTHROTTLE))
+		tx->tx_dirty_delayed = B_TRUE;
 
 	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
 		dmu_tx_unassign(tx);
 
-		if (err != ERESTART || txg_how != TXG_WAIT)
+		if (err != ERESTART || !(txg_how & TXG_WAIT))
 			return (err);
 
 		dmu_tx_wait(tx);
@@ -1054,12 +1057,12 @@ dmu_tx_wait(dmu_tx_t *tx)
 		tx->tx_wait_dirty = B_FALSE;
 
 		/*
-		 * Note: setting tx_waited only has effect if the caller
-		 * used TX_WAIT. Otherwise they are going to destroy
-		 * this tx and try again. The common case, zfs_write(),
-		 * uses TX_WAIT.
+		 * Note: setting tx_dirty_delayed only has effect if the
+		 * caller used TX_WAIT. Otherwise they are going to
+		 * destroy this tx and try again. The common case,
+		 * zfs_write(), uses TX_WAIT.
 		 */
-		tx->tx_waited = B_TRUE;
+		tx->tx_dirty_delayed = B_TRUE;
 	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
 		/*
 		 * If the pool is suspended we need to wait until it
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 34ea751c..4805f897 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -129,7 +129,7 @@
  *
  * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
  * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
- * calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
+ * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
  * to indicate that this operation has already called dmu_tx_wait().
  * This will ensure that we don't retry forever, waiting a short bit
  * each time.
@@ -154,7 +154,7 @@
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
- *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
  *	if (error) {
  *		rw_exit(...);		// drop locks
  *		zfs_dirent_unlock(dl);	// unlock directory entry
@@ -1427,7 +1427,8 @@ top:
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 		    0, acl_ids.z_aclp->z_acl_bytes);
 	}
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx,
+	    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
@@ -1602,7 +1603,7 @@ top:
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 		    0, acl_ids.z_aclp->z_acl_bytes);
 	}
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		if (error == ERESTART) {
 			waited = B_TRUE;
@@ -1775,7 +1776,7 @@ top:
 	 */
 	dmu_tx_mark_netfree(tx);
 
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
@@ -2017,7 +2018,7 @@ top:
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
@@ -2156,7 +2157,7 @@ top:
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	dmu_tx_mark_netfree(tx);
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		rw_exit(&zp->z_parent_lock);
 		rw_exit(&zp->z_name_lock);
@@ -3623,7 +3624,7 @@ top:
 
 	zfs_sa_upgrade_txholds(tx, szp);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		if (zl != NULL)
 			zfs_rename_unlock(&zl);
@@ -3815,7 +3816,7 @@ top:
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
@@ -4041,7 +4042,7 @@ top:
 
 	zfs_sa_upgrade_txholds(tx, szp);
 	zfs_sa_upgrade_txholds(tx, dzp);
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 645b1d4d..a2bbdcb9 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1009,7 +1009,15 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 	 * to clean up in the event of allocation failure or I/O failure.
 	 */
 	tx = dmu_tx_create(zilog->zl_os);
-	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
+
+	/*
+	 * Since we are not going to create any new dirty data, and we
+	 * can even help with clearing the existing dirty data, we
+	 * should not be subject to the dirty data based delays. We
+	 * use TXG_NOTHROTTLE to bypass the delay mechanism.
+	 */
+	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
+
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);