]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
34dc7c2f BB |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
22cd4a46 | 23 | * Copyright 2011 Nexenta Systems, Inc. All rights reserved. |
4747a7d3 | 24 | * Copyright (c) 2012, 2017 by Delphix. All rights reserved. |
22cd4a46 | 25 | */ |
34dc7c2f | 26 | |
34dc7c2f BB |
27 | #include <sys/dmu.h> |
28 | #include <sys/dmu_impl.h> | |
29 | #include <sys/dbuf.h> | |
30 | #include <sys/dmu_tx.h> | |
31 | #include <sys/dmu_objset.h> | |
3ec3bc21 BB |
32 | #include <sys/dsl_dataset.h> |
33 | #include <sys/dsl_dir.h> | |
34dc7c2f | 34 | #include <sys/dsl_pool.h> |
3ec3bc21 | 35 | #include <sys/zap_impl.h> |
34dc7c2f | 36 | #include <sys/spa.h> |
428870ff BB |
37 | #include <sys/sa.h> |
38 | #include <sys/sa_impl.h> | |
34dc7c2f | 39 | #include <sys/zfs_context.h> |
e5d1c27e | 40 | #include <sys/trace_zfs.h> |
34dc7c2f BB |
41 | |
42 | typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, | |
43 | uint64_t arg1, uint64_t arg2); | |
44 | ||
570827e1 BB |
45 | dmu_tx_stats_t dmu_tx_stats = { |
46 | { "dmu_tx_assigned", KSTAT_DATA_UINT64 }, | |
47 | { "dmu_tx_delay", KSTAT_DATA_UINT64 }, | |
48 | { "dmu_tx_error", KSTAT_DATA_UINT64 }, | |
49 | { "dmu_tx_suspended", KSTAT_DATA_UINT64 }, | |
50 | { "dmu_tx_group", KSTAT_DATA_UINT64 }, | |
570827e1 BB |
51 | { "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 }, |
52 | { "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 }, | |
570827e1 | 53 | { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, |
e8b96c60 MA |
54 | { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, |
55 | { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, | |
750e1f88 | 56 | { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 }, |
84d0a03f | 57 | { "dmu_tx_wrlog_delay", KSTAT_DATA_UINT64 }, |
570827e1 BB |
58 | { "dmu_tx_quota", KSTAT_DATA_UINT64 }, |
59 | }; | |
60 | ||
61 | static kstat_t *dmu_tx_ksp; | |
34dc7c2f BB |
62 | |
63 | dmu_tx_t * | |
64 | dmu_tx_create_dd(dsl_dir_t *dd) | |
65 | { | |
79c76d5b | 66 | dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); |
34dc7c2f | 67 | tx->tx_dir = dd; |
6f1ffb06 | 68 | if (dd != NULL) |
34dc7c2f BB |
69 | tx->tx_pool = dd->dd_pool; |
70 | list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), | |
71 | offsetof(dmu_tx_hold_t, txh_node)); | |
428870ff BB |
72 | list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), |
73 | offsetof(dmu_tx_callback_t, dcb_node)); | |
e8b96c60 | 74 | tx->tx_start = gethrtime(); |
34dc7c2f BB |
75 | return (tx); |
76 | } | |
77 | ||
78 | dmu_tx_t * | |
79 | dmu_tx_create(objset_t *os) | |
80 | { | |
428870ff | 81 | dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); |
34dc7c2f | 82 | tx->tx_objset = os; |
34dc7c2f BB |
83 | return (tx); |
84 | } | |
85 | ||
86 | dmu_tx_t * | |
87 | dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) | |
88 | { | |
89 | dmu_tx_t *tx = dmu_tx_create_dd(NULL); | |
90 | ||
8c4fb36a | 91 | TXG_VERIFY(dp->dp_spa, txg); |
34dc7c2f BB |
92 | tx->tx_pool = dp; |
93 | tx->tx_txg = txg; | |
94 | tx->tx_anyobj = TRUE; | |
95 | ||
96 | return (tx); | |
97 | } | |
98 | ||
99 | int | |
100 | dmu_tx_is_syncing(dmu_tx_t *tx) | |
101 | { | |
102 | return (tx->tx_anyobj); | |
103 | } | |
104 | ||
105 | int | |
106 | dmu_tx_private_ok(dmu_tx_t *tx) | |
107 | { | |
108 | return (tx->tx_anyobj); | |
109 | } | |
110 | ||
111 | static dmu_tx_hold_t * | |
0eef1bde | 112 | dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type, |
113 | uint64_t arg1, uint64_t arg2) | |
34dc7c2f BB |
114 | { |
115 | dmu_tx_hold_t *txh; | |
34dc7c2f | 116 | |
0eef1bde | 117 | if (dn != NULL) { |
c13060e4 | 118 | (void) zfs_refcount_add(&dn->dn_holds, tx); |
0eef1bde | 119 | if (tx->tx_txg != 0) { |
34dc7c2f BB |
120 | mutex_enter(&dn->dn_mtx); |
121 | /* | |
122 | * dn->dn_assigned_txg == tx->tx_txg doesn't pose a | |
123 | * problem, but there's no way for it to happen (for | |
124 | * now, at least). | |
125 | */ | |
126 | ASSERT(dn->dn_assigned_txg == 0); | |
127 | dn->dn_assigned_txg = tx->tx_txg; | |
c13060e4 | 128 | (void) zfs_refcount_add(&dn->dn_tx_holds, tx); |
34dc7c2f BB |
129 | mutex_exit(&dn->dn_mtx); |
130 | } | |
131 | } | |
132 | ||
79c76d5b | 133 | txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); |
34dc7c2f BB |
134 | txh->txh_tx = tx; |
135 | txh->txh_dnode = dn; | |
424fd7c3 TS |
136 | zfs_refcount_create(&txh->txh_space_towrite); |
137 | zfs_refcount_create(&txh->txh_memory_tohold); | |
34dc7c2f BB |
138 | txh->txh_type = type; |
139 | txh->txh_arg1 = arg1; | |
140 | txh->txh_arg2 = arg2; | |
34dc7c2f BB |
141 | list_insert_tail(&tx->tx_holds, txh); |
142 | ||
143 | return (txh); | |
144 | } | |
145 | ||
0eef1bde | 146 | static dmu_tx_hold_t * |
147 | dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, | |
148 | enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) | |
149 | { | |
150 | dnode_t *dn = NULL; | |
151 | dmu_tx_hold_t *txh; | |
152 | int err; | |
153 | ||
154 | if (object != DMU_NEW_OBJECT) { | |
155 | err = dnode_hold(os, object, FTAG, &dn); | |
66eead53 | 156 | if (err != 0) { |
0eef1bde | 157 | tx->tx_err = err; |
158 | return (NULL); | |
159 | } | |
160 | } | |
161 | txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2); | |
162 | if (dn != NULL) | |
163 | dnode_rele(dn, FTAG); | |
164 | return (txh); | |
165 | } | |
166 | ||
34dc7c2f | 167 | void |
66eead53 | 168 | dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn) |
34dc7c2f BB |
169 | { |
170 | /* | |
171 | * If we're syncing, they can manipulate any object anyhow, and | |
172 | * the hold on the dnode_t can cause problems. | |
173 | */ | |
0eef1bde | 174 | if (!dmu_tx_is_syncing(tx)) |
175 | (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); | |
34dc7c2f BB |
176 | } |
177 | ||
3ec3bc21 BB |
178 | /* |
179 | * This function reads specified data from disk. The specified data will | |
180 | * be needed to perform the transaction -- i.e, it will be read after | |
181 | * we do dmu_tx_assign(). There are two reasons that we read the data now | |
182 | * (before dmu_tx_assign()): | |
183 | * | |
184 | * 1. Reading it now has potentially better performance. The transaction | |
185 | * has not yet been assigned, so the TXG is not held open, and also the | |
186 | * caller typically has less locks held when calling dmu_tx_hold_*() than | |
187 | * after the transaction has been assigned. This reduces the lock (and txg) | |
188 | * hold times, thus reducing lock contention. | |
189 | * | |
190 | * 2. It is easier for callers (primarily the ZPL) to handle i/o errors | |
191 | * that are detected before they start making changes to the DMU state | |
192 | * (i.e. now). Once the transaction has been assigned, and some DMU | |
193 | * state has been changed, it can be difficult to recover from an i/o | |
194 | * error (e.g. to undo the changes already made in memory at the DMU | |
195 | * layer). Typically code to do so does not exist in the caller -- it | |
196 | * assumes that the data has already been cached and thus i/o errors are | |
197 | * not possible. | |
198 | * | |
199 | * It has been observed that the i/o initiated here can be a performance | |
200 | * problem, and it appears to be optional, because we don't look at the | |
201 | * data which is read. However, removing this read would only serve to | |
202 | * move the work elsewhere (after the dmu_tx_assign()), where it may | |
203 | * have a greater impact on performance (in addition to the impact on | |
204 | * fault tolerance noted above). | |
205 | */ | |
34dc7c2f BB |
206 | static int |
207 | dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) | |
208 | { | |
209 | int err; | |
210 | dmu_buf_impl_t *db; | |
211 | ||
212 | rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
1b310dfb | 213 | err = dbuf_hold_impl(dn, level, blkid, TRUE, FALSE, FTAG, &db); |
34dc7c2f | 214 | rw_exit(&dn->dn_struct_rwlock); |
1b310dfb AM |
215 | if (err == ENOENT) |
216 | return (0); | |
217 | if (err != 0) | |
218 | return (err); | |
ed2f7ba0 AM |
219 | /* |
220 | * PARTIAL_FIRST allows caching for uncacheable blocks. It will | |
221 | * be cleared after dmu_buf_will_dirty() call dbuf_read() again. | |
222 | */ | |
223 | err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH | | |
224 | (level == 0 ? DB_RF_PARTIAL_FIRST : 0)); | |
34dc7c2f BB |
225 | dbuf_rele(db, FTAG); |
226 | return (err); | |
227 | } | |
228 | ||
34dc7c2f BB |
229 | static void |
230 | dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) | |
231 | { | |
232 | dnode_t *dn = txh->txh_dnode; | |
34dc7c2f BB |
233 | int err = 0; |
234 | ||
235 | if (len == 0) | |
236 | return; | |
237 | ||
424fd7c3 | 238 | (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); |
34dc7c2f | 239 | |
3ec3bc21 BB |
240 | if (dn == NULL) |
241 | return; | |
34dc7c2f | 242 | |
3ec3bc21 BB |
243 | /* |
244 | * For i/o error checking, read the blocks that will be needed | |
245 | * to perform the write: the first and last level-0 blocks (if | |
246 | * they are not aligned, i.e. if they are partial-block writes), | |
247 | * and all the level-1 blocks. | |
248 | */ | |
249 | if (dn->dn_maxblkid == 0) { | |
250 | if (off < dn->dn_datablksz && | |
251 | (off > 0 || len < dn->dn_datablksz)) { | |
252 | err = dmu_tx_check_ioerr(NULL, dn, 0, 0); | |
253 | if (err != 0) { | |
254 | txh->txh_tx->tx_err = err; | |
34dc7c2f | 255 | } |
9babb374 | 256 | } |
3ec3bc21 BB |
257 | } else { |
258 | zio_t *zio = zio_root(dn->dn_objset->os_spa, | |
259 | NULL, NULL, ZIO_FLAG_CANFAIL); | |
9babb374 | 260 | |
3ec3bc21 BB |
261 | /* first level-0 block */ |
262 | uint64_t start = off >> dn->dn_datablkshift; | |
263 | if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { | |
264 | err = dmu_tx_check_ioerr(zio, dn, 0, start); | |
265 | if (err != 0) { | |
266 | txh->txh_tx->tx_err = err; | |
267 | } | |
428870ff | 268 | } |
428870ff | 269 | |
3ec3bc21 BB |
270 | /* last level-0 block */ |
271 | uint64_t end = (off + len - 1) >> dn->dn_datablkshift; | |
272 | if (end != start && end <= dn->dn_maxblkid && | |
273 | P2PHASE(off + len, dn->dn_datablksz)) { | |
274 | err = dmu_tx_check_ioerr(zio, dn, 0, end); | |
275 | if (err != 0) { | |
428870ff | 276 | txh->txh_tx->tx_err = err; |
9babb374 | 277 | } |
3ec3bc21 | 278 | } |
428870ff | 279 | |
3ec3bc21 BB |
280 | /* level-1 blocks */ |
281 | if (dn->dn_nlevels > 1) { | |
282 | int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; | |
283 | for (uint64_t i = (start >> shft) + 1; | |
284 | i < end >> shft; i++) { | |
285 | err = dmu_tx_check_ioerr(zio, dn, 1, i); | |
286 | if (err != 0) { | |
287 | txh->txh_tx->tx_err = err; | |
288 | } | |
9babb374 | 289 | } |
9babb374 | 290 | } |
34dc7c2f | 291 | |
3ec3bc21 BB |
292 | err = zio_wait(zio); |
293 | if (err != 0) { | |
294 | txh->txh_tx->tx_err = err; | |
9babb374 | 295 | } |
34dc7c2f | 296 | } |
34dc7c2f BB |
297 | } |
298 | ||
903c3613 BB |
299 | static void |
300 | dmu_tx_count_append(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) | |
301 | { | |
302 | dnode_t *dn = txh->txh_dnode; | |
303 | int err = 0; | |
304 | ||
305 | if (len == 0) | |
306 | return; | |
307 | ||
308 | (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); | |
309 | ||
310 | if (dn == NULL) | |
311 | return; | |
312 | ||
313 | /* | |
314 | * For i/o error checking, read the blocks that will be needed | |
315 | * to perform the append; first level-0 block (if not aligned, i.e. | |
316 | * if they are partial-block writes), no additional blocks are read. | |
317 | */ | |
318 | if (dn->dn_maxblkid == 0) { | |
319 | if (off < dn->dn_datablksz && | |
320 | (off > 0 || len < dn->dn_datablksz)) { | |
321 | err = dmu_tx_check_ioerr(NULL, dn, 0, 0); | |
322 | if (err != 0) { | |
323 | txh->txh_tx->tx_err = err; | |
324 | } | |
325 | } | |
326 | } else { | |
327 | zio_t *zio = zio_root(dn->dn_objset->os_spa, | |
328 | NULL, NULL, ZIO_FLAG_CANFAIL); | |
329 | ||
330 | /* first level-0 block */ | |
331 | uint64_t start = off >> dn->dn_datablkshift; | |
332 | if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { | |
333 | err = dmu_tx_check_ioerr(zio, dn, 0, start); | |
334 | if (err != 0) { | |
335 | txh->txh_tx->tx_err = err; | |
336 | } | |
337 | } | |
338 | ||
339 | err = zio_wait(zio); | |
340 | if (err != 0) { | |
341 | txh->txh_tx->tx_err = err; | |
342 | } | |
343 | } | |
344 | } | |
345 | ||
34dc7c2f BB |
346 | static void |
347 | dmu_tx_count_dnode(dmu_tx_hold_t *txh) | |
348 | { | |
424fd7c3 TS |
349 | (void) zfs_refcount_add_many(&txh->txh_space_towrite, |
350 | DNODE_MIN_SIZE, FTAG); | |
34dc7c2f BB |
351 | } |
352 | ||
353 | void | |
354 | dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) | |
355 | { | |
356 | dmu_tx_hold_t *txh; | |
357 | ||
66eead53 MA |
358 | ASSERT0(tx->tx_txg); |
359 | ASSERT3U(len, <=, DMU_MAX_ACCESS); | |
34dc7c2f BB |
360 | ASSERT(len == 0 || UINT64_MAX - off >= len - 1); |
361 | ||
362 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
363 | object, THT_WRITE, off, len); | |
66eead53 MA |
364 | if (txh != NULL) { |
365 | dmu_tx_count_write(txh, off, len); | |
366 | dmu_tx_count_dnode(txh); | |
367 | } | |
34dc7c2f BB |
368 | } |
369 | ||
0eef1bde | 370 | void |
371 | dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) | |
372 | { | |
373 | dmu_tx_hold_t *txh; | |
374 | ||
66eead53 MA |
375 | ASSERT0(tx->tx_txg); |
376 | ASSERT3U(len, <=, DMU_MAX_ACCESS); | |
0eef1bde | 377 | ASSERT(len == 0 || UINT64_MAX - off >= len - 1); |
378 | ||
379 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len); | |
66eead53 MA |
380 | if (txh != NULL) { |
381 | dmu_tx_count_write(txh, off, len); | |
382 | dmu_tx_count_dnode(txh); | |
383 | } | |
0eef1bde | 384 | } |
385 | ||
903c3613 BB |
386 | /* |
387 | * Should be used when appending to an object and the exact offset is unknown. | |
388 | * The write must occur at or beyond the specified offset. Only the L0 block | |
389 | * at provided offset will be prefetched. | |
390 | */ | |
391 | void | |
392 | dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) | |
393 | { | |
394 | dmu_tx_hold_t *txh; | |
395 | ||
396 | ASSERT0(tx->tx_txg); | |
397 | ASSERT3U(len, <=, DMU_MAX_ACCESS); | |
398 | ||
399 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
400 | object, THT_APPEND, off, DMU_OBJECT_END); | |
401 | if (txh != NULL) { | |
402 | dmu_tx_count_append(txh, off, len); | |
403 | dmu_tx_count_dnode(txh); | |
404 | } | |
405 | } | |
406 | ||
407 | void | |
408 | dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) | |
409 | { | |
410 | dmu_tx_hold_t *txh; | |
411 | ||
412 | ASSERT0(tx->tx_txg); | |
413 | ASSERT3U(len, <=, DMU_MAX_ACCESS); | |
414 | ||
415 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_APPEND, off, DMU_OBJECT_END); | |
416 | if (txh != NULL) { | |
417 | dmu_tx_count_append(txh, off, len); | |
418 | dmu_tx_count_dnode(txh); | |
419 | } | |
420 | } | |
421 | ||
19d55079 MA |
422 | /* |
423 | * This function marks the transaction as being a "net free". The end | |
424 | * result is that refquotas will be disabled for this transaction, and | |
425 | * this transaction will be able to use half of the pool space overhead | |
426 | * (see dsl_pool_adjustedsize()). Therefore this function should only | |
427 | * be called for transactions that we expect will not cause a net increase | |
428 | * in the amount of space used (but it's OK if that is occasionally not true). | |
429 | */ | |
430 | void | |
431 | dmu_tx_mark_netfree(dmu_tx_t *tx) | |
432 | { | |
3ec3bc21 | 433 | tx->tx_netfree = B_TRUE; |
19d55079 MA |
434 | } |
435 | ||
0eef1bde | 436 | static void |
67a1b037 | 437 | dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) |
34dc7c2f | 438 | { |
3ec3bc21 BB |
439 | dmu_tx_t *tx = txh->txh_tx; |
440 | dnode_t *dn = txh->txh_dnode; | |
ea97f8ce | 441 | int err; |
34dc7c2f BB |
442 | |
443 | ASSERT(tx->tx_txg == 0); | |
444 | ||
3ec3bc21 | 445 | if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) |
34dc7c2f BB |
446 | return; |
447 | if (len == DMU_OBJECT_END) | |
3ec3bc21 | 448 | len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; |
34dc7c2f BB |
449 | |
450 | /* | |
ea97f8ce MA |
451 | * For i/o error checking, we read the first and last level-0 |
452 | * blocks if they are not aligned, and all the level-1 blocks. | |
453 | * | |
454 | * Note: dbuf_free_range() assumes that we have not instantiated | |
455 | * any level-0 dbufs that will be completely freed. Therefore we must | |
456 | * exercise care to not read or count the first and last blocks | |
457 | * if they are blocksize-aligned. | |
458 | */ | |
459 | if (dn->dn_datablkshift == 0) { | |
b663a23d | 460 | if (off != 0 || len < dn->dn_datablksz) |
92bc214c | 461 | dmu_tx_count_write(txh, 0, dn->dn_datablksz); |
ea97f8ce MA |
462 | } else { |
463 | /* first block will be modified if it is not aligned */ | |
464 | if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) | |
465 | dmu_tx_count_write(txh, off, 1); | |
466 | /* last block will be modified if it is not aligned */ | |
467 | if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) | |
3ec3bc21 | 468 | dmu_tx_count_write(txh, off + len, 1); |
ea97f8ce MA |
469 | } |
470 | ||
471 | /* | |
472 | * Check level-1 blocks. | |
34dc7c2f BB |
473 | */ |
474 | if (dn->dn_nlevels > 1) { | |
ea97f8ce | 475 | int shift = dn->dn_datablkshift + dn->dn_indblkshift - |
34dc7c2f | 476 | SPA_BLKPTRSHIFT; |
ea97f8ce MA |
477 | uint64_t start = off >> shift; |
478 | uint64_t end = (off + len) >> shift; | |
ea97f8ce | 479 | |
ea97f8ce | 480 | ASSERT(dn->dn_indblkshift != 0); |
34dc7c2f | 481 | |
2e7b7657 MA |
482 | /* |
483 | * dnode_reallocate() can result in an object with indirect | |
484 | * blocks having an odd data block size. In this case, | |
485 | * just check the single block. | |
486 | */ | |
487 | if (dn->dn_datablkshift == 0) | |
488 | start = end = 0; | |
489 | ||
3ec3bc21 | 490 | zio_t *zio = zio_root(tx->tx_pool->dp_spa, |
34dc7c2f | 491 | NULL, NULL, ZIO_FLAG_CANFAIL); |
1c27024e | 492 | for (uint64_t i = start; i <= end; i++) { |
34dc7c2f | 493 | uint64_t ibyte = i << shift; |
b128c09f | 494 | err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); |
34dc7c2f | 495 | i = ibyte >> shift; |
4bda3bd0 | 496 | if (err == ESRCH || i > end) |
34dc7c2f | 497 | break; |
3ec3bc21 | 498 | if (err != 0) { |
34dc7c2f | 499 | tx->tx_err = err; |
3ec3bc21 | 500 | (void) zio_wait(zio); |
34dc7c2f BB |
501 | return; |
502 | } | |
503 | ||
424fd7c3 | 504 | (void) zfs_refcount_add_many(&txh->txh_memory_tohold, |
3ec3bc21 BB |
505 | 1 << dn->dn_indblkshift, FTAG); |
506 | ||
34dc7c2f | 507 | err = dmu_tx_check_ioerr(zio, dn, 1, i); |
3ec3bc21 | 508 | if (err != 0) { |
34dc7c2f | 509 | tx->tx_err = err; |
3ec3bc21 | 510 | (void) zio_wait(zio); |
34dc7c2f BB |
511 | return; |
512 | } | |
513 | } | |
514 | err = zio_wait(zio); | |
3ec3bc21 | 515 | if (err != 0) { |
34dc7c2f BB |
516 | tx->tx_err = err; |
517 | return; | |
518 | } | |
519 | } | |
34dc7c2f BB |
520 | } |
521 | ||
522 | void | |
0eef1bde | 523 | dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) |
524 | { | |
525 | dmu_tx_hold_t *txh; | |
526 | ||
527 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
528 | object, THT_FREE, off, len); | |
67a1b037 PJD |
529 | if (txh != NULL) { |
530 | dmu_tx_count_dnode(txh); | |
531 | dmu_tx_count_free(txh, off, len); | |
532 | } | |
0eef1bde | 533 | } |
534 | ||
535 | void | |
536 | dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) | |
34dc7c2f BB |
537 | { |
538 | dmu_tx_hold_t *txh; | |
0eef1bde | 539 | |
540 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); | |
67a1b037 PJD |
541 | if (txh != NULL) { |
542 | dmu_tx_count_dnode(txh); | |
543 | dmu_tx_count_free(txh, off, len); | |
544 | } | |
545 | } | |
546 | ||
547 | static void | |
548 | dmu_tx_count_clone(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) | |
549 | { | |
550 | ||
551 | /* | |
552 | * Reuse dmu_tx_count_free(), it does exactly what we need for clone. | |
553 | */ | |
554 | dmu_tx_count_free(txh, off, len); | |
555 | } | |
556 | ||
557 | void | |
558 | dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) | |
559 | { | |
560 | dmu_tx_hold_t *txh; | |
561 | ||
562 | ASSERT0(tx->tx_txg); | |
563 | ASSERT(len == 0 || UINT64_MAX - off >= len - 1); | |
564 | ||
565 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_CLONE, off, len); | |
566 | if (txh != NULL) { | |
567 | dmu_tx_count_dnode(txh); | |
568 | dmu_tx_count_clone(txh, off, len); | |
569 | } | |
0eef1bde | 570 | } |
571 | ||
572 | static void | |
9522bd24 | 573 | dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) |
0eef1bde | 574 | { |
575 | dmu_tx_t *tx = txh->txh_tx; | |
3ec3bc21 | 576 | dnode_t *dn = txh->txh_dnode; |
f85c06be | 577 | int err; |
a4b21ead | 578 | extern int zap_micro_max_size; |
34dc7c2f BB |
579 | |
580 | ASSERT(tx->tx_txg == 0); | |
581 | ||
34dc7c2f BB |
582 | dmu_tx_count_dnode(txh); |
583 | ||
3ec3bc21 BB |
584 | /* |
585 | * Modifying a almost-full microzap is around the worst case (128KB) | |
586 | * | |
587 | * If it is a fat zap, the worst case would be 7*16KB=112KB: | |
588 | * - 3 blocks overwritten: target leaf, ptrtbl block, header block | |
589 | * - 4 new blocks written if adding: | |
590 | * - 2 blocks for possibly split leaves, | |
591 | * - 2 grown ptrtbl blocks | |
592 | */ | |
424fd7c3 | 593 | (void) zfs_refcount_add_many(&txh->txh_space_towrite, |
a4b21ead | 594 | zap_micro_max_size, FTAG); |
3ec3bc21 BB |
595 | |
596 | if (dn == NULL) | |
34dc7c2f | 597 | return; |
34dc7c2f | 598 | |
9ae529ec | 599 | ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); |
34dc7c2f | 600 | |
3ec3bc21 | 601 | if (dn->dn_maxblkid == 0 || name == NULL) { |
34dc7c2f | 602 | /* |
3ec3bc21 BB |
603 | * This is a microzap (only one block), or we don't know |
604 | * the name. Check the first block for i/o errors. | |
34dc7c2f BB |
605 | */ |
606 | err = dmu_tx_check_ioerr(NULL, dn, 0, 0); | |
3ec3bc21 | 607 | if (err != 0) { |
34dc7c2f | 608 | tx->tx_err = err; |
f85c06be | 609 | } |
3ec3bc21 | 610 | } else { |
34dc7c2f | 611 | /* |
3ec3bc21 BB |
612 | * Access the name so that we'll check for i/o errors to |
613 | * the leaf blocks, etc. We ignore ENOENT, as this name | |
614 | * may not yet exist. | |
34dc7c2f | 615 | */ |
2bce8049 | 616 | err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); |
3ec3bc21 | 617 | if (err == EIO || err == ECKSUM || err == ENXIO) { |
34dc7c2f | 618 | tx->tx_err = err; |
f85c06be GM |
619 | } |
620 | } | |
34dc7c2f BB |
621 | } |
622 | ||
0eef1bde | 623 | void |
624 | dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) | |
625 | { | |
626 | dmu_tx_hold_t *txh; | |
627 | ||
66eead53 | 628 | ASSERT0(tx->tx_txg); |
0eef1bde | 629 | |
630 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
631 | object, THT_ZAP, add, (uintptr_t)name); | |
66eead53 | 632 | if (txh != NULL) |
9522bd24 | 633 | dmu_tx_hold_zap_impl(txh, name); |
0eef1bde | 634 | } |
635 | ||
636 | void | |
637 | dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name) | |
638 | { | |
639 | dmu_tx_hold_t *txh; | |
640 | ||
66eead53 | 641 | ASSERT0(tx->tx_txg); |
0eef1bde | 642 | ASSERT(dn != NULL); |
643 | ||
644 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name); | |
66eead53 | 645 | if (txh != NULL) |
9522bd24 | 646 | dmu_tx_hold_zap_impl(txh, name); |
0eef1bde | 647 | } |
648 | ||
34dc7c2f BB |
649 | void |
650 | dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) | |
651 | { | |
652 | dmu_tx_hold_t *txh; | |
653 | ||
654 | ASSERT(tx->tx_txg == 0); | |
655 | ||
656 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
657 | object, THT_BONUS, 0, 0); | |
658 | if (txh) | |
659 | dmu_tx_count_dnode(txh); | |
660 | } | |
661 | ||
0eef1bde | 662 | void |
663 | dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn) | |
664 | { | |
665 | dmu_tx_hold_t *txh; | |
666 | ||
66eead53 | 667 | ASSERT0(tx->tx_txg); |
0eef1bde | 668 | |
669 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0); | |
670 | if (txh) | |
671 | dmu_tx_count_dnode(txh); | |
672 | } | |
673 | ||
34dc7c2f BB |
674 | void |
675 | dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) | |
676 | { | |
677 | dmu_tx_hold_t *txh; | |
7d637211 | 678 | |
34dc7c2f BB |
679 | ASSERT(tx->tx_txg == 0); |
680 | ||
681 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
682 | DMU_NEW_OBJECT, THT_SPACE, space, 0); | |
424fd7c3 TS |
683 | if (txh) { |
684 | (void) zfs_refcount_add_many( | |
685 | &txh->txh_space_towrite, space, FTAG); | |
686 | } | |
34dc7c2f BB |
687 | } |
688 | ||
#ifdef ZFS_DEBUG
/*
 * Debug-only check that dirtying 'db' is covered by the holds recorded on
 * the assigned transaction 'tx'.  Panics if it is not.
 *
 * Transactions with tx_anyobj set and dbufs of the meta dnode are not
 * checked.  Otherwise the hold list is walked looking for an object match
 * and an offset match; the two flags accumulate across holds, so they do
 * not have to be satisfied by the same hold.
 */
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	boolean_t match_object = B_FALSE;
	boolean_t match_offset = B_FALSE;

	DB_DNODE_ENTER(db);
	dnode_t *dn = DB_DNODE(db);
	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	if (tx->tx_anyobj) {
		DB_DNODE_EXIT(db);
		return;
	}

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
		DB_DNODE_EXIT(db);
		return;
	}

	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
			match_object = B_TRUE;
		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
			/*
			 * Translate the hold's byte range (arg1, arg2) into
			 * block ids at this dbuf's level.
			 */
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			uint64_t beginblk = shift >= 64 ? 0 :
			    (txh->txh_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX txh_arg2 better not be zero... */

			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
			    txh->txh_type, (u_longlong_t)beginblk,
			    (u_longlong_t)endblk);

			switch (txh->txh_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = B_TRUE;
				/*
				 * We will let this hold work for the bonus
				 * or spill buffer so that we don't need to
				 * hold it when creating a new object.
				 */
				if (blkid == DMU_BONUS_BLKID ||
				    blkid == DMU_SPILL_BLKID)
					match_offset = B_TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = B_TRUE;
				break;
			case THT_APPEND:
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = B_TRUE;

				/*
				 * THT_WRITE used for bonus and spill blocks.
				 */
				ASSERT(blkid != DMU_BONUS_BLKID &&
				    blkid != DMU_SPILL_BLKID);

				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = B_TRUE;
				break;
			case THT_FREE:
				/*
				 * We will dirty all the level 1 blocks in
				 * the free range and perhaps the first and
				 * last level 0 block.
				 */
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = B_TRUE;
				break;
			case THT_SPILL:
				if (blkid == DMU_SPILL_BLKID)
					match_offset = B_TRUE;
				break;
			case THT_BONUS:
				if (blkid == DMU_BONUS_BLKID)
					match_offset = B_TRUE;
				break;
			case THT_ZAP:
				/* A ZAP hold covers every block. */
				match_offset = B_TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = B_TRUE;
				break;
			case THT_CLONE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = B_TRUE;
				break;
			default:
				cmn_err(CE_PANIC, "bad txh_type %d",
				    txh->txh_type);
			}
		}
		if (match_object && match_offset) {
			DB_DNODE_EXIT(db);
			return;
		}
	}
	DB_DNODE_EXIT(db);
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
#endif
820 | ||
e8b96c60 MA |
821 | /* |
822 | * If we can't do 10 iops, something is wrong. Let us go ahead | |
823 | * and hit zfs_dirty_data_max. | |
824 | */ | |
18168da7 | 825 | static const hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ |
e8b96c60 MA |
826 | |
827 | /* | |
828 | * We delay transactions when we've determined that the backend storage | |
829 | * isn't able to accommodate the rate of incoming writes. | |
830 | * | |
831 | * If there is already a transaction waiting, we delay relative to when | |
832 | * that transaction finishes waiting. This way the calculated min_time | |
833 | * is independent of the number of threads concurrently executing | |
834 | * transactions. | |
835 | * | |
836 | * If we are the only waiter, wait relative to when the transaction | |
837 | * started, rather than the current time. This credits the transaction for | |
838 | * "time already served", e.g. reading indirect blocks. | |
839 | * | |
840 | * The minimum time for a transaction to take is calculated as: | |
841 | * min_time = scale * (dirty - min) / (max - dirty) | |
842 | * min_time is then capped at zfs_delay_max_ns. | |
843 | * | |
844 | * The delay has two degrees of freedom that can be adjusted via tunables. | |
845 | * The percentage of dirty data at which we start to delay is defined by | |
846 | * zfs_delay_min_dirty_percent. This should typically be at or above | |
847 | * zfs_vdev_async_write_active_max_dirty_percent so that we only start to | |
848 | * delay after writing at full speed has failed to keep up with the incoming | |
849 | * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly | |
850 | * speaking, this variable determines the amount of delay at the midpoint of | |
851 | * the curve. | |
852 | * | |
853 | * delay | |
854 | * 10ms +-------------------------------------------------------------*+ | |
855 | * | *| | |
856 | * 9ms + *+ | |
857 | * | *| | |
858 | * 8ms + *+ | |
859 | * | * | | |
860 | * 7ms + * + | |
861 | * | * | | |
862 | * 6ms + * + | |
863 | * | * | | |
864 | * 5ms + * + | |
865 | * | * | | |
866 | * 4ms + * + | |
867 | * | * | | |
868 | * 3ms + * + | |
869 | * | * | | |
870 | * 2ms + (midpoint) * + | |
871 | * | | ** | | |
872 | * 1ms + v *** + | |
873 | * | zfs_delay_scale ----------> ******** | | |
874 | * 0 +-------------------------------------*********----------------+ | |
875 | * 0% <- zfs_dirty_data_max -> 100% | |
876 | * | |
877 | * Note that since the delay is added to the outstanding time remaining on the | |
878 | * most recent transaction, the delay is effectively the inverse of IOPS. | |
879 | * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve | |
880 | * was chosen such that small changes in the amount of accumulated dirty data | |
881 | * in the first 3/4 of the curve yield relatively small differences in the | |
882 | * amount of delay. | |
883 | * | |
884 | * The effects can be easier to understand when the amount of delay is | |
885 | * represented on a log scale: | |
886 | * | |
887 | * delay | |
888 | * 100ms +-------------------------------------------------------------++ | |
889 | * + + | |
890 | * | | | |
891 | * + *+ | |
892 | * 10ms + *+ | |
893 | * + ** + | |
894 | * | (midpoint) ** | | |
895 | * + | ** + | |
896 | * 1ms + v **** + | |
897 | * + zfs_delay_scale ----------> ***** + | |
898 | * | **** | | |
899 | * + **** + | |
900 | * 100us + ** + | |
901 | * + * + | |
902 | * | * | | |
903 | * + * + | |
904 | * 10us + * + | |
905 | * + + | |
906 | * | | | |
907 | * + + | |
908 | * +--------------------------------------------------------------+ | |
909 | * 0% <- zfs_dirty_data_max -> 100% | |
910 | * | |
911 | * Note here that only as the amount of dirty data approaches its limit does | |
912 | * the delay start to increase rapidly. The goal of a properly tuned system | |
913 | * should be to keep the amount of dirty data out of that range by first | |
914 | * ensuring that the appropriate limits are set for the I/O scheduler to reach | |
915 | * optimal throughput on the backend storage, and then by changing the value | |
916 | * of zfs_delay_scale to increase the steepness of the curve. | |
917 | */ | |
918 | static void | |
919 | dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) | |
920 | { | |
921 | dsl_pool_t *dp = tx->tx_pool; | |
84d0a03f AM |
922 | uint64_t delay_min_bytes, wrlog; |
923 | hrtime_t wakeup, tx_time = 0, now; | |
924 | ||
925 | /* Calculate minimum transaction time for the dirty data amount. */ | |
926 | delay_min_bytes = | |
e8b96c60 | 927 | zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; |
84d0a03f AM |
928 | if (dirty > delay_min_bytes) { |
929 | /* | |
930 | * The caller has already waited until we are under the max. | |
931 | * We make them pass us the amount of dirty data so we don't | |
932 | * have to handle the case of it being >= the max, which | |
933 | * could cause a divide-by-zero if it's == the max. | |
934 | */ | |
935 | ASSERT3U(dirty, <, zfs_dirty_data_max); | |
e8b96c60 | 936 | |
84d0a03f AM |
937 | tx_time = zfs_delay_scale * (dirty - delay_min_bytes) / |
938 | (zfs_dirty_data_max - dirty); | |
939 | } | |
e8b96c60 | 940 | |
84d0a03f AM |
941 | /* Calculate minimum transaction time for the TX_WRITE log size. */ |
942 | wrlog = aggsum_upper_bound(&dp->dp_wrlog_total); | |
943 | delay_min_bytes = | |
944 | zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; | |
945 | if (wrlog >= zfs_wrlog_data_max) { | |
946 | tx_time = zfs_delay_max_ns; | |
947 | } else if (wrlog > delay_min_bytes) { | |
948 | tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) / | |
949 | (zfs_wrlog_data_max - wrlog), tx_time); | |
950 | } | |
951 | ||
952 | if (tx_time == 0) | |
953 | return; | |
e8b96c60 | 954 | |
84d0a03f | 955 | tx_time = MIN(tx_time, zfs_delay_max_ns); |
e8b96c60 | 956 | now = gethrtime(); |
84d0a03f | 957 | if (now > tx->tx_start + tx_time) |
e8b96c60 MA |
958 | return; |
959 | ||
960 | DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, | |
84d0a03f | 961 | uint64_t, tx_time); |
e8b96c60 MA |
962 | |
963 | mutex_enter(&dp->dp_lock); | |
84d0a03f | 964 | wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time); |
e8b96c60 MA |
965 | dp->dp_last_wakeup = wakeup; |
966 | mutex_exit(&dp->dp_lock); | |
967 | ||
968 | zfs_sleep_until(wakeup); | |
969 | } | |
970 | ||
3ec3bc21 BB |
971 | /* |
972 | * This routine attempts to assign the transaction to a transaction group. | |
973 | * To do so, we must determine if there is sufficient free space on disk. | |
974 | * | |
975 | * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() | |
976 | * on it), then it is assumed that there is sufficient free space, | |
977 | * unless there's insufficient slop space in the pool (see the comment | |
978 | * above spa_slop_shift in spa_misc.c). | |
979 | * | |
980 | * If it is not a "netfree" transaction, then if the data already on disk | |
981 | * is over the allowed usage (e.g. quota), this will fail with EDQUOT or | |
982 | * ENOSPC. Otherwise, if the current rough estimate of pending changes, | |
983 | * plus the rough estimate of this transaction's changes, may exceed the | |
984 | * allowed usage, then this will fail with ERESTART, which will cause the | |
985 | * caller to wait for the pending changes to be written to disk (by waiting | |
986 | * for the next TXG to open), and then check the space usage again. | |
987 | * | |
988 | * The rough estimate of pending changes is comprised of the sum of: | |
989 | * | |
990 | * - this transaction's holds' txh_space_towrite | |
991 | * | |
992 | * - dd_tempreserved[], which is the sum of in-flight transactions' | |
993 | * holds' txh_space_towrite (i.e. those transactions that have called | |
994 | * dmu_tx_assign() but not yet called dmu_tx_commit()). | |
995 | * | |
996 | * - dd_space_towrite[], which is the amount of dirtied dbufs. | |
997 | * | |
998 | * Note that all of these values are inflated by spa_get_worst_case_asize(), | |
999 | * which means that we may get ERESTART well before we are actually in danger | |
1000 | * of running out of space, but this also mitigates any small inaccuracies | |
1001 | * in the rough estimate (e.g. txh_space_towrite doesn't take into account | |
1002 | * indirect blocks, and dd_space_towrite[] doesn't take into account changes | |
1003 | * to the MOS). | |
1004 | * | |
1005 | * Note that due to this algorithm, it is possible to exceed the allowed | |
1006 | * usage by one transaction. Also, as we approach the allowed usage, | |
1007 | * we will allow a very limited amount of changes into each TXG, thus | |
1008 | * decreasing performance. | |
1009 | */ | |
static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	spa_t *spa = tx->tx_pool->dp_spa;

	/* The tx must not already be assigned to a txg. */
	ASSERT0(tx->tx_txg);

	/* An error recorded on the tx earlier is reported here. */
	if (tx->tx_err) {
		DMU_TX_STAT_BUMP(dmu_tx_error);
		return (tx->tx_err);
	}

	if (spa_suspended(spa)) {
		DMU_TX_STAT_BUMP(dmu_tx_suspended);

		/*
		 * If the user has indicated a blocking failure mode
		 * then return ERESTART which will block in dmu_tx_wait().
		 * Otherwise, return EIO so that an error can get
		 * propagated back to the VOP calls.
		 *
		 * Note that we always honor the txg_how flag regardless
		 * of the failuremode setting.
		 */
		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
		    !(txg_how & TXG_WAIT))
			return (SET_ERROR(EIO));

		return (SET_ERROR(ERESTART));
	}

	/*
	 * Throttle checks.  Both are skipped once tx_dirty_delayed is set
	 * (by dmu_tx_wait() after it has delayed, or by dmu_tx_assign() for
	 * TXG_NOTHROTTLE).  Setting tx_wait_dirty makes the following
	 * dmu_tx_wait() take its dirty-data delay path.
	 */
	if (!tx->tx_dirty_delayed &&
	    dsl_pool_need_wrlog_delay(tx->tx_pool)) {
		tx->tx_wait_dirty = B_TRUE;
		DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay);
		return (SET_ERROR(ERESTART));
	}

	if (!tx->tx_dirty_delayed &&
	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
		tx->tx_wait_dirty = B_TRUE;
		DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
		return (SET_ERROR(ERESTART));
	}

	/*
	 * From here on tx_txg is non-zero, so a failure return is undone by
	 * dmu_tx_unassign().  tx_needassign_txh == NULL means "every hold
	 * was processed".
	 */
	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
	tx->tx_needassign_txh = NULL;

	/*
	 * NB: No error returns are allowed after txg_hold_open, but
	 * before processing the dnode holds, due to the
	 * dmu_tx_unassign() logic.
	 */

	/* Running totals of the per-hold space and memory estimates. */
	uint64_t towrite = 0;
	uint64_t tohold = 0;
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;
		if (dn != NULL) {
			/*
			 * This thread can't hold the dn_struct_rwlock
			 * while assigning the tx, because this can lead to
			 * deadlock. Specifically, if this dnode is already
			 * assigned to an earlier txg, this thread may need
			 * to wait for that txg to sync (the ERESTART case
			 * below). The other thread that has assigned this
			 * dnode to an earlier txg prevents this txg from
			 * syncing until its tx can complete (calling
			 * dmu_tx_commit()), but it may need to acquire the
			 * dn_struct_rwlock to do so (e.g. via
			 * dmu_buf_hold*()).
			 *
			 * Note that this thread can't hold the lock for
			 * read either, but the rwlock doesn't record
			 * enough information to make that assertion.
			 */
			ASSERT(!RW_WRITE_HELD(&dn->dn_struct_rwlock));

			mutex_enter(&dn->dn_mtx);
			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
				/*
				 * The dnode is still assigned to the previous
				 * txg.  Remember which hold we stopped at:
				 * dmu_tx_unassign() releases only the holds
				 * before it, and dmu_tx_wait() waits on this
				 * dnode's dn_notxholds.
				 */
				mutex_exit(&dn->dn_mtx);
				tx->tx_needassign_txh = txh;
				DMU_TX_STAT_BUMP(dmu_tx_group);
				return (SET_ERROR(ERESTART));
			}
			/* First holder in this txg claims the dnode for it. */
			if (dn->dn_assigned_txg == 0)
				dn->dn_assigned_txg = tx->tx_txg;
			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
			(void) zfs_refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
		towrite += zfs_refcount_count(&txh->txh_space_towrite);
		tohold += zfs_refcount_count(&txh->txh_memory_tohold);
	}

	/* needed allocation: worst-case estimate of write space */
	uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
	/* calculate memory footprint estimate */
	uint64_t memory = towrite + tohold;

	/*
	 * Reserve the space against the tx's dsl_dir, if it has one and
	 * there is anything to reserve.  On failure every hold has already
	 * been taken (tx_needassign_txh is still NULL), so dmu_tx_unassign()
	 * will drop them all.
	 */
	if (tx->tx_dir != NULL && asize != 0) {
		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
		    asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
		if (err != 0)
			return (err);
	}

	DMU_TX_STAT_BUMP(dmu_tx_assigned);

	return (0);
}
1122 | ||
1123 | static void | |
1124 | dmu_tx_unassign(dmu_tx_t *tx) | |
1125 | { | |
34dc7c2f BB |
1126 | if (tx->tx_txg == 0) |
1127 | return; | |
1128 | ||
1129 | txg_rele_to_quiesce(&tx->tx_txgh); | |
1130 | ||
e49f1e20 WA |
1131 | /* |
1132 | * Walk the transaction's hold list, removing the hold on the | |
1133 | * associated dnode, and notifying waiters if the refcount drops to 0. | |
1134 | */ | |
3ec3bc21 | 1135 | for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); |
981b2126 | 1136 | txh && txh != tx->tx_needassign_txh; |
34dc7c2f BB |
1137 | txh = list_next(&tx->tx_holds, txh)) { |
1138 | dnode_t *dn = txh->txh_dnode; | |
1139 | ||
1140 | if (dn == NULL) | |
1141 | continue; | |
1142 | mutex_enter(&dn->dn_mtx); | |
1143 | ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); | |
1144 | ||
424fd7c3 | 1145 | if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) { |
34dc7c2f BB |
1146 | dn->dn_assigned_txg = 0; |
1147 | cv_broadcast(&dn->dn_notxholds); | |
1148 | } | |
1149 | mutex_exit(&dn->dn_mtx); | |
1150 | } | |
1151 | ||
1152 | txg_rele_to_sync(&tx->tx_txgh); | |
1153 | ||
1154 | tx->tx_lasttried_txg = tx->tx_txg; | |
1155 | tx->tx_txg = 0; | |
1156 | } | |
1157 | ||
1158 | /* | |
0735ecb3 | 1159 | * Assign tx to a transaction group; txg_how is a bitmask: |
34dc7c2f | 1160 | * |
0735ecb3 PS |
1161 | * If TXG_WAIT is set and the currently open txg is full, this function |
1162 | * will wait until there's a new txg. This should be used when no locks | |
1163 | * are being held. With this bit set, this function will only fail if | |
1164 | * we're truly out of space (or over quota). | |
34dc7c2f | 1165 | * |
0735ecb3 PS |
1166 | * If TXG_WAIT is *not* set and we can't assign into the currently open |
1167 | * txg without blocking, this function will return immediately with | |
1168 | * ERESTART. This should be used whenever locks are being held. On an | |
1169 | * ERESTART error, the caller should drop all locks, call dmu_tx_wait(), | |
1170 | * and try again. | |
e8b96c60 | 1171 | * |
0735ecb3 PS |
1172 | * If TXG_NOTHROTTLE is set, this indicates that this tx should not be |
1173 | * delayed due to the ZFS Write Throttle (see comments in dsl_pool.c for | |
1174 | * details on the throttle). This is used by the VFS operations, after | |
1175 | * they have already called dmu_tx_wait() (though most likely on a | |
1176 | * different tx). | |
84268b09 CS |
1177 | * |
1178 | * It is guaranteed that subsequent successful calls to dmu_tx_assign() | |
1179 | * will assign the tx to monotonically increasing txgs. Of course this is | |
1180 | * not strong monotonicity, because the same txg can be returned multiple | |
1181 | * times in a row. This guarantee holds both for subsequent calls from | |
1182 | * one thread and for multiple threads. For example, it is impossible to | |
1183 | * observe the following sequence of events: | |
1184 | * | |
1185 | * Thread 1 Thread 2 | |
1186 | * | |
1187 | * dmu_tx_assign(T1, ...) | |
1188 | * 1 <- dmu_tx_get_txg(T1) | |
1189 | * dmu_tx_assign(T2, ...) | |
1190 | * 2 <- dmu_tx_get_txg(T2) | |
1191 | * dmu_tx_assign(T3, ...) | |
1192 | * 1 <- dmu_tx_get_txg(T3) | |
34dc7c2f BB |
1193 | */ |
1194 | int | |
0735ecb3 | 1195 | dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) |
34dc7c2f BB |
1196 | { |
1197 | int err; | |
1198 | ||
1199 | ASSERT(tx->tx_txg == 0); | |
0735ecb3 | 1200 | ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE)); |
34dc7c2f BB |
1201 | ASSERT(!dsl_pool_sync_context(tx->tx_pool)); |
1202 | ||
13fe0198 | 1203 | /* If we might wait, we must not hold the config lock. */ |
0735ecb3 PS |
1204 | IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool)); |
1205 | ||
1206 | if ((txg_how & TXG_NOTHROTTLE)) | |
1207 | tx->tx_dirty_delayed = B_TRUE; | |
13fe0198 | 1208 | |
34dc7c2f BB |
1209 | while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { |
1210 | dmu_tx_unassign(tx); | |
1211 | ||
0735ecb3 | 1212 | if (err != ERESTART || !(txg_how & TXG_WAIT)) |
34dc7c2f BB |
1213 | return (err); |
1214 | ||
1215 | dmu_tx_wait(tx); | |
1216 | } | |
1217 | ||
1218 | txg_rele_to_quiesce(&tx->tx_txgh); | |
1219 | ||
1220 | return (0); | |
1221 | } | |
1222 | ||
1223 | void | |
1224 | dmu_tx_wait(dmu_tx_t *tx) | |
1225 | { | |
1226 | spa_t *spa = tx->tx_pool->dp_spa; | |
e8b96c60 | 1227 | dsl_pool_t *dp = tx->tx_pool; |
a77c4c83 | 1228 | hrtime_t before; |
34dc7c2f BB |
1229 | |
1230 | ASSERT(tx->tx_txg == 0); | |
13fe0198 | 1231 | ASSERT(!dsl_pool_config_held(tx->tx_pool)); |
34dc7c2f | 1232 | |
a77c4c83 NB |
1233 | before = gethrtime(); |
1234 | ||
e8b96c60 MA |
1235 | if (tx->tx_wait_dirty) { |
1236 | uint64_t dirty; | |
1237 | ||
1238 | /* | |
1239 | * dmu_tx_try_assign() has determined that we need to wait | |
1240 | * because we've consumed much or all of the dirty buffer | |
1241 | * space. | |
1242 | */ | |
1243 | mutex_enter(&dp->dp_lock); | |
1244 | if (dp->dp_dirty_total >= zfs_dirty_data_max) | |
1245 | DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max); | |
1246 | while (dp->dp_dirty_total >= zfs_dirty_data_max) | |
1247 | cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); | |
1248 | dirty = dp->dp_dirty_total; | |
1249 | mutex_exit(&dp->dp_lock); | |
1250 | ||
1251 | dmu_tx_delay(tx, dirty); | |
1252 | ||
1253 | tx->tx_wait_dirty = B_FALSE; | |
1254 | ||
1255 | /* | |
0735ecb3 PS |
1256 | * Note: setting tx_dirty_delayed only has effect if the |
1257 | * caller used TX_WAIT. Otherwise they are going to | |
1258 | * destroy this tx and try again. The common case, | |
1259 | * zfs_write(), uses TX_WAIT. | |
e8b96c60 | 1260 | */ |
0735ecb3 | 1261 | tx->tx_dirty_delayed = B_TRUE; |
e8b96c60 MA |
1262 | } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { |
1263 | /* | |
1264 | * If the pool is suspended we need to wait until it | |
1265 | * is resumed. Note that it's possible that the pool | |
1266 | * has become active after this thread has tried to | |
1267 | * obtain a tx. If that's the case then tx_lasttried_txg | |
1268 | * would not have been set. | |
1269 | */ | |
1270 | txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); | |
34dc7c2f BB |
1271 | } else if (tx->tx_needassign_txh) { |
1272 | dnode_t *dn = tx->tx_needassign_txh->txh_dnode; | |
1273 | ||
1274 | mutex_enter(&dn->dn_mtx); | |
1275 | while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) | |
1276 | cv_wait(&dn->dn_notxholds, &dn->dn_mtx); | |
1277 | mutex_exit(&dn->dn_mtx); | |
1278 | tx->tx_needassign_txh = NULL; | |
1279 | } else { | |
e8b96c60 | 1280 | /* |
e48afbc4 SD |
1281 | * If we have a lot of dirty data just wait until we sync |
1282 | * out a TXG at which point we'll hopefully have synced | |
1283 | * a portion of the changes. | |
e8b96c60 | 1284 | */ |
e48afbc4 | 1285 | txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); |
34dc7c2f | 1286 | } |
a77c4c83 NB |
1287 | |
1288 | spa_tx_assign_add_nsecs(spa, gethrtime() - before); | |
34dc7c2f BB |
1289 | } |
1290 | ||
f85c06be GM |
1291 | static void |
1292 | dmu_tx_destroy(dmu_tx_t *tx) | |
1293 | { | |
1294 | dmu_tx_hold_t *txh; | |
1295 | ||
1296 | while ((txh = list_head(&tx->tx_holds)) != NULL) { | |
1297 | dnode_t *dn = txh->txh_dnode; | |
1298 | ||
1299 | list_remove(&tx->tx_holds, txh); | |
424fd7c3 TS |
1300 | zfs_refcount_destroy_many(&txh->txh_space_towrite, |
1301 | zfs_refcount_count(&txh->txh_space_towrite)); | |
1302 | zfs_refcount_destroy_many(&txh->txh_memory_tohold, | |
1303 | zfs_refcount_count(&txh->txh_memory_tohold)); | |
f85c06be GM |
1304 | kmem_free(txh, sizeof (dmu_tx_hold_t)); |
1305 | if (dn != NULL) | |
1306 | dnode_rele(dn, tx); | |
1307 | } | |
1308 | ||
1309 | list_destroy(&tx->tx_callbacks); | |
1310 | list_destroy(&tx->tx_holds); | |
f85c06be GM |
1311 | kmem_free(tx, sizeof (dmu_tx_t)); |
1312 | } | |
1313 | ||
34dc7c2f BB |
1314 | void |
1315 | dmu_tx_commit(dmu_tx_t *tx) | |
1316 | { | |
34dc7c2f BB |
1317 | ASSERT(tx->tx_txg != 0); |
1318 | ||
e49f1e20 WA |
1319 | /* |
1320 | * Go through the transaction's hold list and remove holds on | |
1321 | * associated dnodes, notifying waiters if no holds remain. | |
1322 | */ | |
1c27024e | 1323 | for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; |
f85c06be | 1324 | txh = list_next(&tx->tx_holds, txh)) { |
34dc7c2f BB |
1325 | dnode_t *dn = txh->txh_dnode; |
1326 | ||
34dc7c2f BB |
1327 | if (dn == NULL) |
1328 | continue; | |
f85c06be | 1329 | |
34dc7c2f BB |
1330 | mutex_enter(&dn->dn_mtx); |
1331 | ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); | |
1332 | ||
424fd7c3 | 1333 | if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) { |
34dc7c2f BB |
1334 | dn->dn_assigned_txg = 0; |
1335 | cv_broadcast(&dn->dn_notxholds); | |
1336 | } | |
1337 | mutex_exit(&dn->dn_mtx); | |
34dc7c2f BB |
1338 | } |
1339 | ||
1340 | if (tx->tx_tempreserve_cookie) | |
1341 | dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); | |
1342 | ||
428870ff BB |
1343 | if (!list_is_empty(&tx->tx_callbacks)) |
1344 | txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); | |
1345 | ||
34dc7c2f BB |
1346 | if (tx->tx_anyobj == FALSE) |
1347 | txg_rele_to_sync(&tx->tx_txgh); | |
428870ff | 1348 | |
f85c06be | 1349 | dmu_tx_destroy(tx); |
34dc7c2f BB |
1350 | } |
1351 | ||
1352 | void | |
1353 | dmu_tx_abort(dmu_tx_t *tx) | |
1354 | { | |
34dc7c2f BB |
1355 | ASSERT(tx->tx_txg == 0); |
1356 | ||
428870ff BB |
1357 | /* |
1358 | * Call any registered callbacks with an error code. | |
1359 | */ | |
1360 | if (!list_is_empty(&tx->tx_callbacks)) | |
28caa74b | 1361 | dmu_tx_do_callbacks(&tx->tx_callbacks, SET_ERROR(ECANCELED)); |
428870ff | 1362 | |
f85c06be | 1363 | dmu_tx_destroy(tx); |
34dc7c2f BB |
1364 | } |
1365 | ||
1366 | uint64_t | |
1367 | dmu_tx_get_txg(dmu_tx_t *tx) | |
1368 | { | |
1369 | ASSERT(tx->tx_txg != 0); | |
1370 | return (tx->tx_txg); | |
1371 | } | |
428870ff | 1372 | |
13fe0198 MA |
1373 | dsl_pool_t * |
1374 | dmu_tx_pool(dmu_tx_t *tx) | |
1375 | { | |
1376 | ASSERT(tx->tx_pool != NULL); | |
1377 | return (tx->tx_pool); | |
1378 | } | |
1379 | ||
428870ff BB |
1380 | void |
1381 | dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) | |
1382 | { | |
1383 | dmu_tx_callback_t *dcb; | |
1384 | ||
79c76d5b | 1385 | dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); |
428870ff BB |
1386 | |
1387 | dcb->dcb_func = func; | |
1388 | dcb->dcb_data = data; | |
1389 | ||
1390 | list_insert_tail(&tx->tx_callbacks, dcb); | |
1391 | } | |
1392 | ||
1393 | /* | |
1394 | * Call all the commit callbacks on a list, with a given error code. | |
1395 | */ | |
1396 | void | |
1397 | dmu_tx_do_callbacks(list_t *cb_list, int error) | |
1398 | { | |
1399 | dmu_tx_callback_t *dcb; | |
1400 | ||
b3ad3f48 | 1401 | while ((dcb = list_remove_tail(cb_list)) != NULL) { |
428870ff BB |
1402 | dcb->dcb_func(dcb->dcb_data, error); |
1403 | kmem_free(dcb, sizeof (dmu_tx_callback_t)); | |
1404 | } | |
1405 | } | |
1406 | ||
1407 | /* | |
1408 | * dmu_tx_hold_sa_create() is the interface to hold a bunch of attributes; | |
1409 | * it is used for creating new files. | |
1410 | * attrsize is the total size of all attributes | |
1411 | * to be added during object creation. | |
1412 | * | |
1413 | * For updating/adding a single attribute dmu_tx_hold_sa() should be used. | |
1414 | */ | |
1415 | ||
1416 | /* | |
1417 | * hold necessary attribute name for attribute registration. | |
1418 | * should be a very rare case where this is needed. If it does | |
1419 | * happen it would only happen on the first write to the file system. | |
1420 | */ | |
1421 | static void | |
1422 | dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) | |
1423 | { | |
428870ff BB |
1424 | if (!sa->sa_need_attr_registration) |
1425 | return; | |
1426 | ||
3ec3bc21 | 1427 | for (int i = 0; i != sa->sa_num_attrs; i++) { |
428870ff BB |
1428 | if (!sa->sa_attr_table[i].sa_registered) { |
1429 | if (sa->sa_reg_attr_obj) | |
1430 | dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, | |
1431 | B_TRUE, sa->sa_attr_table[i].sa_name); | |
1432 | else | |
1433 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, | |
1434 | B_TRUE, sa->sa_attr_table[i].sa_name); | |
1435 | } | |
1436 | } | |
1437 | } | |
1438 | ||
428870ff BB |
1439 | void |
1440 | dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) | |
1441 | { | |
9631681b | 1442 | dmu_tx_hold_t *txh; |
428870ff | 1443 | |
9631681b BB |
1444 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, |
1445 | THT_SPILL, 0, 0); | |
1446 | if (txh != NULL) | |
424fd7c3 | 1447 | (void) zfs_refcount_add_many(&txh->txh_space_towrite, |
9631681b | 1448 | SPA_OLD_MAXBLOCKSIZE, FTAG); |
428870ff BB |
1449 | } |
1450 | ||
1451 | void | |
1452 | dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) | |
1453 | { | |
1454 | sa_os_t *sa = tx->tx_objset->os_sa; | |
1455 | ||
1456 | dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); | |
1457 | ||
1458 | if (tx->tx_objset->os_sa->sa_master_obj == 0) | |
1459 | return; | |
1460 | ||
3ec3bc21 | 1461 | if (tx->tx_objset->os_sa->sa_layout_attr_obj) { |
428870ff | 1462 | dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); |
3ec3bc21 | 1463 | } else { |
428870ff BB |
1464 | dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); |
1465 | dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); | |
1466 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); | |
1467 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); | |
1468 | } | |
1469 | ||
1470 | dmu_tx_sa_registration_hold(sa, tx); | |
1471 | ||
50c957f7 | 1472 | if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill) |
428870ff BB |
1473 | return; |
1474 | ||
1475 | (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, | |
1476 | THT_SPILL, 0, 0); | |
1477 | } | |
1478 | ||
1479 | /* | |
1480 | * Hold SA attribute | |
1481 | * | |
1482 | * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) | |
1483 | * | |
1484 | * variable_size is the total size of all variable sized attributes | |
1485 | * passed to this function. It is not the total size of all | |
1486 | * variable size attributes that *may* exist on this object. | |
1487 | */ | |
1488 | void | |
1489 | dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) | |
1490 | { | |
1491 | uint64_t object; | |
1492 | sa_os_t *sa = tx->tx_objset->os_sa; | |
1493 | ||
1494 | ASSERT(hdl != NULL); | |
1495 | ||
1496 | object = sa_handle_object(hdl); | |
1497 | ||
0eb8ba6a MA |
1498 | dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; |
1499 | DB_DNODE_ENTER(db); | |
1500 | dmu_tx_hold_bonus_by_dnode(tx, DB_DNODE(db)); | |
1501 | DB_DNODE_EXIT(db); | |
428870ff BB |
1502 | |
1503 | if (tx->tx_objset->os_sa->sa_master_obj == 0) | |
1504 | return; | |
1505 | ||
1506 | if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || | |
1507 | tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { | |
1508 | dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); | |
1509 | dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); | |
1510 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); | |
1511 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); | |
1512 | } | |
1513 | ||
1514 | dmu_tx_sa_registration_hold(sa, tx); | |
1515 | ||
1516 | if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) | |
1517 | dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); | |
1518 | ||
572e2857 | 1519 | if (sa->sa_force_spill || may_grow || hdl->sa_spill) { |
428870ff BB |
1520 | ASSERT(tx->tx_txg == 0); |
1521 | dmu_tx_hold_spill(tx, object); | |
572e2857 | 1522 | } else { |
572e2857 BB |
1523 | dnode_t *dn; |
1524 | ||
1525 | DB_DNODE_ENTER(db); | |
1526 | dn = DB_DNODE(db); | |
1527 | if (dn->dn_have_spill) { | |
1528 | ASSERT(tx->tx_txg == 0); | |
1529 | dmu_tx_hold_spill(tx, object); | |
1530 | } | |
1531 | DB_DNODE_EXIT(db); | |
428870ff BB |
1532 | } |
1533 | } | |
c28b2279 | 1534 | |
570827e1 BB |
1535 | void |
1536 | dmu_tx_init(void) | |
1537 | { | |
1538 | dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc", | |
1539 | KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t), | |
1540 | KSTAT_FLAG_VIRTUAL); | |
1541 | ||
1542 | if (dmu_tx_ksp != NULL) { | |
1543 | dmu_tx_ksp->ks_data = &dmu_tx_stats; | |
1544 | kstat_install(dmu_tx_ksp); | |
1545 | } | |
1546 | } | |
1547 | ||
1548 | void | |
1549 | dmu_tx_fini(void) | |
1550 | { | |
1551 | if (dmu_tx_ksp != NULL) { | |
1552 | kstat_delete(dmu_tx_ksp); | |
1553 | dmu_tx_ksp = NULL; | |
1554 | } | |
1555 | } | |
1556 | ||
/* dmu_tx entry points exported when building with _KERNEL defined. */
#if defined(_KERNEL)
EXPORT_SYMBOL(dmu_tx_create);
EXPORT_SYMBOL(dmu_tx_hold_write);
EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_append);
EXPORT_SYMBOL(dmu_tx_hold_append_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_free);
EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_zap);
EXPORT_SYMBOL(dmu_tx_hold_zap_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_bonus);
EXPORT_SYMBOL(dmu_tx_hold_bonus_by_dnode);
EXPORT_SYMBOL(dmu_tx_abort);
EXPORT_SYMBOL(dmu_tx_assign);
EXPORT_SYMBOL(dmu_tx_wait);
EXPORT_SYMBOL(dmu_tx_commit);
EXPORT_SYMBOL(dmu_tx_mark_netfree);
EXPORT_SYMBOL(dmu_tx_get_txg);
EXPORT_SYMBOL(dmu_tx_callback_register);
EXPORT_SYMBOL(dmu_tx_do_callbacks);
EXPORT_SYMBOL(dmu_tx_hold_spill);
EXPORT_SYMBOL(dmu_tx_hold_sa_create);
EXPORT_SYMBOL(dmu_tx_hold_sa);
#endif