]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
22cd4a46 | 23 | * Copyright 2011 Nexenta Systems, Inc. All rights reserved. |
4747a7d3 | 24 | * Copyright (c) 2012, 2017 by Delphix. All rights reserved. |
22cd4a46 | 25 | */ |
34dc7c2f | 26 | |
34dc7c2f BB |
27 | #include <sys/dmu.h> |
28 | #include <sys/dmu_impl.h> | |
29 | #include <sys/dbuf.h> | |
30 | #include <sys/dmu_tx.h> | |
31 | #include <sys/dmu_objset.h> | |
3ec3bc21 BB |
32 | #include <sys/dsl_dataset.h> |
33 | #include <sys/dsl_dir.h> | |
34dc7c2f | 34 | #include <sys/dsl_pool.h> |
3ec3bc21 | 35 | #include <sys/zap_impl.h> |
34dc7c2f | 36 | #include <sys/spa.h> |
428870ff BB |
37 | #include <sys/sa.h> |
38 | #include <sys/sa_impl.h> | |
34dc7c2f | 39 | #include <sys/zfs_context.h> |
428870ff | 40 | #include <sys/varargs.h> |
49ee64e5 | 41 | #include <sys/trace_dmu.h> |
34dc7c2f BB |
42 | |
43 | typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, | |
44 | uint64_t arg1, uint64_t arg2); | |
45 | ||
570827e1 BB |
46 | dmu_tx_stats_t dmu_tx_stats = { |
47 | { "dmu_tx_assigned", KSTAT_DATA_UINT64 }, | |
48 | { "dmu_tx_delay", KSTAT_DATA_UINT64 }, | |
49 | { "dmu_tx_error", KSTAT_DATA_UINT64 }, | |
50 | { "dmu_tx_suspended", KSTAT_DATA_UINT64 }, | |
51 | { "dmu_tx_group", KSTAT_DATA_UINT64 }, | |
570827e1 BB |
52 | { "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 }, |
53 | { "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 }, | |
570827e1 | 54 | { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, |
e8b96c60 MA |
55 | { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, |
56 | { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, | |
570827e1 BB |
57 | { "dmu_tx_quota", KSTAT_DATA_UINT64 }, |
58 | }; | |
59 | ||
60 | static kstat_t *dmu_tx_ksp; | |
34dc7c2f BB |
61 | |
62 | dmu_tx_t * | |
63 | dmu_tx_create_dd(dsl_dir_t *dd) | |
64 | { | |
79c76d5b | 65 | dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); |
34dc7c2f | 66 | tx->tx_dir = dd; |
6f1ffb06 | 67 | if (dd != NULL) |
34dc7c2f BB |
68 | tx->tx_pool = dd->dd_pool; |
69 | list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), | |
70 | offsetof(dmu_tx_hold_t, txh_node)); | |
428870ff BB |
71 | list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), |
72 | offsetof(dmu_tx_callback_t, dcb_node)); | |
e8b96c60 | 73 | tx->tx_start = gethrtime(); |
34dc7c2f BB |
74 | return (tx); |
75 | } | |
76 | ||
77 | dmu_tx_t * | |
78 | dmu_tx_create(objset_t *os) | |
79 | { | |
428870ff | 80 | dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); |
34dc7c2f | 81 | tx->tx_objset = os; |
34dc7c2f BB |
82 | return (tx); |
83 | } | |
84 | ||
85 | dmu_tx_t * | |
86 | dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) | |
87 | { | |
88 | dmu_tx_t *tx = dmu_tx_create_dd(NULL); | |
89 | ||
4747a7d3 | 90 | txg_verify(dp->dp_spa, txg); |
34dc7c2f BB |
91 | tx->tx_pool = dp; |
92 | tx->tx_txg = txg; | |
93 | tx->tx_anyobj = TRUE; | |
94 | ||
95 | return (tx); | |
96 | } | |
97 | ||
98 | int | |
99 | dmu_tx_is_syncing(dmu_tx_t *tx) | |
100 | { | |
101 | return (tx->tx_anyobj); | |
102 | } | |
103 | ||
104 | int | |
105 | dmu_tx_private_ok(dmu_tx_t *tx) | |
106 | { | |
107 | return (tx->tx_anyobj); | |
108 | } | |
109 | ||
110 | static dmu_tx_hold_t * | |
0eef1bde | 111 | dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type, |
112 | uint64_t arg1, uint64_t arg2) | |
34dc7c2f BB |
113 | { |
114 | dmu_tx_hold_t *txh; | |
34dc7c2f | 115 | |
0eef1bde | 116 | if (dn != NULL) { |
66eead53 | 117 | (void) refcount_add(&dn->dn_holds, tx); |
0eef1bde | 118 | if (tx->tx_txg != 0) { |
34dc7c2f BB |
119 | mutex_enter(&dn->dn_mtx); |
120 | /* | |
121 | * dn->dn_assigned_txg == tx->tx_txg doesn't pose a | |
122 | * problem, but there's no way for it to happen (for | |
123 | * now, at least). | |
124 | */ | |
125 | ASSERT(dn->dn_assigned_txg == 0); | |
126 | dn->dn_assigned_txg = tx->tx_txg; | |
127 | (void) refcount_add(&dn->dn_tx_holds, tx); | |
128 | mutex_exit(&dn->dn_mtx); | |
129 | } | |
130 | } | |
131 | ||
79c76d5b | 132 | txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); |
34dc7c2f BB |
133 | txh->txh_tx = tx; |
134 | txh->txh_dnode = dn; | |
f85c06be | 135 | refcount_create(&txh->txh_space_towrite); |
f85c06be | 136 | refcount_create(&txh->txh_memory_tohold); |
34dc7c2f BB |
137 | txh->txh_type = type; |
138 | txh->txh_arg1 = arg1; | |
139 | txh->txh_arg2 = arg2; | |
34dc7c2f BB |
140 | list_insert_tail(&tx->tx_holds, txh); |
141 | ||
142 | return (txh); | |
143 | } | |
144 | ||
0eef1bde | 145 | static dmu_tx_hold_t * |
146 | dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, | |
147 | enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) | |
148 | { | |
149 | dnode_t *dn = NULL; | |
150 | dmu_tx_hold_t *txh; | |
151 | int err; | |
152 | ||
153 | if (object != DMU_NEW_OBJECT) { | |
154 | err = dnode_hold(os, object, FTAG, &dn); | |
66eead53 | 155 | if (err != 0) { |
0eef1bde | 156 | tx->tx_err = err; |
157 | return (NULL); | |
158 | } | |
159 | } | |
160 | txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2); | |
161 | if (dn != NULL) | |
162 | dnode_rele(dn, FTAG); | |
163 | return (txh); | |
164 | } | |
165 | ||
34dc7c2f | 166 | void |
66eead53 | 167 | dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn) |
34dc7c2f BB |
168 | { |
169 | /* | |
170 | * If we're syncing, they can manipulate any object anyhow, and | |
171 | * the hold on the dnode_t can cause problems. | |
172 | */ | |
0eef1bde | 173 | if (!dmu_tx_is_syncing(tx)) |
174 | (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); | |
34dc7c2f BB |
175 | } |
176 | ||
3ec3bc21 BB |
177 | /* |
178 | * This function reads specified data from disk. The specified data will | |
179 | * be needed to perform the transaction -- i.e, it will be read after | |
180 | * we do dmu_tx_assign(). There are two reasons that we read the data now | |
181 | * (before dmu_tx_assign()): | |
182 | * | |
183 | * 1. Reading it now has potentially better performance. The transaction | |
184 | * has not yet been assigned, so the TXG is not held open, and also the | |
185 | * caller typically has less locks held when calling dmu_tx_hold_*() than | |
186 | * after the transaction has been assigned. This reduces the lock (and txg) | |
187 | * hold times, thus reducing lock contention. | |
188 | * | |
189 | * 2. It is easier for callers (primarily the ZPL) to handle i/o errors | |
190 | * that are detected before they start making changes to the DMU state | |
191 | * (i.e. now). Once the transaction has been assigned, and some DMU | |
192 | * state has been changed, it can be difficult to recover from an i/o | |
193 | * error (e.g. to undo the changes already made in memory at the DMU | |
194 | * layer). Typically code to do so does not exist in the caller -- it | |
195 | * assumes that the data has already been cached and thus i/o errors are | |
196 | * not possible. | |
197 | * | |
198 | * It has been observed that the i/o initiated here can be a performance | |
199 | * problem, and it appears to be optional, because we don't look at the | |
200 | * data which is read. However, removing this read would only serve to | |
201 | * move the work elsewhere (after the dmu_tx_assign()), where it may | |
202 | * have a greater impact on performance (in addition to the impact on | |
203 | * fault tolerance noted above). | |
204 | */ | |
34dc7c2f BB |
205 | static int |
206 | dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) | |
207 | { | |
208 | int err; | |
209 | dmu_buf_impl_t *db; | |
210 | ||
211 | rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
212 | db = dbuf_hold_level(dn, level, blkid, FTAG); | |
213 | rw_exit(&dn->dn_struct_rwlock); | |
214 | if (db == NULL) | |
2e528b49 | 215 | return (SET_ERROR(EIO)); |
34dc7c2f BB |
216 | err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); |
217 | dbuf_rele(db, FTAG); | |
218 | return (err); | |
219 | } | |
220 | ||
221 | /* ARGSUSED */ | |
222 | static void | |
223 | dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) | |
224 | { | |
225 | dnode_t *dn = txh->txh_dnode; | |
34dc7c2f BB |
226 | int err = 0; |
227 | ||
228 | if (len == 0) | |
229 | return; | |
230 | ||
3ec3bc21 | 231 | (void) refcount_add_many(&txh->txh_space_towrite, len, FTAG); |
34dc7c2f | 232 | |
3ec3bc21 BB |
233 | if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) |
234 | err = SET_ERROR(EFBIG); | |
34dc7c2f | 235 | |
3ec3bc21 BB |
236 | if (dn == NULL) |
237 | return; | |
34dc7c2f | 238 | |
3ec3bc21 BB |
239 | /* |
240 | * For i/o error checking, read the blocks that will be needed | |
241 | * to perform the write: the first and last level-0 blocks (if | |
242 | * they are not aligned, i.e. if they are partial-block writes), | |
243 | * and all the level-1 blocks. | |
244 | */ | |
245 | if (dn->dn_maxblkid == 0) { | |
246 | if (off < dn->dn_datablksz && | |
247 | (off > 0 || len < dn->dn_datablksz)) { | |
248 | err = dmu_tx_check_ioerr(NULL, dn, 0, 0); | |
249 | if (err != 0) { | |
250 | txh->txh_tx->tx_err = err; | |
34dc7c2f | 251 | } |
9babb374 | 252 | } |
3ec3bc21 BB |
253 | } else { |
254 | zio_t *zio = zio_root(dn->dn_objset->os_spa, | |
255 | NULL, NULL, ZIO_FLAG_CANFAIL); | |
9babb374 | 256 | |
3ec3bc21 BB |
257 | /* first level-0 block */ |
258 | uint64_t start = off >> dn->dn_datablkshift; | |
259 | if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { | |
260 | err = dmu_tx_check_ioerr(zio, dn, 0, start); | |
261 | if (err != 0) { | |
262 | txh->txh_tx->tx_err = err; | |
263 | } | |
428870ff | 264 | } |
428870ff | 265 | |
3ec3bc21 BB |
266 | /* last level-0 block */ |
267 | uint64_t end = (off + len - 1) >> dn->dn_datablkshift; | |
268 | if (end != start && end <= dn->dn_maxblkid && | |
269 | P2PHASE(off + len, dn->dn_datablksz)) { | |
270 | err = dmu_tx_check_ioerr(zio, dn, 0, end); | |
271 | if (err != 0) { | |
428870ff | 272 | txh->txh_tx->tx_err = err; |
9babb374 | 273 | } |
3ec3bc21 | 274 | } |
428870ff | 275 | |
3ec3bc21 BB |
276 | /* level-1 blocks */ |
277 | if (dn->dn_nlevels > 1) { | |
278 | int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; | |
279 | for (uint64_t i = (start >> shft) + 1; | |
280 | i < end >> shft; i++) { | |
281 | err = dmu_tx_check_ioerr(zio, dn, 1, i); | |
282 | if (err != 0) { | |
283 | txh->txh_tx->tx_err = err; | |
284 | } | |
9babb374 | 285 | } |
9babb374 | 286 | } |
34dc7c2f | 287 | |
3ec3bc21 BB |
288 | err = zio_wait(zio); |
289 | if (err != 0) { | |
290 | txh->txh_tx->tx_err = err; | |
9babb374 | 291 | } |
34dc7c2f | 292 | } |
34dc7c2f BB |
293 | } |
294 | ||
295 | static void | |
296 | dmu_tx_count_dnode(dmu_tx_hold_t *txh) | |
297 | { | |
3ec3bc21 | 298 | (void) refcount_add_many(&txh->txh_space_towrite, DNODE_MIN_SIZE, FTAG); |
34dc7c2f BB |
299 | } |
300 | ||
301 | void | |
302 | dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) | |
303 | { | |
304 | dmu_tx_hold_t *txh; | |
305 | ||
66eead53 MA |
306 | ASSERT0(tx->tx_txg); |
307 | ASSERT3U(len, <=, DMU_MAX_ACCESS); | |
34dc7c2f BB |
308 | ASSERT(len == 0 || UINT64_MAX - off >= len - 1); |
309 | ||
310 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
311 | object, THT_WRITE, off, len); | |
66eead53 MA |
312 | if (txh != NULL) { |
313 | dmu_tx_count_write(txh, off, len); | |
314 | dmu_tx_count_dnode(txh); | |
315 | } | |
34dc7c2f BB |
316 | } |
317 | ||
0eef1bde | 318 | void |
319 | dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) | |
320 | { | |
321 | dmu_tx_hold_t *txh; | |
322 | ||
66eead53 MA |
323 | ASSERT0(tx->tx_txg); |
324 | ASSERT3U(len, <=, DMU_MAX_ACCESS); | |
0eef1bde | 325 | ASSERT(len == 0 || UINT64_MAX - off >= len - 1); |
326 | ||
327 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len); | |
66eead53 MA |
328 | if (txh != NULL) { |
329 | dmu_tx_count_write(txh, off, len); | |
330 | dmu_tx_count_dnode(txh); | |
331 | } | |
0eef1bde | 332 | } |
333 | ||
19d55079 MA |
334 | /* |
335 | * This function marks the transaction as being a "net free". The end | |
336 | * result is that refquotas will be disabled for this transaction, and | |
337 | * this transaction will be able to use half of the pool space overhead | |
338 | * (see dsl_pool_adjustedsize()). Therefore this function should only | |
339 | * be called for transactions that we expect will not cause a net increase | |
340 | * in the amount of space used (but it's OK if that is occasionally not true). | |
341 | */ | |
342 | void | |
343 | dmu_tx_mark_netfree(dmu_tx_t *tx) | |
344 | { | |
3ec3bc21 | 345 | tx->tx_netfree = B_TRUE; |
19d55079 MA |
346 | } |
347 | ||
0eef1bde | 348 | static void |
349 | dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) | |
34dc7c2f | 350 | { |
3ec3bc21 BB |
351 | dmu_tx_t *tx = txh->txh_tx; |
352 | dnode_t *dn = txh->txh_dnode; | |
ea97f8ce | 353 | int err; |
34dc7c2f BB |
354 | |
355 | ASSERT(tx->tx_txg == 0); | |
356 | ||
e8b96c60 | 357 | dmu_tx_count_dnode(txh); |
34dc7c2f | 358 | |
3ec3bc21 | 359 | if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) |
34dc7c2f BB |
360 | return; |
361 | if (len == DMU_OBJECT_END) | |
3ec3bc21 | 362 | len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; |
34dc7c2f | 363 | |
ea97f8ce MA |
364 | dmu_tx_count_dnode(txh); |
365 | ||
34dc7c2f | 366 | /* |
ea97f8ce MA |
367 | * For i/o error checking, we read the first and last level-0 |
368 | * blocks if they are not aligned, and all the level-1 blocks. | |
369 | * | |
370 | * Note: dbuf_free_range() assumes that we have not instantiated | |
371 | * any level-0 dbufs that will be completely freed. Therefore we must | |
372 | * exercise care to not read or count the first and last blocks | |
373 | * if they are blocksize-aligned. | |
374 | */ | |
375 | if (dn->dn_datablkshift == 0) { | |
b663a23d | 376 | if (off != 0 || len < dn->dn_datablksz) |
92bc214c | 377 | dmu_tx_count_write(txh, 0, dn->dn_datablksz); |
ea97f8ce MA |
378 | } else { |
379 | /* first block will be modified if it is not aligned */ | |
380 | if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) | |
381 | dmu_tx_count_write(txh, off, 1); | |
382 | /* last block will be modified if it is not aligned */ | |
383 | if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) | |
3ec3bc21 | 384 | dmu_tx_count_write(txh, off + len, 1); |
ea97f8ce MA |
385 | } |
386 | ||
387 | /* | |
388 | * Check level-1 blocks. | |
34dc7c2f BB |
389 | */ |
390 | if (dn->dn_nlevels > 1) { | |
ea97f8ce | 391 | int shift = dn->dn_datablkshift + dn->dn_indblkshift - |
34dc7c2f | 392 | SPA_BLKPTRSHIFT; |
ea97f8ce MA |
393 | uint64_t start = off >> shift; |
394 | uint64_t end = (off + len) >> shift; | |
ea97f8ce | 395 | |
ea97f8ce | 396 | ASSERT(dn->dn_indblkshift != 0); |
34dc7c2f | 397 | |
2e7b7657 MA |
398 | /* |
399 | * dnode_reallocate() can result in an object with indirect | |
400 | * blocks having an odd data block size. In this case, | |
401 | * just check the single block. | |
402 | */ | |
403 | if (dn->dn_datablkshift == 0) | |
404 | start = end = 0; | |
405 | ||
3ec3bc21 | 406 | zio_t *zio = zio_root(tx->tx_pool->dp_spa, |
34dc7c2f | 407 | NULL, NULL, ZIO_FLAG_CANFAIL); |
1c27024e | 408 | for (uint64_t i = start; i <= end; i++) { |
34dc7c2f | 409 | uint64_t ibyte = i << shift; |
b128c09f | 410 | err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); |
34dc7c2f | 411 | i = ibyte >> shift; |
4bda3bd0 | 412 | if (err == ESRCH || i > end) |
34dc7c2f | 413 | break; |
3ec3bc21 | 414 | if (err != 0) { |
34dc7c2f | 415 | tx->tx_err = err; |
3ec3bc21 | 416 | (void) zio_wait(zio); |
34dc7c2f BB |
417 | return; |
418 | } | |
419 | ||
3ec3bc21 BB |
420 | (void) refcount_add_many(&txh->txh_memory_tohold, |
421 | 1 << dn->dn_indblkshift, FTAG); | |
422 | ||
34dc7c2f | 423 | err = dmu_tx_check_ioerr(zio, dn, 1, i); |
3ec3bc21 | 424 | if (err != 0) { |
34dc7c2f | 425 | tx->tx_err = err; |
3ec3bc21 | 426 | (void) zio_wait(zio); |
34dc7c2f BB |
427 | return; |
428 | } | |
429 | } | |
430 | err = zio_wait(zio); | |
3ec3bc21 | 431 | if (err != 0) { |
34dc7c2f BB |
432 | tx->tx_err = err; |
433 | return; | |
434 | } | |
435 | } | |
34dc7c2f BB |
436 | } |
437 | ||
438 | void | |
0eef1bde | 439 | dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) |
440 | { | |
441 | dmu_tx_hold_t *txh; | |
442 | ||
443 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
444 | object, THT_FREE, off, len); | |
66eead53 MA |
445 | if (txh != NULL) |
446 | (void) dmu_tx_hold_free_impl(txh, off, len); | |
0eef1bde | 447 | } |
448 | ||
449 | void | |
450 | dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) | |
34dc7c2f BB |
451 | { |
452 | dmu_tx_hold_t *txh; | |
0eef1bde | 453 | |
454 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); | |
66eead53 MA |
455 | if (txh != NULL) |
456 | (void) dmu_tx_hold_free_impl(txh, off, len); | |
0eef1bde | 457 | } |
458 | ||
459 | static void | |
9522bd24 | 460 | dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) |
0eef1bde | 461 | { |
462 | dmu_tx_t *tx = txh->txh_tx; | |
3ec3bc21 | 463 | dnode_t *dn = txh->txh_dnode; |
f85c06be | 464 | int err; |
34dc7c2f BB |
465 | |
466 | ASSERT(tx->tx_txg == 0); | |
467 | ||
34dc7c2f BB |
468 | dmu_tx_count_dnode(txh); |
469 | ||
3ec3bc21 BB |
470 | /* |
471 | * Modifying a almost-full microzap is around the worst case (128KB) | |
472 | * | |
473 | * If it is a fat zap, the worst case would be 7*16KB=112KB: | |
474 | * - 3 blocks overwritten: target leaf, ptrtbl block, header block | |
475 | * - 4 new blocks written if adding: | |
476 | * - 2 blocks for possibly split leaves, | |
477 | * - 2 grown ptrtbl blocks | |
478 | */ | |
479 | (void) refcount_add_many(&txh->txh_space_towrite, | |
480 | MZAP_MAX_BLKSZ, FTAG); | |
481 | ||
482 | if (dn == NULL) | |
34dc7c2f | 483 | return; |
34dc7c2f | 484 | |
9ae529ec | 485 | ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); |
34dc7c2f | 486 | |
3ec3bc21 | 487 | if (dn->dn_maxblkid == 0 || name == NULL) { |
34dc7c2f | 488 | /* |
3ec3bc21 BB |
489 | * This is a microzap (only one block), or we don't know |
490 | * the name. Check the first block for i/o errors. | |
34dc7c2f BB |
491 | */ |
492 | err = dmu_tx_check_ioerr(NULL, dn, 0, 0); | |
3ec3bc21 | 493 | if (err != 0) { |
34dc7c2f | 494 | tx->tx_err = err; |
f85c06be | 495 | } |
3ec3bc21 | 496 | } else { |
34dc7c2f | 497 | /* |
3ec3bc21 BB |
498 | * Access the name so that we'll check for i/o errors to |
499 | * the leaf blocks, etc. We ignore ENOENT, as this name | |
500 | * may not yet exist. | |
34dc7c2f | 501 | */ |
2bce8049 | 502 | err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); |
3ec3bc21 | 503 | if (err == EIO || err == ECKSUM || err == ENXIO) { |
34dc7c2f | 504 | tx->tx_err = err; |
f85c06be GM |
505 | } |
506 | } | |
34dc7c2f BB |
507 | } |
508 | ||
0eef1bde | 509 | void |
510 | dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) | |
511 | { | |
512 | dmu_tx_hold_t *txh; | |
513 | ||
66eead53 | 514 | ASSERT0(tx->tx_txg); |
0eef1bde | 515 | |
516 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
517 | object, THT_ZAP, add, (uintptr_t)name); | |
66eead53 | 518 | if (txh != NULL) |
9522bd24 | 519 | dmu_tx_hold_zap_impl(txh, name); |
0eef1bde | 520 | } |
521 | ||
522 | void | |
523 | dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name) | |
524 | { | |
525 | dmu_tx_hold_t *txh; | |
526 | ||
66eead53 | 527 | ASSERT0(tx->tx_txg); |
0eef1bde | 528 | ASSERT(dn != NULL); |
529 | ||
530 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name); | |
66eead53 | 531 | if (txh != NULL) |
9522bd24 | 532 | dmu_tx_hold_zap_impl(txh, name); |
0eef1bde | 533 | } |
534 | ||
34dc7c2f BB |
535 | void |
536 | dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) | |
537 | { | |
538 | dmu_tx_hold_t *txh; | |
539 | ||
540 | ASSERT(tx->tx_txg == 0); | |
541 | ||
542 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
543 | object, THT_BONUS, 0, 0); | |
544 | if (txh) | |
545 | dmu_tx_count_dnode(txh); | |
546 | } | |
547 | ||
0eef1bde | 548 | void |
549 | dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn) | |
550 | { | |
551 | dmu_tx_hold_t *txh; | |
552 | ||
66eead53 | 553 | ASSERT0(tx->tx_txg); |
0eef1bde | 554 | |
555 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0); | |
556 | if (txh) | |
557 | dmu_tx_count_dnode(txh); | |
558 | } | |
559 | ||
34dc7c2f BB |
560 | void |
561 | dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) | |
562 | { | |
563 | dmu_tx_hold_t *txh; | |
7d637211 | 564 | |
34dc7c2f BB |
565 | ASSERT(tx->tx_txg == 0); |
566 | ||
567 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
568 | DMU_NEW_OBJECT, THT_SPACE, space, 0); | |
7d637211 | 569 | if (txh) |
f85c06be | 570 | (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); |
34dc7c2f BB |
571 | } |
572 | ||
3ec3bc21 | 573 | #ifdef ZFS_DEBUG |
34dc7c2f BB |
574 | void |
575 | dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) | |
576 | { | |
3ec3bc21 BB |
577 | boolean_t match_object = B_FALSE; |
578 | boolean_t match_offset = B_FALSE; | |
34dc7c2f | 579 | |
572e2857 | 580 | DB_DNODE_ENTER(db); |
3ec3bc21 | 581 | dnode_t *dn = DB_DNODE(db); |
34dc7c2f | 582 | ASSERT(tx->tx_txg != 0); |
428870ff | 583 | ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); |
34dc7c2f BB |
584 | ASSERT3U(dn->dn_object, ==, db->db.db_object); |
585 | ||
572e2857 BB |
586 | if (tx->tx_anyobj) { |
587 | DB_DNODE_EXIT(db); | |
34dc7c2f | 588 | return; |
572e2857 | 589 | } |
34dc7c2f BB |
590 | |
591 | /* XXX No checking on the meta dnode for now */ | |
572e2857 BB |
592 | if (db->db.db_object == DMU_META_DNODE_OBJECT) { |
593 | DB_DNODE_EXIT(db); | |
34dc7c2f | 594 | return; |
572e2857 | 595 | } |
34dc7c2f | 596 | |
3ec3bc21 | 597 | for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; |
34dc7c2f | 598 | txh = list_next(&tx->tx_holds, txh)) { |
99ea23c5 | 599 | ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); |
34dc7c2f BB |
600 | if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) |
601 | match_object = TRUE; | |
602 | if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { | |
603 | int datablkshift = dn->dn_datablkshift ? | |
604 | dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; | |
605 | int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; | |
606 | int shift = datablkshift + epbs * db->db_level; | |
607 | uint64_t beginblk = shift >= 64 ? 0 : | |
608 | (txh->txh_arg1 >> shift); | |
609 | uint64_t endblk = shift >= 64 ? 0 : | |
610 | ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); | |
611 | uint64_t blkid = db->db_blkid; | |
612 | ||
613 | /* XXX txh_arg2 better not be zero... */ | |
614 | ||
615 | dprintf("found txh type %x beginblk=%llx endblk=%llx\n", | |
616 | txh->txh_type, beginblk, endblk); | |
617 | ||
618 | switch (txh->txh_type) { | |
619 | case THT_WRITE: | |
620 | if (blkid >= beginblk && blkid <= endblk) | |
621 | match_offset = TRUE; | |
622 | /* | |
623 | * We will let this hold work for the bonus | |
428870ff BB |
624 | * or spill buffer so that we don't need to |
625 | * hold it when creating a new object. | |
34dc7c2f | 626 | */ |
428870ff BB |
627 | if (blkid == DMU_BONUS_BLKID || |
628 | blkid == DMU_SPILL_BLKID) | |
34dc7c2f BB |
629 | match_offset = TRUE; |
630 | /* | |
631 | * They might have to increase nlevels, | |
632 | * thus dirtying the new TLIBs. Or the | |
633 | * might have to change the block size, | |
634 | * thus dirying the new lvl=0 blk=0. | |
635 | */ | |
636 | if (blkid == 0) | |
637 | match_offset = TRUE; | |
638 | break; | |
639 | case THT_FREE: | |
b128c09f BB |
640 | /* |
641 | * We will dirty all the level 1 blocks in | |
642 | * the free range and perhaps the first and | |
643 | * last level 0 block. | |
644 | */ | |
645 | if (blkid >= beginblk && (blkid <= endblk || | |
646 | txh->txh_arg2 == DMU_OBJECT_END)) | |
34dc7c2f BB |
647 | match_offset = TRUE; |
648 | break; | |
428870ff BB |
649 | case THT_SPILL: |
650 | if (blkid == DMU_SPILL_BLKID) | |
651 | match_offset = TRUE; | |
652 | break; | |
34dc7c2f | 653 | case THT_BONUS: |
428870ff | 654 | if (blkid == DMU_BONUS_BLKID) |
34dc7c2f BB |
655 | match_offset = TRUE; |
656 | break; | |
657 | case THT_ZAP: | |
658 | match_offset = TRUE; | |
659 | break; | |
660 | case THT_NEWOBJECT: | |
661 | match_object = TRUE; | |
662 | break; | |
663 | default: | |
989fd514 BB |
664 | cmn_err(CE_PANIC, "bad txh_type %d", |
665 | txh->txh_type); | |
34dc7c2f BB |
666 | } |
667 | } | |
572e2857 BB |
668 | if (match_object && match_offset) { |
669 | DB_DNODE_EXIT(db); | |
34dc7c2f | 670 | return; |
572e2857 | 671 | } |
34dc7c2f | 672 | } |
572e2857 | 673 | DB_DNODE_EXIT(db); |
34dc7c2f BB |
674 | panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", |
675 | (u_longlong_t)db->db.db_object, db->db_level, | |
676 | (u_longlong_t)db->db_blkid); | |
677 | } | |
678 | #endif | |
679 | ||
e8b96c60 MA |
680 | /* |
681 | * If we can't do 10 iops, something is wrong. Let us go ahead | |
682 | * and hit zfs_dirty_data_max. | |
683 | */ | |
684 | hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ | |
685 | int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ | |
686 | ||
687 | /* | |
688 | * We delay transactions when we've determined that the backend storage | |
689 | * isn't able to accommodate the rate of incoming writes. | |
690 | * | |
691 | * If there is already a transaction waiting, we delay relative to when | |
692 | * that transaction finishes waiting. This way the calculated min_time | |
693 | * is independent of the number of threads concurrently executing | |
694 | * transactions. | |
695 | * | |
696 | * If we are the only waiter, wait relative to when the transaction | |
697 | * started, rather than the current time. This credits the transaction for | |
698 | * "time already served", e.g. reading indirect blocks. | |
699 | * | |
700 | * The minimum time for a transaction to take is calculated as: | |
701 | * min_time = scale * (dirty - min) / (max - dirty) | |
702 | * min_time is then capped at zfs_delay_max_ns. | |
703 | * | |
704 | * The delay has two degrees of freedom that can be adjusted via tunables. | |
705 | * The percentage of dirty data at which we start to delay is defined by | |
706 | * zfs_delay_min_dirty_percent. This should typically be at or above | |
707 | * zfs_vdev_async_write_active_max_dirty_percent so that we only start to | |
708 | * delay after writing at full speed has failed to keep up with the incoming | |
709 | * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly | |
710 | * speaking, this variable determines the amount of delay at the midpoint of | |
711 | * the curve. | |
712 | * | |
713 | * delay | |
714 | * 10ms +-------------------------------------------------------------*+ | |
715 | * | *| | |
716 | * 9ms + *+ | |
717 | * | *| | |
718 | * 8ms + *+ | |
719 | * | * | | |
720 | * 7ms + * + | |
721 | * | * | | |
722 | * 6ms + * + | |
723 | * | * | | |
724 | * 5ms + * + | |
725 | * | * | | |
726 | * 4ms + * + | |
727 | * | * | | |
728 | * 3ms + * + | |
729 | * | * | | |
730 | * 2ms + (midpoint) * + | |
731 | * | | ** | | |
732 | * 1ms + v *** + | |
733 | * | zfs_delay_scale ----------> ******** | | |
734 | * 0 +-------------------------------------*********----------------+ | |
735 | * 0% <- zfs_dirty_data_max -> 100% | |
736 | * | |
737 | * Note that since the delay is added to the outstanding time remaining on the | |
738 | * most recent transaction, the delay is effectively the inverse of IOPS. | |
739 | * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve | |
740 | * was chosen such that small changes in the amount of accumulated dirty data | |
741 | * in the first 3/4 of the curve yield relatively small differences in the | |
742 | * amount of delay. | |
743 | * | |
744 | * The effects can be easier to understand when the amount of delay is | |
745 | * represented on a log scale: | |
746 | * | |
747 | * delay | |
748 | * 100ms +-------------------------------------------------------------++ | |
749 | * + + | |
750 | * | | | |
751 | * + *+ | |
752 | * 10ms + *+ | |
753 | * + ** + | |
754 | * | (midpoint) ** | | |
755 | * + | ** + | |
756 | * 1ms + v **** + | |
757 | * + zfs_delay_scale ----------> ***** + | |
758 | * | **** | | |
759 | * + **** + | |
760 | * 100us + ** + | |
761 | * + * + | |
762 | * | * | | |
763 | * + * + | |
764 | * 10us + * + | |
765 | * + + | |
766 | * | | | |
767 | * + + | |
768 | * +--------------------------------------------------------------+ | |
769 | * 0% <- zfs_dirty_data_max -> 100% | |
770 | * | |
771 | * Note here that only as the amount of dirty data approaches its limit does | |
772 | * the delay start to increase rapidly. The goal of a properly tuned system | |
773 | * should be to keep the amount of dirty data out of that range by first | |
774 | * ensuring that the appropriate limits are set for the I/O scheduler to reach | |
775 | * optimal throughput on the backend storage, and then by changing the value | |
776 | * of zfs_delay_scale to increase the steepness of the curve. | |
777 | */ | |
778 | static void | |
779 | dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) | |
780 | { | |
781 | dsl_pool_t *dp = tx->tx_pool; | |
782 | uint64_t delay_min_bytes = | |
783 | zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; | |
784 | hrtime_t wakeup, min_tx_time, now; | |
785 | ||
786 | if (dirty <= delay_min_bytes) | |
787 | return; | |
788 | ||
789 | /* | |
790 | * The caller has already waited until we are under the max. | |
791 | * We make them pass us the amount of dirty data so we don't | |
792 | * have to handle the case of it being >= the max, which could | |
793 | * cause a divide-by-zero if it's == the max. | |
794 | */ | |
795 | ASSERT3U(dirty, <, zfs_dirty_data_max); | |
796 | ||
797 | now = gethrtime(); | |
798 | min_tx_time = zfs_delay_scale * | |
799 | (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); | |
800 | min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); | |
801 | if (now > tx->tx_start + min_tx_time) | |
802 | return; | |
803 | ||
804 | DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, | |
805 | uint64_t, min_tx_time); | |
806 | ||
807 | mutex_enter(&dp->dp_lock); | |
808 | wakeup = MAX(tx->tx_start + min_tx_time, | |
809 | dp->dp_last_wakeup + min_tx_time); | |
810 | dp->dp_last_wakeup = wakeup; | |
811 | mutex_exit(&dp->dp_lock); | |
812 | ||
813 | zfs_sleep_until(wakeup); | |
814 | } | |
815 | ||
3ec3bc21 BB |
816 | /* |
817 | * This routine attempts to assign the transaction to a transaction group. | |
818 | * To do so, we must determine if there is sufficient free space on disk. | |
819 | * | |
820 | * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() | |
821 | * on it), then it is assumed that there is sufficient free space, | |
822 | * unless there's insufficient slop space in the pool (see the comment | |
823 | * above spa_slop_shift in spa_misc.c). | |
824 | * | |
825 | * If it is not a "netfree" transaction, then if the data already on disk | |
826 | * is over the allowed usage (e.g. quota), this will fail with EDQUOT or | |
827 | * ENOSPC. Otherwise, if the current rough estimate of pending changes, | |
828 | * plus the rough estimate of this transaction's changes, may exceed the | |
829 | * allowed usage, then this will fail with ERESTART, which will cause the | |
830 | * caller to wait for the pending changes to be written to disk (by waiting | |
831 | * for the next TXG to open), and then check the space usage again. | |
832 | * | |
833 | * The rough estimate of pending changes is comprised of the sum of: | |
834 | * | |
835 | * - this transaction's holds' txh_space_towrite | |
836 | * | |
837 | * - dd_tempreserved[], which is the sum of in-flight transactions' | |
838 | * holds' txh_space_towrite (i.e. those transactions that have called | |
839 | * dmu_tx_assign() but not yet called dmu_tx_commit()). | |
840 | * | |
841 | * - dd_space_towrite[], which is the amount of dirtied dbufs. | |
842 | * | |
843 | * Note that all of these values are inflated by spa_get_worst_case_asize(), | |
844 | * which means that we may get ERESTART well before we are actually in danger | |
845 | * of running out of space, but this also mitigates any small inaccuracies | |
846 | * in the rough estimate (e.g. txh_space_towrite doesn't take into account | |
847 | * indirect blocks, and dd_space_towrite[] doesn't take into account changes | |
848 | * to the MOS). | |
849 | * | |
850 | * Note that due to this algorithm, it is possible to exceed the allowed | |
851 | * usage by one transaction. Also, as we approach the allowed usage, | |
852 | * we will allow a very limited amount of changes into each TXG, thus | |
853 | * decreasing performance. | |
854 | */ | |
static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	spa_t *spa = tx->tx_pool->dp_spa;

	ASSERT0(tx->tx_txg);

	/* A hold recorded an error earlier; surface it to the caller now. */
	if (tx->tx_err) {
		DMU_TX_STAT_BUMP(dmu_tx_error);
		return (tx->tx_err);
	}

	if (spa_suspended(spa)) {
		DMU_TX_STAT_BUMP(dmu_tx_suspended);

		/*
		 * If the user has indicated a blocking failure mode
		 * then return ERESTART which will block in dmu_tx_wait().
		 * Otherwise, return EIO so that an error can get
		 * propagated back to the VOP calls.
		 *
		 * Note that we always honor the txg_how flag regardless
		 * of the failuremode setting.
		 */
		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
		    !(txg_how & TXG_WAIT))
			return (SET_ERROR(EIO));

		return (SET_ERROR(ERESTART));
	}

	/*
	 * Apply the write throttle (unless this tx was exempted via
	 * TXG_NOTHROTTLE); dmu_tx_wait() will perform the actual delay.
	 */
	if (!tx->tx_dirty_delayed &&
	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
		tx->tx_wait_dirty = B_TRUE;
		DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
		return (SET_ERROR(ERESTART));
	}

	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
	tx->tx_needassign_txh = NULL;

	/*
	 * NB: No error returns are allowed after txg_hold_open, but
	 * before processing the dnode holds, due to the
	 * dmu_tx_unassign() logic.
	 */

	uint64_t towrite = 0;
	uint64_t tohold = 0;
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;
		if (dn != NULL) {
			mutex_enter(&dn->dn_mtx);
			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
				/*
				 * The dnode is still assigned to the
				 * quiescing txg; record it so dmu_tx_wait()
				 * can wait for that assignment to clear.
				 */
				mutex_exit(&dn->dn_mtx);
				tx->tx_needassign_txh = txh;
				DMU_TX_STAT_BUMP(dmu_tx_group);
				return (SET_ERROR(ERESTART));
			}
			if (dn->dn_assigned_txg == 0)
				dn->dn_assigned_txg = tx->tx_txg;
			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
		towrite += refcount_count(&txh->txh_space_towrite);
		tohold += refcount_count(&txh->txh_memory_tohold);
	}

	/* needed allocation: worst-case estimate of write space */
	uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
	/* calculate memory footprint estimate */
	uint64_t memory = towrite + tohold;

	/* Reserve space against the dataset's quota for the tx lifetime. */
	if (tx->tx_dir != NULL && asize != 0) {
		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
		    asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
		if (err != 0)
			return (err);
	}

	DMU_TX_STAT_BUMP(dmu_tx_assigned);

	return (0);
}
941 | ||
/*
 * Undo a failed dmu_tx_try_assign(): release the txg hold and drop the
 * dnode holds taken so far.  Records tx_lasttried_txg so dmu_tx_wait()
 * knows which txg the attempt was made against.
 */
static void
dmu_tx_unassign(dmu_tx_t *tx)
{
	if (tx->tx_txg == 0)
		return;

	txg_rele_to_quiesce(&tx->tx_txgh);

	/*
	 * Walk the transaction's hold list, removing the hold on the
	 * associated dnode, and notifying waiters if the refcount drops to 0.
	 * The walk stops at tx_needassign_txh: holds past that point were
	 * never taken by dmu_tx_try_assign().
	 */
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
	    txh && txh != tx->tx_needassign_txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	txg_rele_to_sync(&tx->tx_txgh);

	tx->tx_lasttried_txg = tx->tx_txg;
	tx->tx_txg = 0;
}
976 | ||
977 | /* | |
0735ecb3 | 978 | * Assign tx to a transaction group; txg_how is a bitmask: |
34dc7c2f | 979 | * |
0735ecb3 PS |
980 | * If TXG_WAIT is set and the currently open txg is full, this function |
981 | * will wait until there's a new txg. This should be used when no locks | |
982 | * are being held. With this bit set, this function will only fail if | |
983 | * we're truly out of space (or over quota). | |
34dc7c2f | 984 | * |
0735ecb3 PS |
985 | * If TXG_WAIT is *not* set and we can't assign into the currently open |
986 | * txg without blocking, this function will return immediately with | |
987 | * ERESTART. This should be used whenever locks are being held. On an | |
988 | * ERESTART error, the caller should drop all locks, call dmu_tx_wait(), | |
989 | * and try again. | |
e8b96c60 | 990 | * |
0735ecb3 PS |
991 | * If TXG_NOTHROTTLE is set, this indicates that this tx should not be |
992 | * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for | |
993 | * details on the throttle). This is used by the VFS operations, after | |
994 | * they have already called dmu_tx_wait() (though most likely on a | |
995 | * different tx). | |
34dc7c2f BB |
996 | */ |
997 | int | |
0735ecb3 | 998 | dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) |
34dc7c2f BB |
999 | { |
1000 | int err; | |
1001 | ||
1002 | ASSERT(tx->tx_txg == 0); | |
0735ecb3 | 1003 | ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE)); |
34dc7c2f BB |
1004 | ASSERT(!dsl_pool_sync_context(tx->tx_pool)); |
1005 | ||
13fe0198 | 1006 | /* If we might wait, we must not hold the config lock. */ |
0735ecb3 PS |
1007 | IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool)); |
1008 | ||
1009 | if ((txg_how & TXG_NOTHROTTLE)) | |
1010 | tx->tx_dirty_delayed = B_TRUE; | |
13fe0198 | 1011 | |
34dc7c2f BB |
1012 | while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { |
1013 | dmu_tx_unassign(tx); | |
1014 | ||
0735ecb3 | 1015 | if (err != ERESTART || !(txg_how & TXG_WAIT)) |
34dc7c2f BB |
1016 | return (err); |
1017 | ||
1018 | dmu_tx_wait(tx); | |
1019 | } | |
1020 | ||
1021 | txg_rele_to_quiesce(&tx->tx_txgh); | |
1022 | ||
1023 | return (0); | |
1024 | } | |
1025 | ||
/*
 * Block until the condition that caused the last assignment failure has
 * cleared: dirty-data throttle, suspended pool, a dnode still assigned to
 * the quiescing txg, or a full open txg.  Must be called with the tx
 * unassigned and without the pool config lock held.
 */
void
dmu_tx_wait(dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;
	dsl_pool_t *dp = tx->tx_pool;
	hrtime_t before;

	ASSERT(tx->tx_txg == 0);
	ASSERT(!dsl_pool_config_held(tx->tx_pool));

	before = gethrtime();

	if (tx->tx_wait_dirty) {
		uint64_t dirty;

		/*
		 * dmu_tx_try_assign() has determined that we need to wait
		 * because we've consumed much or all of the dirty buffer
		 * space.
		 */
		mutex_enter(&dp->dp_lock);
		if (dp->dp_dirty_total >= zfs_dirty_data_max)
			DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
		while (dp->dp_dirty_total >= zfs_dirty_data_max)
			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
		dirty = dp->dp_dirty_total;
		mutex_exit(&dp->dp_lock);

		/* Sleep in proportion to how close to the max we are. */
		dmu_tx_delay(tx, dirty);

		tx->tx_wait_dirty = B_FALSE;

		/*
		 * Note: setting tx_dirty_delayed only has effect if the
		 * caller used TX_WAIT.  Otherwise they are going to
		 * destroy this tx and try again.  The common case,
		 * zfs_write(), uses TX_WAIT.
		 */
		tx->tx_dirty_delayed = B_TRUE;
	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
		/*
		 * If the pool is suspended we need to wait until it
		 * is resumed.  Note that it's possible that the pool
		 * has become active after this thread has tried to
		 * obtain a tx.  If that's the case then tx_lasttried_txg
		 * would not have been set.
		 */
		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
	} else if (tx->tx_needassign_txh) {
		/* Wait for the conflicting dnode assignment to clear. */
		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;

		mutex_enter(&dn->dn_mtx);
		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
		mutex_exit(&dn->dn_mtx);
		tx->tx_needassign_txh = NULL;
	} else {
		/*
		 * A dnode is assigned to the quiescing txg.  Wait for its
		 * transaction to complete.
		 */
		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
	}

	/* Account the time spent waiting in the pool's kstats. */
	spa_tx_assign_add_nsecs(spa, gethrtime() - before);
}
1092 | ||
f85c06be GM |
1093 | static void |
1094 | dmu_tx_destroy(dmu_tx_t *tx) | |
1095 | { | |
1096 | dmu_tx_hold_t *txh; | |
1097 | ||
1098 | while ((txh = list_head(&tx->tx_holds)) != NULL) { | |
1099 | dnode_t *dn = txh->txh_dnode; | |
1100 | ||
1101 | list_remove(&tx->tx_holds, txh); | |
1102 | refcount_destroy_many(&txh->txh_space_towrite, | |
1103 | refcount_count(&txh->txh_space_towrite)); | |
f85c06be GM |
1104 | refcount_destroy_many(&txh->txh_memory_tohold, |
1105 | refcount_count(&txh->txh_memory_tohold)); | |
f85c06be GM |
1106 | kmem_free(txh, sizeof (dmu_tx_hold_t)); |
1107 | if (dn != NULL) | |
1108 | dnode_rele(dn, tx); | |
1109 | } | |
1110 | ||
1111 | list_destroy(&tx->tx_callbacks); | |
1112 | list_destroy(&tx->tx_holds); | |
f85c06be GM |
1113 | kmem_free(tx, sizeof (dmu_tx_t)); |
1114 | } | |
1115 | ||
34dc7c2f BB |
/*
 * Commit an assigned transaction: drop dnode holds, clear the temporary
 * space reservation, hand commit callbacks to the txg machinery, release
 * the txg, and free the tx.  The tx must not be used afterwards.
 */
void
dmu_tx_commit(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);

	/*
	 * Go through the transaction's hold list and remove holds on
	 * associated dnodes, notifying waiters if no holds remain.
	 */
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;

		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	if (tx->tx_tempreserve_cookie)
		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);

	/*
	 * Commit callbacks are handed to the txg and run when it syncs;
	 * this must happen before the txg hold is released below.
	 */
	if (!list_is_empty(&tx->tx_callbacks))
		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);

	if (tx->tx_anyobj == FALSE)
		txg_rele_to_sync(&tx->tx_txgh);

	dmu_tx_destroy(tx);
}
1153 | ||
1154 | void | |
1155 | dmu_tx_abort(dmu_tx_t *tx) | |
1156 | { | |
34dc7c2f BB |
1157 | ASSERT(tx->tx_txg == 0); |
1158 | ||
428870ff BB |
1159 | /* |
1160 | * Call any registered callbacks with an error code. | |
1161 | */ | |
1162 | if (!list_is_empty(&tx->tx_callbacks)) | |
1163 | dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); | |
1164 | ||
f85c06be | 1165 | dmu_tx_destroy(tx); |
34dc7c2f BB |
1166 | } |
1167 | ||
1168 | uint64_t | |
1169 | dmu_tx_get_txg(dmu_tx_t *tx) | |
1170 | { | |
1171 | ASSERT(tx->tx_txg != 0); | |
1172 | return (tx->tx_txg); | |
1173 | } | |
428870ff | 1174 | |
13fe0198 MA |
1175 | dsl_pool_t * |
1176 | dmu_tx_pool(dmu_tx_t *tx) | |
1177 | { | |
1178 | ASSERT(tx->tx_pool != NULL); | |
1179 | return (tx->tx_pool); | |
1180 | } | |
1181 | ||
428870ff BB |
1182 | void |
1183 | dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) | |
1184 | { | |
1185 | dmu_tx_callback_t *dcb; | |
1186 | ||
79c76d5b | 1187 | dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); |
428870ff BB |
1188 | |
1189 | dcb->dcb_func = func; | |
1190 | dcb->dcb_data = data; | |
1191 | ||
1192 | list_insert_tail(&tx->tx_callbacks, dcb); | |
1193 | } | |
1194 | ||
1195 | /* | |
1196 | * Call all the commit callbacks on a list, with a given error code. | |
1197 | */ | |
1198 | void | |
1199 | dmu_tx_do_callbacks(list_t *cb_list, int error) | |
1200 | { | |
1201 | dmu_tx_callback_t *dcb; | |
1202 | ||
823d48bf | 1203 | while ((dcb = list_tail(cb_list)) != NULL) { |
428870ff BB |
1204 | list_remove(cb_list, dcb); |
1205 | dcb->dcb_func(dcb->dcb_data, error); | |
1206 | kmem_free(dcb, sizeof (dmu_tx_callback_t)); | |
1207 | } | |
1208 | } | |
1209 | ||
1210 | /* | |
1211 | * Interface to hold a bunch of attributes. | |
1212 | * used for creating new files. | |
1213 | * attrsize is the total size of all attributes | |
1214 | * to be added during object creation | |
1215 | * | |
1216 | * For updating/adding a single attribute dmu_tx_hold_sa() should be used. | |
1217 | */ | |
1218 | ||
1219 | /* | |
1220 | * hold necessary attribute name for attribute registration. | |
1221 | * should be a very rare case where this is needed. If it does | |
1222 | * happen it would only happen on the first write to the file system. | |
1223 | */ | |
1224 | static void | |
1225 | dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) | |
1226 | { | |
428870ff BB |
1227 | if (!sa->sa_need_attr_registration) |
1228 | return; | |
1229 | ||
3ec3bc21 | 1230 | for (int i = 0; i != sa->sa_num_attrs; i++) { |
428870ff BB |
1231 | if (!sa->sa_attr_table[i].sa_registered) { |
1232 | if (sa->sa_reg_attr_obj) | |
1233 | dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, | |
1234 | B_TRUE, sa->sa_attr_table[i].sa_name); | |
1235 | else | |
1236 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, | |
1237 | B_TRUE, sa->sa_attr_table[i].sa_name); | |
1238 | } | |
1239 | } | |
1240 | } | |
1241 | ||
428870ff BB |
1242 | void |
1243 | dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) | |
1244 | { | |
9631681b | 1245 | dmu_tx_hold_t *txh; |
428870ff | 1246 | |
9631681b BB |
1247 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, |
1248 | THT_SPILL, 0, 0); | |
1249 | if (txh != NULL) | |
1250 | (void) refcount_add_many(&txh->txh_space_towrite, | |
1251 | SPA_OLD_MAXBLOCKSIZE, FTAG); | |
428870ff BB |
1252 | } |
1253 | ||
/*
 * Hold everything needed to create an object with 'attrsize' bytes of
 * system attributes: the new bonus buffer, the SA layout/registry ZAPs,
 * any pending attribute registrations, and - if the attributes won't fit
 * in the bonus buffer - a spill block.
 */
void
dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
{
	sa_os_t *sa = tx->tx_objset->os_sa;

	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);

	/* No SA master object means SA is not set up on this objset. */
	if (tx->tx_objset->os_sa->sa_master_obj == 0)
		return;

	if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
	} else {
		/*
		 * The layout/registry objects don't exist yet: hold the
		 * master object entries and two new objects (one for
		 * layouts, one for the registry - hence the two holds).
		 */
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	}

	dmu_tx_sa_registration_hold(sa, tx);

	/* Attributes fit in the bonus buffer; no spill block needed. */
	if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill)
		return;

	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
	    THT_SPILL, 0, 0);
}
1281 | ||
/*
 * Hold SA attribute
 *
 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
 *
 * variable_size is the total size of all variable sized attributes
 * passed to this function. It is not the total size of all
 * variable size attributes that *may* exist on this object.
 */
void
dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
{
	uint64_t object;
	sa_os_t *sa = tx->tx_objset->os_sa;

	ASSERT(hdl != NULL);

	object = sa_handle_object(hdl);

	dmu_tx_hold_bonus(tx, object);

	/* No SA master object means SA is not set up on this objset. */
	if (tx->tx_objset->os_sa->sa_master_obj == 0)
		return;

	/*
	 * If the registry or layout objects don't exist yet, hold the
	 * master object entries plus two new objects for them.
	 */
	if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
	    tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	}

	dmu_tx_sa_registration_hold(sa, tx);

	/* Growth may require a new layout entry. */
	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);

	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
		ASSERT(tx->tx_txg == 0);
		dmu_tx_hold_spill(tx, object);
	} else {
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
		dnode_t *dn;

		/* DB_DNODE_ENTER/EXIT must bracket any use of the dnode. */
		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		if (dn->dn_have_spill) {
			ASSERT(tx->tx_txg == 0);
			dmu_tx_hold_spill(tx, object);
		}
		DB_DNODE_EXIT(db);
	}
}
c28b2279 | 1335 | |
570827e1 BB |
1336 | void |
1337 | dmu_tx_init(void) | |
1338 | { | |
1339 | dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc", | |
1340 | KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t), | |
1341 | KSTAT_FLAG_VIRTUAL); | |
1342 | ||
1343 | if (dmu_tx_ksp != NULL) { | |
1344 | dmu_tx_ksp->ks_data = &dmu_tx_stats; | |
1345 | kstat_install(dmu_tx_ksp); | |
1346 | } | |
1347 | } | |
1348 | ||
1349 | void | |
1350 | dmu_tx_fini(void) | |
1351 | { | |
1352 | if (dmu_tx_ksp != NULL) { | |
1353 | kstat_delete(dmu_tx_ksp); | |
1354 | dmu_tx_ksp = NULL; | |
1355 | } | |
1356 | } | |
1357 | ||
c28b2279 BB |
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Export the public DMU transaction API to other kernel modules. */
EXPORT_SYMBOL(dmu_tx_create);
EXPORT_SYMBOL(dmu_tx_hold_write);
EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_free);
EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_zap);
EXPORT_SYMBOL(dmu_tx_hold_zap_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_bonus);
EXPORT_SYMBOL(dmu_tx_hold_bonus_by_dnode);
EXPORT_SYMBOL(dmu_tx_abort);
EXPORT_SYMBOL(dmu_tx_assign);
EXPORT_SYMBOL(dmu_tx_wait);
EXPORT_SYMBOL(dmu_tx_commit);
EXPORT_SYMBOL(dmu_tx_mark_netfree);
EXPORT_SYMBOL(dmu_tx_get_txg);
EXPORT_SYMBOL(dmu_tx_callback_register);
EXPORT_SYMBOL(dmu_tx_do_callbacks);
EXPORT_SYMBOL(dmu_tx_hold_spill);
EXPORT_SYMBOL(dmu_tx_hold_sa_create);
EXPORT_SYMBOL(dmu_tx_hold_sa);
#endif