]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
22cd4a46 | 23 | * Copyright 2011 Nexenta Systems, Inc. All rights reserved. |
4747a7d3 | 24 | * Copyright (c) 2012, 2017 by Delphix. All rights reserved. |
22cd4a46 | 25 | */ |
34dc7c2f | 26 | |
34dc7c2f BB |
27 | #include <sys/dmu.h> |
28 | #include <sys/dmu_impl.h> | |
29 | #include <sys/dbuf.h> | |
30 | #include <sys/dmu_tx.h> | |
31 | #include <sys/dmu_objset.h> | |
3ec3bc21 BB |
32 | #include <sys/dsl_dataset.h> |
33 | #include <sys/dsl_dir.h> | |
34dc7c2f | 34 | #include <sys/dsl_pool.h> |
3ec3bc21 | 35 | #include <sys/zap_impl.h> |
34dc7c2f | 36 | #include <sys/spa.h> |
428870ff BB |
37 | #include <sys/sa.h> |
38 | #include <sys/sa_impl.h> | |
34dc7c2f | 39 | #include <sys/zfs_context.h> |
428870ff | 40 | #include <sys/varargs.h> |
49ee64e5 | 41 | #include <sys/trace_dmu.h> |
34dc7c2f BB |
42 | |
43 | typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, | |
44 | uint64_t arg1, uint64_t arg2); | |
45 | ||
570827e1 BB |
46 | dmu_tx_stats_t dmu_tx_stats = { |
47 | { "dmu_tx_assigned", KSTAT_DATA_UINT64 }, | |
48 | { "dmu_tx_delay", KSTAT_DATA_UINT64 }, | |
49 | { "dmu_tx_error", KSTAT_DATA_UINT64 }, | |
50 | { "dmu_tx_suspended", KSTAT_DATA_UINT64 }, | |
51 | { "dmu_tx_group", KSTAT_DATA_UINT64 }, | |
570827e1 BB |
52 | { "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 }, |
53 | { "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 }, | |
570827e1 | 54 | { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, |
e8b96c60 MA |
55 | { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, |
56 | { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, | |
570827e1 BB |
57 | { "dmu_tx_quota", KSTAT_DATA_UINT64 }, |
58 | }; | |
59 | ||
60 | static kstat_t *dmu_tx_ksp; | |
34dc7c2f BB |
61 | |
62 | dmu_tx_t * | |
63 | dmu_tx_create_dd(dsl_dir_t *dd) | |
64 | { | |
79c76d5b | 65 | dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); |
34dc7c2f | 66 | tx->tx_dir = dd; |
6f1ffb06 | 67 | if (dd != NULL) |
34dc7c2f BB |
68 | tx->tx_pool = dd->dd_pool; |
69 | list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), | |
70 | offsetof(dmu_tx_hold_t, txh_node)); | |
428870ff BB |
71 | list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), |
72 | offsetof(dmu_tx_callback_t, dcb_node)); | |
e8b96c60 | 73 | tx->tx_start = gethrtime(); |
34dc7c2f BB |
74 | return (tx); |
75 | } | |
76 | ||
77 | dmu_tx_t * | |
78 | dmu_tx_create(objset_t *os) | |
79 | { | |
428870ff | 80 | dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); |
34dc7c2f | 81 | tx->tx_objset = os; |
34dc7c2f BB |
82 | return (tx); |
83 | } | |
84 | ||
85 | dmu_tx_t * | |
86 | dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) | |
87 | { | |
88 | dmu_tx_t *tx = dmu_tx_create_dd(NULL); | |
89 | ||
4747a7d3 | 90 | txg_verify(dp->dp_spa, txg); |
34dc7c2f BB |
91 | tx->tx_pool = dp; |
92 | tx->tx_txg = txg; | |
93 | tx->tx_anyobj = TRUE; | |
94 | ||
95 | return (tx); | |
96 | } | |
97 | ||
98 | int | |
99 | dmu_tx_is_syncing(dmu_tx_t *tx) | |
100 | { | |
101 | return (tx->tx_anyobj); | |
102 | } | |
103 | ||
104 | int | |
105 | dmu_tx_private_ok(dmu_tx_t *tx) | |
106 | { | |
107 | return (tx->tx_anyobj); | |
108 | } | |
109 | ||
110 | static dmu_tx_hold_t * | |
0eef1bde | 111 | dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type, |
112 | uint64_t arg1, uint64_t arg2) | |
34dc7c2f BB |
113 | { |
114 | dmu_tx_hold_t *txh; | |
34dc7c2f | 115 | |
0eef1bde | 116 | if (dn != NULL) { |
66eead53 | 117 | (void) refcount_add(&dn->dn_holds, tx); |
0eef1bde | 118 | if (tx->tx_txg != 0) { |
34dc7c2f BB |
119 | mutex_enter(&dn->dn_mtx); |
120 | /* | |
121 | * dn->dn_assigned_txg == tx->tx_txg doesn't pose a | |
122 | * problem, but there's no way for it to happen (for | |
123 | * now, at least). | |
124 | */ | |
125 | ASSERT(dn->dn_assigned_txg == 0); | |
126 | dn->dn_assigned_txg = tx->tx_txg; | |
127 | (void) refcount_add(&dn->dn_tx_holds, tx); | |
128 | mutex_exit(&dn->dn_mtx); | |
129 | } | |
130 | } | |
131 | ||
79c76d5b | 132 | txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); |
34dc7c2f BB |
133 | txh->txh_tx = tx; |
134 | txh->txh_dnode = dn; | |
f85c06be | 135 | refcount_create(&txh->txh_space_towrite); |
f85c06be | 136 | refcount_create(&txh->txh_memory_tohold); |
34dc7c2f BB |
137 | txh->txh_type = type; |
138 | txh->txh_arg1 = arg1; | |
139 | txh->txh_arg2 = arg2; | |
34dc7c2f BB |
140 | list_insert_tail(&tx->tx_holds, txh); |
141 | ||
142 | return (txh); | |
143 | } | |
144 | ||
0eef1bde | 145 | static dmu_tx_hold_t * |
146 | dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, | |
147 | enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) | |
148 | { | |
149 | dnode_t *dn = NULL; | |
150 | dmu_tx_hold_t *txh; | |
151 | int err; | |
152 | ||
153 | if (object != DMU_NEW_OBJECT) { | |
154 | err = dnode_hold(os, object, FTAG, &dn); | |
66eead53 | 155 | if (err != 0) { |
0eef1bde | 156 | tx->tx_err = err; |
157 | return (NULL); | |
158 | } | |
159 | } | |
160 | txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2); | |
161 | if (dn != NULL) | |
162 | dnode_rele(dn, FTAG); | |
163 | return (txh); | |
164 | } | |
165 | ||
34dc7c2f | 166 | void |
66eead53 | 167 | dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn) |
34dc7c2f BB |
168 | { |
169 | /* | |
170 | * If we're syncing, they can manipulate any object anyhow, and | |
171 | * the hold on the dnode_t can cause problems. | |
172 | */ | |
0eef1bde | 173 | if (!dmu_tx_is_syncing(tx)) |
174 | (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); | |
34dc7c2f BB |
175 | } |
176 | ||
3ec3bc21 BB |
177 | /* |
178 | * This function reads specified data from disk. The specified data will | |
179 | * be needed to perform the transaction -- i.e., it will be read after | |
180 | * we do dmu_tx_assign(). There are two reasons that we read the data now | |
181 | * (before dmu_tx_assign()): | |
182 | * | |
183 | * 1. Reading it now has potentially better performance. The transaction | |
184 | * has not yet been assigned, so the TXG is not held open, and also the | |
185 | * caller typically has fewer locks held when calling dmu_tx_hold_*() than | |
186 | * after the transaction has been assigned. This reduces the lock (and txg) | |
187 | * hold times, thus reducing lock contention. | |
188 | * | |
189 | * 2. It is easier for callers (primarily the ZPL) to handle i/o errors | |
190 | * that are detected before they start making changes to the DMU state | |
191 | * (i.e. now). Once the transaction has been assigned, and some DMU | |
192 | * state has been changed, it can be difficult to recover from an i/o | |
193 | * error (e.g. to undo the changes already made in memory at the DMU | |
194 | * layer). Typically code to do so does not exist in the caller -- it | |
195 | * assumes that the data has already been cached and thus i/o errors are | |
196 | * not possible. | |
197 | * | |
198 | * It has been observed that the i/o initiated here can be a performance | |
199 | * problem, and it appears to be optional, because we don't look at the | |
200 | * data which is read. However, removing this read would only serve to | |
201 | * move the work elsewhere (after the dmu_tx_assign()), where it may | |
202 | * have a greater impact on performance (in addition to the impact on | |
203 | * fault tolerance noted above). | |
204 | */ | |
34dc7c2f BB |
205 | static int |
206 | dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) | |
207 | { | |
208 | int err; | |
209 | dmu_buf_impl_t *db; | |
210 | ||
211 | rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
212 | db = dbuf_hold_level(dn, level, blkid, FTAG); | |
213 | rw_exit(&dn->dn_struct_rwlock); | |
214 | if (db == NULL) | |
2e528b49 | 215 | return (SET_ERROR(EIO)); |
34dc7c2f BB |
216 | err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); |
217 | dbuf_rele(db, FTAG); | |
218 | return (err); | |
219 | } | |
220 | ||
221 | /* ARGSUSED */ | |
222 | static void | |
223 | dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) | |
224 | { | |
225 | dnode_t *dn = txh->txh_dnode; | |
34dc7c2f BB |
226 | int err = 0; |
227 | ||
228 | if (len == 0) | |
229 | return; | |
230 | ||
3ec3bc21 | 231 | (void) refcount_add_many(&txh->txh_space_towrite, len, FTAG); |
34dc7c2f | 232 | |
3ec3bc21 BB |
233 | if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) |
234 | err = SET_ERROR(EFBIG); | |
34dc7c2f | 235 | |
3ec3bc21 BB |
236 | if (dn == NULL) |
237 | return; | |
34dc7c2f | 238 | |
3ec3bc21 BB |
239 | /* |
240 | * For i/o error checking, read the blocks that will be needed | |
241 | * to perform the write: the first and last level-0 blocks (if | |
242 | * they are not aligned, i.e. if they are partial-block writes), | |
243 | * and all the level-1 blocks. | |
244 | */ | |
245 | if (dn->dn_maxblkid == 0) { | |
246 | if (off < dn->dn_datablksz && | |
247 | (off > 0 || len < dn->dn_datablksz)) { | |
248 | err = dmu_tx_check_ioerr(NULL, dn, 0, 0); | |
249 | if (err != 0) { | |
250 | txh->txh_tx->tx_err = err; | |
34dc7c2f | 251 | } |
9babb374 | 252 | } |
3ec3bc21 BB |
253 | } else { |
254 | zio_t *zio = zio_root(dn->dn_objset->os_spa, | |
255 | NULL, NULL, ZIO_FLAG_CANFAIL); | |
9babb374 | 256 | |
3ec3bc21 BB |
257 | /* first level-0 block */ |
258 | uint64_t start = off >> dn->dn_datablkshift; | |
259 | if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { | |
260 | err = dmu_tx_check_ioerr(zio, dn, 0, start); | |
261 | if (err != 0) { | |
262 | txh->txh_tx->tx_err = err; | |
263 | } | |
428870ff | 264 | } |
428870ff | 265 | |
3ec3bc21 BB |
266 | /* last level-0 block */ |
267 | uint64_t end = (off + len - 1) >> dn->dn_datablkshift; | |
268 | if (end != start && end <= dn->dn_maxblkid && | |
269 | P2PHASE(off + len, dn->dn_datablksz)) { | |
270 | err = dmu_tx_check_ioerr(zio, dn, 0, end); | |
271 | if (err != 0) { | |
428870ff | 272 | txh->txh_tx->tx_err = err; |
9babb374 | 273 | } |
3ec3bc21 | 274 | } |
428870ff | 275 | |
3ec3bc21 BB |
276 | /* level-1 blocks */ |
277 | if (dn->dn_nlevels > 1) { | |
278 | int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; | |
279 | for (uint64_t i = (start >> shft) + 1; | |
280 | i < end >> shft; i++) { | |
281 | err = dmu_tx_check_ioerr(zio, dn, 1, i); | |
282 | if (err != 0) { | |
283 | txh->txh_tx->tx_err = err; | |
284 | } | |
9babb374 | 285 | } |
9babb374 | 286 | } |
34dc7c2f | 287 | |
3ec3bc21 BB |
288 | err = zio_wait(zio); |
289 | if (err != 0) { | |
290 | txh->txh_tx->tx_err = err; | |
9babb374 | 291 | } |
34dc7c2f | 292 | } |
34dc7c2f BB |
293 | } |
294 | ||
295 | static void | |
296 | dmu_tx_count_dnode(dmu_tx_hold_t *txh) | |
297 | { | |
3ec3bc21 | 298 | (void) refcount_add_many(&txh->txh_space_towrite, DNODE_MIN_SIZE, FTAG); |
34dc7c2f BB |
299 | } |
300 | ||
301 | void | |
302 | dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) | |
303 | { | |
304 | dmu_tx_hold_t *txh; | |
305 | ||
66eead53 MA |
306 | ASSERT0(tx->tx_txg); |
307 | ASSERT3U(len, <=, DMU_MAX_ACCESS); | |
34dc7c2f BB |
308 | ASSERT(len == 0 || UINT64_MAX - off >= len - 1); |
309 | ||
310 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
311 | object, THT_WRITE, off, len); | |
66eead53 MA |
312 | if (txh != NULL) { |
313 | dmu_tx_count_write(txh, off, len); | |
314 | dmu_tx_count_dnode(txh); | |
315 | } | |
34dc7c2f BB |
316 | } |
317 | ||
a1d477c2 MA |
318 | void |
319 | dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object) | |
320 | { | |
321 | dmu_tx_hold_t *txh; | |
322 | ||
323 | ASSERT(tx->tx_txg == 0); | |
324 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
325 | object, THT_WRITE, 0, 0); | |
326 | if (txh == NULL) | |
327 | return; | |
328 | ||
329 | dnode_t *dn = txh->txh_dnode; | |
330 | (void) refcount_add_many(&txh->txh_space_towrite, | |
331 | 1ULL << dn->dn_indblkshift, FTAG); | |
332 | dmu_tx_count_dnode(txh); | |
333 | } | |
334 | ||
0eef1bde | 335 | void |
336 | dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) | |
337 | { | |
338 | dmu_tx_hold_t *txh; | |
339 | ||
66eead53 MA |
340 | ASSERT0(tx->tx_txg); |
341 | ASSERT3U(len, <=, DMU_MAX_ACCESS); | |
0eef1bde | 342 | ASSERT(len == 0 || UINT64_MAX - off >= len - 1); |
343 | ||
344 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len); | |
66eead53 MA |
345 | if (txh != NULL) { |
346 | dmu_tx_count_write(txh, off, len); | |
347 | dmu_tx_count_dnode(txh); | |
348 | } | |
0eef1bde | 349 | } |
350 | ||
19d55079 MA |
351 | /* |
352 | * This function marks the transaction as being a "net free". The end | |
353 | * result is that refquotas will be disabled for this transaction, and | |
354 | * this transaction will be able to use half of the pool space overhead | |
355 | * (see dsl_pool_adjustedsize()). Therefore this function should only | |
356 | * be called for transactions that we expect will not cause a net increase | |
357 | * in the amount of space used (but it's OK if that is occasionally not true). | |
358 | */ | |
359 | void | |
360 | dmu_tx_mark_netfree(dmu_tx_t *tx) | |
361 | { | |
3ec3bc21 | 362 | tx->tx_netfree = B_TRUE; |
19d55079 MA |
363 | } |
364 | ||
0eef1bde | 365 | static void |
366 | dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) | |
34dc7c2f | 367 | { |
3ec3bc21 BB |
368 | dmu_tx_t *tx = txh->txh_tx; |
369 | dnode_t *dn = txh->txh_dnode; | |
ea97f8ce | 370 | int err; |
34dc7c2f BB |
371 | |
372 | ASSERT(tx->tx_txg == 0); | |
373 | ||
e8b96c60 | 374 | dmu_tx_count_dnode(txh); |
34dc7c2f | 375 | |
3ec3bc21 | 376 | if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) |
34dc7c2f BB |
377 | return; |
378 | if (len == DMU_OBJECT_END) | |
3ec3bc21 | 379 | len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; |
34dc7c2f | 380 | |
ea97f8ce MA |
381 | dmu_tx_count_dnode(txh); |
382 | ||
34dc7c2f | 383 | /* |
ea97f8ce MA |
384 | * For i/o error checking, we read the first and last level-0 |
385 | * blocks if they are not aligned, and all the level-1 blocks. | |
386 | * | |
387 | * Note: dbuf_free_range() assumes that we have not instantiated | |
388 | * any level-0 dbufs that will be completely freed. Therefore we must | |
389 | * exercise care to not read or count the first and last blocks | |
390 | * if they are blocksize-aligned. | |
391 | */ | |
392 | if (dn->dn_datablkshift == 0) { | |
b663a23d | 393 | if (off != 0 || len < dn->dn_datablksz) |
92bc214c | 394 | dmu_tx_count_write(txh, 0, dn->dn_datablksz); |
ea97f8ce MA |
395 | } else { |
396 | /* first block will be modified if it is not aligned */ | |
397 | if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) | |
398 | dmu_tx_count_write(txh, off, 1); | |
399 | /* last block will be modified if it is not aligned */ | |
400 | if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) | |
3ec3bc21 | 401 | dmu_tx_count_write(txh, off + len, 1); |
ea97f8ce MA |
402 | } |
403 | ||
404 | /* | |
405 | * Check level-1 blocks. | |
34dc7c2f BB |
406 | */ |
407 | if (dn->dn_nlevels > 1) { | |
ea97f8ce | 408 | int shift = dn->dn_datablkshift + dn->dn_indblkshift - |
34dc7c2f | 409 | SPA_BLKPTRSHIFT; |
ea97f8ce MA |
410 | uint64_t start = off >> shift; |
411 | uint64_t end = (off + len) >> shift; | |
ea97f8ce | 412 | |
ea97f8ce | 413 | ASSERT(dn->dn_indblkshift != 0); |
34dc7c2f | 414 | |
2e7b7657 MA |
415 | /* |
416 | * dnode_reallocate() can result in an object with indirect | |
417 | * blocks having an odd data block size. In this case, | |
418 | * just check the single block. | |
419 | */ | |
420 | if (dn->dn_datablkshift == 0) | |
421 | start = end = 0; | |
422 | ||
3ec3bc21 | 423 | zio_t *zio = zio_root(tx->tx_pool->dp_spa, |
34dc7c2f | 424 | NULL, NULL, ZIO_FLAG_CANFAIL); |
1c27024e | 425 | for (uint64_t i = start; i <= end; i++) { |
34dc7c2f | 426 | uint64_t ibyte = i << shift; |
b128c09f | 427 | err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); |
34dc7c2f | 428 | i = ibyte >> shift; |
4bda3bd0 | 429 | if (err == ESRCH || i > end) |
34dc7c2f | 430 | break; |
3ec3bc21 | 431 | if (err != 0) { |
34dc7c2f | 432 | tx->tx_err = err; |
3ec3bc21 | 433 | (void) zio_wait(zio); |
34dc7c2f BB |
434 | return; |
435 | } | |
436 | ||
3ec3bc21 BB |
437 | (void) refcount_add_many(&txh->txh_memory_tohold, |
438 | 1 << dn->dn_indblkshift, FTAG); | |
439 | ||
34dc7c2f | 440 | err = dmu_tx_check_ioerr(zio, dn, 1, i); |
3ec3bc21 | 441 | if (err != 0) { |
34dc7c2f | 442 | tx->tx_err = err; |
3ec3bc21 | 443 | (void) zio_wait(zio); |
34dc7c2f BB |
444 | return; |
445 | } | |
446 | } | |
447 | err = zio_wait(zio); | |
3ec3bc21 | 448 | if (err != 0) { |
34dc7c2f BB |
449 | tx->tx_err = err; |
450 | return; | |
451 | } | |
452 | } | |
34dc7c2f BB |
453 | } |
454 | ||
455 | void | |
0eef1bde | 456 | dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) |
457 | { | |
458 | dmu_tx_hold_t *txh; | |
459 | ||
460 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
461 | object, THT_FREE, off, len); | |
66eead53 MA |
462 | if (txh != NULL) |
463 | (void) dmu_tx_hold_free_impl(txh, off, len); | |
0eef1bde | 464 | } |
465 | ||
466 | void | |
467 | dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) | |
34dc7c2f BB |
468 | { |
469 | dmu_tx_hold_t *txh; | |
0eef1bde | 470 | |
471 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); | |
66eead53 MA |
472 | if (txh != NULL) |
473 | (void) dmu_tx_hold_free_impl(txh, off, len); | |
0eef1bde | 474 | } |
475 | ||
476 | static void | |
9522bd24 | 477 | dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) |
0eef1bde | 478 | { |
479 | dmu_tx_t *tx = txh->txh_tx; | |
3ec3bc21 | 480 | dnode_t *dn = txh->txh_dnode; |
f85c06be | 481 | int err; |
34dc7c2f BB |
482 | |
483 | ASSERT(tx->tx_txg == 0); | |
484 | ||
34dc7c2f BB |
485 | dmu_tx_count_dnode(txh); |
486 | ||
3ec3bc21 BB |
487 | /* |
488 | * Modifying a almost-full microzap is around the worst case (128KB) | |
489 | * | |
490 | * If it is a fat zap, the worst case would be 7*16KB=112KB: | |
491 | * - 3 blocks overwritten: target leaf, ptrtbl block, header block | |
492 | * - 4 new blocks written if adding: | |
493 | * - 2 blocks for possibly split leaves, | |
494 | * - 2 grown ptrtbl blocks | |
495 | */ | |
496 | (void) refcount_add_many(&txh->txh_space_towrite, | |
497 | MZAP_MAX_BLKSZ, FTAG); | |
498 | ||
499 | if (dn == NULL) | |
34dc7c2f | 500 | return; |
34dc7c2f | 501 | |
9ae529ec | 502 | ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); |
34dc7c2f | 503 | |
3ec3bc21 | 504 | if (dn->dn_maxblkid == 0 || name == NULL) { |
34dc7c2f | 505 | /* |
3ec3bc21 BB |
506 | * This is a microzap (only one block), or we don't know |
507 | * the name. Check the first block for i/o errors. | |
34dc7c2f BB |
508 | */ |
509 | err = dmu_tx_check_ioerr(NULL, dn, 0, 0); | |
3ec3bc21 | 510 | if (err != 0) { |
34dc7c2f | 511 | tx->tx_err = err; |
f85c06be | 512 | } |
3ec3bc21 | 513 | } else { |
34dc7c2f | 514 | /* |
3ec3bc21 BB |
515 | * Access the name so that we'll check for i/o errors to |
516 | * the leaf blocks, etc. We ignore ENOENT, as this name | |
517 | * may not yet exist. | |
34dc7c2f | 518 | */ |
2bce8049 | 519 | err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); |
3ec3bc21 | 520 | if (err == EIO || err == ECKSUM || err == ENXIO) { |
34dc7c2f | 521 | tx->tx_err = err; |
f85c06be GM |
522 | } |
523 | } | |
34dc7c2f BB |
524 | } |
525 | ||
0eef1bde | 526 | void |
527 | dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) | |
528 | { | |
529 | dmu_tx_hold_t *txh; | |
530 | ||
66eead53 | 531 | ASSERT0(tx->tx_txg); |
0eef1bde | 532 | |
533 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
534 | object, THT_ZAP, add, (uintptr_t)name); | |
66eead53 | 535 | if (txh != NULL) |
9522bd24 | 536 | dmu_tx_hold_zap_impl(txh, name); |
0eef1bde | 537 | } |
538 | ||
539 | void | |
540 | dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name) | |
541 | { | |
542 | dmu_tx_hold_t *txh; | |
543 | ||
66eead53 | 544 | ASSERT0(tx->tx_txg); |
0eef1bde | 545 | ASSERT(dn != NULL); |
546 | ||
547 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name); | |
66eead53 | 548 | if (txh != NULL) |
9522bd24 | 549 | dmu_tx_hold_zap_impl(txh, name); |
0eef1bde | 550 | } |
551 | ||
34dc7c2f BB |
552 | void |
553 | dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) | |
554 | { | |
555 | dmu_tx_hold_t *txh; | |
556 | ||
557 | ASSERT(tx->tx_txg == 0); | |
558 | ||
559 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
560 | object, THT_BONUS, 0, 0); | |
561 | if (txh) | |
562 | dmu_tx_count_dnode(txh); | |
563 | } | |
564 | ||
0eef1bde | 565 | void |
566 | dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn) | |
567 | { | |
568 | dmu_tx_hold_t *txh; | |
569 | ||
66eead53 | 570 | ASSERT0(tx->tx_txg); |
0eef1bde | 571 | |
572 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0); | |
573 | if (txh) | |
574 | dmu_tx_count_dnode(txh); | |
575 | } | |
576 | ||
34dc7c2f BB |
577 | void |
578 | dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) | |
579 | { | |
580 | dmu_tx_hold_t *txh; | |
7d637211 | 581 | |
34dc7c2f BB |
582 | ASSERT(tx->tx_txg == 0); |
583 | ||
584 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
585 | DMU_NEW_OBJECT, THT_SPACE, space, 0); | |
7d637211 | 586 | if (txh) |
f85c06be | 587 | (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); |
34dc7c2f BB |
588 | } |
589 | ||
#ifdef ZFS_DEBUG
/*
 * Debug-only (ZFS_DEBUG) check that the dbuf db being dirtied is covered
 * by one of the holds recorded in the assigned transaction tx.
 *
 * Returns silently if the transaction is flagged tx_anyobj, if db belongs
 * to the meta dnode, or once holds have been seen that match both the
 * object and the block.  Panics if no such holds exist.
 */
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	boolean_t match_object = B_FALSE;
	boolean_t match_offset = B_FALSE;

	DB_DNODE_ENTER(db);
	dnode_t *dn = DB_DNODE(db);
	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	if (tx->tx_anyobj) {
		DB_DNODE_EXIT(db);
		return;
	}

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
		DB_DNODE_EXIT(db);
		return;
	}

	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
			match_object = B_TRUE;
		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
			/*
			 * Convert the hold's byte range (arg1 = offset,
			 * arg2 = length) into block ids at db's level.
			 */
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			uint64_t beginblk = shift >= 64 ? 0 :
			    (txh->txh_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX txh_arg2 better not be zero... */

			/*
			 * uint64_t is not necessarily unsigned long long,
			 * so cast explicitly to match the %llx conversions.
			 */
			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
			    txh->txh_type, (u_longlong_t)beginblk,
			    (u_longlong_t)endblk);

			switch (txh->txh_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = B_TRUE;
				/*
				 * We will let this hold work for the bonus
				 * or spill buffer so that we don't need to
				 * hold it when creating a new object.
				 */
				if (blkid == DMU_BONUS_BLKID ||
				    blkid == DMU_SPILL_BLKID)
					match_offset = B_TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = B_TRUE;
				break;
			case THT_FREE:
				/*
				 * We will dirty all the level 1 blocks in
				 * the free range and perhaps the first and
				 * last level 0 block.
				 */
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = B_TRUE;
				break;
			case THT_SPILL:
				if (blkid == DMU_SPILL_BLKID)
					match_offset = B_TRUE;
				break;
			case THT_BONUS:
				if (blkid == DMU_BONUS_BLKID)
					match_offset = B_TRUE;
				break;
			case THT_ZAP:
				match_offset = B_TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = B_TRUE;
				break;
			default:
				cmn_err(CE_PANIC, "bad txh_type %d",
				    txh->txh_type);
			}
		}
		if (match_object && match_offset) {
			DB_DNODE_EXIT(db);
			return;
		}
	}
	DB_DNODE_EXIT(db);
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
#endif
696 | ||
e8b96c60 MA |
697 | /* |
698 | * If we can't do 10 iops, something is wrong. Let us go ahead | |
699 | * and hit zfs_dirty_data_max. | |
700 | */ | |
701 | hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ | |
702 | int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ | |
703 | ||
704 | /* | |
705 | * We delay transactions when we've determined that the backend storage | |
706 | * isn't able to accommodate the rate of incoming writes. | |
707 | * | |
708 | * If there is already a transaction waiting, we delay relative to when | |
709 | * that transaction finishes waiting. This way the calculated min_time | |
710 | * is independent of the number of threads concurrently executing | |
711 | * transactions. | |
712 | * | |
713 | * If we are the only waiter, wait relative to when the transaction | |
714 | * started, rather than the current time. This credits the transaction for | |
715 | * "time already served", e.g. reading indirect blocks. | |
716 | * | |
717 | * The minimum time for a transaction to take is calculated as: | |
718 | * min_time = scale * (dirty - min) / (max - dirty) | |
719 | * min_time is then capped at zfs_delay_max_ns. | |
720 | * | |
721 | * The delay has two degrees of freedom that can be adjusted via tunables. | |
722 | * The percentage of dirty data at which we start to delay is defined by | |
723 | * zfs_delay_min_dirty_percent. This should typically be at or above | |
724 | * zfs_vdev_async_write_active_max_dirty_percent so that we only start to | |
725 | * delay after writing at full speed has failed to keep up with the incoming | |
726 | * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly | |
727 | * speaking, this variable determines the amount of delay at the midpoint of | |
728 | * the curve. | |
729 | * | |
730 | * delay | |
731 | * 10ms +-------------------------------------------------------------*+ | |
732 | * | *| | |
733 | * 9ms + *+ | |
734 | * | *| | |
735 | * 8ms + *+ | |
736 | * | * | | |
737 | * 7ms + * + | |
738 | * | * | | |
739 | * 6ms + * + | |
740 | * | * | | |
741 | * 5ms + * + | |
742 | * | * | | |
743 | * 4ms + * + | |
744 | * | * | | |
745 | * 3ms + * + | |
746 | * | * | | |
747 | * 2ms + (midpoint) * + | |
748 | * | | ** | | |
749 | * 1ms + v *** + | |
750 | * | zfs_delay_scale ----------> ******** | | |
751 | * 0 +-------------------------------------*********----------------+ | |
752 | * 0% <- zfs_dirty_data_max -> 100% | |
753 | * | |
754 | * Note that since the delay is added to the outstanding time remaining on the | |
755 | * most recent transaction, the delay is effectively the inverse of IOPS. | |
756 | * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve | |
757 | * was chosen such that small changes in the amount of accumulated dirty data | |
758 | * in the first 3/4 of the curve yield relatively small differences in the | |
759 | * amount of delay. | |
760 | * | |
761 | * The effects can be easier to understand when the amount of delay is | |
762 | * represented on a log scale: | |
763 | * | |
764 | * delay | |
765 | * 100ms +-------------------------------------------------------------++ | |
766 | * + + | |
767 | * | | | |
768 | * + *+ | |
769 | * 10ms + *+ | |
770 | * + ** + | |
771 | * | (midpoint) ** | | |
772 | * + | ** + | |
773 | * 1ms + v **** + | |
774 | * + zfs_delay_scale ----------> ***** + | |
775 | * | **** | | |
776 | * + **** + | |
777 | * 100us + ** + | |
778 | * + * + | |
779 | * | * | | |
780 | * + * + | |
781 | * 10us + * + | |
782 | * + + | |
783 | * | | | |
784 | * + + | |
785 | * +--------------------------------------------------------------+ | |
786 | * 0% <- zfs_dirty_data_max -> 100% | |
787 | * | |
788 | * Note here that only as the amount of dirty data approaches its limit does | |
789 | * the delay start to increase rapidly. The goal of a properly tuned system | |
790 | * should be to keep the amount of dirty data out of that range by first | |
791 | * ensuring that the appropriate limits are set for the I/O scheduler to reach | |
792 | * optimal throughput on the backend storage, and then by changing the value | |
793 | * of zfs_delay_scale to increase the steepness of the curve. | |
794 | */ | |
795 | static void | |
796 | dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) | |
797 | { | |
798 | dsl_pool_t *dp = tx->tx_pool; | |
799 | uint64_t delay_min_bytes = | |
800 | zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; | |
801 | hrtime_t wakeup, min_tx_time, now; | |
802 | ||
803 | if (dirty <= delay_min_bytes) | |
804 | return; | |
805 | ||
806 | /* | |
807 | * The caller has already waited until we are under the max. | |
808 | * We make them pass us the amount of dirty data so we don't | |
809 | * have to handle the case of it being >= the max, which could | |
810 | * cause a divide-by-zero if it's == the max. | |
811 | */ | |
812 | ASSERT3U(dirty, <, zfs_dirty_data_max); | |
813 | ||
814 | now = gethrtime(); | |
815 | min_tx_time = zfs_delay_scale * | |
816 | (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); | |
817 | min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); | |
818 | if (now > tx->tx_start + min_tx_time) | |
819 | return; | |
820 | ||
821 | DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, | |
822 | uint64_t, min_tx_time); | |
823 | ||
824 | mutex_enter(&dp->dp_lock); | |
825 | wakeup = MAX(tx->tx_start + min_tx_time, | |
826 | dp->dp_last_wakeup + min_tx_time); | |
827 | dp->dp_last_wakeup = wakeup; | |
828 | mutex_exit(&dp->dp_lock); | |
829 | ||
830 | zfs_sleep_until(wakeup); | |
831 | } | |
832 | ||
3ec3bc21 BB |
833 | /* |
834 | * This routine attempts to assign the transaction to a transaction group. | |
835 | * To do so, we must determine if there is sufficient free space on disk. | |
836 | * | |
837 | * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() | |
838 | * on it), then it is assumed that there is sufficient free space, | |
839 | * unless there's insufficient slop space in the pool (see the comment | |
840 | * above spa_slop_shift in spa_misc.c). | |
841 | * | |
842 | * If it is not a "netfree" transaction, then if the data already on disk | |
843 | * is over the allowed usage (e.g. quota), this will fail with EDQUOT or | |
844 | * ENOSPC. Otherwise, if the current rough estimate of pending changes, | |
845 | * plus the rough estimate of this transaction's changes, may exceed the | |
846 | * allowed usage, then this will fail with ERESTART, which will cause the | |
847 | * caller to wait for the pending changes to be written to disk (by waiting | |
848 | * for the next TXG to open), and then check the space usage again. | |
849 | * | |
850 | * The rough estimate of pending changes is comprised of the sum of: | |
851 | * | |
852 | * - this transaction's holds' txh_space_towrite | |
853 | * | |
854 | * - dd_tempreserved[], which is the sum of in-flight transactions' | |
855 | * holds' txh_space_towrite (i.e. those transactions that have called | |
856 | * dmu_tx_assign() but not yet called dmu_tx_commit()). | |
857 | * | |
858 | * - dd_space_towrite[], which is the amount of dirtied dbufs. | |
859 | * | |
860 | * Note that all of these values are inflated by spa_get_worst_case_asize(), | |
861 | * which means that we may get ERESTART well before we are actually in danger | |
862 | * of running out of space, but this also mitigates any small inaccuracies | |
863 | * in the rough estimate (e.g. txh_space_towrite doesn't take into account | |
864 | * indirect blocks, and dd_space_towrite[] doesn't take into account changes | |
865 | * to the MOS). | |
866 | * | |
867 | * Note that due to this algorithm, it is possible to exceed the allowed | |
868 | * usage by one transaction. Also, as we approach the allowed usage, | |
869 | * we will allow a very limited amount of changes into each TXG, thus | |
870 | * decreasing performance. | |
871 | */ | |
34dc7c2f | 872 | static int |
0735ecb3 | 873 | dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) |
34dc7c2f | 874 | { |
34dc7c2f | 875 | spa_t *spa = tx->tx_pool->dp_spa; |
34dc7c2f | 876 | |
c99c9001 | 877 | ASSERT0(tx->tx_txg); |
34dc7c2f | 878 | |
570827e1 BB |
879 | if (tx->tx_err) { |
880 | DMU_TX_STAT_BUMP(dmu_tx_error); | |
34dc7c2f | 881 | return (tx->tx_err); |
570827e1 | 882 | } |
34dc7c2f | 883 | |
b128c09f | 884 | if (spa_suspended(spa)) { |
570827e1 BB |
885 | DMU_TX_STAT_BUMP(dmu_tx_suspended); |
886 | ||
34dc7c2f BB |
887 | /* |
888 | * If the user has indicated a blocking failure mode | |
889 | * then return ERESTART which will block in dmu_tx_wait(). | |
890 | * Otherwise, return EIO so that an error can get | |
891 | * propagated back to the VOP calls. | |
892 | * | |
893 | * Note that we always honor the txg_how flag regardless | |
894 | * of the failuremode setting. | |
895 | */ | |
896 | if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && | |
0735ecb3 | 897 | !(txg_how & TXG_WAIT)) |
2e528b49 | 898 | return (SET_ERROR(EIO)); |
34dc7c2f | 899 | |
2e528b49 | 900 | return (SET_ERROR(ERESTART)); |
34dc7c2f BB |
901 | } |
902 | ||
0735ecb3 | 903 | if (!tx->tx_dirty_delayed && |
e8b96c60 MA |
904 | dsl_pool_need_dirty_delay(tx->tx_pool)) { |
905 | tx->tx_wait_dirty = B_TRUE; | |
906 | DMU_TX_STAT_BUMP(dmu_tx_dirty_delay); | |
ecb2b7dc | 907 | return (SET_ERROR(ERESTART)); |
e8b96c60 MA |
908 | } |
909 | ||
34dc7c2f BB |
910 | tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); |
911 | tx->tx_needassign_txh = NULL; | |
912 | ||
913 | /* | |
914 | * NB: No error returns are allowed after txg_hold_open, but | |
915 | * before processing the dnode holds, due to the | |
916 | * dmu_tx_unassign() logic. | |
917 | */ | |
918 | ||
3ec3bc21 BB |
919 | uint64_t towrite = 0; |
920 | uint64_t tohold = 0; | |
921 | for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; | |
34dc7c2f BB |
922 | txh = list_next(&tx->tx_holds, txh)) { |
923 | dnode_t *dn = txh->txh_dnode; | |
924 | if (dn != NULL) { | |
925 | mutex_enter(&dn->dn_mtx); | |
926 | if (dn->dn_assigned_txg == tx->tx_txg - 1) { | |
927 | mutex_exit(&dn->dn_mtx); | |
928 | tx->tx_needassign_txh = txh; | |
570827e1 | 929 | DMU_TX_STAT_BUMP(dmu_tx_group); |
2e528b49 | 930 | return (SET_ERROR(ERESTART)); |
34dc7c2f BB |
931 | } |
932 | if (dn->dn_assigned_txg == 0) | |
933 | dn->dn_assigned_txg = tx->tx_txg; | |
934 | ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); | |
935 | (void) refcount_add(&dn->dn_tx_holds, tx); | |
936 | mutex_exit(&dn->dn_mtx); | |
937 | } | |
f85c06be | 938 | towrite += refcount_count(&txh->txh_space_towrite); |
f85c06be | 939 | tohold += refcount_count(&txh->txh_memory_tohold); |
34dc7c2f BB |
940 | } |
941 | ||
b128c09f | 942 | /* needed allocation: worst-case estimate of write space */ |
3ec3bc21 | 943 | uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite); |
b128c09f | 944 | /* calculate memory footprint estimate */ |
3ec3bc21 | 945 | uint64_t memory = towrite + tohold; |
34dc7c2f | 946 | |
3ec3bc21 | 947 | if (tx->tx_dir != NULL && asize != 0) { |
b128c09f | 948 | int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, |
3ec3bc21 BB |
949 | asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx); |
950 | if (err != 0) | |
34dc7c2f BB |
951 | return (err); |
952 | } | |
953 | ||
570827e1 BB |
954 | DMU_TX_STAT_BUMP(dmu_tx_assigned); |
955 | ||
34dc7c2f BB |
956 | return (0); |
957 | } | |
958 | ||
959 | static void | |
960 | dmu_tx_unassign(dmu_tx_t *tx) | |
961 | { | |
34dc7c2f BB |
962 | if (tx->tx_txg == 0) |
963 | return; | |
964 | ||
965 | txg_rele_to_quiesce(&tx->tx_txgh); | |
966 | ||
e49f1e20 WA |
967 | /* |
968 | * Walk the transaction's hold list, removing the hold on the | |
969 | * associated dnode, and notifying waiters if the refcount drops to 0. | |
970 | */ | |
3ec3bc21 | 971 | for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); |
981b2126 | 972 | txh && txh != tx->tx_needassign_txh; |
34dc7c2f BB |
973 | txh = list_next(&tx->tx_holds, txh)) { |
974 | dnode_t *dn = txh->txh_dnode; | |
975 | ||
976 | if (dn == NULL) | |
977 | continue; | |
978 | mutex_enter(&dn->dn_mtx); | |
979 | ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); | |
980 | ||
981 | if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { | |
982 | dn->dn_assigned_txg = 0; | |
983 | cv_broadcast(&dn->dn_notxholds); | |
984 | } | |
985 | mutex_exit(&dn->dn_mtx); | |
986 | } | |
987 | ||
988 | txg_rele_to_sync(&tx->tx_txgh); | |
989 | ||
990 | tx->tx_lasttried_txg = tx->tx_txg; | |
991 | tx->tx_txg = 0; | |
992 | } | |
993 | ||
994 | /* | |
0735ecb3 | 995 | * Assign tx to a transaction group; txg_how is a bitmask: |
34dc7c2f | 996 | * |
0735ecb3 PS |
997 | * If TXG_WAIT is set and the currently open txg is full, this function |
998 | * will wait until there's a new txg. This should be used when no locks | |
999 | * are being held. With this bit set, this function will only fail if | |
1000 | * we're truly out of space (or over quota). | |
34dc7c2f | 1001 | * |
0735ecb3 PS |
1002 | * If TXG_WAIT is *not* set and we can't assign into the currently open |
1003 | * txg without blocking, this function will return immediately with | |
1004 | * ERESTART. This should be used whenever locks are being held. On an | |
1005 | * ERESTART error, the caller should drop all locks, call dmu_tx_wait(), | |
1006 | * and try again. | |
e8b96c60 | 1007 | * |
0735ecb3 PS |
1008 | * If TXG_NOTHROTTLE is set, this indicates that this tx should not be |
1009 | * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for | |
1010 | * details on the throttle). This is used by the VFS operations, after | |
1011 | * they have already called dmu_tx_wait() (though most likely on a | |
1012 | * different tx). | |
34dc7c2f BB |
1013 | */ |
1014 | int | |
0735ecb3 | 1015 | dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) |
34dc7c2f BB |
1016 | { |
1017 | int err; | |
1018 | ||
1019 | ASSERT(tx->tx_txg == 0); | |
0735ecb3 | 1020 | ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE)); |
34dc7c2f BB |
1021 | ASSERT(!dsl_pool_sync_context(tx->tx_pool)); |
1022 | ||
13fe0198 | 1023 | /* If we might wait, we must not hold the config lock. */ |
0735ecb3 PS |
1024 | IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool)); |
1025 | ||
1026 | if ((txg_how & TXG_NOTHROTTLE)) | |
1027 | tx->tx_dirty_delayed = B_TRUE; | |
13fe0198 | 1028 | |
34dc7c2f BB |
1029 | while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { |
1030 | dmu_tx_unassign(tx); | |
1031 | ||
0735ecb3 | 1032 | if (err != ERESTART || !(txg_how & TXG_WAIT)) |
34dc7c2f BB |
1033 | return (err); |
1034 | ||
1035 | dmu_tx_wait(tx); | |
1036 | } | |
1037 | ||
1038 | txg_rele_to_quiesce(&tx->tx_txgh); | |
1039 | ||
1040 | return (0); | |
1041 | } | |
1042 | ||
1043 | void | |
1044 | dmu_tx_wait(dmu_tx_t *tx) | |
1045 | { | |
1046 | spa_t *spa = tx->tx_pool->dp_spa; | |
e8b96c60 | 1047 | dsl_pool_t *dp = tx->tx_pool; |
a77c4c83 | 1048 | hrtime_t before; |
34dc7c2f BB |
1049 | |
1050 | ASSERT(tx->tx_txg == 0); | |
13fe0198 | 1051 | ASSERT(!dsl_pool_config_held(tx->tx_pool)); |
34dc7c2f | 1052 | |
a77c4c83 NB |
1053 | before = gethrtime(); |
1054 | ||
e8b96c60 MA |
1055 | if (tx->tx_wait_dirty) { |
1056 | uint64_t dirty; | |
1057 | ||
1058 | /* | |
1059 | * dmu_tx_try_assign() has determined that we need to wait | |
1060 | * because we've consumed much or all of the dirty buffer | |
1061 | * space. | |
1062 | */ | |
1063 | mutex_enter(&dp->dp_lock); | |
1064 | if (dp->dp_dirty_total >= zfs_dirty_data_max) | |
1065 | DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max); | |
1066 | while (dp->dp_dirty_total >= zfs_dirty_data_max) | |
1067 | cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); | |
1068 | dirty = dp->dp_dirty_total; | |
1069 | mutex_exit(&dp->dp_lock); | |
1070 | ||
1071 | dmu_tx_delay(tx, dirty); | |
1072 | ||
1073 | tx->tx_wait_dirty = B_FALSE; | |
1074 | ||
1075 | /* | |
0735ecb3 PS |
1076 | * Note: setting tx_dirty_delayed only has effect if the |
1077 | * caller used TX_WAIT. Otherwise they are going to | |
1078 | * destroy this tx and try again. The common case, | |
1079 | * zfs_write(), uses TX_WAIT. | |
e8b96c60 | 1080 | */ |
0735ecb3 | 1081 | tx->tx_dirty_delayed = B_TRUE; |
e8b96c60 MA |
1082 | } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { |
1083 | /* | |
1084 | * If the pool is suspended we need to wait until it | |
1085 | * is resumed. Note that it's possible that the pool | |
1086 | * has become active after this thread has tried to | |
1087 | * obtain a tx. If that's the case then tx_lasttried_txg | |
1088 | * would not have been set. | |
1089 | */ | |
1090 | txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); | |
34dc7c2f BB |
1091 | } else if (tx->tx_needassign_txh) { |
1092 | dnode_t *dn = tx->tx_needassign_txh->txh_dnode; | |
1093 | ||
1094 | mutex_enter(&dn->dn_mtx); | |
1095 | while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) | |
1096 | cv_wait(&dn->dn_notxholds, &dn->dn_mtx); | |
1097 | mutex_exit(&dn->dn_mtx); | |
1098 | tx->tx_needassign_txh = NULL; | |
1099 | } else { | |
e8b96c60 MA |
1100 | /* |
1101 | * A dnode is assigned to the quiescing txg. Wait for its | |
1102 | * transaction to complete. | |
1103 | */ | |
34dc7c2f BB |
1104 | txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); |
1105 | } | |
a77c4c83 NB |
1106 | |
1107 | spa_tx_assign_add_nsecs(spa, gethrtime() - before); | |
34dc7c2f BB |
1108 | } |
1109 | ||
f85c06be GM |
1110 | static void |
1111 | dmu_tx_destroy(dmu_tx_t *tx) | |
1112 | { | |
1113 | dmu_tx_hold_t *txh; | |
1114 | ||
1115 | while ((txh = list_head(&tx->tx_holds)) != NULL) { | |
1116 | dnode_t *dn = txh->txh_dnode; | |
1117 | ||
1118 | list_remove(&tx->tx_holds, txh); | |
1119 | refcount_destroy_many(&txh->txh_space_towrite, | |
1120 | refcount_count(&txh->txh_space_towrite)); | |
f85c06be GM |
1121 | refcount_destroy_many(&txh->txh_memory_tohold, |
1122 | refcount_count(&txh->txh_memory_tohold)); | |
f85c06be GM |
1123 | kmem_free(txh, sizeof (dmu_tx_hold_t)); |
1124 | if (dn != NULL) | |
1125 | dnode_rele(dn, tx); | |
1126 | } | |
1127 | ||
1128 | list_destroy(&tx->tx_callbacks); | |
1129 | list_destroy(&tx->tx_holds); | |
f85c06be GM |
1130 | kmem_free(tx, sizeof (dmu_tx_t)); |
1131 | } | |
1132 | ||
34dc7c2f BB |
1133 | void |
1134 | dmu_tx_commit(dmu_tx_t *tx) | |
1135 | { | |
34dc7c2f BB |
1136 | ASSERT(tx->tx_txg != 0); |
1137 | ||
e49f1e20 WA |
1138 | /* |
1139 | * Go through the transaction's hold list and remove holds on | |
1140 | * associated dnodes, notifying waiters if no holds remain. | |
1141 | */ | |
1c27024e | 1142 | for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; |
f85c06be | 1143 | txh = list_next(&tx->tx_holds, txh)) { |
34dc7c2f BB |
1144 | dnode_t *dn = txh->txh_dnode; |
1145 | ||
34dc7c2f BB |
1146 | if (dn == NULL) |
1147 | continue; | |
f85c06be | 1148 | |
34dc7c2f BB |
1149 | mutex_enter(&dn->dn_mtx); |
1150 | ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); | |
1151 | ||
1152 | if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { | |
1153 | dn->dn_assigned_txg = 0; | |
1154 | cv_broadcast(&dn->dn_notxholds); | |
1155 | } | |
1156 | mutex_exit(&dn->dn_mtx); | |
34dc7c2f BB |
1157 | } |
1158 | ||
1159 | if (tx->tx_tempreserve_cookie) | |
1160 | dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); | |
1161 | ||
428870ff BB |
1162 | if (!list_is_empty(&tx->tx_callbacks)) |
1163 | txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); | |
1164 | ||
34dc7c2f BB |
1165 | if (tx->tx_anyobj == FALSE) |
1166 | txg_rele_to_sync(&tx->tx_txgh); | |
428870ff | 1167 | |
f85c06be | 1168 | dmu_tx_destroy(tx); |
34dc7c2f BB |
1169 | } |
1170 | ||
1171 | void | |
1172 | dmu_tx_abort(dmu_tx_t *tx) | |
1173 | { | |
34dc7c2f BB |
1174 | ASSERT(tx->tx_txg == 0); |
1175 | ||
428870ff BB |
1176 | /* |
1177 | * Call any registered callbacks with an error code. | |
1178 | */ | |
1179 | if (!list_is_empty(&tx->tx_callbacks)) | |
1180 | dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); | |
1181 | ||
f85c06be | 1182 | dmu_tx_destroy(tx); |
34dc7c2f BB |
1183 | } |
1184 | ||
1185 | uint64_t | |
1186 | dmu_tx_get_txg(dmu_tx_t *tx) | |
1187 | { | |
1188 | ASSERT(tx->tx_txg != 0); | |
1189 | return (tx->tx_txg); | |
1190 | } | |
428870ff | 1191 | |
13fe0198 MA |
1192 | dsl_pool_t * |
1193 | dmu_tx_pool(dmu_tx_t *tx) | |
1194 | { | |
1195 | ASSERT(tx->tx_pool != NULL); | |
1196 | return (tx->tx_pool); | |
1197 | } | |
1198 | ||
428870ff BB |
1199 | void |
1200 | dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) | |
1201 | { | |
1202 | dmu_tx_callback_t *dcb; | |
1203 | ||
79c76d5b | 1204 | dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); |
428870ff BB |
1205 | |
1206 | dcb->dcb_func = func; | |
1207 | dcb->dcb_data = data; | |
1208 | ||
1209 | list_insert_tail(&tx->tx_callbacks, dcb); | |
1210 | } | |
1211 | ||
1212 | /* | |
1213 | * Call all the commit callbacks on a list, with a given error code. | |
1214 | */ | |
1215 | void | |
1216 | dmu_tx_do_callbacks(list_t *cb_list, int error) | |
1217 | { | |
1218 | dmu_tx_callback_t *dcb; | |
1219 | ||
823d48bf | 1220 | while ((dcb = list_tail(cb_list)) != NULL) { |
428870ff BB |
1221 | list_remove(cb_list, dcb); |
1222 | dcb->dcb_func(dcb->dcb_data, error); | |
1223 | kmem_free(dcb, sizeof (dmu_tx_callback_t)); | |
1224 | } | |
1225 | } | |
1226 | ||
1227 | /* | |
1228 | * Interface to hold a bunch of attributes. | |
1229 | * used for creating new files. | |
1230 | * attrsize is the total size of all attributes | |
1231 | * to be added during object creation | |
1232 | * | |
1233 | * For updating/adding a single attribute dmu_tx_hold_sa() should be used. | |
1234 | */ | |
1235 | ||
1236 | /* | |
1237 | * hold necessary attribute name for attribute registration. | |
1238 | * should be a very rare case where this is needed. If it does | |
1239 | * happen it would only happen on the first write to the file system. | |
1240 | */ | |
1241 | static void | |
1242 | dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) | |
1243 | { | |
428870ff BB |
1244 | if (!sa->sa_need_attr_registration) |
1245 | return; | |
1246 | ||
3ec3bc21 | 1247 | for (int i = 0; i != sa->sa_num_attrs; i++) { |
428870ff BB |
1248 | if (!sa->sa_attr_table[i].sa_registered) { |
1249 | if (sa->sa_reg_attr_obj) | |
1250 | dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, | |
1251 | B_TRUE, sa->sa_attr_table[i].sa_name); | |
1252 | else | |
1253 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, | |
1254 | B_TRUE, sa->sa_attr_table[i].sa_name); | |
1255 | } | |
1256 | } | |
1257 | } | |
1258 | ||
428870ff BB |
1259 | void |
1260 | dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) | |
1261 | { | |
9631681b | 1262 | dmu_tx_hold_t *txh; |
428870ff | 1263 | |
9631681b BB |
1264 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, |
1265 | THT_SPILL, 0, 0); | |
1266 | if (txh != NULL) | |
1267 | (void) refcount_add_many(&txh->txh_space_towrite, | |
1268 | SPA_OLD_MAXBLOCKSIZE, FTAG); | |
428870ff BB |
1269 | } |
1270 | ||
1271 | void | |
1272 | dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) | |
1273 | { | |
1274 | sa_os_t *sa = tx->tx_objset->os_sa; | |
1275 | ||
1276 | dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); | |
1277 | ||
1278 | if (tx->tx_objset->os_sa->sa_master_obj == 0) | |
1279 | return; | |
1280 | ||
3ec3bc21 | 1281 | if (tx->tx_objset->os_sa->sa_layout_attr_obj) { |
428870ff | 1282 | dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); |
3ec3bc21 | 1283 | } else { |
428870ff BB |
1284 | dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); |
1285 | dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); | |
1286 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); | |
1287 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); | |
1288 | } | |
1289 | ||
1290 | dmu_tx_sa_registration_hold(sa, tx); | |
1291 | ||
50c957f7 | 1292 | if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill) |
428870ff BB |
1293 | return; |
1294 | ||
1295 | (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, | |
1296 | THT_SPILL, 0, 0); | |
1297 | } | |
1298 | ||
1299 | /* | |
1300 | * Hold SA attribute | |
1301 | * | |
1302 | * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) | |
1303 | * | |
1304 | * variable_size is the total size of all variable sized attributes | |
1305 | * passed to this function. It is not the total size of all | |
1306 | * variable size attributes that *may* exist on this object. | |
1307 | */ | |
1308 | void | |
1309 | dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) | |
1310 | { | |
1311 | uint64_t object; | |
1312 | sa_os_t *sa = tx->tx_objset->os_sa; | |
1313 | ||
1314 | ASSERT(hdl != NULL); | |
1315 | ||
1316 | object = sa_handle_object(hdl); | |
1317 | ||
1318 | dmu_tx_hold_bonus(tx, object); | |
1319 | ||
1320 | if (tx->tx_objset->os_sa->sa_master_obj == 0) | |
1321 | return; | |
1322 | ||
1323 | if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || | |
1324 | tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { | |
1325 | dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); | |
1326 | dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); | |
1327 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); | |
1328 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); | |
1329 | } | |
1330 | ||
1331 | dmu_tx_sa_registration_hold(sa, tx); | |
1332 | ||
1333 | if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) | |
1334 | dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); | |
1335 | ||
572e2857 | 1336 | if (sa->sa_force_spill || may_grow || hdl->sa_spill) { |
428870ff BB |
1337 | ASSERT(tx->tx_txg == 0); |
1338 | dmu_tx_hold_spill(tx, object); | |
572e2857 BB |
1339 | } else { |
1340 | dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; | |
1341 | dnode_t *dn; | |
1342 | ||
1343 | DB_DNODE_ENTER(db); | |
1344 | dn = DB_DNODE(db); | |
1345 | if (dn->dn_have_spill) { | |
1346 | ASSERT(tx->tx_txg == 0); | |
1347 | dmu_tx_hold_spill(tx, object); | |
1348 | } | |
1349 | DB_DNODE_EXIT(db); | |
428870ff BB |
1350 | } |
1351 | } | |
c28b2279 | 1352 | |
570827e1 BB |
1353 | void |
1354 | dmu_tx_init(void) | |
1355 | { | |
1356 | dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc", | |
1357 | KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t), | |
1358 | KSTAT_FLAG_VIRTUAL); | |
1359 | ||
1360 | if (dmu_tx_ksp != NULL) { | |
1361 | dmu_tx_ksp->ks_data = &dmu_tx_stats; | |
1362 | kstat_install(dmu_tx_ksp); | |
1363 | } | |
1364 | } | |
1365 | ||
1366 | void | |
1367 | dmu_tx_fini(void) | |
1368 | { | |
1369 | if (dmu_tx_ksp != NULL) { | |
1370 | kstat_delete(dmu_tx_ksp); | |
1371 | dmu_tx_ksp = NULL; | |
1372 | } | |
1373 | } | |
1374 | ||
c28b2279 BB |
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Transaction lifecycle. */
EXPORT_SYMBOL(dmu_tx_create);
EXPORT_SYMBOL(dmu_tx_assign);
EXPORT_SYMBOL(dmu_tx_wait);
EXPORT_SYMBOL(dmu_tx_commit);
EXPORT_SYMBOL(dmu_tx_abort);
EXPORT_SYMBOL(dmu_tx_get_txg);
EXPORT_SYMBOL(dmu_tx_mark_netfree);

/* Holds. */
EXPORT_SYMBOL(dmu_tx_hold_write);
EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_free);
EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_zap);
EXPORT_SYMBOL(dmu_tx_hold_zap_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_bonus);
EXPORT_SYMBOL(dmu_tx_hold_bonus_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_spill);
EXPORT_SYMBOL(dmu_tx_hold_sa_create);
EXPORT_SYMBOL(dmu_tx_hold_sa);

/* Commit callbacks. */
EXPORT_SYMBOL(dmu_tx_callback_register);
EXPORT_SYMBOL(dmu_tx_do_callbacks);
#endif