]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
22cd4a46 | 23 | * Copyright 2011 Nexenta Systems, Inc. All rights reserved. |
2bce8049 | 24 | * Copyright (c) 2012, 2016 by Delphix. All rights reserved. |
22cd4a46 | 25 | */ |
34dc7c2f | 26 | |
34dc7c2f BB |
27 | #include <sys/dmu.h> |
28 | #include <sys/dmu_impl.h> | |
29 | #include <sys/dbuf.h> | |
30 | #include <sys/dmu_tx.h> | |
31 | #include <sys/dmu_objset.h> | |
32 | #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */ | |
33 | #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */ | |
34 | #include <sys/dsl_pool.h> | |
35 | #include <sys/zap_impl.h> /* for fzap_default_block_shift */ | |
36 | #include <sys/spa.h> | |
428870ff BB |
37 | #include <sys/sa.h> |
38 | #include <sys/sa_impl.h> | |
34dc7c2f | 39 | #include <sys/zfs_context.h> |
428870ff | 40 | #include <sys/varargs.h> |
49ee64e5 | 41 | #include <sys/trace_dmu.h> |
34dc7c2f BB |
42 | |
/*
 * Signature shared by the per-hold-type worker functions that compute
 * space accounting for a transaction hold.
 */
typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);

/*
 * Kstat counters describing the outcome of transaction assignment
 * attempts (assigned, delayed, throttled, failed, ...).
 */
dmu_tx_stats_t dmu_tx_stats = {
	{ "dmu_tx_assigned",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_delay",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_error",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_suspended",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_group",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_memory_reserve",	KSTAT_DATA_UINT64 },
	{ "dmu_tx_memory_reclaim",	KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_throttle",	KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_delay",		KSTAT_DATA_UINT64 },
	{ "dmu_tx_dirty_over_max",	KSTAT_DATA_UINT64 },
	{ "dmu_tx_quota",		KSTAT_DATA_UINT64 },
};

/* Handle for the kstat publishing dmu_tx_stats (created elsewhere). */
static kstat_t *dmu_tx_ksp;
34dc7c2f BB |
61 | |
62 | dmu_tx_t * | |
63 | dmu_tx_create_dd(dsl_dir_t *dd) | |
64 | { | |
79c76d5b | 65 | dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); |
34dc7c2f | 66 | tx->tx_dir = dd; |
6f1ffb06 | 67 | if (dd != NULL) |
34dc7c2f BB |
68 | tx->tx_pool = dd->dd_pool; |
69 | list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), | |
70 | offsetof(dmu_tx_hold_t, txh_node)); | |
428870ff BB |
71 | list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), |
72 | offsetof(dmu_tx_callback_t, dcb_node)); | |
e8b96c60 | 73 | tx->tx_start = gethrtime(); |
1c5de20a | 74 | #ifdef DEBUG_DMU_TX |
34dc7c2f BB |
75 | refcount_create(&tx->tx_space_written); |
76 | refcount_create(&tx->tx_space_freed); | |
77 | #endif | |
78 | return (tx); | |
79 | } | |
80 | ||
81 | dmu_tx_t * | |
82 | dmu_tx_create(objset_t *os) | |
83 | { | |
428870ff | 84 | dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); |
34dc7c2f | 85 | tx->tx_objset = os; |
428870ff | 86 | tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset); |
34dc7c2f BB |
87 | return (tx); |
88 | } | |
89 | ||
90 | dmu_tx_t * | |
91 | dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) | |
92 | { | |
93 | dmu_tx_t *tx = dmu_tx_create_dd(NULL); | |
94 | ||
95 | ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); | |
96 | tx->tx_pool = dp; | |
97 | tx->tx_txg = txg; | |
98 | tx->tx_anyobj = TRUE; | |
99 | ||
100 | return (tx); | |
101 | } | |
102 | ||
103 | int | |
104 | dmu_tx_is_syncing(dmu_tx_t *tx) | |
105 | { | |
106 | return (tx->tx_anyobj); | |
107 | } | |
108 | ||
109 | int | |
110 | dmu_tx_private_ok(dmu_tx_t *tx) | |
111 | { | |
112 | return (tx->tx_anyobj); | |
113 | } | |
114 | ||
/*
 * Create a hold on the given dnode (NULL for DMU_NEW_OBJECT) and append
 * it to the transaction's hold list.  If the tx has already been
 * assigned a txg, additionally take a tx hold on the dnode and bind
 * dn_assigned_txg under dn_mtx.
 */
static dmu_tx_hold_t *
dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
    uint64_t arg1, uint64_t arg2)
{
	dmu_tx_hold_t *txh;

	if (dn != NULL) {
		(void) refcount_add(&dn->dn_holds, tx);
		if (tx->tx_txg != 0) {
			mutex_enter(&dn->dn_mtx);
			/*
			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
			 * problem, but there's no way for it to happen (for
			 * now, at least).
			 */
			ASSERT(dn->dn_assigned_txg == 0);
			dn->dn_assigned_txg = tx->tx_txg;
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
	}

	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
	txh->txh_tx = tx;
	txh->txh_dnode = dn;
	/* Per-hold space estimates, filled in by the dmu_tx_count_*() code. */
	refcount_create(&txh->txh_space_towrite);
	refcount_create(&txh->txh_space_tofree);
	refcount_create(&txh->txh_space_tooverwrite);
	refcount_create(&txh->txh_space_tounref);
	refcount_create(&txh->txh_memory_tohold);
	refcount_create(&txh->txh_fudge);
#ifdef DEBUG_DMU_TX
	/* Record what was held, for post-hoc verification in debug builds. */
	txh->txh_type = type;
	txh->txh_arg1 = arg1;
	txh->txh_arg2 = arg2;
#endif
	list_insert_tail(&tx->tx_holds, txh);

	return (txh);
}
155 | ||
0eef1bde | 156 | static dmu_tx_hold_t * |
157 | dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, | |
158 | enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) | |
159 | { | |
160 | dnode_t *dn = NULL; | |
161 | dmu_tx_hold_t *txh; | |
162 | int err; | |
163 | ||
164 | if (object != DMU_NEW_OBJECT) { | |
165 | err = dnode_hold(os, object, FTAG, &dn); | |
66eead53 | 166 | if (err != 0) { |
0eef1bde | 167 | tx->tx_err = err; |
168 | return (NULL); | |
169 | } | |
170 | } | |
171 | txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2); | |
172 | if (dn != NULL) | |
173 | dnode_rele(dn, FTAG); | |
174 | return (txh); | |
175 | } | |
176 | ||
34dc7c2f | 177 | void |
66eead53 | 178 | dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn) |
34dc7c2f BB |
179 | { |
180 | /* | |
181 | * If we're syncing, they can manipulate any object anyhow, and | |
182 | * the hold on the dnode_t can cause problems. | |
183 | */ | |
0eef1bde | 184 | if (!dmu_tx_is_syncing(tx)) |
185 | (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); | |
34dc7c2f BB |
186 | } |
187 | ||
188 | static int | |
189 | dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) | |
190 | { | |
191 | int err; | |
192 | dmu_buf_impl_t *db; | |
193 | ||
194 | rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
195 | db = dbuf_hold_level(dn, level, blkid, FTAG); | |
196 | rw_exit(&dn->dn_struct_rwlock); | |
197 | if (db == NULL) | |
2e528b49 | 198 | return (SET_ERROR(EIO)); |
34dc7c2f BB |
199 | err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); |
200 | dbuf_rele(db, FTAG); | |
201 | return (err); | |
202 | } | |
203 | ||
/*
 * Account for overwriting one block and, recursively, each indirect
 * block above it up to the dnode.  'history' remembers the last blkid
 * visited per level so shared indirects are only counted once per hold.
 * 'freeable' propagates upward: once a child is freeable, its parents
 * are treated as overwrites rather than new writes.
 */
static void
dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
    int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
{
	objset_t *os = dn->dn_objset;
	dsl_dataset_t *ds = os->os_dsl_dataset;
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	dmu_buf_impl_t *parent = NULL;
	blkptr_t *bp = NULL;
	uint64_t space;

	/* Stop above the top level, or if this twig was already counted. */
	if (level >= dn->dn_nlevels || history[level] == blkid)
		return;

	history[level] = blkid;

	space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);

	if (db == NULL || db == dn->dn_dbuf) {
		/* No dbuf for this twig (or we hit the dnode's own dbuf). */
		ASSERT(level != 0);
		db = NULL;
	} else {
		ASSERT(DB_DNODE(db) == dn);
		ASSERT(db->db_level == level);
		ASSERT(db->db.db_size == space);
		ASSERT(db->db_blkid == blkid);
		bp = db->db_blkptr;
		parent = db->db_parent;
	}

	freeable = (bp && (freeable ||
	    dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));

	if (freeable) {
		(void) refcount_add_many(&txh->txh_space_tooverwrite,
		    space, FTAG);
	} else {
		(void) refcount_add_many(&txh->txh_space_towrite,
		    space, FTAG);
	}

	/* The old block will be unreferenced by this rewrite. */
	if (bp) {
		(void) refcount_add_many(&txh->txh_space_tounref,
		    bp_get_dsize(os->os_spa, bp), FTAG);
	}

	/* Recurse to the parent indirect at the next level up. */
	dmu_tx_count_twig(txh, dn, parent, level + 1,
	    blkid >> epbs, freeable, history);
}
253 | ||
34dc7c2f BB |
/* ARGSUSED */
/*
 * Estimate the space consumed by writing [off, off+len) to this hold's
 * object, accumulating into txh_space_towrite / tooverwrite / tounref /
 * txh_fudge.  Also reads any partially-overwritten level-0 blocks and
 * the covering level-1 blocks so i/o errors surface before assignment.
 */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	uint64_t start, end, i;
	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
	int err = 0;
	int l;

	if (len == 0)
		return;

	/* Worst-case block/indirect shifts when no dnode is known yet. */
	min_bs = SPA_MINBLOCKSHIFT;
	max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
	min_ibs = DN_MIN_INDBLKSHIFT;
	max_ibs = DN_MAX_INDBLKSHIFT;

	if (dn) {
		uint64_t history[DN_MAX_LEVELS];
		int nlvls = dn->dn_nlevels;
		int delta;

		/*
		 * For i/o error checking, read the first and last level-0
		 * blocks (if they are not aligned), and all the level-1 blocks.
		 */
		if (dn->dn_maxblkid == 0) {
			/* Single-block object. */
			delta = dn->dn_datablksz;
			start = (off < dn->dn_datablksz) ? 0 : 1;
			end = (off+len <= dn->dn_datablksz) ? 0 : 1;
			if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
				/* Partial overwrite: must read block 0. */
				err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
				if (err)
					goto out;
				delta -= off;
			}
		} else {
			zio_t *zio = zio_root(dn->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);

			/* first level-0 block */
			start = off >> dn->dn_datablkshift;
			if (P2PHASE(off, dn->dn_datablksz) ||
			    len < dn->dn_datablksz) {
				err = dmu_tx_check_ioerr(zio, dn, 0, start);
				if (err)
					goto out;
			}

			/* last level-0 block */
			end = (off+len-1) >> dn->dn_datablkshift;
			if (end != start && end <= dn->dn_maxblkid &&
			    P2PHASE(off+len, dn->dn_datablksz)) {
				err = dmu_tx_check_ioerr(zio, dn, 0, end);
				if (err)
					goto out;
			}

			/* level-1 blocks strictly interior to [start, end] */
			if (nlvls > 1) {
				int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				for (i = (start>>shft)+1; i < end>>shft; i++) {
					err = dmu_tx_check_ioerr(zio, dn, 1, i);
					if (err)
						goto out;
				}
			}

			err = zio_wait(zio);
			if (err)
				goto out;
			delta = P2NPHASE(off, dn->dn_datablksz);
		}

		min_ibs = max_ibs = dn->dn_indblkshift;
		if (dn->dn_maxblkid > 0) {
			/*
			 * The blocksize can't change,
			 * so we can make a more precise estimate.
			 */
			ASSERT(dn->dn_datablkshift != 0);
			min_bs = max_bs = dn->dn_datablkshift;
		} else {
			/*
			 * The blocksize can increase up to the recordsize,
			 * or if it is already more than the recordsize,
			 * up to the next power of 2.
			 */
			min_bs = highbit64(dn->dn_datablksz - 1);
			max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
		}

		/*
		 * If this write is not off the end of the file
		 * we need to account for overwrites/unref.
		 */
		if (start <= dn->dn_maxblkid) {
			for (l = 0; l < DN_MAX_LEVELS; l++)
				history[l] = -1ULL;
		}
		while (start <= dn->dn_maxblkid) {
			dmu_buf_impl_t *db;

			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			err = dbuf_hold_impl(dn, 0, start,
			    FALSE, FALSE, FTAG, &db);
			rw_exit(&dn->dn_struct_rwlock);

			if (err) {
				txh->txh_tx->tx_err = err;
				return;
			}

			/* Count this block and its indirect twig as rewrites. */
			dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
			    history);
			dbuf_rele(db, FTAG);
			if (++start > end) {
				/*
				 * Account for new indirects appearing
				 * before this IO gets assigned into a txg.
				 */
				bits = 64 - min_bs;
				epbs = min_ibs - SPA_BLKPTRSHIFT;
				for (bits -= epbs * (nlvls - 1);
				    bits >= 0; bits -= epbs) {
					(void) refcount_add_many(
					    &txh->txh_fudge,
					    1ULL << max_ibs, FTAG);
				}
				goto out;
			}
			off += delta;
			if (len >= delta)
				len -= delta;
			delta = dn->dn_datablksz;
		}
	}

	/*
	 * 'end' is the last thing we will access, not one past.
	 * This way we won't overflow when accessing the last byte.
	 */
	start = P2ALIGN(off, 1ULL << max_bs);
	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
	(void) refcount_add_many(&txh->txh_space_towrite,
	    end - start + 1, FTAG);

	start >>= min_bs;
	end >>= min_bs;

	epbs = min_ibs - SPA_BLKPTRSHIFT;

	/*
	 * The object contains at most 2^(64 - min_bs) blocks,
	 * and each indirect level maps 2^epbs.
	 */
	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
		start >>= epbs;
		end >>= epbs;
		ASSERT3U(end, >=, start);
		(void) refcount_add_many(&txh->txh_space_towrite,
		    (end - start + 1) << max_ibs, FTAG);
		if (start != 0) {
			/*
			 * We also need a new blkid=0 indirect block
			 * to reference any existing file data.
			 */
			(void) refcount_add_many(&txh->txh_space_towrite,
			    1ULL << max_ibs, FTAG);
		}
	}

out:
	/* Refuse holds whose estimate exceeds the per-tx access cap. */
	if (refcount_count(&txh->txh_space_towrite) +
	    refcount_count(&txh->txh_space_tooverwrite) >
	    2 * DMU_MAX_ACCESS)
		err = SET_ERROR(EFBIG);

	if (err)
		txh->txh_tx->tx_err = err;
}
436 | ||
/*
 * Account for rewriting this hold's dnode in the meta-dnode: one
 * meta-dnode data block plus one indirect per level above it.  If the
 * existing dnode block is freeable, the write is an overwrite.
 */
static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
	dnode_t *dn = txh->txh_dnode;
	dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
	uint64_t space = mdn->dn_datablksz +
	    ((uint64_t)(mdn->dn_nlevels-1) << mdn->dn_indblkshift);

	if (dn && dn->dn_dbuf->db_blkptr &&
	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
	    dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
		/* Freeable existing block: overwrite, and unref the old copy. */
		(void) refcount_add_many(&txh->txh_space_tooverwrite,
		    space, FTAG);
		(void) refcount_add_many(&txh->txh_space_tounref, space, FTAG);
	} else {
		/* New write; still unrefs the old block if one exists. */
		(void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
		if (dn && dn->dn_dbuf->db_blkptr) {
			(void) refcount_add_many(&txh->txh_space_tounref,
			    space, FTAG);
		}
	}
}
459 | ||
460 | void | |
461 | dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) | |
462 | { | |
463 | dmu_tx_hold_t *txh; | |
464 | ||
66eead53 MA |
465 | ASSERT0(tx->tx_txg); |
466 | ASSERT3U(len, <=, DMU_MAX_ACCESS); | |
34dc7c2f BB |
467 | ASSERT(len == 0 || UINT64_MAX - off >= len - 1); |
468 | ||
469 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
470 | object, THT_WRITE, off, len); | |
66eead53 MA |
471 | if (txh != NULL) { |
472 | dmu_tx_count_write(txh, off, len); | |
473 | dmu_tx_count_dnode(txh); | |
474 | } | |
34dc7c2f BB |
475 | } |
476 | ||
0eef1bde | 477 | void |
478 | dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) | |
479 | { | |
480 | dmu_tx_hold_t *txh; | |
481 | ||
66eead53 MA |
482 | ASSERT0(tx->tx_txg); |
483 | ASSERT3U(len, <=, DMU_MAX_ACCESS); | |
0eef1bde | 484 | ASSERT(len == 0 || UINT64_MAX - off >= len - 1); |
485 | ||
486 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len); | |
66eead53 MA |
487 | if (txh != NULL) { |
488 | dmu_tx_count_write(txh, off, len); | |
489 | dmu_tx_count_dnode(txh); | |
490 | } | |
0eef1bde | 491 | } |
492 | ||
34dc7c2f BB |
493 | static void |
494 | dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) | |
495 | { | |
b128c09f BB |
496 | uint64_t blkid, nblks, lastblk; |
497 | uint64_t space = 0, unref = 0, skipped = 0; | |
34dc7c2f BB |
498 | dnode_t *dn = txh->txh_dnode; |
499 | dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; | |
500 | spa_t *spa = txh->txh_tx->tx_pool->dp_spa; | |
b128c09f | 501 | int epbs; |
ff80d9b1 | 502 | uint64_t l0span = 0, nl1blks = 0; |
34dc7c2f | 503 | |
b128c09f | 504 | if (dn->dn_nlevels == 0) |
34dc7c2f BB |
505 | return; |
506 | ||
507 | /* | |
b128c09f | 508 | * The struct_rwlock protects us against dn_nlevels |
34dc7c2f BB |
509 | * changing, in case (against all odds) we manage to dirty & |
510 | * sync out the changes after we check for being dirty. | |
428870ff | 511 | * Also, dbuf_hold_impl() wants us to have the struct_rwlock. |
34dc7c2f BB |
512 | */ |
513 | rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
b128c09f BB |
514 | epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; |
515 | if (dn->dn_maxblkid == 0) { | |
34dc7c2f BB |
516 | if (off == 0 && len >= dn->dn_datablksz) { |
517 | blkid = 0; | |
518 | nblks = 1; | |
519 | } else { | |
520 | rw_exit(&dn->dn_struct_rwlock); | |
521 | return; | |
522 | } | |
523 | } else { | |
524 | blkid = off >> dn->dn_datablkshift; | |
b128c09f | 525 | nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; |
34dc7c2f | 526 | |
383fc4a9 | 527 | if (blkid > dn->dn_maxblkid) { |
34dc7c2f BB |
528 | rw_exit(&dn->dn_struct_rwlock); |
529 | return; | |
530 | } | |
b128c09f | 531 | if (blkid + nblks > dn->dn_maxblkid) |
383fc4a9 | 532 | nblks = dn->dn_maxblkid - blkid + 1; |
34dc7c2f | 533 | |
34dc7c2f | 534 | } |
ff80d9b1 | 535 | l0span = nblks; /* save for later use to calc level > 1 overhead */ |
b128c09f | 536 | if (dn->dn_nlevels == 1) { |
34dc7c2f BB |
537 | int i; |
538 | for (i = 0; i < nblks; i++) { | |
539 | blkptr_t *bp = dn->dn_phys->dn_blkptr; | |
b128c09f | 540 | ASSERT3U(blkid + i, <, dn->dn_nblkptr); |
34dc7c2f | 541 | bp += blkid + i; |
428870ff | 542 | if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) { |
34dc7c2f | 543 | dprintf_bp(bp, "can free old%s", ""); |
428870ff | 544 | space += bp_get_dsize(spa, bp); |
34dc7c2f BB |
545 | } |
546 | unref += BP_GET_ASIZE(bp); | |
547 | } | |
ff80d9b1 | 548 | nl1blks = 1; |
34dc7c2f BB |
549 | nblks = 0; |
550 | } | |
551 | ||
b128c09f | 552 | lastblk = blkid + nblks - 1; |
34dc7c2f BB |
553 | while (nblks) { |
554 | dmu_buf_impl_t *dbuf; | |
b128c09f BB |
555 | uint64_t ibyte, new_blkid; |
556 | int epb = 1 << epbs; | |
557 | int err, i, blkoff, tochk; | |
558 | blkptr_t *bp; | |
559 | ||
560 | ibyte = blkid << dn->dn_datablkshift; | |
561 | err = dnode_next_offset(dn, | |
562 | DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); | |
563 | new_blkid = ibyte >> dn->dn_datablkshift; | |
564 | if (err == ESRCH) { | |
565 | skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; | |
566 | break; | |
567 | } | |
568 | if (err) { | |
569 | txh->txh_tx->tx_err = err; | |
570 | break; | |
571 | } | |
572 | if (new_blkid > lastblk) { | |
573 | skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; | |
574 | break; | |
575 | } | |
34dc7c2f | 576 | |
b128c09f BB |
577 | if (new_blkid > blkid) { |
578 | ASSERT((new_blkid >> epbs) > (blkid >> epbs)); | |
579 | skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1; | |
580 | nblks -= new_blkid - blkid; | |
581 | blkid = new_blkid; | |
582 | } | |
583 | blkoff = P2PHASE(blkid, epb); | |
584 | tochk = MIN(epb - blkoff, nblks); | |
34dc7c2f | 585 | |
fcff0f35 PD |
586 | err = dbuf_hold_impl(dn, 1, blkid >> epbs, |
587 | FALSE, FALSE, FTAG, &dbuf); | |
428870ff BB |
588 | if (err) { |
589 | txh->txh_tx->tx_err = err; | |
b128c09f | 590 | break; |
34dc7c2f | 591 | } |
428870ff | 592 | |
f85c06be GM |
593 | (void) refcount_add_many(&txh->txh_memory_tohold, |
594 | dbuf->db.db_size, FTAG); | |
428870ff BB |
595 | |
596 | /* | |
597 | * We don't check memory_tohold against DMU_MAX_ACCESS because | |
598 | * memory_tohold is an over-estimation (especially the >L1 | |
599 | * indirect blocks), so it could fail. Callers should have | |
600 | * already verified that they will not be holding too much | |
601 | * memory. | |
602 | */ | |
603 | ||
b128c09f BB |
604 | err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); |
605 | if (err != 0) { | |
34dc7c2f | 606 | txh->txh_tx->tx_err = err; |
b128c09f | 607 | dbuf_rele(dbuf, FTAG); |
34dc7c2f BB |
608 | break; |
609 | } | |
610 | ||
b128c09f BB |
611 | bp = dbuf->db.db_data; |
612 | bp += blkoff; | |
613 | ||
614 | for (i = 0; i < tochk; i++) { | |
428870ff BB |
615 | if (dsl_dataset_block_freeable(ds, &bp[i], |
616 | bp[i].blk_birth)) { | |
b128c09f | 617 | dprintf_bp(&bp[i], "can free old%s", ""); |
428870ff | 618 | space += bp_get_dsize(spa, &bp[i]); |
b128c09f BB |
619 | } |
620 | unref += BP_GET_ASIZE(bp); | |
621 | } | |
622 | dbuf_rele(dbuf, FTAG); | |
623 | ||
ff80d9b1 | 624 | ++nl1blks; |
34dc7c2f BB |
625 | blkid += tochk; |
626 | nblks -= tochk; | |
627 | } | |
628 | rw_exit(&dn->dn_struct_rwlock); | |
629 | ||
ff80d9b1 AJ |
630 | /* |
631 | * Add in memory requirements of higher-level indirects. | |
632 | * This assumes a worst-possible scenario for dn_nlevels and a | |
633 | * worst-possible distribution of l1-blocks over the region to free. | |
634 | */ | |
635 | { | |
636 | uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs); | |
637 | int level = 2; | |
638 | /* | |
639 | * Here we don't use DN_MAX_LEVEL, but calculate it with the | |
640 | * given datablkshift and indblkshift. This makes the | |
641 | * difference between 19 and 8 on large files. | |
642 | */ | |
643 | int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) / | |
644 | (dn->dn_indblkshift - SPA_BLKPTRSHIFT); | |
645 | ||
646 | while (level++ < maxlevel) { | |
f85c06be GM |
647 | (void) refcount_add_many(&txh->txh_memory_tohold, |
648 | MAX(MIN(blkcnt, nl1blks), 1) << dn->dn_indblkshift, | |
649 | FTAG); | |
ff80d9b1 AJ |
650 | blkcnt = 1 + (blkcnt >> epbs); |
651 | } | |
652 | } | |
653 | ||
b128c09f BB |
654 | /* account for new level 1 indirect blocks that might show up */ |
655 | if (skipped > 0) { | |
f85c06be GM |
656 | (void) refcount_add_many(&txh->txh_fudge, |
657 | skipped << dn->dn_indblkshift, FTAG); | |
b128c09f | 658 | skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); |
f85c06be GM |
659 | (void) refcount_add_many(&txh->txh_memory_tohold, |
660 | skipped << dn->dn_indblkshift, FTAG); | |
b128c09f | 661 | } |
f85c06be GM |
662 | (void) refcount_add_many(&txh->txh_space_tofree, space, FTAG); |
663 | (void) refcount_add_many(&txh->txh_space_tounref, unref, FTAG); | |
34dc7c2f BB |
664 | } |
665 | ||
19d55079 MA |
666 | /* |
667 | * This function marks the transaction as being a "net free". The end | |
668 | * result is that refquotas will be disabled for this transaction, and | |
669 | * this transaction will be able to use half of the pool space overhead | |
670 | * (see dsl_pool_adjustedsize()). Therefore this function should only | |
671 | * be called for transactions that we expect will not cause a net increase | |
672 | * in the amount of space used (but it's OK if that is occasionally not true). | |
673 | */ | |
674 | void | |
675 | dmu_tx_mark_netfree(dmu_tx_t *tx) | |
676 | { | |
677 | dmu_tx_hold_t *txh; | |
678 | ||
679 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
680 | DMU_NEW_OBJECT, THT_FREE, 0, 0); | |
681 | ||
682 | /* | |
683 | * Pretend that this operation will free 1GB of space. This | |
684 | * should be large enough to cancel out the largest write. | |
685 | * We don't want to use something like UINT64_MAX, because that would | |
686 | * cause overflows when doing math with these values (e.g. in | |
687 | * dmu_tx_try_assign()). | |
688 | */ | |
f85c06be GM |
689 | (void) refcount_add_many(&txh->txh_space_tofree, |
690 | 1024 * 1024 * 1024, FTAG); | |
691 | (void) refcount_add_many(&txh->txh_space_tounref, | |
692 | 1024 * 1024 * 1024, FTAG); | |
19d55079 MA |
693 | } |
694 | ||
0eef1bde | 695 | static void |
696 | dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) | |
34dc7c2f | 697 | { |
0eef1bde | 698 | dmu_tx_t *tx; |
34dc7c2f | 699 | dnode_t *dn; |
ea97f8ce | 700 | int err; |
34dc7c2f BB |
701 | zio_t *zio; |
702 | ||
0eef1bde | 703 | tx = txh->txh_tx; |
34dc7c2f BB |
704 | ASSERT(tx->tx_txg == 0); |
705 | ||
34dc7c2f | 706 | dn = txh->txh_dnode; |
e8b96c60 | 707 | dmu_tx_count_dnode(txh); |
34dc7c2f | 708 | |
34dc7c2f BB |
709 | if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) |
710 | return; | |
711 | if (len == DMU_OBJECT_END) | |
712 | len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; | |
713 | ||
ea97f8ce MA |
714 | dmu_tx_count_dnode(txh); |
715 | ||
34dc7c2f | 716 | /* |
ea97f8ce MA |
717 | * For i/o error checking, we read the first and last level-0 |
718 | * blocks if they are not aligned, and all the level-1 blocks. | |
719 | * | |
720 | * Note: dbuf_free_range() assumes that we have not instantiated | |
721 | * any level-0 dbufs that will be completely freed. Therefore we must | |
722 | * exercise care to not read or count the first and last blocks | |
723 | * if they are blocksize-aligned. | |
724 | */ | |
725 | if (dn->dn_datablkshift == 0) { | |
b663a23d | 726 | if (off != 0 || len < dn->dn_datablksz) |
92bc214c | 727 | dmu_tx_count_write(txh, 0, dn->dn_datablksz); |
ea97f8ce MA |
728 | } else { |
729 | /* first block will be modified if it is not aligned */ | |
730 | if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) | |
731 | dmu_tx_count_write(txh, off, 1); | |
732 | /* last block will be modified if it is not aligned */ | |
733 | if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) | |
734 | dmu_tx_count_write(txh, off+len, 1); | |
735 | } | |
736 | ||
737 | /* | |
738 | * Check level-1 blocks. | |
34dc7c2f BB |
739 | */ |
740 | if (dn->dn_nlevels > 1) { | |
ea97f8ce | 741 | int shift = dn->dn_datablkshift + dn->dn_indblkshift - |
34dc7c2f | 742 | SPA_BLKPTRSHIFT; |
ea97f8ce MA |
743 | uint64_t start = off >> shift; |
744 | uint64_t end = (off + len) >> shift; | |
745 | uint64_t i; | |
746 | ||
ea97f8ce | 747 | ASSERT(dn->dn_indblkshift != 0); |
34dc7c2f | 748 | |
2e7b7657 MA |
749 | /* |
750 | * dnode_reallocate() can result in an object with indirect | |
751 | * blocks having an odd data block size. In this case, | |
752 | * just check the single block. | |
753 | */ | |
754 | if (dn->dn_datablkshift == 0) | |
755 | start = end = 0; | |
756 | ||
34dc7c2f BB |
757 | zio = zio_root(tx->tx_pool->dp_spa, |
758 | NULL, NULL, ZIO_FLAG_CANFAIL); | |
759 | for (i = start; i <= end; i++) { | |
760 | uint64_t ibyte = i << shift; | |
b128c09f | 761 | err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); |
34dc7c2f | 762 | i = ibyte >> shift; |
4bda3bd0 | 763 | if (err == ESRCH || i > end) |
34dc7c2f BB |
764 | break; |
765 | if (err) { | |
766 | tx->tx_err = err; | |
767 | return; | |
768 | } | |
769 | ||
770 | err = dmu_tx_check_ioerr(zio, dn, 1, i); | |
771 | if (err) { | |
772 | tx->tx_err = err; | |
773 | return; | |
774 | } | |
775 | } | |
776 | err = zio_wait(zio); | |
777 | if (err) { | |
778 | tx->tx_err = err; | |
779 | return; | |
780 | } | |
781 | } | |
782 | ||
34dc7c2f BB |
783 | dmu_tx_count_free(txh, off, len); |
784 | } | |
785 | ||
786 | void | |
0eef1bde | 787 | dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) |
788 | { | |
789 | dmu_tx_hold_t *txh; | |
790 | ||
791 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
792 | object, THT_FREE, off, len); | |
66eead53 MA |
793 | if (txh != NULL) |
794 | (void) dmu_tx_hold_free_impl(txh, off, len); | |
0eef1bde | 795 | } |
796 | ||
797 | void | |
798 | dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) | |
34dc7c2f BB |
799 | { |
800 | dmu_tx_hold_t *txh; | |
0eef1bde | 801 | |
802 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); | |
66eead53 MA |
803 | if (txh != NULL) |
804 | (void) dmu_tx_hold_free_impl(txh, off, len); | |
0eef1bde | 805 | } |
806 | ||
807 | static void | |
808 | dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, int add, const char *name) | |
809 | { | |
810 | dmu_tx_t *tx = txh->txh_tx; | |
34dc7c2f | 811 | dnode_t *dn; |
f85c06be GM |
812 | int err; |
813 | int epbs; | |
d683ddbb | 814 | dsl_dataset_phys_t *ds_phys; |
f85c06be | 815 | int lvl; |
34dc7c2f BB |
816 | |
817 | ASSERT(tx->tx_txg == 0); | |
818 | ||
34dc7c2f BB |
819 | dn = txh->txh_dnode; |
820 | ||
821 | dmu_tx_count_dnode(txh); | |
822 | ||
823 | if (dn == NULL) { | |
824 | /* | |
825 | * We will be able to fit a new object's entries into one leaf | |
826 | * block. So there will be at most 2 blocks total, | |
827 | * including the header block. | |
828 | */ | |
f4bae2ed | 829 | dmu_tx_count_write(txh, 0, 2ULL << fzap_default_block_shift); |
34dc7c2f BB |
830 | return; |
831 | } | |
832 | ||
9ae529ec | 833 | ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); |
34dc7c2f BB |
834 | |
835 | if (dn->dn_maxblkid == 0 && !add) { | |
22cd4a46 AL |
836 | blkptr_t *bp; |
837 | ||
34dc7c2f BB |
838 | /* |
839 | * If there is only one block (i.e. this is a micro-zap) | |
840 | * and we are not adding anything, the accounting is simple. | |
841 | */ | |
842 | err = dmu_tx_check_ioerr(NULL, dn, 0, 0); | |
843 | if (err) { | |
844 | tx->tx_err = err; | |
845 | return; | |
846 | } | |
847 | ||
848 | /* | |
849 | * Use max block size here, since we don't know how much | |
850 | * the size will change between now and the dbuf dirty call. | |
851 | */ | |
22cd4a46 | 852 | bp = &dn->dn_phys->dn_blkptr[0]; |
34dc7c2f | 853 | if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, |
f85c06be GM |
854 | bp, bp->blk_birth)) { |
855 | (void) refcount_add_many(&txh->txh_space_tooverwrite, | |
856 | MZAP_MAX_BLKSZ, FTAG); | |
857 | } else { | |
858 | (void) refcount_add_many(&txh->txh_space_towrite, | |
859 | MZAP_MAX_BLKSZ, FTAG); | |
860 | } | |
861 | if (!BP_IS_HOLE(bp)) { | |
862 | (void) refcount_add_many(&txh->txh_space_tounref, | |
863 | MZAP_MAX_BLKSZ, FTAG); | |
864 | } | |
34dc7c2f BB |
865 | return; |
866 | } | |
867 | ||
868 | if (dn->dn_maxblkid > 0 && name) { | |
869 | /* | |
870 | * access the name in this fat-zap so that we'll check | |
871 | * for i/o errors to the leaf blocks, etc. | |
872 | */ | |
2bce8049 | 873 | err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); |
34dc7c2f BB |
874 | if (err == EIO) { |
875 | tx->tx_err = err; | |
876 | return; | |
877 | } | |
878 | } | |
879 | ||
2bce8049 | 880 | err = zap_count_write_by_dnode(dn, name, add, |
45d1cae3 | 881 | &txh->txh_space_towrite, &txh->txh_space_tooverwrite); |
34dc7c2f BB |
882 | |
883 | /* | |
884 | * If the modified blocks are scattered to the four winds, | |
f85c06be GM |
885 | * we'll have to modify an indirect twig for each. We can make |
886 | * modifications at up to 3 locations: | |
887 | * - header block at the beginning of the object | |
888 | * - target leaf block | |
889 | * - end of the object, where we might need to write: | |
890 | * - a new leaf block if the target block needs to be split | |
891 | * - the new pointer table, if it is growing | |
892 | * - the new cookie table, if it is growing | |
34dc7c2f BB |
893 | */ |
894 | epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; | |
f85c06be GM |
895 | ds_phys = |
896 | dsl_dataset_phys(dn->dn_objset->os_dsl_dataset); | |
897 | for (lvl = 1; lvl < dn->dn_nlevels; lvl++) { | |
898 | uint64_t num_indirects = 1 + (dn->dn_maxblkid >> (epbs * lvl)); | |
899 | uint64_t spc = MIN(3, num_indirects) << dn->dn_indblkshift; | |
900 | if (ds_phys->ds_prev_snap_obj != 0) { | |
901 | (void) refcount_add_many(&txh->txh_space_towrite, | |
902 | spc, FTAG); | |
903 | } else { | |
904 | (void) refcount_add_many(&txh->txh_space_tooverwrite, | |
905 | spc, FTAG); | |
906 | } | |
907 | } | |
34dc7c2f BB |
908 | } |
909 | ||
0eef1bde | 910 | void |
911 | dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) | |
912 | { | |
913 | dmu_tx_hold_t *txh; | |
914 | ||
66eead53 | 915 | ASSERT0(tx->tx_txg); |
0eef1bde | 916 | |
917 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
918 | object, THT_ZAP, add, (uintptr_t)name); | |
66eead53 MA |
919 | if (txh != NULL) |
920 | dmu_tx_hold_zap_impl(txh, add, name); | |
0eef1bde | 921 | } |
922 | ||
923 | void | |
924 | dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name) | |
925 | { | |
926 | dmu_tx_hold_t *txh; | |
927 | ||
66eead53 | 928 | ASSERT0(tx->tx_txg); |
0eef1bde | 929 | ASSERT(dn != NULL); |
930 | ||
931 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name); | |
66eead53 MA |
932 | if (txh != NULL) |
933 | dmu_tx_hold_zap_impl(txh, add, name); | |
0eef1bde | 934 | } |
935 | ||
34dc7c2f BB |
936 | void |
937 | dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) | |
938 | { | |
939 | dmu_tx_hold_t *txh; | |
940 | ||
941 | ASSERT(tx->tx_txg == 0); | |
942 | ||
943 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
944 | object, THT_BONUS, 0, 0); | |
945 | if (txh) | |
946 | dmu_tx_count_dnode(txh); | |
947 | } | |
948 | ||
0eef1bde | 949 | void |
950 | dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn) | |
951 | { | |
952 | dmu_tx_hold_t *txh; | |
953 | ||
66eead53 | 954 | ASSERT0(tx->tx_txg); |
0eef1bde | 955 | |
956 | txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0); | |
957 | if (txh) | |
958 | dmu_tx_count_dnode(txh); | |
959 | } | |
960 | ||
34dc7c2f BB |
961 | void |
962 | dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) | |
963 | { | |
964 | dmu_tx_hold_t *txh; | |
7d637211 | 965 | |
34dc7c2f BB |
966 | ASSERT(tx->tx_txg == 0); |
967 | ||
968 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, | |
969 | DMU_NEW_OBJECT, THT_SPACE, space, 0); | |
7d637211 | 970 | if (txh) |
f85c06be | 971 | (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); |
34dc7c2f BB |
972 | } |
973 | ||
974 | int | |
975 | dmu_tx_holds(dmu_tx_t *tx, uint64_t object) | |
976 | { | |
977 | dmu_tx_hold_t *txh; | |
978 | int holds = 0; | |
979 | ||
980 | /* | |
981 | * By asserting that the tx is assigned, we're counting the | |
982 | * number of dn_tx_holds, which is the same as the number of | |
983 | * dn_holds. Otherwise, we'd be counting dn_holds, but | |
984 | * dn_tx_holds could be 0. | |
985 | */ | |
986 | ASSERT(tx->tx_txg != 0); | |
987 | ||
988 | /* if (tx->tx_anyobj == TRUE) */ | |
989 | /* return (0); */ | |
990 | ||
991 | for (txh = list_head(&tx->tx_holds); txh; | |
992 | txh = list_next(&tx->tx_holds, txh)) { | |
993 | if (txh->txh_dnode && txh->txh_dnode->dn_object == object) | |
994 | holds++; | |
995 | } | |
996 | ||
997 | return (holds); | |
998 | } | |
999 | ||
1c5de20a | 1000 | #ifdef DEBUG_DMU_TX |
34dc7c2f BB |
1001 | void |
1002 | dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) | |
1003 | { | |
1004 | dmu_tx_hold_t *txh; | |
1005 | int match_object = FALSE, match_offset = FALSE; | |
572e2857 | 1006 | dnode_t *dn; |
34dc7c2f | 1007 | |
572e2857 BB |
1008 | DB_DNODE_ENTER(db); |
1009 | dn = DB_DNODE(db); | |
99ea23c5 | 1010 | ASSERT(dn != NULL); |
34dc7c2f | 1011 | ASSERT(tx->tx_txg != 0); |
428870ff | 1012 | ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); |
34dc7c2f BB |
1013 | ASSERT3U(dn->dn_object, ==, db->db.db_object); |
1014 | ||
572e2857 BB |
1015 | if (tx->tx_anyobj) { |
1016 | DB_DNODE_EXIT(db); | |
34dc7c2f | 1017 | return; |
572e2857 | 1018 | } |
34dc7c2f BB |
1019 | |
1020 | /* XXX No checking on the meta dnode for now */ | |
572e2857 BB |
1021 | if (db->db.db_object == DMU_META_DNODE_OBJECT) { |
1022 | DB_DNODE_EXIT(db); | |
34dc7c2f | 1023 | return; |
572e2857 | 1024 | } |
34dc7c2f BB |
1025 | |
1026 | for (txh = list_head(&tx->tx_holds); txh; | |
1027 | txh = list_next(&tx->tx_holds, txh)) { | |
99ea23c5 | 1028 | ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); |
34dc7c2f BB |
1029 | if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) |
1030 | match_object = TRUE; | |
1031 | if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { | |
1032 | int datablkshift = dn->dn_datablkshift ? | |
1033 | dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; | |
1034 | int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; | |
1035 | int shift = datablkshift + epbs * db->db_level; | |
1036 | uint64_t beginblk = shift >= 64 ? 0 : | |
1037 | (txh->txh_arg1 >> shift); | |
1038 | uint64_t endblk = shift >= 64 ? 0 : | |
1039 | ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); | |
1040 | uint64_t blkid = db->db_blkid; | |
1041 | ||
1042 | /* XXX txh_arg2 better not be zero... */ | |
1043 | ||
1044 | dprintf("found txh type %x beginblk=%llx endblk=%llx\n", | |
1045 | txh->txh_type, beginblk, endblk); | |
1046 | ||
1047 | switch (txh->txh_type) { | |
1048 | case THT_WRITE: | |
1049 | if (blkid >= beginblk && blkid <= endblk) | |
1050 | match_offset = TRUE; | |
1051 | /* | |
1052 | * We will let this hold work for the bonus | |
428870ff BB |
1053 | * or spill buffer so that we don't need to |
1054 | * hold it when creating a new object. | |
34dc7c2f | 1055 | */ |
428870ff BB |
1056 | if (blkid == DMU_BONUS_BLKID || |
1057 | blkid == DMU_SPILL_BLKID) | |
34dc7c2f BB |
1058 | match_offset = TRUE; |
1059 | /* | |
1060 | * They might have to increase nlevels, | |
1061 | * thus dirtying the new TLIBs. Or the | |
1062 | * might have to change the block size, | |
1063 | * thus dirying the new lvl=0 blk=0. | |
1064 | */ | |
1065 | if (blkid == 0) | |
1066 | match_offset = TRUE; | |
1067 | break; | |
1068 | case THT_FREE: | |
b128c09f BB |
1069 | /* |
1070 | * We will dirty all the level 1 blocks in | |
1071 | * the free range and perhaps the first and | |
1072 | * last level 0 block. | |
1073 | */ | |
1074 | if (blkid >= beginblk && (blkid <= endblk || | |
1075 | txh->txh_arg2 == DMU_OBJECT_END)) | |
34dc7c2f BB |
1076 | match_offset = TRUE; |
1077 | break; | |
428870ff BB |
1078 | case THT_SPILL: |
1079 | if (blkid == DMU_SPILL_BLKID) | |
1080 | match_offset = TRUE; | |
1081 | break; | |
34dc7c2f | 1082 | case THT_BONUS: |
428870ff | 1083 | if (blkid == DMU_BONUS_BLKID) |
34dc7c2f BB |
1084 | match_offset = TRUE; |
1085 | break; | |
1086 | case THT_ZAP: | |
1087 | match_offset = TRUE; | |
1088 | break; | |
1089 | case THT_NEWOBJECT: | |
1090 | match_object = TRUE; | |
1091 | break; | |
1092 | default: | |
989fd514 BB |
1093 | cmn_err(CE_PANIC, "bad txh_type %d", |
1094 | txh->txh_type); | |
34dc7c2f BB |
1095 | } |
1096 | } | |
572e2857 BB |
1097 | if (match_object && match_offset) { |
1098 | DB_DNODE_EXIT(db); | |
34dc7c2f | 1099 | return; |
572e2857 | 1100 | } |
34dc7c2f | 1101 | } |
572e2857 | 1102 | DB_DNODE_EXIT(db); |
34dc7c2f BB |
1103 | panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", |
1104 | (u_longlong_t)db->db.db_object, db->db_level, | |
1105 | (u_longlong_t)db->db_blkid); | |
1106 | } | |
1107 | #endif | |
1108 | ||
e8b96c60 MA |
1109 | /* |
1110 | * If we can't do 10 iops, something is wrong. Let us go ahead | |
1111 | * and hit zfs_dirty_data_max. | |
1112 | */ | |
1113 | hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ | |
1114 | int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ | |
1115 | ||
1116 | /* | |
1117 | * We delay transactions when we've determined that the backend storage | |
1118 | * isn't able to accommodate the rate of incoming writes. | |
1119 | * | |
1120 | * If there is already a transaction waiting, we delay relative to when | |
1121 | * that transaction finishes waiting. This way the calculated min_time | |
1122 | * is independent of the number of threads concurrently executing | |
1123 | * transactions. | |
1124 | * | |
1125 | * If we are the only waiter, wait relative to when the transaction | |
1126 | * started, rather than the current time. This credits the transaction for | |
1127 | * "time already served", e.g. reading indirect blocks. | |
1128 | * | |
1129 | * The minimum time for a transaction to take is calculated as: | |
1130 | * min_time = scale * (dirty - min) / (max - dirty) | |
1131 | * min_time is then capped at zfs_delay_max_ns. | |
1132 | * | |
1133 | * The delay has two degrees of freedom that can be adjusted via tunables. | |
1134 | * The percentage of dirty data at which we start to delay is defined by | |
1135 | * zfs_delay_min_dirty_percent. This should typically be at or above | |
1136 | * zfs_vdev_async_write_active_max_dirty_percent so that we only start to | |
1137 | * delay after writing at full speed has failed to keep up with the incoming | |
1138 | * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly | |
1139 | * speaking, this variable determines the amount of delay at the midpoint of | |
1140 | * the curve. | |
1141 | * | |
1142 | * delay | |
1143 | * 10ms +-------------------------------------------------------------*+ | |
1144 | * | *| | |
1145 | * 9ms + *+ | |
1146 | * | *| | |
1147 | * 8ms + *+ | |
1148 | * | * | | |
1149 | * 7ms + * + | |
1150 | * | * | | |
1151 | * 6ms + * + | |
1152 | * | * | | |
1153 | * 5ms + * + | |
1154 | * | * | | |
1155 | * 4ms + * + | |
1156 | * | * | | |
1157 | * 3ms + * + | |
1158 | * | * | | |
1159 | * 2ms + (midpoint) * + | |
1160 | * | | ** | | |
1161 | * 1ms + v *** + | |
1162 | * | zfs_delay_scale ----------> ******** | | |
1163 | * 0 +-------------------------------------*********----------------+ | |
1164 | * 0% <- zfs_dirty_data_max -> 100% | |
1165 | * | |
1166 | * Note that since the delay is added to the outstanding time remaining on the | |
1167 | * most recent transaction, the delay is effectively the inverse of IOPS. | |
1168 | * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve | |
1169 | * was chosen such that small changes in the amount of accumulated dirty data | |
1170 | * in the first 3/4 of the curve yield relatively small differences in the | |
1171 | * amount of delay. | |
1172 | * | |
1173 | * The effects can be easier to understand when the amount of delay is | |
1174 | * represented on a log scale: | |
1175 | * | |
1176 | * delay | |
1177 | * 100ms +-------------------------------------------------------------++ | |
1178 | * + + | |
1179 | * | | | |
1180 | * + *+ | |
1181 | * 10ms + *+ | |
1182 | * + ** + | |
1183 | * | (midpoint) ** | | |
1184 | * + | ** + | |
1185 | * 1ms + v **** + | |
1186 | * + zfs_delay_scale ----------> ***** + | |
1187 | * | **** | | |
1188 | * + **** + | |
1189 | * 100us + ** + | |
1190 | * + * + | |
1191 | * | * | | |
1192 | * + * + | |
1193 | * 10us + * + | |
1194 | * + + | |
1195 | * | | | |
1196 | * + + | |
1197 | * +--------------------------------------------------------------+ | |
1198 | * 0% <- zfs_dirty_data_max -> 100% | |
1199 | * | |
1200 | * Note here that only as the amount of dirty data approaches its limit does | |
1201 | * the delay start to increase rapidly. The goal of a properly tuned system | |
1202 | * should be to keep the amount of dirty data out of that range by first | |
1203 | * ensuring that the appropriate limits are set for the I/O scheduler to reach | |
1204 | * optimal throughput on the backend storage, and then by changing the value | |
1205 | * of zfs_delay_scale to increase the steepness of the curve. | |
1206 | */ | |
1207 | static void | |
1208 | dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) | |
1209 | { | |
1210 | dsl_pool_t *dp = tx->tx_pool; | |
1211 | uint64_t delay_min_bytes = | |
1212 | zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; | |
1213 | hrtime_t wakeup, min_tx_time, now; | |
1214 | ||
1215 | if (dirty <= delay_min_bytes) | |
1216 | return; | |
1217 | ||
1218 | /* | |
1219 | * The caller has already waited until we are under the max. | |
1220 | * We make them pass us the amount of dirty data so we don't | |
1221 | * have to handle the case of it being >= the max, which could | |
1222 | * cause a divide-by-zero if it's == the max. | |
1223 | */ | |
1224 | ASSERT3U(dirty, <, zfs_dirty_data_max); | |
1225 | ||
1226 | now = gethrtime(); | |
1227 | min_tx_time = zfs_delay_scale * | |
1228 | (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); | |
1229 | min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); | |
1230 | if (now > tx->tx_start + min_tx_time) | |
1231 | return; | |
1232 | ||
1233 | DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, | |
1234 | uint64_t, min_tx_time); | |
1235 | ||
1236 | mutex_enter(&dp->dp_lock); | |
1237 | wakeup = MAX(tx->tx_start + min_tx_time, | |
1238 | dp->dp_last_wakeup + min_tx_time); | |
1239 | dp->dp_last_wakeup = wakeup; | |
1240 | mutex_exit(&dp->dp_lock); | |
1241 | ||
1242 | zfs_sleep_until(wakeup); | |
1243 | } | |
1244 | ||
34dc7c2f | 1245 | static int |
13fe0198 | 1246 | dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) |
34dc7c2f BB |
1247 | { |
1248 | dmu_tx_hold_t *txh; | |
1249 | spa_t *spa = tx->tx_pool->dp_spa; | |
b128c09f BB |
1250 | uint64_t memory, asize, fsize, usize; |
1251 | uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge; | |
34dc7c2f | 1252 | |
c99c9001 | 1253 | ASSERT0(tx->tx_txg); |
34dc7c2f | 1254 | |
570827e1 BB |
1255 | if (tx->tx_err) { |
1256 | DMU_TX_STAT_BUMP(dmu_tx_error); | |
34dc7c2f | 1257 | return (tx->tx_err); |
570827e1 | 1258 | } |
34dc7c2f | 1259 | |
b128c09f | 1260 | if (spa_suspended(spa)) { |
570827e1 BB |
1261 | DMU_TX_STAT_BUMP(dmu_tx_suspended); |
1262 | ||
34dc7c2f BB |
1263 | /* |
1264 | * If the user has indicated a blocking failure mode | |
1265 | * then return ERESTART which will block in dmu_tx_wait(). | |
1266 | * Otherwise, return EIO so that an error can get | |
1267 | * propagated back to the VOP calls. | |
1268 | * | |
1269 | * Note that we always honor the txg_how flag regardless | |
1270 | * of the failuremode setting. | |
1271 | */ | |
1272 | if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && | |
1273 | txg_how != TXG_WAIT) | |
2e528b49 | 1274 | return (SET_ERROR(EIO)); |
34dc7c2f | 1275 | |
2e528b49 | 1276 | return (SET_ERROR(ERESTART)); |
34dc7c2f BB |
1277 | } |
1278 | ||
e8b96c60 MA |
1279 | if (!tx->tx_waited && |
1280 | dsl_pool_need_dirty_delay(tx->tx_pool)) { | |
1281 | tx->tx_wait_dirty = B_TRUE; | |
1282 | DMU_TX_STAT_BUMP(dmu_tx_dirty_delay); | |
1283 | return (ERESTART); | |
1284 | } | |
1285 | ||
34dc7c2f BB |
1286 | tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); |
1287 | tx->tx_needassign_txh = NULL; | |
1288 | ||
1289 | /* | |
1290 | * NB: No error returns are allowed after txg_hold_open, but | |
1291 | * before processing the dnode holds, due to the | |
1292 | * dmu_tx_unassign() logic. | |
1293 | */ | |
1294 | ||
b128c09f | 1295 | towrite = tofree = tooverwrite = tounref = tohold = fudge = 0; |
34dc7c2f BB |
1296 | for (txh = list_head(&tx->tx_holds); txh; |
1297 | txh = list_next(&tx->tx_holds, txh)) { | |
1298 | dnode_t *dn = txh->txh_dnode; | |
1299 | if (dn != NULL) { | |
1300 | mutex_enter(&dn->dn_mtx); | |
1301 | if (dn->dn_assigned_txg == tx->tx_txg - 1) { | |
1302 | mutex_exit(&dn->dn_mtx); | |
1303 | tx->tx_needassign_txh = txh; | |
570827e1 | 1304 | DMU_TX_STAT_BUMP(dmu_tx_group); |
2e528b49 | 1305 | return (SET_ERROR(ERESTART)); |
34dc7c2f BB |
1306 | } |
1307 | if (dn->dn_assigned_txg == 0) | |
1308 | dn->dn_assigned_txg = tx->tx_txg; | |
1309 | ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); | |
1310 | (void) refcount_add(&dn->dn_tx_holds, tx); | |
1311 | mutex_exit(&dn->dn_mtx); | |
1312 | } | |
f85c06be GM |
1313 | towrite += refcount_count(&txh->txh_space_towrite); |
1314 | tofree += refcount_count(&txh->txh_space_tofree); | |
1315 | tooverwrite += refcount_count(&txh->txh_space_tooverwrite); | |
1316 | tounref += refcount_count(&txh->txh_space_tounref); | |
1317 | tohold += refcount_count(&txh->txh_memory_tohold); | |
1318 | fudge += refcount_count(&txh->txh_fudge); | |
34dc7c2f BB |
1319 | } |
1320 | ||
34dc7c2f BB |
1321 | /* |
1322 | * If a snapshot has been taken since we made our estimates, | |
1323 | * assume that we won't be able to free or overwrite anything. | |
1324 | */ | |
1325 | if (tx->tx_objset && | |
428870ff | 1326 | dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) > |
34dc7c2f BB |
1327 | tx->tx_lastsnap_txg) { |
1328 | towrite += tooverwrite; | |
1329 | tooverwrite = tofree = 0; | |
1330 | } | |
1331 | ||
b128c09f BB |
1332 | /* needed allocation: worst-case estimate of write space */ |
1333 | asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite); | |
1334 | /* freed space estimate: worst-case overwrite + free estimate */ | |
34dc7c2f | 1335 | fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; |
b128c09f | 1336 | /* convert unrefd space to worst-case estimate */ |
34dc7c2f | 1337 | usize = spa_get_asize(tx->tx_pool->dp_spa, tounref); |
b128c09f BB |
1338 | /* calculate memory footprint estimate */ |
1339 | memory = towrite + tooverwrite + tohold; | |
34dc7c2f | 1340 | |
1c5de20a | 1341 | #ifdef DEBUG_DMU_TX |
b128c09f BB |
1342 | /* |
1343 | * Add in 'tohold' to account for our dirty holds on this memory | |
1344 | * XXX - the "fudge" factor is to account for skipped blocks that | |
1345 | * we missed because dnode_next_offset() misses in-core-only blocks. | |
1346 | */ | |
1347 | tx->tx_space_towrite = asize + | |
1348 | spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge); | |
34dc7c2f BB |
1349 | tx->tx_space_tofree = tofree; |
1350 | tx->tx_space_tooverwrite = tooverwrite; | |
1351 | tx->tx_space_tounref = tounref; | |
1352 | #endif | |
1353 | ||
1354 | if (tx->tx_dir && asize != 0) { | |
b128c09f BB |
1355 | int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, |
1356 | asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); | |
34dc7c2f BB |
1357 | if (err) |
1358 | return (err); | |
1359 | } | |
1360 | ||
570827e1 BB |
1361 | DMU_TX_STAT_BUMP(dmu_tx_assigned); |
1362 | ||
34dc7c2f BB |
1363 | return (0); |
1364 | } | |
1365 | ||
1366 | static void | |
1367 | dmu_tx_unassign(dmu_tx_t *tx) | |
1368 | { | |
1369 | dmu_tx_hold_t *txh; | |
1370 | ||
1371 | if (tx->tx_txg == 0) | |
1372 | return; | |
1373 | ||
1374 | txg_rele_to_quiesce(&tx->tx_txgh); | |
1375 | ||
e49f1e20 WA |
1376 | /* |
1377 | * Walk the transaction's hold list, removing the hold on the | |
1378 | * associated dnode, and notifying waiters if the refcount drops to 0. | |
1379 | */ | |
981b2126 | 1380 | for (txh = list_head(&tx->tx_holds); |
1381 | txh && txh != tx->tx_needassign_txh; | |
34dc7c2f BB |
1382 | txh = list_next(&tx->tx_holds, txh)) { |
1383 | dnode_t *dn = txh->txh_dnode; | |
1384 | ||
1385 | if (dn == NULL) | |
1386 | continue; | |
1387 | mutex_enter(&dn->dn_mtx); | |
1388 | ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); | |
1389 | ||
1390 | if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { | |
1391 | dn->dn_assigned_txg = 0; | |
1392 | cv_broadcast(&dn->dn_notxholds); | |
1393 | } | |
1394 | mutex_exit(&dn->dn_mtx); | |
1395 | } | |
1396 | ||
1397 | txg_rele_to_sync(&tx->tx_txgh); | |
1398 | ||
1399 | tx->tx_lasttried_txg = tx->tx_txg; | |
1400 | tx->tx_txg = 0; | |
1401 | } | |
1402 | ||
1403 | /* | |
1404 | * Assign tx to a transaction group. txg_how can be one of: | |
1405 | * | |
1406 | * (1) TXG_WAIT. If the current open txg is full, waits until there's | |
1407 | * a new one. This should be used when you're not holding locks. | |
13fe0198 | 1408 | * It will only fail if we're truly out of space (or over quota). |
34dc7c2f BB |
1409 | * |
1410 | * (2) TXG_NOWAIT. If we can't assign into the current open txg without | |
1411 | * blocking, returns immediately with ERESTART. This should be used | |
1412 | * whenever you're holding locks. On an ERESTART error, the caller | |
1413 | * should drop locks, do a dmu_tx_wait(tx), and try again. | |
e8b96c60 MA |
1414 | * |
1415 | * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait() | |
1416 | * has already been called on behalf of this operation (though | |
1417 | * most likely on a different tx). | |
34dc7c2f BB |
1418 | */ |
1419 | int | |
13fe0198 | 1420 | dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) |
34dc7c2f BB |
1421 | { |
1422 | int err; | |
1423 | ||
1424 | ASSERT(tx->tx_txg == 0); | |
e8b96c60 MA |
1425 | ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT || |
1426 | txg_how == TXG_WAITED); | |
34dc7c2f BB |
1427 | ASSERT(!dsl_pool_sync_context(tx->tx_pool)); |
1428 | ||
e8b96c60 MA |
1429 | if (txg_how == TXG_WAITED) |
1430 | tx->tx_waited = B_TRUE; | |
1431 | ||
13fe0198 MA |
1432 | /* If we might wait, we must not hold the config lock. */ |
1433 | ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool)); | |
1434 | ||
34dc7c2f BB |
1435 | while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { |
1436 | dmu_tx_unassign(tx); | |
1437 | ||
1438 | if (err != ERESTART || txg_how != TXG_WAIT) | |
1439 | return (err); | |
1440 | ||
1441 | dmu_tx_wait(tx); | |
1442 | } | |
1443 | ||
1444 | txg_rele_to_quiesce(&tx->tx_txgh); | |
1445 | ||
1446 | return (0); | |
1447 | } | |
1448 | ||
1449 | void | |
1450 | dmu_tx_wait(dmu_tx_t *tx) | |
1451 | { | |
1452 | spa_t *spa = tx->tx_pool->dp_spa; | |
e8b96c60 | 1453 | dsl_pool_t *dp = tx->tx_pool; |
a77c4c83 | 1454 | hrtime_t before; |
34dc7c2f BB |
1455 | |
1456 | ASSERT(tx->tx_txg == 0); | |
13fe0198 | 1457 | ASSERT(!dsl_pool_config_held(tx->tx_pool)); |
34dc7c2f | 1458 | |
a77c4c83 NB |
1459 | before = gethrtime(); |
1460 | ||
e8b96c60 MA |
1461 | if (tx->tx_wait_dirty) { |
1462 | uint64_t dirty; | |
1463 | ||
1464 | /* | |
1465 | * dmu_tx_try_assign() has determined that we need to wait | |
1466 | * because we've consumed much or all of the dirty buffer | |
1467 | * space. | |
1468 | */ | |
1469 | mutex_enter(&dp->dp_lock); | |
1470 | if (dp->dp_dirty_total >= zfs_dirty_data_max) | |
1471 | DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max); | |
1472 | while (dp->dp_dirty_total >= zfs_dirty_data_max) | |
1473 | cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); | |
1474 | dirty = dp->dp_dirty_total; | |
1475 | mutex_exit(&dp->dp_lock); | |
1476 | ||
1477 | dmu_tx_delay(tx, dirty); | |
1478 | ||
1479 | tx->tx_wait_dirty = B_FALSE; | |
1480 | ||
1481 | /* | |
1482 | * Note: setting tx_waited only has effect if the caller | |
1483 | * used TX_WAIT. Otherwise they are going to destroy | |
1484 | * this tx and try again. The common case, zfs_write(), | |
1485 | * uses TX_WAIT. | |
1486 | */ | |
1487 | tx->tx_waited = B_TRUE; | |
1488 | } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { | |
1489 | /* | |
1490 | * If the pool is suspended we need to wait until it | |
1491 | * is resumed. Note that it's possible that the pool | |
1492 | * has become active after this thread has tried to | |
1493 | * obtain a tx. If that's the case then tx_lasttried_txg | |
1494 | * would not have been set. | |
1495 | */ | |
1496 | txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); | |
34dc7c2f BB |
1497 | } else if (tx->tx_needassign_txh) { |
1498 | dnode_t *dn = tx->tx_needassign_txh->txh_dnode; | |
1499 | ||
1500 | mutex_enter(&dn->dn_mtx); | |
1501 | while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) | |
1502 | cv_wait(&dn->dn_notxholds, &dn->dn_mtx); | |
1503 | mutex_exit(&dn->dn_mtx); | |
1504 | tx->tx_needassign_txh = NULL; | |
1505 | } else { | |
e8b96c60 MA |
1506 | /* |
1507 | * A dnode is assigned to the quiescing txg. Wait for its | |
1508 | * transaction to complete. | |
1509 | */ | |
34dc7c2f BB |
1510 | txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); |
1511 | } | |
a77c4c83 NB |
1512 | |
1513 | spa_tx_assign_add_nsecs(spa, gethrtime() - before); | |
34dc7c2f BB |
1514 | } |
1515 | ||
1516 | void | |
1517 | dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta) | |
1518 | { | |
1c5de20a | 1519 | #ifdef DEBUG_DMU_TX |
34dc7c2f BB |
1520 | if (tx->tx_dir == NULL || delta == 0) |
1521 | return; | |
1522 | ||
1523 | if (delta > 0) { | |
1524 | ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=, | |
1525 | tx->tx_space_towrite); | |
1526 | (void) refcount_add_many(&tx->tx_space_written, delta, NULL); | |
1527 | } else { | |
1528 | (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL); | |
1529 | } | |
1530 | #endif | |
1531 | } | |
1532 | ||
f85c06be GM |
1533 | static void |
1534 | dmu_tx_destroy(dmu_tx_t *tx) | |
1535 | { | |
1536 | dmu_tx_hold_t *txh; | |
1537 | ||
1538 | while ((txh = list_head(&tx->tx_holds)) != NULL) { | |
1539 | dnode_t *dn = txh->txh_dnode; | |
1540 | ||
1541 | list_remove(&tx->tx_holds, txh); | |
1542 | refcount_destroy_many(&txh->txh_space_towrite, | |
1543 | refcount_count(&txh->txh_space_towrite)); | |
1544 | refcount_destroy_many(&txh->txh_space_tofree, | |
1545 | refcount_count(&txh->txh_space_tofree)); | |
1546 | refcount_destroy_many(&txh->txh_space_tooverwrite, | |
1547 | refcount_count(&txh->txh_space_tooverwrite)); | |
1548 | refcount_destroy_many(&txh->txh_space_tounref, | |
1549 | refcount_count(&txh->txh_space_tounref)); | |
1550 | refcount_destroy_many(&txh->txh_memory_tohold, | |
1551 | refcount_count(&txh->txh_memory_tohold)); | |
1552 | refcount_destroy_many(&txh->txh_fudge, | |
1553 | refcount_count(&txh->txh_fudge)); | |
1554 | kmem_free(txh, sizeof (dmu_tx_hold_t)); | |
1555 | if (dn != NULL) | |
1556 | dnode_rele(dn, tx); | |
1557 | } | |
1558 | ||
1559 | list_destroy(&tx->tx_callbacks); | |
1560 | list_destroy(&tx->tx_holds); | |
1561 | #ifdef DEBUG_DMU_TX | |
1562 | refcount_destroy_many(&tx->tx_space_written, | |
1563 | refcount_count(&tx->tx_space_written)); | |
1564 | refcount_destroy_many(&tx->tx_space_freed, | |
1565 | refcount_count(&tx->tx_space_freed)); | |
1566 | #endif | |
1567 | kmem_free(tx, sizeof (dmu_tx_t)); | |
1568 | } | |
1569 | ||
34dc7c2f BB |
1570 | void |
1571 | dmu_tx_commit(dmu_tx_t *tx) | |
1572 | { | |
1573 | dmu_tx_hold_t *txh; | |
1574 | ||
1575 | ASSERT(tx->tx_txg != 0); | |
1576 | ||
e49f1e20 WA |
1577 | /* |
1578 | * Go through the transaction's hold list and remove holds on | |
1579 | * associated dnodes, notifying waiters if no holds remain. | |
1580 | */ | |
f85c06be GM |
1581 | for (txh = list_head(&tx->tx_holds); txh != NULL; |
1582 | txh = list_next(&tx->tx_holds, txh)) { | |
34dc7c2f BB |
1583 | dnode_t *dn = txh->txh_dnode; |
1584 | ||
34dc7c2f BB |
1585 | if (dn == NULL) |
1586 | continue; | |
f85c06be | 1587 | |
34dc7c2f BB |
1588 | mutex_enter(&dn->dn_mtx); |
1589 | ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); | |
1590 | ||
1591 | if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { | |
1592 | dn->dn_assigned_txg = 0; | |
1593 | cv_broadcast(&dn->dn_notxholds); | |
1594 | } | |
1595 | mutex_exit(&dn->dn_mtx); | |
34dc7c2f BB |
1596 | } |
1597 | ||
1598 | if (tx->tx_tempreserve_cookie) | |
1599 | dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); | |
1600 | ||
428870ff BB |
1601 | if (!list_is_empty(&tx->tx_callbacks)) |
1602 | txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); | |
1603 | ||
34dc7c2f BB |
1604 | if (tx->tx_anyobj == FALSE) |
1605 | txg_rele_to_sync(&tx->tx_txgh); | |
428870ff | 1606 | |
1c5de20a | 1607 | #ifdef DEBUG_DMU_TX |
34dc7c2f BB |
1608 | dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", |
1609 | tx->tx_space_towrite, refcount_count(&tx->tx_space_written), | |
1610 | tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); | |
34dc7c2f | 1611 | #endif |
f85c06be | 1612 | dmu_tx_destroy(tx); |
34dc7c2f BB |
1613 | } |
1614 | ||
1615 | void | |
1616 | dmu_tx_abort(dmu_tx_t *tx) | |
1617 | { | |
34dc7c2f BB |
1618 | ASSERT(tx->tx_txg == 0); |
1619 | ||
428870ff BB |
1620 | /* |
1621 | * Call any registered callbacks with an error code. | |
1622 | */ | |
1623 | if (!list_is_empty(&tx->tx_callbacks)) | |
1624 | dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); | |
1625 | ||
f85c06be | 1626 | dmu_tx_destroy(tx); |
34dc7c2f BB |
1627 | } |
1628 | ||
1629 | uint64_t | |
1630 | dmu_tx_get_txg(dmu_tx_t *tx) | |
1631 | { | |
1632 | ASSERT(tx->tx_txg != 0); | |
1633 | return (tx->tx_txg); | |
1634 | } | |
428870ff | 1635 | |
13fe0198 MA |
1636 | dsl_pool_t * |
1637 | dmu_tx_pool(dmu_tx_t *tx) | |
1638 | { | |
1639 | ASSERT(tx->tx_pool != NULL); | |
1640 | return (tx->tx_pool); | |
1641 | } | |
1642 | ||
428870ff BB |
1643 | void |
1644 | dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) | |
1645 | { | |
1646 | dmu_tx_callback_t *dcb; | |
1647 | ||
79c76d5b | 1648 | dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); |
428870ff BB |
1649 | |
1650 | dcb->dcb_func = func; | |
1651 | dcb->dcb_data = data; | |
1652 | ||
1653 | list_insert_tail(&tx->tx_callbacks, dcb); | |
1654 | } | |
1655 | ||
1656 | /* | |
1657 | * Call all the commit callbacks on a list, with a given error code. | |
1658 | */ | |
1659 | void | |
1660 | dmu_tx_do_callbacks(list_t *cb_list, int error) | |
1661 | { | |
1662 | dmu_tx_callback_t *dcb; | |
1663 | ||
f85c06be | 1664 | while ((dcb = list_head(cb_list)) != NULL) { |
428870ff BB |
1665 | list_remove(cb_list, dcb); |
1666 | dcb->dcb_func(dcb->dcb_data, error); | |
1667 | kmem_free(dcb, sizeof (dmu_tx_callback_t)); | |
1668 | } | |
1669 | } | |
1670 | ||
1671 | /* | |
1672 | * Interface to hold a bunch of attributes. | |
1673 | * used for creating new files. | |
1674 | * attrsize is the total size of all attributes | |
1675 | * to be added during object creation | |
1676 | * | |
1677 | * For updating/adding a single attribute dmu_tx_hold_sa() should be used. | |
1678 | */ | |
1679 | ||
1680 | /* | |
1681 | * hold necessary attribute name for attribute registration. | |
1682 | * should be a very rare case where this is needed. If it does | |
1683 | * happen it would only happen on the first write to the file system. | |
1684 | */ | |
1685 | static void | |
1686 | dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) | |
1687 | { | |
1688 | int i; | |
1689 | ||
1690 | if (!sa->sa_need_attr_registration) | |
1691 | return; | |
1692 | ||
1693 | for (i = 0; i != sa->sa_num_attrs; i++) { | |
1694 | if (!sa->sa_attr_table[i].sa_registered) { | |
1695 | if (sa->sa_reg_attr_obj) | |
1696 | dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, | |
1697 | B_TRUE, sa->sa_attr_table[i].sa_name); | |
1698 | else | |
1699 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, | |
1700 | B_TRUE, sa->sa_attr_table[i].sa_name); | |
1701 | } | |
1702 | } | |
1703 | } | |
1704 | ||
1705 | ||
1706 | void | |
1707 | dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) | |
1708 | { | |
1709 | dnode_t *dn; | |
1710 | dmu_tx_hold_t *txh; | |
428870ff BB |
1711 | |
1712 | txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, | |
1713 | THT_SPILL, 0, 0); | |
7d637211 NC |
1714 | if (txh == NULL) |
1715 | return; | |
428870ff BB |
1716 | |
1717 | dn = txh->txh_dnode; | |
1718 | ||
1719 | if (dn == NULL) | |
1720 | return; | |
1721 | ||
1722 | /* If blkptr doesn't exist then add space to towrite */ | |
22cd4a46 | 1723 | if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { |
f85c06be GM |
1724 | (void) refcount_add_many(&txh->txh_space_towrite, |
1725 | SPA_OLD_MAXBLOCKSIZE, FTAG); | |
428870ff | 1726 | } else { |
22cd4a46 AL |
1727 | blkptr_t *bp; |
1728 | ||
50c957f7 | 1729 | bp = DN_SPILL_BLKPTR(dn->dn_phys); |
428870ff | 1730 | if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, |
f85c06be GM |
1731 | bp, bp->blk_birth)) { |
1732 | (void) refcount_add_many(&txh->txh_space_tooverwrite, | |
1733 | SPA_OLD_MAXBLOCKSIZE, FTAG); | |
1734 | } else { | |
1735 | (void) refcount_add_many(&txh->txh_space_towrite, | |
1736 | SPA_OLD_MAXBLOCKSIZE, FTAG); | |
1737 | } | |
1738 | if (!BP_IS_HOLE(bp)) { | |
1739 | (void) refcount_add_many(&txh->txh_space_tounref, | |
1740 | SPA_OLD_MAXBLOCKSIZE, FTAG); | |
1741 | } | |
428870ff BB |
1742 | } |
1743 | } | |
1744 | ||
1745 | void | |
1746 | dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) | |
1747 | { | |
1748 | sa_os_t *sa = tx->tx_objset->os_sa; | |
1749 | ||
1750 | dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); | |
1751 | ||
1752 | if (tx->tx_objset->os_sa->sa_master_obj == 0) | |
1753 | return; | |
1754 | ||
1755 | if (tx->tx_objset->os_sa->sa_layout_attr_obj) | |
1756 | dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); | |
1757 | else { | |
1758 | dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); | |
1759 | dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); | |
1760 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); | |
1761 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); | |
1762 | } | |
1763 | ||
1764 | dmu_tx_sa_registration_hold(sa, tx); | |
1765 | ||
50c957f7 | 1766 | if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill) |
428870ff BB |
1767 | return; |
1768 | ||
1769 | (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, | |
1770 | THT_SPILL, 0, 0); | |
1771 | } | |
1772 | ||
1773 | /* | |
1774 | * Hold SA attribute | |
1775 | * | |
1776 | * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) | |
1777 | * | |
1778 | * variable_size is the total size of all variable sized attributes | |
1779 | * passed to this function. It is not the total size of all | |
1780 | * variable size attributes that *may* exist on this object. | |
1781 | */ | |
1782 | void | |
1783 | dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) | |
1784 | { | |
1785 | uint64_t object; | |
1786 | sa_os_t *sa = tx->tx_objset->os_sa; | |
1787 | ||
1788 | ASSERT(hdl != NULL); | |
1789 | ||
1790 | object = sa_handle_object(hdl); | |
1791 | ||
1792 | dmu_tx_hold_bonus(tx, object); | |
1793 | ||
1794 | if (tx->tx_objset->os_sa->sa_master_obj == 0) | |
1795 | return; | |
1796 | ||
1797 | if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || | |
1798 | tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { | |
1799 | dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); | |
1800 | dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); | |
1801 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); | |
1802 | dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); | |
1803 | } | |
1804 | ||
1805 | dmu_tx_sa_registration_hold(sa, tx); | |
1806 | ||
1807 | if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) | |
1808 | dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); | |
1809 | ||
572e2857 | 1810 | if (sa->sa_force_spill || may_grow || hdl->sa_spill) { |
428870ff BB |
1811 | ASSERT(tx->tx_txg == 0); |
1812 | dmu_tx_hold_spill(tx, object); | |
572e2857 BB |
1813 | } else { |
1814 | dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; | |
1815 | dnode_t *dn; | |
1816 | ||
1817 | DB_DNODE_ENTER(db); | |
1818 | dn = DB_DNODE(db); | |
1819 | if (dn->dn_have_spill) { | |
1820 | ASSERT(tx->tx_txg == 0); | |
1821 | dmu_tx_hold_spill(tx, object); | |
1822 | } | |
1823 | DB_DNODE_EXIT(db); | |
428870ff BB |
1824 | } |
1825 | } | |
c28b2279 | 1826 | |
570827e1 BB |
1827 | void |
1828 | dmu_tx_init(void) | |
1829 | { | |
1830 | dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc", | |
1831 | KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t), | |
1832 | KSTAT_FLAG_VIRTUAL); | |
1833 | ||
1834 | if (dmu_tx_ksp != NULL) { | |
1835 | dmu_tx_ksp->ks_data = &dmu_tx_stats; | |
1836 | kstat_install(dmu_tx_ksp); | |
1837 | } | |
1838 | } | |
1839 | ||
1840 | void | |
1841 | dmu_tx_fini(void) | |
1842 | { | |
1843 | if (dmu_tx_ksp != NULL) { | |
1844 | kstat_delete(dmu_tx_ksp); | |
1845 | dmu_tx_ksp = NULL; | |
1846 | } | |
1847 | } | |
1848 | ||
c28b2279 BB |
/* Export the dmu_tx interface to other Linux kernel modules. */
#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(dmu_tx_create);
EXPORT_SYMBOL(dmu_tx_hold_write);
EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_free);
EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_zap);
EXPORT_SYMBOL(dmu_tx_hold_zap_by_dnode);
EXPORT_SYMBOL(dmu_tx_hold_bonus);
EXPORT_SYMBOL(dmu_tx_hold_bonus_by_dnode);
EXPORT_SYMBOL(dmu_tx_abort);
EXPORT_SYMBOL(dmu_tx_assign);
EXPORT_SYMBOL(dmu_tx_wait);
EXPORT_SYMBOL(dmu_tx_commit);
EXPORT_SYMBOL(dmu_tx_get_txg);
EXPORT_SYMBOL(dmu_tx_callback_register);
EXPORT_SYMBOL(dmu_tx_do_callbacks);
EXPORT_SYMBOL(dmu_tx_hold_spill);
EXPORT_SYMBOL(dmu_tx_hold_sa_create);
EXPORT_SYMBOL(dmu_tx_hold_sa);
#endif