/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
#include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
#include <sys/dsl_pool.h>
#include <sys/zap_impl.h> /* for fzap_default_block_shift */
#include <sys/spa.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfs_context.h>
#include <sys/varargs.h>

typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);


dmu_tx_t *
dmu_tx_create_dd(dsl_dir_t *dd)
{
	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
	tx->tx_dir = dd;
	if (dd)
		tx->tx_pool = dd->dd_pool;
	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
	    offsetof(dmu_tx_hold_t, txh_node));
	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
	    offsetof(dmu_tx_callback_t, dcb_node));
#ifdef ZFS_DEBUG
	refcount_create(&tx->tx_space_written);
	refcount_create(&tx->tx_space_freed);
#endif
	return (tx);
}

dmu_tx_t *
dmu_tx_create(objset_t *os)
{
	dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
	tx->tx_objset = os;
	tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
	return (tx);
}
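
/*
 * Example (illustrative sketch only; the object number, offset, length
 * and buffer below are hypothetical): a DMU transaction goes through
 * create -> hold -> assign -> commit (or abort):
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, off, len);
 *	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
 *		dmu_write(os, object, off, len, buf, tx);
 *		dmu_tx_commit(tx);
 *	} else {
 *		dmu_tx_abort(tx);
 *	}
 */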

dmu_tx_t *
dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
{
	dmu_tx_t *tx = dmu_tx_create_dd(NULL);

	ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
	tx->tx_pool = dp;
	tx->tx_txg = txg;
	tx->tx_anyobj = TRUE;

	return (tx);
}

int
dmu_tx_is_syncing(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

int
dmu_tx_private_ok(dmu_tx_t *tx)
{
	return (tx->tx_anyobj);
}

static dmu_tx_hold_t *
dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
    enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
{
	dmu_tx_hold_t *txh;
	dnode_t *dn = NULL;
	int err;

	if (object != DMU_NEW_OBJECT) {
		err = dnode_hold(os, object, tx, &dn);
		if (err) {
			tx->tx_err = err;
			return (NULL);
		}

		if (err == 0 && tx->tx_txg != 0) {
			mutex_enter(&dn->dn_mtx);
			/*
			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
			 * problem, but there's no way for it to happen (for
			 * now, at least).
			 */
			ASSERT(dn->dn_assigned_txg == 0);
			dn->dn_assigned_txg = tx->tx_txg;
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
	}

	txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
	txh->txh_tx = tx;
	txh->txh_dnode = dn;
#ifdef ZFS_DEBUG
	txh->txh_type = type;
	txh->txh_arg1 = arg1;
	txh->txh_arg2 = arg2;
#endif
	list_insert_tail(&tx->tx_holds, txh);

	return (txh);
}

void
dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
{
	/*
	 * If we're syncing, they can manipulate any object anyhow, and
	 * the hold on the dnode_t can cause problems.
	 */
	if (!dmu_tx_is_syncing(tx)) {
		(void) dmu_tx_hold_object_impl(tx, os,
		    object, THT_NEWOBJECT, 0, 0);
	}
}

static int
dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
{
	int err;
	dmu_buf_impl_t *db;

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	db = dbuf_hold_level(dn, level, blkid, FTAG);
	rw_exit(&dn->dn_struct_rwlock);
	if (db == NULL)
		return (EIO);
	err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
	dbuf_rele(db, FTAG);
	return (err);
}

static void
dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
    int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
{
	objset_t *os = dn->dn_objset;
	dsl_dataset_t *ds = os->os_dsl_dataset;
	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	dmu_buf_impl_t *parent = NULL;
	blkptr_t *bp = NULL;
	uint64_t space;

	if (level >= dn->dn_nlevels || history[level] == blkid)
		return;

	history[level] = blkid;

	space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);

	if (db == NULL || db == dn->dn_dbuf) {
		ASSERT(level != 0);
		db = NULL;
	} else {
		ASSERT(DB_DNODE(db) == dn);
		ASSERT(db->db_level == level);
		ASSERT(db->db.db_size == space);
		ASSERT(db->db_blkid == blkid);
		bp = db->db_blkptr;
		parent = db->db_parent;
	}

	freeable = (bp && (freeable ||
	    dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));

	if (freeable)
		txh->txh_space_tooverwrite += space;
	else
		txh->txh_space_towrite += space;
	if (bp)
		txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);

	dmu_tx_count_twig(txh, dn, parent, level + 1,
	    blkid >> epbs, freeable, history);
}

/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	uint64_t start, end, i;
	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
	int err = 0;
	int l;

	if (len == 0)
		return;

	min_bs = SPA_MINBLOCKSHIFT;
	max_bs = SPA_MAXBLOCKSHIFT;
	min_ibs = DN_MIN_INDBLKSHIFT;
	max_ibs = DN_MAX_INDBLKSHIFT;

	if (dn) {
		uint64_t history[DN_MAX_LEVELS];
		int nlvls = dn->dn_nlevels;
		int delta;

		/*
		 * For i/o error checking, read the first and last level-0
		 * blocks (if they are not aligned), and all the level-1 blocks.
		 */
		if (dn->dn_maxblkid == 0) {
			delta = dn->dn_datablksz;
			start = (off < dn->dn_datablksz) ? 0 : 1;
			end = (off+len <= dn->dn_datablksz) ? 0 : 1;
			if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
				err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
				if (err)
					goto out;
				delta -= off;
			}
		} else {
			zio_t *zio = zio_root(dn->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);

			/* first level-0 block */
			start = off >> dn->dn_datablkshift;
			if (P2PHASE(off, dn->dn_datablksz) ||
			    len < dn->dn_datablksz) {
				err = dmu_tx_check_ioerr(zio, dn, 0, start);
				if (err)
					goto out;
			}

			/* last level-0 block */
			end = (off+len-1) >> dn->dn_datablkshift;
			if (end != start && end <= dn->dn_maxblkid &&
			    P2PHASE(off+len, dn->dn_datablksz)) {
				err = dmu_tx_check_ioerr(zio, dn, 0, end);
				if (err)
					goto out;
			}

			/* level-1 blocks */
			if (nlvls > 1) {
				int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				for (i = (start>>shft)+1; i < end>>shft; i++) {
					err = dmu_tx_check_ioerr(zio, dn, 1, i);
					if (err)
						goto out;
				}
			}

			err = zio_wait(zio);
			if (err)
				goto out;
			delta = P2NPHASE(off, dn->dn_datablksz);
		}

		if (dn->dn_maxblkid > 0) {
			/*
			 * The blocksize can't change,
			 * so we can make a more precise estimate.
			 */
			ASSERT(dn->dn_datablkshift != 0);
			min_bs = max_bs = dn->dn_datablkshift;
			min_ibs = max_ibs = dn->dn_indblkshift;
		} else if (dn->dn_indblkshift > max_ibs) {
			/*
			 * This ensures that if we reduce DN_MAX_INDBLKSHIFT,
			 * the code will still work correctly on older pools.
			 */
			min_ibs = max_ibs = dn->dn_indblkshift;
		}

		/*
		 * If this write is not off the end of the file
		 * we need to account for overwrites/unref.
		 */
		if (start <= dn->dn_maxblkid) {
			for (l = 0; l < DN_MAX_LEVELS; l++)
				history[l] = -1ULL;
		}
		while (start <= dn->dn_maxblkid) {
			dmu_buf_impl_t *db;

			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
			rw_exit(&dn->dn_struct_rwlock);

			if (err) {
				txh->txh_tx->tx_err = err;
				return;
			}

			dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
			    history);
			dbuf_rele(db, FTAG);
			if (++start > end) {
				/*
				 * Account for new indirects appearing
				 * before this IO gets assigned into a txg.
				 */
				bits = 64 - min_bs;
				epbs = min_ibs - SPA_BLKPTRSHIFT;
				for (bits -= epbs * (nlvls - 1);
				    bits >= 0; bits -= epbs)
					txh->txh_fudge += 1ULL << max_ibs;
				goto out;
			}
			off += delta;
			if (len >= delta)
				len -= delta;
			delta = dn->dn_datablksz;
		}
	}

	/*
	 * 'end' is the last thing we will access, not one past.
	 * This way we won't overflow when accessing the last byte.
	 */
	start = P2ALIGN(off, 1ULL << max_bs);
	end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
	txh->txh_space_towrite += end - start + 1;

	start >>= min_bs;
	end >>= min_bs;

	epbs = min_ibs - SPA_BLKPTRSHIFT;

	/*
	 * The object contains at most 2^(64 - min_bs) blocks,
	 * and each indirect level maps 2^epbs.
	 */
	for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
		start >>= epbs;
		end >>= epbs;
		ASSERT3U(end, >=, start);
		txh->txh_space_towrite += (end - start + 1) << max_ibs;
		if (start != 0) {
			/*
			 * We also need a new blkid=0 indirect block
			 * to reference any existing file data.
			 */
			txh->txh_space_towrite += 1ULL << max_ibs;
		}
	}

out:
	if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
	    2 * DMU_MAX_ACCESS)
		err = EFBIG;

	if (err)
		txh->txh_tx->tx_err = err;
}

static void
dmu_tx_count_dnode(dmu_tx_hold_t *txh)
{
	dnode_t *dn = txh->txh_dnode;
	dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
	uint64_t space = mdn->dn_datablksz +
	    ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);

	if (dn && dn->dn_dbuf->db_blkptr &&
	    dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
	    dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
		txh->txh_space_tooverwrite += space;
		txh->txh_space_tounref += space;
	} else {
		txh->txh_space_towrite += space;
		if (dn && dn->dn_dbuf->db_blkptr)
			txh->txh_space_tounref += space;
	}
}

void
dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);
	ASSERT(len < DMU_MAX_ACCESS);
	ASSERT(len == 0 || UINT64_MAX - off >= len - 1);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_WRITE, off, len);
	if (txh == NULL)
		return;

	dmu_tx_count_write(txh, off, len);
	dmu_tx_count_dnode(txh);
}
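
/*
 * Example (illustrative sketch; the object numbers and ranges are
 * hypothetical): a single tx may accumulate several holds, possibly
 * on different objects, before it is assigned:
 *
 *	dmu_tx_hold_write(tx, obj1, 0, 8192);
 *	dmu_tx_hold_write(tx, obj2, 1ULL << 20, 4096);
 *	dmu_tx_hold_bonus(tx, obj1);
 */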

static void
dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	uint64_t blkid, nblks, lastblk;
	uint64_t space = 0, unref = 0, skipped = 0;
	dnode_t *dn = txh->txh_dnode;
	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
	spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
	int epbs;

	if (dn->dn_nlevels == 0)
		return;

	/*
	 * The struct_rwlock protects us against dn_nlevels
	 * changing, in case (against all odds) we manage to dirty &
	 * sync out the changes after we check for being dirty.
	 * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
	 */
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	if (dn->dn_maxblkid == 0) {
		if (off == 0 && len >= dn->dn_datablksz) {
			blkid = 0;
			nblks = 1;
		} else {
			rw_exit(&dn->dn_struct_rwlock);
			return;
		}
	} else {
		blkid = off >> dn->dn_datablkshift;
		nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;

		if (blkid >= dn->dn_maxblkid) {
			rw_exit(&dn->dn_struct_rwlock);
			return;
		}
		if (blkid + nblks > dn->dn_maxblkid)
			nblks = dn->dn_maxblkid - blkid;
	}
	if (dn->dn_nlevels == 1) {
		int i;
		for (i = 0; i < nblks; i++) {
			blkptr_t *bp = dn->dn_phys->dn_blkptr;
			ASSERT3U(blkid + i, <, dn->dn_nblkptr);
			bp += blkid + i;
			if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
				dprintf_bp(bp, "can free old%s", "");
				space += bp_get_dsize(spa, bp);
			}
			unref += BP_GET_ASIZE(bp);
		}
		nblks = 0;
	}

	/*
	 * Add in memory requirements of higher-level indirects.
	 * This assumes a worst-possible scenario for dn_nlevels.
	 */
	{
		uint64_t blkcnt = 1 + ((nblks >> epbs) >> epbs);
		int level = (dn->dn_nlevels > 1) ? 2 : 1;

		while (level++ < DN_MAX_LEVELS) {
			txh->txh_memory_tohold += blkcnt << dn->dn_indblkshift;
			blkcnt = 1 + (blkcnt >> epbs);
		}
		ASSERT(blkcnt <= dn->dn_nblkptr);
	}

	lastblk = blkid + nblks - 1;
	while (nblks) {
		dmu_buf_impl_t *dbuf;
		uint64_t ibyte, new_blkid;
		int epb = 1 << epbs;
		int err, i, blkoff, tochk;
		blkptr_t *bp;

		ibyte = blkid << dn->dn_datablkshift;
		err = dnode_next_offset(dn,
		    DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
		new_blkid = ibyte >> dn->dn_datablkshift;
		if (err == ESRCH) {
			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
			break;
		}
		if (err) {
			txh->txh_tx->tx_err = err;
			break;
		}
		if (new_blkid > lastblk) {
			skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
			break;
		}

		if (new_blkid > blkid) {
			ASSERT((new_blkid >> epbs) > (blkid >> epbs));
			skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
			nblks -= new_blkid - blkid;
			blkid = new_blkid;
		}
		blkoff = P2PHASE(blkid, epb);
		tochk = MIN(epb - blkoff, nblks);

		err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
		if (err) {
			txh->txh_tx->tx_err = err;
			break;
		}

		txh->txh_memory_tohold += dbuf->db.db_size;

		/*
		 * We don't check memory_tohold against DMU_MAX_ACCESS because
		 * memory_tohold is an over-estimation (especially the >L1
		 * indirect blocks), so it could fail.  Callers should have
		 * already verified that they will not be holding too much
		 * memory.
		 */

		err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
		if (err != 0) {
			txh->txh_tx->tx_err = err;
			dbuf_rele(dbuf, FTAG);
			break;
		}

		bp = dbuf->db.db_data;
		bp += blkoff;

		for (i = 0; i < tochk; i++) {
			if (dsl_dataset_block_freeable(ds, &bp[i],
			    bp[i].blk_birth)) {
				dprintf_bp(&bp[i], "can free old%s", "");
				space += bp_get_dsize(spa, &bp[i]);
			}
			/* account for the i-th blkptr, not just the first */
			unref += BP_GET_ASIZE(&bp[i]);
		}
		dbuf_rele(dbuf, FTAG);

		blkid += tochk;
		nblks -= tochk;
	}
	rw_exit(&dn->dn_struct_rwlock);

	/* account for new level 1 indirect blocks that might show up */
	if (skipped > 0) {
		txh->txh_fudge += skipped << dn->dn_indblkshift;
		skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
		txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
	}
	txh->txh_space_tofree += space;
	txh->txh_space_tounref += unref;
}

void
dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
{
	dmu_tx_hold_t *txh;
	dnode_t *dn;
	uint64_t start, end, i;
	int err, shift;
	zio_t *zio;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_FREE, off, len);
	if (txh == NULL)
		return;
	dn = txh->txh_dnode;

	/* first block */
	if (off != 0)
		dmu_tx_count_write(txh, off, 1);
	/* last block */
	if (len != DMU_OBJECT_END)
		dmu_tx_count_write(txh, off+len, 1);

	dmu_tx_count_dnode(txh);

	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
		return;
	if (len == DMU_OBJECT_END)
		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;

	/*
	 * For i/o error checking, read the first and last level-0
	 * blocks, and all the level-1 blocks.  The above count_write's
	 * have already taken care of the level-0 blocks.
	 */
	if (dn->dn_nlevels > 1) {
		shift = dn->dn_datablkshift + dn->dn_indblkshift -
		    SPA_BLKPTRSHIFT;
		start = off >> shift;
		end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;

		zio = zio_root(tx->tx_pool->dp_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);
		for (i = start; i <= end; i++) {
			uint64_t ibyte = i << shift;
			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
			i = ibyte >> shift;
			if (err == ESRCH)
				break;
			if (err) {
				tx->tx_err = err;
				return;
			}

			err = dmu_tx_check_ioerr(zio, dn, 1, i);
			if (err) {
				tx->tx_err = err;
				return;
			}
		}
		err = zio_wait(zio);
		if (err) {
			tx->tx_err = err;
			return;
		}
	}

	dmu_tx_count_free(txh, off, len);
}
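
/*
 * Example (illustrative sketch; the object is hypothetical): freeing
 * an entire object reserves the free with DMU_OBJECT_END rather than
 * a computed length:
 *
 *	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 */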

void
dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
{
	dmu_tx_hold_t *txh;
	dnode_t *dn;
	uint64_t nblocks;
	int epbs, err;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_ZAP, add, (uintptr_t)name);
	if (txh == NULL)
		return;
	dn = txh->txh_dnode;

	dmu_tx_count_dnode(txh);

	if (dn == NULL) {
		/*
		 * We will be able to fit a new object's entries into one leaf
		 * block.  So there will be at most 2 blocks total,
		 * including the header block.
		 */
		dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
		return;
	}

	ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);

	if (dn->dn_maxblkid == 0 && !add) {
		/*
		 * If there is only one block (i.e. this is a micro-zap)
		 * and we are not adding anything, the accounting is simple.
		 */
		err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
		if (err) {
			tx->tx_err = err;
			return;
		}

		/*
		 * Use max block size here, since we don't know how much
		 * the size will change between now and the dbuf dirty call.
		 */
		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
		    &dn->dn_phys->dn_blkptr[0],
		    dn->dn_phys->dn_blkptr[0].blk_birth)) {
			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
		} else {
			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
		}
		if (dn->dn_phys->dn_blkptr[0].blk_birth)
			txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
		return;
	}

	if (dn->dn_maxblkid > 0 && name) {
		/*
		 * access the name in this fat-zap so that we'll check
		 * for i/o errors to the leaf blocks, etc.
		 */
		err = zap_lookup(dn->dn_objset, dn->dn_object, name,
		    8, 0, NULL);
		if (err == EIO) {
			tx->tx_err = err;
			return;
		}
	}

	err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
	    &txh->txh_space_towrite, &txh->txh_space_tooverwrite);

	/*
	 * If the modified blocks are scattered to the four winds,
	 * we'll have to modify an indirect twig for each.
	 */
	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
	for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
		if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
			txh->txh_space_towrite += 3 << dn->dn_indblkshift;
		else
			txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
}
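
/*
 * Example (illustrative sketch; the directory object and entry name
 * are hypothetical): adding a directory entry holds the ZAP with
 * add == TRUE and passes the name so that leaf-block i/o errors are
 * detected up front:
 *
 *	dmu_tx_hold_zap(tx, dir_obj, TRUE, name);
 */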

void
dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    object, THT_BONUS, 0, 0);
	if (txh)
		dmu_tx_count_dnode(txh);
}

void
dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
	    DMU_NEW_OBJECT, THT_SPACE, space, 0);

	txh->txh_space_towrite += space;
}

int
dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;
	int holds = 0;

	/*
	 * By asserting that the tx is assigned, we're counting the
	 * number of dn_tx_holds, which is the same as the number of
	 * dn_holds.  Otherwise, we'd be counting dn_holds, but
	 * dn_tx_holds could be 0.
	 */
	ASSERT(tx->tx_txg != 0);

	/* if (tx->tx_anyobj == TRUE) */
		/* return (0); */

	for (txh = list_head(&tx->tx_holds); txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
			holds++;
	}

	return (holds);
}

#ifdef ZFS_DEBUG
void
dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
{
	dmu_tx_hold_t *txh;
	int match_object = FALSE, match_offset = FALSE;
	dnode_t *dn;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(tx->tx_txg != 0);
	ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
	ASSERT3U(dn->dn_object, ==, db->db.db_object);

	if (tx->tx_anyobj) {
		DB_DNODE_EXIT(db);
		return;
	}

	/* XXX No checking on the meta dnode for now */
	if (db->db.db_object == DMU_META_DNODE_OBJECT) {
		DB_DNODE_EXIT(db);
		return;
	}

	for (txh = list_head(&tx->tx_holds); txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
		if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
			match_object = TRUE;
		if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
			int datablkshift = dn->dn_datablkshift ?
			    dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			int shift = datablkshift + epbs * db->db_level;
			uint64_t beginblk = shift >= 64 ? 0 :
			    (txh->txh_arg1 >> shift);
			uint64_t endblk = shift >= 64 ? 0 :
			    ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
			uint64_t blkid = db->db_blkid;

			/* XXX txh_arg2 better not be zero... */

			dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
			    txh->txh_type, beginblk, endblk);

			switch (txh->txh_type) {
			case THT_WRITE:
				if (blkid >= beginblk && blkid <= endblk)
					match_offset = TRUE;
				/*
				 * We will let this hold work for the bonus
				 * or spill buffer so that we don't need to
				 * hold it when creating a new object.
				 */
				if (blkid == DMU_BONUS_BLKID ||
				    blkid == DMU_SPILL_BLKID)
					match_offset = TRUE;
				/*
				 * They might have to increase nlevels,
				 * thus dirtying the new TLIBs.  Or they
				 * might have to change the block size,
				 * thus dirtying the new lvl=0 blk=0.
				 */
				if (blkid == 0)
					match_offset = TRUE;
				break;
			case THT_FREE:
				/*
				 * We will dirty all the level 1 blocks in
				 * the free range and perhaps the first and
				 * last level 0 block.
				 */
				if (blkid >= beginblk && (blkid <= endblk ||
				    txh->txh_arg2 == DMU_OBJECT_END))
					match_offset = TRUE;
				break;
			case THT_SPILL:
				if (blkid == DMU_SPILL_BLKID)
					match_offset = TRUE;
				break;
			case THT_BONUS:
				if (blkid == DMU_BONUS_BLKID)
					match_offset = TRUE;
				break;
			case THT_ZAP:
				match_offset = TRUE;
				break;
			case THT_NEWOBJECT:
				match_object = TRUE;
				break;
			default:
				ASSERT(!"bad txh_type");
			}
		}
		if (match_object && match_offset) {
			DB_DNODE_EXIT(db);
			return;
		}
	}
	DB_DNODE_EXIT(db);
	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
	    (u_longlong_t)db->db.db_object, db->db_level,
	    (u_longlong_t)db->db_blkid);
}
#endif

static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	dmu_tx_hold_t *txh;
	spa_t *spa = tx->tx_pool->dp_spa;
	uint64_t memory, asize, fsize, usize;
	uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;

	ASSERT3U(tx->tx_txg, ==, 0);

	if (tx->tx_err)
		return (tx->tx_err);

	if (spa_suspended(spa)) {
		/*
		 * If the user has indicated a blocking failure mode
		 * then return ERESTART which will block in dmu_tx_wait().
		 * Otherwise, return EIO so that an error can get
		 * propagated back to the VOP calls.
		 *
		 * Note that we always honor the txg_how flag regardless
		 * of the failuremode setting.
		 */
		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
		    txg_how != TXG_WAIT)
			return (EIO);

		return (ERESTART);
	}

	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
	tx->tx_needassign_txh = NULL;

	/*
	 * NB: No error returns are allowed after txg_hold_open, but
	 * before processing the dnode holds, due to the
	 * dmu_tx_unassign() logic.
	 */

	towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
	for (txh = list_head(&tx->tx_holds); txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;
		if (dn != NULL) {
			mutex_enter(&dn->dn_mtx);
			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
				mutex_exit(&dn->dn_mtx);
				tx->tx_needassign_txh = txh;
				return (ERESTART);
			}
			if (dn->dn_assigned_txg == 0)
				dn->dn_assigned_txg = tx->tx_txg;
			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
		towrite += txh->txh_space_towrite;
		tofree += txh->txh_space_tofree;
		tooverwrite += txh->txh_space_tooverwrite;
		tounref += txh->txh_space_tounref;
		tohold += txh->txh_memory_tohold;
		fudge += txh->txh_fudge;
	}

	/*
	 * NB: This check must be after we've held the dnodes, so that
	 * the dmu_tx_unassign() logic will work properly
	 */
	if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
		return (ERESTART);

	/*
	 * If a snapshot has been taken since we made our estimates,
	 * assume that we won't be able to free or overwrite anything.
	 */
	if (tx->tx_objset &&
	    dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
	    tx->tx_lastsnap_txg) {
		towrite += tooverwrite;
		tooverwrite = tofree = 0;
	}

	/* needed allocation: worst-case estimate of write space */
	asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
	/* freed space estimate: worst-case overwrite + free estimate */
	fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
	/* convert unrefd space to worst-case estimate */
	usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
	/* calculate memory footprint estimate */
	memory = towrite + tooverwrite + tohold;

#ifdef ZFS_DEBUG
	/*
	 * Add in 'tohold' to account for our dirty holds on this memory
	 * XXX - the "fudge" factor is to account for skipped blocks that
	 * we missed because dnode_next_offset() misses in-core-only blocks.
	 */
	tx->tx_space_towrite = asize +
	    spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
	tx->tx_space_tofree = tofree;
	tx->tx_space_tooverwrite = tooverwrite;
	tx->tx_space_tounref = tounref;
#endif

	if (tx->tx_dir && asize != 0) {
		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
		    asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
		if (err)
			return (err);
	}

	return (0);
}

static void
dmu_tx_unassign(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	if (tx->tx_txg == 0)
		return;

	txg_rele_to_quiesce(&tx->tx_txgh);

	for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	txg_rele_to_sync(&tx->tx_txgh);

	tx->tx_lasttried_txg = tx->tx_txg;
	tx->tx_txg = 0;
}

/*
 * Assign tx to a transaction group.  txg_how can be one of:
 *
 * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
 *	a new one.  This should be used when you're not holding locks.
 *	It will only fail if we're truly out of space (or over quota).
 *
 * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
 *	blocking, returns immediately with ERESTART.  This should be used
 *	whenever you're holding locks.  On an ERESTART error, the caller
 *	should drop locks, do a dmu_tx_wait(tx), and try again.
 *
 * (3)	A specific txg.  Use this if you need to ensure that multiple
 *	transactions all sync in the same txg.  Like TXG_NOWAIT, it
 *	returns ERESTART if it can't assign you into the requested txg.
 */
int
dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
{
	int err;

	ASSERT(tx->tx_txg == 0);
	ASSERT(txg_how != 0);
	ASSERT(!dsl_pool_sync_context(tx->tx_pool));

	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
		dmu_tx_unassign(tx);

		if (err != ERESTART || txg_how != TXG_WAIT)
			return (err);

		dmu_tx_wait(tx);
	}

	txg_rele_to_quiesce(&tx->tx_txgh);

	return (0);
}
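
/*
 * Example (illustrative sketch of the TXG_NOWAIT retry loop described
 * above; the caller-held locks and "top:" label are hypothetical):
 *
 * top:
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, off, len);
 *	err = dmu_tx_assign(tx, TXG_NOWAIT);
 *	if (err != 0) {
 *		if (err == ERESTART) {
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);
 *		return (err);
 *	}
 *	... make the changes under tx ...
 *	dmu_tx_commit(tx);
 */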

void
dmu_tx_wait(dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;

	ASSERT(tx->tx_txg == 0);

	/*
	 * It's possible that the pool has become active after this thread
	 * has tried to obtain a tx.  If that's the case then its
	 * tx_lasttried_txg would not have been assigned.
	 */
	if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
		txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
	} else if (tx->tx_needassign_txh) {
		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;

		mutex_enter(&dn->dn_mtx);
		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
		mutex_exit(&dn->dn_mtx);
		tx->tx_needassign_txh = NULL;
	} else {
		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
	}
}

void
dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
{
#ifdef ZFS_DEBUG
	if (tx->tx_dir == NULL || delta == 0)
		return;

	if (delta > 0) {
		ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
		    tx->tx_space_towrite);
		(void) refcount_add_many(&tx->tx_space_written, delta, NULL);
	} else {
		(void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
	}
#endif
}

void
dmu_tx_commit(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg != 0);

	while ((txh = list_head(&tx->tx_holds)) != NULL) {
		dnode_t *dn = txh->txh_dnode;

		list_remove(&tx->tx_holds, txh);
		kmem_free(txh, sizeof (dmu_tx_hold_t));
		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
		dnode_rele(dn, tx);
	}

	if (tx->tx_tempreserve_cookie)
		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);

	if (!list_is_empty(&tx->tx_callbacks))
		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);

	if (tx->tx_anyobj == FALSE)
		txg_rele_to_sync(&tx->tx_txgh);

	list_destroy(&tx->tx_callbacks);
	list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
	dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
	    tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
	    tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
	refcount_destroy_many(&tx->tx_space_written,
	    refcount_count(&tx->tx_space_written));
	refcount_destroy_many(&tx->tx_space_freed,
	    refcount_count(&tx->tx_space_freed));
#endif
	kmem_free(tx, sizeof (dmu_tx_t));
}

void
dmu_tx_abort(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	ASSERT(tx->tx_txg == 0);

	while ((txh = list_head(&tx->tx_holds)) != NULL) {
		dnode_t *dn = txh->txh_dnode;

		list_remove(&tx->tx_holds, txh);
		kmem_free(txh, sizeof (dmu_tx_hold_t));
		if (dn != NULL)
			dnode_rele(dn, tx);
	}

	/*
	 * Call any registered callbacks with an error code.
	 */
	if (!list_is_empty(&tx->tx_callbacks))
		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);

	list_destroy(&tx->tx_callbacks);
	list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
	refcount_destroy_many(&tx->tx_space_written,
	    refcount_count(&tx->tx_space_written));
	refcount_destroy_many(&tx->tx_space_freed,
	    refcount_count(&tx->tx_space_freed));
#endif
	kmem_free(tx, sizeof (dmu_tx_t));
}

uint64_t
dmu_tx_get_txg(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);
	return (tx->tx_txg);
}

void
dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
{
	dmu_tx_callback_t *dcb;

	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);

	dcb->dcb_func = func;
	dcb->dcb_data = data;

	list_insert_tail(&tx->tx_callbacks, dcb);
}
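
/*
 * Example (illustrative sketch; the callback and its data are
 * hypothetical): a commit callback runs once the txg holding this tx
 * has synced, or with an error (e.g. ECANCELED) if the tx is aborted:
 *
 *	static void
 *	my_commit_cb(void *data, int error)
 *	{
 *		if (error == 0)
 *			... the change is on stable storage ...
 *	}
 *
 *	dmu_tx_callback_register(tx, my_commit_cb, my_data);
 */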

/*
 * Call all the commit callbacks on a list, with a given error code.
 */
void
dmu_tx_do_callbacks(list_t *cb_list, int error)
{
	dmu_tx_callback_t *dcb;

	while ((dcb = list_head(cb_list)) != NULL) {
		list_remove(cb_list, dcb);
		dcb->dcb_func(dcb->dcb_data, error);
		kmem_free(dcb, sizeof (dmu_tx_callback_t));
	}
}

/*
 * Interface to hold a bunch of attributes; used for creating new files.
 * attrsize is the total size of all attributes to be added during
 * object creation.
 *
 * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
 */

/*
 * Hold the necessary attribute name for attribute registration.
 * This should be a very rare case where it is needed; if it does
 * happen it would only happen on the first write to the file system.
 */
static void
dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
{
	int i;

	if (!sa->sa_need_attr_registration)
		return;

	for (i = 0; i != sa->sa_num_attrs; i++) {
		if (!sa->sa_attr_table[i].sa_registered) {
			if (sa->sa_reg_attr_obj)
				dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
				    B_TRUE, sa->sa_attr_table[i].sa_name);
			else
				dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
				    B_TRUE, sa->sa_attr_table[i].sa_name);
		}
	}
}

void
dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
{
	dnode_t *dn;
	dmu_tx_hold_t *txh;
	blkptr_t *bp;

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
	    THT_SPILL, 0, 0);

	dn = txh->txh_dnode;

	if (dn == NULL)
		return;

	/* If blkptr doesn't exist then add space to towrite */
	bp = &dn->dn_phys->dn_spill;
	if (BP_IS_HOLE(bp)) {
		txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
		txh->txh_space_tounref = 0;
	} else {
		if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
		    bp, bp->blk_birth))
			txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
		else
			txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
		if (bp->blk_birth)
			txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
	}
}

void
dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
{
	sa_os_t *sa = tx->tx_objset->os_sa;

	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);

	if (tx->tx_objset->os_sa->sa_master_obj == 0)
		return;

	if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
	} else {
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	}

	dmu_tx_sa_registration_hold(sa, tx);

	if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
		return;

	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
	    THT_SPILL, 0, 0);
}
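
/*
 * Example (illustrative sketch; the attribute size is hypothetical):
 * a file-creation path reserves bonus/SA space before assigning:
 *
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_sa_create(tx, attrsize);
 *	err = dmu_tx_assign(tx, TXG_WAIT);
 */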

/*
 * Hold SA attribute
 *
 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
 *
 * If the layout may grow (for example, when a variable-sized attribute
 * is being added or enlarged), pass may_grow == B_TRUE so that the
 * layout-attribute ZAP and the spill block are also held.
 */
void
dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
{
	uint64_t object;
	sa_os_t *sa = tx->tx_objset->os_sa;

	ASSERT(hdl != NULL);

	object = sa_handle_object(hdl);

	dmu_tx_hold_bonus(tx, object);

	if (tx->tx_objset->os_sa->sa_master_obj == 0)
		return;

	if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
	    tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	}

	dmu_tx_sa_registration_hold(sa, tx);

	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);

	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
		ASSERT(tx->tx_txg == 0);
		dmu_tx_hold_spill(tx, object);
	} else {
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
		dnode_t *dn;

		DB_DNODE_ENTER(db);
		dn = DB_DNODE(db);
		if (dn->dn_have_spill) {
			ASSERT(tx->tx_txg == 0);
			dmu_tx_hold_spill(tx, object);
		}
		DB_DNODE_EXIT(db);
	}
}