module/zfs/dbuf.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 #include <sys/zfs_context.h>
26 #include <sys/dmu.h>
27 #include <sys/dmu_impl.h>
28 #include <sys/dbuf.h>
29 #include <sys/dmu_objset.h>
30 #include <sys/dsl_dataset.h>
31 #include <sys/dsl_dir.h>
32 #include <sys/dmu_tx.h>
33 #include <sys/spa.h>
34 #include <sys/zio.h>
35 #include <sys/dmu_zfetch.h>
36 #include <sys/sa.h>
37 #include <sys/sa_impl.h>
38
39 static void dbuf_destroy(dmu_buf_impl_t *db);
40 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
41 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
42
43 /*
44 * Global data structures and functions for the dbuf cache.
45 */
46 static kmem_cache_t *dbuf_cache;
47
48 /* ARGSUSED */
49 static int
50 dbuf_cons(void *vdb, void *unused, int kmflag)
51 {
52 dmu_buf_impl_t *db = vdb;
53 bzero(db, sizeof (dmu_buf_impl_t));
54
55 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
56 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
57 refcount_create(&db->db_holds);
58 return (0);
59 }
60
61 /* ARGSUSED */
62 static void
63 dbuf_dest(void *vdb, void *unused)
64 {
65 dmu_buf_impl_t *db = vdb;
66 mutex_destroy(&db->db_mtx);
67 cv_destroy(&db->db_changed);
68 refcount_destroy(&db->db_holds);
69 }
70
71 /*
72 * dbuf hash table routines
73 */
74 static dbuf_hash_table_t dbuf_hash_table;
75
76 static uint64_t dbuf_hash_count;
77
78 static uint64_t
79 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
80 {
81 uintptr_t osv = (uintptr_t)os;
82 uint64_t crc = -1ULL;
83
84 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
85 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
86 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
87 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
88 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
89 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
90 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
91
92 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
93
94 return (crc);
95 }
96
97 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
98
99 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
100 ((dbuf)->db.db_object == (obj) && \
101 (dbuf)->db_objset == (os) && \
102 (dbuf)->db_level == (level) && \
103 (dbuf)->db_blkid == (blkid))
104
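/*
 * Example (for illustration): dbuf_find() below computes
 * hv = DBUF_HASH(os, obj, level, blkid) and selects a bucket with
 * idx = hv & h->hash_table_mask.  At the minimum table size of 2^16
 * buckets (see dbuf_init()), the mask is 0xffff, so only the low 16 bits
 * of the 64-bit hash pick the bucket; collisions are chained through
 * db_hash_next and resolved with DBUF_EQUAL().
 */
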
105 dmu_buf_impl_t *
106 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
107 {
108 dbuf_hash_table_t *h = &dbuf_hash_table;
109 objset_t *os = dn->dn_objset;
110 uint64_t obj;
111 uint64_t hv;
112 uint64_t idx;
113 dmu_buf_impl_t *db;
114
115 obj = dn->dn_object;
116 hv = DBUF_HASH(os, obj, level, blkid);
117 idx = hv & h->hash_table_mask;
118
119 mutex_enter(DBUF_HASH_MUTEX(h, idx));
120 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
121 if (DBUF_EQUAL(db, os, obj, level, blkid)) {
122 mutex_enter(&db->db_mtx);
123 if (db->db_state != DB_EVICTING) {
124 mutex_exit(DBUF_HASH_MUTEX(h, idx));
125 return (db);
126 }
127 mutex_exit(&db->db_mtx);
128 }
129 }
130 mutex_exit(DBUF_HASH_MUTEX(h, idx));
131 return (NULL);
132 }
133
134 /*
135 * Insert an entry into the hash table. If there is already an element
136 * equal to elem in the hash table, then the already existing element
137 * will be returned and the new element will not be inserted.
138 * Otherwise returns NULL.
139 */
140 static dmu_buf_impl_t *
141 dbuf_hash_insert(dmu_buf_impl_t *db)
142 {
143 dbuf_hash_table_t *h = &dbuf_hash_table;
144 objset_t *os = db->db_objset;
145 uint64_t obj = db->db.db_object;
146 int level = db->db_level;
147 uint64_t blkid, hv, idx;
148 dmu_buf_impl_t *dbf;
149
150 blkid = db->db_blkid;
151 hv = DBUF_HASH(os, obj, level, blkid);
152 idx = hv & h->hash_table_mask;
153
154 mutex_enter(DBUF_HASH_MUTEX(h, idx));
155 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
156 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
157 mutex_enter(&dbf->db_mtx);
158 if (dbf->db_state != DB_EVICTING) {
159 mutex_exit(DBUF_HASH_MUTEX(h, idx));
160 return (dbf);
161 }
162 mutex_exit(&dbf->db_mtx);
163 }
164 }
165
166 mutex_enter(&db->db_mtx);
167 db->db_hash_next = h->hash_table[idx];
168 h->hash_table[idx] = db;
169 mutex_exit(DBUF_HASH_MUTEX(h, idx));
170 atomic_add_64(&dbuf_hash_count, 1);
171
172 return (NULL);
173 }
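/*
 * Example caller (for illustration; this mirrors dbuf_create() below).
 * The dbuf is inserted while still marked DB_EVICTING; a non-NULL return
 * means another thread inserted an equal dbuf first, so the caller frees
 * its copy and uses the winner:
 *
 *	if ((odb = dbuf_hash_insert(db)) != NULL) {
 *		kmem_cache_free(dbuf_cache, db);
 *		return (odb);
 *	}
 */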
174
175 /*
176 * Remove an entry from the hash table. This operation will
177 * fail if there are any existing holds on the db.
178 */
179 static void
180 dbuf_hash_remove(dmu_buf_impl_t *db)
181 {
182 dbuf_hash_table_t *h = &dbuf_hash_table;
183 uint64_t hv, idx;
184 dmu_buf_impl_t *dbf, **dbp;
185
186 hv = DBUF_HASH(db->db_objset, db->db.db_object,
187 db->db_level, db->db_blkid);
188 idx = hv & h->hash_table_mask;
189
190 /*
191 * We mustn't hold db_mtx to maintain the lock ordering:
192 * DBUF_HASH_MUTEX > db_mtx.
193 */
194 ASSERT(refcount_is_zero(&db->db_holds));
195 ASSERT(db->db_state == DB_EVICTING);
196 ASSERT(!MUTEX_HELD(&db->db_mtx));
197
198 mutex_enter(DBUF_HASH_MUTEX(h, idx));
199 dbp = &h->hash_table[idx];
200 while ((dbf = *dbp) != db) {
201 dbp = &dbf->db_hash_next;
202 ASSERT(dbf != NULL);
203 }
204 *dbp = db->db_hash_next;
205 db->db_hash_next = NULL;
206 mutex_exit(DBUF_HASH_MUTEX(h, idx));
207 atomic_add_64(&dbuf_hash_count, -1);
208 }
209
210 static arc_evict_func_t dbuf_do_evict;
211
212 static void
213 dbuf_evict_user(dmu_buf_impl_t *db)
214 {
215 ASSERT(MUTEX_HELD(&db->db_mtx));
216
217 if (db->db_level != 0 || db->db_evict_func == NULL)
218 return;
219
220 if (db->db_user_data_ptr_ptr)
221 *db->db_user_data_ptr_ptr = db->db.db_data;
222 db->db_evict_func(&db->db, db->db_user_ptr);
223 db->db_user_ptr = NULL;
224 db->db_user_data_ptr_ptr = NULL;
225 db->db_evict_func = NULL;
226 }
227
228 boolean_t
229 dbuf_is_metadata(dmu_buf_impl_t *db)
230 {
231 if (db->db_level > 0) {
232 return (B_TRUE);
233 } else {
234 boolean_t is_metadata;
235
236 DB_DNODE_ENTER(db);
237 is_metadata = dmu_ot[DB_DNODE(db)->dn_type].ot_metadata;
238 DB_DNODE_EXIT(db);
239
240 return (is_metadata);
241 }
242 }
243
244 void
245 dbuf_evict(dmu_buf_impl_t *db)
246 {
247 ASSERT(MUTEX_HELD(&db->db_mtx));
248 ASSERT(db->db_buf == NULL);
249 ASSERT(db->db_data_pending == NULL);
250
251 dbuf_clear(db);
252 dbuf_destroy(db);
253 }
254
255 void
256 dbuf_init(void)
257 {
258 uint64_t hsize = 1ULL << 16;
259 dbuf_hash_table_t *h = &dbuf_hash_table;
260 int i;
261
262 /*
263 * The hash table is big enough to fill all of physical memory
264 * with an average 4K block size. The table will take up
265 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
266 */
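/*
 * Worked example (for illustration): with 8 GiB of physical memory the
 * loop below doubles hsize until hsize * 4096 >= 8 GiB, i.e. hsize =
 * 2^21 buckets.  With 8-byte pointers the table then occupies
 * 2^21 * 8 = 16 MiB, matching the ~2 MB per GB estimate above.
 */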
267 while (hsize * 4096 < physmem * PAGESIZE)
268 hsize <<= 1;
269
270 retry:
271 h->hash_table_mask = hsize - 1;
272 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
273 if (h->hash_table == NULL) {
274 /* XXX - we should really return an error instead of assert */
275 ASSERT(hsize > (1ULL << 10));
276 hsize >>= 1;
277 goto retry;
278 }
279
280 dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
281 sizeof (dmu_buf_impl_t),
282 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
283
284 for (i = 0; i < DBUF_MUTEXES; i++)
285 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
286 }
287
288 void
289 dbuf_fini(void)
290 {
291 dbuf_hash_table_t *h = &dbuf_hash_table;
292 int i;
293
294 for (i = 0; i < DBUF_MUTEXES; i++)
295 mutex_destroy(&h->hash_mutexes[i]);
296 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
297 kmem_cache_destroy(dbuf_cache);
298 }
299
300 /*
301 * Other stuff.
302 */
303
304 #ifdef ZFS_DEBUG
305 static void
306 dbuf_verify(dmu_buf_impl_t *db)
307 {
308 dnode_t *dn;
309 dbuf_dirty_record_t *dr;
310
311 ASSERT(MUTEX_HELD(&db->db_mtx));
312
313 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
314 return;
315
316 ASSERT(db->db_objset != NULL);
317 DB_DNODE_ENTER(db);
318 dn = DB_DNODE(db);
319 if (dn == NULL) {
320 ASSERT(db->db_parent == NULL);
321 ASSERT(db->db_blkptr == NULL);
322 } else {
323 ASSERT3U(db->db.db_object, ==, dn->dn_object);
324 ASSERT3P(db->db_objset, ==, dn->dn_objset);
325 ASSERT3U(db->db_level, <, dn->dn_nlevels);
326 ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
327 db->db_blkid == DMU_SPILL_BLKID ||
328 !list_is_empty(&dn->dn_dbufs));
329 }
330 if (db->db_blkid == DMU_BONUS_BLKID) {
331 ASSERT(dn != NULL);
332 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
333 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
334 } else if (db->db_blkid == DMU_SPILL_BLKID) {
335 ASSERT(dn != NULL);
336 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
337 ASSERT3U(db->db.db_offset, ==, 0);
338 } else {
339 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
340 }
341
342 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
343 ASSERT(dr->dr_dbuf == db);
344
345 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
346 ASSERT(dr->dr_dbuf == db);
347
348 /*
349 * We can't assert that db_size matches dn_datablksz because it
350 * can be momentarily different when another thread is doing
351 * dnode_set_blksz().
352 */
353 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
354 dr = db->db_data_pending;
355 /*
356 * It should only be modified in syncing context, so
357 * make sure we only have one copy of the data.
358 */
359 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
360 }
361
362 /* verify db->db_blkptr */
363 if (db->db_blkptr) {
364 if (db->db_parent == dn->dn_dbuf) {
365 /* db is pointed to by the dnode */
366 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
367 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
368 ASSERT(db->db_parent == NULL);
369 else
370 ASSERT(db->db_parent != NULL);
371 if (db->db_blkid != DMU_SPILL_BLKID)
372 ASSERT3P(db->db_blkptr, ==,
373 &dn->dn_phys->dn_blkptr[db->db_blkid]);
374 } else {
375 /* db is pointed to by an indirect block */
376 int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
377 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
378 ASSERT3U(db->db_parent->db.db_object, ==,
379 db->db.db_object);
380 /*
381 * dnode_grow_indblksz() can make this fail if we don't
382 * have the struct_rwlock. XXX indblksz no longer
383 * grows. safe to do this now?
384 */
385 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
386 ASSERT3P(db->db_blkptr, ==,
387 ((blkptr_t *)db->db_parent->db.db_data +
388 db->db_blkid % epb));
389 }
390 }
391 }
392 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
393 (db->db_buf == NULL || db->db_buf->b_data) &&
394 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
395 db->db_state != DB_FILL && !dn->dn_free_txg) {
396 /*
397 * If the blkptr isn't set but they have nonzero data,
398 * it had better be dirty, otherwise we'll lose that
399 * data when we evict this buffer.
400 */
401 if (db->db_dirtycnt == 0) {
402 uint64_t *buf = db->db.db_data;
403 int i;
404
405 for (i = 0; i < db->db.db_size >> 3; i++) {
406 ASSERT(buf[i] == 0);
407 }
408 }
409 }
410 DB_DNODE_EXIT(db);
411 }
412 #endif
413
414 static void
415 dbuf_update_data(dmu_buf_impl_t *db)
416 {
417 ASSERT(MUTEX_HELD(&db->db_mtx));
418 if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
419 ASSERT(!refcount_is_zero(&db->db_holds));
420 *db->db_user_data_ptr_ptr = db->db.db_data;
421 }
422 }
423
424 static void
425 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
426 {
427 ASSERT(MUTEX_HELD(&db->db_mtx));
428 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
429 db->db_buf = buf;
430 if (buf != NULL) {
431 ASSERT(buf->b_data != NULL);
432 db->db.db_data = buf->b_data;
433 if (!arc_released(buf))
434 arc_set_callback(buf, dbuf_do_evict, db);
435 dbuf_update_data(db);
436 } else {
437 dbuf_evict_user(db);
438 db->db.db_data = NULL;
439 if (db->db_state != DB_NOFILL)
440 db->db_state = DB_UNCACHED;
441 }
442 }
443
444 /*
445 * Loan out an arc_buf for read. Return the loaned arc_buf.
446 */
447 arc_buf_t *
448 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
449 {
450 arc_buf_t *abuf;
451
452 mutex_enter(&db->db_mtx);
453 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
454 int blksz = db->db.db_size;
455 spa_t *spa;
456
457 mutex_exit(&db->db_mtx);
458 DB_GET_SPA(&spa, db);
459 abuf = arc_loan_buf(spa, blksz);
460 bcopy(db->db.db_data, abuf->b_data, blksz);
461 } else {
462 abuf = db->db_buf;
463 arc_loan_inuse_buf(abuf, db);
464 dbuf_set_data(db, NULL);
465 mutex_exit(&db->db_mtx);
466 }
467 return (abuf);
468 }
469
470 uint64_t
471 dbuf_whichblock(dnode_t *dn, uint64_t offset)
472 {
473 if (dn->dn_datablkshift) {
474 return (offset >> dn->dn_datablkshift);
475 } else {
476 ASSERT3U(offset, <, dn->dn_datablksz);
477 return (0);
478 }
479 }
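/*
 * Example (for illustration): an object using 128 KiB data blocks has
 * dn_datablkshift == 17, so offset 300000 maps to block 300000 >> 17 = 2.
 * When dn_datablkshift is zero the block size is not a power of two,
 * which only happens for single-block objects, hence the assertion and
 * the constant return of block 0.
 */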
480
481 static void
482 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
483 {
484 dmu_buf_impl_t *db = vdb;
485
486 mutex_enter(&db->db_mtx);
487 ASSERT3U(db->db_state, ==, DB_READ);
488 /*
489 * All reads are synchronous, so we must have a hold on the dbuf
490 */
491 ASSERT(refcount_count(&db->db_holds) > 0);
492 ASSERT(db->db_buf == NULL);
493 ASSERT(db->db.db_data == NULL);
494 if (db->db_level == 0 && db->db_freed_in_flight) {
495 /* we were freed in flight; disregard any error */
496 arc_release(buf, db);
497 bzero(buf->b_data, db->db.db_size);
498 arc_buf_freeze(buf);
499 db->db_freed_in_flight = FALSE;
500 dbuf_set_data(db, buf);
501 db->db_state = DB_CACHED;
502 } else if (zio == NULL || zio->io_error == 0) {
503 dbuf_set_data(db, buf);
504 db->db_state = DB_CACHED;
505 } else {
506 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
507 ASSERT3P(db->db_buf, ==, NULL);
508 VERIFY(arc_buf_remove_ref(buf, db) == 1);
509 db->db_state = DB_UNCACHED;
510 }
511 cv_broadcast(&db->db_changed);
512 dbuf_rele_and_unlock(db, NULL);
513 }
514
515 static void
516 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
517 {
518 dnode_t *dn;
519 spa_t *spa;
520 zbookmark_t zb;
521 uint32_t aflags = ARC_NOWAIT;
522 arc_buf_t *pbuf;
523
524 DB_DNODE_ENTER(db);
525 dn = DB_DNODE(db);
526 ASSERT(!refcount_is_zero(&db->db_holds));
527 /* We need the struct_rwlock to prevent db_blkptr from changing. */
528 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
529 ASSERT(MUTEX_HELD(&db->db_mtx));
530 ASSERT(db->db_state == DB_UNCACHED);
531 ASSERT(db->db_buf == NULL);
532
533 if (db->db_blkid == DMU_BONUS_BLKID) {
534 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
535
536 ASSERT3U(bonuslen, <=, db->db.db_size);
537 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
538 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
539 if (bonuslen < DN_MAX_BONUSLEN)
540 bzero(db->db.db_data, DN_MAX_BONUSLEN);
541 if (bonuslen)
542 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
543 DB_DNODE_EXIT(db);
544 dbuf_update_data(db);
545 db->db_state = DB_CACHED;
546 mutex_exit(&db->db_mtx);
547 return;
548 }
549
550 /*
551 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
552 * processes the delete record and clears the bp while we are waiting
553 * for the dn_mtx (resulting in a "no" from block_freed).
554 */
555 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
556 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
557 BP_IS_HOLE(db->db_blkptr)))) {
558 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
559
560 dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
561 db->db.db_size, db, type));
562 DB_DNODE_EXIT(db);
563 bzero(db->db.db_data, db->db.db_size);
564 db->db_state = DB_CACHED;
565 *flags |= DB_RF_CACHED;
566 mutex_exit(&db->db_mtx);
567 return;
568 }
569
570 spa = dn->dn_objset->os_spa;
571 DB_DNODE_EXIT(db);
572
573 db->db_state = DB_READ;
574 mutex_exit(&db->db_mtx);
575
576 if (DBUF_IS_L2CACHEABLE(db))
577 aflags |= ARC_L2CACHE;
578
579 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
580 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
581 db->db.db_object, db->db_level, db->db_blkid);
582
583 dbuf_add_ref(db, NULL);
584 /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
585
586 if (db->db_parent)
587 pbuf = db->db_parent->db_buf;
588 else
589 pbuf = db->db_objset->os_phys_buf;
590
591 (void) dsl_read(zio, spa, db->db_blkptr, pbuf,
592 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
593 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
594 &aflags, &zb);
595 if (aflags & ARC_CACHED)
596 *flags |= DB_RF_CACHED;
597 }
598
599 int
600 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
601 {
602 int err = 0;
603 int havepzio = (zio != NULL);
604 int prefetch;
605 dnode_t *dn;
606
607 /*
608 * We don't have to hold the mutex to check db_state because it
609 * can't be freed while we have a hold on the buffer.
610 */
611 ASSERT(!refcount_is_zero(&db->db_holds));
612
613 if (db->db_state == DB_NOFILL)
614 return (EIO);
615
616 DB_DNODE_ENTER(db);
617 dn = DB_DNODE(db);
618 if ((flags & DB_RF_HAVESTRUCT) == 0)
619 rw_enter(&dn->dn_struct_rwlock, RW_READER);
620
621 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
622 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
623 DBUF_IS_CACHEABLE(db);
624
625 mutex_enter(&db->db_mtx);
626 if (db->db_state == DB_CACHED) {
627 mutex_exit(&db->db_mtx);
628 if (prefetch)
629 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
630 db->db.db_size, TRUE);
631 if ((flags & DB_RF_HAVESTRUCT) == 0)
632 rw_exit(&dn->dn_struct_rwlock);
633 DB_DNODE_EXIT(db);
634 } else if (db->db_state == DB_UNCACHED) {
635 spa_t *spa = dn->dn_objset->os_spa;
636
637 if (zio == NULL)
638 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
639 dbuf_read_impl(db, zio, &flags);
640
641 /* dbuf_read_impl has dropped db_mtx for us */
642
643 if (prefetch)
644 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
645 db->db.db_size, flags & DB_RF_CACHED);
646
647 if ((flags & DB_RF_HAVESTRUCT) == 0)
648 rw_exit(&dn->dn_struct_rwlock);
649 DB_DNODE_EXIT(db);
650
651 if (!havepzio)
652 err = zio_wait(zio);
653 } else {
654 mutex_exit(&db->db_mtx);
655 if (prefetch)
656 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
657 db->db.db_size, TRUE);
658 if ((flags & DB_RF_HAVESTRUCT) == 0)
659 rw_exit(&dn->dn_struct_rwlock);
660 DB_DNODE_EXIT(db);
661
662 mutex_enter(&db->db_mtx);
663 if ((flags & DB_RF_NEVERWAIT) == 0) {
664 while (db->db_state == DB_READ ||
665 db->db_state == DB_FILL) {
666 ASSERT(db->db_state == DB_READ ||
667 (flags & DB_RF_HAVESTRUCT) == 0);
668 cv_wait(&db->db_changed, &db->db_mtx);
669 }
670 if (db->db_state == DB_UNCACHED)
671 err = EIO;
672 }
673 mutex_exit(&db->db_mtx);
674 }
675
676 ASSERT(err || havepzio || db->db_state == DB_CACHED);
677 return (err);
678 }
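/*
 * Example caller (for illustration; see dbuf_will_dirty() below): a read
 * that must succeed and should not trigger prefetch passes
 * DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, and adds DB_RF_HAVESTRUCT when
 * the caller already holds dn_struct_rwlock so that dbuf_read() does not
 * try to acquire it again.
 */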
679
680 static void
681 dbuf_noread(dmu_buf_impl_t *db)
682 {
683 ASSERT(!refcount_is_zero(&db->db_holds));
684 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
685 mutex_enter(&db->db_mtx);
686 while (db->db_state == DB_READ || db->db_state == DB_FILL)
687 cv_wait(&db->db_changed, &db->db_mtx);
688 if (db->db_state == DB_UNCACHED) {
689 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
690 spa_t *spa;
691
692 ASSERT(db->db_buf == NULL);
693 ASSERT(db->db.db_data == NULL);
694 DB_GET_SPA(&spa, db);
695 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
696 db->db_state = DB_FILL;
697 } else if (db->db_state == DB_NOFILL) {
698 dbuf_set_data(db, NULL);
699 } else {
700 ASSERT3U(db->db_state, ==, DB_CACHED);
701 }
702 mutex_exit(&db->db_mtx);
703 }
704
705 /*
706 * This is our just-in-time copy function. It makes a copy of
707 * buffers that have been modified in a previous transaction
708 * group, before we modify them in the current active group.
709 *
710 * This function is used in two places: when we are dirtying a
711 * buffer for the first time in a txg, and when we are freeing
712 * a range in a dnode that includes this buffer.
713 *
714 * Note that when we are called from dbuf_free_range() we do
715 * not put a hold on the buffer, we just traverse the active
716 * dbuf list for the dnode.
717 */
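/*
 * Concrete scenario (for illustration): a bonus buffer dirtied in txg 10
 * is dirtied again in txg 11 before txg 10 has synced.  The txg-10 dirty
 * record still points at db.db_data, so the code below allocates a
 * private zio_buf, copies the current contents into it, and repoints
 * dr_data at that copy; the caller is then free to modify db.db_data for
 * txg 11 without corrupting what txg 10 will write out.
 */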
718 static void
719 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
720 {
721 dbuf_dirty_record_t *dr = db->db_last_dirty;
722
723 ASSERT(MUTEX_HELD(&db->db_mtx));
724 ASSERT(db->db.db_data != NULL);
725 ASSERT(db->db_level == 0);
726 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
727
728 if (dr == NULL ||
729 (dr->dt.dl.dr_data !=
730 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
731 return;
732
733 /*
734 * If the last dirty record for this dbuf has not yet synced
735 * and it is referencing the dbuf data, either:
736 * reset the reference to point to a new copy,
737 * or (if there are no active holders)
738 * just null out the current db_data pointer.
739 */
740 ASSERT(dr->dr_txg >= txg - 2);
741 if (db->db_blkid == DMU_BONUS_BLKID) {
742 /* Note that the data bufs here are zio_bufs */
743 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
744 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
745 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
746 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
747 int size = db->db.db_size;
748 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
749 spa_t *spa;
750
751 DB_GET_SPA(&spa, db);
752 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
753 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
754 } else {
755 dbuf_set_data(db, NULL);
756 }
757 }
758
759 void
760 dbuf_unoverride(dbuf_dirty_record_t *dr)
761 {
762 dmu_buf_impl_t *db = dr->dr_dbuf;
763 blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
764 uint64_t txg = dr->dr_txg;
765
766 ASSERT(MUTEX_HELD(&db->db_mtx));
767 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
768 ASSERT(db->db_level == 0);
769
770 if (db->db_blkid == DMU_BONUS_BLKID ||
771 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
772 return;
773
774 ASSERT(db->db_data_pending != dr);
775
776 /* free this block */
777 if (!BP_IS_HOLE(bp)) {
778 spa_t *spa;
779
780 DB_GET_SPA(&spa, db);
781 zio_free(spa, txg, bp);
782 }
783 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
784 /*
785 * Release the already-written buffer, so we leave it in
786 * a consistent dirty state. Note that all callers are
787 * modifying the buffer, so they will immediately do
788 * another (redundant) arc_release(). Therefore, leave
789 * the buf thawed to save the effort of freezing &
790 * immediately re-thawing it.
791 */
792 arc_release(dr->dt.dl.dr_data, db);
793 }
794
795 /*
796 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
797 * data blocks in the free range, so that any future readers will find
798 * empty blocks. Also, if we happen across any level-1 dbufs in the
799 * range that have not already been marked dirty, mark them dirty so
800 * they stay in memory.
801 */
802 void
803 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
804 {
805 dmu_buf_impl_t *db, *db_next;
806 uint64_t txg = tx->tx_txg;
807 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
808 uint64_t first_l1 = start >> epbs;
809 uint64_t last_l1 = end >> epbs;
810
811 if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
812 end = dn->dn_maxblkid;
813 last_l1 = end >> epbs;
814 }
815 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
816 mutex_enter(&dn->dn_dbufs_mtx);
817 for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
818 db_next = list_next(&dn->dn_dbufs, db);
819 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
820
821 if (db->db_level == 1 &&
822 db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
823 mutex_enter(&db->db_mtx);
824 if (db->db_last_dirty &&
825 db->db_last_dirty->dr_txg < txg) {
826 dbuf_add_ref(db, FTAG);
827 mutex_exit(&db->db_mtx);
828 dbuf_will_dirty(db, tx);
829 dbuf_rele(db, FTAG);
830 } else {
831 mutex_exit(&db->db_mtx);
832 }
833 }
834
835 if (db->db_level != 0)
836 continue;
837 dprintf_dbuf(db, "found buf %s\n", "");
838 if (db->db_blkid < start || db->db_blkid > end)
839 continue;
840
841 /* found a level 0 buffer in the range */
842 if (dbuf_undirty(db, tx))
843 continue;
844
845 mutex_enter(&db->db_mtx);
846 if (db->db_state == DB_UNCACHED ||
847 db->db_state == DB_NOFILL ||
848 db->db_state == DB_EVICTING) {
849 ASSERT(db->db.db_data == NULL);
850 mutex_exit(&db->db_mtx);
851 continue;
852 }
853 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
854 /* will be handled in dbuf_read_done or dbuf_rele */
855 db->db_freed_in_flight = TRUE;
856 mutex_exit(&db->db_mtx);
857 continue;
858 }
859 if (refcount_count(&db->db_holds) == 0) {
860 ASSERT(db->db_buf);
861 dbuf_clear(db);
862 continue;
863 }
864 /* The dbuf is referenced */
865
866 if (db->db_last_dirty != NULL) {
867 dbuf_dirty_record_t *dr = db->db_last_dirty;
868
869 if (dr->dr_txg == txg) {
870 /*
871 * This buffer is "in-use", re-adjust the file
872 * size to reflect that this buffer may
873 * contain new data when we sync.
874 */
875 if (db->db_blkid != DMU_SPILL_BLKID &&
876 db->db_blkid > dn->dn_maxblkid)
877 dn->dn_maxblkid = db->db_blkid;
878 dbuf_unoverride(dr);
879 } else {
880 /*
881 * This dbuf is not dirty in the open context.
882 * Either uncache it (if it's not referenced in
883 * the open context) or reset its contents to
884 * empty.
885 */
886 dbuf_fix_old_data(db, txg);
887 }
888 }
889 /* clear the contents if it's cached */
890 if (db->db_state == DB_CACHED) {
891 ASSERT(db->db.db_data != NULL);
892 arc_release(db->db_buf, db);
893 bzero(db->db.db_data, db->db.db_size);
894 arc_buf_freeze(db->db_buf);
895 }
896
897 mutex_exit(&db->db_mtx);
898 }
899 mutex_exit(&dn->dn_dbufs_mtx);
900 }
901
902 static int
903 dbuf_block_freeable(dmu_buf_impl_t *db)
904 {
905 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
906 uint64_t birth_txg = 0;
907
908 /*
909 * We don't need any locking to protect db_blkptr:
910 * If it's syncing, then db_last_dirty will be set
911 * so we'll ignore db_blkptr.
912 */
913 ASSERT(MUTEX_HELD(&db->db_mtx));
914 if (db->db_last_dirty)
915 birth_txg = db->db_last_dirty->dr_txg;
916 else if (db->db_blkptr)
917 birth_txg = db->db_blkptr->blk_birth;
918
919 /*
920 * If we don't exist or are in a snapshot, we can't be freed.
921 * Don't pass the bp to dsl_dataset_block_freeable() since we
922 * are holding the db_mtx lock and might deadlock if we are
923 * prefetching a dedup-ed block.
924 */
925 if (birth_txg)
926 return (ds == NULL ||
927 dsl_dataset_block_freeable(ds, NULL, birth_txg));
928 else
929 return (FALSE);
930 }
931
932 void
933 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
934 {
935 arc_buf_t *buf, *obuf;
936 int osize = db->db.db_size;
937 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
938 dnode_t *dn;
939
940 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
941
942 DB_DNODE_ENTER(db);
943 dn = DB_DNODE(db);
944
945 /* XXX does *this* func really need the lock? */
946 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
947
948 /*
949 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
950 * is OK, because there can be no other references to the db
951 * when we are changing its size, so no concurrent DB_FILL can
952 * be happening.
953 */
954 /*
955 * XXX we should be doing a dbuf_read, checking the return
956 * value and returning that up to our callers
957 */
958 dbuf_will_dirty(db, tx);
959
960 /* create the data buffer for the new block */
961 buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
962
963 /* copy old block data to the new block */
964 obuf = db->db_buf;
965 bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
966 /* zero the remainder */
967 if (size > osize)
968 bzero((uint8_t *)buf->b_data + osize, size - osize);
969
970 mutex_enter(&db->db_mtx);
971 dbuf_set_data(db, buf);
972 VERIFY(arc_buf_remove_ref(obuf, db) == 1);
973 db->db.db_size = size;
974
975 if (db->db_level == 0) {
976 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
977 db->db_last_dirty->dt.dl.dr_data = buf;
978 }
979 mutex_exit(&db->db_mtx);
980
981 dnode_willuse_space(dn, size-osize, tx);
982 DB_DNODE_EXIT(db);
983 }
984
985 void
986 dbuf_release_bp(dmu_buf_impl_t *db)
987 {
988 objset_t *os;
989 zbookmark_t zb;
990
991 DB_GET_OBJSET(&os, db);
992 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
993 ASSERT(arc_released(os->os_phys_buf) ||
994 list_link_active(&os->os_dsl_dataset->ds_synced_link));
995 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
996
997 zb.zb_objset = os->os_dsl_dataset ?
998 os->os_dsl_dataset->ds_object : 0;
999 zb.zb_object = db->db.db_object;
1000 zb.zb_level = db->db_level;
1001 zb.zb_blkid = db->db_blkid;
1002 (void) arc_release_bp(db->db_buf, db,
1003 db->db_blkptr, os->os_spa, &zb);
1004 }
1005
1006 dbuf_dirty_record_t *
1007 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1008 {
1009 dnode_t *dn;
1010 objset_t *os;
1011 dbuf_dirty_record_t **drp, *dr;
1012 int drop_struct_lock = FALSE;
1013 boolean_t do_free_accounting = B_FALSE;
1014 int txgoff = tx->tx_txg & TXG_MASK;
1015
1016 ASSERT(tx->tx_txg != 0);
1017 ASSERT(!refcount_is_zero(&db->db_holds));
1018 DMU_TX_DIRTY_BUF(tx, db);
1019
1020 DB_DNODE_ENTER(db);
1021 dn = DB_DNODE(db);
1022 /*
1023 * Shouldn't dirty a regular buffer in syncing context. Private
1024 * objects may be dirtied in syncing context, but only if they
1025 * were already pre-dirtied in open context.
1026 */
1027 ASSERT(!dmu_tx_is_syncing(tx) ||
1028 BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1029 DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1030 dn->dn_objset->os_dsl_dataset == NULL);
1031 /*
1032 * We make this assert for private objects as well, but after we
1033 * check if we're already dirty. They are allowed to re-dirty
1034 * in syncing context.
1035 */
1036 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1037 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1038 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1039
1040 mutex_enter(&db->db_mtx);
1041 /*
1042 * XXX make this true for indirects too? The problem is that
1043 * transactions created with dmu_tx_create_assigned() from
1044 * syncing context don't bother holding ahead.
1045 */
1046 ASSERT(db->db_level != 0 ||
1047 db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1048 db->db_state == DB_NOFILL);
1049
1050 mutex_enter(&dn->dn_mtx);
1051 /*
1052 * Don't set dirtyctx to SYNC if we're just modifying this as we
1053 * initialize the objset.
1054 */
1055 if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1056 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1057 dn->dn_dirtyctx =
1058 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1059 ASSERT(dn->dn_dirtyctx_firstset == NULL);
1060 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1061 }
1062 mutex_exit(&dn->dn_mtx);
1063
1064 if (db->db_blkid == DMU_SPILL_BLKID)
1065 dn->dn_have_spill = B_TRUE;
1066
1067 /*
1068 * If this buffer is already dirty, we're done.
1069 */
1070 drp = &db->db_last_dirty;
1071 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1072 db->db.db_object == DMU_META_DNODE_OBJECT);
1073 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1074 drp = &dr->dr_next;
1075 if (dr && dr->dr_txg == tx->tx_txg) {
1076 DB_DNODE_EXIT(db);
1077
1078 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1079 /*
1080 * If this buffer has already been written out,
1081 * we now need to reset its state.
1082 */
1083 dbuf_unoverride(dr);
1084 if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1085 db->db_state != DB_NOFILL)
1086 arc_buf_thaw(db->db_buf);
1087 }
1088 mutex_exit(&db->db_mtx);
1089 return (dr);
1090 }
1091
1092 /*
1093 * Only valid if not already dirty.
1094 */
1095 ASSERT(dn->dn_object == 0 ||
1096 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1097 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1098
1099 ASSERT3U(dn->dn_nlevels, >, db->db_level);
1100 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1101 dn->dn_phys->dn_nlevels > db->db_level ||
1102 dn->dn_next_nlevels[txgoff] > db->db_level ||
1103 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1104 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1105
1106 /*
1107 * We should only be dirtying in syncing context if it's the
1108 * mos or we're initializing the os or it's a special object.
1109 * However, we are allowed to dirty in syncing context provided
1110 * we already dirtied it in open context. Hence we must make
1111 * this assertion only if we're not already dirty.
1112 */
1113 os = dn->dn_objset;
1114 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1115 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1116 ASSERT(db->db.db_size != 0);
1117
1118 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1119
1120 if (db->db_blkid != DMU_BONUS_BLKID) {
1121 /*
1122 * Update the accounting.
1123 * Note: we delay "free accounting" until after we drop
1124 * the db_mtx. This keeps us from grabbing other locks
1125 * (and possibly deadlocking) in bp_get_dsize() while
1126 * also holding the db_mtx.
1127 */
1128 dnode_willuse_space(dn, db->db.db_size, tx);
1129 do_free_accounting = dbuf_block_freeable(db);
1130 }
1131
1132 /*
1133 * If this buffer is dirty in an old transaction group we need
1134 * to make a copy of it so that the changes we make in this
1135 * transaction group won't leak out when we sync the older txg.
1136 */
1137 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
1138 if (db->db_level == 0) {
1139 void *data_old = db->db_buf;
1140
1141 if (db->db_state != DB_NOFILL) {
1142 if (db->db_blkid == DMU_BONUS_BLKID) {
1143 dbuf_fix_old_data(db, tx->tx_txg);
1144 data_old = db->db.db_data;
1145 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1146 /*
1147 * Release the data buffer from the cache so
1148 * that we can modify it without impacting
1149 * possible other users of this cached data
1150 * block. Note that indirect blocks and
1151 * private objects are not released until the
1152 * syncing state (since they are only modified
1153 * then).
1154 */
1155 arc_release(db->db_buf, db);
1156 dbuf_fix_old_data(db, tx->tx_txg);
1157 data_old = db->db_buf;
1158 }
1159 ASSERT(data_old != NULL);
1160 }
1161 dr->dt.dl.dr_data = data_old;
1162 } else {
1163 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1164 list_create(&dr->dt.di.dr_children,
1165 sizeof (dbuf_dirty_record_t),
1166 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1167 }
1168 dr->dr_dbuf = db;
1169 dr->dr_txg = tx->tx_txg;
1170 dr->dr_next = *drp;
1171 *drp = dr;
1172
1173 /*
1174 * We could have been freed_in_flight between the dbuf_noread
1175 * and dbuf_dirty. We win, as though the dbuf_noread() had
1176 * happened after the free.
1177 */
1178 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1179 db->db_blkid != DMU_SPILL_BLKID) {
1180 mutex_enter(&dn->dn_mtx);
1181 dnode_clear_range(dn, db->db_blkid, 1, tx);
1182 mutex_exit(&dn->dn_mtx);
1183 db->db_freed_in_flight = FALSE;
1184 }
1185
1186 /*
1187 * This buffer is now part of this txg
1188 */
1189 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1190 db->db_dirtycnt += 1;
1191 ASSERT3U(db->db_dirtycnt, <=, 3);
1192
1193 mutex_exit(&db->db_mtx);
1194
1195 if (db->db_blkid == DMU_BONUS_BLKID ||
1196 db->db_blkid == DMU_SPILL_BLKID) {
1197 mutex_enter(&dn->dn_mtx);
1198 ASSERT(!list_link_active(&dr->dr_dirty_node));
1199 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1200 mutex_exit(&dn->dn_mtx);
1201 dnode_setdirty(dn, tx);
1202 DB_DNODE_EXIT(db);
1203 return (dr);
1204 } else if (do_free_accounting) {
1205 blkptr_t *bp = db->db_blkptr;
1206 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1207 bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1208 /*
1209 * This is only a guess -- if the dbuf is dirty
1210 * in a previous txg, we don't know how much
1211 * space it will use on disk yet. We should
1212 * really have the struct_rwlock to access
1213 * db_blkptr, but since this is just a guess,
1214 * it's OK if we get an odd answer.
1215 */
1216 ddt_prefetch(os->os_spa, bp);
1217 dnode_willuse_space(dn, -willfree, tx);
1218 }
1219
1220 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1221 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1222 drop_struct_lock = TRUE;
1223 }
1224
1225 if (db->db_level == 0) {
1226 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1227 ASSERT(dn->dn_maxblkid >= db->db_blkid);
1228 }
1229
1230 if (db->db_level+1 < dn->dn_nlevels) {
1231 dmu_buf_impl_t *parent = db->db_parent;
1232 dbuf_dirty_record_t *di;
1233 int parent_held = FALSE;
1234
1235 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1236 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1237
1238 parent = dbuf_hold_level(dn, db->db_level+1,
1239 db->db_blkid >> epbs, FTAG);
1240 ASSERT(parent != NULL);
1241 parent_held = TRUE;
1242 }
1243 if (drop_struct_lock)
1244 rw_exit(&dn->dn_struct_rwlock);
1245 ASSERT3U(db->db_level+1, ==, parent->db_level);
1246 di = dbuf_dirty(parent, tx);
1247 if (parent_held)
1248 dbuf_rele(parent, FTAG);
1249
1250 mutex_enter(&db->db_mtx);
1251 /* possible race with dbuf_undirty() */
1252 if (db->db_last_dirty == dr ||
1253 dn->dn_object == DMU_META_DNODE_OBJECT) {
1254 mutex_enter(&di->dt.di.dr_mtx);
1255 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1256 ASSERT(!list_link_active(&dr->dr_dirty_node));
1257 list_insert_tail(&di->dt.di.dr_children, dr);
1258 mutex_exit(&di->dt.di.dr_mtx);
1259 dr->dr_parent = di;
1260 }
1261 mutex_exit(&db->db_mtx);
1262 } else {
1263 ASSERT(db->db_level+1 == dn->dn_nlevels);
1264 ASSERT(db->db_blkid < dn->dn_nblkptr);
1265 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1266 mutex_enter(&dn->dn_mtx);
1267 ASSERT(!list_link_active(&dr->dr_dirty_node));
1268 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1269 mutex_exit(&dn->dn_mtx);
1270 if (drop_struct_lock)
1271 rw_exit(&dn->dn_struct_rwlock);
1272 }
1273
1274 dnode_setdirty(dn, tx);
1275 DB_DNODE_EXIT(db);
1276 return (dr);
1277 }
1278
1279 static int
1280 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1281 {
1282 dnode_t *dn;
1283 uint64_t txg = tx->tx_txg;
1284 dbuf_dirty_record_t *dr, **drp;
1285
1286 ASSERT(txg != 0);
1287 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1288
1289 mutex_enter(&db->db_mtx);
1290 /*
1291 * If this buffer is not dirty, we're done.
1292 */
1293 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1294 if (dr->dr_txg <= txg)
1295 break;
1296 if (dr == NULL || dr->dr_txg < txg) {
1297 mutex_exit(&db->db_mtx);
1298 return (0);
1299 }
1300 ASSERT(dr->dr_txg == txg);
1301 ASSERT(dr->dr_dbuf == db);
1302
1303 DB_DNODE_ENTER(db);
1304 dn = DB_DNODE(db);
1305
1306 /*
1307 * If this buffer is currently held, we cannot undirty
1308 * it, since one of the current holders may be in the
1309 * middle of an update. Note that users of dbuf_undirty()
1310 * should not place a hold on the dbuf before the call.
1311 */
1312 if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
1313 mutex_exit(&db->db_mtx);
1314 /* Make sure we don't toss this buffer at sync phase */
1315 mutex_enter(&dn->dn_mtx);
1316 dnode_clear_range(dn, db->db_blkid, 1, tx);
1317 mutex_exit(&dn->dn_mtx);
1318 DB_DNODE_EXIT(db);
1319 return (0);
1320 }
1321
1322 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1323
1324 ASSERT(db->db.db_size != 0);
1325
1326 /* XXX would be nice to fix up dn_towrite_space[] */
1327
1328 *drp = dr->dr_next;
1329
1330 if (dr->dr_parent) {
1331 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1332 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1333 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1334 } else if (db->db_level+1 == dn->dn_nlevels) {
1335 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1336 mutex_enter(&dn->dn_mtx);
1337 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1338 mutex_exit(&dn->dn_mtx);
1339 }
1340 DB_DNODE_EXIT(db);
1341
1342 if (db->db_level == 0) {
1343 if (db->db_state != DB_NOFILL) {
1344 dbuf_unoverride(dr);
1345
1346 ASSERT(db->db_buf != NULL);
1347 ASSERT(dr->dt.dl.dr_data != NULL);
1348 if (dr->dt.dl.dr_data != db->db_buf)
1349 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
1350 db) == 1);
1351 }
1352 } else {
1353 ASSERT(db->db_buf != NULL);
1354 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
1355 mutex_destroy(&dr->dt.di.dr_mtx);
1356 list_destroy(&dr->dt.di.dr_children);
1357 }
1358 kmem_free(dr, sizeof (dbuf_dirty_record_t));
1359
1360 ASSERT(db->db_dirtycnt > 0);
1361 db->db_dirtycnt -= 1;
1362
1363 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1364 arc_buf_t *buf = db->db_buf;
1365
1366 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1367 dbuf_set_data(db, NULL);
1368 VERIFY(arc_buf_remove_ref(buf, db) == 1);
1369 dbuf_evict(db);
1370 return (1);
1371 }
1372
1373 mutex_exit(&db->db_mtx);
1374 return (0);
1375 }
1376
1377 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
1378 void
1379 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1380 {
1381 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1382
1383 ASSERT(tx->tx_txg != 0);
1384 ASSERT(!refcount_is_zero(&db->db_holds));
1385
1386 DB_DNODE_ENTER(db);
1387 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1388 rf |= DB_RF_HAVESTRUCT;
1389 DB_DNODE_EXIT(db);
1390 (void) dbuf_read(db, NULL, rf);
1391 (void) dbuf_dirty(db, tx);
1392 }
1393
1394 void
1395 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1396 {
1397 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1398
1399 db->db_state = DB_NOFILL;
1400
1401 dmu_buf_will_fill(db_fake, tx);
1402 }
1403
1404 void
1405 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1406 {
1407 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1408
1409 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1410 ASSERT(tx->tx_txg != 0);
1411 ASSERT(db->db_level == 0);
1412 ASSERT(!refcount_is_zero(&db->db_holds));
1413
1414 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1415 dmu_tx_private_ok(tx));
1416
1417 dbuf_noread(db);
1418 (void) dbuf_dirty(db, tx);
1419 }
1420
1421 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1422 /* ARGSUSED */
1423 void
1424 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1425 {
1426 mutex_enter(&db->db_mtx);
1427 DBUF_VERIFY(db);
1428
1429 if (db->db_state == DB_FILL) {
1430 if (db->db_level == 0 && db->db_freed_in_flight) {
1431 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1432 /* we were freed while filling */
1433 /* XXX dbuf_undirty? */
1434 bzero(db->db.db_data, db->db.db_size);
1435 db->db_freed_in_flight = FALSE;
1436 }
1437 db->db_state = DB_CACHED;
1438 cv_broadcast(&db->db_changed);
1439 }
1440 mutex_exit(&db->db_mtx);
1441 }
1442
1443 /*
1444 * Directly assign a provided arc buf to a given dbuf if it's not referenced
1445 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1446 */
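/*
 * Usage sketch (for illustration, assuming the buffer was loaned out by
 * dbuf_loan_arcbuf() above or by dmu_request_arcbuf()):
 *
 *	abuf = dbuf_loan_arcbuf(db);
 *	... fill abuf->b_data ...
 *	dbuf_assign_arcbuf(db, abuf, tx);
 *
 * If other holders still reference the dbuf, the contents are copied into
 * the existing buffer and the provided buf is freed; otherwise the
 * provided buf becomes the dbuf's backing buffer.
 */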
1447 void
1448 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1449 {
1450 ASSERT(!refcount_is_zero(&db->db_holds));
1451 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1452 ASSERT(db->db_level == 0);
1453 ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1454 ASSERT(buf != NULL);
1455 ASSERT(arc_buf_size(buf) == db->db.db_size);
1456 ASSERT(tx->tx_txg != 0);
1457
1458 arc_return_buf(buf, db);
1459 ASSERT(arc_released(buf));
1460
1461 mutex_enter(&db->db_mtx);
1462
1463 while (db->db_state == DB_READ || db->db_state == DB_FILL)
1464 cv_wait(&db->db_changed, &db->db_mtx);
1465
1466 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1467
1468 if (db->db_state == DB_CACHED &&
1469 refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1470 mutex_exit(&db->db_mtx);
1471 (void) dbuf_dirty(db, tx);
1472 bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1473 VERIFY(arc_buf_remove_ref(buf, db) == 1);
1474 xuio_stat_wbuf_copied();
1475 return;
1476 }
1477
1478 xuio_stat_wbuf_nocopy();
1479 if (db->db_state == DB_CACHED) {
1480 dbuf_dirty_record_t *dr = db->db_last_dirty;
1481
1482 ASSERT(db->db_buf != NULL);
1483 if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1484 ASSERT(dr->dt.dl.dr_data == db->db_buf);
1485 if (!arc_released(db->db_buf)) {
1486 ASSERT(dr->dt.dl.dr_override_state ==
1487 DR_OVERRIDDEN);
1488 arc_release(db->db_buf, db);
1489 }
1490 dr->dt.dl.dr_data = buf;
1491 VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
1492 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1493 arc_release(db->db_buf, db);
1494 VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1);
1495 }
1496 db->db_buf = NULL;
1497 }
1498 ASSERT(db->db_buf == NULL);
1499 dbuf_set_data(db, buf);
1500 db->db_state = DB_FILL;
1501 mutex_exit(&db->db_mtx);
1502 (void) dbuf_dirty(db, tx);
1503 dbuf_fill_done(db, tx);
1504 }
1505
1506 /*
1507 * "Clear" the contents of this dbuf. This will mark the dbuf
1508 * EVICTING and clear *most* of its references. Unfortunately,
1509 * when we are not holding the dn_dbufs_mtx, we can't clear the
1510 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
1511 * in this case. For callers from the DMU we will usually see:
1512 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1513 * For the arc callback, we will usually see:
1514 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1515 * Sometimes, though, we will get a mix of these two:
1516 * DMU: dbuf_clear()->arc_buf_evict()
1517 * ARC: dbuf_do_evict()->dbuf_destroy()
1518 */
1519 void
1520 dbuf_clear(dmu_buf_impl_t *db)
1521 {
1522 dnode_t *dn;
1523 dmu_buf_impl_t *parent = db->db_parent;
1524 dmu_buf_impl_t *dndb;
1525 int dbuf_gone = FALSE;
1526
1527 ASSERT(MUTEX_HELD(&db->db_mtx));
1528 ASSERT(refcount_is_zero(&db->db_holds));
1529
1530 dbuf_evict_user(db);
1531
1532 if (db->db_state == DB_CACHED) {
1533 ASSERT(db->db.db_data != NULL);
1534 if (db->db_blkid == DMU_BONUS_BLKID) {
1535 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1536 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1537 }
1538 db->db.db_data = NULL;
1539 db->db_state = DB_UNCACHED;
1540 }
1541
1542 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1543 ASSERT(db->db_data_pending == NULL);
1544
1545 db->db_state = DB_EVICTING;
1546 db->db_blkptr = NULL;
1547
1548 DB_DNODE_ENTER(db);
1549 dn = DB_DNODE(db);
1550 dndb = dn->dn_dbuf;
1551 if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1552 list_remove(&dn->dn_dbufs, db);
1553 (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1554 membar_producer();
1555 DB_DNODE_EXIT(db);
1556 /*
1557 * Decrementing the dbuf count means that the hold corresponding
1558 * to the removed dbuf is no longer discounted in dnode_move(),
1559 * so the dnode cannot be moved until after we release the hold.
1560 * The membar_producer() ensures visibility of the decremented
1561 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1562 * release any lock.
1563 */
1564 dnode_rele(dn, db);
1565 db->db_dnode_handle = NULL;
1566 } else {
1567 DB_DNODE_EXIT(db);
1568 }
1569
1570 if (db->db_buf)
1571 dbuf_gone = arc_buf_evict(db->db_buf);
1572
1573 if (!dbuf_gone)
1574 mutex_exit(&db->db_mtx);
1575
1576 /*
1577 * If this dbuf is referenced from an indirect dbuf,
1578 * decrement the ref count on the indirect dbuf.
1579 */
1580 if (parent && parent != dndb)
1581 dbuf_rele(parent, db);
1582 }
1583
1584 static int
1585 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1586 dmu_buf_impl_t **parentp, blkptr_t **bpp)
1587 {
1588 int nlevels, epbs;
1589
1590 *parentp = NULL;
1591 *bpp = NULL;
1592
1593 ASSERT(blkid != DMU_BONUS_BLKID);
1594
1595 if (blkid == DMU_SPILL_BLKID) {
1596 mutex_enter(&dn->dn_mtx);
1597 if (dn->dn_have_spill &&
1598 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1599 *bpp = &dn->dn_phys->dn_spill;
1600 else
1601 *bpp = NULL;
1602 dbuf_add_ref(dn->dn_dbuf, NULL);
1603 *parentp = dn->dn_dbuf;
1604 mutex_exit(&dn->dn_mtx);
1605 return (0);
1606 }
1607
1608 if (dn->dn_phys->dn_nlevels == 0)
1609 nlevels = 1;
1610 else
1611 nlevels = dn->dn_phys->dn_nlevels;
1612
1613 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1614
1615 ASSERT3U(level * epbs, <, 64);
1616 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1617 if (level >= nlevels ||
1618 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1619 /* the buffer has no parent yet */
1620 return (ENOENT);
1621 } else if (level < nlevels-1) {
1622 /* this block is referenced from an indirect block */
1623 int err = dbuf_hold_impl(dn, level+1,
1624 blkid >> epbs, fail_sparse, NULL, parentp);
1625 if (err)
1626 return (err);
1627 err = dbuf_read(*parentp, NULL,
1628 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1629 if (err) {
1630 dbuf_rele(*parentp, NULL);
1631 *parentp = NULL;
1632 return (err);
1633 }
1634 *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1635 (blkid & ((1ULL << epbs) - 1));
1636 return (0);
1637 } else {
1638 /* the block is referenced from the dnode */
1639 ASSERT3U(level, ==, nlevels-1);
1640 ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1641 blkid < dn->dn_phys->dn_nblkptr);
1642 if (dn->dn_dbuf) {
1643 dbuf_add_ref(dn->dn_dbuf, NULL);
1644 *parentp = dn->dn_dbuf;
1645 }
1646 *bpp = &dn->dn_phys->dn_blkptr[blkid];
1647 return (0);
1648 }
1649 }
1650
1651 static dmu_buf_impl_t *
1652 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1653 dmu_buf_impl_t *parent, blkptr_t *blkptr)
1654 {
1655 objset_t *os = dn->dn_objset;
1656 dmu_buf_impl_t *db, *odb;
1657
1658 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1659 ASSERT(dn->dn_type != DMU_OT_NONE);
1660
1661 db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
1662
1663 db->db_objset = os;
1664 db->db.db_object = dn->dn_object;
1665 db->db_level = level;
1666 db->db_blkid = blkid;
1667 db->db_last_dirty = NULL;
1668 db->db_dirtycnt = 0;
1669 db->db_dnode_handle = dn->dn_handle;
1670 db->db_parent = parent;
1671 db->db_blkptr = blkptr;
1672
1673 db->db_user_ptr = NULL;
1674 db->db_user_data_ptr_ptr = NULL;
1675 db->db_evict_func = NULL;
1676 db->db_immediate_evict = 0;
1677 db->db_freed_in_flight = 0;
1678
1679 if (blkid == DMU_BONUS_BLKID) {
1680 ASSERT3P(parent, ==, dn->dn_dbuf);
1681 db->db.db_size = DN_MAX_BONUSLEN -
1682 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1683 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1684 db->db.db_offset = DMU_BONUS_BLKID;
1685 db->db_state = DB_UNCACHED;
1686 /* the bonus dbuf is not placed in the hash table */
1687 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1688 return (db);
1689 } else if (blkid == DMU_SPILL_BLKID) {
1690 db->db.db_size = (blkptr != NULL) ?
1691 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1692 db->db.db_offset = 0;
1693 } else {
1694 int blocksize =
1695 db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
1696 db->db.db_size = blocksize;
1697 db->db.db_offset = db->db_blkid * blocksize;
1698 }
1699
1700 /*
1701 * Hold the dn_dbufs_mtx while we get the new dbuf
1702 * in the hash table *and* added to the dbufs list.
1703 * This prevents a possible deadlock with someone
1704 * trying to look up this dbuf before it's added to the
1705 * dn_dbufs list.
1706 */
1707 mutex_enter(&dn->dn_dbufs_mtx);
1708 db->db_state = DB_EVICTING;
1709 if ((odb = dbuf_hash_insert(db)) != NULL) {
1710 /* someone else inserted it first */
1711 kmem_cache_free(dbuf_cache, db);
1712 mutex_exit(&dn->dn_dbufs_mtx);
1713 return (odb);
1714 }
1715 list_insert_head(&dn->dn_dbufs, db);
1716 db->db_state = DB_UNCACHED;
1717 mutex_exit(&dn->dn_dbufs_mtx);
1718 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1719
1720 if (parent && parent != dn->dn_dbuf)
1721 dbuf_add_ref(parent, db);
1722
1723 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1724 refcount_count(&dn->dn_holds) > 0);
1725 (void) refcount_add(&dn->dn_holds, db);
1726 (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
1727
1728 dprintf_dbuf(db, "db=%p\n", db);
1729
1730 return (db);
1731 }
1732
1733 static int
1734 dbuf_do_evict(void *private)
1735 {
1736 arc_buf_t *buf = private;
1737 dmu_buf_impl_t *db = buf->b_private;
1738
1739 if (!MUTEX_HELD(&db->db_mtx))
1740 mutex_enter(&db->db_mtx);
1741
1742 ASSERT(refcount_is_zero(&db->db_holds));
1743
1744 if (db->db_state != DB_EVICTING) {
1745 ASSERT(db->db_state == DB_CACHED);
1746 DBUF_VERIFY(db);
1747 db->db_buf = NULL;
1748 dbuf_evict(db);
1749 } else {
1750 mutex_exit(&db->db_mtx);
1751 dbuf_destroy(db);
1752 }
1753 return (0);
1754 }
1755
1756 static void
1757 dbuf_destroy(dmu_buf_impl_t *db)
1758 {
1759 ASSERT(refcount_is_zero(&db->db_holds));
1760
1761 if (db->db_blkid != DMU_BONUS_BLKID) {
1762 /*
1763 * If this dbuf is still on the dn_dbufs list,
1764 * remove it from that list.
1765 */
1766 if (db->db_dnode_handle != NULL) {
1767 dnode_t *dn;
1768
1769 DB_DNODE_ENTER(db);
1770 dn = DB_DNODE(db);
1771 mutex_enter(&dn->dn_dbufs_mtx);
1772 list_remove(&dn->dn_dbufs, db);
1773 (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1774 mutex_exit(&dn->dn_dbufs_mtx);
1775 DB_DNODE_EXIT(db);
1776 /*
1777 * Decrementing the dbuf count means that the hold
1778 * corresponding to the removed dbuf is no longer
1779 * discounted in dnode_move(), so the dnode cannot be
1780 * moved until after we release the hold.
1781 */
1782 dnode_rele(dn, db);
1783 db->db_dnode_handle = NULL;
1784 }
1785 dbuf_hash_remove(db);
1786 }
1787 db->db_parent = NULL;
1788 db->db_buf = NULL;
1789
1790 ASSERT(!list_link_active(&db->db_link));
1791 ASSERT(db->db.db_data == NULL);
1792 ASSERT(db->db_hash_next == NULL);
1793 ASSERT(db->db_blkptr == NULL);
1794 ASSERT(db->db_data_pending == NULL);
1795
1796 kmem_cache_free(dbuf_cache, db);
1797 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1798 }
1799
1800 void
1801 dbuf_prefetch(dnode_t *dn, uint64_t blkid)
1802 {
1803 dmu_buf_impl_t *db = NULL;
1804 blkptr_t *bp = NULL;
1805
1806 ASSERT(blkid != DMU_BONUS_BLKID);
1807 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1808
1809 if (dnode_block_freed(dn, blkid))
1810 return;
1811
1812 /* dbuf_find() returns with db_mtx held */
1813 if ((db = dbuf_find(dn, 0, blkid)) != NULL) {
1814 /*
1815 * This dbuf is already in the cache. We assume that
1816 * it is already CACHED, or else about to be either
1817 * read or filled.
1818 */
1819 mutex_exit(&db->db_mtx);
1820 return;
1821 }
1822
1823 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
1824 if (bp && !BP_IS_HOLE(bp)) {
1825 int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
1826 ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
1827 arc_buf_t *pbuf;
1828 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1829 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1830 zbookmark_t zb;
1831
1832 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1833 dn->dn_object, 0, blkid);
1834
1835 if (db)
1836 pbuf = db->db_buf;
1837 else
1838 pbuf = dn->dn_objset->os_phys_buf;
1839
1840 (void) dsl_read(NULL, dn->dn_objset->os_spa,
1841 bp, pbuf, NULL, NULL, priority,
1842 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1843 &aflags, &zb);
1844 }
1845 if (db)
1846 dbuf_rele(db, NULL);
1847 }
1848 }
1849
1850 /*
1851 * Returns with db_holds incremented, and db_mtx not held.
1852 * Note: dn_struct_rwlock must be held.
1853 */
1854 int
1855 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
1856 void *tag, dmu_buf_impl_t **dbp)
1857 {
1858 dmu_buf_impl_t *db, *parent = NULL;
1859
1860 ASSERT(blkid != DMU_BONUS_BLKID);
1861 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1862 ASSERT3U(dn->dn_nlevels, >, level);
1863
1864 *dbp = NULL;
1865 top:
1866 /* dbuf_find() returns with db_mtx held */
1867 db = dbuf_find(dn, level, blkid);
1868
1869 if (db == NULL) {
1870 blkptr_t *bp = NULL;
1871 int err;
1872
1873 ASSERT3P(parent, ==, NULL);
1874 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
1875 if (fail_sparse) {
1876 if (err == 0 && bp && BP_IS_HOLE(bp))
1877 err = ENOENT;
1878 if (err) {
1879 if (parent)
1880 dbuf_rele(parent, NULL);
1881 return (err);
1882 }
1883 }
1884 if (err && err != ENOENT)
1885 return (err);
1886 db = dbuf_create(dn, level, blkid, parent, bp);
1887 }
1888
1889 if (db->db_buf && refcount_is_zero(&db->db_holds)) {
1890 arc_buf_add_ref(db->db_buf, db);
1891 if (db->db_buf->b_data == NULL) {
1892 dbuf_clear(db);
1893 if (parent) {
1894 dbuf_rele(parent, NULL);
1895 parent = NULL;
1896 }
1897 goto top;
1898 }
1899 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
1900 }
1901
1902 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
1903
1904 /*
1905 	 * If this buffer is currently syncing out, and we are
1906 * still referencing it from db_data, we need to make a copy
1907 * of it in case we decide we want to dirty it again in this txg.
1908 */
1909 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1910 dn->dn_object != DMU_META_DNODE_OBJECT &&
1911 db->db_state == DB_CACHED && db->db_data_pending) {
1912 dbuf_dirty_record_t *dr = db->db_data_pending;
1913
1914 if (dr->dt.dl.dr_data == db->db_buf) {
1915 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1916
1917 dbuf_set_data(db,
1918 arc_buf_alloc(dn->dn_objset->os_spa,
1919 db->db.db_size, db, type));
1920 bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
1921 db->db.db_size);
1922 }
1923 }
1924
1925 (void) refcount_add(&db->db_holds, tag);
1926 dbuf_update_data(db);
1927 DBUF_VERIFY(db);
1928 mutex_exit(&db->db_mtx);
1929
1930 /* NOTE: we can't rele the parent until after we drop the db_mtx */
1931 if (parent)
1932 dbuf_rele(parent, NULL);
1933
1934 ASSERT3P(DB_DNODE(db), ==, dn);
1935 ASSERT3U(db->db_blkid, ==, blkid);
1936 ASSERT3U(db->db_level, ==, level);
1937 *dbp = db;
1938
1939 return (0);
1940 }
1941
1942 dmu_buf_impl_t *
1943 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
1944 {
1945 dmu_buf_impl_t *db;
1946 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
1947 return (err ? NULL : db);
1948 }
1949
1950 dmu_buf_impl_t *
1951 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
1952 {
1953 dmu_buf_impl_t *db;
1954 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
1955 return (err ? NULL : db);
1956 }
1957
1958 void
1959 dbuf_create_bonus(dnode_t *dn)
1960 {
1961 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1962
1963 ASSERT(dn->dn_bonus == NULL);
1964 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
1965 }
1966
1967 int
1968 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
1969 {
1970 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1971 dnode_t *dn;
1972
1973 if (db->db_blkid != DMU_SPILL_BLKID)
1974 return (ENOTSUP);
1975 if (blksz == 0)
1976 blksz = SPA_MINBLOCKSIZE;
1977 if (blksz > SPA_MAXBLOCKSIZE)
1978 blksz = SPA_MAXBLOCKSIZE;
1979 else
1980 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
1981
1982 DB_DNODE_ENTER(db);
1983 dn = DB_DNODE(db);
1984 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
1985 dbuf_new_size(db, blksz, tx);
1986 rw_exit(&dn->dn_struct_rwlock);
1987 DB_DNODE_EXIT(db);
1988
1989 return (0);
1990 }
1991
1992 void
1993 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
1994 {
1995 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
1996 }
1997
1998 #pragma weak dmu_buf_add_ref = dbuf_add_ref
1999 void
2000 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2001 {
2002 int64_t holds = refcount_add(&db->db_holds, tag);
2003 ASSERT(holds > 1);
2004 }
2005
2006 /*
2007 * If you call dbuf_rele() you had better not be referencing the dnode handle
2008 * unless you have some other direct or indirect hold on the dnode. (An indirect
2009 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2010 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2011 * dnode's parent dbuf evicting its dnode handles.
2012 */
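/*
 * Concretely, a caller that still needs the dnode around its final
 * dbuf_rele() should bracket the access with the handle macros (sketch
 * only; "tag" is whatever the hold was taken with):
 *
 *	dnode_t *dn;
 *
 *	DB_DNODE_ENTER(db);
 *	dn = DB_DNODE(db);
 *	... use dn while the handle is entered ...
 *	DB_DNODE_EXIT(db);
 *	dbuf_rele(db, tag);
 *
 * and must not touch DB_DNODE(db) after the dbuf_rele(), since that
 * release may allow the dnode handles to be evicted as described above.
 */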
2013 #pragma weak dmu_buf_rele = dbuf_rele
2014 void
2015 dbuf_rele(dmu_buf_impl_t *db, void *tag)
2016 {
2017 mutex_enter(&db->db_mtx);
2018 dbuf_rele_and_unlock(db, tag);
2019 }
2020
2021 /*
2022 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
2023 * db_dirtycnt and db_holds to be updated atomically.
2024 */
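/*
 * A typical caller therefore looks like (sketch only):
 *
 *	mutex_enter(&db->db_mtx);
 *	... adjust db_dirtycnt or otherwise inspect the dbuf ...
 *	dbuf_rele_and_unlock(db, tag);
 *
 * so the hold is dropped and db_mtx released in a single step, as
 * dbuf_sync_leaf() and dbuf_write_done() do below.
 */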
2025 void
2026 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2027 {
2028 int64_t holds;
2029
2030 ASSERT(MUTEX_HELD(&db->db_mtx));
2031 DBUF_VERIFY(db);
2032
2033 /*
2034 * Remove the reference to the dbuf before removing its hold on the
2035 * dnode so we can guarantee in dnode_move() that a referenced bonus
2036 * buffer has a corresponding dnode hold.
2037 */
2038 holds = refcount_remove(&db->db_holds, tag);
2039 ASSERT(holds >= 0);
2040
2041 /*
2042 * We can't freeze indirects if there is a possibility that they
2043 * may be modified in the current syncing context.
2044 */
2045 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2046 arc_buf_freeze(db->db_buf);
2047
2048 if (holds == db->db_dirtycnt &&
2049 db->db_level == 0 && db->db_immediate_evict)
2050 dbuf_evict_user(db);
2051
2052 if (holds == 0) {
2053 if (db->db_blkid == DMU_BONUS_BLKID) {
2054 mutex_exit(&db->db_mtx);
2055
2056 /*
2057 * If the dnode moves here, we cannot cross this barrier
2058 * until the move completes.
2059 */
2060 DB_DNODE_ENTER(db);
2061 (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
2062 DB_DNODE_EXIT(db);
2063 /*
2064 * The bonus buffer's dnode hold is no longer discounted
2065 * in dnode_move(). The dnode cannot move until after
2066 * the dnode_rele().
2067 */
2068 dnode_rele(DB_DNODE(db), db);
2069 } else if (db->db_buf == NULL) {
2070 /*
2071 * This is a special case: we never associated this
2072 * dbuf with any data allocated from the ARC.
2073 */
2074 ASSERT(db->db_state == DB_UNCACHED ||
2075 db->db_state == DB_NOFILL);
2076 dbuf_evict(db);
2077 } else if (arc_released(db->db_buf)) {
2078 arc_buf_t *buf = db->db_buf;
2079 /*
2080 * This dbuf has anonymous data associated with it.
2081 */
2082 dbuf_set_data(db, NULL);
2083 VERIFY(arc_buf_remove_ref(buf, db) == 1);
2084 dbuf_evict(db);
2085 } else {
2086 VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
2087 if (!DBUF_IS_CACHEABLE(db))
2088 dbuf_clear(db);
2089 else
2090 mutex_exit(&db->db_mtx);
2091 }
2092 } else {
2093 mutex_exit(&db->db_mtx);
2094 }
2095 }
2096
2097 #pragma weak dmu_buf_refcount = dbuf_refcount
2098 uint64_t
2099 dbuf_refcount(dmu_buf_impl_t *db)
2100 {
2101 return (refcount_count(&db->db_holds));
2102 }
2103
2104 void *
2105 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2106 dmu_buf_evict_func_t *evict_func)
2107 {
2108 return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2109 user_data_ptr_ptr, evict_func));
2110 }
2111
2112 void *
2113 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2114 dmu_buf_evict_func_t *evict_func)
2115 {
2116 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2117
2118 db->db_immediate_evict = TRUE;
2119 return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2120 user_data_ptr_ptr, evict_func));
2121 }
2122
2123 void *
2124 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2125 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
2126 {
2127 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2128 ASSERT(db->db_level == 0);
2129
2130 ASSERT((user_ptr == NULL) == (evict_func == NULL));
2131
2132 mutex_enter(&db->db_mtx);
2133
2134 if (db->db_user_ptr == old_user_ptr) {
2135 db->db_user_ptr = user_ptr;
2136 db->db_user_data_ptr_ptr = user_data_ptr_ptr;
2137 db->db_evict_func = evict_func;
2138
2139 dbuf_update_data(db);
2140 } else {
2141 old_user_ptr = db->db_user_ptr;
2142 }
2143
2144 mutex_exit(&db->db_mtx);
2145 return (old_user_ptr);
2146 }
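
/*
 * Illustrative sketch of the user-data interface above; the consumer type,
 * fields, and callback name are hypothetical, only the dmu_buf_* calls are
 * real:
 *
 *	static void
 *	my_evict_func(dmu_buf_t *dbuf, void *user_ptr)
 *	{
 *		my_state_t *ms = user_ptr;
 *		... free ms; the dbuf is going away ...
 *	}
 *
 *	existing = dmu_buf_set_user(dbuf, ms, &ms->ms_data, my_evict_func);
 *	if (existing != NULL) {
 *		... lost the race; another consumer's state is attached ...
 *	}
 *
 * dmu_buf_update_user() only installs the new user_ptr when the current
 * one matches old_user_ptr, and returns the user_ptr that was in place
 * when it was called, so a NULL return from dmu_buf_set_user() means the
 * caller's state was attached.
 */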
2147
2148 void *
2149 dmu_buf_get_user(dmu_buf_t *db_fake)
2150 {
2151 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2152 ASSERT(!refcount_is_zero(&db->db_holds));
2153
2154 return (db->db_user_ptr);
2155 }
2156
2157 boolean_t
2158 dmu_buf_freeable(dmu_buf_t *dbuf)
2159 {
2160 boolean_t res = B_FALSE;
2161 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2162
2163 if (db->db_blkptr)
2164 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2165 db->db_blkptr, db->db_blkptr->blk_birth);
2166
2167 return (res);
2168 }
2169
2170 static void
2171 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2172 {
2173 	/* ASSERT(dmu_tx_is_syncing(tx)) */
2174 ASSERT(MUTEX_HELD(&db->db_mtx));
2175
2176 if (db->db_blkptr != NULL)
2177 return;
2178
2179 if (db->db_blkid == DMU_SPILL_BLKID) {
2180 db->db_blkptr = &dn->dn_phys->dn_spill;
2181 BP_ZERO(db->db_blkptr);
2182 return;
2183 }
2184 if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2185 /*
2186 		 * This buffer was allocated at a time when there were
2187 		 * no blkptrs available from the dnode, or it was
2188 		 * inappropriate to hook it in (i.e., nlevels mismatch).
2189 */
2190 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2191 ASSERT(db->db_parent == NULL);
2192 db->db_parent = dn->dn_dbuf;
2193 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2194 DBUF_VERIFY(db);
2195 } else {
2196 dmu_buf_impl_t *parent = db->db_parent;
2197 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2198
2199 ASSERT(dn->dn_phys->dn_nlevels > 1);
2200 if (parent == NULL) {
2201 mutex_exit(&db->db_mtx);
2202 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2203 (void) dbuf_hold_impl(dn, db->db_level+1,
2204 db->db_blkid >> epbs, FALSE, db, &parent);
2205 rw_exit(&dn->dn_struct_rwlock);
2206 mutex_enter(&db->db_mtx);
2207 db->db_parent = parent;
2208 }
2209 db->db_blkptr = (blkptr_t *)parent->db.db_data +
2210 (db->db_blkid & ((1ULL << epbs) - 1));
2211 DBUF_VERIFY(db);
2212 }
2213 }
2214
2215 static void
2216 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2217 {
2218 dmu_buf_impl_t *db = dr->dr_dbuf;
2219 dnode_t *dn;
2220 zio_t *zio;
2221
2222 ASSERT(dmu_tx_is_syncing(tx));
2223
2224 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2225
2226 mutex_enter(&db->db_mtx);
2227
2228 ASSERT(db->db_level > 0);
2229 DBUF_VERIFY(db);
2230
2231 if (db->db_buf == NULL) {
2232 mutex_exit(&db->db_mtx);
2233 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2234 mutex_enter(&db->db_mtx);
2235 }
2236 ASSERT3U(db->db_state, ==, DB_CACHED);
2237 ASSERT(db->db_buf != NULL);
2238
2239 DB_DNODE_ENTER(db);
2240 dn = DB_DNODE(db);
2241 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2242 dbuf_check_blkptr(dn, db);
2243 DB_DNODE_EXIT(db);
2244
2245 db->db_data_pending = dr;
2246
2247 mutex_exit(&db->db_mtx);
2248 dbuf_write(dr, db->db_buf, tx);
2249
2250 zio = dr->dr_zio;
2251 mutex_enter(&dr->dt.di.dr_mtx);
2252 dbuf_sync_list(&dr->dt.di.dr_children, tx);
2253 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2254 mutex_exit(&dr->dt.di.dr_mtx);
2255 zio_nowait(zio);
2256 }
2257
2258 static void
2259 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2260 {
2261 arc_buf_t **datap = &dr->dt.dl.dr_data;
2262 dmu_buf_impl_t *db = dr->dr_dbuf;
2263 dnode_t *dn;
2264 objset_t *os;
2265 uint64_t txg = tx->tx_txg;
2266
2267 ASSERT(dmu_tx_is_syncing(tx));
2268
2269 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2270
2271 mutex_enter(&db->db_mtx);
2272 /*
2273 * To be synced, we must be dirtied. But we
2274 	 * might have been freed after being dirtied.
2275 */
2276 if (db->db_state == DB_UNCACHED) {
2277 /* This buffer has been freed since it was dirtied */
2278 ASSERT(db->db.db_data == NULL);
2279 } else if (db->db_state == DB_FILL) {
2280 /* This buffer was freed and is now being re-filled */
2281 ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2282 } else {
2283 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2284 }
2285 DBUF_VERIFY(db);
2286
2287 DB_DNODE_ENTER(db);
2288 dn = DB_DNODE(db);
2289
2290 if (db->db_blkid == DMU_SPILL_BLKID) {
2291 mutex_enter(&dn->dn_mtx);
2292 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2293 mutex_exit(&dn->dn_mtx);
2294 }
2295
2296 /*
2297 * If this is a bonus buffer, simply copy the bonus data into the
2298 * dnode. It will be written out when the dnode is synced (and it
2299 * will be synced, since it must have been dirty for dbuf_sync to
2300 * be called).
2301 */
2302 if (db->db_blkid == DMU_BONUS_BLKID) {
2303 dbuf_dirty_record_t **drp;
2304
2305 ASSERT(*datap != NULL);
2306 ASSERT3U(db->db_level, ==, 0);
2307 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2308 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2309 DB_DNODE_EXIT(db);
2310
2311 if (*datap != db->db.db_data) {
2312 zio_buf_free(*datap, DN_MAX_BONUSLEN);
2313 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2314 }
2315 db->db_data_pending = NULL;
2316 drp = &db->db_last_dirty;
2317 while (*drp != dr)
2318 drp = &(*drp)->dr_next;
2319 ASSERT(dr->dr_next == NULL);
2320 ASSERT(dr->dr_dbuf == db);
2321 *drp = dr->dr_next;
2322 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2323 ASSERT(db->db_dirtycnt > 0);
2324 db->db_dirtycnt -= 1;
2325 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2326 return;
2327 }
2328
2329 os = dn->dn_objset;
2330
2331 /*
2332 * This function may have dropped the db_mtx lock allowing a dmu_sync
2333 * operation to sneak in. As a result, we need to ensure that we
2334 * don't check the dr_override_state until we have returned from
2335 * dbuf_check_blkptr.
2336 */
2337 dbuf_check_blkptr(dn, db);
2338
2339 /*
2340 * If this buffer is in the middle of an immediate write,
2341 * wait for the synchronous IO to complete.
2342 */
2343 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2344 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2345 cv_wait(&db->db_changed, &db->db_mtx);
2346 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2347 }
2348
2349 if (db->db_state != DB_NOFILL &&
2350 dn->dn_object != DMU_META_DNODE_OBJECT &&
2351 refcount_count(&db->db_holds) > 1 &&
2352 dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2353 *datap == db->db_buf) {
2354 /*
2355 * If this buffer is currently "in use" (i.e., there
2356 * are active holds and db_data still references it),
2357 * then make a copy before we start the write so that
2358 * any modifications from the open txg will not leak
2359 * into this write.
2360 *
2361 * NOTE: this copy does not need to be made for
2362 * objects only modified in the syncing context (e.g.
2363 		 * DMU_OT_DNODE blocks).
2364 */
2365 int blksz = arc_buf_size(*datap);
2366 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2367 *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2368 bcopy(db->db.db_data, (*datap)->b_data, blksz);
2369 }
2370 db->db_data_pending = dr;
2371
2372 mutex_exit(&db->db_mtx);
2373
2374 dbuf_write(dr, *datap, tx);
2375
2376 ASSERT(!list_link_active(&dr->dr_dirty_node));
2377 if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2378 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2379 DB_DNODE_EXIT(db);
2380 } else {
2381 /*
2382 * Although zio_nowait() does not "wait for an IO", it does
2383 		 * initiate the IO. If this is an empty write, it seems plausible
2384 * that the IO could actually be completed before the nowait
2385 * returns. We need to DB_DNODE_EXIT() first in case
2386 * zio_nowait() invalidates the dbuf.
2387 */
2388 DB_DNODE_EXIT(db);
2389 zio_nowait(dr->dr_zio);
2390 }
2391 }
2392
2393 void
2394 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2395 {
2396 dbuf_dirty_record_t *dr;
2397
2398 	while ((dr = list_head(list)) != NULL) {
2399 if (dr->dr_zio != NULL) {
2400 /*
2401 * If we find an already initialized zio then we
2402 * are processing the meta-dnode, and we have finished.
2403 * The dbufs for all dnodes are put back on the list
2404 * during processing, so that we can zio_wait()
2405 * these IOs after initiating all child IOs.
2406 */
2407 ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2408 DMU_META_DNODE_OBJECT);
2409 break;
2410 }
2411 list_remove(list, dr);
2412 if (dr->dr_dbuf->db_level > 0)
2413 dbuf_sync_indirect(dr, tx);
2414 else
2415 dbuf_sync_leaf(dr, tx);
2416 }
2417 }
2418
2419 /* ARGSUSED */
2420 static void
2421 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2422 {
2423 dmu_buf_impl_t *db = vdb;
2424 dnode_t *dn;
2425 blkptr_t *bp = zio->io_bp;
2426 blkptr_t *bp_orig = &zio->io_bp_orig;
2427 spa_t *spa = zio->io_spa;
2428 int64_t delta;
2429 uint64_t fill = 0;
2430 int i;
2431
2432 ASSERT(db->db_blkptr == bp);
2433
2434 DB_DNODE_ENTER(db);
2435 dn = DB_DNODE(db);
2436 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2437 dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2438 zio->io_prev_space_delta = delta;
2439
2440 if (BP_IS_HOLE(bp)) {
2441 ASSERT(bp->blk_fill == 0);
2442 DB_DNODE_EXIT(db);
2443 return;
2444 }
2445
2446 ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2447 BP_GET_TYPE(bp) == dn->dn_type) ||
2448 (db->db_blkid == DMU_SPILL_BLKID &&
2449 BP_GET_TYPE(bp) == dn->dn_bonustype));
2450 ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2451
2452 mutex_enter(&db->db_mtx);
2453
2454 #ifdef ZFS_DEBUG
2455 if (db->db_blkid == DMU_SPILL_BLKID) {
2456 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2457 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2458 db->db_blkptr == &dn->dn_phys->dn_spill);
2459 }
2460 #endif
2461
2462 if (db->db_level == 0) {
2463 mutex_enter(&dn->dn_mtx);
2464 if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2465 db->db_blkid != DMU_SPILL_BLKID)
2466 dn->dn_phys->dn_maxblkid = db->db_blkid;
2467 mutex_exit(&dn->dn_mtx);
2468
2469 if (dn->dn_type == DMU_OT_DNODE) {
2470 dnode_phys_t *dnp = db->db.db_data;
2471 for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2472 i--, dnp++) {
2473 if (dnp->dn_type != DMU_OT_NONE)
2474 fill++;
2475 }
2476 } else {
2477 fill = 1;
2478 }
2479 } else {
2480 blkptr_t *ibp = db->db.db_data;
2481 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2482 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2483 if (BP_IS_HOLE(ibp))
2484 continue;
2485 fill += ibp->blk_fill;
2486 }
2487 }
2488 DB_DNODE_EXIT(db);
2489
2490 bp->blk_fill = fill;
2491
2492 mutex_exit(&db->db_mtx);
2493 }
2494
2495 /* ARGSUSED */
2496 static void
2497 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2498 {
2499 dmu_buf_impl_t *db = vdb;
2500 blkptr_t *bp = zio->io_bp;
2501 blkptr_t *bp_orig = &zio->io_bp_orig;
2502 uint64_t txg = zio->io_txg;
2503 dbuf_dirty_record_t **drp, *dr;
2504
2505 ASSERT3U(zio->io_error, ==, 0);
2506 ASSERT(db->db_blkptr == bp);
2507
2508 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
2509 ASSERT(BP_EQUAL(bp, bp_orig));
2510 } else {
2511 objset_t *os;
2512 dsl_dataset_t *ds;
2513 dmu_tx_t *tx;
2514
2515 DB_GET_OBJSET(&os, db);
2516 ds = os->os_dsl_dataset;
2517 tx = os->os_synctx;
2518
2519 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
2520 dsl_dataset_block_born(ds, bp, tx);
2521 }
2522
2523 mutex_enter(&db->db_mtx);
2524
2525 DBUF_VERIFY(db);
2526
2527 drp = &db->db_last_dirty;
2528 while ((dr = *drp) != db->db_data_pending)
2529 drp = &dr->dr_next;
2530 ASSERT(!list_link_active(&dr->dr_dirty_node));
2531 ASSERT(dr->dr_txg == txg);
2532 ASSERT(dr->dr_dbuf == db);
2533 ASSERT(dr->dr_next == NULL);
2534 *drp = dr->dr_next;
2535
2536 #ifdef ZFS_DEBUG
2537 if (db->db_blkid == DMU_SPILL_BLKID) {
2538 dnode_t *dn;
2539
2540 DB_DNODE_ENTER(db);
2541 dn = DB_DNODE(db);
2542 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2543 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2544 db->db_blkptr == &dn->dn_phys->dn_spill);
2545 DB_DNODE_EXIT(db);
2546 }
2547 #endif
2548
2549 if (db->db_level == 0) {
2550 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2551 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2552 if (db->db_state != DB_NOFILL) {
2553 if (dr->dt.dl.dr_data != db->db_buf)
2554 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
2555 db) == 1);
2556 else if (!arc_released(db->db_buf))
2557 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2558 }
2559 } else {
2560 dnode_t *dn;
2561
2562 DB_DNODE_ENTER(db);
2563 dn = DB_DNODE(db);
2564 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2565 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2566 if (!BP_IS_HOLE(db->db_blkptr)) {
2567 int epbs =
2568 dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2569 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2570 db->db.db_size);
2571 ASSERT3U(dn->dn_phys->dn_maxblkid
2572 >> (db->db_level * epbs), >=, db->db_blkid);
2573 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2574 }
2575 DB_DNODE_EXIT(db);
2576 mutex_destroy(&dr->dt.di.dr_mtx);
2577 list_destroy(&dr->dt.di.dr_children);
2578 }
2579 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2580
2581 cv_broadcast(&db->db_changed);
2582 ASSERT(db->db_dirtycnt > 0);
2583 db->db_dirtycnt -= 1;
2584 db->db_data_pending = NULL;
2585 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2586 }
2587
2588 static void
2589 dbuf_write_nofill_ready(zio_t *zio)
2590 {
2591 dbuf_write_ready(zio, NULL, zio->io_private);
2592 }
2593
2594 static void
2595 dbuf_write_nofill_done(zio_t *zio)
2596 {
2597 dbuf_write_done(zio, NULL, zio->io_private);
2598 }
2599
2600 static void
2601 dbuf_write_override_ready(zio_t *zio)
2602 {
2603 dbuf_dirty_record_t *dr = zio->io_private;
2604 dmu_buf_impl_t *db = dr->dr_dbuf;
2605
2606 dbuf_write_ready(zio, NULL, db);
2607 }
2608
2609 static void
2610 dbuf_write_override_done(zio_t *zio)
2611 {
2612 dbuf_dirty_record_t *dr = zio->io_private;
2613 dmu_buf_impl_t *db = dr->dr_dbuf;
2614 blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2615
2616 mutex_enter(&db->db_mtx);
2617 if (!BP_EQUAL(zio->io_bp, obp)) {
2618 if (!BP_IS_HOLE(obp))
2619 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2620 arc_release(dr->dt.dl.dr_data, db);
2621 }
2622 mutex_exit(&db->db_mtx);
2623
2624 dbuf_write_done(zio, NULL, db);
2625 }
2626
2627 static void
2628 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2629 {
2630 dmu_buf_impl_t *db = dr->dr_dbuf;
2631 dnode_t *dn;
2632 objset_t *os;
2633 dmu_buf_impl_t *parent = db->db_parent;
2634 uint64_t txg = tx->tx_txg;
2635 zbookmark_t zb;
2636 zio_prop_t zp;
2637 zio_t *zio;
2638 int wp_flag = 0;
2639
2640 DB_DNODE_ENTER(db);
2641 dn = DB_DNODE(db);
2642 os = dn->dn_objset;
2643
2644 if (db->db_state != DB_NOFILL) {
2645 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2646 /*
2647 * Private object buffers are released here rather
2648 * than in dbuf_dirty() since they are only modified
2649 * in the syncing context and we don't want the
2650 * overhead of making multiple copies of the data.
2651 */
2652 if (BP_IS_HOLE(db->db_blkptr)) {
2653 arc_buf_thaw(data);
2654 } else {
2655 dbuf_release_bp(db);
2656 }
2657 }
2658 }
2659
2660 if (parent != dn->dn_dbuf) {
2661 ASSERT(parent && parent->db_data_pending);
2662 ASSERT(db->db_level == parent->db_level-1);
2663 ASSERT(arc_released(parent->db_buf));
2664 zio = parent->db_data_pending->dr_zio;
2665 } else {
2666 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2667 db->db_blkid != DMU_SPILL_BLKID) ||
2668 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2669 if (db->db_blkid != DMU_SPILL_BLKID)
2670 ASSERT3P(db->db_blkptr, ==,
2671 &dn->dn_phys->dn_blkptr[db->db_blkid]);
2672 zio = dn->dn_zio;
2673 }
2674
2675 ASSERT(db->db_level == 0 || data == db->db_buf);
2676 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2677 ASSERT(zio);
2678
2679 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2680 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2681 db->db.db_object, db->db_level, db->db_blkid);
2682
2683 if (db->db_blkid == DMU_SPILL_BLKID)
2684 wp_flag = WP_SPILL;
2685 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2686
2687 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2688 DB_DNODE_EXIT(db);
2689
2690 if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2691 ASSERT(db->db_state != DB_NOFILL);
2692 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2693 db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
2694 dbuf_write_override_ready, dbuf_write_override_done, dr,
2695 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2696 mutex_enter(&db->db_mtx);
2697 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2698 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2699 dr->dt.dl.dr_copies);
2700 mutex_exit(&db->db_mtx);
2701 } else if (db->db_state == DB_NOFILL) {
2702 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
2703 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2704 db->db_blkptr, NULL, db->db.db_size, &zp,
2705 dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
2706 ZIO_PRIORITY_ASYNC_WRITE,
2707 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2708 } else {
2709 ASSERT(arc_released(data));
2710 dr->dr_zio = arc_write(zio, os->os_spa, txg,
2711 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
2712 dbuf_write_ready, dbuf_write_done, db,
2713 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2714 }
2715 }