module/zfs/dbuf.c (mirror_zfs.git, at commit "Illumos #4045 write throttle & i/o scheduler performance work")
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
26 */
27
28 #include <sys/zfs_context.h>
29 #include <sys/arc.h>
30 #include <sys/dmu.h>
31 #include <sys/dmu_send.h>
32 #include <sys/dmu_impl.h>
33 #include <sys/dbuf.h>
34 #include <sys/dmu_objset.h>
35 #include <sys/dsl_dataset.h>
36 #include <sys/dsl_dir.h>
37 #include <sys/dmu_tx.h>
38 #include <sys/spa.h>
39 #include <sys/zio.h>
40 #include <sys/dmu_zfetch.h>
41 #include <sys/sa.h>
42 #include <sys/sa_impl.h>
43
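/*
 * The structure below keeps the per-level state of __dbuf_hold_impl() on the
 * heap rather than on the stack: each level of the indirect-block traversal
 * uses its own dbuf_hold_impl_data slot (dh + 1 for the parent level), and
 * the depth of the chain is bounded by DBUF_HOLD_IMPL_MAX_DEPTH (defined
 * further down), which keeps kernel stack usage small for deeply indirected
 * objects.
 */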
44 struct dbuf_hold_impl_data {
45 /* Function arguments */
46 dnode_t *dh_dn;
47 uint8_t dh_level;
48 uint64_t dh_blkid;
49 int dh_fail_sparse;
50 void *dh_tag;
51 dmu_buf_impl_t **dh_dbp;
52 /* Local variables */
53 dmu_buf_impl_t *dh_db;
54 dmu_buf_impl_t *dh_parent;
55 blkptr_t *dh_bp;
56 int dh_err;
57 dbuf_dirty_record_t *dh_dr;
58 arc_buf_contents_t dh_type;
59 int dh_depth;
60 };
61
62 static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
63 dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
64 void *tag, dmu_buf_impl_t **dbp, int depth);
65 static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
66
67 /*
68 * Number of times that zfs_free_range() took the slow path while doing
69 * a zfs receive. A nonzero value indicates a potential performance problem.
70 */
71 uint64_t zfs_free_range_recv_miss;
72
73 static void dbuf_destroy(dmu_buf_impl_t *db);
74 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
75 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
76
77 /*
78 * Global data structures and functions for the dbuf cache.
79 */
80 static kmem_cache_t *dbuf_cache;
81
82 /* ARGSUSED */
83 static int
84 dbuf_cons(void *vdb, void *unused, int kmflag)
85 {
86 dmu_buf_impl_t *db = vdb;
87 bzero(db, sizeof (dmu_buf_impl_t));
88
89 mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
90 cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
91 refcount_create(&db->db_holds);
92 list_link_init(&db->db_link);
93 return (0);
94 }
95
96 /* ARGSUSED */
97 static void
98 dbuf_dest(void *vdb, void *unused)
99 {
100 dmu_buf_impl_t *db = vdb;
101 mutex_destroy(&db->db_mtx);
102 cv_destroy(&db->db_changed);
103 refcount_destroy(&db->db_holds);
104 }
105
106 /*
107 * dbuf hash table routines
108 */
109 static dbuf_hash_table_t dbuf_hash_table;
110
111 static uint64_t dbuf_hash_count;
112
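/*
 * Hash a dbuf's identity (objset pointer, object number, level and block id)
 * using the shared zfs_crc64_table; the final XOR folds in the high-order
 * bits of the inputs that the byte-wise CRC steps do not consume.
 */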
113 static uint64_t
114 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
115 {
116 uintptr_t osv = (uintptr_t)os;
117 uint64_t crc = -1ULL;
118
119 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
120 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF];
121 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
122 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
123 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
124 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF];
125 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF];
126
127 crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
128
129 return (crc);
130 }
131
132 #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
133
134 #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
135 ((dbuf)->db.db_object == (obj) && \
136 (dbuf)->db_objset == (os) && \
137 (dbuf)->db_level == (level) && \
138 (dbuf)->db_blkid == (blkid))
139
140 dmu_buf_impl_t *
141 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
142 {
143 dbuf_hash_table_t *h = &dbuf_hash_table;
144 objset_t *os = dn->dn_objset;
145 uint64_t obj;
146 uint64_t hv;
147 uint64_t idx;
148 dmu_buf_impl_t *db;
149
150 obj = dn->dn_object;
151 hv = DBUF_HASH(os, obj, level, blkid);
152 idx = hv & h->hash_table_mask;
153
154 mutex_enter(DBUF_HASH_MUTEX(h, idx));
155 for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
156 if (DBUF_EQUAL(db, os, obj, level, blkid)) {
157 mutex_enter(&db->db_mtx);
158 if (db->db_state != DB_EVICTING) {
159 mutex_exit(DBUF_HASH_MUTEX(h, idx));
160 return (db);
161 }
162 mutex_exit(&db->db_mtx);
163 }
164 }
165 mutex_exit(DBUF_HASH_MUTEX(h, idx));
166 return (NULL);
167 }
168
169 /*
170 * Insert an entry into the hash table. If there is already an element
171 * equal to elem in the hash table, then the already existing element
172 * will be returned and the new element will not be inserted.
173 * Otherwise returns NULL.
174 */
175 static dmu_buf_impl_t *
176 dbuf_hash_insert(dmu_buf_impl_t *db)
177 {
178 dbuf_hash_table_t *h = &dbuf_hash_table;
179 objset_t *os = db->db_objset;
180 uint64_t obj = db->db.db_object;
181 int level = db->db_level;
182 uint64_t blkid, hv, idx;
183 dmu_buf_impl_t *dbf;
184
185 blkid = db->db_blkid;
186 hv = DBUF_HASH(os, obj, level, blkid);
187 idx = hv & h->hash_table_mask;
188
189 mutex_enter(DBUF_HASH_MUTEX(h, idx));
190 for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
191 if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
192 mutex_enter(&dbf->db_mtx);
193 if (dbf->db_state != DB_EVICTING) {
194 mutex_exit(DBUF_HASH_MUTEX(h, idx));
195 return (dbf);
196 }
197 mutex_exit(&dbf->db_mtx);
198 }
199 }
200
201 mutex_enter(&db->db_mtx);
202 db->db_hash_next = h->hash_table[idx];
203 h->hash_table[idx] = db;
204 mutex_exit(DBUF_HASH_MUTEX(h, idx));
205 atomic_add_64(&dbuf_hash_count, 1);
206
207 return (NULL);
208 }
209
210 /*
211 * Remove an entry from the hash table. This operation will
212 * fail if there are any existing holds on the db.
213 */
214 static void
215 dbuf_hash_remove(dmu_buf_impl_t *db)
216 {
217 dbuf_hash_table_t *h = &dbuf_hash_table;
218 uint64_t hv, idx;
219 dmu_buf_impl_t *dbf, **dbp;
220
221 hv = DBUF_HASH(db->db_objset, db->db.db_object,
222 db->db_level, db->db_blkid);
223 idx = hv & h->hash_table_mask;
224
225 /*
226 * We mustn't hold db_mtx to maintain lock ordering:
227 * DBUF_HASH_MUTEX > db_mtx.
228 */
229 ASSERT(refcount_is_zero(&db->db_holds));
230 ASSERT(db->db_state == DB_EVICTING);
231 ASSERT(!MUTEX_HELD(&db->db_mtx));
232
233 mutex_enter(DBUF_HASH_MUTEX(h, idx));
234 dbp = &h->hash_table[idx];
235 while ((dbf = *dbp) != db) {
236 dbp = &dbf->db_hash_next;
237 ASSERT(dbf != NULL);
238 }
239 *dbp = db->db_hash_next;
240 db->db_hash_next = NULL;
241 mutex_exit(DBUF_HASH_MUTEX(h, idx));
242 atomic_add_64(&dbuf_hash_count, -1);
243 }
244
245 static arc_evict_func_t dbuf_do_evict;
246
247 static void
248 dbuf_evict_user(dmu_buf_impl_t *db)
249 {
250 ASSERT(MUTEX_HELD(&db->db_mtx));
251
252 if (db->db_level != 0 || db->db_evict_func == NULL)
253 return;
254
255 if (db->db_user_data_ptr_ptr)
256 *db->db_user_data_ptr_ptr = db->db.db_data;
257 db->db_evict_func(&db->db, db->db_user_ptr);
258 db->db_user_ptr = NULL;
259 db->db_user_data_ptr_ptr = NULL;
260 db->db_evict_func = NULL;
261 }
262
263 boolean_t
264 dbuf_is_metadata(dmu_buf_impl_t *db)
265 {
266 if (db->db_level > 0) {
267 return (B_TRUE);
268 } else {
269 boolean_t is_metadata;
270
271 DB_DNODE_ENTER(db);
272 is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
273 DB_DNODE_EXIT(db);
274
275 return (is_metadata);
276 }
277 }
278
279 void
280 dbuf_evict(dmu_buf_impl_t *db)
281 {
282 ASSERT(MUTEX_HELD(&db->db_mtx));
283 ASSERT(db->db_buf == NULL);
284 ASSERT(db->db_data_pending == NULL);
285
286 dbuf_clear(db);
287 dbuf_destroy(db);
288 }
289
290 void
291 dbuf_init(void)
292 {
293 uint64_t hsize = 1ULL << 16;
294 dbuf_hash_table_t *h = &dbuf_hash_table;
295 int i;
296
297 /*
298 * The hash table is big enough to fill all of physical memory
299 * with an average 4K block size. The table will take up
300 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
301 */
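	/*
	 * For example, with 16 GiB of physical memory and 8-byte pointers the
	 * loop below settles on hsize = 4M buckets, i.e. a 32 MB table
	 * (16 GiB / 4 KiB = 4M entries of 8 bytes each).
	 */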
302 while (hsize * 4096 < physmem * PAGESIZE)
303 hsize <<= 1;
304
305 retry:
306 h->hash_table_mask = hsize - 1;
307 #if defined(_KERNEL) && defined(HAVE_SPL)
308 /* Large allocations which do not require contiguous pages
309 * should be using vmem_alloc() in the linux kernel */
310 h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_PUSHPAGE);
311 #else
312 h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
313 #endif
314 if (h->hash_table == NULL) {
315 /* XXX - we should really return an error instead of assert */
316 ASSERT(hsize > (1ULL << 10));
317 hsize >>= 1;
318 goto retry;
319 }
320
321 dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
322 sizeof (dmu_buf_impl_t),
323 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
324
325 for (i = 0; i < DBUF_MUTEXES; i++)
326 mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
327
328 dbuf_stats_init(h);
329 }
330
331 void
332 dbuf_fini(void)
333 {
334 dbuf_hash_table_t *h = &dbuf_hash_table;
335 int i;
336
337 dbuf_stats_destroy();
338
339 for (i = 0; i < DBUF_MUTEXES; i++)
340 mutex_destroy(&h->hash_mutexes[i]);
341 #if defined(_KERNEL) && defined(HAVE_SPL)
342 /* Large allocations which do not require contiguous pages
343 * should be using vmem_free() in the linux kernel */
344 vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
345 #else
346 kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
347 #endif
348 kmem_cache_destroy(dbuf_cache);
349 }
350
351 /*
352 * Other stuff.
353 */
354
355 #ifdef ZFS_DEBUG
356 static void
357 dbuf_verify(dmu_buf_impl_t *db)
358 {
359 dnode_t *dn;
360 dbuf_dirty_record_t *dr;
361
362 ASSERT(MUTEX_HELD(&db->db_mtx));
363
364 if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
365 return;
366
367 ASSERT(db->db_objset != NULL);
368 DB_DNODE_ENTER(db);
369 dn = DB_DNODE(db);
370 if (dn == NULL) {
371 ASSERT(db->db_parent == NULL);
372 ASSERT(db->db_blkptr == NULL);
373 } else {
374 ASSERT3U(db->db.db_object, ==, dn->dn_object);
375 ASSERT3P(db->db_objset, ==, dn->dn_objset);
376 ASSERT3U(db->db_level, <, dn->dn_nlevels);
377 ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
378 db->db_blkid == DMU_SPILL_BLKID ||
379 !list_is_empty(&dn->dn_dbufs));
380 }
381 if (db->db_blkid == DMU_BONUS_BLKID) {
382 ASSERT(dn != NULL);
383 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
384 ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
385 } else if (db->db_blkid == DMU_SPILL_BLKID) {
386 ASSERT(dn != NULL);
387 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
388 ASSERT0(db->db.db_offset);
389 } else {
390 ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
391 }
392
393 for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
394 ASSERT(dr->dr_dbuf == db);
395
396 for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
397 ASSERT(dr->dr_dbuf == db);
398
399 /*
400 * We can't assert that db_size matches dn_datablksz because it
401 * can be momentarily different when another thread is doing
402 * dnode_set_blksz().
403 */
404 if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
405 dr = db->db_data_pending;
406 /*
407 * It should only be modified in syncing context, so
408 * make sure we only have one copy of the data.
409 */
410 ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
411 }
412
413 /* verify db->db_blkptr */
414 if (db->db_blkptr) {
415 if (db->db_parent == dn->dn_dbuf) {
416 /* db is pointed to by the dnode */
417 /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
418 if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
419 ASSERT(db->db_parent == NULL);
420 else
421 ASSERT(db->db_parent != NULL);
422 if (db->db_blkid != DMU_SPILL_BLKID)
423 ASSERT3P(db->db_blkptr, ==,
424 &dn->dn_phys->dn_blkptr[db->db_blkid]);
425 } else {
426 /* db is pointed to by an indirect block */
427 ASSERTV(int epb = db->db_parent->db.db_size >>
428 SPA_BLKPTRSHIFT);
429 ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
430 ASSERT3U(db->db_parent->db.db_object, ==,
431 db->db.db_object);
432 /*
433 * dnode_grow_indblksz() can make this fail if we don't
434 * have the struct_rwlock. XXX indblksz no longer
435 * grows. safe to do this now?
436 */
437 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
438 ASSERT3P(db->db_blkptr, ==,
439 ((blkptr_t *)db->db_parent->db.db_data +
440 db->db_blkid % epb));
441 }
442 }
443 }
444 if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
445 (db->db_buf == NULL || db->db_buf->b_data) &&
446 db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
447 db->db_state != DB_FILL && !dn->dn_free_txg) {
448 /*
449 * If the blkptr isn't set but they have nonzero data,
450 * it had better be dirty, otherwise we'll lose that
451 * data when we evict this buffer.
452 */
453 if (db->db_dirtycnt == 0) {
454 ASSERTV(uint64_t *buf = db->db.db_data);
455 int i;
456
457 for (i = 0; i < db->db.db_size >> 3; i++) {
458 ASSERT(buf[i] == 0);
459 }
460 }
461 }
462 DB_DNODE_EXIT(db);
463 }
464 #endif
465
466 static void
467 dbuf_update_data(dmu_buf_impl_t *db)
468 {
469 ASSERT(MUTEX_HELD(&db->db_mtx));
470 if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
471 ASSERT(!refcount_is_zero(&db->db_holds));
472 *db->db_user_data_ptr_ptr = db->db.db_data;
473 }
474 }
475
476 static void
477 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
478 {
479 ASSERT(MUTEX_HELD(&db->db_mtx));
480 ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
481 db->db_buf = buf;
482 if (buf != NULL) {
483 ASSERT(buf->b_data != NULL);
484 db->db.db_data = buf->b_data;
485 if (!arc_released(buf))
486 arc_set_callback(buf, dbuf_do_evict, db);
487 dbuf_update_data(db);
488 } else {
489 dbuf_evict_user(db);
490 db->db.db_data = NULL;
491 if (db->db_state != DB_NOFILL)
492 db->db_state = DB_UNCACHED;
493 }
494 }
495
496 /*
497 * Loan out an arc_buf for read. Return the loaned arc_buf.
498 */
499 arc_buf_t *
500 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
501 {
502 arc_buf_t *abuf;
503
504 mutex_enter(&db->db_mtx);
505 if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
506 int blksz = db->db.db_size;
507 spa_t *spa;
508
509 mutex_exit(&db->db_mtx);
510 DB_GET_SPA(&spa, db);
511 abuf = arc_loan_buf(spa, blksz);
512 bcopy(db->db.db_data, abuf->b_data, blksz);
513 } else {
514 abuf = db->db_buf;
515 arc_loan_inuse_buf(abuf, db);
516 dbuf_set_data(db, NULL);
517 mutex_exit(&db->db_mtx);
518 }
519 return (abuf);
520 }
521
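/*
 * Map a byte offset within an object to its level-0 block number. For
 * example, with 128K data blocks (dn_datablkshift == 17) offset 300000
 * falls in block 2; an object with a single, non-power-of-two-sized block
 * has dn_datablkshift == 0 and always maps to block 0.
 */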
522 uint64_t
523 dbuf_whichblock(dnode_t *dn, uint64_t offset)
524 {
525 if (dn->dn_datablkshift) {
526 return (offset >> dn->dn_datablkshift);
527 } else {
528 ASSERT3U(offset, <, dn->dn_datablksz);
529 return (0);
530 }
531 }
532
533 static void
534 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
535 {
536 dmu_buf_impl_t *db = vdb;
537
538 mutex_enter(&db->db_mtx);
539 ASSERT3U(db->db_state, ==, DB_READ);
540 /*
541 * All reads are synchronous, so we must have a hold on the dbuf
542 */
543 ASSERT(refcount_count(&db->db_holds) > 0);
544 ASSERT(db->db_buf == NULL);
545 ASSERT(db->db.db_data == NULL);
546 if (db->db_level == 0 && db->db_freed_in_flight) {
547 /* we were freed in flight; disregard any error */
548 arc_release(buf, db);
549 bzero(buf->b_data, db->db.db_size);
550 arc_buf_freeze(buf);
551 db->db_freed_in_flight = FALSE;
552 dbuf_set_data(db, buf);
553 db->db_state = DB_CACHED;
554 } else if (zio == NULL || zio->io_error == 0) {
555 dbuf_set_data(db, buf);
556 db->db_state = DB_CACHED;
557 } else {
558 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
559 ASSERT3P(db->db_buf, ==, NULL);
560 VERIFY(arc_buf_remove_ref(buf, db));
561 db->db_state = DB_UNCACHED;
562 }
563 cv_broadcast(&db->db_changed);
564 dbuf_rele_and_unlock(db, NULL);
565 }
566
567 static void
568 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
569 {
570 dnode_t *dn;
571 spa_t *spa;
572 zbookmark_t zb;
573 uint32_t aflags = ARC_NOWAIT;
574
575 DB_DNODE_ENTER(db);
576 dn = DB_DNODE(db);
577 ASSERT(!refcount_is_zero(&db->db_holds));
578 /* We need the struct_rwlock to prevent db_blkptr from changing. */
579 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
580 ASSERT(MUTEX_HELD(&db->db_mtx));
581 ASSERT(db->db_state == DB_UNCACHED);
582 ASSERT(db->db_buf == NULL);
583
584 if (db->db_blkid == DMU_BONUS_BLKID) {
585 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
586
587 ASSERT3U(bonuslen, <=, db->db.db_size);
588 db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
589 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
590 if (bonuslen < DN_MAX_BONUSLEN)
591 bzero(db->db.db_data, DN_MAX_BONUSLEN);
592 if (bonuslen)
593 bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
594 DB_DNODE_EXIT(db);
595 dbuf_update_data(db);
596 db->db_state = DB_CACHED;
597 mutex_exit(&db->db_mtx);
598 return;
599 }
600
601 /*
602 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
603 * processes the delete record and clears the bp while we are waiting
604 * for the dn_mtx (resulting in a "no" from block_freed).
605 */
606 if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
607 (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
608 BP_IS_HOLE(db->db_blkptr)))) {
609 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
610
611 dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
612 db->db.db_size, db, type));
613 DB_DNODE_EXIT(db);
614 bzero(db->db.db_data, db->db.db_size);
615 db->db_state = DB_CACHED;
616 *flags |= DB_RF_CACHED;
617 mutex_exit(&db->db_mtx);
618 return;
619 }
620
621 spa = dn->dn_objset->os_spa;
622 DB_DNODE_EXIT(db);
623
624 db->db_state = DB_READ;
625 mutex_exit(&db->db_mtx);
626
627 if (DBUF_IS_L2CACHEABLE(db))
628 aflags |= ARC_L2CACHE;
629 if (DBUF_IS_L2COMPRESSIBLE(db))
630 aflags |= ARC_L2COMPRESS;
631
632 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
633 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
634 db->db.db_object, db->db_level, db->db_blkid);
635
636 dbuf_add_ref(db, NULL);
637
638 (void) arc_read(zio, spa, db->db_blkptr,
639 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
640 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
641 &aflags, &zb);
642 if (aflags & ARC_CACHED)
643 *flags |= DB_RF_CACHED;
644 }
645
646 int
647 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
648 {
649 int err = 0;
650 int havepzio = (zio != NULL);
651 int prefetch;
652 dnode_t *dn;
653
654 /*
655 * We don't have to hold the mutex to check db_state because it
656 * can't be freed while we have a hold on the buffer.
657 */
658 ASSERT(!refcount_is_zero(&db->db_holds));
659
660 if (db->db_state == DB_NOFILL)
661 return (SET_ERROR(EIO));
662
663 DB_DNODE_ENTER(db);
664 dn = DB_DNODE(db);
665 if ((flags & DB_RF_HAVESTRUCT) == 0)
666 rw_enter(&dn->dn_struct_rwlock, RW_READER);
667
668 prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
669 (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
670 DBUF_IS_CACHEABLE(db);
671
672 mutex_enter(&db->db_mtx);
673 if (db->db_state == DB_CACHED) {
674 mutex_exit(&db->db_mtx);
675 if (prefetch)
676 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
677 db->db.db_size, TRUE);
678 if ((flags & DB_RF_HAVESTRUCT) == 0)
679 rw_exit(&dn->dn_struct_rwlock);
680 DB_DNODE_EXIT(db);
681 } else if (db->db_state == DB_UNCACHED) {
682 spa_t *spa = dn->dn_objset->os_spa;
683
684 if (zio == NULL)
685 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
686 dbuf_read_impl(db, zio, &flags);
687
688 /* dbuf_read_impl has dropped db_mtx for us */
689
690 if (prefetch)
691 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
692 db->db.db_size, flags & DB_RF_CACHED);
693
694 if ((flags & DB_RF_HAVESTRUCT) == 0)
695 rw_exit(&dn->dn_struct_rwlock);
696 DB_DNODE_EXIT(db);
697
698 if (!havepzio)
699 err = zio_wait(zio);
700 } else {
701 /*
702 * Another reader came in while the dbuf was in flight
703 * between UNCACHED and CACHED. Either a writer will finish
704 * writing the buffer (sending the dbuf to CACHED) or the
705 * first reader's request will reach the read_done callback
706 * and send the dbuf to CACHED. Otherwise, a failure
707 * occurred and the dbuf went to UNCACHED.
708 */
709 mutex_exit(&db->db_mtx);
710 if (prefetch)
711 dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
712 db->db.db_size, TRUE);
713 if ((flags & DB_RF_HAVESTRUCT) == 0)
714 rw_exit(&dn->dn_struct_rwlock);
715 DB_DNODE_EXIT(db);
716
717 /* Skip the wait per the caller's request. */
718 mutex_enter(&db->db_mtx);
719 if ((flags & DB_RF_NEVERWAIT) == 0) {
720 while (db->db_state == DB_READ ||
721 db->db_state == DB_FILL) {
722 ASSERT(db->db_state == DB_READ ||
723 (flags & DB_RF_HAVESTRUCT) == 0);
724 cv_wait(&db->db_changed, &db->db_mtx);
725 }
726 if (db->db_state == DB_UNCACHED)
727 err = SET_ERROR(EIO);
728 }
729 mutex_exit(&db->db_mtx);
730 }
731
732 ASSERT(err || havepzio || db->db_state == DB_CACHED);
733 return (err);
734 }
735
736 static void
737 dbuf_noread(dmu_buf_impl_t *db)
738 {
739 ASSERT(!refcount_is_zero(&db->db_holds));
740 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
741 mutex_enter(&db->db_mtx);
742 while (db->db_state == DB_READ || db->db_state == DB_FILL)
743 cv_wait(&db->db_changed, &db->db_mtx);
744 if (db->db_state == DB_UNCACHED) {
745 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
746 spa_t *spa;
747
748 ASSERT(db->db_buf == NULL);
749 ASSERT(db->db.db_data == NULL);
750 DB_GET_SPA(&spa, db);
751 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
752 db->db_state = DB_FILL;
753 } else if (db->db_state == DB_NOFILL) {
754 dbuf_set_data(db, NULL);
755 } else {
756 ASSERT3U(db->db_state, ==, DB_CACHED);
757 }
758 mutex_exit(&db->db_mtx);
759 }
760
761 /*
762 * This is our just-in-time copy function. It makes a copy of
763 * buffers that have been modified in a previous transaction
764 * group, before we modify them in the currently active group.
765 *
766 * This function is used in two places: when we are dirtying a
767 * buffer for the first time in a txg, and when we are freeing
768 * a range in a dnode that includes this buffer.
769 *
770 * Note that when we are called from dbuf_free_range() we do
771 * not put a hold on the buffer, we just traverse the active
772 * dbuf list for the dnode.
773 */
774 static void
775 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
776 {
777 dbuf_dirty_record_t *dr = db->db_last_dirty;
778
779 ASSERT(MUTEX_HELD(&db->db_mtx));
780 ASSERT(db->db.db_data != NULL);
781 ASSERT(db->db_level == 0);
782 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
783
784 if (dr == NULL ||
785 (dr->dt.dl.dr_data !=
786 ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
787 return;
788
789 /*
790 * If the last dirty record for this dbuf has not yet synced
791 * and it is referencing the dbuf data, either:
792 * reset the reference to point to a new copy,
793 * or (if there are no active holders)
794 * just null out the current db_data pointer.
795 */
796 ASSERT(dr->dr_txg >= txg - 2);
797 if (db->db_blkid == DMU_BONUS_BLKID) {
798 /* Note that the data bufs here are zio_bufs */
799 dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
800 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
801 bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
802 } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
803 int size = db->db.db_size;
804 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
805 spa_t *spa;
806
807 DB_GET_SPA(&spa, db);
808 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
809 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
810 } else {
811 dbuf_set_data(db, NULL);
812 }
813 }
814
815 void
816 dbuf_unoverride(dbuf_dirty_record_t *dr)
817 {
818 dmu_buf_impl_t *db = dr->dr_dbuf;
819 blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
820 uint64_t txg = dr->dr_txg;
821
822 ASSERT(MUTEX_HELD(&db->db_mtx));
823 ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
824 ASSERT(db->db_level == 0);
825
826 if (db->db_blkid == DMU_BONUS_BLKID ||
827 dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
828 return;
829
830 ASSERT(db->db_data_pending != dr);
831
832 /* free this block */
833 if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
834 spa_t *spa;
835
836 DB_GET_SPA(&spa, db);
837 zio_free(spa, txg, bp);
838 }
839 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
840 dr->dt.dl.dr_nopwrite = B_FALSE;
841
842 /*
843 * Release the already-written buffer, so we leave it in
844 * a consistent dirty state. Note that all callers are
845 * modifying the buffer, so they will immediately do
846 * another (redundant) arc_release(). Therefore, leave
847 * the buf thawed to save the effort of freezing &
848 * immediately re-thawing it.
849 */
850 arc_release(dr->dt.dl.dr_data, db);
851 }
852
853 /*
854 * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
855 * data blocks in the free range, so that any future readers will find
856 * empty blocks. Also, if we happen across any level-1 dbufs in the
857 * range that have not already been marked dirty, mark them dirty so
858 * they stay in memory.
859 *
860 * This is a no-op if the dataset is in the middle of an incremental
861 * receive; see comment below for details.
862 */
863 void
864 dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
865 {
866 dmu_buf_impl_t *db, *db_next;
867 uint64_t txg = tx->tx_txg;
868 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
869 uint64_t first_l1 = start >> epbs;
870 uint64_t last_l1 = end >> epbs;
871
872 if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
873 end = dn->dn_maxblkid;
874 last_l1 = end >> epbs;
875 }
876 dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
877
878 mutex_enter(&dn->dn_dbufs_mtx);
879 if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
880 /* There can't be any dbufs in this range; no need to search. */
881 mutex_exit(&dn->dn_dbufs_mtx);
882 return;
883 } else if (dmu_objset_is_receiving(dn->dn_objset)) {
884 /*
885 * If we are receiving, we expect there to be no dbufs in
886 * the range to be freed, because receive modifies each
887 * block at most once, and in offset order. If this is
888 * not the case, it can lead to performance problems,
889 * so note that we unexpectedly took the slow path.
890 */
891 atomic_inc_64(&zfs_free_range_recv_miss);
892 }
893
894 for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
895 db_next = list_next(&dn->dn_dbufs, db);
896 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
897
898 if (db->db_level == 1 &&
899 db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
900 mutex_enter(&db->db_mtx);
901 if (db->db_last_dirty &&
902 db->db_last_dirty->dr_txg < txg) {
903 dbuf_add_ref(db, FTAG);
904 mutex_exit(&db->db_mtx);
905 dbuf_will_dirty(db, tx);
906 dbuf_rele(db, FTAG);
907 } else {
908 mutex_exit(&db->db_mtx);
909 }
910 }
911
912 if (db->db_level != 0)
913 continue;
914 dprintf_dbuf(db, "found buf %s\n", "");
915 if (db->db_blkid < start || db->db_blkid > end)
916 continue;
917
918 /* found a level 0 buffer in the range */
919 mutex_enter(&db->db_mtx);
920 if (dbuf_undirty(db, tx)) {
921 /* mutex has been dropped and dbuf destroyed */
922 continue;
923 }
924
925 if (db->db_state == DB_UNCACHED ||
926 db->db_state == DB_NOFILL ||
927 db->db_state == DB_EVICTING) {
928 ASSERT(db->db.db_data == NULL);
929 mutex_exit(&db->db_mtx);
930 continue;
931 }
932 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
933 /* will be handled in dbuf_read_done or dbuf_rele */
934 db->db_freed_in_flight = TRUE;
935 mutex_exit(&db->db_mtx);
936 continue;
937 }
938 if (refcount_count(&db->db_holds) == 0) {
939 ASSERT(db->db_buf);
940 dbuf_clear(db);
941 continue;
942 }
943 /* The dbuf is referenced */
944
945 if (db->db_last_dirty != NULL) {
946 dbuf_dirty_record_t *dr = db->db_last_dirty;
947
948 if (dr->dr_txg == txg) {
949 /*
950 * This buffer is "in-use", re-adjust the file
951 * size to reflect that this buffer may
952 * contain new data when we sync.
953 */
954 if (db->db_blkid != DMU_SPILL_BLKID &&
955 db->db_blkid > dn->dn_maxblkid)
956 dn->dn_maxblkid = db->db_blkid;
957 dbuf_unoverride(dr);
958 } else {
959 /*
960 * This dbuf is not dirty in the open context.
961 * Either uncache it (if it's not referenced in
962 * the open context) or reset its contents to
963 * empty.
964 */
965 dbuf_fix_old_data(db, txg);
966 }
967 }
968 /* clear the contents if it's cached */
969 if (db->db_state == DB_CACHED) {
970 ASSERT(db->db.db_data != NULL);
971 arc_release(db->db_buf, db);
972 bzero(db->db.db_data, db->db.db_size);
973 arc_buf_freeze(db->db_buf);
974 }
975
976 mutex_exit(&db->db_mtx);
977 }
978 mutex_exit(&dn->dn_dbufs_mtx);
979 }
980
981 static int
982 dbuf_block_freeable(dmu_buf_impl_t *db)
983 {
984 dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
985 uint64_t birth_txg = 0;
986
987 /*
988 * We don't need any locking to protect db_blkptr:
989 * If it's syncing, then db_last_dirty will be set
990 * so we'll ignore db_blkptr.
991 */
992 ASSERT(MUTEX_HELD(&db->db_mtx));
993 if (db->db_last_dirty)
994 birth_txg = db->db_last_dirty->dr_txg;
995 else if (db->db_blkptr)
996 birth_txg = db->db_blkptr->blk_birth;
997
998 /*
999 * If we don't exist or are in a snapshot, we can't be freed.
1000 * Don't pass the bp to dsl_dataset_block_freeable() since we
1001 * are holding the db_mtx lock and might deadlock if we are
1002 * prefetching a dedup-ed block.
1003 */
1004 if (birth_txg)
1005 return (ds == NULL ||
1006 dsl_dataset_block_freeable(ds, NULL, birth_txg));
1007 else
1008 return (FALSE);
1009 }
1010
1011 void
1012 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
1013 {
1014 arc_buf_t *buf, *obuf;
1015 int osize = db->db.db_size;
1016 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1017 dnode_t *dn;
1018
1019 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1020
1021 DB_DNODE_ENTER(db);
1022 dn = DB_DNODE(db);
1023
1024 /* XXX does *this* func really need the lock? */
1025 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
1026
1027 /*
1028 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
1029 * is OK, because there can be no other references to the db
1030 * when we are changing its size, so no concurrent DB_FILL can
1031 * be happening.
1032 */
1033 /*
1034 * XXX we should be doing a dbuf_read, checking the return
1035 * value and returning that up to our callers
1036 */
1037 dbuf_will_dirty(db, tx);
1038
1039 /* create the data buffer for the new block */
1040 buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
1041
1042 /* copy old block data to the new block */
1043 obuf = db->db_buf;
1044 bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
1045 /* zero the remainder */
1046 if (size > osize)
1047 bzero((uint8_t *)buf->b_data + osize, size - osize);
1048
1049 mutex_enter(&db->db_mtx);
1050 dbuf_set_data(db, buf);
1051 VERIFY(arc_buf_remove_ref(obuf, db));
1052 db->db.db_size = size;
1053
1054 if (db->db_level == 0) {
1055 ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
1056 db->db_last_dirty->dt.dl.dr_data = buf;
1057 }
1058 mutex_exit(&db->db_mtx);
1059
1060 dnode_willuse_space(dn, size-osize, tx);
1061 DB_DNODE_EXIT(db);
1062 }
1063
1064 void
1065 dbuf_release_bp(dmu_buf_impl_t *db)
1066 {
1067 objset_t *os;
1068
1069 DB_GET_OBJSET(&os, db);
1070 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1071 ASSERT(arc_released(os->os_phys_buf) ||
1072 list_link_active(&os->os_dsl_dataset->ds_synced_link));
1073 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1074
1075 (void) arc_release(db->db_buf, db);
1076 }
1077
1078 dbuf_dirty_record_t *
1079 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1080 {
1081 dnode_t *dn;
1082 objset_t *os;
1083 dbuf_dirty_record_t **drp, *dr;
1084 int drop_struct_lock = FALSE;
1085 boolean_t do_free_accounting = B_FALSE;
1086 int txgoff = tx->tx_txg & TXG_MASK;
1087
1088 ASSERT(tx->tx_txg != 0);
1089 ASSERT(!refcount_is_zero(&db->db_holds));
1090 DMU_TX_DIRTY_BUF(tx, db);
1091
1092 DB_DNODE_ENTER(db);
1093 dn = DB_DNODE(db);
1094 /*
1095 * Shouldn't dirty a regular buffer in syncing context. Private
1096 * objects may be dirtied in syncing context, but only if they
1097 * were already pre-dirtied in open context.
1098 */
1099 ASSERT(!dmu_tx_is_syncing(tx) ||
1100 BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1101 DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1102 dn->dn_objset->os_dsl_dataset == NULL);
1103 /*
1104 * We make this assert for private objects as well, but after we
1105 * check if we're already dirty. They are allowed to re-dirty
1106 * in syncing context.
1107 */
1108 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1109 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1110 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1111
1112 mutex_enter(&db->db_mtx);
1113 /*
1114 * XXX make this true for indirects too? The problem is that
1115 * transactions created with dmu_tx_create_assigned() from
1116 * syncing context don't bother holding ahead.
1117 */
1118 ASSERT(db->db_level != 0 ||
1119 db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1120 db->db_state == DB_NOFILL);
1121
1122 mutex_enter(&dn->dn_mtx);
1123 /*
1124 * Don't set dirtyctx to SYNC if we're just modifying this as we
1125 * initialize the objset.
1126 */
1127 if (dn->dn_dirtyctx == DN_UNDIRTIED &&
1128 !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1129 dn->dn_dirtyctx =
1130 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1131 ASSERT(dn->dn_dirtyctx_firstset == NULL);
1132 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_PUSHPAGE);
1133 }
1134 mutex_exit(&dn->dn_mtx);
1135
1136 if (db->db_blkid == DMU_SPILL_BLKID)
1137 dn->dn_have_spill = B_TRUE;
1138
1139 /*
1140 * If this buffer is already dirty, we're done.
1141 */
1142 drp = &db->db_last_dirty;
1143 ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
1144 db->db.db_object == DMU_META_DNODE_OBJECT);
1145 while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
1146 drp = &dr->dr_next;
1147 if (dr && dr->dr_txg == tx->tx_txg) {
1148 DB_DNODE_EXIT(db);
1149
1150 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1151 /*
1152 * If this buffer has already been written out,
1153 * we now need to reset its state.
1154 */
1155 dbuf_unoverride(dr);
1156 if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1157 db->db_state != DB_NOFILL)
1158 arc_buf_thaw(db->db_buf);
1159 }
1160 mutex_exit(&db->db_mtx);
1161 return (dr);
1162 }
1163
1164 /*
1165 * Only valid if not already dirty.
1166 */
1167 ASSERT(dn->dn_object == 0 ||
1168 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1169 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1170
1171 ASSERT3U(dn->dn_nlevels, >, db->db_level);
1172 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
1173 dn->dn_phys->dn_nlevels > db->db_level ||
1174 dn->dn_next_nlevels[txgoff] > db->db_level ||
1175 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
1176 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
1177
1178 /*
1179 * We should only be dirtying in syncing context if it's the
1180 * mos or we're initializing the os or it's a special object.
1181 * However, we are allowed to dirty in syncing context provided
1182 * we already dirtied it in open context. Hence we must make
1183 * this assertion only if we're not already dirty.
1184 */
1185 os = dn->dn_objset;
1186 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1187 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
1188 ASSERT(db->db.db_size != 0);
1189
1190 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1191
1192 if (db->db_blkid != DMU_BONUS_BLKID) {
1193 /*
1194 * Update the accounting.
1195 * Note: we delay "free accounting" until after we drop
1196 * the db_mtx. This keeps us from grabbing other locks
1197 * (and possibly deadlocking) in bp_get_dsize() while
1198 * also holding the db_mtx.
1199 */
1200 dnode_willuse_space(dn, db->db.db_size, tx);
1201 do_free_accounting = dbuf_block_freeable(db);
1202 }
1203
1204 /*
1205 * If this buffer is dirty in an old transaction group we need
1206 * to make a copy of it so that the changes we make in this
1207 * transaction group won't leak out when we sync the older txg.
1208 */
1209 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_PUSHPAGE);
1210 list_link_init(&dr->dr_dirty_node);
1211 if (db->db_level == 0) {
1212 void *data_old = db->db_buf;
1213
1214 if (db->db_state != DB_NOFILL) {
1215 if (db->db_blkid == DMU_BONUS_BLKID) {
1216 dbuf_fix_old_data(db, tx->tx_txg);
1217 data_old = db->db.db_data;
1218 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
1219 /*
1220 * Release the data buffer from the cache so
1221 * that we can modify it without impacting
1222 * possible other users of this cached data
1223 * block. Note that indirect blocks and
1224 * private objects are not released until the
1225 * syncing state (since they are only modified
1226 * then).
1227 */
1228 arc_release(db->db_buf, db);
1229 dbuf_fix_old_data(db, tx->tx_txg);
1230 data_old = db->db_buf;
1231 }
1232 ASSERT(data_old != NULL);
1233 }
1234 dr->dt.dl.dr_data = data_old;
1235 } else {
1236 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
1237 list_create(&dr->dt.di.dr_children,
1238 sizeof (dbuf_dirty_record_t),
1239 offsetof(dbuf_dirty_record_t, dr_dirty_node));
1240 }
1241 if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
1242 dr->dr_accounted = db->db.db_size;
1243 dr->dr_dbuf = db;
1244 dr->dr_txg = tx->tx_txg;
1245 dr->dr_next = *drp;
1246 *drp = dr;
1247
1248 /*
1249 * We could have been freed_in_flight between the dbuf_noread
1250 * and dbuf_dirty. We win, as though the dbuf_noread() had
1251 * happened after the free.
1252 */
1253 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
1254 db->db_blkid != DMU_SPILL_BLKID) {
1255 mutex_enter(&dn->dn_mtx);
1256 dnode_clear_range(dn, db->db_blkid, 1, tx);
1257 mutex_exit(&dn->dn_mtx);
1258 db->db_freed_in_flight = FALSE;
1259 }
1260
1261 /*
1262 * This buffer is now part of this txg
1263 */
1264 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
1265 db->db_dirtycnt += 1;
1266 ASSERT3U(db->db_dirtycnt, <=, 3);
1267
1268 mutex_exit(&db->db_mtx);
1269
1270 if (db->db_blkid == DMU_BONUS_BLKID ||
1271 db->db_blkid == DMU_SPILL_BLKID) {
1272 mutex_enter(&dn->dn_mtx);
1273 ASSERT(!list_link_active(&dr->dr_dirty_node));
1274 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1275 mutex_exit(&dn->dn_mtx);
1276 dnode_setdirty(dn, tx);
1277 DB_DNODE_EXIT(db);
1278 return (dr);
1279 } else if (do_free_accounting) {
1280 blkptr_t *bp = db->db_blkptr;
1281 int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
1282 bp_get_dsize(os->os_spa, bp) : db->db.db_size;
1283 /*
1284 * This is only a guess -- if the dbuf is dirty
1285 * in a previous txg, we don't know how much
1286 * space it will use on disk yet. We should
1287 * really have the struct_rwlock to access
1288 * db_blkptr, but since this is just a guess,
1289 * it's OK if we get an odd answer.
1290 */
1291 ddt_prefetch(os->os_spa, bp);
1292 dnode_willuse_space(dn, -willfree, tx);
1293 }
1294
1295 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
1296 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1297 drop_struct_lock = TRUE;
1298 }
1299
1300 if (db->db_level == 0) {
1301 dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
1302 ASSERT(dn->dn_maxblkid >= db->db_blkid);
1303 }
1304
1305 if (db->db_level+1 < dn->dn_nlevels) {
1306 dmu_buf_impl_t *parent = db->db_parent;
1307 dbuf_dirty_record_t *di;
1308 int parent_held = FALSE;
1309
1310 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
1311 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1312
1313 parent = dbuf_hold_level(dn, db->db_level+1,
1314 db->db_blkid >> epbs, FTAG);
1315 ASSERT(parent != NULL);
1316 parent_held = TRUE;
1317 }
1318 if (drop_struct_lock)
1319 rw_exit(&dn->dn_struct_rwlock);
1320 ASSERT3U(db->db_level+1, ==, parent->db_level);
1321 di = dbuf_dirty(parent, tx);
1322 if (parent_held)
1323 dbuf_rele(parent, FTAG);
1324
1325 mutex_enter(&db->db_mtx);
1326 /*
1327 * Since we've dropped the mutex, it's possible that
1328 * dbuf_undirty() might have changed this out from under us.
1329 */
1330 if (db->db_last_dirty == dr ||
1331 dn->dn_object == DMU_META_DNODE_OBJECT) {
1332 mutex_enter(&di->dt.di.dr_mtx);
1333 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
1334 ASSERT(!list_link_active(&dr->dr_dirty_node));
1335 list_insert_tail(&di->dt.di.dr_children, dr);
1336 mutex_exit(&di->dt.di.dr_mtx);
1337 dr->dr_parent = di;
1338 }
1339 mutex_exit(&db->db_mtx);
1340 } else {
1341 ASSERT(db->db_level+1 == dn->dn_nlevels);
1342 ASSERT(db->db_blkid < dn->dn_nblkptr);
1343 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
1344 mutex_enter(&dn->dn_mtx);
1345 ASSERT(!list_link_active(&dr->dr_dirty_node));
1346 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
1347 mutex_exit(&dn->dn_mtx);
1348 if (drop_struct_lock)
1349 rw_exit(&dn->dn_struct_rwlock);
1350 }
1351
1352 dnode_setdirty(dn, tx);
1353 DB_DNODE_EXIT(db);
1354 return (dr);
1355 }
1356
1357 /*
1358 * Undirty a buffer in the transaction group referenced by the given
1359 * transaction. Return whether this evicted the dbuf.
1360 */
1361 static boolean_t
1362 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1363 {
1364 dnode_t *dn;
1365 uint64_t txg = tx->tx_txg;
1366 dbuf_dirty_record_t *dr, **drp;
1367
1368 ASSERT(txg != 0);
1369 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1370 ASSERT0(db->db_level);
1371 ASSERT(MUTEX_HELD(&db->db_mtx));
1372
1373 /*
1374 * If this buffer is not dirty, we're done.
1375 */
1376 for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1377 if (dr->dr_txg <= txg)
1378 break;
1379 if (dr == NULL || dr->dr_txg < txg)
1380 return (B_FALSE);
1381 ASSERT(dr->dr_txg == txg);
1382 ASSERT(dr->dr_dbuf == db);
1383
1384 DB_DNODE_ENTER(db);
1385 dn = DB_DNODE(db);
1386
1387 /*
1388 * Note: This code will probably work even if there are concurrent
1389 * holders, but it is untested in that scenario, as the ZPL and
1390 * ztest have additional locking (the range locks) that prevents
1391 * that type of concurrent access.
1392 */
1393 ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
1394
1395 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
1396
1397 ASSERT(db->db.db_size != 0);
1398
1399 /*
1400 * Any space we accounted for in dp_dirty_* will be cleaned up by
1401 * dsl_pool_sync(). This is relatively rare so the discrepancy
1402 * is not a big deal.
1403 */
1404
1405 *drp = dr->dr_next;
1406
1407 /*
1408 * Note that there are three places in dbuf_dirty()
1409 * where this dirty record may be put on a list.
1410 * Make sure to do a list_remove corresponding to
1411 * every one of those list_insert calls.
1412 */
1413 if (dr->dr_parent) {
1414 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
1415 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
1416 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
1417 } else if (db->db_blkid == DMU_SPILL_BLKID ||
1418 db->db_level+1 == dn->dn_nlevels) {
1419 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
1420 mutex_enter(&dn->dn_mtx);
1421 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
1422 mutex_exit(&dn->dn_mtx);
1423 }
1424 DB_DNODE_EXIT(db);
1425
1426 if (db->db_state != DB_NOFILL) {
1427 dbuf_unoverride(dr);
1428
1429 ASSERT(db->db_buf != NULL);
1430 ASSERT(dr->dt.dl.dr_data != NULL);
1431 if (dr->dt.dl.dr_data != db->db_buf)
1432 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
1433 }
1434 kmem_free(dr, sizeof (dbuf_dirty_record_t));
1435
1436 ASSERT(db->db_dirtycnt > 0);
1437 db->db_dirtycnt -= 1;
1438
1439 if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
1440 arc_buf_t *buf = db->db_buf;
1441
1442 ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
1443 dbuf_set_data(db, NULL);
1444 VERIFY(arc_buf_remove_ref(buf, db));
1445 dbuf_evict(db);
1446 return (B_TRUE);
1447 }
1448
1449 return (B_FALSE);
1450 }
1451
1452 #pragma weak dmu_buf_will_dirty = dbuf_will_dirty
1453 void
1454 dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1455 {
1456 int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
1457
1458 ASSERT(tx->tx_txg != 0);
1459 ASSERT(!refcount_is_zero(&db->db_holds));
1460
1461 DB_DNODE_ENTER(db);
1462 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
1463 rf |= DB_RF_HAVESTRUCT;
1464 DB_DNODE_EXIT(db);
1465 (void) dbuf_read(db, NULL, rf);
1466 (void) dbuf_dirty(db, tx);
1467 }
1468
1469 void
1470 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1471 {
1472 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1473
1474 db->db_state = DB_NOFILL;
1475
1476 dmu_buf_will_fill(db_fake, tx);
1477 }
1478
1479 void
1480 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
1481 {
1482 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1483
1484 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1485 ASSERT(tx->tx_txg != 0);
1486 ASSERT(db->db_level == 0);
1487 ASSERT(!refcount_is_zero(&db->db_holds));
1488
1489 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
1490 dmu_tx_private_ok(tx));
1491
1492 dbuf_noread(db);
1493 (void) dbuf_dirty(db, tx);
1494 }
1495
1496 #pragma weak dmu_buf_fill_done = dbuf_fill_done
1497 /* ARGSUSED */
1498 void
1499 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
1500 {
1501 mutex_enter(&db->db_mtx);
1502 DBUF_VERIFY(db);
1503
1504 if (db->db_state == DB_FILL) {
1505 if (db->db_level == 0 && db->db_freed_in_flight) {
1506 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1507 /* we were freed while filling */
1508 /* XXX dbuf_undirty? */
1509 bzero(db->db.db_data, db->db.db_size);
1510 db->db_freed_in_flight = FALSE;
1511 }
1512 db->db_state = DB_CACHED;
1513 cv_broadcast(&db->db_changed);
1514 }
1515 mutex_exit(&db->db_mtx);
1516 }
1517
1518 /*
1519 * Directly assign a provided arc buf to a given dbuf if it's not referenced
1520 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
1521 */
1522 void
1523 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
1524 {
1525 ASSERT(!refcount_is_zero(&db->db_holds));
1526 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1527 ASSERT(db->db_level == 0);
1528 ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
1529 ASSERT(buf != NULL);
1530 ASSERT(arc_buf_size(buf) == db->db.db_size);
1531 ASSERT(tx->tx_txg != 0);
1532
1533 arc_return_buf(buf, db);
1534 ASSERT(arc_released(buf));
1535
1536 mutex_enter(&db->db_mtx);
1537
1538 while (db->db_state == DB_READ || db->db_state == DB_FILL)
1539 cv_wait(&db->db_changed, &db->db_mtx);
1540
1541 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
1542
1543 if (db->db_state == DB_CACHED &&
1544 refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
1545 mutex_exit(&db->db_mtx);
1546 (void) dbuf_dirty(db, tx);
1547 bcopy(buf->b_data, db->db.db_data, db->db.db_size);
1548 VERIFY(arc_buf_remove_ref(buf, db));
1549 xuio_stat_wbuf_copied();
1550 return;
1551 }
1552
1553 xuio_stat_wbuf_nocopy();
1554 if (db->db_state == DB_CACHED) {
1555 dbuf_dirty_record_t *dr = db->db_last_dirty;
1556
1557 ASSERT(db->db_buf != NULL);
1558 if (dr != NULL && dr->dr_txg == tx->tx_txg) {
1559 ASSERT(dr->dt.dl.dr_data == db->db_buf);
1560 if (!arc_released(db->db_buf)) {
1561 ASSERT(dr->dt.dl.dr_override_state ==
1562 DR_OVERRIDDEN);
1563 arc_release(db->db_buf, db);
1564 }
1565 dr->dt.dl.dr_data = buf;
1566 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1567 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
1568 arc_release(db->db_buf, db);
1569 VERIFY(arc_buf_remove_ref(db->db_buf, db));
1570 }
1571 db->db_buf = NULL;
1572 }
1573 ASSERT(db->db_buf == NULL);
1574 dbuf_set_data(db, buf);
1575 db->db_state = DB_FILL;
1576 mutex_exit(&db->db_mtx);
1577 (void) dbuf_dirty(db, tx);
1578 dbuf_fill_done(db, tx);
1579 }
1580
1581 /*
1582 * "Clear" the contents of this dbuf. This will mark the dbuf
1583 * EVICTING and clear *most* of its references. Unfortunately,
1584 * when we are not holding the dn_dbufs_mtx, we can't clear the
1585 * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
1586 * in this case. For callers from the DMU we will usually see:
1587 * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
1588 * For the arc callback, we will usually see:
1589 * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
1590 * Sometimes, though, we will get a mix of these two:
1591 * DMU: dbuf_clear()->arc_buf_evict()
1592 * ARC: dbuf_do_evict()->dbuf_destroy()
1593 */
1594 void
1595 dbuf_clear(dmu_buf_impl_t *db)
1596 {
1597 dnode_t *dn;
1598 dmu_buf_impl_t *parent = db->db_parent;
1599 dmu_buf_impl_t *dndb;
1600 int dbuf_gone = FALSE;
1601
1602 ASSERT(MUTEX_HELD(&db->db_mtx));
1603 ASSERT(refcount_is_zero(&db->db_holds));
1604
1605 dbuf_evict_user(db);
1606
1607 if (db->db_state == DB_CACHED) {
1608 ASSERT(db->db.db_data != NULL);
1609 if (db->db_blkid == DMU_BONUS_BLKID) {
1610 zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
1611 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
1612 }
1613 db->db.db_data = NULL;
1614 db->db_state = DB_UNCACHED;
1615 }
1616
1617 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
1618 ASSERT(db->db_data_pending == NULL);
1619
1620 db->db_state = DB_EVICTING;
1621 db->db_blkptr = NULL;
1622
1623 DB_DNODE_ENTER(db);
1624 dn = DB_DNODE(db);
1625 dndb = dn->dn_dbuf;
1626 if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
1627 list_remove(&dn->dn_dbufs, db);
1628 (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1629 membar_producer();
1630 DB_DNODE_EXIT(db);
1631 /*
1632 * Decrementing the dbuf count means that the hold corresponding
1633 * to the removed dbuf is no longer discounted in dnode_move(),
1634 * so the dnode cannot be moved until after we release the hold.
1635 * The membar_producer() ensures visibility of the decremented
1636 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
1637 * release any lock.
1638 */
1639 dnode_rele(dn, db);
1640 db->db_dnode_handle = NULL;
1641 } else {
1642 DB_DNODE_EXIT(db);
1643 }
1644
1645 if (db->db_buf)
1646 dbuf_gone = arc_buf_evict(db->db_buf);
1647
1648 if (!dbuf_gone)
1649 mutex_exit(&db->db_mtx);
1650
1651 /*
1652 * If this dbuf is referenced from an indirect dbuf,
1653 * decrement the ref count on the indirect dbuf.
1654 */
1655 if (parent && parent != dndb)
1656 dbuf_rele(parent, db);
1657 }
1658
1659 __attribute__((always_inline))
1660 static inline int
1661 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
1662 dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh)
1663 {
1664 int nlevels, epbs;
1665
1666 *parentp = NULL;
1667 *bpp = NULL;
1668
1669 ASSERT(blkid != DMU_BONUS_BLKID);
1670
1671 if (blkid == DMU_SPILL_BLKID) {
1672 mutex_enter(&dn->dn_mtx);
1673 if (dn->dn_have_spill &&
1674 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
1675 *bpp = &dn->dn_phys->dn_spill;
1676 else
1677 *bpp = NULL;
1678 dbuf_add_ref(dn->dn_dbuf, NULL);
1679 *parentp = dn->dn_dbuf;
1680 mutex_exit(&dn->dn_mtx);
1681 return (0);
1682 }
1683
1684 if (dn->dn_phys->dn_nlevels == 0)
1685 nlevels = 1;
1686 else
1687 nlevels = dn->dn_phys->dn_nlevels;
1688
1689 epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
1690
1691 ASSERT3U(level * epbs, <, 64);
1692 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1693 if (level >= nlevels ||
1694 (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
1695 /* the buffer has no parent yet */
1696 return (SET_ERROR(ENOENT));
1697 } else if (level < nlevels-1) {
1698 /* this block is referenced from an indirect block */
1699 int err;
1700 if (dh == NULL) {
1701 err = dbuf_hold_impl(dn, level+1, blkid >> epbs,
1702 fail_sparse, NULL, parentp);
1703 }
1704 else {
1705 __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1,
1706 blkid >> epbs, fail_sparse, NULL,
1707 parentp, dh->dh_depth + 1);
1708 err = __dbuf_hold_impl(dh + 1);
1709 }
1710 if (err)
1711 return (err);
1712 err = dbuf_read(*parentp, NULL,
1713 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
1714 if (err) {
1715 dbuf_rele(*parentp, NULL);
1716 *parentp = NULL;
1717 return (err);
1718 }
1719 *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
1720 (blkid & ((1ULL << epbs) - 1));
1721 return (0);
1722 } else {
1723 /* the block is referenced from the dnode */
1724 ASSERT3U(level, ==, nlevels-1);
1725 ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
1726 blkid < dn->dn_phys->dn_nblkptr);
1727 if (dn->dn_dbuf) {
1728 dbuf_add_ref(dn->dn_dbuf, NULL);
1729 *parentp = dn->dn_dbuf;
1730 }
1731 *bpp = &dn->dn_phys->dn_blkptr[blkid];
1732 return (0);
1733 }
1734 }
1735
1736 static dmu_buf_impl_t *
1737 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
1738 dmu_buf_impl_t *parent, blkptr_t *blkptr)
1739 {
1740 objset_t *os = dn->dn_objset;
1741 dmu_buf_impl_t *db, *odb;
1742
1743 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1744 ASSERT(dn->dn_type != DMU_OT_NONE);
1745
1746 db = kmem_cache_alloc(dbuf_cache, KM_PUSHPAGE);
1747
1748 db->db_objset = os;
1749 db->db.db_object = dn->dn_object;
1750 db->db_level = level;
1751 db->db_blkid = blkid;
1752 db->db_last_dirty = NULL;
1753 db->db_dirtycnt = 0;
1754 db->db_dnode_handle = dn->dn_handle;
1755 db->db_parent = parent;
1756 db->db_blkptr = blkptr;
1757
1758 db->db_user_ptr = NULL;
1759 db->db_user_data_ptr_ptr = NULL;
1760 db->db_evict_func = NULL;
1761 db->db_immediate_evict = 0;
1762 db->db_freed_in_flight = 0;
1763
1764 if (blkid == DMU_BONUS_BLKID) {
1765 ASSERT3P(parent, ==, dn->dn_dbuf);
1766 db->db.db_size = DN_MAX_BONUSLEN -
1767 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
1768 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
1769 db->db.db_offset = DMU_BONUS_BLKID;
1770 db->db_state = DB_UNCACHED;
1771 /* the bonus dbuf is not placed in the hash table */
1772 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1773 return (db);
1774 } else if (blkid == DMU_SPILL_BLKID) {
1775 db->db.db_size = (blkptr != NULL) ?
1776 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
1777 db->db.db_offset = 0;
1778 } else {
1779 int blocksize =
1780 db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
1781 db->db.db_size = blocksize;
1782 db->db.db_offset = db->db_blkid * blocksize;
1783 }
1784
1785 /*
1786 * Hold the dn_dbufs_mtx while we get the new dbuf
1787 * in the hash table *and* added to the dbufs list.
1788 * This prevents a possible deadlock with someone
1789 * trying to look up this dbuf before its added to the
1790 * dn_dbufs list.
1791 */
1792 mutex_enter(&dn->dn_dbufs_mtx);
1793 db->db_state = DB_EVICTING;
1794 if ((odb = dbuf_hash_insert(db)) != NULL) {
1795 /* someone else inserted it first */
1796 kmem_cache_free(dbuf_cache, db);
1797 mutex_exit(&dn->dn_dbufs_mtx);
1798 return (odb);
1799 }
1800 list_insert_head(&dn->dn_dbufs, db);
1801 if (db->db_level == 0 && db->db_blkid >=
1802 dn->dn_unlisted_l0_blkid)
1803 dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
1804 db->db_state = DB_UNCACHED;
1805 mutex_exit(&dn->dn_dbufs_mtx);
1806 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1807
1808 if (parent && parent != dn->dn_dbuf)
1809 dbuf_add_ref(parent, db);
1810
1811 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1812 refcount_count(&dn->dn_holds) > 0);
1813 (void) refcount_add(&dn->dn_holds, db);
1814 (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
1815
1816 dprintf_dbuf(db, "db=%p\n", db);
1817
1818 return (db);
1819 }
1820
1821 static int
1822 dbuf_do_evict(void *private)
1823 {
1824 arc_buf_t *buf = private;
1825 dmu_buf_impl_t *db = buf->b_private;
1826
1827 if (!MUTEX_HELD(&db->db_mtx))
1828 mutex_enter(&db->db_mtx);
1829
1830 ASSERT(refcount_is_zero(&db->db_holds));
1831
1832 if (db->db_state != DB_EVICTING) {
1833 ASSERT(db->db_state == DB_CACHED);
1834 DBUF_VERIFY(db);
1835 db->db_buf = NULL;
1836 dbuf_evict(db);
1837 } else {
1838 mutex_exit(&db->db_mtx);
1839 dbuf_destroy(db);
1840 }
1841 return (0);
1842 }
1843
1844 static void
1845 dbuf_destroy(dmu_buf_impl_t *db)
1846 {
1847 ASSERT(refcount_is_zero(&db->db_holds));
1848
1849 if (db->db_blkid != DMU_BONUS_BLKID) {
1850 /*
1851 * If this dbuf is still on the dn_dbufs list,
1852 * remove it from that list.
1853 */
1854 if (db->db_dnode_handle != NULL) {
1855 dnode_t *dn;
1856
1857 DB_DNODE_ENTER(db);
1858 dn = DB_DNODE(db);
1859 mutex_enter(&dn->dn_dbufs_mtx);
1860 list_remove(&dn->dn_dbufs, db);
1861 (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
1862 mutex_exit(&dn->dn_dbufs_mtx);
1863 DB_DNODE_EXIT(db);
1864 /*
1865 * Decrementing the dbuf count means that the hold
1866 * corresponding to the removed dbuf is no longer
1867 * discounted in dnode_move(), so the dnode cannot be
1868 * moved until after we release the hold.
1869 */
1870 dnode_rele(dn, db);
1871 db->db_dnode_handle = NULL;
1872 }
1873 dbuf_hash_remove(db);
1874 }
1875 db->db_parent = NULL;
1876 db->db_buf = NULL;
1877
1878 ASSERT(!list_link_active(&db->db_link));
1879 ASSERT(db->db.db_data == NULL);
1880 ASSERT(db->db_hash_next == NULL);
1881 ASSERT(db->db_blkptr == NULL);
1882 ASSERT(db->db_data_pending == NULL);
1883
1884 kmem_cache_free(dbuf_cache, db);
1885 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
1886 }
1887
1888 void
1889 dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
1890 {
1891 dmu_buf_impl_t *db = NULL;
1892 blkptr_t *bp = NULL;
1893
1894 ASSERT(blkid != DMU_BONUS_BLKID);
1895 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
1896
1897 if (dnode_block_freed(dn, blkid))
1898 return;
1899
1900 /* dbuf_find() returns with db_mtx held */
1901 if ((db = dbuf_find(dn, 0, blkid))) {
1902 /*
1903 * This dbuf is already in the cache. We assume that
1904 * it is already CACHED, or else about to be either
1905 * read or filled.
1906 */
1907 mutex_exit(&db->db_mtx);
1908 return;
1909 }
1910
1911 if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) {
1912 if (bp && !BP_IS_HOLE(bp)) {
1913 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
1914 uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
1915 zbookmark_t zb;
1916
1917 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
1918 dn->dn_object, 0, blkid);
1919
1920 (void) arc_read(NULL, dn->dn_objset->os_spa,
1921 bp, NULL, NULL, prio,
1922 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1923 &aflags, &zb);
1924 }
1925 if (db)
1926 dbuf_rele(db, NULL);
1927 }
1928 }
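/*
 * Illustrative use of dbuf_prefetch() (a sketch, not a caller in this file):
 * issue speculative reads for a run of level-0 blocks while holding the
 * dnode's struct lock, as the asserts above require.  "blkid", "nblks" and
 * the priority choice are hypothetical caller-supplied values.
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	for (i = 0; i < nblks; i++)
 *		dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ);
 *	rw_exit(&dn->dn_struct_rwlock);
 */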
1929
1930 #define DBUF_HOLD_IMPL_MAX_DEPTH 20
1931
1932 /*
1933 * Returns with db_holds incremented, and db_mtx not held.
1934 * Note: dn_struct_rwlock must be held.
1935 */
1936 static int
1937 __dbuf_hold_impl(struct dbuf_hold_impl_data *dh)
1938 {
1939 ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH);
1940 dh->dh_parent = NULL;
1941
1942 ASSERT(dh->dh_blkid != DMU_BONUS_BLKID);
1943 ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock));
1944 ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level);
1945
1946 *(dh->dh_dbp) = NULL;
1947 top:
1948 /* dbuf_find() returns with db_mtx held */
1949 dh->dh_db = dbuf_find(dh->dh_dn, dh->dh_level, dh->dh_blkid);
1950
1951 if (dh->dh_db == NULL) {
1952 dh->dh_bp = NULL;
1953
1954 ASSERT3P(dh->dh_parent, ==, NULL);
1955 dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid,
1956 dh->dh_fail_sparse, &dh->dh_parent,
1957 &dh->dh_bp, dh);
1958 if (dh->dh_fail_sparse) {
1959 if (dh->dh_err == 0 && dh->dh_bp && BP_IS_HOLE(dh->dh_bp))
1960 dh->dh_err = SET_ERROR(ENOENT);
1961 if (dh->dh_err) {
1962 if (dh->dh_parent)
1963 dbuf_rele(dh->dh_parent, NULL);
1964 return (dh->dh_err);
1965 }
1966 }
1967 if (dh->dh_err && dh->dh_err != ENOENT)
1968 return (dh->dh_err);
1969 dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid,
1970 dh->dh_parent, dh->dh_bp);
1971 }
1972
1973 if (dh->dh_db->db_buf && refcount_is_zero(&dh->dh_db->db_holds)) {
1974 arc_buf_add_ref(dh->dh_db->db_buf, dh->dh_db);
1975 if (dh->dh_db->db_buf->b_data == NULL) {
1976 dbuf_clear(dh->dh_db);
1977 if (dh->dh_parent) {
1978 dbuf_rele(dh->dh_parent, NULL);
1979 dh->dh_parent = NULL;
1980 }
1981 goto top;
1982 }
1983 ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data);
1984 }
1985
1986 ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf));
1987
1988 /*
1989 	 * If this buffer is currently syncing out, and we are
1990 * still referencing it from db_data, we need to make a copy
1991 * of it in case we decide we want to dirty it again in this txg.
1992 */
1993 if (dh->dh_db->db_level == 0 &&
1994 dh->dh_db->db_blkid != DMU_BONUS_BLKID &&
1995 dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT &&
1996 dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) {
1997 dh->dh_dr = dh->dh_db->db_data_pending;
1998
1999 if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) {
2000 dh->dh_type = DBUF_GET_BUFC_TYPE(dh->dh_db);
2001
2002 dbuf_set_data(dh->dh_db,
2003 arc_buf_alloc(dh->dh_dn->dn_objset->os_spa,
2004 dh->dh_db->db.db_size, dh->dh_db, dh->dh_type));
2005 bcopy(dh->dh_dr->dt.dl.dr_data->b_data,
2006 dh->dh_db->db.db_data, dh->dh_db->db.db_size);
2007 }
2008 }
2009
2010 (void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag);
2011 dbuf_update_data(dh->dh_db);
2012 DBUF_VERIFY(dh->dh_db);
2013 mutex_exit(&dh->dh_db->db_mtx);
2014
2015 /* NOTE: we can't rele the parent until after we drop the db_mtx */
2016 if (dh->dh_parent)
2017 dbuf_rele(dh->dh_parent, NULL);
2018
2019 ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn);
2020 ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid);
2021 ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level);
2022 *(dh->dh_dbp) = dh->dh_db;
2023
2024 return (0);
2025 }
2026
2027 /*
2028 * The following code preserves the recursive function dbuf_hold_impl()
2029 * but moves the local variables AND function arguments to the heap to
2030 	 * minimize the stack frame size. Enough space is allocated up front
2031 	 * (from the heap) for 20 levels of recursion.
2032 */
2033 int
2034 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
2035 void *tag, dmu_buf_impl_t **dbp)
2036 {
2037 struct dbuf_hold_impl_data *dh;
2038 int error;
2039
2040 dh = kmem_zalloc(sizeof(struct dbuf_hold_impl_data) *
2041 DBUF_HOLD_IMPL_MAX_DEPTH, KM_PUSHPAGE);
2042 __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, tag, dbp, 0);
2043
2044 error = __dbuf_hold_impl(dh);
2045
2046 kmem_free(dh, sizeof(struct dbuf_hold_impl_data) *
2047 DBUF_HOLD_IMPL_MAX_DEPTH);
2048
2049 return (error);
2050 }
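/*
 * Rough arithmetic behind the heap allocation above (an estimate, not
 * something the code depends on): on a 64-bit build each
 * dbuf_hold_impl_data element is on the order of 80-90 bytes once padded,
 * so the 20-element array is roughly 1.6-1.8 KB of locals and arguments
 * that would otherwise be spread across 20 recursive kernel stack frames.
 */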
2051
2052 static void
2053 __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
2054 dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
2055 void *tag, dmu_buf_impl_t **dbp, int depth)
2056 {
2057 dh->dh_dn = dn;
2058 dh->dh_level = level;
2059 dh->dh_blkid = blkid;
2060 dh->dh_fail_sparse = fail_sparse;
2061 dh->dh_tag = tag;
2062 dh->dh_dbp = dbp;
2063 dh->dh_depth = depth;
2064 }
2065
2066 dmu_buf_impl_t *
2067 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
2068 {
2069 dmu_buf_impl_t *db;
2070 int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
2071 return (err ? NULL : db);
2072 }
2073
2074 dmu_buf_impl_t *
2075 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
2076 {
2077 dmu_buf_impl_t *db;
2078 int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
2079 return (err ? NULL : db);
2080 }
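/*
 * Typical hold/release pairing for the wrappers above (a sketch; the caller,
 * the FTAG usage and the error handling are illustrative only).
 * dn_struct_rwlock must be held across the hold, per the note on
 * __dbuf_hold_impl().
 *
 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 *	db = dbuf_hold(dn, blkid, FTAG);
 *	rw_exit(&dn->dn_struct_rwlock);
 *	if (db != NULL) {
 *		... use db->db.db_data ...
 *		dbuf_rele(db, FTAG);
 *	}
 */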
2081
2082 void
2083 dbuf_create_bonus(dnode_t *dn)
2084 {
2085 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
2086
2087 ASSERT(dn->dn_bonus == NULL);
2088 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
2089 }
2090
2091 int
2092 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
2093 {
2094 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2095 dnode_t *dn;
2096
2097 if (db->db_blkid != DMU_SPILL_BLKID)
2098 return (SET_ERROR(ENOTSUP));
2099 if (blksz == 0)
2100 blksz = SPA_MINBLOCKSIZE;
2101 if (blksz > SPA_MAXBLOCKSIZE)
2102 blksz = SPA_MAXBLOCKSIZE;
2103 else
2104 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
2105
2106 DB_DNODE_ENTER(db);
2107 dn = DB_DNODE(db);
2108 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
2109 dbuf_new_size(db, blksz, tx);
2110 rw_exit(&dn->dn_struct_rwlock);
2111 DB_DNODE_EXIT(db);
2112
2113 return (0);
2114 }
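/*
 * Worked examples of the clamping above (illustrative, assuming the
 * historical SPA_MINBLOCKSIZE of 512 and SPA_MAXBLOCKSIZE of 128K):
 * blksz == 0 becomes 512; blksz == 3000 rounds up to 3072; anything over
 * 128K is clamped to 128K.
 */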
2115
2116 void
2117 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
2118 {
2119 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
2120 }
2121
2122 #pragma weak dmu_buf_add_ref = dbuf_add_ref
2123 void
2124 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
2125 {
2126 VERIFY(refcount_add(&db->db_holds, tag) > 1);
2127 }
2128
2129 /*
2130 * If you call dbuf_rele() you had better not be referencing the dnode handle
2131 * unless you have some other direct or indirect hold on the dnode. (An indirect
2132 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
2133 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
2134 * dnode's parent dbuf evicting its dnode handles.
2135 */
2136 #pragma weak dmu_buf_rele = dbuf_rele
2137 void
2138 dbuf_rele(dmu_buf_impl_t *db, void *tag)
2139 {
2140 mutex_enter(&db->db_mtx);
2141 dbuf_rele_and_unlock(db, tag);
2142 }
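/*
 * Illustrative sketch of the rule described above dbuf_rele() (hypothetical
 * caller code, not taken from this file): if the dnode is still needed
 * around the release, keep an independent reference on it.
 *
 *	DB_DNODE_ENTER(db);
 *	dn = DB_DNODE(db);
 *	... use dn ...
 *	DB_DNODE_EXIT(db);
 *	dbuf_rele(db, FTAG);	// safe: dn is no longer referenced
 *
 * Touching dn after the dbuf_rele() without some other direct or indirect
 * hold risks the dnode's parent dbuf evicting its dnode handles, as noted
 * above.
 */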
2143
2144 /*
2145 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
2146 * db_dirtycnt and db_holds to be updated atomically.
2147 */
2148 void
2149 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
2150 {
2151 int64_t holds;
2152
2153 ASSERT(MUTEX_HELD(&db->db_mtx));
2154 DBUF_VERIFY(db);
2155
2156 /*
2157 * Remove the reference to the dbuf before removing its hold on the
2158 * dnode so we can guarantee in dnode_move() that a referenced bonus
2159 * buffer has a corresponding dnode hold.
2160 */
2161 holds = refcount_remove(&db->db_holds, tag);
2162 ASSERT(holds >= 0);
2163
2164 /*
2165 * We can't freeze indirects if there is a possibility that they
2166 * may be modified in the current syncing context.
2167 */
2168 if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
2169 arc_buf_freeze(db->db_buf);
2170
2171 if (holds == db->db_dirtycnt &&
2172 db->db_level == 0 && db->db_immediate_evict)
2173 dbuf_evict_user(db);
2174
2175 if (holds == 0) {
2176 if (db->db_blkid == DMU_BONUS_BLKID) {
2177 mutex_exit(&db->db_mtx);
2178
2179 /*
2180 * If the dnode moves here, we cannot cross this barrier
2181 * until the move completes.
2182 */
2183 DB_DNODE_ENTER(db);
2184 (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
2185 DB_DNODE_EXIT(db);
2186 /*
2187 * The bonus buffer's dnode hold is no longer discounted
2188 * in dnode_move(). The dnode cannot move until after
2189 * the dnode_rele().
2190 */
2191 dnode_rele(DB_DNODE(db), db);
2192 } else if (db->db_buf == NULL) {
2193 /*
2194 * This is a special case: we never associated this
2195 * dbuf with any data allocated from the ARC.
2196 */
2197 ASSERT(db->db_state == DB_UNCACHED ||
2198 db->db_state == DB_NOFILL);
2199 dbuf_evict(db);
2200 } else if (arc_released(db->db_buf)) {
2201 arc_buf_t *buf = db->db_buf;
2202 /*
2203 * This dbuf has anonymous data associated with it.
2204 */
2205 dbuf_set_data(db, NULL);
2206 VERIFY(arc_buf_remove_ref(buf, db));
2207 dbuf_evict(db);
2208 } else {
2209 VERIFY(!arc_buf_remove_ref(db->db_buf, db));
2210
2211 /*
2212 * A dbuf will be eligible for eviction if either the
2213 			 * 'primarycache' property excludes it or a duplicate
2214 * copy of this buffer is already cached in the arc.
2215 *
2216 			 * In the case of the 'primarycache' property, a buffer
2217 * is considered for eviction if it matches the
2218 * criteria set in the property.
2219 *
2220 * To decide if our buffer is considered a
2221 * duplicate, we must call into the arc to determine
2222 * if multiple buffers are referencing the same
2223 * block on-disk. If so, then we simply evict
2224 * ourselves.
2225 */
2226 if (!DBUF_IS_CACHEABLE(db) ||
2227 arc_buf_eviction_needed(db->db_buf))
2228 dbuf_clear(db);
2229 else
2230 mutex_exit(&db->db_mtx);
2231 }
2232 } else {
2233 mutex_exit(&db->db_mtx);
2234 }
2235 }
2236
2237 #pragma weak dmu_buf_refcount = dbuf_refcount
2238 uint64_t
2239 dbuf_refcount(dmu_buf_impl_t *db)
2240 {
2241 return (refcount_count(&db->db_holds));
2242 }
2243
2244 void *
2245 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2246 dmu_buf_evict_func_t *evict_func)
2247 {
2248 return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2249 user_data_ptr_ptr, evict_func));
2250 }
2251
2252 void *
2253 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
2254 dmu_buf_evict_func_t *evict_func)
2255 {
2256 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2257
2258 db->db_immediate_evict = TRUE;
2259 return (dmu_buf_update_user(db_fake, NULL, user_ptr,
2260 user_data_ptr_ptr, evict_func));
2261 }
2262
2263 void *
2264 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
2265 void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
2266 {
2267 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2268 ASSERT(db->db_level == 0);
2269
2270 ASSERT((user_ptr == NULL) == (evict_func == NULL));
2271
2272 mutex_enter(&db->db_mtx);
2273
2274 if (db->db_user_ptr == old_user_ptr) {
2275 db->db_user_ptr = user_ptr;
2276 db->db_user_data_ptr_ptr = user_data_ptr_ptr;
2277 db->db_evict_func = evict_func;
2278
2279 dbuf_update_data(db);
2280 } else {
2281 old_user_ptr = db->db_user_ptr;
2282 }
2283
2284 mutex_exit(&db->db_mtx);
2285 return (old_user_ptr);
2286 }
2287
2288 void *
2289 dmu_buf_get_user(dmu_buf_t *db_fake)
2290 {
2291 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2292 ASSERT(!refcount_is_zero(&db->db_holds));
2293
2294 return (db->db_user_ptr);
2295 }
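/*
 * Sketch of how a consumer might attach per-dbuf state with the interfaces
 * above (my_state_t, state and my_evict_cb are hypothetical names).  A NULL
 * return from dmu_buf_set_user() means our state was installed; a non-NULL
 * return is the state some other thread installed first.
 *
 *	static void
 *	my_evict_cb(dmu_buf_t *db, void *user_ptr)
 *	{
 *		kmem_free(user_ptr, sizeof (my_state_t));
 *	}
 *
 *	winner = dmu_buf_set_user(db, state, NULL, my_evict_cb);
 *	if (winner != NULL) {
 *		kmem_free(state, sizeof (my_state_t));
 *		state = winner;		// lost the race; reuse existing state
 *	}
 */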
2296
2297 boolean_t
2298 dmu_buf_freeable(dmu_buf_t *dbuf)
2299 {
2300 boolean_t res = B_FALSE;
2301 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2302
2303 if (db->db_blkptr)
2304 res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset,
2305 db->db_blkptr, db->db_blkptr->blk_birth);
2306
2307 return (res);
2308 }
2309
2310 blkptr_t *
2311 dmu_buf_get_blkptr(dmu_buf_t *db)
2312 {
2313 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
2314 return (dbi->db_blkptr);
2315 }
2316
2317 static void
2318 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
2319 {
2320 	/* ASSERT(dmu_tx_is_syncing(tx)) */
2321 ASSERT(MUTEX_HELD(&db->db_mtx));
2322
2323 if (db->db_blkptr != NULL)
2324 return;
2325
2326 if (db->db_blkid == DMU_SPILL_BLKID) {
2327 db->db_blkptr = &dn->dn_phys->dn_spill;
2328 BP_ZERO(db->db_blkptr);
2329 return;
2330 }
2331 if (db->db_level == dn->dn_phys->dn_nlevels-1) {
2332 		 * This buffer was allocated at a time when there were
2333 		 * no available blkptrs from the dnode, or it was
2334 		 * inappropriate to hook it in (i.e., nlevels mismatch).
2335 * inappropriate to hook it in (i.e., nlevels mis-match).
2336 */
2337 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
2338 ASSERT(db->db_parent == NULL);
2339 db->db_parent = dn->dn_dbuf;
2340 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
2341 DBUF_VERIFY(db);
2342 } else {
2343 dmu_buf_impl_t *parent = db->db_parent;
2344 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
2345
2346 ASSERT(dn->dn_phys->dn_nlevels > 1);
2347 if (parent == NULL) {
2348 mutex_exit(&db->db_mtx);
2349 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2350 (void) dbuf_hold_impl(dn, db->db_level+1,
2351 db->db_blkid >> epbs, FALSE, db, &parent);
2352 rw_exit(&dn->dn_struct_rwlock);
2353 mutex_enter(&db->db_mtx);
2354 db->db_parent = parent;
2355 }
2356 db->db_blkptr = (blkptr_t *)parent->db.db_data +
2357 (db->db_blkid & ((1ULL << epbs) - 1));
2358 DBUF_VERIFY(db);
2359 }
2360 }
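/*
 * Worked example of the epbs arithmetic above (illustrative values): with
 * dn_indblkshift == 14 (16K indirect blocks) and SPA_BLKPTRSHIFT == 7,
 * epbs == 7, so each indirect block holds 128 block pointers.  A level-0
 * dbuf with db_blkid == 300 then lives under parent blkid 300 >> 7 == 2,
 * at slot 300 & 127 == 44 within that parent's db_data.
 */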
2361
2362 /* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
2363  * is critical that we not allow the compiler to inline this function into
2364 * dbuf_sync_list() thereby drastically bloating the stack usage.
2365 */
2366 noinline static void
2367 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2368 {
2369 dmu_buf_impl_t *db = dr->dr_dbuf;
2370 dnode_t *dn;
2371 zio_t *zio;
2372
2373 ASSERT(dmu_tx_is_syncing(tx));
2374
2375 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2376
2377 mutex_enter(&db->db_mtx);
2378
2379 ASSERT(db->db_level > 0);
2380 DBUF_VERIFY(db);
2381
2382 /* Read the block if it hasn't been read yet. */
2383 if (db->db_buf == NULL) {
2384 mutex_exit(&db->db_mtx);
2385 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
2386 mutex_enter(&db->db_mtx);
2387 }
2388 ASSERT3U(db->db_state, ==, DB_CACHED);
2389 ASSERT(db->db_buf != NULL);
2390
2391 DB_DNODE_ENTER(db);
2392 dn = DB_DNODE(db);
2393 /* Indirect block size must match what the dnode thinks it is. */
2394 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2395 dbuf_check_blkptr(dn, db);
2396 DB_DNODE_EXIT(db);
2397
2398 /* Provide the pending dirty record to child dbufs */
2399 db->db_data_pending = dr;
2400
2401 mutex_exit(&db->db_mtx);
2402 dbuf_write(dr, db->db_buf, tx);
2403
2404 zio = dr->dr_zio;
2405 mutex_enter(&dr->dt.di.dr_mtx);
2406 dbuf_sync_list(&dr->dt.di.dr_children, tx);
2407 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2408 mutex_exit(&dr->dt.di.dr_mtx);
2409 zio_nowait(zio);
2410 }
2411
2412 /* dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
2413  * critical that we not allow the compiler to inline this function into
2414 * dbuf_sync_list() thereby drastically bloating the stack usage.
2415 */
2416 noinline static void
2417 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
2418 {
2419 arc_buf_t **datap = &dr->dt.dl.dr_data;
2420 dmu_buf_impl_t *db = dr->dr_dbuf;
2421 dnode_t *dn;
2422 objset_t *os;
2423 uint64_t txg = tx->tx_txg;
2424
2425 ASSERT(dmu_tx_is_syncing(tx));
2426
2427 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
2428
2429 mutex_enter(&db->db_mtx);
2430 /*
2431 * To be synced, we must be dirtied. But we
2432 * might have been freed after the dirty.
2433 */
2434 if (db->db_state == DB_UNCACHED) {
2435 /* This buffer has been freed since it was dirtied */
2436 ASSERT(db->db.db_data == NULL);
2437 } else if (db->db_state == DB_FILL) {
2438 /* This buffer was freed and is now being re-filled */
2439 ASSERT(db->db.db_data != dr->dt.dl.dr_data);
2440 } else {
2441 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
2442 }
2443 DBUF_VERIFY(db);
2444
2445 DB_DNODE_ENTER(db);
2446 dn = DB_DNODE(db);
2447
2448 if (db->db_blkid == DMU_SPILL_BLKID) {
2449 mutex_enter(&dn->dn_mtx);
2450 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
2451 mutex_exit(&dn->dn_mtx);
2452 }
2453
2454 /*
2455 * If this is a bonus buffer, simply copy the bonus data into the
2456 * dnode. It will be written out when the dnode is synced (and it
2457 * will be synced, since it must have been dirty for dbuf_sync to
2458 * be called).
2459 */
2460 if (db->db_blkid == DMU_BONUS_BLKID) {
2461 dbuf_dirty_record_t **drp;
2462
2463 ASSERT(*datap != NULL);
2464 ASSERT0(db->db_level);
2465 ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
2466 bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
2467 DB_DNODE_EXIT(db);
2468
2469 if (*datap != db->db.db_data) {
2470 zio_buf_free(*datap, DN_MAX_BONUSLEN);
2471 arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
2472 }
2473 db->db_data_pending = NULL;
2474 drp = &db->db_last_dirty;
2475 while (*drp != dr)
2476 drp = &(*drp)->dr_next;
2477 ASSERT(dr->dr_next == NULL);
2478 ASSERT(dr->dr_dbuf == db);
2479 *drp = dr->dr_next;
2480 if (dr->dr_dbuf->db_level != 0) {
2481 mutex_destroy(&dr->dt.di.dr_mtx);
2482 list_destroy(&dr->dt.di.dr_children);
2483 }
2484 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2485 ASSERT(db->db_dirtycnt > 0);
2486 db->db_dirtycnt -= 1;
2487 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2488 return;
2489 }
2490
2491 os = dn->dn_objset;
2492
2493 /*
2494 * This function may have dropped the db_mtx lock allowing a dmu_sync
2495 * operation to sneak in. As a result, we need to ensure that we
2496 * don't check the dr_override_state until we have returned from
2497 * dbuf_check_blkptr.
2498 */
2499 dbuf_check_blkptr(dn, db);
2500
2501 /*
2502 * If this buffer is in the middle of an immediate write,
2503 * wait for the synchronous IO to complete.
2504 */
2505 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
2506 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
2507 cv_wait(&db->db_changed, &db->db_mtx);
2508 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
2509 }
2510
2511 if (db->db_state != DB_NOFILL &&
2512 dn->dn_object != DMU_META_DNODE_OBJECT &&
2513 refcount_count(&db->db_holds) > 1 &&
2514 dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
2515 *datap == db->db_buf) {
2516 /*
2517 * If this buffer is currently "in use" (i.e., there
2518 * are active holds and db_data still references it),
2519 * then make a copy before we start the write so that
2520 * any modifications from the open txg will not leak
2521 * into this write.
2522 *
2523 * NOTE: this copy does not need to be made for
2524 * objects only modified in the syncing context (e.g.
2525 		 * DNODE blocks).
2526 */
2527 int blksz = arc_buf_size(*datap);
2528 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
2529 *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
2530 bcopy(db->db.db_data, (*datap)->b_data, blksz);
2531 }
2532 db->db_data_pending = dr;
2533
2534 mutex_exit(&db->db_mtx);
2535
2536 dbuf_write(dr, *datap, tx);
2537
2538 ASSERT(!list_link_active(&dr->dr_dirty_node));
2539 if (dn->dn_object == DMU_META_DNODE_OBJECT) {
2540 list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
2541 DB_DNODE_EXIT(db);
2542 } else {
2543 /*
2544 * Although zio_nowait() does not "wait for an IO", it does
2545 * initiate the IO. If this is an empty write it seems plausible
2546 * that the IO could actually be completed before the nowait
2547 * returns. We need to DB_DNODE_EXIT() first in case
2548 * zio_nowait() invalidates the dbuf.
2549 */
2550 DB_DNODE_EXIT(db);
2551 zio_nowait(dr->dr_zio);
2552 }
2553 }
2554
2555 void
2556 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2557 {
2558 dbuf_dirty_record_t *dr;
2559
2560 while ((dr = list_head(list))) {
2561 if (dr->dr_zio != NULL) {
2562 /*
2563 * If we find an already initialized zio then we
2564 * are processing the meta-dnode, and we have finished.
2565 * The dbufs for all dnodes are put back on the list
2566 * during processing, so that we can zio_wait()
2567 * these IOs after initiating all child IOs.
2568 */
2569 ASSERT3U(dr->dr_dbuf->db.db_object, ==,
2570 DMU_META_DNODE_OBJECT);
2571 break;
2572 }
2573 list_remove(list, dr);
2574 if (dr->dr_dbuf->db_level > 0)
2575 dbuf_sync_indirect(dr, tx);
2576 else
2577 dbuf_sync_leaf(dr, tx);
2578 }
2579 }
2580
2581 /* ARGSUSED */
2582 static void
2583 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
2584 {
2585 dmu_buf_impl_t *db = vdb;
2586 dnode_t *dn;
2587 blkptr_t *bp = zio->io_bp;
2588 blkptr_t *bp_orig = &zio->io_bp_orig;
2589 spa_t *spa = zio->io_spa;
2590 int64_t delta;
2591 uint64_t fill = 0;
2592 int i;
2593
2594 ASSERT(db->db_blkptr == bp);
2595
2596 DB_DNODE_ENTER(db);
2597 dn = DB_DNODE(db);
2598 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
2599 dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
2600 zio->io_prev_space_delta = delta;
2601
2602 if (BP_IS_HOLE(bp)) {
2603 ASSERT(bp->blk_fill == 0);
2604 DB_DNODE_EXIT(db);
2605 return;
2606 }
2607
2608 ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
2609 BP_GET_TYPE(bp) == dn->dn_type) ||
2610 (db->db_blkid == DMU_SPILL_BLKID &&
2611 BP_GET_TYPE(bp) == dn->dn_bonustype));
2612 ASSERT(BP_GET_LEVEL(bp) == db->db_level);
2613
2614 mutex_enter(&db->db_mtx);
2615
2616 #ifdef ZFS_DEBUG
2617 if (db->db_blkid == DMU_SPILL_BLKID) {
2618 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2619 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2620 db->db_blkptr == &dn->dn_phys->dn_spill);
2621 }
2622 #endif
2623
2624 if (db->db_level == 0) {
2625 mutex_enter(&dn->dn_mtx);
2626 if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
2627 db->db_blkid != DMU_SPILL_BLKID)
2628 dn->dn_phys->dn_maxblkid = db->db_blkid;
2629 mutex_exit(&dn->dn_mtx);
2630
2631 if (dn->dn_type == DMU_OT_DNODE) {
2632 dnode_phys_t *dnp = db->db.db_data;
2633 for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
2634 i--, dnp++) {
2635 if (dnp->dn_type != DMU_OT_NONE)
2636 fill++;
2637 }
2638 } else {
2639 fill = 1;
2640 }
2641 } else {
2642 blkptr_t *ibp = db->db.db_data;
2643 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2644 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
2645 if (BP_IS_HOLE(ibp))
2646 continue;
2647 fill += ibp->blk_fill;
2648 }
2649 }
2650 DB_DNODE_EXIT(db);
2651
2652 bp->blk_fill = fill;
2653
2654 mutex_exit(&db->db_mtx);
2655 }
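/*
 * Worked example of the fill counts computed above (illustrative sizes):
 * for a 128K block of dnodes, db_size >> DNODE_SHIFT (9) gives 256 slots,
 * and fill counts those whose dn_type is not DMU_OT_NONE; for a 128K
 * indirect block, db_size >> SPA_BLKPTRSHIFT (7) gives 1024 child block
 * pointers, and fill is the sum of blk_fill over the non-hole children.
 */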
2656
2657 /*
2658 * The SPA will call this callback several times for each zio - once
2659 * for every physical child i/o (zio->io_phys_children times). This
2660 * allows the DMU to monitor the progress of each logical i/o. For example,
2661 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
2662 * block. There may be a long delay before all copies/fragments are completed,
2663 * so this callback allows us to retire dirty space gradually, as the physical
2664 * i/os complete.
2665 */
2666 /* ARGSUSED */
2667 static void
2668 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
2669 {
2670 dmu_buf_impl_t *db = arg;
2671 objset_t *os = db->db_objset;
2672 dsl_pool_t *dp = dmu_objset_pool(os);
2673 dbuf_dirty_record_t *dr;
2674 int delta = 0;
2675
2676 dr = db->db_data_pending;
2677 ASSERT3U(dr->dr_txg, ==, zio->io_txg);
2678
2679 /*
2680 * The callback will be called io_phys_children times. Retire one
2681 * portion of our dirty space each time we are called. Any rounding
2682 * error will be cleaned up by dsl_pool_sync()'s call to
2683 * dsl_pool_undirty_space().
2684 */
2685 delta = dr->dr_accounted / zio->io_phys_children;
2686 dsl_pool_undirty_space(dp, delta, zio->io_txg);
2687 }
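/*
 * Worked example of the per-child accounting above (illustrative numbers):
 * with dr_accounted == 131072 and io_phys_children == 3, each of the three
 * callbacks retires 43690 bytes; the 2-byte remainder is the rounding error
 * that dsl_pool_sync()'s dsl_pool_undirty_space() call cleans up.
 */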
2688
2689 /* ARGSUSED */
2690 static void
2691 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
2692 {
2693 dmu_buf_impl_t *db = vdb;
2694 blkptr_t *bp = zio->io_bp;
2695 blkptr_t *bp_orig = &zio->io_bp_orig;
2696 uint64_t txg = zio->io_txg;
2697 dbuf_dirty_record_t **drp, *dr;
2698
2699 ASSERT0(zio->io_error);
2700 ASSERT(db->db_blkptr == bp);
2701
2702 /*
2703 * For nopwrites and rewrites we ensure that the bp matches our
2704 * original and bypass all the accounting.
2705 */
2706 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
2707 ASSERT(BP_EQUAL(bp, bp_orig));
2708 } else {
2709 objset_t *os;
2710 dsl_dataset_t *ds;
2711 dmu_tx_t *tx;
2712
2713 DB_GET_OBJSET(&os, db);
2714 ds = os->os_dsl_dataset;
2715 tx = os->os_synctx;
2716
2717 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
2718 dsl_dataset_block_born(ds, bp, tx);
2719 }
2720
2721 mutex_enter(&db->db_mtx);
2722
2723 DBUF_VERIFY(db);
2724
2725 drp = &db->db_last_dirty;
2726 while ((dr = *drp) != db->db_data_pending)
2727 drp = &dr->dr_next;
2728 ASSERT(!list_link_active(&dr->dr_dirty_node));
2729 ASSERT(dr->dr_txg == txg);
2730 ASSERT(dr->dr_dbuf == db);
2731 ASSERT(dr->dr_next == NULL);
2732 *drp = dr->dr_next;
2733
2734 #ifdef ZFS_DEBUG
2735 if (db->db_blkid == DMU_SPILL_BLKID) {
2736 dnode_t *dn;
2737
2738 DB_DNODE_ENTER(db);
2739 dn = DB_DNODE(db);
2740 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
2741 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
2742 db->db_blkptr == &dn->dn_phys->dn_spill);
2743 DB_DNODE_EXIT(db);
2744 }
2745 #endif
2746
2747 if (db->db_level == 0) {
2748 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2749 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
2750 if (db->db_state != DB_NOFILL) {
2751 if (dr->dt.dl.dr_data != db->db_buf)
2752 VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
2753 db));
2754 else if (!arc_released(db->db_buf))
2755 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2756 }
2757 } else {
2758 dnode_t *dn;
2759
2760 DB_DNODE_ENTER(db);
2761 dn = DB_DNODE(db);
2762 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
2763 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
2764 if (!BP_IS_HOLE(db->db_blkptr)) {
2765 ASSERTV(int epbs = dn->dn_phys->dn_indblkshift -
2766 SPA_BLKPTRSHIFT);
2767 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
2768 db->db.db_size);
2769 ASSERT3U(dn->dn_phys->dn_maxblkid
2770 >> (db->db_level * epbs), >=, db->db_blkid);
2771 arc_set_callback(db->db_buf, dbuf_do_evict, db);
2772 }
2773 DB_DNODE_EXIT(db);
2774 mutex_destroy(&dr->dt.di.dr_mtx);
2775 list_destroy(&dr->dt.di.dr_children);
2776 }
2777 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2778
2779 cv_broadcast(&db->db_changed);
2780 ASSERT(db->db_dirtycnt > 0);
2781 db->db_dirtycnt -= 1;
2782 db->db_data_pending = NULL;
2783
2784 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
2785 }
2786
2787 static void
2788 dbuf_write_nofill_ready(zio_t *zio)
2789 {
2790 dbuf_write_ready(zio, NULL, zio->io_private);
2791 }
2792
2793 static void
2794 dbuf_write_nofill_done(zio_t *zio)
2795 {
2796 dbuf_write_done(zio, NULL, zio->io_private);
2797 }
2798
2799 static void
2800 dbuf_write_override_ready(zio_t *zio)
2801 {
2802 dbuf_dirty_record_t *dr = zio->io_private;
2803 dmu_buf_impl_t *db = dr->dr_dbuf;
2804
2805 dbuf_write_ready(zio, NULL, db);
2806 }
2807
2808 static void
2809 dbuf_write_override_done(zio_t *zio)
2810 {
2811 dbuf_dirty_record_t *dr = zio->io_private;
2812 dmu_buf_impl_t *db = dr->dr_dbuf;
2813 blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
2814
2815 mutex_enter(&db->db_mtx);
2816 if (!BP_EQUAL(zio->io_bp, obp)) {
2817 if (!BP_IS_HOLE(obp))
2818 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
2819 arc_release(dr->dt.dl.dr_data, db);
2820 }
2821 mutex_exit(&db->db_mtx);
2822
2823 dbuf_write_done(zio, NULL, db);
2824 }
2825
2826 /* Issue I/O to commit a dirty buffer to disk. */
2827 static void
2828 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
2829 {
2830 dmu_buf_impl_t *db = dr->dr_dbuf;
2831 dnode_t *dn;
2832 objset_t *os;
2833 dmu_buf_impl_t *parent = db->db_parent;
2834 uint64_t txg = tx->tx_txg;
2835 zbookmark_t zb;
2836 zio_prop_t zp;
2837 zio_t *zio;
2838 int wp_flag = 0;
2839
2840 DB_DNODE_ENTER(db);
2841 dn = DB_DNODE(db);
2842 os = dn->dn_objset;
2843
2844 if (db->db_state != DB_NOFILL) {
2845 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
2846 /*
2847 * Private object buffers are released here rather
2848 * than in dbuf_dirty() since they are only modified
2849 * in the syncing context and we don't want the
2850 * overhead of making multiple copies of the data.
2851 */
2852 if (BP_IS_HOLE(db->db_blkptr)) {
2853 arc_buf_thaw(data);
2854 } else {
2855 dbuf_release_bp(db);
2856 }
2857 }
2858 }
2859
2860 if (parent != dn->dn_dbuf) {
2861 /* Our parent is an indirect block. */
2862 /* We have a dirty parent that has been scheduled for write. */
2863 ASSERT(parent && parent->db_data_pending);
2864 /* Our parent's buffer is one level closer to the dnode. */
2865 ASSERT(db->db_level == parent->db_level-1);
2866 /*
2867 * We're about to modify our parent's db_data by modifying
2868 * our block pointer, so the parent must be released.
2869 */
2870 ASSERT(arc_released(parent->db_buf));
2871 zio = parent->db_data_pending->dr_zio;
2872 } else {
2873 /* Our parent is the dnode itself. */
2874 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
2875 db->db_blkid != DMU_SPILL_BLKID) ||
2876 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
2877 if (db->db_blkid != DMU_SPILL_BLKID)
2878 ASSERT3P(db->db_blkptr, ==,
2879 &dn->dn_phys->dn_blkptr[db->db_blkid]);
2880 zio = dn->dn_zio;
2881 }
2882
2883 ASSERT(db->db_level == 0 || data == db->db_buf);
2884 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
2885 ASSERT(zio);
2886
2887 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
2888 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
2889 db->db.db_object, db->db_level, db->db_blkid);
2890
2891 if (db->db_blkid == DMU_SPILL_BLKID)
2892 wp_flag = WP_SPILL;
2893 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
2894
2895 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
2896 DB_DNODE_EXIT(db);
2897
2898 if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
2899 ASSERT(db->db_state != DB_NOFILL);
2900 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2901 db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
2902 dbuf_write_override_ready, NULL, dbuf_write_override_done,
2903 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2904 mutex_enter(&db->db_mtx);
2905 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
2906 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
2907 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
2908 mutex_exit(&db->db_mtx);
2909 } else if (db->db_state == DB_NOFILL) {
2910 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
2911 dr->dr_zio = zio_write(zio, os->os_spa, txg,
2912 db->db_blkptr, NULL, db->db.db_size, &zp,
2913 dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
2914 ZIO_PRIORITY_ASYNC_WRITE,
2915 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
2916 } else {
2917 ASSERT(arc_released(data));
2918 dr->dr_zio = arc_write(zio, os->os_spa, txg,
2919 db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
2920 DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
2921 dbuf_write_physdone, dbuf_write_done, db,
2922 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
2923 }
2924 }
2925
2926 #if defined(_KERNEL) && defined(HAVE_SPL)
2927 EXPORT_SYMBOL(dbuf_find);
2928 EXPORT_SYMBOL(dbuf_is_metadata);
2929 EXPORT_SYMBOL(dbuf_evict);
2930 EXPORT_SYMBOL(dbuf_loan_arcbuf);
2931 EXPORT_SYMBOL(dbuf_whichblock);
2932 EXPORT_SYMBOL(dbuf_read);
2933 EXPORT_SYMBOL(dbuf_unoverride);
2934 EXPORT_SYMBOL(dbuf_free_range);
2935 EXPORT_SYMBOL(dbuf_new_size);
2936 EXPORT_SYMBOL(dbuf_release_bp);
2937 EXPORT_SYMBOL(dbuf_dirty);
2938 EXPORT_SYMBOL(dmu_buf_will_dirty);
2939 EXPORT_SYMBOL(dmu_buf_will_not_fill);
2940 EXPORT_SYMBOL(dmu_buf_will_fill);
2941 EXPORT_SYMBOL(dmu_buf_fill_done);
2942 EXPORT_SYMBOL(dmu_buf_rele);
2943 EXPORT_SYMBOL(dbuf_assign_arcbuf);
2944 EXPORT_SYMBOL(dbuf_clear);
2945 EXPORT_SYMBOL(dbuf_prefetch);
2946 EXPORT_SYMBOL(dbuf_hold_impl);
2947 EXPORT_SYMBOL(dbuf_hold);
2948 EXPORT_SYMBOL(dbuf_hold_level);
2949 EXPORT_SYMBOL(dbuf_create_bonus);
2950 EXPORT_SYMBOL(dbuf_spill_set_blksz);
2951 EXPORT_SYMBOL(dbuf_rm_spill);
2952 EXPORT_SYMBOL(dbuf_add_ref);
2953 EXPORT_SYMBOL(dbuf_rele);
2954 EXPORT_SYMBOL(dbuf_rele_and_unlock);
2955 EXPORT_SYMBOL(dbuf_refcount);
2956 EXPORT_SYMBOL(dbuf_sync_list);
2957 EXPORT_SYMBOL(dmu_buf_set_user);
2958 EXPORT_SYMBOL(dmu_buf_set_user_ie);
2959 EXPORT_SYMBOL(dmu_buf_update_user);
2960 EXPORT_SYMBOL(dmu_buf_get_user);
2961 EXPORT_SYMBOL(dmu_buf_freeable);
2962 #endif