/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/dmu.h>
#include <sys/dmu_send.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dmu_tx.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/blkptr.h>
#include <sys/range_tree.h>
#include <sys/trace_zfs.h>
#include <sys/callb.h>
#include <sys/abd.h>
#include <sys/vdev.h>
#include <sys/cityhash.h>
#include <sys/spa_impl.h>

kstat_t *dbuf_ksp;

typedef struct dbuf_stats {
	/*
	 * Various statistics about the size of the dbuf cache.
	 */
	kstat_named_t cache_count;
	kstat_named_t cache_size_bytes;
	kstat_named_t cache_size_bytes_max;
	/*
	 * Statistics regarding the bounds on the dbuf cache size.
	 */
	kstat_named_t cache_target_bytes;
	kstat_named_t cache_lowater_bytes;
	kstat_named_t cache_hiwater_bytes;
	/*
	 * Total number of dbuf cache evictions that have occurred.
	 */
	kstat_named_t cache_total_evicts;
	/*
	 * The distribution of dbuf levels in the dbuf cache and
	 * the total size of all dbufs at each level.
	 */
	kstat_named_t cache_levels[DN_MAX_LEVELS];
	kstat_named_t cache_levels_bytes[DN_MAX_LEVELS];
	/*
	 * Statistics about the dbuf hash table.
	 */
	kstat_named_t hash_hits;
	kstat_named_t hash_misses;
	kstat_named_t hash_collisions;
	kstat_named_t hash_elements;
	kstat_named_t hash_elements_max;
	/*
	 * Number of sublists containing more than one dbuf in the dbuf
	 * hash table. Keep track of the longest hash chain.
	 */
	kstat_named_t hash_chains;
	kstat_named_t hash_chain_max;
	/*
	 * Number of times a dbuf_create() discovers that a dbuf was
	 * already created and in the dbuf hash table.
	 */
	kstat_named_t hash_insert_race;
	/*
	 * Statistics about the size of the metadata dbuf cache.
	 */
	kstat_named_t metadata_cache_count;
	kstat_named_t metadata_cache_size_bytes;
	kstat_named_t metadata_cache_size_bytes_max;
	/*
	 * For diagnostic purposes, this is incremented whenever we can't add
	 * something to the metadata cache because it's full, and instead put
	 * the data in the regular dbuf cache.
	 */
	kstat_named_t metadata_cache_overflow;
} dbuf_stats_t;

dbuf_stats_t dbuf_stats = {
	{ "cache_count",			KSTAT_DATA_UINT64 },
	{ "cache_size_bytes",			KSTAT_DATA_UINT64 },
	{ "cache_size_bytes_max",		KSTAT_DATA_UINT64 },
	{ "cache_target_bytes",			KSTAT_DATA_UINT64 },
	{ "cache_lowater_bytes",		KSTAT_DATA_UINT64 },
	{ "cache_hiwater_bytes",		KSTAT_DATA_UINT64 },
	{ "cache_total_evicts",			KSTAT_DATA_UINT64 },
	{ { "cache_levels_N",			KSTAT_DATA_UINT64 } },
	{ { "cache_levels_bytes_N",		KSTAT_DATA_UINT64 } },
	{ "hash_hits",				KSTAT_DATA_UINT64 },
	{ "hash_misses",			KSTAT_DATA_UINT64 },
	{ "hash_collisions",			KSTAT_DATA_UINT64 },
	{ "hash_elements",			KSTAT_DATA_UINT64 },
	{ "hash_elements_max",			KSTAT_DATA_UINT64 },
	{ "hash_chains",			KSTAT_DATA_UINT64 },
	{ "hash_chain_max",			KSTAT_DATA_UINT64 },
	{ "hash_insert_race",			KSTAT_DATA_UINT64 },
	{ "metadata_cache_count",		KSTAT_DATA_UINT64 },
	{ "metadata_cache_size_bytes",		KSTAT_DATA_UINT64 },
	{ "metadata_cache_size_bytes_max",	KSTAT_DATA_UINT64 },
	{ "metadata_cache_overflow",		KSTAT_DATA_UINT64 }
};
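
/*
 * Note: these counters are exported read-only through the "dbufstats"
 * kstat created in dbuf_init() below; on Linux the kstat is typically
 * visible as /proc/spl/kstat/zfs/dbufstats (the exact path depends on
 * the platform's kstat implementation).
 */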

#define	DBUF_STAT_INCR(stat, val)	\
	atomic_add_64(&dbuf_stats.stat.value.ui64, (val));
#define	DBUF_STAT_DECR(stat, val)	\
	DBUF_STAT_INCR(stat, -(val));
#define	DBUF_STAT_BUMP(stat)		\
	DBUF_STAT_INCR(stat, 1);
#define	DBUF_STAT_BUMPDOWN(stat)	\
	DBUF_STAT_INCR(stat, -1);
#define	DBUF_STAT_MAX(stat, v) {	\
	uint64_t _m;			\
	while ((v) > (_m = dbuf_stats.stat.value.ui64) &&	\
	    (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\
		continue;		\
}
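
/*
 * DBUF_STAT_MAX raises the named stat to (v) with a lock-free
 * compare-and-swap loop: the current maximum is re-read and the CAS
 * retried whenever another thread updated the value in between, so a
 * larger concurrent update is never overwritten by a smaller one.
 */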

static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);

extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
    dmu_buf_evict_func_t *evict_func_sync,
    dmu_buf_evict_func_t *evict_func_async,
    dmu_buf_t **clear_on_evict_dbufp);

/*
 * Global data structures and functions for the dbuf cache.
 */
static kmem_cache_t *dbuf_kmem_cache;
static taskq_t *dbu_evict_taskq;

static kthread_t *dbuf_cache_evict_thread;
static kmutex_t dbuf_evict_lock;
static kcondvar_t dbuf_evict_cv;
static boolean_t dbuf_evict_thread_exit;

/*
 * There are two dbuf caches; each dbuf can only be in one of them at a time.
 *
 * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
 *    from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
 *    that represent the metadata that describes filesystems/snapshots/
 *    bookmarks/properties/etc. We only evict from this cache when we export a
 *    pool, to short-circuit as much I/O as possible for all administrative
 *    commands that need the metadata. There is no eviction policy for this
 *    cache, because we try to only include types in it which would occupy a
 *    very small amount of space per object but have a large impact on the
 *    performance of these commands. Instead, after it reaches a maximum size
 *    (which should only happen on very small memory systems with a very large
 *    number of filesystem objects), we stop taking new dbufs into the
 *    metadata cache, instead putting them in the normal dbuf cache.
 *
 * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
 *    are not currently held but have been recently released. These dbufs
 *    are not eligible for arc eviction until they are aged out of the cache.
 *    Dbufs that are aged out of the cache will be immediately destroyed and
 *    become eligible for arc eviction.
 *
 * Dbufs are added to these caches once the last hold is released. If a dbuf is
 * later accessed and still exists in the dbuf cache, then it will be removed
 * from the cache and later re-added to the head of the cache.
 *
 * If a given dbuf meets the requirements for the metadata cache, it will go
 * there, otherwise it will be considered for the generic LRU dbuf cache. The
 * caches and the refcounts tracking their sizes are stored in an array indexed
 * by those caches' matching enum values (from dbuf_cached_state_t).
 */
typedef struct dbuf_cache {
	multilist_t *cache;
	zfs_refcount_t size;
} dbuf_cache_t;
dbuf_cache_t dbuf_caches[DB_CACHE_MAX];

/* Size limits for the caches */
unsigned long dbuf_cache_max_bytes = 0;
unsigned long dbuf_metadata_cache_max_bytes = 0;
/* Set the default sizes of the caches to log2 fraction of arc size */
int dbuf_cache_shift = 5;
int dbuf_metadata_cache_shift = 6;
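
/*
 * For example, with the default shifts above and an ARC target size of
 * 8 GiB, the dbuf cache is sized to 8 GiB >> 5 = 256 MiB and the
 * metadata cache to 8 GiB >> 6 = 128 MiB (see dbuf_cache_target_bytes()
 * and dbuf_init() below).
 */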

/*
 * The LRU dbuf cache uses a three-stage eviction policy:
 *	- A low water marker designates when the dbuf eviction thread
 *	should stop evicting from the dbuf cache.
 *	- When we reach the maximum size (aka mid water mark), we
 *	signal the eviction thread to run.
 *	- The high water mark indicates when the eviction thread
 *	is unable to keep up with the incoming load and eviction must
 *	happen in the context of the calling thread.
 *
 * The dbuf cache:
 *                                                 (max size)
 *                                      low water   mid water   hi water
 * +----------------------------------------+----------+----------+
 * |                                        |          |          |
 * |                                        |          |          |
 * |                                        |          |          |
 * |                                        |          |          |
 * +----------------------------------------+----------+----------+
 *                                        stop        signal     evict
 *                                        evicting    eviction   directly
 *                                                    thread
 *
 * The high and low water marks indicate the operating range for the eviction
 * thread. The low water mark is, by default, 90% of the total size of the
 * cache and the high water mark is at 110% (both of these percentages can be
 * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
 * respectively). The eviction thread will try to ensure that the cache remains
 * within this range by waking up every second and checking if the cache is
 * above the low water mark. The thread can also be woken up by callers adding
 * elements into the cache if the cache is larger than the mid water (i.e. max
 * cache size). Once the eviction thread is woken up and eviction is required,
 * it will continue evicting buffers until it's able to reduce the cache size
 * to the low water mark. If the cache size continues to grow and hits the high
 * water mark, then callers adding elements to the cache will begin to evict
 * directly from the cache until the cache is no longer above the high water
 * mark.
 */
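
/*
 * A worked example, assuming a 100 MiB target size with the default
 * dbuf_cache_hiwater_pct = dbuf_cache_lowater_pct = 10: the eviction
 * thread stops once it gets the cache down to the 90 MiB low water
 * mark, is signaled when the cache grows past the 100 MiB target (mid
 * water), and callers start evicting directly once the cache exceeds
 * the 110 MiB high water mark.
 */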

/*
 * The percentage above and below the maximum cache size.
 */
uint_t dbuf_cache_hiwater_pct = 10;
uint_t dbuf_cache_lowater_pct = 10;

/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
	dmu_buf_impl_t *db = vdb;
	bzero(db, sizeof (dmu_buf_impl_t));

	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL);
	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
	multilist_link_init(&db->db_cache_link);
	zfs_refcount_create(&db->db_holds);

	return (0);
}

/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
	dmu_buf_impl_t *db = vdb;
	mutex_destroy(&db->db_mtx);
	rw_destroy(&db->db_rwlock);
	cv_destroy(&db->db_changed);
	ASSERT(!multilist_link_active(&db->db_cache_link));
	zfs_refcount_destroy(&db->db_holds);
}

/*
 * dbuf hash table routines
 */
static dbuf_hash_table_t dbuf_hash_table;

static uint64_t dbuf_hash_count;

/*
 * We use Cityhash for this. It's fast, and has good hash properties without
 * requiring any large static buffers.
 */
static uint64_t
dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
{
	return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
}

#define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
	((dbuf)->db.db_object == (obj) &&		\
	(dbuf)->db_objset == (os) &&			\
	(dbuf)->db_level == (level) &&			\
	(dbuf)->db_blkid == (blkid))

dmu_buf_impl_t *
dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv;
	uint64_t idx;
	dmu_buf_impl_t *db;

	hv = dbuf_hash(os, obj, level, blkid);
	idx = hv & h->hash_table_mask;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
			mutex_enter(&db->db_mtx);
			if (db->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (db);
			}
			mutex_exit(&db->db_mtx);
		}
	}
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	return (NULL);
}

static dmu_buf_impl_t *
dbuf_find_bonus(objset_t *os, uint64_t object)
{
	dnode_t *dn;
	dmu_buf_impl_t *db = NULL;

	if (dnode_hold(os, object, FTAG, &dn) == 0) {
		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		if (dn->dn_bonus != NULL) {
			db = dn->dn_bonus;
			mutex_enter(&db->db_mtx);
		}
		rw_exit(&dn->dn_struct_rwlock);
		dnode_rele(dn, FTAG);
	}
	return (db);
}

/*
 * Insert an entry into the hash table. If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static dmu_buf_impl_t *
dbuf_hash_insert(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	objset_t *os = db->db_objset;
	uint64_t obj = db->db.db_object;
	int level = db->db_level;
	uint64_t blkid, hv, idx;
	dmu_buf_impl_t *dbf;
	uint32_t i;

	blkid = db->db_blkid;
	hv = dbuf_hash(os, obj, level, blkid);
	idx = hv & h->hash_table_mask;

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
	    dbf = dbf->db_hash_next, i++) {
		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
			mutex_enter(&dbf->db_mtx);
			if (dbf->db_state != DB_EVICTING) {
				mutex_exit(DBUF_HASH_MUTEX(h, idx));
				return (dbf);
			}
			mutex_exit(&dbf->db_mtx);
		}
	}

	if (i > 0) {
		DBUF_STAT_BUMP(hash_collisions);
		if (i == 1)
			DBUF_STAT_BUMP(hash_chains);

		DBUF_STAT_MAX(hash_chain_max, i);
	}

	mutex_enter(&db->db_mtx);
	db->db_hash_next = h->hash_table[idx];
	h->hash_table[idx] = db;
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_inc_64(&dbuf_hash_count);
	DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count);

	return (NULL);
}

/*
 * This returns whether this dbuf should be stored in the metadata cache, which
 * is based on whether it's from one of the dnode types that store data related
 * to traversing dataset hierarchies.
 */
static boolean_t
dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
{
	DB_DNODE_ENTER(db);
	dmu_object_type_t type = DB_DNODE(db)->dn_type;
	DB_DNODE_EXIT(db);

	/* Check if this dbuf is one of the types we care about */
	if (DMU_OT_IS_METADATA_CACHED(type)) {
		/* If we hit this, then we set something up wrong in dmu_ot */
		ASSERT(DMU_OT_IS_METADATA(type));

		/*
		 * Sanity check for small-memory systems: don't allocate too
		 * much memory for this purpose.
		 */
		if (zfs_refcount_count(
		    &dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
		    dbuf_metadata_cache_max_bytes) {
			DBUF_STAT_BUMP(metadata_cache_overflow);
			return (B_FALSE);
		}

		return (B_TRUE);
	}

	return (B_FALSE);
}
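
/*
 * The types for which DMU_OT_IS_METADATA_CACHED() returns true are the
 * DSL-level objects used to walk the dataset hierarchy (for example,
 * DSL directory and DSL dataset objects); the dmu_ot table is the
 * authoritative list of which types carry the metadata-cached flag.
 */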

/*
 * Remove an entry from the hash table. It must be in the EVICTING state.
 */
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	uint64_t hv, idx;
	dmu_buf_impl_t *dbf, **dbp;

	hv = dbuf_hash(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid);
	idx = hv & h->hash_table_mask;

	/*
	 * We mustn't hold db_mtx to maintain lock ordering:
	 * DBUF_HASH_MUTEX > db_mtx.
	 */
	ASSERT(zfs_refcount_is_zero(&db->db_holds));
	ASSERT(db->db_state == DB_EVICTING);
	ASSERT(!MUTEX_HELD(&db->db_mtx));

	mutex_enter(DBUF_HASH_MUTEX(h, idx));
	dbp = &h->hash_table[idx];
	while ((dbf = *dbp) != db) {
		dbp = &dbf->db_hash_next;
		ASSERT(dbf != NULL);
	}
	*dbp = db->db_hash_next;
	db->db_hash_next = NULL;
	if (h->hash_table[idx] &&
	    h->hash_table[idx]->db_hash_next == NULL)
		DBUF_STAT_BUMPDOWN(hash_chains);
	mutex_exit(DBUF_HASH_MUTEX(h, idx));
	atomic_dec_64(&dbuf_hash_count);
}

typedef enum {
	DBVU_EVICTING,
	DBVU_NOT_EVICTING
} dbvu_verify_type_t;

static void
dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
{
#ifdef ZFS_DEBUG
	int64_t holds;

	if (db->db_user == NULL)
		return;

	/* Only data blocks support the attachment of user data. */
	ASSERT(db->db_level == 0);

	/* Clients must resolve a dbuf before attaching user data. */
	ASSERT(db->db.db_data != NULL);
	ASSERT3U(db->db_state, ==, DB_CACHED);

	holds = zfs_refcount_count(&db->db_holds);
	if (verify_type == DBVU_EVICTING) {
		/*
		 * Immediate eviction occurs when holds == dirtycnt.
		 * For normal eviction buffers, holds is zero on
		 * eviction, except when dbuf_fix_old_data() calls
		 * dbuf_clear_data(). However, the hold count can grow
		 * during eviction even though db_mtx is held (see
		 * dmu_bonus_hold() for an example), so we can only
		 * test the generic invariant that holds >= dirtycnt.
		 */
		ASSERT3U(holds, >=, db->db_dirtycnt);
	} else {
		if (db->db_user_immediate_evict == TRUE)
			ASSERT3U(holds, >=, db->db_dirtycnt);
		else
			ASSERT3U(holds, >, 0);
	}
#endif
}

static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
	dmu_buf_user_t *dbu = db->db_user;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (dbu == NULL)
		return;

	dbuf_verify_user(db, DBVU_EVICTING);
	db->db_user = NULL;

#ifdef ZFS_DEBUG
	if (dbu->dbu_clear_on_evict_dbufp != NULL)
		*dbu->dbu_clear_on_evict_dbufp = NULL;
#endif

	/*
	 * There are two eviction callbacks - one that we call synchronously
	 * and one that we invoke via a taskq. The async one is useful for
	 * avoiding lock order reversals and limiting stack depth.
	 *
	 * Note that if we have a sync callback but no async callback,
	 * it's likely that the sync callback will free the structure
	 * containing the dbu. In that case we need to take care to not
	 * dereference dbu after calling the sync evict func.
	 */
	boolean_t has_async = (dbu->dbu_evict_func_async != NULL);

	if (dbu->dbu_evict_func_sync != NULL)
		dbu->dbu_evict_func_sync(dbu);

	if (has_async) {
		taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
		    dbu, 0, &dbu->dbu_tqent);
	}
}

boolean_t
dbuf_is_metadata(dmu_buf_impl_t *db)
{
	/*
	 * Consider indirect blocks and spill blocks to be metadata.
	 */
	if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {
		return (B_TRUE);
	} else {
		boolean_t is_metadata;

		DB_DNODE_ENTER(db);
		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
		DB_DNODE_EXIT(db);

		return (is_metadata);
	}
}


/*
 * This function *must* return indices evenly distributed between all
 * sublists of the multilist. This is needed due to how the dbuf eviction
 * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
 * distributed between all sublists and uses this assumption when
 * deciding which sublist to evict from and how much to evict from it.
 */
unsigned int
dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
{
	dmu_buf_impl_t *db = obj;

	/*
	 * The assumption here is that the hash value for a given
	 * dmu_buf_impl_t will remain constant throughout its lifetime
	 * (i.e. its objset, object, level and blkid fields don't change).
	 * Thus, we don't need to store the dbuf's sublist index
	 * on insertion, as this index can be recalculated on removal.
	 *
	 * Also, the low order bits of the hash value are thought to be
	 * distributed evenly. Otherwise, in the case that the multilist
	 * has a power of two number of sublists, each sublist's usage
	 * would not be evenly distributed.
	 */
	return (dbuf_hash(db->db_objset, db->db.db_object,
	    db->db_level, db->db_blkid) %
	    multilist_get_num_sublists(ml));
}

static inline unsigned long
dbuf_cache_target_bytes(void)
{
	return (MIN(dbuf_cache_max_bytes,
	    arc_target_bytes() >> dbuf_cache_shift));
}

static inline uint64_t
dbuf_cache_hiwater_bytes(void)
{
	uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
	return (dbuf_cache_target +
	    (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100);
}

static inline uint64_t
dbuf_cache_lowater_bytes(void)
{
	uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
	return (dbuf_cache_target -
	    (dbuf_cache_target * dbuf_cache_lowater_pct) / 100);
}

static inline boolean_t
dbuf_cache_above_lowater(void)
{
	return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
	    dbuf_cache_lowater_bytes());
}

/*
 * Evict the oldest eligible dbuf from the dbuf cache.
 */
static void
dbuf_evict_one(void)
{
	int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
	multilist_sublist_t *mls = multilist_sublist_lock(
	    dbuf_caches[DB_DBUF_CACHE].cache, idx);

	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));

	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
		db = multilist_sublist_prev(mls, db);
	}

	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
	    multilist_sublist_t *, mls);

	if (db != NULL) {
		multilist_sublist_remove(mls, db);
		multilist_sublist_unlock(mls);
		(void) zfs_refcount_remove_many(
		    &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db);
		DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
		DBUF_STAT_BUMPDOWN(cache_count);
		DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
		    db->db.db_size);
		ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
		db->db_caching_status = DB_NO_CACHE;
		dbuf_destroy(db);
		DBUF_STAT_BUMP(cache_total_evicts);
	} else {
		multilist_sublist_unlock(mls);
	}
}

/*
 * The dbuf evict thread is responsible for aging out dbufs from the
 * cache. Once the cache has reached its maximum size, dbufs are removed
 * and destroyed. The eviction thread will continue running until the size
 * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
 * out of the cache it is destroyed and becomes eligible for arc eviction.
 */
/* ARGSUSED */
static void
dbuf_evict_thread(void *unused)
{
	callb_cpr_t cpr;

	CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);

	mutex_enter(&dbuf_evict_lock);
	while (!dbuf_evict_thread_exit) {
		while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
			CALLB_CPR_SAFE_BEGIN(&cpr);
			(void) cv_timedwait_sig_hires(&dbuf_evict_cv,
			    &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
			CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
		}
		mutex_exit(&dbuf_evict_lock);

		/*
		 * Keep evicting as long as we're above the low water mark
		 * for the cache. We do this without holding the locks to
		 * minimize lock contention.
		 */
		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
			dbuf_evict_one();
		}

		mutex_enter(&dbuf_evict_lock);
	}

	dbuf_evict_thread_exit = B_FALSE;
	cv_broadcast(&dbuf_evict_cv);
	CALLB_CPR_EXIT(&cpr);	/* drops dbuf_evict_lock */
	thread_exit();
}

/*
 * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
 * If the dbuf cache is at its high water mark, then evict a dbuf from the
 * dbuf cache using the caller's context.
 */
static void
dbuf_evict_notify(uint64_t size)
{
	/*
	 * We check if we should evict without holding the dbuf_evict_lock,
	 * because it's OK to occasionally make the wrong decision here,
	 * and grabbing the lock results in massive lock contention.
	 */
	if (size > dbuf_cache_target_bytes()) {
		if (size > dbuf_cache_hiwater_bytes())
			dbuf_evict_one();
		cv_signal(&dbuf_evict_cv);
	}
}

static int
dbuf_kstat_update(kstat_t *ksp, int rw)
{
	dbuf_stats_t *ds = ksp->ks_data;

	if (rw == KSTAT_WRITE) {
		return (SET_ERROR(EACCES));
	} else {
		ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
		    &dbuf_caches[DB_DBUF_METADATA_CACHE].size);
		ds->cache_size_bytes.value.ui64 =
		    zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
		ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
		ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
		ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
		ds->hash_elements.value.ui64 = dbuf_hash_count;
	}

	return (0);
}

void
dbuf_init(void)
{
	uint64_t hsize = 1ULL << 16;
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average block size of zfs_arc_average_blocksize (default 8K).
	 * By default, the table will take up
	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
	 */
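	/*
	 * For example, on a system with 4 GiB of physical memory and the
	 * default 8 KiB zfs_arc_average_blocksize, hsize below grows to
	 * 2^19 (524288) buckets, i.e. a 4 MiB table of 8-byte pointers
	 * (the "1MB per GB" noted above).
	 */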
	while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
		hsize <<= 1;

retry:
	h->hash_table_mask = hsize - 1;
#if defined(_KERNEL)
	/*
	 * Large allocations which do not require contiguous pages
	 * should be using vmem_alloc() in the linux kernel
	 */
	h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
#else
	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
#endif
	if (h->hash_table == NULL) {
		/* XXX - we should really return an error instead of assert */
		ASSERT(hsize > (1ULL << 10));
		hsize >>= 1;
		goto retry;
	}

	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
	    sizeof (dmu_buf_impl_t),
	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);

	dbuf_stats_init(h);

	/*
	 * Set up the parameters for the dbuf caches. We set the sizes of the
	 * dbuf cache and the metadata cache to 1/32nd and 1/16th (default)
	 * of the target size of the ARC. If the values have been specified as
	 * module options and they're not greater than the target size of the
	 * ARC, then we honor that value.
	 */
	if (dbuf_cache_max_bytes == 0 ||
	    dbuf_cache_max_bytes >= arc_target_bytes()) {
		dbuf_cache_max_bytes = arc_target_bytes() >> dbuf_cache_shift;
	}
	if (dbuf_metadata_cache_max_bytes == 0 ||
	    dbuf_metadata_cache_max_bytes >= arc_target_bytes()) {
		dbuf_metadata_cache_max_bytes =
		    arc_target_bytes() >> dbuf_metadata_cache_shift;
	}

	/*
	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
	 * configuration is not required.
	 */
	dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);

	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
		dbuf_caches[dcs].cache =
		    multilist_create(sizeof (dmu_buf_impl_t),
		    offsetof(dmu_buf_impl_t, db_cache_link),
		    dbuf_cache_multilist_index_func);
		zfs_refcount_create(&dbuf_caches[dcs].size);
	}

	dbuf_evict_thread_exit = B_FALSE;
	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
	dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
	    NULL, 0, &p0, TS_RUN, minclsyspri);

	dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (dbuf_ksp != NULL) {
		for (i = 0; i < DN_MAX_LEVELS; i++) {
			snprintf(dbuf_stats.cache_levels[i].name,
			    KSTAT_STRLEN, "cache_level_%d", i);
			dbuf_stats.cache_levels[i].data_type =
			    KSTAT_DATA_UINT64;
			snprintf(dbuf_stats.cache_levels_bytes[i].name,
			    KSTAT_STRLEN, "cache_level_%d_bytes", i);
			dbuf_stats.cache_levels_bytes[i].data_type =
			    KSTAT_DATA_UINT64;
		}
		dbuf_ksp->ks_data = &dbuf_stats;
		dbuf_ksp->ks_update = dbuf_kstat_update;
		kstat_install(dbuf_ksp);
	}
}

void
dbuf_fini(void)
{
	dbuf_hash_table_t *h = &dbuf_hash_table;
	int i;

	dbuf_stats_destroy();

	for (i = 0; i < DBUF_MUTEXES; i++)
		mutex_destroy(&h->hash_mutexes[i]);
#if defined(_KERNEL)
	/*
	 * Large allocations which do not require contiguous pages
	 * should be using vmem_free() in the linux kernel
	 */
	vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
#else
	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
#endif
	kmem_cache_destroy(dbuf_kmem_cache);
	taskq_destroy(dbu_evict_taskq);

	mutex_enter(&dbuf_evict_lock);
	dbuf_evict_thread_exit = B_TRUE;
	while (dbuf_evict_thread_exit) {
		cv_signal(&dbuf_evict_cv);
		cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
	}
	mutex_exit(&dbuf_evict_lock);

	mutex_destroy(&dbuf_evict_lock);
	cv_destroy(&dbuf_evict_cv);

	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
		zfs_refcount_destroy(&dbuf_caches[dcs].size);
		multilist_destroy(dbuf_caches[dcs].cache);
	}

	if (dbuf_ksp != NULL) {
		kstat_delete(dbuf_ksp);
		dbuf_ksp = NULL;
	}
}

/*
 * Other stuff.
 */

#ifdef ZFS_DEBUG
static void
dbuf_verify(dmu_buf_impl_t *db)
{
	dnode_t *dn;
	dbuf_dirty_record_t *dr;
	uint32_t txg_prev;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
		return;

	ASSERT(db->db_objset != NULL);
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dn == NULL) {
		ASSERT(db->db_parent == NULL);
		ASSERT(db->db_blkptr == NULL);
	} else {
		ASSERT3U(db->db.db_object, ==, dn->dn_object);
		ASSERT3P(db->db_objset, ==, dn->dn_objset);
		ASSERT3U(db->db_level, <, dn->dn_nlevels);
		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
		    db->db_blkid == DMU_SPILL_BLKID ||
		    !avl_is_empty(&dn->dn_dbufs));
	}
	if (db->db_blkid == DMU_BONUS_BLKID) {
		ASSERT(dn != NULL);
		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
	} else if (db->db_blkid == DMU_SPILL_BLKID) {
		ASSERT(dn != NULL);
		ASSERT0(db->db.db_offset);
	} else {
		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
	}

	if ((dr = list_head(&db->db_dirty_records)) != NULL) {
		ASSERT(dr->dr_dbuf == db);
		txg_prev = dr->dr_txg;
		for (dr = list_next(&db->db_dirty_records, dr); dr != NULL;
		    dr = list_next(&db->db_dirty_records, dr)) {
			ASSERT(dr->dr_dbuf == db);
			ASSERT(txg_prev > dr->dr_txg);
			txg_prev = dr->dr_txg;
		}
	}

	/*
	 * We can't assert that db_size matches dn_datablksz because it
	 * can be momentarily different when another thread is doing
	 * dnode_set_blksz().
	 */
	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
		dr = db->db_data_pending;
		/*
		 * It should only be modified in syncing context, so
		 * make sure we only have one copy of the data.
		 */
		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
	}

	/* verify db->db_blkptr */
	if (db->db_blkptr) {
		if (db->db_parent == dn->dn_dbuf) {
			/* db is pointed to by the dnode */
			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
				ASSERT(db->db_parent == NULL);
			else
				ASSERT(db->db_parent != NULL);
			if (db->db_blkid != DMU_SPILL_BLKID)
				ASSERT3P(db->db_blkptr, ==,
				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
		} else {
			/* db is pointed to by an indirect block */
			int epb __maybe_unused = db->db_parent->db.db_size >>
			    SPA_BLKPTRSHIFT;
			ASSERT3U(db->db_parent->db_level, ==, db->db_level + 1);
			ASSERT3U(db->db_parent->db.db_object, ==,
			    db->db.db_object);
			/*
			 * dnode_grow_indblksz() can make this fail if we don't
			 * have the parent's rwlock. XXX indblksz no longer
			 * grows. safe to do this now?
			 */
			if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) {
				ASSERT3P(db->db_blkptr, ==,
				    ((blkptr_t *)db->db_parent->db.db_data +
				    db->db_blkid % epb));
			}
		}
	}
	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
	    (db->db_buf == NULL || db->db_buf->b_data) &&
	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
	    db->db_state != DB_FILL && !dn->dn_free_txg) {
		/*
		 * If the blkptr isn't set but they have nonzero data,
		 * it had better be dirty, otherwise we'll lose that
		 * data when we evict this buffer.
		 *
		 * There is an exception to this rule for indirect blocks; in
		 * this case, if the indirect block is a hole, we fill in a few
		 * fields on each of the child blocks (importantly, birth time)
		 * to prevent hole birth times from being lost when you
		 * partially fill in a hole.
		 */
		if (db->db_dirtycnt == 0) {
			if (db->db_level == 0) {
				uint64_t *buf = db->db.db_data;
				int i;

				for (i = 0; i < db->db.db_size >> 3; i++) {
					ASSERT(buf[i] == 0);
				}
			} else {
				blkptr_t *bps = db->db.db_data;
				ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
				    db->db.db_size);
				/*
				 * We want to verify that all the blkptrs in the
				 * indirect block are holes, but we may have
				 * automatically set up a few fields for them.
				 * We iterate through each blkptr and verify
				 * they only have those fields set.
				 */
				for (int i = 0;
				    i < db->db.db_size / sizeof (blkptr_t);
				    i++) {
					blkptr_t *bp = &bps[i];
					ASSERT(ZIO_CHECKSUM_IS_ZERO(
					    &bp->blk_cksum));
					ASSERT(
					    DVA_IS_EMPTY(&bp->blk_dva[0]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
					    DVA_IS_EMPTY(&bp->blk_dva[2]));
					ASSERT0(bp->blk_fill);
					ASSERT0(bp->blk_pad[0]);
					ASSERT0(bp->blk_pad[1]);
					ASSERT(!BP_IS_EMBEDDED(bp));
					ASSERT(BP_IS_HOLE(bp));
					ASSERT0(bp->blk_phys_birth);
				}
			}
		}
	}
	DB_DNODE_EXIT(db);
}
#endif

static void
dbuf_clear_data(dmu_buf_impl_t *db)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	dbuf_evict_user(db);
	ASSERT3P(db->db_buf, ==, NULL);
	db->db.db_data = NULL;
	if (db->db_state != DB_NOFILL)
		db->db_state = DB_UNCACHED;
}

static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(buf != NULL);

	db->db_buf = buf;
	ASSERT(buf->b_data != NULL);
	db->db.db_data = buf->b_data;
}

/*
 * Loan out an arc_buf for read. Return the loaned arc_buf.
 */
arc_buf_t *
dbuf_loan_arcbuf(dmu_buf_impl_t *db)
{
	arc_buf_t *abuf;

	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) {
		int blksz = db->db.db_size;
		spa_t *spa = db->db_objset->os_spa;

		mutex_exit(&db->db_mtx);
		abuf = arc_loan_buf(spa, B_FALSE, blksz);
		bcopy(db->db.db_data, abuf->b_data, blksz);
	} else {
		abuf = db->db_buf;
		arc_loan_inuse_buf(abuf, db);
		db->db_buf = NULL;
		dbuf_clear_data(db);
		mutex_exit(&db->db_mtx);
	}
	return (abuf);
}

/*
 * Calculate which level n block references the data at the level 0 offset
 * provided.
 */
uint64_t
dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
{
	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
		/*
		 * The level n blkid is equal to the level 0 blkid divided by
		 * the number of level 0s in a level n block.
		 *
		 * The level 0 blkid is offset >> datablkshift =
		 * offset / 2^datablkshift.
		 *
		 * The number of level 0s in a level n is the number of block
		 * pointers in an indirect block, raised to the power of level.
		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
		 *
		 * Thus, the level n blkid is: offset /
		 * ((2^datablkshift)*(2^(level*(indblkshift-SPA_BLKPTRSHIFT))))
		 * = offset / 2^(datablkshift + level *
		 * (indblkshift - SPA_BLKPTRSHIFT))
		 * = offset >> (datablkshift + level *
		 * (indblkshift - SPA_BLKPTRSHIFT))
		 */

		const unsigned exp = dn->dn_datablkshift +
		    level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT);

		if (exp >= 8 * sizeof (offset)) {
			/* This only happens on the highest indirection level */
			ASSERT3U(level, ==, dn->dn_nlevels - 1);
			return (0);
		}

		ASSERT3U(exp, <, 8 * sizeof (offset));

		return (offset >> exp);
	} else {
		ASSERT3U(offset, <, dn->dn_datablksz);
		return (0);
	}
}
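
/*
 * A worked example of the math above: with 128 KiB data blocks
 * (datablkshift = 17) and 128 KiB indirect blocks (indblkshift = 17,
 * so each indirect block holds 2^(17 - SPA_BLKPTRSHIFT) = 1024 block
 * pointers), the level 1 blkid covering offset 512 MiB is
 * 2^29 >> (17 + 1 * 10) = 4.
 */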

/*
 * This function is used to lock the parent of the provided dbuf. This should be
 * used when modifying or reading db_blkptr.
 */
db_lock_type_t
dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag)
{
	enum db_lock_type ret = DLT_NONE;
	if (db->db_parent != NULL) {
		rw_enter(&db->db_parent->db_rwlock, rw);
		ret = DLT_PARENT;
	} else if (dmu_objset_ds(db->db_objset) != NULL) {
		rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw,
		    tag);
		ret = DLT_OBJSET;
	}
	/*
	 * We only return a DLT_NONE lock when it's the top-most indirect block
	 * of the meta-dnode of the MOS.
	 */
	return (ret);
}

/*
 * We need to pass the lock type in because it's possible that the block will
 * move from being the topmost indirect block in a dnode (and thus, have no
 * parent) to not the top-most via an indirection increase. This would cause a
 * panic if we didn't pass the lock type in.
 */
void
dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag)
{
	if (type == DLT_PARENT)
		rw_exit(&db->db_parent->db_rwlock);
	else if (type == DLT_OBJSET)
		rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag);
}

static void
dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
    arc_buf_t *buf, void *vdb)
{
	dmu_buf_impl_t *db = vdb;

	mutex_enter(&db->db_mtx);
	ASSERT3U(db->db_state, ==, DB_READ);
	/*
	 * All reads are synchronous, so we must have a hold on the dbuf
	 */
	ASSERT(zfs_refcount_count(&db->db_holds) > 0);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db.db_data == NULL);
	if (buf == NULL) {
		/* i/o error */
		ASSERT(zio == NULL || zio->io_error != 0);
		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
		ASSERT3P(db->db_buf, ==, NULL);
		db->db_state = DB_UNCACHED;
	} else if (db->db_level == 0 && db->db_freed_in_flight) {
		/* freed in flight */
		ASSERT(zio == NULL || zio->io_error == 0);
		arc_release(buf, db);
		bzero(buf->b_data, db->db.db_size);
		arc_buf_freeze(buf);
		db->db_freed_in_flight = FALSE;
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	} else {
		/* success */
		ASSERT(zio == NULL || zio->io_error == 0);
		dbuf_set_data(db, buf);
		db->db_state = DB_CACHED;
	}
	cv_broadcast(&db->db_changed);
	dbuf_rele_and_unlock(db, NULL, B_FALSE);
}


/*
 * This function ensures that, when doing a decrypting read of a block,
 * we make sure we have decrypted the dnode associated with it. We must do
 * this so that we ensure we are fully authenticating the checksum-of-MACs
 * tree from the root of the objset down to this block. Indirect blocks are
 * always verified against their secure checksum-of-MACs assuming that the
 * dnode containing them is correct. Now that we are doing a decrypting read,
 * we can be sure that the key is loaded and verify that assumption. This is
 * especially important considering that we always read encrypted dnode
 * blocks as raw data (without verifying their MACs) to start, and
 * decrypt / authenticate them when we need to read an encrypted bonus buffer.
 */
static int
dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
{
	int err = 0;
	objset_t *os = db->db_objset;
	arc_buf_t *dnode_abuf;
	dnode_t *dn;
	zbookmark_phys_t zb;

	ASSERT(MUTEX_HELD(&db->db_mtx));

	if (!os->os_encrypted || os->os_raw_receive ||
	    (flags & DB_RF_NO_DECRYPT) != 0)
		return (0);

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;

	if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
		DB_DNODE_EXIT(db);
		return (0);
	}

	SET_BOOKMARK(&zb, dmu_objset_id(os),
	    DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
	err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);

	/*
	 * An error code of EACCES tells us that the key is still not
	 * available. This is ok if we are only reading authenticated
	 * (and therefore non-encrypted) blocks.
	 */
	if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID &&
	    !DMU_OT_IS_ENCRYPTED(dn->dn_type)) ||
	    (db->db_blkid == DMU_BONUS_BLKID &&
	    !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
		err = 0;

	DB_DNODE_EXIT(db);

	return (err);
}

/*
 * Drops db_mtx and the parent lock specified by dblt and tag before
 * returning.
 */
static int
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
    db_lock_type_t dblt, void *tag)
{
	dnode_t *dn;
	zbookmark_phys_t zb;
	uint32_t aflags = ARC_FLAG_NOWAIT;
	int err, zio_flags = 0;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db_state == DB_UNCACHED);
	ASSERT(db->db_buf == NULL);
	ASSERT(db->db_parent == NULL ||
	    RW_LOCK_HELD(&db->db_parent->db_rwlock));

	if (db->db_blkid == DMU_BONUS_BLKID) {
		/*
		 * The bonus length stored in the dnode may be less than
		 * the maximum available space in the bonus buffer.
		 */
		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
		int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);

		/* if the underlying dnode block is encrypted, decrypt it */
		err = dbuf_read_verify_dnode_crypt(db, flags);
		if (err != 0) {
			DB_DNODE_EXIT(db);
			mutex_exit(&db->db_mtx);
			return (err);
		}

		ASSERT3U(bonuslen, <=, db->db.db_size);
		db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
		arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
		if (bonuslen < max_bonuslen)
			bzero(db->db.db_data, max_bonuslen);
		if (bonuslen)
			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
		DB_DNODE_EXIT(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		dmu_buf_unlock_parent(db, dblt, tag);
		return (0);
	}

	/*
	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
	 * processes the delete record and clears the bp while we are waiting
	 * for the dn_mtx (resulting in a "no" from block_freed).
	 */
	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
	    BP_IS_HOLE(db->db_blkptr)))) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);

		dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type,
		    db->db.db_size));
		bzero(db->db.db_data, db->db.db_size);

		if (db->db_blkptr != NULL && db->db_level > 0 &&
		    BP_IS_HOLE(db->db_blkptr) &&
		    db->db_blkptr->blk_birth != 0) {
			blkptr_t *bps = db->db.db_data;
			for (int i = 0; i < ((1 <<
			    DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
			    i++) {
				blkptr_t *bp = &bps[i];
				ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
				    1 << dn->dn_indblkshift);
				BP_SET_LSIZE(bp,
				    BP_GET_LEVEL(db->db_blkptr) == 1 ?
				    dn->dn_datablksz :
				    BP_GET_LSIZE(db->db_blkptr));
				BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
				BP_SET_LEVEL(bp,
				    BP_GET_LEVEL(db->db_blkptr) - 1);
				BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
			}
		}
		DB_DNODE_EXIT(db);
		db->db_state = DB_CACHED;
		mutex_exit(&db->db_mtx);
		dmu_buf_unlock_parent(db, dblt, tag);
		return (0);
	}

	/*
	 * Any attempt to read a redacted block should result in an error. This
	 * will never happen under normal conditions, but can be useful for
	 * debugging purposes.
	 */
	if (BP_IS_REDACTED(db->db_blkptr)) {
		ASSERT(dsl_dataset_feature_is_active(
		    db->db_objset->os_dsl_dataset,
		    SPA_FEATURE_REDACTED_DATASETS));
		DB_DNODE_EXIT(db);
		mutex_exit(&db->db_mtx);
		return (SET_ERROR(EIO));
	}

	SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
	    db->db.db_object, db->db_level, db->db_blkid);

	/*
	 * All bps of an encrypted os should have the encryption bit set.
	 * If this is not true it indicates tampering and we report an error.
	 */
	if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
		spa_log_error(db->db_objset->os_spa, &zb);
		zfs_panic_recover("unencrypted block in encrypted "
		    "object set %llu", dmu_objset_id(db->db_objset));
		DB_DNODE_EXIT(db);
		mutex_exit(&db->db_mtx);
		dmu_buf_unlock_parent(db, dblt, tag);
		return (SET_ERROR(EIO));
	}

	err = dbuf_read_verify_dnode_crypt(db, flags);
	if (err != 0) {
		DB_DNODE_EXIT(db);
		dmu_buf_unlock_parent(db, dblt, tag);
		mutex_exit(&db->db_mtx);
		return (err);
	}

	DB_DNODE_EXIT(db);

	db->db_state = DB_READ;
	mutex_exit(&db->db_mtx);

	if (DBUF_IS_L2CACHEABLE(db))
		aflags |= ARC_FLAG_L2CACHE;

	dbuf_add_ref(db, NULL);

	zio_flags = (flags & DB_RF_CANFAIL) ?
	    ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;

	if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
		zio_flags |= ZIO_FLAG_RAW;
	/*
	 * The zio layer will copy the provided blkptr later, but we need to
	 * do this now so that we can release the parent's rwlock. We have to
	 * do that now so that if dbuf_read_done is called synchronously (on
	 * an l1 cache hit) we don't acquire the db_mtx while holding the
	 * parent's rwlock, which would be a lock ordering violation.
	 */
	blkptr_t bp = *db->db_blkptr;
	dmu_buf_unlock_parent(db, dblt, tag);
	(void) arc_read(zio, db->db_objset->os_spa, &bp,
	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
	    &aflags, &zb);
	return (err);
}

/*
 * This is our just-in-time copy function. It makes a copy of buffers that
 * have been modified in a previous transaction group before we access them in
 * the current active group.
 *
 * This function is used in three places: when we are dirtying a buffer for the
 * first time in a txg, when we are freeing a range in a dnode that includes
 * this buffer, and when we are accessing a buffer which was received compressed
 * and later referenced in a WRITE_BYREF record.
 *
 * Note that when we are called from dbuf_free_range() we do not put a hold on
 * the buffer, we just traverse the active dbuf list for the dnode.
 */
static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
	dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);

	ASSERT(MUTEX_HELD(&db->db_mtx));
	ASSERT(db->db.db_data != NULL);
	ASSERT(db->db_level == 0);
	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);

	if (dr == NULL ||
	    (dr->dt.dl.dr_data !=
	    ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
		return;

	/*
	 * If the last dirty record for this dbuf has not yet synced
	 * and it's referencing the dbuf data, either:
	 * reset the reference to point to a new copy,
	 * or (if there are no active holders)
	 * just null out the current db_data pointer.
	 */
	ASSERT3U(dr->dr_txg, >=, txg - 2);
	if (db->db_blkid == DMU_BONUS_BLKID) {
		dnode_t *dn = DB_DNODE(db);
		int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
		dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
		arc_space_consume(bonuslen, ARC_SPACE_BONUS);
		bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
	} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
		dnode_t *dn = DB_DNODE(db);
		int size = arc_buf_size(db->db_buf);
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;
		enum zio_compress compress_type =
		    arc_get_compression(db->db_buf);

		if (arc_is_encrypted(db->db_buf)) {
			boolean_t byteorder;
			uint8_t salt[ZIO_DATA_SALT_LEN];
			uint8_t iv[ZIO_DATA_IV_LEN];
			uint8_t mac[ZIO_DATA_MAC_LEN];

			arc_get_raw_params(db->db_buf, &byteorder, salt,
			    iv, mac);
			dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
			    dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
			    mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
			    compress_type);
		} else if (compress_type != ZIO_COMPRESS_OFF) {
			ASSERT3U(type, ==, ARC_BUFC_DATA);
			dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
			    size, arc_buf_lsize(db->db_buf), compress_type);
		} else {
			dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
		}
		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
	} else {
		db->db_buf = NULL;
		dbuf_clear_data(db);
	}
}

int
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
	int err = 0;
	boolean_t prefetch;
	dnode_t *dn;

	/*
	 * We don't have to hold the mutex to check db_state because it
	 * can't be freed while we have a hold on the buffer.
	 */
	ASSERT(!zfs_refcount_is_zero(&db->db_holds));

	if (db->db_state == DB_NOFILL)
		return (SET_ERROR(EIO));

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
	    DBUF_IS_CACHEABLE(db);

	mutex_enter(&db->db_mtx);
	if (db->db_state == DB_CACHED) {
		spa_t *spa = dn->dn_objset->os_spa;

		/*
		 * Ensure that this block's dnode has been decrypted if
		 * the caller has requested decrypted data.
		 */
		err = dbuf_read_verify_dnode_crypt(db, flags);

		/*
		 * If the arc buf is compressed or encrypted and the caller
		 * requested uncompressed data, we need to untransform it
		 * before returning. We also call arc_untransform() on any
		 * unauthenticated blocks, which will verify their MAC if
		 * the key is now available.
		 */
		if (err == 0 && db->db_buf != NULL &&
		    (flags & DB_RF_NO_DECRYPT) == 0 &&
		    (arc_is_encrypted(db->db_buf) ||
		    arc_is_unauthenticated(db->db_buf) ||
		    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
			zbookmark_phys_t zb;

			SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
			    db->db.db_object, db->db_level, db->db_blkid);
			dbuf_fix_old_data(db, spa_syncing_txg(spa));
			err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
			dbuf_set_data(db, db->db_buf);
		}
		mutex_exit(&db->db_mtx);
		if (err == 0 && prefetch) {
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
			    flags & DB_RF_HAVESTRUCT);
		}
		DB_DNODE_EXIT(db);
		DBUF_STAT_BUMP(hash_hits);
	} else if (db->db_state == DB_UNCACHED) {
		spa_t *spa = dn->dn_objset->os_spa;
		boolean_t need_wait = B_FALSE;

		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);

		if (zio == NULL &&
		    db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
			need_wait = B_TRUE;
		}
		err = dbuf_read_impl(db, zio, flags, dblt, FTAG);
		/*
		 * dbuf_read_impl has dropped db_mtx and our parent's rwlock
		 * for us
		 */
		if (!err && prefetch) {
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
			    flags & DB_RF_HAVESTRUCT);
		}

		DB_DNODE_EXIT(db);
		DBUF_STAT_BUMP(hash_misses);

		/*
		 * If we created a zio_root we must execute it to avoid
		 * leaking it, even if it isn't attached to any work due
		 * to an error in dbuf_read_impl().
		 */
		if (need_wait) {
			if (err == 0)
				err = zio_wait(zio);
			else
				VERIFY0(zio_wait(zio));
		}
	} else {
		/*
		 * Another reader came in while the dbuf was in flight
		 * between UNCACHED and CACHED. Either a writer will finish
		 * writing the buffer (sending the dbuf to CACHED) or the
		 * first reader's request will reach the read_done callback
		 * and send the dbuf to CACHED. Otherwise, a failure
		 * occurred and the dbuf went to UNCACHED.
		 */
		mutex_exit(&db->db_mtx);
		if (prefetch) {
			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
			    flags & DB_RF_HAVESTRUCT);
		}
		DB_DNODE_EXIT(db);
		DBUF_STAT_BUMP(hash_misses);

		/* Skip the wait per the caller's request. */
		mutex_enter(&db->db_mtx);
		if ((flags & DB_RF_NEVERWAIT) == 0) {
			while (db->db_state == DB_READ ||
			    db->db_state == DB_FILL) {
				ASSERT(db->db_state == DB_READ ||
				    (flags & DB_RF_HAVESTRUCT) == 0);
				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
				    db, zio_t *, zio);
				cv_wait(&db->db_changed, &db->db_mtx);
			}
			if (db->db_state == DB_UNCACHED)
				err = SET_ERROR(EIO);
		}
		mutex_exit(&db->db_mtx);
	}

	return (err);
}

static void
dbuf_noread(dmu_buf_impl_t *db)
{
	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
	mutex_enter(&db->db_mtx);
	while (db->db_state == DB_READ || db->db_state == DB_FILL)
		cv_wait(&db->db_changed, &db->db_mtx);
	if (db->db_state == DB_UNCACHED) {
		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
		spa_t *spa = db->db_objset->os_spa;

		ASSERT(db->db_buf == NULL);
		ASSERT(db->db.db_data == NULL);
		dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size));
		db->db_state = DB_FILL;
	} else if (db->db_state == DB_NOFILL) {
		dbuf_clear_data(db);
	} else {
		ASSERT3U(db->db_state, ==, DB_CACHED);
	}
	mutex_exit(&db->db_mtx);
}

void
dbuf_unoverride(dbuf_dirty_record_t *dr)
{
	dmu_buf_impl_t *db = dr->dr_dbuf;
	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
	uint64_t txg = dr->dr_txg;

	ASSERT(MUTEX_HELD(&db->db_mtx));
	/*
	 * This assert is valid because dmu_sync() expects to be called by
	 * a zilog's get_data while holding a range lock. This call only
	 * comes from dbuf_dirty() callers who must also hold a range lock.
	 */
	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
	ASSERT(db->db_level == 0);

	if (db->db_blkid == DMU_BONUS_BLKID ||
	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
		return;

	ASSERT(db->db_data_pending != dr);

	/* free this block */
	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
		zio_free(db->db_objset->os_spa, txg, bp);

	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	dr->dt.dl.dr_nopwrite = B_FALSE;
	dr->dt.dl.dr_has_raw_params = B_FALSE;

	/*
	 * Release the already-written buffer, so we leave it in
	 * a consistent dirty state. Note that all callers are
	 * modifying the buffer, so they will immediately do
	 * another (redundant) arc_release(). Therefore, leave
	 * the buf thawed to save the effort of freezing &
	 * immediately re-thawing it.
	 */
	arc_release(dr->dt.dl.dr_data, db);
}
1723
1724 /*
1725 * Evict (if its unreferenced) or clear (if its referenced) any level-0
1726 * data blocks in the free range, so that any future readers will find
1727 * empty blocks.
1728 */
1729 void
1730 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
1731 dmu_tx_t *tx)
1732 {
1733 dmu_buf_impl_t *db_search;
1734 dmu_buf_impl_t *db, *db_next;
1735 uint64_t txg = tx->tx_txg;
1736 avl_index_t where;
1737
1738 if (end_blkid > dn->dn_maxblkid &&
1739 !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
1740 end_blkid = dn->dn_maxblkid;
1741 dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
1742
1743 db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
1744 db_search->db_level = 0;
1745 db_search->db_blkid = start_blkid;
1746 db_search->db_state = DB_SEARCH;
1747
1748 mutex_enter(&dn->dn_dbufs_mtx);
1749 db = avl_find(&dn->dn_dbufs, db_search, &where);
1750 ASSERT3P(db, ==, NULL);
1751
1752 db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
1753
1754 for (; db != NULL; db = db_next) {
1755 db_next = AVL_NEXT(&dn->dn_dbufs, db);
1756 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1757
1758 if (db->db_level != 0 || db->db_blkid > end_blkid) {
1759 break;
1760 }
1761 ASSERT3U(db->db_blkid, >=, start_blkid);
1762
1763 /* found a level 0 buffer in the range */
1764 mutex_enter(&db->db_mtx);
1765 if (dbuf_undirty(db, tx)) {
1766 /* mutex has been dropped and dbuf destroyed */
1767 continue;
1768 }
1769
1770 if (db->db_state == DB_UNCACHED ||
1771 db->db_state == DB_NOFILL ||
1772 db->db_state == DB_EVICTING) {
1773 ASSERT(db->db.db_data == NULL);
1774 mutex_exit(&db->db_mtx);
1775 continue;
1776 }
1777 if (db->db_state == DB_READ || db->db_state == DB_FILL) {
1778 /* will be handled in dbuf_read_done or dbuf_rele */
1779 db->db_freed_in_flight = TRUE;
1780 mutex_exit(&db->db_mtx);
1781 continue;
1782 }
1783 if (zfs_refcount_count(&db->db_holds) == 0) {
1784 ASSERT(db->db_buf);
1785 dbuf_destroy(db);
1786 continue;
1787 }
1788 /* The dbuf is referenced */
1789
1790 if (!list_is_empty(&db->db_dirty_records)) {
1791 dbuf_dirty_record_t *dr;
1792
1793 dr = list_head(&db->db_dirty_records);
1794 if (dr->dr_txg == txg) {
1795 /*
1796 * This buffer is "in-use", re-adjust the file
1797 * size to reflect that this buffer may
1798 * contain new data when we sync.
1799 */
1800 if (db->db_blkid != DMU_SPILL_BLKID &&
1801 db->db_blkid > dn->dn_maxblkid)
1802 dn->dn_maxblkid = db->db_blkid;
1803 dbuf_unoverride(dr);
1804 } else {
1805 /*
1806 * This dbuf is not dirty in the open context.
1807 				 * Either uncache it (if it's not referenced in
1808 * the open context) or reset its contents to
1809 * empty.
1810 */
1811 dbuf_fix_old_data(db, txg);
1812 }
1813 }
1814 		/* clear the contents if it's cached */
1815 if (db->db_state == DB_CACHED) {
1816 ASSERT(db->db.db_data != NULL);
1817 arc_release(db->db_buf, db);
1818 rw_enter(&db->db_rwlock, RW_WRITER);
1819 bzero(db->db.db_data, db->db.db_size);
1820 rw_exit(&db->db_rwlock);
1821 arc_buf_freeze(db->db_buf);
1822 }
1823
1824 mutex_exit(&db->db_mtx);
1825 }
1826
1827 kmem_free(db_search, sizeof (dmu_buf_impl_t));
1828 mutex_exit(&dn->dn_dbufs_mtx);
1829 }
1830
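/*
 * Resize a dbuf within the given transaction: dirty it, allocate a new
 * ARC buffer of the requested size, copy the old contents across
 * (zero-filling any growth), and update the dirty record's space
 * accounting.
 */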
1831 void
1832 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
1833 {
1834 arc_buf_t *buf, *obuf;
1835 dbuf_dirty_record_t *dr;
1836 int osize = db->db.db_size;
1837 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
1838 dnode_t *dn;
1839
1840 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
1841
1842 DB_DNODE_ENTER(db);
1843 dn = DB_DNODE(db);
1844
1845 /*
1846 * XXX we should be doing a dbuf_read, checking the return
1847 * value and returning that up to our callers
1848 */
1849 dmu_buf_will_dirty(&db->db, tx);
1850
1851 /* create the data buffer for the new block */
1852 buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
1853
1854 /* copy old block data to the new block */
1855 obuf = db->db_buf;
1856 bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
1857 /* zero the remainder */
1858 if (size > osize)
1859 bzero((uint8_t *)buf->b_data + osize, size - osize);
1860
1861 mutex_enter(&db->db_mtx);
1862 dbuf_set_data(db, buf);
1863 arc_buf_destroy(obuf, db);
1864 db->db.db_size = size;
1865
1866 dr = list_head(&db->db_dirty_records);
1867 if (db->db_level == 0)
1868 dr->dt.dl.dr_data = buf;
1869 ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
1870 ASSERT3U(dr->dr_accounted, ==, osize);
1871 dr->dr_accounted = size;
1872 mutex_exit(&db->db_mtx);
1873
1874 dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
1875 DB_DNODE_EXIT(db);
1876 }
1877
1878 void
1879 dbuf_release_bp(dmu_buf_impl_t *db)
1880 {
1881 objset_t *os __maybe_unused = db->db_objset;
1882
1883 ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
1884 ASSERT(arc_released(os->os_phys_buf) ||
1885 list_link_active(&os->os_dsl_dataset->ds_synced_link));
1886 ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
1887
1888 (void) arc_release(db->db_buf, db);
1889 }
1890
1891 /*
1892 * We already have a dirty record for this TXG, and we are being
1893 * dirtied again.
1894 */
1895 static void
1896 dbuf_redirty(dbuf_dirty_record_t *dr)
1897 {
1898 dmu_buf_impl_t *db = dr->dr_dbuf;
1899
1900 ASSERT(MUTEX_HELD(&db->db_mtx));
1901
1902 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
1903 /*
1904 * If this buffer has already been written out,
1905 * we now need to reset its state.
1906 */
1907 dbuf_unoverride(dr);
1908 if (db->db.db_object != DMU_META_DNODE_OBJECT &&
1909 db->db_state != DB_NOFILL) {
1910 /* Already released on initial dirty, so just thaw. */
1911 ASSERT(arc_released(db->db_buf));
1912 arc_buf_thaw(db->db_buf);
1913 }
1914 }
1915 }
1916
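/*
 * Mark a dbuf dirty in the given transaction. This creates (or reuses)
 * a dirty record for the txg, charges the space to the objset, and
 * recursively dirties the parent indirect block (or, at the top level,
 * links the record onto the dnode's dirty list). Returns the dirty
 * record for this txg.
 */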
1917 dbuf_dirty_record_t *
1918 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
1919 {
1920 dnode_t *dn;
1921 objset_t *os;
1922 dbuf_dirty_record_t *dr, *dr_next, *dr_head;
1923 int txgoff = tx->tx_txg & TXG_MASK;
1924 boolean_t drop_struct_rwlock = B_FALSE;
1925
1926 ASSERT(tx->tx_txg != 0);
1927 ASSERT(!zfs_refcount_is_zero(&db->db_holds));
1928 DMU_TX_DIRTY_BUF(tx, db);
1929
1930 DB_DNODE_ENTER(db);
1931 dn = DB_DNODE(db);
1932 /*
1933 * Shouldn't dirty a regular buffer in syncing context. Private
1934 * objects may be dirtied in syncing context, but only if they
1935 * were already pre-dirtied in open context.
1936 */
1937 #ifdef DEBUG
1938 if (dn->dn_objset->os_dsl_dataset != NULL) {
1939 rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
1940 RW_READER, FTAG);
1941 }
1942 ASSERT(!dmu_tx_is_syncing(tx) ||
1943 BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
1944 DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
1945 dn->dn_objset->os_dsl_dataset == NULL);
1946 if (dn->dn_objset->os_dsl_dataset != NULL)
1947 rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
1948 #endif
1949 /*
1950 * We make this assert for private objects as well, but after we
1951 * check if we're already dirty. They are allowed to re-dirty
1952 * in syncing context.
1953 */
1954 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
1955 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
1956 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
1957
1958 mutex_enter(&db->db_mtx);
1959 /*
1960 * XXX make this true for indirects too? The problem is that
1961 * transactions created with dmu_tx_create_assigned() from
1962 * syncing context don't bother holding ahead.
1963 */
1964 ASSERT(db->db_level != 0 ||
1965 db->db_state == DB_CACHED || db->db_state == DB_FILL ||
1966 db->db_state == DB_NOFILL);
1967
1968 mutex_enter(&dn->dn_mtx);
1969 /*
1970 * Don't set dirtyctx to SYNC if we're just modifying this as we
1971 * initialize the objset.
1972 */
1973 if (dn->dn_dirtyctx == DN_UNDIRTIED) {
1974 if (dn->dn_objset->os_dsl_dataset != NULL) {
1975 rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
1976 RW_READER, FTAG);
1977 }
1978 if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
1979 dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ?
1980 DN_DIRTY_SYNC : DN_DIRTY_OPEN);
1981 ASSERT(dn->dn_dirtyctx_firstset == NULL);
1982 dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
1983 }
1984 if (dn->dn_objset->os_dsl_dataset != NULL) {
1985 rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
1986 FTAG);
1987 }
1988 }
1989
1990 if (tx->tx_txg > dn->dn_dirty_txg)
1991 dn->dn_dirty_txg = tx->tx_txg;
1992 mutex_exit(&dn->dn_mtx);
1993
1994 if (db->db_blkid == DMU_SPILL_BLKID)
1995 dn->dn_have_spill = B_TRUE;
1996
1997 /*
1998 * If this buffer is already dirty, we're done.
1999 */
2000 dr_head = list_head(&db->db_dirty_records);
2001 ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg ||
2002 db->db.db_object == DMU_META_DNODE_OBJECT);
2003 dr_next = dbuf_find_dirty_lte(db, tx->tx_txg);
2004 if (dr_next && dr_next->dr_txg == tx->tx_txg) {
2005 DB_DNODE_EXIT(db);
2006
2007 dbuf_redirty(dr_next);
2008 mutex_exit(&db->db_mtx);
2009 return (dr_next);
2010 }
2011
2012 /*
2013 * Only valid if not already dirty.
2014 */
2015 ASSERT(dn->dn_object == 0 ||
2016 dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
2017 (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
2018
2019 ASSERT3U(dn->dn_nlevels, >, db->db_level);
2020
2021 /*
2022 * We should only be dirtying in syncing context if it's the
2023 * mos or we're initializing the os or it's a special object.
2024 * However, we are allowed to dirty in syncing context provided
2025 * we already dirtied it in open context. Hence we must make
2026 * this assertion only if we're not already dirty.
2027 */
2028 os = dn->dn_objset;
2029 VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
2030 #ifdef DEBUG
2031 if (dn->dn_objset->os_dsl_dataset != NULL)
2032 rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
2033 ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
2034 os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
2035 if (dn->dn_objset->os_dsl_dataset != NULL)
2036 rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
2037 #endif
2038 ASSERT(db->db.db_size != 0);
2039
2040 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
2041
2042 if (db->db_blkid != DMU_BONUS_BLKID) {
2043 dmu_objset_willuse_space(os, db->db.db_size, tx);
2044 }
2045
2046 /*
2047 * If this buffer is dirty in an old transaction group we need
2048 * to make a copy of it so that the changes we make in this
2049 * transaction group won't leak out when we sync the older txg.
2050 */
2051 dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
2052 list_link_init(&dr->dr_dirty_node);
2053 list_link_init(&dr->dr_dbuf_node);
2054 if (db->db_level == 0) {
2055 void *data_old = db->db_buf;
2056
2057 if (db->db_state != DB_NOFILL) {
2058 if (db->db_blkid == DMU_BONUS_BLKID) {
2059 dbuf_fix_old_data(db, tx->tx_txg);
2060 data_old = db->db.db_data;
2061 } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
2062 /*
2063 * Release the data buffer from the cache so
2064 * that we can modify it without impacting
2065 * possible other users of this cached data
2066 * block. Note that indirect blocks and
2067 * private objects are not released until the
2068 * syncing state (since they are only modified
2069 * then).
2070 */
2071 arc_release(db->db_buf, db);
2072 dbuf_fix_old_data(db, tx->tx_txg);
2073 data_old = db->db_buf;
2074 }
2075 ASSERT(data_old != NULL);
2076 }
2077 dr->dt.dl.dr_data = data_old;
2078 } else {
2079 mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL);
2080 list_create(&dr->dt.di.dr_children,
2081 sizeof (dbuf_dirty_record_t),
2082 offsetof(dbuf_dirty_record_t, dr_dirty_node));
2083 }
2084 if (db->db_blkid != DMU_BONUS_BLKID)
2085 dr->dr_accounted = db->db.db_size;
2086 dr->dr_dbuf = db;
2087 dr->dr_txg = tx->tx_txg;
2088 list_insert_before(&db->db_dirty_records, dr_next, dr);
2089
2090 /*
2091 * We could have been freed_in_flight between the dbuf_noread
2092 * and dbuf_dirty. We win, as though the dbuf_noread() had
2093 * happened after the free.
2094 */
2095 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
2096 db->db_blkid != DMU_SPILL_BLKID) {
2097 mutex_enter(&dn->dn_mtx);
2098 if (dn->dn_free_ranges[txgoff] != NULL) {
2099 range_tree_clear(dn->dn_free_ranges[txgoff],
2100 db->db_blkid, 1);
2101 }
2102 mutex_exit(&dn->dn_mtx);
2103 db->db_freed_in_flight = FALSE;
2104 }
2105
2106 /*
2107 * This buffer is now part of this txg
2108 */
2109 dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
2110 db->db_dirtycnt += 1;
2111 ASSERT3U(db->db_dirtycnt, <=, 3);
2112
2113 mutex_exit(&db->db_mtx);
2114
2115 if (db->db_blkid == DMU_BONUS_BLKID ||
2116 db->db_blkid == DMU_SPILL_BLKID) {
2117 mutex_enter(&dn->dn_mtx);
2118 ASSERT(!list_link_active(&dr->dr_dirty_node));
2119 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
2120 mutex_exit(&dn->dn_mtx);
2121 dnode_setdirty(dn, tx);
2122 DB_DNODE_EXIT(db);
2123 return (dr);
2124 }
2125
2126 if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
2127 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2128 drop_struct_rwlock = B_TRUE;
2129 }
2130
2131 /*
2132 * If we are overwriting a dedup BP, then unless it is snapshotted,
2133 * when we get to syncing context we will need to decrement its
2134 * refcount in the DDT. Prefetch the relevant DDT block so that
2135 * syncing context won't have to wait for the i/o.
2136 */
2137 if (db->db_blkptr != NULL) {
2138 db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
2139 ddt_prefetch(os->os_spa, db->db_blkptr);
2140 dmu_buf_unlock_parent(db, dblt, FTAG);
2141 }
2142
2143 /*
2144 * We need to hold the dn_struct_rwlock to make this assertion,
2145 * because it protects dn_phys / dn_next_nlevels from changing.
2146 */
2147 ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
2148 dn->dn_phys->dn_nlevels > db->db_level ||
2149 dn->dn_next_nlevels[txgoff] > db->db_level ||
2150 dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
2151 dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
2152
2153
2154 if (db->db_level == 0) {
2155 ASSERT(!db->db_objset->os_raw_receive ||
2156 dn->dn_maxblkid >= db->db_blkid);
2157 dnode_new_blkid(dn, db->db_blkid, tx,
2158 drop_struct_rwlock, B_FALSE);
2159 ASSERT(dn->dn_maxblkid >= db->db_blkid);
2160 }
2161
2162 if (db->db_level+1 < dn->dn_nlevels) {
2163 dmu_buf_impl_t *parent = db->db_parent;
2164 dbuf_dirty_record_t *di;
2165 int parent_held = FALSE;
2166
2167 if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
2168 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2169 parent = dbuf_hold_level(dn, db->db_level + 1,
2170 db->db_blkid >> epbs, FTAG);
2171 ASSERT(parent != NULL);
2172 parent_held = TRUE;
2173 }
2174 if (drop_struct_rwlock)
2175 rw_exit(&dn->dn_struct_rwlock);
2176 ASSERT3U(db->db_level + 1, ==, parent->db_level);
2177 di = dbuf_dirty(parent, tx);
2178 if (parent_held)
2179 dbuf_rele(parent, FTAG);
2180
2181 mutex_enter(&db->db_mtx);
2182 /*
2183 * Since we've dropped the mutex, it's possible that
2184 * dbuf_undirty() might have changed this out from under us.
2185 */
2186 if (list_head(&db->db_dirty_records) == dr ||
2187 dn->dn_object == DMU_META_DNODE_OBJECT) {
2188 mutex_enter(&di->dt.di.dr_mtx);
2189 ASSERT3U(di->dr_txg, ==, tx->tx_txg);
2190 ASSERT(!list_link_active(&dr->dr_dirty_node));
2191 list_insert_tail(&di->dt.di.dr_children, dr);
2192 mutex_exit(&di->dt.di.dr_mtx);
2193 dr->dr_parent = di;
2194 }
2195 mutex_exit(&db->db_mtx);
2196 } else {
2197 ASSERT(db->db_level + 1 == dn->dn_nlevels);
2198 ASSERT(db->db_blkid < dn->dn_nblkptr);
2199 ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
2200 mutex_enter(&dn->dn_mtx);
2201 ASSERT(!list_link_active(&dr->dr_dirty_node));
2202 list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
2203 mutex_exit(&dn->dn_mtx);
2204 if (drop_struct_rwlock)
2205 rw_exit(&dn->dn_struct_rwlock);
2206 }
2207
2208 dnode_setdirty(dn, tx);
2209 DB_DNODE_EXIT(db);
2210 return (dr);
2211 }
2212
2213 /*
2214 * Undirty a buffer in the transaction group referenced by the given
2215 * transaction. Return whether this evicted the dbuf.
2216 */
2217 static boolean_t
2218 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
2219 {
2220 dnode_t *dn;
2221 uint64_t txg = tx->tx_txg;
2222 dbuf_dirty_record_t *dr;
2223
2224 ASSERT(txg != 0);
2225
2226 /*
2227 * Due to our use of dn_nlevels below, this can only be called
2228 * in open context, unless we are operating on the MOS.
2229 * From syncing context, dn_nlevels may be different from the
2230 * dn_nlevels used when dbuf was dirtied.
2231 */
2232 ASSERT(db->db_objset ==
2233 dmu_objset_pool(db->db_objset)->dp_meta_objset ||
2234 txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
2235 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2236 ASSERT0(db->db_level);
2237 ASSERT(MUTEX_HELD(&db->db_mtx));
2238
2239 /*
2240 * If this buffer is not dirty, we're done.
2241 */
2242 dr = dbuf_find_dirty_eq(db, txg);
2243 if (dr == NULL)
2244 return (B_FALSE);
2245 ASSERT(dr->dr_dbuf == db);
2246
2247 DB_DNODE_ENTER(db);
2248 dn = DB_DNODE(db);
2249
2250 dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
2251
2252 ASSERT(db->db.db_size != 0);
2253
2254 dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
2255 dr->dr_accounted, txg);
2256
2257 list_remove(&db->db_dirty_records, dr);
2258
2259 /*
2260 * Note that there are three places in dbuf_dirty()
2261 * where this dirty record may be put on a list.
2262 * Make sure to do a list_remove corresponding to
2263 * every one of those list_insert calls.
2264 */
2265 if (dr->dr_parent) {
2266 mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
2267 list_remove(&dr->dr_parent->dt.di.dr_children, dr);
2268 mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
2269 } else if (db->db_blkid == DMU_SPILL_BLKID ||
2270 db->db_level + 1 == dn->dn_nlevels) {
2271 ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
2272 mutex_enter(&dn->dn_mtx);
2273 list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
2274 mutex_exit(&dn->dn_mtx);
2275 }
2276 DB_DNODE_EXIT(db);
2277
2278 if (db->db_state != DB_NOFILL) {
2279 dbuf_unoverride(dr);
2280
2281 ASSERT(db->db_buf != NULL);
2282 ASSERT(dr->dt.dl.dr_data != NULL);
2283 if (dr->dt.dl.dr_data != db->db_buf)
2284 arc_buf_destroy(dr->dt.dl.dr_data, db);
2285 }
2286
2287 kmem_free(dr, sizeof (dbuf_dirty_record_t));
2288
2289 ASSERT(db->db_dirtycnt > 0);
2290 db->db_dirtycnt -= 1;
2291
2292 if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
2293 ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
2294 dbuf_destroy(db);
2295 return (B_TRUE);
2296 }
2297
2298 return (B_FALSE);
2299 }
2300
2301 static void
2302 dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
2303 {
2304 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2305
2306 ASSERT(tx->tx_txg != 0);
2307 ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2308
2309 /*
2310 * Quick check for dirtiness. For already dirty blocks, this
2311 	 * reduces the runtime of this function by >90%, and improves overall
2312 	 * performance by 50% for some workloads (e.g. file deletion with
2313 	 * indirect blocks cached).
2314 */
2315 mutex_enter(&db->db_mtx);
2316
2317 if (db->db_state == DB_CACHED) {
2318 dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
2319 /*
2320 * It's possible that it is already dirty but not cached,
2321 * because there are some calls to dbuf_dirty() that don't
2322 * go through dmu_buf_will_dirty().
2323 */
2324 if (dr != NULL) {
2325 /* This dbuf is already dirty and cached. */
2326 dbuf_redirty(dr);
2327 mutex_exit(&db->db_mtx);
2328 return;
2329 }
2330 }
2331 mutex_exit(&db->db_mtx);
2332
2333 DB_DNODE_ENTER(db);
2334 if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
2335 flags |= DB_RF_HAVESTRUCT;
2336 DB_DNODE_EXIT(db);
2337 (void) dbuf_read(db, NULL, flags);
2338 (void) dbuf_dirty(db, tx);
2339 }
2340
2341 void
2342 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
2343 {
2344 dmu_buf_will_dirty_impl(db_fake,
2345 DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
2346 }
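/*
 * Typical calling pattern for the above (a sketch only; the transaction
 * setup shown is the standard dmu_tx sequence, with error handling
 * elided and the object/offset/size names assumed):
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, offset, size);
 *	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 *	dmu_buf_will_dirty(db, tx);
 *	... modify db->db_data in place ...
 *	dmu_tx_commit(tx);
 */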
2347
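/*
 * Report whether this dbuf already has a dirty record for the given
 * transaction's txg.
 */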
2348 boolean_t
2349 dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
2350 {
2351 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2352 dbuf_dirty_record_t *dr;
2353
2354 mutex_enter(&db->db_mtx);
2355 dr = dbuf_find_dirty_eq(db, tx->tx_txg);
2356 mutex_exit(&db->db_mtx);
2357 return (dr != NULL);
2358 }
2359
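/*
 * Declare that the caller will neither read nor fill this block's
 * contents: force the dbuf into DB_NOFILL and then run the normal
 * will-fill path to dirty it.
 */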
2360 void
2361 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
2362 {
2363 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2364
2365 db->db_state = DB_NOFILL;
2366
2367 dmu_buf_will_fill(db_fake, tx);
2368 }
2369
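/*
 * Prepare the dbuf to be completely overwritten by the caller in this
 * transaction: skip reading the old contents (dbuf_noread()) and dirty
 * the buffer.
 */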
2370 void
2371 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
2372 {
2373 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2374
2375 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2376 ASSERT(tx->tx_txg != 0);
2377 ASSERT(db->db_level == 0);
2378 ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2379
2380 ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
2381 dmu_tx_private_ok(tx));
2382
2383 dbuf_noread(db);
2384 (void) dbuf_dirty(db, tx);
2385 }
2386
2387 /*
2388 * This function is effectively the same as dmu_buf_will_dirty(), but
2389 * indicates the caller expects raw encrypted data in the db, and provides
2390 * the crypt params (byteorder, salt, iv, mac) which should be stored in the
2391 * blkptr_t when this dbuf is written. This is only used for blocks of
2392 * dnodes, during raw receive.
2393 */
2394 void
2395 dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
2396 const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
2397 {
2398 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2399 dbuf_dirty_record_t *dr;
2400
2401 /*
2402 * dr_has_raw_params is only processed for blocks of dnodes
2403 * (see dbuf_sync_dnode_leaf_crypt()).
2404 */
2405 ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
2406 ASSERT3U(db->db_level, ==, 0);
2407 ASSERT(db->db_objset->os_raw_receive);
2408
2409 dmu_buf_will_dirty_impl(db_fake,
2410 DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
2411
2412 dr = dbuf_find_dirty_eq(db, tx->tx_txg);
2413
2414 ASSERT3P(dr, !=, NULL);
2415
2416 dr->dt.dl.dr_has_raw_params = B_TRUE;
2417 dr->dt.dl.dr_byteorder = byteorder;
2418 bcopy(salt, dr->dt.dl.dr_salt, ZIO_DATA_SALT_LEN);
2419 bcopy(iv, dr->dt.dl.dr_iv, ZIO_DATA_IV_LEN);
2420 bcopy(mac, dr->dt.dl.dr_mac, ZIO_DATA_MAC_LEN);
2421 }
2422
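/*
 * Mark the current dirty record as overridden by the given block
 * pointer, so that syncing context writes that bp instead of the
 * buffer contents (used by dmu_buf_redact() below).
 */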
2423 static void
2424 dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
2425 {
2426 struct dirty_leaf *dl;
2427 dbuf_dirty_record_t *dr;
2428
2429 dr = list_head(&db->db_dirty_records);
2430 ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
2431 dl = &dr->dt.dl;
2432 dl->dr_overridden_by = *bp;
2433 dl->dr_override_state = DR_OVERRIDDEN;
2434 dl->dr_overridden_by.blk_birth = dr->dr_txg;
2435 }
2436
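/*
 * Complete a fill started by dmu_buf_will_fill(): move the dbuf from
 * DB_FILL to DB_CACHED (first zeroing it if the block was freed while
 * the fill was in flight) and wake any waiters.
 */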
2437 /* ARGSUSED */
2438 void
2439 dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx)
2440 {
2441 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2442 mutex_enter(&db->db_mtx);
2443 DBUF_VERIFY(db);
2444
2445 if (db->db_state == DB_FILL) {
2446 if (db->db_level == 0 && db->db_freed_in_flight) {
2447 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2448 /* we were freed while filling */
2449 /* XXX dbuf_undirty? */
2450 bzero(db->db.db_data, db->db.db_size);
2451 db->db_freed_in_flight = FALSE;
2452 }
2453 db->db_state = DB_CACHED;
2454 cv_broadcast(&db->db_changed);
2455 }
2456 mutex_exit(&db->db_mtx);
2457 }
2458
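/*
 * Store pre-compressed data directly in the dirty record's override
 * block pointer as an embedded BP, so the payload lives in the bp
 * itself and no ordinary block is allocated at sync time.
 */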
2459 void
2460 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
2461 bp_embedded_type_t etype, enum zio_compress comp,
2462 int uncompressed_size, int compressed_size, int byteorder,
2463 dmu_tx_t *tx)
2464 {
2465 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2466 struct dirty_leaf *dl;
2467 dmu_object_type_t type;
2468 dbuf_dirty_record_t *dr;
2469
2470 if (etype == BP_EMBEDDED_TYPE_DATA) {
2471 ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
2472 SPA_FEATURE_EMBEDDED_DATA));
2473 }
2474
2475 DB_DNODE_ENTER(db);
2476 type = DB_DNODE(db)->dn_type;
2477 DB_DNODE_EXIT(db);
2478
2479 ASSERT0(db->db_level);
2480 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2481
2482 dmu_buf_will_not_fill(dbuf, tx);
2483
2484 dr = list_head(&db->db_dirty_records);
2485 ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
2486 dl = &dr->dt.dl;
2487 encode_embedded_bp_compressed(&dl->dr_overridden_by,
2488 data, comp, uncompressed_size, compressed_size);
2489 BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
2490 BP_SET_TYPE(&dl->dr_overridden_by, type);
2491 BP_SET_LEVEL(&dl->dr_overridden_by, 0);
2492 BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
2493
2494 dl->dr_override_state = DR_OVERRIDDEN;
2495 dl->dr_overridden_by.blk_birth = dr->dr_txg;
2496 }
2497
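/*
 * Replace this block's contents with a redacted block pointer: a
 * hole-like bp that records only the block's logical size and birth
 * txg.
 */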
2498 void
2499 dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)
2500 {
2501 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
2502 dmu_object_type_t type;
2503 ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset,
2504 SPA_FEATURE_REDACTED_DATASETS));
2505
2506 DB_DNODE_ENTER(db);
2507 type = DB_DNODE(db)->dn_type;
2508 DB_DNODE_EXIT(db);
2509
2510 ASSERT0(db->db_level);
2511 dmu_buf_will_not_fill(dbuf, tx);
2512
2513 blkptr_t bp = { { { {0} } } };
2514 BP_SET_TYPE(&bp, type);
2515 BP_SET_LEVEL(&bp, 0);
2516 BP_SET_BIRTH(&bp, tx->tx_txg, 0);
2517 BP_SET_REDACTED(&bp);
2518 BPE_SET_LSIZE(&bp, dbuf->db_size);
2519
2520 dbuf_override_impl(db, &bp, tx);
2521 }
2522
2523 /*
2524 * Directly assign a provided arc buf to a given dbuf if it's not referenced
2525 * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
2526 */
2527 void
2528 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
2529 {
2530 ASSERT(!zfs_refcount_is_zero(&db->db_holds));
2531 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
2532 ASSERT(db->db_level == 0);
2533 ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
2534 ASSERT(buf != NULL);
2535 ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);
2536 ASSERT(tx->tx_txg != 0);
2537
2538 arc_return_buf(buf, db);
2539 ASSERT(arc_released(buf));
2540
2541 mutex_enter(&db->db_mtx);
2542
2543 while (db->db_state == DB_READ || db->db_state == DB_FILL)
2544 cv_wait(&db->db_changed, &db->db_mtx);
2545
2546 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
2547
2548 if (db->db_state == DB_CACHED &&
2549 zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
2550 /*
2551 * In practice, we will never have a case where we have an
2552 * encrypted arc buffer while additional holds exist on the
2553 		 * dbuf. We don't handle this here, so we simply assert that
2554 * fact instead.
2555 */
2556 ASSERT(!arc_is_encrypted(buf));
2557 mutex_exit(&db->db_mtx);
2558 (void) dbuf_dirty(db, tx);
2559 bcopy(buf->b_data, db->db.db_data, db->db.db_size);
2560 arc_buf_destroy(buf, db);
2561 xuio_stat_wbuf_copied();
2562 return;
2563 }
2564
2565 xuio_stat_wbuf_nocopy();
2566 if (db->db_state == DB_CACHED) {
2567 dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
2568
2569 ASSERT(db->db_buf != NULL);
2570 if (dr != NULL && dr->dr_txg == tx->tx_txg) {
2571 ASSERT(dr->dt.dl.dr_data == db->db_buf);
2572
2573 if (!arc_released(db->db_buf)) {
2574 ASSERT(dr->dt.dl.dr_override_state ==
2575 DR_OVERRIDDEN);
2576 arc_release(db->db_buf, db);
2577 }
2578 dr->dt.dl.dr_data = buf;
2579 arc_buf_destroy(db->db_buf, db);
2580 } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
2581 arc_release(db->db_buf, db);
2582 arc_buf_destroy(db->db_buf, db);
2583 }
2584 db->db_buf = NULL;
2585 }
2586 ASSERT(db->db_buf == NULL);
2587 dbuf_set_data(db, buf);
2588 db->db_state = DB_FILL;
2589 mutex_exit(&db->db_mtx);
2590 (void) dbuf_dirty(db, tx);
2591 dmu_buf_fill_done(&db->db, tx);
2592 }
2593
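/*
 * Final teardown of a dbuf with no remaining holds: free its data,
 * remove it from the dbuf cache, the hash table, and the dnode's dbuf
 * list (bonus dbufs are in neither), then drop our hold on the dnode
 * and, for a child of an indirect block, on its parent dbuf.
 */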
2594 void
2595 dbuf_destroy(dmu_buf_impl_t *db)
2596 {
2597 dnode_t *dn;
2598 dmu_buf_impl_t *parent = db->db_parent;
2599 dmu_buf_impl_t *dndb;
2600
2601 ASSERT(MUTEX_HELD(&db->db_mtx));
2602 ASSERT(zfs_refcount_is_zero(&db->db_holds));
2603
2604 if (db->db_buf != NULL) {
2605 arc_buf_destroy(db->db_buf, db);
2606 db->db_buf = NULL;
2607 }
2608
2609 if (db->db_blkid == DMU_BONUS_BLKID) {
2610 int slots = DB_DNODE(db)->dn_num_slots;
2611 int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
2612 if (db->db.db_data != NULL) {
2613 kmem_free(db->db.db_data, bonuslen);
2614 arc_space_return(bonuslen, ARC_SPACE_BONUS);
2615 db->db_state = DB_UNCACHED;
2616 }
2617 }
2618
2619 dbuf_clear_data(db);
2620
2621 if (multilist_link_active(&db->db_cache_link)) {
2622 ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
2623 db->db_caching_status == DB_DBUF_METADATA_CACHE);
2624
2625 multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
2626 (void) zfs_refcount_remove_many(
2627 &dbuf_caches[db->db_caching_status].size,
2628 db->db.db_size, db);
2629
2630 if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
2631 DBUF_STAT_BUMPDOWN(metadata_cache_count);
2632 } else {
2633 DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
2634 DBUF_STAT_BUMPDOWN(cache_count);
2635 DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
2636 db->db.db_size);
2637 }
2638 db->db_caching_status = DB_NO_CACHE;
2639 }
2640
2641 ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
2642 ASSERT(db->db_data_pending == NULL);
2643
2644 db->db_state = DB_EVICTING;
2645 db->db_blkptr = NULL;
2646
2647 /*
2648 * Now that db_state is DB_EVICTING, nobody else can find this via
2649 * the hash table. We can now drop db_mtx, which allows us to
2650 * acquire the dn_dbufs_mtx.
2651 */
2652 mutex_exit(&db->db_mtx);
2653
2654 DB_DNODE_ENTER(db);
2655 dn = DB_DNODE(db);
2656 dndb = dn->dn_dbuf;
2657 if (db->db_blkid != DMU_BONUS_BLKID) {
2658 boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
2659 if (needlock)
2660 mutex_enter_nested(&dn->dn_dbufs_mtx,
2661 NESTED_SINGLE);
2662 avl_remove(&dn->dn_dbufs, db);
2663 atomic_dec_32(&dn->dn_dbufs_count);
2664 membar_producer();
2665 DB_DNODE_EXIT(db);
2666 if (needlock)
2667 mutex_exit(&dn->dn_dbufs_mtx);
2668 /*
2669 * Decrementing the dbuf count means that the hold corresponding
2670 * to the removed dbuf is no longer discounted in dnode_move(),
2671 * so the dnode cannot be moved until after we release the hold.
2672 * The membar_producer() ensures visibility of the decremented
2673 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
2674 * release any lock.
2675 */
2676 mutex_enter(&dn->dn_mtx);
2677 dnode_rele_and_unlock(dn, db, B_TRUE);
2678 db->db_dnode_handle = NULL;
2679
2680 dbuf_hash_remove(db);
2681 } else {
2682 DB_DNODE_EXIT(db);
2683 }
2684
2685 ASSERT(zfs_refcount_is_zero(&db->db_holds));
2686
2687 db->db_parent = NULL;
2688
2689 ASSERT(db->db_buf == NULL);
2690 ASSERT(db->db.db_data == NULL);
2691 ASSERT(db->db_hash_next == NULL);
2692 ASSERT(db->db_blkptr == NULL);
2693 ASSERT(db->db_data_pending == NULL);
2694 ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
2695 ASSERT(!multilist_link_active(&db->db_cache_link));
2696
2697 kmem_cache_free(dbuf_kmem_cache, db);
2698 arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
2699
2700 /*
2701 * If this dbuf is referenced from an indirect dbuf,
2702 * decrement the ref count on the indirect dbuf.
2703 */
2704 if (parent && parent != dndb) {
2705 mutex_enter(&parent->db_mtx);
2706 dbuf_rele_and_unlock(parent, db, B_TRUE);
2707 }
2708 }
2709
2710 /*
2711 * Note: While bpp will always be updated if the function returns success,
2712 * parentp will not be updated if the dnode does not have dn_dbuf filled in;
2713 * this happens when the dnode is the meta-dnode, or {user|group|project}used
2714 * object.
2715 */
2716 __attribute__((always_inline))
2717 static inline int
2718 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
2719 dmu_buf_impl_t **parentp, blkptr_t **bpp)
2720 {
2721 *parentp = NULL;
2722 *bpp = NULL;
2723
2724 ASSERT(blkid != DMU_BONUS_BLKID);
2725
2726 if (blkid == DMU_SPILL_BLKID) {
2727 mutex_enter(&dn->dn_mtx);
2728 if (dn->dn_have_spill &&
2729 (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
2730 *bpp = DN_SPILL_BLKPTR(dn->dn_phys);
2731 else
2732 *bpp = NULL;
2733 dbuf_add_ref(dn->dn_dbuf, NULL);
2734 *parentp = dn->dn_dbuf;
2735 mutex_exit(&dn->dn_mtx);
2736 return (0);
2737 }
2738
2739 int nlevels =
2740 (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
2741 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
2742
2743 ASSERT3U(level * epbs, <, 64);
2744 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2745 /*
2746 * This assertion shouldn't trip as long as the max indirect block size
2747 * is less than 1M. The reason for this is that up to that point,
2748 * the number of levels required to address an entire object with blocks
2749 * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In
2750 	 * other words, if N * epbs + 1 > 64, then as long as (N-1) * epbs + 1 > 55
2751 	 * (i.e. N-1 levels suffice to address the entire object), objects will all use at most
2752 * N-1 levels and the assertion won't overflow. However, once epbs is
2753 * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be
2754 * enough to address an entire object, so objects will have 5 levels,
2755 * but then this assertion will overflow.
2756 *
2757 * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
2758 * need to redo this logic to handle overflows.
2759 */
2760 ASSERT(level >= nlevels ||
2761 ((nlevels - level - 1) * epbs) +
2762 highbit64(dn->dn_phys->dn_nblkptr) <= 64);
2763 if (level >= nlevels ||
2764 blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
2765 ((nlevels - level - 1) * epbs)) ||
2766 (fail_sparse &&
2767 blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
2768 /* the buffer has no parent yet */
2769 return (SET_ERROR(ENOENT));
2770 } else if (level < nlevels-1) {
2771 /* this block is referenced from an indirect block */
2772 int err;
2773
2774 err = dbuf_hold_impl(dn, level + 1,
2775 blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
2776
2777 if (err)
2778 return (err);
2779 err = dbuf_read(*parentp, NULL,
2780 (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
2781 if (err) {
2782 dbuf_rele(*parentp, NULL);
2783 *parentp = NULL;
2784 return (err);
2785 }
2786 rw_enter(&(*parentp)->db_rwlock, RW_READER);
2787 *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
2788 (blkid & ((1ULL << epbs) - 1));
2789 if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
2790 ASSERT(BP_IS_HOLE(*bpp));
2791 rw_exit(&(*parentp)->db_rwlock);
2792 return (0);
2793 } else {
2794 /* the block is referenced from the dnode */
2795 ASSERT3U(level, ==, nlevels-1);
2796 ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
2797 blkid < dn->dn_phys->dn_nblkptr);
2798 if (dn->dn_dbuf) {
2799 dbuf_add_ref(dn->dn_dbuf, NULL);
2800 *parentp = dn->dn_dbuf;
2801 }
2802 *bpp = &dn->dn_phys->dn_blkptr[blkid];
2803 return (0);
2804 }
2805 }
2806
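/*
 * Allocate and initialize a dbuf for the given dnode/level/blkid and
 * insert it into the hash table and the dnode's dbuf list. If another
 * thread races us into the hash table first, free ours and return the
 * existing dbuf instead.
 */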
2807 static dmu_buf_impl_t *
2808 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
2809 dmu_buf_impl_t *parent, blkptr_t *blkptr)
2810 {
2811 objset_t *os = dn->dn_objset;
2812 dmu_buf_impl_t *db, *odb;
2813
2814 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2815 ASSERT(dn->dn_type != DMU_OT_NONE);
2816
2817 db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
2818
2819 list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t),
2820 offsetof(dbuf_dirty_record_t, dr_dbuf_node));
2821
2822 db->db_objset = os;
2823 db->db.db_object = dn->dn_object;
2824 db->db_level = level;
2825 db->db_blkid = blkid;
2826 db->db_dirtycnt = 0;
2827 db->db_dnode_handle = dn->dn_handle;
2828 db->db_parent = parent;
2829 db->db_blkptr = blkptr;
2830
2831 db->db_user = NULL;
2832 db->db_user_immediate_evict = FALSE;
2833 db->db_freed_in_flight = FALSE;
2834 db->db_pending_evict = FALSE;
2835
2836 if (blkid == DMU_BONUS_BLKID) {
2837 ASSERT3P(parent, ==, dn->dn_dbuf);
2838 db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
2839 (dn->dn_nblkptr-1) * sizeof (blkptr_t);
2840 ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
2841 db->db.db_offset = DMU_BONUS_BLKID;
2842 db->db_state = DB_UNCACHED;
2843 db->db_caching_status = DB_NO_CACHE;
2844 /* the bonus dbuf is not placed in the hash table */
2845 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
2846 return (db);
2847 } else if (blkid == DMU_SPILL_BLKID) {
2848 db->db.db_size = (blkptr != NULL) ?
2849 BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
2850 db->db.db_offset = 0;
2851 } else {
2852 int blocksize =
2853 db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
2854 db->db.db_size = blocksize;
2855 db->db.db_offset = db->db_blkid * blocksize;
2856 }
2857
2858 /*
2859 * Hold the dn_dbufs_mtx while we get the new dbuf
2860 * in the hash table *and* added to the dbufs list.
2861 * This prevents a possible deadlock with someone
2862 * trying to look up this dbuf before it's added to the
2863 * dn_dbufs list.
2864 */
2865 mutex_enter(&dn->dn_dbufs_mtx);
2866 db->db_state = DB_EVICTING;
2867 if ((odb = dbuf_hash_insert(db)) != NULL) {
2868 /* someone else inserted it first */
2869 kmem_cache_free(dbuf_kmem_cache, db);
2870 mutex_exit(&dn->dn_dbufs_mtx);
2871 DBUF_STAT_BUMP(hash_insert_race);
2872 return (odb);
2873 }
2874 avl_add(&dn->dn_dbufs, db);
2875
2876 db->db_state = DB_UNCACHED;
2877 db->db_caching_status = DB_NO_CACHE;
2878 mutex_exit(&dn->dn_dbufs_mtx);
2879 arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
2880
2881 if (parent && parent != dn->dn_dbuf)
2882 dbuf_add_ref(parent, db);
2883
2884 ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
2885 zfs_refcount_count(&dn->dn_holds) > 0);
2886 (void) zfs_refcount_add(&dn->dn_holds, db);
2887 atomic_inc_32(&dn->dn_dbufs_count);
2888
2889 dprintf_dbuf(db, "db=%p\n", db);
2890
2891 return (db);
2892 }
2893
2894 /*
2895 * This function returns a block pointer and information about the object,
2896 * given a dnode and a block. This is a publicly accessible version of
2897 * dbuf_findbp that only returns some information, rather than the
2898 * dbuf. Note that the dnode passed in must be held, and the dn_struct_rwlock
2899 * should be locked as (at least) a reader.
2900 */
2901 int
2902 dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
2903 blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift)
2904 {
2905 dmu_buf_impl_t *dbp = NULL;
2906 blkptr_t *bp2;
2907 int err = 0;
2908 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
2909
2910 err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);
2911 if (err == 0) {
2912 *bp = *bp2;
2913 if (dbp != NULL)
2914 dbuf_rele(dbp, NULL);
2915 if (datablkszsec != NULL)
2916 *datablkszsec = dn->dn_phys->dn_datablkszsec;
2917 if (indblkshift != NULL)
2918 *indblkshift = dn->dn_phys->dn_indblkshift;
2919 }
2920
2921 return (err);
2922 }
2923
2924 typedef struct dbuf_prefetch_arg {
2925 spa_t *dpa_spa; /* The spa to issue the prefetch in. */
2926 zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
2927 int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
2928 int dpa_curlevel; /* The current level that we're reading */
2929 dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
2930 zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
2931 zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
2932 arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
2933 } dbuf_prefetch_arg_t;
2934
2935 /*
2936 * Actually issue the prefetch read for the block given.
2937 */
2938 static void
2939 dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
2940 {
2941 ASSERT(!BP_IS_REDACTED(bp) ||
2942 dsl_dataset_feature_is_active(
2943 dpa->dpa_dnode->dn_objset->os_dsl_dataset,
2944 SPA_FEATURE_REDACTED_DATASETS));
2945
2946 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
2947 return;
2948
2949 int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
2950 arc_flags_t aflags =
2951 dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
2952
2953 /* dnodes are always read as raw and then converted later */
2954 if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) &&
2955 dpa->dpa_curlevel == 0)
2956 zio_flags |= ZIO_FLAG_RAW;
2957
2958 ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
2959 ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
2960 ASSERT(dpa->dpa_zio != NULL);
2961 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
2962 dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
2963 }
2964
2965 /*
2966 * Called when an indirect block above our prefetch target is read in. This
2967 * will either read in the next indirect block down the tree or issue the actual
2968 * prefetch if the next block down is our target.
2969 */
2970 static void
2971 dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
2972 const blkptr_t *iobp, arc_buf_t *abuf, void *private)
2973 {
2974 dbuf_prefetch_arg_t *dpa = private;
2975
2976 ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
2977 ASSERT3S(dpa->dpa_curlevel, >, 0);
2978
2979 if (abuf == NULL) {
2980 ASSERT(zio == NULL || zio->io_error != 0);
2981 kmem_free(dpa, sizeof (*dpa));
2982 return;
2983 }
2984 ASSERT(zio == NULL || zio->io_error == 0);
2985
2986 /*
2987 * The dpa_dnode is only valid if we are called with a NULL
2988 * zio. This indicates that the arc_read() returned without
2989 * first calling zio_read() to issue a physical read. Once
2990 * a physical read is made the dpa_dnode must be invalidated
2991 * as the locks guarding it may have been dropped. If the
2992 * dpa_dnode is still valid, then we want to add it to the dbuf
2993 * cache. To do so, we must hold the dbuf associated with the block
2994 * we just prefetched, read its contents so that we associate it
2995 * with an arc_buf_t, and then release it.
2996 */
2997 if (zio != NULL) {
2998 ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
2999 if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
3000 ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
3001 } else {
3002 ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
3003 }
3004 ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
3005
3006 dpa->dpa_dnode = NULL;
3007 } else if (dpa->dpa_dnode != NULL) {
3008 uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
3009 (dpa->dpa_epbs * (dpa->dpa_curlevel -
3010 dpa->dpa_zb.zb_level));
3011 dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
3012 dpa->dpa_curlevel, curblkid, FTAG);
3013 if (db == NULL) {
3014 kmem_free(dpa, sizeof (*dpa));
3015 arc_buf_destroy(abuf, private);
3016 return;
3017 }
3018
3019 (void) dbuf_read(db, NULL,
3020 DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
3021 dbuf_rele(db, FTAG);
3022 }
3023
3024 dpa->dpa_curlevel--;
3025 uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
3026 (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
3027 blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
3028 P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
3029
3030 ASSERT(!BP_IS_REDACTED(bp) ||
3031 dsl_dataset_feature_is_active(
3032 dpa->dpa_dnode->dn_objset->os_dsl_dataset,
3033 SPA_FEATURE_REDACTED_DATASETS));
3034 if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
3035 kmem_free(dpa, sizeof (*dpa));
3036 } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
3037 ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
3038 dbuf_issue_final_prefetch(dpa, bp);
3039 kmem_free(dpa, sizeof (*dpa));
3040 } else {
3041 arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
3042 zbookmark_phys_t zb;
3043
3044 /* flag if L2ARC eligible, l2arc_noprefetch then decides */
3045 if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
3046 iter_aflags |= ARC_FLAG_L2CACHE;
3047
3048 ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
3049
3050 SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
3051 dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
3052
3053 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
3054 bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
3055 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
3056 &iter_aflags, &zb);
3057 }
3058
3059 arc_buf_destroy(abuf, private);
3060 }
3061
3062 /*
3063 * Issue prefetch reads for the given block on the given level. If the indirect
3064 * blocks above that block are not in memory, we will read them in
3065 * asynchronously. As a result, this call never blocks waiting for a read to
3066 * complete. Note that the prefetch might fail if the dataset is encrypted and
3067 * the encryption key is unmapped before the IO completes.
3068 */
3069 void
3070 dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
3071 arc_flags_t aflags)
3072 {
3073 blkptr_t bp;
3074 int epbs, nlevels, curlevel;
3075 uint64_t curblkid;
3076
3077 ASSERT(blkid != DMU_BONUS_BLKID);
3078 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3079
3080 if (blkid > dn->dn_maxblkid)
3081 return;
3082
3083 if (level == 0 && dnode_block_freed(dn, blkid))
3084 return;
3085
3086 /*
3087 * This dnode hasn't been written to disk yet, so there's nothing to
3088 * prefetch.
3089 */
3090 nlevels = dn->dn_phys->dn_nlevels;
3091 if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
3092 return;
3093
3094 epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
3095 if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
3096 return;
3097
3098 dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
3099 level, blkid);
3100 if (db != NULL) {
3101 mutex_exit(&db->db_mtx);
3102 /*
3103 * This dbuf already exists. It is either CACHED, or
3104 * (we assume) about to be read or filled.
3105 */
3106 return;
3107 }
3108
3109 /*
3110 * Find the closest ancestor (indirect block) of the target block
3111 * that is present in the cache. In this indirect block, we will
3112 * find the bp that is at curlevel, curblkid.
3113 */
3114 curlevel = level;
3115 curblkid = blkid;
3116 while (curlevel < nlevels - 1) {
3117 int parent_level = curlevel + 1;
3118 uint64_t parent_blkid = curblkid >> epbs;
3119 dmu_buf_impl_t *db;
3120
3121 if (dbuf_hold_impl(dn, parent_level, parent_blkid,
3122 FALSE, TRUE, FTAG, &db) == 0) {
3123 blkptr_t *bpp = db->db_buf->b_data;
3124 bp = bpp[P2PHASE(curblkid, 1 << epbs)];
3125 dbuf_rele(db, FTAG);
3126 break;
3127 }
3128
3129 curlevel = parent_level;
3130 curblkid = parent_blkid;
3131 }
3132
3133 if (curlevel == nlevels - 1) {
3134 /* No cached indirect blocks found. */
3135 ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
3136 bp = dn->dn_phys->dn_blkptr[curblkid];
3137 }
3138 ASSERT(!BP_IS_REDACTED(&bp) ||
3139 dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
3140 SPA_FEATURE_REDACTED_DATASETS));
3141 if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
3142 return;
3143
3144 ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
3145
3146 zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
3147 ZIO_FLAG_CANFAIL);
3148
3149 dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
3150 dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
3151 SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
3152 dn->dn_object, level, blkid);
3153 dpa->dpa_curlevel = curlevel;
3154 dpa->dpa_prio = prio;
3155 dpa->dpa_aflags = aflags;
3156 dpa->dpa_spa = dn->dn_objset->os_spa;
3157 dpa->dpa_dnode = dn;
3158 dpa->dpa_epbs = epbs;
3159 dpa->dpa_zio = pio;
3160
3161 /* flag if L2ARC eligible, l2arc_noprefetch then decides */
3162 if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
3163 dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
3164
3165 /*
3166 * If we have the indirect just above us, no need to do the asynchronous
3167 * prefetch chain; we'll just run the last step ourselves. If we're at
3168 * a higher level, though, we want to issue the prefetches for all the
3169 * indirect blocks asynchronously, so we can go on with whatever we were
3170 * doing.
3171 */
3172 if (curlevel == level) {
3173 ASSERT3U(curblkid, ==, blkid);
3174 dbuf_issue_final_prefetch(dpa, &bp);
3175 kmem_free(dpa, sizeof (*dpa));
3176 } else {
3177 arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
3178 zbookmark_phys_t zb;
3179
3180 /* flag if L2ARC eligible, l2arc_noprefetch then decides */
3181 if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
3182 iter_aflags |= ARC_FLAG_L2CACHE;
3183
3184 SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
3185 dn->dn_object, curlevel, curblkid);
3186 (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
3187 &bp, dbuf_prefetch_indirect_done, dpa, prio,
3188 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
3189 &iter_aflags, &zb);
3190 }
3191 /*
3192 * We use pio here instead of dpa_zio since it's possible that
3193 * dpa may have already been freed.
3194 */
3195 zio_nowait(pio);
3196 }
3197
3198 /*
3199 * Helper function for dbuf_hold_impl() to copy a buffer. Handles
3200 * the case of encrypted, compressed and uncompressed buffers by
3201 * allocating the new buffer, respectively, with arc_alloc_raw_buf(),
3202  * arc_alloc_compressed_buf() or arc_alloc_buf().
3203 *
3204 * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl().
3205 */
3206 noinline static void
3207 dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
3208 {
3209 dbuf_dirty_record_t *dr = db->db_data_pending;
3210 arc_buf_t *data = dr->dt.dl.dr_data;
3211 enum zio_compress compress_type = arc_get_compression(data);
3212
3213 if (arc_is_encrypted(data)) {
3214 boolean_t byteorder;
3215 uint8_t salt[ZIO_DATA_SALT_LEN];
3216 uint8_t iv[ZIO_DATA_IV_LEN];
3217 uint8_t mac[ZIO_DATA_MAC_LEN];
3218
3219 arc_get_raw_params(data, &byteorder, salt, iv, mac);
3220 dbuf_set_data(db, arc_alloc_raw_buf(dn->dn_objset->os_spa, db,
3221 dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac,
3222 dn->dn_type, arc_buf_size(data), arc_buf_lsize(data),
3223 compress_type));
3224 } else if (compress_type != ZIO_COMPRESS_OFF) {
3225 dbuf_set_data(db, arc_alloc_compressed_buf(
3226 dn->dn_objset->os_spa, db, arc_buf_size(data),
3227 arc_buf_lsize(data), compress_type));
3228 } else {
3229 dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db,
3230 DBUF_GET_BUFC_TYPE(db), db->db.db_size));
3231 }
3232
3233 rw_enter(&db->db_rwlock, RW_WRITER);
3234 bcopy(data->b_data, db->db.db_data, arc_buf_size(data));
3235 rw_exit(&db->db_rwlock);
3236 }
3237
3238 /*
3239 * Returns with db_holds incremented, and db_mtx not held.
3240 * Note: dn_struct_rwlock must be held.
3241 */
3242 int
3243 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
3244 boolean_t fail_sparse, boolean_t fail_uncached,
3245 void *tag, dmu_buf_impl_t **dbp)
3246 {
3247 dmu_buf_impl_t *db, *parent = NULL;
3248
3249 /* If the pool has been created, verify the tx_sync_lock is not held */
3250 spa_t *spa = dn->dn_objset->os_spa;
3251 dsl_pool_t *dp = spa->spa_dsl_pool;
3252 if (dp != NULL) {
3253 ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));
3254 }
3255
3256 ASSERT(blkid != DMU_BONUS_BLKID);
3257 ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
3258 ASSERT3U(dn->dn_nlevels, >, level);
3259
3260 *dbp = NULL;
3261
3262 /* dbuf_find() returns with db_mtx held */
3263 db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
3264
3265 if (db == NULL) {
3266 blkptr_t *bp = NULL;
3267 int err;
3268
3269 if (fail_uncached)
3270 return (SET_ERROR(ENOENT));
3271
3272 ASSERT3P(parent, ==, NULL);
3273 err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
3274 if (fail_sparse) {
3275 if (err == 0 && bp && BP_IS_HOLE(bp))
3276 err = SET_ERROR(ENOENT);
3277 if (err) {
3278 if (parent)
3279 dbuf_rele(parent, NULL);
3280 return (err);
3281 }
3282 }
3283 if (err && err != ENOENT)
3284 return (err);
3285 db = dbuf_create(dn, level, blkid, parent, bp);
3286 }
3287
3288 if (fail_uncached && db->db_state != DB_CACHED) {
3289 mutex_exit(&db->db_mtx);
3290 return (SET_ERROR(ENOENT));
3291 }
3292
3293 if (db->db_buf != NULL) {
3294 arc_buf_access(db->db_buf);
3295 ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
3296 }
3297
3298 ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
3299
3300 /*
3301 * If this buffer is currently syncing out, and we are
3302 * still referencing it from db_data, we need to make a copy
3303 * of it in case we decide we want to dirty it again in this txg.
3304 */
3305 if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
3306 dn->dn_object != DMU_META_DNODE_OBJECT &&
3307 db->db_state == DB_CACHED && db->db_data_pending) {
3308 dbuf_dirty_record_t *dr = db->db_data_pending;
3309 if (dr->dt.dl.dr_data == db->db_buf)
3310 dbuf_hold_copy(dn, db);
3311 }
3312
3313 if (multilist_link_active(&db->db_cache_link)) {
3314 ASSERT(zfs_refcount_is_zero(&db->db_holds));
3315 ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
3316 db->db_caching_status == DB_DBUF_METADATA_CACHE);
3317
3318 multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
3319 (void) zfs_refcount_remove_many(
3320 &dbuf_caches[db->db_caching_status].size,
3321 db->db.db_size, db);
3322
3323 if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
3324 DBUF_STAT_BUMPDOWN(metadata_cache_count);
3325 } else {
3326 DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
3327 DBUF_STAT_BUMPDOWN(cache_count);
3328 DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
3329 db->db.db_size);
3330 }
3331 db->db_caching_status = DB_NO_CACHE;
3332 }
3333 (void) zfs_refcount_add(&db->db_holds, tag);
3334 DBUF_VERIFY(db);
3335 mutex_exit(&db->db_mtx);
3336
3337 /* NOTE: we can't rele the parent until after we drop the db_mtx */
3338 if (parent)
3339 dbuf_rele(parent, NULL);
3340
3341 ASSERT3P(DB_DNODE(db), ==, dn);
3342 ASSERT3U(db->db_blkid, ==, blkid);
3343 ASSERT3U(db->db_level, ==, level);
3344 *dbp = db;
3345
3346 return (0);
3347 }
3348
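/*
 * Convenience wrappers around dbuf_hold_impl() for the common cases.
 * Holds must be balanced with dbuf_rele(), e.g. (a sketch):
 *
 *	dmu_buf_impl_t *db = dbuf_hold(dn, blkid, FTAG);
 *	if (db != NULL) {
 *		... use db ...
 *		dbuf_rele(db, FTAG);
 *	}
 */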
3349 dmu_buf_impl_t *
3350 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
3351 {
3352 return (dbuf_hold_level(dn, 0, blkid, tag));
3353 }
3354
3355 dmu_buf_impl_t *
3356 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
3357 {
3358 dmu_buf_impl_t *db;
3359 int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
3360 return (err ? NULL : db);
3361 }
3362
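/*
 * Create the special bonus dbuf for this dnode. The bonus dbuf is
 * addressed by DMU_BONUS_BLKID and is never placed in the hash table.
 */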
3363 void
3364 dbuf_create_bonus(dnode_t *dn)
3365 {
3366 ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
3367
3368 ASSERT(dn->dn_bonus == NULL);
3369 dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
3370 }
3371
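/*
 * Resize the dnode's spill block, rounding the requested size up to a
 * multiple of SPA_MINBLOCKSIZE. Returns ENOTSUP if this is not a spill
 * dbuf.
 */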
3372 int
3373 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
3374 {
3375 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3376
3377 if (db->db_blkid != DMU_SPILL_BLKID)
3378 return (SET_ERROR(ENOTSUP));
3379 if (blksz == 0)
3380 blksz = SPA_MINBLOCKSIZE;
3381 ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
3382 blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
3383
3384 dbuf_new_size(db, blksz, tx);
3385
3386 return (0);
3387 }
3388
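/*
 * Free the dnode's spill block by running it through dbuf_free_range().
 */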
3389 void
3390 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
3391 {
3392 dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
3393 }
3394
3395 #pragma weak dmu_buf_add_ref = dbuf_add_ref
3396 void
3397 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
3398 {
3399 int64_t holds = zfs_refcount_add(&db->db_holds, tag);
3400 VERIFY3S(holds, >, 1);
3401 }
3402
3403 #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
3404 boolean_t
3405 dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
3406 void *tag)
3407 {
3408 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3409 dmu_buf_impl_t *found_db;
3410 boolean_t result = B_FALSE;
3411
3412 if (blkid == DMU_BONUS_BLKID)
3413 found_db = dbuf_find_bonus(os, obj);
3414 else
3415 found_db = dbuf_find(os, obj, 0, blkid);
3416
3417 if (found_db != NULL) {
3418 if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
3419 (void) zfs_refcount_add(&db->db_holds, tag);
3420 result = B_TRUE;
3421 }
3422 mutex_exit(&found_db->db_mtx);
3423 }
3424 return (result);
3425 }
3426
3427 /*
3428 * If you call dbuf_rele() you had better not be referencing the dnode handle
3429 * unless you have some other direct or indirect hold on the dnode. (An indirect
3430 * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
3431 * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
3432 * dnode's parent dbuf evicting its dnode handles.
3433 */
3434 void
3435 dbuf_rele(dmu_buf_impl_t *db, void *tag)
3436 {
3437 mutex_enter(&db->db_mtx);
3438 dbuf_rele_and_unlock(db, tag, B_FALSE);
3439 }
3440
3441 void
3442 dmu_buf_rele(dmu_buf_t *db, void *tag)
3443 {
3444 dbuf_rele((dmu_buf_impl_t *)db, tag);
3445 }
3446
3447 /*
3448 * dbuf_rele() for an already-locked dbuf. This is necessary to allow
3449 * db_dirtycnt and db_holds to be updated atomically. The 'evicting'
3450 * argument should be set if we are already in the dbuf-evicting code
3451 * path, in which case we don't want to recursively evict. This allows us to
3452 * avoid deeply nested stacks that would have a call flow similar to this:
3453 *
3454 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
3455 * ^ |
3456 * | |
3457 * +-----dbuf_destroy()<--dbuf_evict_one()<--------+
3458 *
3459 */
3460 void
3461 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
3462 {
3463 int64_t holds;
3464 uint64_t size;
3465
3466 ASSERT(MUTEX_HELD(&db->db_mtx));
3467 DBUF_VERIFY(db);
3468
3469 /*
3470 * Remove the reference to the dbuf before removing its hold on the
3471 * dnode so we can guarantee in dnode_move() that a referenced bonus
3472 * buffer has a corresponding dnode hold.
3473 */
3474 holds = zfs_refcount_remove(&db->db_holds, tag);
3475 ASSERT(holds >= 0);
3476
3477 /*
3478 * We can't freeze indirects if there is a possibility that they
3479 * may be modified in the current syncing context.
3480 */
3481 if (db->db_buf != NULL &&
3482 holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
3483 arc_buf_freeze(db->db_buf);
3484 }
3485
3486 if (holds == db->db_dirtycnt &&
3487 db->db_level == 0 && db->db_user_immediate_evict)
3488 dbuf_evict_user(db);
3489
3490 if (holds == 0) {
3491 if (db->db_blkid == DMU_BONUS_BLKID) {
3492 dnode_t *dn;
3493 boolean_t evict_dbuf = db->db_pending_evict;
3494
3495 /*
3496 * If the dnode is being moved concurrently, we cannot
3497 * pass this point until the move completes.
3498 */
3499 DB_DNODE_ENTER(db);
3500
3501 dn = DB_DNODE(db);
3502 atomic_dec_32(&dn->dn_dbufs_count);
3503
3504 /*
3505 * Decrementing the dbuf count means that the bonus
3506 * buffer's dnode hold is no longer discounted in
3507 * dnode_move(). The dnode cannot move until after
3508 * the dnode_rele() below.
3509 */
3510 DB_DNODE_EXIT(db);
3511
3512 /*
3513 * Do not reference db after its lock is dropped.
3514 * Another thread may evict it.
3515 */
3516 mutex_exit(&db->db_mtx);
3517
3518 if (evict_dbuf)
3519 dnode_evict_bonus(dn);
3520
3521 dnode_rele(dn, db);
3522 } else if (db->db_buf == NULL) {
3523 /*
3524 * This is a special case: we never associated this
3525 * dbuf with any data allocated from the ARC.
3526 */
3527 ASSERT(db->db_state == DB_UNCACHED ||
3528 db->db_state == DB_NOFILL);
3529 dbuf_destroy(db);
3530 } else if (arc_released(db->db_buf)) {
3531 /*
3532 * This dbuf has anonymous data associated with it.
3533 */
3534 dbuf_destroy(db);
3535 } else {
3536 boolean_t do_arc_evict = B_FALSE;
3537 blkptr_t bp;
3538 spa_t *spa = dmu_objset_spa(db->db_objset);
3539
3540 if (!DBUF_IS_CACHEABLE(db) &&
3541 db->db_blkptr != NULL &&
3542 !BP_IS_HOLE(db->db_blkptr) &&
3543 !BP_IS_EMBEDDED(db->db_blkptr)) {
3544 do_arc_evict = B_TRUE;
3545 bp = *db->db_blkptr;
3546 }
3547
3548 if (!DBUF_IS_CACHEABLE(db) ||
3549 db->db_pending_evict) {
3550 dbuf_destroy(db);
3551 } else if (!multilist_link_active(&db->db_cache_link)) {
3552 ASSERT3U(db->db_caching_status, ==,
3553 DB_NO_CACHE);
3554
3555 dbuf_cached_state_t dcs =
3556 dbuf_include_in_metadata_cache(db) ?
3557 DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
3558 db->db_caching_status = dcs;
3559
3560 multilist_insert(dbuf_caches[dcs].cache, db);
3561 size = zfs_refcount_add_many(
3562 &dbuf_caches[dcs].size,
3563 db->db.db_size, db);
3564
3565 if (dcs == DB_DBUF_METADATA_CACHE) {
3566 DBUF_STAT_BUMP(metadata_cache_count);
3567 DBUF_STAT_MAX(
3568 metadata_cache_size_bytes_max,
3569 size);
3570 } else {
3571 DBUF_STAT_BUMP(
3572 cache_levels[db->db_level]);
3573 DBUF_STAT_BUMP(cache_count);
3574 DBUF_STAT_INCR(
3575 cache_levels_bytes[db->db_level],
3576 db->db.db_size);
3577 DBUF_STAT_MAX(cache_size_bytes_max,
3578 size);
3579 }
3580 mutex_exit(&db->db_mtx);
3581
3582 if (dcs == DB_DBUF_CACHE && !evicting)
3583 dbuf_evict_notify(size);
3584 }
3585
3586 if (do_arc_evict)
3587 arc_freed(spa, &bp);
3588 }
3589 } else {
3590 mutex_exit(&db->db_mtx);
3591 }
3592
3593 }
3594
3595 #pragma weak dmu_buf_refcount = dbuf_refcount
3596 uint64_t
3597 dbuf_refcount(dmu_buf_impl_t *db)
3598 {
3599 return (zfs_refcount_count(&db->db_holds));
3600 }
3601
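/*
 * Return the number of user holds on this dbuf: the total hold count
 * minus the holds accounted for by dirty records.
 */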
3602 uint64_t
3603 dmu_buf_user_refcount(dmu_buf_t *db_fake)
3604 {
3605 uint64_t holds;
3606 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3607
3608 mutex_enter(&db->db_mtx);
3609 ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt);
3610 holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt;
3611 mutex_exit(&db->db_mtx);
3612
3613 return (holds);
3614 }
3615
3616 void *
3617 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
3618 dmu_buf_user_t *new_user)
3619 {
3620 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3621
3622 mutex_enter(&db->db_mtx);
3623 dbuf_verify_user(db, DBVU_NOT_EVICTING);
3624 if (db->db_user == old_user)
3625 db->db_user = new_user;
3626 else
3627 old_user = db->db_user;
3628 dbuf_verify_user(db, DBVU_NOT_EVICTING);
3629 mutex_exit(&db->db_mtx);
3630
3631 return (old_user);
3632 }
3633
3634 void *
3635 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
3636 {
3637 return (dmu_buf_replace_user(db_fake, NULL, user));
3638 }
3639
3640 void *
3641 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
3642 {
3643 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3644
3645 db->db_user_immediate_evict = TRUE;
3646 return (dmu_buf_set_user(db_fake, user));
3647 }
3648
3649 void *
3650 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
3651 {
3652 return (dmu_buf_replace_user(db_fake, user, NULL));
3653 }
3654
3655 void *
3656 dmu_buf_get_user(dmu_buf_t *db_fake)
3657 {
3658 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
3659
3660 dbuf_verify_user(db, DBVU_NOT_EVICTING);
3661 return (db->db_user);
3662 }
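/*
 * Usage sketch for the user API above (the "my_state_t" type, its evict
 * callback, and my_state_attach() are hypothetical): embed a
 * dmu_buf_user_t as the first member of the consumer's state, initialize
 * it with dmu_buf_init_user(), and publish it with dmu_buf_set_user(),
 * falling back to the winner's state if another thread got there first.
 */
#if 0	/* illustrative only */
typedef struct my_state {
	dmu_buf_user_t	ms_dbu;		/* first member: evict cb gets it */
	dmu_buf_t	*ms_db;		/* cleared on eviction (debug) */
} my_state_t;

static void
my_state_evict_sync(void *arg)
{
	kmem_free(arg, sizeof (my_state_t));
}

static my_state_t *
my_state_attach(dmu_buf_t *db)
{
	my_state_t *ms = kmem_zalloc(sizeof (my_state_t), KM_SLEEP);
	my_state_t *winner;

	ms->ms_db = db;
	dmu_buf_init_user(&ms->ms_dbu, my_state_evict_sync, NULL, &ms->ms_db);
	winner = dmu_buf_set_user(db, &ms->ms_dbu);
	if (winner != NULL) {
		/* Lost the race; use the state that is already attached. */
		kmem_free(ms, sizeof (my_state_t));
		return (winner);
	}
	return (ms);
}
#endif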
3663
3664 void
3665 dmu_buf_user_evict_wait()
3666 {
3667 taskq_wait(dbu_evict_taskq);
3668 }
3669
3670 blkptr_t *
3671 dmu_buf_get_blkptr(dmu_buf_t *db)
3672 {
3673 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
3674 return (dbi->db_blkptr);
3675 }
3676
3677 objset_t *
3678 dmu_buf_get_objset(dmu_buf_t *db)
3679 {
3680 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
3681 return (dbi->db_objset);
3682 }
3683
3684 dnode_t *
3685 dmu_buf_dnode_enter(dmu_buf_t *db)
3686 {
3687 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
3688 DB_DNODE_ENTER(dbi);
3689 return (DB_DNODE(dbi));
3690 }
3691
3692 void
3693 dmu_buf_dnode_exit(dmu_buf_t *db)
3694 {
3695 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
3696 DB_DNODE_EXIT(dbi);
3697 }
3698
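/*
 * Wire up db->db_blkptr if it is not yet set: point it at the dnode's
 * spill slot, at a slot in the dnode's own blkptr array, or at the
 * appropriate entry in the parent indirect block (taking a hold on the
 * parent first if we do not already have one).
 */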
3699 static void
3700 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
3701 {
3702 /* ASSERT(dmu_tx_is_syncing(tx)) */
3703 ASSERT(MUTEX_HELD(&db->db_mtx));
3704
3705 if (db->db_blkptr != NULL)
3706 return;
3707
3708 if (db->db_blkid == DMU_SPILL_BLKID) {
3709 db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
3710 BP_ZERO(db->db_blkptr);
3711 return;
3712 }
3713 if (db->db_level == dn->dn_phys->dn_nlevels-1) {
3714 /*
3715 * This buffer was allocated at a time when there were
3716 * no blkptrs available from the dnode, or it was
3717 * inappropriate to hook it in (e.g., an nlevels mismatch).
3718 */
3719 ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
3720 ASSERT(db->db_parent == NULL);
3721 db->db_parent = dn->dn_dbuf;
3722 db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
3723 DBUF_VERIFY(db);
3724 } else {
3725 dmu_buf_impl_t *parent = db->db_parent;
3726 int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
3727
3728 ASSERT(dn->dn_phys->dn_nlevels > 1);
3729 if (parent == NULL) {
3730 mutex_exit(&db->db_mtx);
3731 rw_enter(&dn->dn_struct_rwlock, RW_READER);
3732 parent = dbuf_hold_level(dn, db->db_level + 1,
3733 db->db_blkid >> epbs, db);
3734 rw_exit(&dn->dn_struct_rwlock);
3735 mutex_enter(&db->db_mtx);
3736 db->db_parent = parent;
3737 }
3738 db->db_blkptr = (blkptr_t *)parent->db.db_data +
3739 (db->db_blkid & ((1ULL << epbs) - 1));
3740 DBUF_VERIFY(db);
3741 }
3742 }
3743
3744 /*
3745 * When syncing out a block of dnodes, adjust the block to deal with
3746 * encryption. Normally, we make sure the block is decrypted before writing
3747 * it. If we have crypt params, then we are writing a raw (encrypted) block
3748 * from a raw receive. In this case, set the ARC buf's crypt params so
3749 * that the BP will be filled with the correct byteorder, salt, iv, and mac.
3750 */
3751 static void
3752 dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr)
3753 {
3754 int err;
3755 dmu_buf_impl_t *db = dr->dr_dbuf;
3756
3757 ASSERT(MUTEX_HELD(&db->db_mtx));
3758 ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
3759 ASSERT3U(db->db_level, ==, 0);
3760
3761 if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) {
3762 zbookmark_phys_t zb;
3763
3764 /*
3765 * Unfortunately, there is currently no mechanism for
3766 * syncing context to handle decryption errors. An error
3767 * here is only possible if an attacker maliciously
3768 * changed a dnode block and updated the associated
3769 * checksums going up the block tree.
3770 */
3771 SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
3772 db->db.db_object, db->db_level, db->db_blkid);
3773 err = arc_untransform(db->db_buf, db->db_objset->os_spa,
3774 &zb, B_TRUE);
3775 if (err)
3776 panic("Invalid dnode block MAC");
3777 } else if (dr->dt.dl.dr_has_raw_params) {
3778 (void) arc_release(dr->dt.dl.dr_data, db);
3779 arc_convert_to_raw(dr->dt.dl.dr_data,
3780 dmu_objset_id(db->db_objset),
3781 dr->dt.dl.dr_byteorder, DMU_OT_DNODE,
3782 dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac);
3783 }
3784 }
3785
3786 /*
3787 * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
3788 * is critical that we not allow the compiler to inline this function into
3789 * dbuf_sync_list(), thereby drastically bloating the stack usage.
3790 */
3791 noinline static void
3792 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
3793 {
3794 dmu_buf_impl_t *db = dr->dr_dbuf;
3795 dnode_t *dn;
3796 zio_t *zio;
3797
3798 ASSERT(dmu_tx_is_syncing(tx));
3799
3800 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
3801
3802 mutex_enter(&db->db_mtx);
3803
3804 ASSERT(db->db_level > 0);
3805 DBUF_VERIFY(db);
3806
3807 /* Read the block if it hasn't been read yet. */
3808 if (db->db_buf == NULL) {
3809 mutex_exit(&db->db_mtx);
3810 (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
3811 mutex_enter(&db->db_mtx);
3812 }
3813 ASSERT3U(db->db_state, ==, DB_CACHED);
3814 ASSERT(db->db_buf != NULL);
3815
3816 DB_DNODE_ENTER(db);
3817 dn = DB_DNODE(db);
3818 /* Indirect block size must match what the dnode thinks it is. */
3819 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
3820 dbuf_check_blkptr(dn, db);
3821 DB_DNODE_EXIT(db);
3822
3823 /* Provide the pending dirty record to child dbufs */
3824 db->db_data_pending = dr;
3825
3826 mutex_exit(&db->db_mtx);
3827
3828 dbuf_write(dr, db->db_buf, tx);
3829
3830 zio = dr->dr_zio;
3831 mutex_enter(&dr->dt.di.dr_mtx);
3832 dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
3833 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
3834 mutex_exit(&dr->dt.di.dr_mtx);
3835 zio_nowait(zio);
3836 }
3837
3838 #ifdef ZFS_DEBUG
3839 /*
3840 * Verify that the size of the data in our bonus buffer does not exceed
3841 * its recorded size.
3842 *
3843 * The purpose of this verification is to catch any cases in development
3844 * where the size of a phys structure (e.g. space_map_phys_t) grows and,
3845 * due to incorrect feature management, older pools expect to read more
3846 * data even though they didn't actually write it to begin with.
3847 *
3848 * For example, this would catch an error in the feature logic where we
3849 * open an older pool and we expect to write the space map histogram of
3850 * a space map with size SPACE_MAP_SIZE_V0.
3851 */
3852 static void
3853 dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
3854 {
3855 dnode_t *dn = DB_DNODE(dr->dr_dbuf);
3856
3857 /*
3858 * Encrypted bonus buffers can have data past their bonuslen.
3859 * Skip the verification of these blocks.
3860 */
3861 if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))
3862 return;
3863
3864 uint16_t bonuslen = dn->dn_phys->dn_bonuslen;
3865 uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
3866 ASSERT3U(bonuslen, <=, maxbonuslen);
3867
3868 arc_buf_t *datap = dr->dt.dl.dr_data;
3869 char *datap_end = ((char *)datap) + bonuslen;
3870 char *datap_max = ((char *)datap) + maxbonuslen;
3871
3872 /* ensure that everything is zero after our data */
3873 for (; datap_end < datap_max; datap_end++)
3874 ASSERT(*datap_end == 0);
3875 }
3876 #endif
3877
3878 /*
3879 * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
3880 * critical that we not allow the compiler to inline this function into
3881 * dbuf_sync_list(), thereby drastically bloating the stack usage.
3882 */
3883 noinline static void
3884 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
3885 {
3886 arc_buf_t **datap = &dr->dt.dl.dr_data;
3887 dmu_buf_impl_t *db = dr->dr_dbuf;
3888 dnode_t *dn;
3889 objset_t *os;
3890 uint64_t txg = tx->tx_txg;
3891
3892 ASSERT(dmu_tx_is_syncing(tx));
3893
3894 dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
3895
3896 mutex_enter(&db->db_mtx);
3897 /*
3898 * To be synced, we must be dirtied. But we
3899 * might have been freed after the dirty.
3900 */
3901 if (db->db_state == DB_UNCACHED) {
3902 /* This buffer has been freed since it was dirtied */
3903 ASSERT(db->db.db_data == NULL);
3904 } else if (db->db_state == DB_FILL) {
3905 /* This buffer was freed and is now being re-filled */
3906 ASSERT(db->db.db_data != dr->dt.dl.dr_data);
3907 } else {
3908 ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
3909 }
3910 DBUF_VERIFY(db);
3911
3912 DB_DNODE_ENTER(db);
3913 dn = DB_DNODE(db);
3914
3915 if (db->db_blkid == DMU_SPILL_BLKID) {
3916 mutex_enter(&dn->dn_mtx);
3917 if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
3918 /*
3919 * In the previous transaction group, the bonus buffer
3920 * was entirely used to store the attributes for the
3921 * dnode which overrode the dn_spill field. However,
3922 * when adding more attributes to the file a spill
3923 * block was required to hold the extra attributes.
3924 *
3925 * Make sure to clear the garbage left in the dn_spill
3926 * field from the previous attributes in the bonus
3927 * buffer. Otherwise, after writing out the spill
3928 * block to its newly allocated dva, the write would
3929 * free the old block pointed to by the stale dn_spill.
3930 */
3931 db->db_blkptr = NULL;
3932 }
3933 dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
3934 mutex_exit(&dn->dn_mtx);
3935 }
3936
3937 /*
3938 * If this is a bonus buffer, simply copy the bonus data into the
3939 * dnode. It will be written out when the dnode is synced (and it
3940 * will be synced, since it must have been dirty for dbuf_sync to
3941 * be called).
3942 */
3943 if (db->db_blkid == DMU_BONUS_BLKID) {
3944 ASSERT(*datap != NULL);
3945 ASSERT0(db->db_level);
3946 ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
3947 DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
3948 bcopy(*datap, DN_BONUS(dn->dn_phys),
3949 DN_MAX_BONUS_LEN(dn->dn_phys));
3950 DB_DNODE_EXIT(db);
3951
3952 #ifdef ZFS_DEBUG
3953 dbuf_sync_leaf_verify_bonus_dnode(dr);
3954 #endif
3955
3956 if (*datap != db->db.db_data) {
3957 int slots = DB_DNODE(db)->dn_num_slots;
3958 int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
3959 kmem_free(*datap, bonuslen);
3960 arc_space_return(bonuslen, ARC_SPACE_BONUS);
3961 }
3962 db->db_data_pending = NULL;
3963 ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
3964 ASSERT(dr->dr_dbuf == db);
3965 list_remove(&db->db_dirty_records, dr);
3966 if (dr->dr_dbuf->db_level != 0) {
3967 mutex_destroy(&dr->dt.di.dr_mtx);
3968 list_destroy(&dr->dt.di.dr_children);
3969 }
3970 kmem_free(dr, sizeof (dbuf_dirty_record_t));
3971 ASSERT(db->db_dirtycnt > 0);
3972 db->db_dirtycnt -= 1;
3973 dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
3974 return;
3975 }
3976
3977 os = dn->dn_objset;
3978
3979 /*
3980 * dbuf_check_blkptr() may drop the db_mtx lock, allowing a dmu_sync
3981 * operation to sneak in. As a result, we must ensure that we don't
3982 * check the dr_override_state until we have returned from
3983 * dbuf_check_blkptr().
3984 */
3985 dbuf_check_blkptr(dn, db);
3986
3987 /*
3988 * If this buffer is in the middle of an immediate write,
3989 * wait for the synchronous IO to complete.
3990 */
3991 while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
3992 ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
3993 cv_wait(&db->db_changed, &db->db_mtx);
3994 ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
3995 }
3996
3997 /*
3998 * If this is a dnode block, ensure it is appropriately encrypted
3999 * or decrypted, depending on what we are writing to it this txg.
4000 */
4001 if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
4002 dbuf_prepare_encrypted_dnode_leaf(dr);
4003
4004 if (db->db_state != DB_NOFILL &&
4005 dn->dn_object != DMU_META_DNODE_OBJECT &&
4006 zfs_refcount_count(&db->db_holds) > 1 &&
4007 dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
4008 *datap == db->db_buf) {
4009 /*
4010 * If this buffer is currently "in use" (i.e., there
4011 * are active holds and db_data still references it),
4012 * then make a copy before we start the write so that
4013 * any modifications from the open txg will not leak
4014 * into this write.
4015 *
4016 * NOTE: this copy does not need to be made for
4017 * objects only modified in the syncing context (e.g.
4018 * DMU_OT_DNODE blocks).
4019 */
4020 int psize = arc_buf_size(*datap);
4021 int lsize = arc_buf_lsize(*datap);
4022 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
4023 enum zio_compress compress_type = arc_get_compression(*datap);
4024
4025 if (arc_is_encrypted(*datap)) {
4026 boolean_t byteorder;
4027 uint8_t salt[ZIO_DATA_SALT_LEN];
4028 uint8_t iv[ZIO_DATA_IV_LEN];
4029 uint8_t mac[ZIO_DATA_MAC_LEN];
4030
4031 arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
4032 *datap = arc_alloc_raw_buf(os->os_spa, db,
4033 dmu_objset_id(os), byteorder, salt, iv, mac,
4034 dn->dn_type, psize, lsize, compress_type);
4035 } else if (compress_type != ZIO_COMPRESS_OFF) {
4036 ASSERT3U(type, ==, ARC_BUFC_DATA);
4037 *datap = arc_alloc_compressed_buf(os->os_spa, db,
4038 psize, lsize, compress_type);
4039 } else {
4040 *datap = arc_alloc_buf(os->os_spa, db, type, psize);
4041 }
4042 bcopy(db->db.db_data, (*datap)->b_data, psize);
4043 }
4044 db->db_data_pending = dr;
4045
4046 mutex_exit(&db->db_mtx);
4047
4048 dbuf_write(dr, *datap, tx);
4049
4050 ASSERT(!list_link_active(&dr->dr_dirty_node));
4051 if (dn->dn_object == DMU_META_DNODE_OBJECT) {
4052 list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
4053 DB_DNODE_EXIT(db);
4054 } else {
4055 /*
4056 * Although zio_nowait() does not "wait for an IO", it does
4057 * initiate the IO. If this is an empty write, it seems plausible
4058 * that the IO could actually be completed before the nowait
4059 * returns. We need to DB_DNODE_EXIT() first in case
4060 * zio_nowait() invalidates the dbuf.
4061 */
4062 DB_DNODE_EXIT(db);
4063 zio_nowait(dr->dr_zio);
4064 }
4065 }
4066
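/*
 * Sync every dirty record on the list at the given level, descending into
 * indirect children. Records that already have a zio belong to the
 * meta-dnode pass and are left on the list so the caller can zio_wait()
 * on them after all child IOs have been initiated.
 */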
4067 void
4068 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
4069 {
4070 dbuf_dirty_record_t *dr;
4071
4072 while ((dr = list_head(list))) {
4073 if (dr->dr_zio != NULL) {
4074 /*
4075 * If we find an already initialized zio then we
4076 * are processing the meta-dnode, and we have finished.
4077 * The dbufs for all dnodes are put back on the list
4078 * during processing, so that we can zio_wait()
4079 * these IOs after initiating all child IOs.
4080 */
4081 ASSERT3U(dr->dr_dbuf->db.db_object, ==,
4082 DMU_META_DNODE_OBJECT);
4083 break;
4084 }
4085 if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
4086 dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
4087 VERIFY3U(dr->dr_dbuf->db_level, ==, level);
4088 }
4089 list_remove(list, dr);
4090 if (dr->dr_dbuf->db_level > 0)
4091 dbuf_sync_indirect(dr, tx);
4092 else
4093 dbuf_sync_leaf(dr, tx);
4094 }
4095 }
4096
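/*
 * The "ready" callback for a dbuf write: the bp is now final, so charge
 * the space delta to the dnode, bump dn_maxblkid if we grew the object,
 * compute the block's fill count, and copy the finished bp into its slot
 * in the parent (or into the dnode phys).
 */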
4097 /* ARGSUSED */
4098 static void
4099 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
4100 {
4101 dmu_buf_impl_t *db = vdb;
4102 dnode_t *dn;
4103 blkptr_t *bp = zio->io_bp;
4104 blkptr_t *bp_orig = &zio->io_bp_orig;
4105 spa_t *spa = zio->io_spa;
4106 int64_t delta;
4107 uint64_t fill = 0;
4108 int i;
4109
4110 ASSERT3P(db->db_blkptr, !=, NULL);
4111 ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
4112
4113 DB_DNODE_ENTER(db);
4114 dn = DB_DNODE(db);
4115 delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
4116 dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
4117 zio->io_prev_space_delta = delta;
4118
4119 if (bp->blk_birth != 0) {
4120 ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
4121 BP_GET_TYPE(bp) == dn->dn_type) ||
4122 (db->db_blkid == DMU_SPILL_BLKID &&
4123 BP_GET_TYPE(bp) == dn->dn_bonustype) ||
4124 BP_IS_EMBEDDED(bp));
4125 ASSERT(BP_GET_LEVEL(bp) == db->db_level);
4126 }
4127
4128 mutex_enter(&db->db_mtx);
4129
4130 #ifdef ZFS_DEBUG
4131 if (db->db_blkid == DMU_SPILL_BLKID) {
4132 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
4133 ASSERT(!(BP_IS_HOLE(bp)) &&
4134 db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
4135 }
4136 #endif
4137
4138 if (db->db_level == 0) {
4139 mutex_enter(&dn->dn_mtx);
4140 if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
4141 db->db_blkid != DMU_SPILL_BLKID) {
4142 ASSERT0(db->db_objset->os_raw_receive);
4143 dn->dn_phys->dn_maxblkid = db->db_blkid;
4144 }
4145 mutex_exit(&dn->dn_mtx);
4146
4147 if (dn->dn_type == DMU_OT_DNODE) {
4148 i = 0;
4149 while (i < db->db.db_size) {
4150 dnode_phys_t *dnp =
4151 (void *)(((char *)db->db.db_data) + i);
4152
4153 i += DNODE_MIN_SIZE;
4154 if (dnp->dn_type != DMU_OT_NONE) {
4155 fill++;
4156 i += dnp->dn_extra_slots *
4157 DNODE_MIN_SIZE;
4158 }
4159 }
4160 } else {
4161 if (BP_IS_HOLE(bp)) {
4162 fill = 0;
4163 } else {
4164 fill = 1;
4165 }
4166 }
4167 } else {
4168 blkptr_t *ibp = db->db.db_data;
4169 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
4170 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
4171 if (BP_IS_HOLE(ibp))
4172 continue;
4173 fill += BP_GET_FILL(ibp);
4174 }
4175 }
4176 DB_DNODE_EXIT(db);
4177
4178 if (!BP_IS_EMBEDDED(bp))
4179 BP_SET_FILL(bp, fill);
4180
4181 mutex_exit(&db->db_mtx);
4182
4183 db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG);
4184 *db->db_blkptr = *bp;
4185 dmu_buf_unlock_parent(db, dblt, FTAG);
4186 }
4187
4188 /* ARGSUSED */
4189 /*
4190 * This function gets called just prior to running through the compression
4191 * stage of the zio pipeline. If we're an indirect block comprised of only
4192 * holes, then we want this indirect to be compressed away to a hole. In
4193 * order to do that we must zero out any information about the holes that
4194 * this indirect points to before we try to compress it.
4195 */
4196 static void
4197 dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
4198 {
4199 dmu_buf_impl_t *db = vdb;
4200 dnode_t *dn;
4201 blkptr_t *bp;
4202 unsigned int epbs, i;
4203
4204 ASSERT3U(db->db_level, >, 0);
4205 DB_DNODE_ENTER(db);
4206 dn = DB_DNODE(db);
4207 epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
4208 ASSERT3U(epbs, <, 31);
4209
4210 /* Determine if all our children are holes */
4211 for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
4212 if (!BP_IS_HOLE(bp))
4213 break;
4214 }
4215
4216 /*
4217 * If all the children are holes, then zero them all out so that
4218 * we may get compressed away.
4219 */
4220 if (i == 1ULL << epbs) {
4221 /*
4222 * We only found holes. Grab the rwlock to prevent
4223 * anybody from reading the blocks we're about to
4224 * zero out.
4225 */
4226 rw_enter(&db->db_rwlock, RW_WRITER);
4227 bzero(db->db.db_data, db->db.db_size);
4228 rw_exit(&db->db_rwlock);
4229 }
4230 DB_DNODE_EXIT(db);
4231 }
4232
4233 /*
4234 * The SPA will call this callback several times for each zio - once
4235 * for every physical child i/o (zio->io_phys_children times). This
4236 * allows the DMU to monitor the progress of each logical i/o. For example,
4237 * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
4238 * block. There may be a long delay before all copies/fragments are completed,
4239 * so this callback allows us to retire dirty space gradually, as the physical
4240 * i/os complete.
4241 */
4242 /* ARGSUSED */
4243 static void
4244 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
4245 {
4246 dmu_buf_impl_t *db = arg;
4247 objset_t *os = db->db_objset;
4248 dsl_pool_t *dp = dmu_objset_pool(os);
4249 dbuf_dirty_record_t *dr;
4250 int delta = 0;
4251
4252 dr = db->db_data_pending;
4253 ASSERT3U(dr->dr_txg, ==, zio->io_txg);
4254
4255 /*
4256 * The callback will be called io_phys_children times. Retire one
4257 * portion of our dirty space each time we are called. Any rounding
4258 * error will be cleaned up by dbuf_write_done().
4259 */
4260 delta = dr->dr_accounted / zio->io_phys_children;
4261 dsl_pool_undirty_space(dp, delta, zio->io_txg);
4262 }
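/*
 * Worked example (hypothetical numbers): with dr_accounted == 131075
 * bytes and io_phys_children == 2, each of the two physdone calls above
 * retires 131075 / 2 == 65537 bytes, and dbuf_write_done() retires the
 * remaining 131075 % 2 == 1 byte, so exactly dr_accounted is undirtied
 * in total.
 */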
4263
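/*
 * Final completion callback for a dbuf write: update dataset block
 * accounting (unless this was a nopwrite or rewrite), detach and free the
 * dirty record, undirty any space not already retired by physdone
 * callbacks, and drop the hold that dbuf_dirty() took for this txg.
 */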
4264 /* ARGSUSED */
4265 static void
4266 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
4267 {
4268 dmu_buf_impl_t *db = vdb;
4269 blkptr_t *bp_orig = &zio->io_bp_orig;
4270 blkptr_t *bp = db->db_blkptr;
4271 objset_t *os = db->db_objset;
4272 dmu_tx_t *tx = os->os_synctx;
4273 dbuf_dirty_record_t *dr;
4274
4275 ASSERT0(zio->io_error);
4276 ASSERT(db->db_blkptr == bp);
4277
4278 /*
4279 * For nopwrites and rewrites we ensure that the bp matches our
4280 * original and bypass all the accounting.
4281 */
4282 if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
4283 ASSERT(BP_EQUAL(bp, bp_orig));
4284 } else {
4285 dsl_dataset_t *ds = os->os_dsl_dataset;
4286 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
4287 dsl_dataset_block_born(ds, bp, tx);
4288 }
4289
4290 mutex_enter(&db->db_mtx);
4291
4292 DBUF_VERIFY(db);
4293
4294 dr = db->db_data_pending;
4295 ASSERT(!list_link_active(&dr->dr_dirty_node));
4296 ASSERT(dr->dr_dbuf == db);
4297 ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
4298 list_remove(&db->db_dirty_records, dr);
4299
4300 #ifdef ZFS_DEBUG
4301 if (db->db_blkid == DMU_SPILL_BLKID) {
4302 dnode_t *dn;
4303
4304 DB_DNODE_ENTER(db);
4305 dn = DB_DNODE(db);
4306 ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
4307 ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
4308 db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
4309 DB_DNODE_EXIT(db);
4310 }
4311 #endif
4312
4313 if (db->db_level == 0) {
4314 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
4315 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
4316 if (db->db_state != DB_NOFILL) {
4317 if (dr->dt.dl.dr_data != db->db_buf)
4318 arc_buf_destroy(dr->dt.dl.dr_data, db);
4319 }
4320 } else {
4321 dnode_t *dn;
4322
4323 DB_DNODE_ENTER(db);
4324 dn = DB_DNODE(db);
4325 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
4326 ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
4327 if (!BP_IS_HOLE(db->db_blkptr)) {
4328 int epbs __maybe_unused = dn->dn_phys->dn_indblkshift -
4329 SPA_BLKPTRSHIFT;
4330 ASSERT3U(db->db_blkid, <=,
4331 dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
4332 ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
4333 db->db.db_size);
4334 }
4335 DB_DNODE_EXIT(db);
4336 mutex_destroy(&dr->dt.di.dr_mtx);
4337 list_destroy(&dr->dt.di.dr_children);
4338 }
4339
4340 cv_broadcast(&db->db_changed);
4341 ASSERT(db->db_dirtycnt > 0);
4342 db->db_dirtycnt -= 1;
4343 db->db_data_pending = NULL;
4344 dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
4345
4346 /*
4347 * If we didn't do a physical write in this ZIO and we
4348 * still ended up here, it means that the space of the
4349 * dbuf that we just released (and undirtied) above hasn't
4350 * been marked as undirtied in the pool's accounting.
4351 *
4352 * Thus, we undirty that space in the pool's view of the
4353 * world here. For physical writes this type of update
4354 * happens in dbuf_write_physdone().
4355 *
4356 * If we did a physical write, cleanup any rounding errors
4357 * that came up due to writing multiple copies of a block
4358 * on disk [see dbuf_write_physdone()].
4359 */
4360 if (zio->io_phys_children == 0) {
4361 dsl_pool_undirty_space(dmu_objset_pool(os),
4362 dr->dr_accounted, zio->io_txg);
4363 } else {
4364 dsl_pool_undirty_space(dmu_objset_pool(os),
4365 dr->dr_accounted % zio->io_phys_children, zio->io_txg);
4366 }
4367
4368 kmem_free(dr, sizeof (dbuf_dirty_record_t));
4369 }
4370
4371 static void
4372 dbuf_write_nofill_ready(zio_t *zio)
4373 {
4374 dbuf_write_ready(zio, NULL, zio->io_private);
4375 }
4376
4377 static void
4378 dbuf_write_nofill_done(zio_t *zio)
4379 {
4380 dbuf_write_done(zio, NULL, zio->io_private);
4381 }
4382
4383 static void
4384 dbuf_write_override_ready(zio_t *zio)
4385 {
4386 dbuf_dirty_record_t *dr = zio->io_private;
4387 dmu_buf_impl_t *db = dr->dr_dbuf;
4388
4389 dbuf_write_ready(zio, NULL, db);
4390 }
4391
4392 static void
4393 dbuf_write_override_done(zio_t *zio)
4394 {
4395 dbuf_dirty_record_t *dr = zio->io_private;
4396 dmu_buf_impl_t *db = dr->dr_dbuf;
4397 blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
4398
4399 mutex_enter(&db->db_mtx);
4400 if (!BP_EQUAL(zio->io_bp, obp)) {
4401 if (!BP_IS_HOLE(obp))
4402 dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
4403 arc_release(dr->dt.dl.dr_data, db);
4404 }
4405 mutex_exit(&db->db_mtx);
4406
4407 dbuf_write_done(zio, NULL, db);
4408
4409 if (zio->io_abd != NULL)
4410 abd_put(zio->io_abd);
4411 }
4412
4413 typedef struct dbuf_remap_impl_callback_arg {
4414 objset_t *drica_os;
4415 uint64_t drica_blk_birth;
4416 dmu_tx_t *drica_tx;
4417 } dbuf_remap_impl_callback_arg_t;
4418
4419 static void
4420 dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
4421 void *arg)
4422 {
4423 dbuf_remap_impl_callback_arg_t *drica = arg;
4424 objset_t *os = drica->drica_os;
4425 spa_t *spa = dmu_objset_spa(os);
4426 dmu_tx_t *tx = drica->drica_tx;
4427
4428 ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
4429
4430 if (os == spa_meta_objset(spa)) {
4431 spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
4432 } else {
4433 dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
4434 size, drica->drica_blk_birth, tx);
4435 }
4436 }
4437
4438 static void
4439 dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
4440 {
4441 blkptr_t bp_copy = *bp;
4442 spa_t *spa = dmu_objset_spa(dn->dn_objset);
4443 dbuf_remap_impl_callback_arg_t drica;
4444
4445 ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
4446
4447 drica.drica_os = dn->dn_objset;
4448 drica.drica_blk_birth = bp->blk_birth;
4449 drica.drica_tx = tx;
4450 if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
4451 &drica)) {
4452 /*
4453 * If the blkptr being remapped is tracked by a livelist,
4454 * then we need to make sure the livelist reflects the update.
4455 * First, cancel out the old blkptr by appending a 'FREE'
4456 * entry. Next, add an 'ALLOC' to track the new version. This
4457 * way we avoid trying to free an inaccurate blkptr at delete.
4458 * Note that embedded blkptrs are not tracked in livelists.
4459 */
4460 if (dn->dn_objset != spa_meta_objset(spa)) {
4461 dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
4462 if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
4463 bp->blk_birth > ds->ds_dir->dd_origin_txg) {
4464 ASSERT(!BP_IS_EMBEDDED(bp));
4465 ASSERT(dsl_dir_is_clone(ds->ds_dir));
4466 ASSERT(spa_feature_is_enabled(spa,
4467 SPA_FEATURE_LIVELIST));
4468 bplist_append(&ds->ds_dir->dd_pending_frees,
4469 bp);
4470 bplist_append(&ds->ds_dir->dd_pending_allocs,
4471 &bp_copy);
4472 }
4473 }
4474
4475 /*
4476 * The db_rwlock prevents dbuf_read_impl() from
4477 * dereferencing the BP while we are changing it. To
4478 * avoid lock contention, only grab it when we are actually
4479 * changing the BP.
4480 */
4481 if (rw != NULL)
4482 rw_enter(rw, RW_WRITER);
4483 *bp = bp_copy;
4484 if (rw != NULL)
4485 rw_exit(rw);
4486 }
4487 }
4488
4489 /*
4490 * Remap any existing BPs to concrete vdevs, if possible.
4491 */
4492 static void
4493 dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
4494 {
4495 spa_t *spa = dmu_objset_spa(db->db_objset);
4496 ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
4497
4498 if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
4499 return;
4500
4501 if (db->db_level > 0) {
4502 blkptr_t *bp = db->db.db_data;
4503 for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
4504 dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx);
4505 }
4506 } else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
4507 dnode_phys_t *dnp = db->db.db_data;
4508 ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
4509 DMU_OT_DNODE);
4510 for (int i = 0; i < db->db.db_size >> DNODE_SHIFT;
4511 i += dnp[i].dn_extra_slots + 1) {
4512 for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
4513 krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL :
4514 &dn->dn_dbuf->db_rwlock);
4515 dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock,
4516 tx);
4517 }
4518 }
4519 }
4520 }
4521
4522
4523 /* Issue I/O to commit a dirty buffer to disk. */
4524 static void
4525 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
4526 {
4527 dmu_buf_impl_t *db = dr->dr_dbuf;
4528 dnode_t *dn;
4529 objset_t *os;
4530 dmu_buf_impl_t *parent = db->db_parent;
4531 uint64_t txg = tx->tx_txg;
4532 zbookmark_phys_t zb;
4533 zio_prop_t zp;
4534 zio_t *zio;
4535 int wp_flag = 0;
4536
4537 ASSERT(dmu_tx_is_syncing(tx));
4538
4539 DB_DNODE_ENTER(db);
4540 dn = DB_DNODE(db);
4541 os = dn->dn_objset;
4542
4543 if (db->db_state != DB_NOFILL) {
4544 if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
4545 /*
4546 * Private object buffers are released here rather
4547 * than in dbuf_dirty() since they are only modified
4548 * in the syncing context and we don't want the
4549 * overhead of making multiple copies of the data.
4550 */
4551 if (BP_IS_HOLE(db->db_blkptr)) {
4552 arc_buf_thaw(data);
4553 } else {
4554 dbuf_release_bp(db);
4555 }
4556 dbuf_remap(dn, db, tx);
4557 }
4558 }
4559
4560 if (parent != dn->dn_dbuf) {
4561 /* Our parent is an indirect block. */
4562 /* We have a dirty parent that has been scheduled for write. */
4563 ASSERT(parent && parent->db_data_pending);
4564 /* Our parent's buffer is one level closer to the dnode. */
4565 ASSERT(db->db_level == parent->db_level-1);
4566 /*
4567 * We're about to modify our parent's db_data by modifying
4568 * our block pointer, so the parent must be released.
4569 */
4570 ASSERT(arc_released(parent->db_buf));
4571 zio = parent->db_data_pending->dr_zio;
4572 } else {
4573 /* Our parent is the dnode itself. */
4574 ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
4575 db->db_blkid != DMU_SPILL_BLKID) ||
4576 (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
4577 if (db->db_blkid != DMU_SPILL_BLKID)
4578 ASSERT3P(db->db_blkptr, ==,
4579 &dn->dn_phys->dn_blkptr[db->db_blkid]);
4580 zio = dn->dn_zio;
4581 }
4582
4583 ASSERT(db->db_level == 0 || data == db->db_buf);
4584 ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
4585 ASSERT(zio);
4586
4587 SET_BOOKMARK(&zb, os->os_dsl_dataset ?
4588 os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
4589 db->db.db_object, db->db_level, db->db_blkid);
4590
4591 if (db->db_blkid == DMU_SPILL_BLKID)
4592 wp_flag = WP_SPILL;
4593 wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
4594
4595 dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
4596 DB_DNODE_EXIT(db);
4597
4598 /*
4599 * We copy the blkptr now (rather than when we instantiate the dirty
4600 * record), because its value can change between open context and
4601 * syncing context. We do not need to hold dn_struct_rwlock to read
4602 * db_blkptr because we are in syncing context.
4603 */
4604 dr->dr_bp_copy = *db->db_blkptr;
4605
4606 if (db->db_level == 0 &&
4607 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
4608 /*
4609 * The BP for this block has been provided by open context
4610 * (by dmu_sync() or dmu_buf_write_embedded()).
4611 */
4612 abd_t *contents = (data != NULL) ?
4613 abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
4614
4615 dr->dr_zio = zio_write(zio, os->os_spa, txg,
4616 &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size,
4617 &zp, dbuf_write_override_ready, NULL, NULL,
4618 dbuf_write_override_done,
4619 dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
4620 mutex_enter(&db->db_mtx);
4621 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
4622 zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
4623 dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
4624 mutex_exit(&db->db_mtx);
4625 } else if (db->db_state == DB_NOFILL) {
4626 ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
4627 zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
4628 dr->dr_zio = zio_write(zio, os->os_spa, txg,
4629 &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
4630 dbuf_write_nofill_ready, NULL, NULL,
4631 dbuf_write_nofill_done, db,
4632 ZIO_PRIORITY_ASYNC_WRITE,
4633 ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
4634 } else {
4635 ASSERT(arc_released(data));
4636
4637 /*
4638 * For indirect blocks, we want to set up the children
4639 * ready callback so that we can properly handle an indirect
4640 * block that only contains holes.
4641 */
4642 arc_write_done_func_t *children_ready_cb = NULL;
4643 if (db->db_level != 0)
4644 children_ready_cb = dbuf_write_children_ready;
4645
4646 dr->dr_zio = arc_write(zio, os->os_spa, txg,
4647 &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
4648 &zp, dbuf_write_ready,
4649 children_ready_cb, dbuf_write_physdone,
4650 dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
4651 ZIO_FLAG_MUSTSUCCEED, &zb);
4652 }
4653 }
4654
4655 EXPORT_SYMBOL(dbuf_find);
4656 EXPORT_SYMBOL(dbuf_is_metadata);
4657 EXPORT_SYMBOL(dbuf_destroy);
4658 EXPORT_SYMBOL(dbuf_loan_arcbuf);
4659 EXPORT_SYMBOL(dbuf_whichblock);
4660 EXPORT_SYMBOL(dbuf_read);
4661 EXPORT_SYMBOL(dbuf_unoverride);
4662 EXPORT_SYMBOL(dbuf_free_range);
4663 EXPORT_SYMBOL(dbuf_new_size);
4664 EXPORT_SYMBOL(dbuf_release_bp);
4665 EXPORT_SYMBOL(dbuf_dirty);
4666 EXPORT_SYMBOL(dmu_buf_set_crypt_params);
4667 EXPORT_SYMBOL(dmu_buf_will_dirty);
4668 EXPORT_SYMBOL(dmu_buf_is_dirty);
4669 EXPORT_SYMBOL(dmu_buf_will_not_fill);
4670 EXPORT_SYMBOL(dmu_buf_will_fill);
4671 EXPORT_SYMBOL(dmu_buf_fill_done);
4672 EXPORT_SYMBOL(dmu_buf_rele);
4673 EXPORT_SYMBOL(dbuf_assign_arcbuf);
4674 EXPORT_SYMBOL(dbuf_prefetch);
4675 EXPORT_SYMBOL(dbuf_hold_impl);
4676 EXPORT_SYMBOL(dbuf_hold);
4677 EXPORT_SYMBOL(dbuf_hold_level);
4678 EXPORT_SYMBOL(dbuf_create_bonus);
4679 EXPORT_SYMBOL(dbuf_spill_set_blksz);
4680 EXPORT_SYMBOL(dbuf_rm_spill);
4681 EXPORT_SYMBOL(dbuf_add_ref);
4682 EXPORT_SYMBOL(dbuf_rele);
4683 EXPORT_SYMBOL(dbuf_rele_and_unlock);
4684 EXPORT_SYMBOL(dbuf_refcount);
4685 EXPORT_SYMBOL(dbuf_sync_list);
4686 EXPORT_SYMBOL(dmu_buf_set_user);
4687 EXPORT_SYMBOL(dmu_buf_set_user_ie);
4688 EXPORT_SYMBOL(dmu_buf_get_user);
4689 EXPORT_SYMBOL(dmu_buf_get_blkptr);
4690
4691 /* BEGIN CSTYLED */
4692 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW,
4693 "Maximum size in bytes of the dbuf cache.");
4694
4695 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
4696 "Percentage over dbuf_cache_max_bytes when dbufs must be evicted "
4697 "directly.");
4698
4699 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
4700 "Percentage below dbuf_cache_max_bytes when the evict thread stops "
4701 "evicting dbufs.");
4702
4703 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW,
4704 "Maximum size in bytes of the dbuf metadata cache.");
4705
4706 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW,
4707 "Set the size of the dbuf cache to a log2 fraction of arc size.");
4708
4709 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW,
4710 "Set the size of the dbuf metadata cache to a log2 fraction of arc "
4711 "size.");
4712 /* END CSTYLED */