module/zfs/zil.c (mirror_zfs.git)
ZIL: Remove 128K into 2x68K LWB split optimization
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
24 * Copyright (c) 2014 Integros [integros.com]
25 * Copyright (c) 2018 Datto Inc.
26 */
27
28 /* Portions Copyright 2010 Robert Milkowski */
29
30 #include <sys/zfs_context.h>
31 #include <sys/spa.h>
32 #include <sys/spa_impl.h>
33 #include <sys/dmu.h>
34 #include <sys/zap.h>
35 #include <sys/arc.h>
36 #include <sys/stat.h>
37 #include <sys/zil.h>
38 #include <sys/zil_impl.h>
39 #include <sys/dsl_dataset.h>
40 #include <sys/vdev_impl.h>
41 #include <sys/dmu_tx.h>
42 #include <sys/dsl_pool.h>
43 #include <sys/metaslab.h>
44 #include <sys/trace_zfs.h>
45 #include <sys/abd.h>
46 #include <sys/brt.h>
47 #include <sys/wmsum.h>
48
49 /*
50 * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
51 * calls that change the file system. Each itx has enough information
52 * to be replayed after a system crash, power loss, or equivalent
53 * failure mode. These are stored in memory until either:
54 *
55 * 1. they are committed to the pool by the DMU transaction group
56 * (txg), at which point they can be discarded; or
57 * 2. they are committed to the on-disk ZIL for the dataset being
58 * modified (e.g. due to an fsync, O_DSYNC, or other synchronous
59 * requirement).
60 *
61 * In the event of a crash or power loss, the itxs contained by each
62 * dataset's on-disk ZIL will be replayed when that dataset is first
63 * instantiated (e.g. if the dataset is a normal filesystem, when it is
64 * first mounted).
65 *
66 * As hinted at above, there is one ZIL per dataset (both the in-memory
67 * representation, and the on-disk representation). The on-disk format
68 * consists of 3 parts:
69 *
70 * - a single, per-dataset, ZIL header; which points to a chain of
71 * - zero or more ZIL blocks; each of which contains
72 * - zero or more ZIL records
73 *
74 * A ZIL record holds the information necessary to replay a single
75 * system call transaction. A ZIL block can hold many ZIL records, and
76 * the blocks are chained together, similarly to a singly linked list.
77 *
78 * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
79 * block in the chain, and the ZIL header points to the first block in
80 * the chain.
81 *
82 * Note, there is not a fixed place in the pool to hold these ZIL
83 * blocks; they are dynamically allocated and freed as needed from the
84 * blocks available on the pool, though they can be preferentially
85 * allocated from a dedicated "log" vdev.
86 */
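
/*
 * Illustrative sketch only (hypothetical pseudo-code, not compiled): the
 * essential shape of a log-chain traversal.  The authoritative walk, with
 * validation, claiming, and sequence-number checks, is zil_parse() below.
 *
 *	blkptr_t bp = zh->zh_log;
 *	while (!BP_IS_HOLE(&bp)) {
 *		(read the log block at bp; verify its embedded cksum/seq)
 *		(iterate the lr_t records packed within it)
 *		bp = zilc->zc_next_blk;	(from the block's zil_chain_t)
 *	}
 */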
87
88 /*
89 * This controls the amount of time that a ZIL block (lwb) will remain
90 * "open" when it isn't "full", and it has a thread waiting for it to be
91 * committed to stable storage. Please refer to the zil_commit_waiter()
92 * function (and the comments within it) for more details.
93 */
94 static uint_t zfs_commit_timeout_pct = 10;
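
/*
 * A minimal sketch of how the percentage is applied, based on the moving
 * average of lwb latency maintained in zil_lwb_flush_vdevs_done() (see
 * zil_commit_waiter() for the authoritative logic):
 *
 *	sleep = (zilog->zl_last_lwb_latency * zfs_commit_timeout_pct) / 100;
 *
 * i.e. with the default of 10, a waiter gives an open lwb roughly 10% of
 * the recently observed lwb latency before forcing it to be issued.
 */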
95
96 /*
97 * See zil.h for more information about these fields.
98 */
99 static zil_kstat_values_t zil_stats = {
100 { "zil_commit_count", KSTAT_DATA_UINT64 },
101 { "zil_commit_writer_count", KSTAT_DATA_UINT64 },
102 { "zil_itx_count", KSTAT_DATA_UINT64 },
103 { "zil_itx_indirect_count", KSTAT_DATA_UINT64 },
104 { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 },
105 { "zil_itx_copied_count", KSTAT_DATA_UINT64 },
106 { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 },
107 { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 },
108 { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 },
109 { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 },
110 { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 },
111 { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 },
112 { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 },
113 { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 },
114 { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 },
115 { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 },
116 { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 },
117 };
118
119 static zil_sums_t zil_sums_global;
120 static kstat_t *zil_kstats_global;
121
122 /*
123 * Disable intent logging replay. This global ZIL switch affects all pools.
124 */
125 int zil_replay_disable = 0;
126
127 /*
128 * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
129 * the disk(s) by the ZIL after an LWB write has completed. Setting this
130 * will cause ZIL corruption on power loss if a volatile out-of-order
131 * write cache is enabled.
132 */
133 static int zil_nocacheflush = 0;
134
135 /*
136 * Limit SLOG write size per commit executed with synchronous priority.
137 * Any writes above that limit will be executed with lower (asynchronous)
138 * priority, to limit potential SLOG device abuse by a single ZIL writer.
139 */
140 static uint64_t zil_slog_bulk = 64 * 1024 * 1024;
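
/*
 * Sketch of the intended effect when an lwb write is issued (the
 * authoritative check lives in zil_lwb_write_issue()):
 *
 *	prio = (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) ?
 *	    ZIO_PRIORITY_SYNC_WRITE : ZIO_PRIORITY_ASYNC_WRITE;
 *
 * i.e. only SLOG writes beyond the bulk limit are demoted to asynchronous
 * priority; main-pool lwb writes always keep synchronous priority.
 */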
141
142 static kmem_cache_t *zil_lwb_cache;
143 static kmem_cache_t *zil_zcw_cache;
144
145 static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
146 static itx_t *zil_itx_clone(itx_t *oitx);
147
148 static int
149 zil_bp_compare(const void *x1, const void *x2)
150 {
151 const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
152 const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
153
154 int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
155 if (likely(cmp))
156 return (cmp);
157
158 return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
159 }
160
161 static void
162 zil_bp_tree_init(zilog_t *zilog)
163 {
164 avl_create(&zilog->zl_bp_tree, zil_bp_compare,
165 sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
166 }
167
168 static void
169 zil_bp_tree_fini(zilog_t *zilog)
170 {
171 avl_tree_t *t = &zilog->zl_bp_tree;
172 zil_bp_node_t *zn;
173 void *cookie = NULL;
174
175 while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
176 kmem_free(zn, sizeof (zil_bp_node_t));
177
178 avl_destroy(t);
179 }
180
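/*
 * Add the bp's identity DVA to the tree; returns EEXIST if it was
 * already present.  Callers rely on this for process-once semantics,
 * e.g. zil_claim_log_block() skips blocks that were already claimed.
 */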
181 int
182 zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
183 {
184 avl_tree_t *t = &zilog->zl_bp_tree;
185 const dva_t *dva;
186 zil_bp_node_t *zn;
187 avl_index_t where;
188
189 if (BP_IS_EMBEDDED(bp))
190 return (0);
191
192 dva = BP_IDENTITY(bp);
193
194 if (avl_find(t, dva, &where) != NULL)
195 return (SET_ERROR(EEXIST));
196
197 zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
198 zn->zn_dva = *dva;
199 avl_insert(t, zn, where);
200
201 return (0);
202 }
203
204 static zil_header_t *
205 zil_header_in_syncing_context(zilog_t *zilog)
206 {
207 return ((zil_header_t *)zilog->zl_header);
208 }
209
210 static void
211 zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
212 {
213 zio_cksum_t *zc = &bp->blk_cksum;
214
215 (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_0],
216 sizeof (zc->zc_word[ZIL_ZC_GUID_0]));
217 (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_1],
218 sizeof (zc->zc_word[ZIL_ZC_GUID_1]));
219 zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
220 zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
221 }
222
223 static int
224 zil_kstats_global_update(kstat_t *ksp, int rw)
225 {
226 zil_kstat_values_t *zs = ksp->ks_data;
227 ASSERT3P(&zil_stats, ==, zs);
228
229 if (rw == KSTAT_WRITE) {
230 return (SET_ERROR(EACCES));
231 }
232
233 zil_kstat_values_update(zs, &zil_sums_global);
234
235 return (0);
236 }
237
238 /*
239 * Read a log block and make sure it's valid.
240 */
241 static int
242 zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
243 blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf)
244 {
245 zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
246 arc_flags_t aflags = ARC_FLAG_WAIT;
247 zbookmark_phys_t zb;
248 int error;
249
250 if (zilog->zl_header->zh_claim_txg == 0)
251 zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
252
253 if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
254 zio_flags |= ZIO_FLAG_SPECULATIVE;
255
256 if (!decrypt)
257 zio_flags |= ZIO_FLAG_RAW;
258
259 SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
260 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
261
262 error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
263 abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
264
265 if (error == 0) {
266 zio_cksum_t cksum = bp->blk_cksum;
267
268 /*
269 * Validate the checksummed log block.
270 *
271 * Sequence numbers should be... sequential. The checksum
272 * verifier for the next block should be bp's checksum plus 1.
273 *
274 * Also check the log chain linkage and size used.
275 */
276 cksum.zc_word[ZIL_ZC_SEQ]++;
277
278 uint64_t size = BP_GET_LSIZE(bp);
279 if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
280 zil_chain_t *zilc = (*abuf)->b_data;
281 char *lr = (char *)(zilc + 1);
282
283 if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
284 sizeof (cksum)) ||
285 zilc->zc_nused < sizeof (*zilc) ||
286 zilc->zc_nused > size) {
287 error = SET_ERROR(ECKSUM);
288 } else {
289 *begin = lr;
290 *end = lr + zilc->zc_nused - sizeof (*zilc);
291 *nbp = zilc->zc_next_blk;
292 }
293 } else {
294 char *lr = (*abuf)->b_data;
295 zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
296
297 if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
298 sizeof (cksum)) ||
299 (zilc->zc_nused > (size - sizeof (*zilc)))) {
300 error = SET_ERROR(ECKSUM);
301 } else {
302 *begin = lr;
303 *end = lr + zilc->zc_nused;
304 *nbp = zilc->zc_next_blk;
305 }
306 }
307 }
308
309 return (error);
310 }
311
312 /*
313 * Read a TX_WRITE log data block.
314 */
315 static int
316 zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
317 {
318 zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
319 const blkptr_t *bp = &lr->lr_blkptr;
320 arc_flags_t aflags = ARC_FLAG_WAIT;
321 arc_buf_t *abuf = NULL;
322 zbookmark_phys_t zb;
323 int error;
324
325 if (BP_IS_HOLE(bp)) {
326 if (wbuf != NULL)
327 memset(wbuf, 0, MAX(BP_GET_LSIZE(bp), lr->lr_length));
328 return (0);
329 }
330
331 if (zilog->zl_header->zh_claim_txg == 0)
332 zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
333
334 /*
335 * If we are not using the resulting data, we are just checking that
336 * it hasn't been corrupted so we don't need to waste CPU time
337 * decompressing and decrypting it.
338 */
339 if (wbuf == NULL)
340 zio_flags |= ZIO_FLAG_RAW;
341
342 ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
343 SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
344 ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
345
346 error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
347 ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
348
349 if (error == 0) {
350 if (wbuf != NULL)
351 memcpy(wbuf, abuf->b_data, arc_buf_size(abuf));
352 arc_buf_destroy(abuf, &abuf);
353 }
354
355 return (error);
356 }
357
358 void
359 zil_sums_init(zil_sums_t *zs)
360 {
361 wmsum_init(&zs->zil_commit_count, 0);
362 wmsum_init(&zs->zil_commit_writer_count, 0);
363 wmsum_init(&zs->zil_itx_count, 0);
364 wmsum_init(&zs->zil_itx_indirect_count, 0);
365 wmsum_init(&zs->zil_itx_indirect_bytes, 0);
366 wmsum_init(&zs->zil_itx_copied_count, 0);
367 wmsum_init(&zs->zil_itx_copied_bytes, 0);
368 wmsum_init(&zs->zil_itx_needcopy_count, 0);
369 wmsum_init(&zs->zil_itx_needcopy_bytes, 0);
370 wmsum_init(&zs->zil_itx_metaslab_normal_count, 0);
371 wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0);
372 wmsum_init(&zs->zil_itx_metaslab_normal_write, 0);
373 wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0);
374 wmsum_init(&zs->zil_itx_metaslab_slog_count, 0);
375 wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0);
376 wmsum_init(&zs->zil_itx_metaslab_slog_write, 0);
377 wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0);
378 }
379
380 void
381 zil_sums_fini(zil_sums_t *zs)
382 {
383 wmsum_fini(&zs->zil_commit_count);
384 wmsum_fini(&zs->zil_commit_writer_count);
385 wmsum_fini(&zs->zil_itx_count);
386 wmsum_fini(&zs->zil_itx_indirect_count);
387 wmsum_fini(&zs->zil_itx_indirect_bytes);
388 wmsum_fini(&zs->zil_itx_copied_count);
389 wmsum_fini(&zs->zil_itx_copied_bytes);
390 wmsum_fini(&zs->zil_itx_needcopy_count);
391 wmsum_fini(&zs->zil_itx_needcopy_bytes);
392 wmsum_fini(&zs->zil_itx_metaslab_normal_count);
393 wmsum_fini(&zs->zil_itx_metaslab_normal_bytes);
394 wmsum_fini(&zs->zil_itx_metaslab_normal_write);
395 wmsum_fini(&zs->zil_itx_metaslab_normal_alloc);
396 wmsum_fini(&zs->zil_itx_metaslab_slog_count);
397 wmsum_fini(&zs->zil_itx_metaslab_slog_bytes);
398 wmsum_fini(&zs->zil_itx_metaslab_slog_write);
399 wmsum_fini(&zs->zil_itx_metaslab_slog_alloc);
400 }
401
402 void
403 zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums)
404 {
405 zs->zil_commit_count.value.ui64 =
406 wmsum_value(&zil_sums->zil_commit_count);
407 zs->zil_commit_writer_count.value.ui64 =
408 wmsum_value(&zil_sums->zil_commit_writer_count);
409 zs->zil_itx_count.value.ui64 =
410 wmsum_value(&zil_sums->zil_itx_count);
411 zs->zil_itx_indirect_count.value.ui64 =
412 wmsum_value(&zil_sums->zil_itx_indirect_count);
413 zs->zil_itx_indirect_bytes.value.ui64 =
414 wmsum_value(&zil_sums->zil_itx_indirect_bytes);
415 zs->zil_itx_copied_count.value.ui64 =
416 wmsum_value(&zil_sums->zil_itx_copied_count);
417 zs->zil_itx_copied_bytes.value.ui64 =
418 wmsum_value(&zil_sums->zil_itx_copied_bytes);
419 zs->zil_itx_needcopy_count.value.ui64 =
420 wmsum_value(&zil_sums->zil_itx_needcopy_count);
421 zs->zil_itx_needcopy_bytes.value.ui64 =
422 wmsum_value(&zil_sums->zil_itx_needcopy_bytes);
423 zs->zil_itx_metaslab_normal_count.value.ui64 =
424 wmsum_value(&zil_sums->zil_itx_metaslab_normal_count);
425 zs->zil_itx_metaslab_normal_bytes.value.ui64 =
426 wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes);
427 zs->zil_itx_metaslab_normal_write.value.ui64 =
428 wmsum_value(&zil_sums->zil_itx_metaslab_normal_write);
429 zs->zil_itx_metaslab_normal_alloc.value.ui64 =
430 wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc);
431 zs->zil_itx_metaslab_slog_count.value.ui64 =
432 wmsum_value(&zil_sums->zil_itx_metaslab_slog_count);
433 zs->zil_itx_metaslab_slog_bytes.value.ui64 =
434 wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes);
435 zs->zil_itx_metaslab_slog_write.value.ui64 =
436 wmsum_value(&zil_sums->zil_itx_metaslab_slog_write);
437 zs->zil_itx_metaslab_slog_alloc.value.ui64 =
438 wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc);
439 }
440
441 /*
442 * Parse the intent log, and call parse_func for each valid record within.
443 */
444 int
445 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
446 zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg,
447 boolean_t decrypt)
448 {
449 const zil_header_t *zh = zilog->zl_header;
450 boolean_t claimed = !!zh->zh_claim_txg;
451 uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
452 uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
453 uint64_t max_blk_seq = 0;
454 uint64_t max_lr_seq = 0;
455 uint64_t blk_count = 0;
456 uint64_t lr_count = 0;
457 blkptr_t blk, next_blk = {{{{0}}}};
458 int error = 0;
459
460 /*
461 * Old logs didn't record the maximum zh_claim_lr_seq.
462 */
463 if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
464 claim_lr_seq = UINT64_MAX;
465
466 /*
467 * Starting at the block pointed to by zh_log we read the log chain.
468 * For each block in the chain we strongly check that block to
469 * ensure its validity. We stop when an invalid block is found.
470 * For each block pointer in the chain we call parse_blk_func().
471 * For each record in each valid block we call parse_lr_func().
472 * If the log has been claimed, stop if we encounter a sequence
473 * number greater than the highest claimed sequence number.
474 */
475 zil_bp_tree_init(zilog);
476
477 for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
478 uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
479 int reclen;
480 char *lrp, *end;
481 arc_buf_t *abuf = NULL;
482
483 if (blk_seq > claim_blk_seq)
484 break;
485
486 error = parse_blk_func(zilog, &blk, arg, txg);
487 if (error != 0)
488 break;
489 ASSERT3U(max_blk_seq, <, blk_seq);
490 max_blk_seq = blk_seq;
491 blk_count++;
492
493 if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
494 break;
495
496 error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
497 &lrp, &end, &abuf);
498 if (error != 0) {
499 if (abuf)
500 arc_buf_destroy(abuf, &abuf);
501 if (claimed) {
502 char name[ZFS_MAX_DATASET_NAME_LEN];
503
504 dmu_objset_name(zilog->zl_os, name);
505
506 cmn_err(CE_WARN, "ZFS read log block error %d, "
507 "dataset %s, seq 0x%llx\n", error, name,
508 (u_longlong_t)blk_seq);
509 }
510 break;
511 }
512
513 for (; lrp < end; lrp += reclen) {
514 lr_t *lr = (lr_t *)lrp;
515 reclen = lr->lrc_reclen;
516 ASSERT3U(reclen, >=, sizeof (lr_t));
517 ASSERT3U(reclen, <=, end - lrp);
518 if (lr->lrc_seq > claim_lr_seq) {
519 arc_buf_destroy(abuf, &abuf);
520 goto done;
521 }
522
523 error = parse_lr_func(zilog, lr, arg, txg);
524 if (error != 0) {
525 arc_buf_destroy(abuf, &abuf);
526 goto done;
527 }
528 ASSERT3U(max_lr_seq, <, lr->lrc_seq);
529 max_lr_seq = lr->lrc_seq;
530 lr_count++;
531 }
532 arc_buf_destroy(abuf, &abuf);
533 }
534 done:
535 zilog->zl_parse_error = error;
536 zilog->zl_parse_blk_seq = max_blk_seq;
537 zilog->zl_parse_lr_seq = max_lr_seq;
538 zilog->zl_parse_blk_count = blk_count;
539 zilog->zl_parse_lr_count = lr_count;
540
541 zil_bp_tree_fini(zilog);
542
543 return (error);
544 }
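
/*
 * Example usage, as in zil_claim() below: claim every block and record
 * reachable from the header, starting at the pool's minimum claim txg:
 *
 *	(void) zil_parse(zilog, zil_claim_log_block,
 *	    zil_claim_log_record, tx, first_txg, B_FALSE);
 */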
545
546 static int
547 zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
548 uint64_t first_txg)
549 {
550 (void) tx;
551 ASSERT(!BP_IS_HOLE(bp));
552
553 /*
554 * As we call this function from the context of a rewind to a
555 * checkpoint, each ZIL block whose txg is later than the txg
556 * that we rewind to is invalid. Thus, we return -1 so
557 * zil_parse() doesn't attempt to read it.
558 */
559 if (bp->blk_birth >= first_txg)
560 return (-1);
561
562 if (zil_bp_tree_add(zilog, bp) != 0)
563 return (0);
564
565 zio_free(zilog->zl_spa, first_txg, bp);
566 return (0);
567 }
568
569 static int
570 zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
571 uint64_t first_txg)
572 {
573 (void) zilog, (void) lrc, (void) tx, (void) first_txg;
574 return (0);
575 }
576
577 static int
578 zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
579 uint64_t first_txg)
580 {
581 /*
582 * Claim log block if not already committed and not already claimed.
583 * If tx == NULL, just verify that the block is claimable.
584 */
585 if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
586 zil_bp_tree_add(zilog, bp) != 0)
587 return (0);
588
589 return (zio_wait(zio_claim(NULL, zilog->zl_spa,
590 tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
591 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
592 }
593
594 static int
595 zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
596 {
597 lr_write_t *lr = (lr_write_t *)lrc;
598 int error;
599
600 ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
601
602 /*
603 * If the block is not readable, don't claim it. This can happen
604 * in normal operation when a log block is written to disk before
605 * some of the dmu_sync() blocks it points to. In this case, the
606 * transaction cannot have been committed to anyone (we would have
607 * waited for all writes to be stable first), so it is semantically
608 * correct to declare this the end of the log.
609 */
610 if (lr->lr_blkptr.blk_birth >= first_txg) {
611 error = zil_read_log_data(zilog, lr, NULL);
612 if (error != 0)
613 return (error);
614 }
615
616 return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
617 }
618
619 static int
620 zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
621 uint64_t first_txg)
622 {
623 const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
624 const blkptr_t *bp;
625 spa_t *spa = zilog->zl_spa;
626 uint_t ii;
627
628 ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
629 ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
630 lr_bps[lr->lr_nbps]));
631
632 if (tx == NULL) {
633 return (0);
634 }
635
636 /*
637 * XXX: Do we need to byteswap lr?
638 */
639
640 for (ii = 0; ii < lr->lr_nbps; ii++) {
641 bp = &lr->lr_bps[ii];
642
643 /*
644 * When data is embedded into the BP there is no need to create
645 * BRT entry as there is no data block. Just copy the BP as it
646 * contains the data.
647 */
648 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
649 continue;
650
651 /*
652 * We can not handle block pointers from the future, since they
653 * are not yet allocated. It should not normally happen, but
654 * just in case lets be safe and just stop here now instead of
655 * corrupting the pool.
656 */
657 if (BP_PHYSICAL_BIRTH(bp) >= first_txg)
658 return (SET_ERROR(ENOENT));
659
660 /*
661 * Assert the block is really allocated before we reference it.
662 */
663 metaslab_check_free(spa, bp);
664 }
665
666 for (ii = 0; ii < lr->lr_nbps; ii++) {
667 bp = &lr->lr_bps[ii];
668 if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp))
669 brt_pending_add(spa, bp, tx);
670 }
671
672 return (0);
673 }
674
675 static int
676 zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
677 uint64_t first_txg)
678 {
679
680 switch (lrc->lrc_txtype) {
681 case TX_WRITE:
682 return (zil_claim_write(zilog, lrc, tx, first_txg));
683 case TX_CLONE_RANGE:
684 return (zil_claim_clone_range(zilog, lrc, tx, first_txg));
685 default:
686 return (0);
687 }
688 }
689
690 static int
691 zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
692 uint64_t claim_txg)
693 {
694 (void) claim_txg;
695
696 zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
697
698 return (0);
699 }
700
701 static int
702 zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
703 {
704 lr_write_t *lr = (lr_write_t *)lrc;
705 blkptr_t *bp = &lr->lr_blkptr;
706
707 ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
708
709 /*
710 * If we previously claimed it, we need to free it.
711 */
712 if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
713 !BP_IS_HOLE(bp)) {
714 zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
715 }
716
717 return (0);
718 }
719
720 static int
721 zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
722 {
723 const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
724 const blkptr_t *bp;
725 spa_t *spa;
726 uint_t ii;
727
728 ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
729 ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
730 lr_bps[lr->lr_nbps]));
731
732 if (tx == NULL) {
733 return (0);
734 }
735
736 spa = zilog->zl_spa;
737
738 for (ii = 0; ii < lr->lr_nbps; ii++) {
739 bp = &lr->lr_bps[ii];
740
741 if (!BP_IS_HOLE(bp)) {
742 zio_free(spa, dmu_tx_get_txg(tx), bp);
743 }
744 }
745
746 return (0);
747 }
748
749 static int
750 zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
751 uint64_t claim_txg)
752 {
753
754 if (claim_txg == 0) {
755 return (0);
756 }
757
758 switch (lrc->lrc_txtype) {
759 case TX_WRITE:
760 return (zil_free_write(zilog, lrc, tx, claim_txg));
761 case TX_CLONE_RANGE:
762 return (zil_free_clone_range(zilog, lrc, tx));
763 default:
764 return (0);
765 }
766 }
767
768 static int
769 zil_lwb_vdev_compare(const void *x1, const void *x2)
770 {
771 const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
772 const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
773
774 return (TREE_CMP(v1, v2));
775 }
776
777 /*
778 * Allocate a new lwb. We may already have a block pointer for it, in which
779 * case we get size and version from there. Or we may not yet, in which case
780 * we choose them here and later make the block allocation match.
781 */
782 static lwb_t *
783 zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
784 uint64_t txg, lwb_state_t state)
785 {
786 lwb_t *lwb;
787
788 lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
789 lwb->lwb_zilog = zilog;
790 if (bp) {
791 lwb->lwb_blk = *bp;
792 lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2);
793 sz = BP_GET_LSIZE(bp);
794 } else {
795 BP_ZERO(&lwb->lwb_blk);
796 lwb->lwb_slim = (spa_version(zilog->zl_spa) >=
797 SPA_VERSION_SLIM_ZIL);
798 }
799 lwb->lwb_slog = slog;
800 lwb->lwb_error = 0;
801 if (lwb->lwb_slim) {
802 lwb->lwb_nmax = sz;
803 lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
804 } else {
805 lwb->lwb_nmax = sz - sizeof (zil_chain_t);
806 lwb->lwb_nused = lwb->lwb_nfilled = 0;
807 }
808 lwb->lwb_sz = sz;
809 lwb->lwb_state = state;
810 lwb->lwb_buf = zio_buf_alloc(sz);
811 lwb->lwb_child_zio = NULL;
812 lwb->lwb_write_zio = NULL;
813 lwb->lwb_root_zio = NULL;
814 lwb->lwb_issued_timestamp = 0;
815 lwb->lwb_issued_txg = 0;
816 lwb->lwb_alloc_txg = txg;
817 lwb->lwb_max_txg = 0;
818
819 mutex_enter(&zilog->zl_lock);
820 list_insert_tail(&zilog->zl_lwb_list, lwb);
821 if (state != LWB_STATE_NEW)
822 zilog->zl_last_lwb_opened = lwb;
823 mutex_exit(&zilog->zl_lock);
824
825 return (lwb);
826 }
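
/*
 * The two block layouts handled above, for a block of size "sz" (sketch):
 *
 *	slim (ZIO_CHECKSUM_ZILOG2):	[ zil_chain_t | records ... ]
 *	    lwb_nmax = sz; lwb_nused starts at sizeof (zil_chain_t)
 *	classic:			[ records ... | zil_chain_t ]
 *	    lwb_nmax = sz - sizeof (zil_chain_t); lwb_nused starts at 0
 */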
827
828 static void
829 zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
830 {
831 ASSERT(MUTEX_HELD(&zilog->zl_lock));
832 ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
833 lwb->lwb_state == LWB_STATE_FLUSH_DONE);
834 ASSERT3P(lwb->lwb_child_zio, ==, NULL);
835 ASSERT3P(lwb->lwb_write_zio, ==, NULL);
836 ASSERT3P(lwb->lwb_root_zio, ==, NULL);
837 ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa));
838 ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
839 VERIFY(list_is_empty(&lwb->lwb_itxs));
840 VERIFY(list_is_empty(&lwb->lwb_waiters));
841 ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
842 ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
843
844 /*
845 * Clear the zilog's field to indicate this lwb is no longer
846 * valid, and prevent use-after-free errors.
847 */
848 if (zilog->zl_last_lwb_opened == lwb)
849 zilog->zl_last_lwb_opened = NULL;
850
851 kmem_cache_free(zil_lwb_cache, lwb);
852 }
853
854 /*
855 * Called when we create in-memory log transactions so that we know
856 * to cleanup the itxs at the end of spa_sync().
857 */
858 static void
859 zilog_dirty(zilog_t *zilog, uint64_t txg)
860 {
861 dsl_pool_t *dp = zilog->zl_dmu_pool;
862 dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
863
864 ASSERT(spa_writeable(zilog->zl_spa));
865
866 if (ds->ds_is_snapshot)
867 panic("dirtying snapshot!");
868
869 if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
870 /* up the hold count until we can be written out */
871 dmu_buf_add_ref(ds->ds_dbuf, zilog);
872
873 zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
874 }
875 }
876
877 /*
878 * Determine if the zil is dirty in the specified txg. Callers wanting to
879 * ensure that the dirty state does not change must hold the itxg_lock for
880 * the specified txg. Holding the lock will ensure that the zil cannot be
881 * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
882 * state.
883 */
884 static boolean_t __maybe_unused
885 zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
886 {
887 dsl_pool_t *dp = zilog->zl_dmu_pool;
888
889 if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
890 return (B_TRUE);
891 return (B_FALSE);
892 }
893
894 /*
895 * Determine if the zil is dirty. The zil is considered dirty if it has
896 * any pending itx records that have not been cleaned by zil_clean().
897 */
898 static boolean_t
899 zilog_is_dirty(zilog_t *zilog)
900 {
901 dsl_pool_t *dp = zilog->zl_dmu_pool;
902
903 for (int t = 0; t < TXG_SIZE; t++) {
904 if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
905 return (B_TRUE);
906 }
907 return (B_FALSE);
908 }
909
910 /*
911 * This is called in zil_commit context (zil_process_commit_list()/
912 * zil_create()). It activates the SPA_FEATURE_ZILSAXATTR feature, if
913 * it's enabled. We check dsl_dataset_feature_is_active() to avoid
914 * txg_wait_synced() on every zil_commit.
915 */
916 static void
917 zil_commit_activate_saxattr_feature(zilog_t *zilog)
918 {
919 dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
920 uint64_t txg = 0;
921 dmu_tx_t *tx = NULL;
922
923 if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) &&
924 dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL &&
925 !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) {
926 tx = dmu_tx_create(zilog->zl_os);
927 VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
928 dsl_dataset_dirty(ds, tx);
929 txg = dmu_tx_get_txg(tx);
930
931 mutex_enter(&ds->ds_lock);
932 ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] =
933 (void *)B_TRUE;
934 mutex_exit(&ds->ds_lock);
935 dmu_tx_commit(tx);
936 txg_wait_synced(zilog->zl_dmu_pool, txg);
937 }
938 }
939
940 /*
941 * Create an on-disk intent log.
942 */
943 static lwb_t *
944 zil_create(zilog_t *zilog)
945 {
946 const zil_header_t *zh = zilog->zl_header;
947 lwb_t *lwb = NULL;
948 uint64_t txg = 0;
949 dmu_tx_t *tx = NULL;
950 blkptr_t blk;
951 int error = 0;
952 boolean_t slog = FALSE;
953 dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
954
955
956 /*
957 * Wait for any previous destroy to complete.
958 */
959 txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
960
961 ASSERT(zh->zh_claim_txg == 0);
962 ASSERT(zh->zh_replay_seq == 0);
963
964 blk = zh->zh_log;
965
966 /*
967 * Allocate an initial log block if:
968 * - there isn't one already
969 * - the existing block is the wrong endianness
970 */
971 if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
972 tx = dmu_tx_create(zilog->zl_os);
973 VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
974 dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
975 txg = dmu_tx_get_txg(tx);
976
977 if (!BP_IS_HOLE(&blk)) {
978 zio_free(zilog->zl_spa, txg, &blk);
979 BP_ZERO(&blk);
980 }
981
982 error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
983 ZIL_MIN_BLKSZ, &slog);
984 if (error == 0)
985 zil_init_log_chain(zilog, &blk);
986 }
987
988 /*
989 * Allocate a log write block (lwb) for the first log block.
990 */
991 if (error == 0)
992 lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW);
993
994 /*
995 * If we just allocated the first log block, commit our transaction
996 * and wait for zil_sync() to stuff the block pointer into zh_log.
997 * (zh is part of the MOS, so we cannot modify it in open context.)
998 */
999 if (tx != NULL) {
1000 /*
1001 * If "zilsaxattr" feature is enabled on zpool, then activate
1002 * it now when we're creating the ZIL chain. We can't wait with
1003 * this until we write the first xattr log record because we
1004 * need to wait for the feature activation to sync out.
1005 */
1006 if (spa_feature_is_enabled(zilog->zl_spa,
1007 SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) !=
1008 DMU_OST_ZVOL) {
1009 mutex_enter(&ds->ds_lock);
1010 ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] =
1011 (void *)B_TRUE;
1012 mutex_exit(&ds->ds_lock);
1013 }
1014
1015 dmu_tx_commit(tx);
1016 txg_wait_synced(zilog->zl_dmu_pool, txg);
1017 } else {
1018 /*
1019 * This branch covers the case where we enable the feature on a
1020 * zpool that has existing ZIL headers.
1021 */
1022 zil_commit_activate_saxattr_feature(zilog);
1023 }
1024 IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) &&
1025 dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL,
1026 dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR));
1027
1028 ASSERT(error != 0 || memcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
1029 IMPLY(error == 0, lwb != NULL);
1030
1031 return (lwb);
1032 }
1033
1034 /*
1035 * In one tx, free all log blocks and clear the log header. If keep_first
1036 * is set, then we're replaying a log with no content. We want to keep the
1037 * first block, however, so that the first synchronous transaction doesn't
1038 * require a txg_wait_synced() in zil_create(). We don't need to
1039 * txg_wait_synced() here either when keep_first is set, because both
1040 * zil_create() and zil_destroy() will wait for any in-progress destroys
1041 * to complete.
1042 * Return B_TRUE if there were any entries to replay.
1043 */
1044 boolean_t
1045 zil_destroy(zilog_t *zilog, boolean_t keep_first)
1046 {
1047 const zil_header_t *zh = zilog->zl_header;
1048 lwb_t *lwb;
1049 dmu_tx_t *tx;
1050 uint64_t txg;
1051
1052 /*
1053 * Wait for any previous destroy to complete.
1054 */
1055 txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
1056
1057 zilog->zl_old_header = *zh; /* debugging aid */
1058
1059 if (BP_IS_HOLE(&zh->zh_log))
1060 return (B_FALSE);
1061
1062 tx = dmu_tx_create(zilog->zl_os);
1063 VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
1064 dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
1065 txg = dmu_tx_get_txg(tx);
1066
1067 mutex_enter(&zilog->zl_lock);
1068
1069 ASSERT3U(zilog->zl_destroy_txg, <, txg);
1070 zilog->zl_destroy_txg = txg;
1071 zilog->zl_keep_first = keep_first;
1072
1073 if (!list_is_empty(&zilog->zl_lwb_list)) {
1074 ASSERT(zh->zh_claim_txg == 0);
1075 VERIFY(!keep_first);
1076 while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) {
1077 if (lwb->lwb_buf != NULL)
1078 zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
1079 if (!BP_IS_HOLE(&lwb->lwb_blk))
1080 zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
1081 zil_free_lwb(zilog, lwb);
1082 }
1083 } else if (!keep_first) {
1084 zil_destroy_sync(zilog, tx);
1085 }
1086 mutex_exit(&zilog->zl_lock);
1087
1088 dmu_tx_commit(tx);
1089
1090 return (B_TRUE);
1091 }
1092
1093 void
1094 zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
1095 {
1096 ASSERT(list_is_empty(&zilog->zl_lwb_list));
1097 (void) zil_parse(zilog, zil_free_log_block,
1098 zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE);
1099 }
1100
1101 int
1102 zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
1103 {
1104 dmu_tx_t *tx = txarg;
1105 zilog_t *zilog;
1106 uint64_t first_txg;
1107 zil_header_t *zh;
1108 objset_t *os;
1109 int error;
1110
1111 error = dmu_objset_own_obj(dp, ds->ds_object,
1112 DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os);
1113 if (error != 0) {
1114 /*
1115 * EBUSY indicates that the objset is inconsistent, in which
1116 * case it cannot have a ZIL.
1117 */
1118 if (error != EBUSY) {
1119 cmn_err(CE_WARN, "can't open objset for %llu, error %u",
1120 (unsigned long long)ds->ds_object, error);
1121 }
1122
1123 return (0);
1124 }
1125
1126 zilog = dmu_objset_zil(os);
1127 zh = zil_header_in_syncing_context(zilog);
1128 ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
1129 first_txg = spa_min_claim_txg(zilog->zl_spa);
1130
1131 /*
1132 * If the spa_log_state is not set to be cleared, check whether
1133 * the current uberblock is a checkpoint one and if the current
1134 * header has been claimed before moving on.
1135 *
1136 * If the current uberblock is a checkpointed uberblock then
1137 * one of the following scenarios took place:
1138 *
1139 * 1] We are currently rewinding to the checkpoint of the pool.
1140 * 2] We crashed in the middle of a checkpoint rewind but we
1141 * did manage to write the checkpointed uberblock to the
1142 * vdev labels, so when we tried to import the pool again
1143 * the checkpointed uberblock was selected from the import
1144 * procedure.
1145 *
1146 * In both cases we want to zero out all the ZIL blocks, except
1147 * the ones that have been claimed at the time of the checkpoint
1148 * (their zh_claim_txg != 0). The reason is that these blocks
1149 * may be corrupted since we may have reused their locations on
1150 * disk after we took the checkpoint.
1151 *
1152 * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
1153 * when we first figure out whether the current uberblock is
1154 * checkpointed or not. Unfortunately, that would discard all
1155 * the logs, including the ones that are claimed, and we would
1156 * leak space.
1157 */
1158 if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
1159 (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
1160 zh->zh_claim_txg == 0)) {
1161 if (!BP_IS_HOLE(&zh->zh_log)) {
1162 (void) zil_parse(zilog, zil_clear_log_block,
1163 zil_noop_log_record, tx, first_txg, B_FALSE);
1164 }
1165 BP_ZERO(&zh->zh_log);
1166 if (os->os_encrypted)
1167 os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
1168 dsl_dataset_dirty(dmu_objset_ds(os), tx);
1169 dmu_objset_disown(os, B_FALSE, FTAG);
1170 return (0);
1171 }
1172
1173 /*
1174 * If we are not rewinding and opening the pool normally, then
1175 * the min_claim_txg should be equal to the first txg of the pool.
1176 */
1177 ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));
1178
1179 /*
1180 * Claim all log blocks if we haven't already done so, and remember
1181 * the highest claimed sequence number. This ensures that if we can
1182 * read only part of the log now (e.g. due to a missing device),
1183 * but we can read the entire log later, we will not try to replay
1184 * or destroy beyond the last block we successfully claimed.
1185 */
1186 ASSERT3U(zh->zh_claim_txg, <=, first_txg);
1187 if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
1188 (void) zil_parse(zilog, zil_claim_log_block,
1189 zil_claim_log_record, tx, first_txg, B_FALSE);
1190 zh->zh_claim_txg = first_txg;
1191 zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
1192 zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
1193 if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
1194 zh->zh_flags |= ZIL_REPLAY_NEEDED;
1195 zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
1196 if (os->os_encrypted)
1197 os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
1198 dsl_dataset_dirty(dmu_objset_ds(os), tx);
1199 }
1200
1201 ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
1202 dmu_objset_disown(os, B_FALSE, FTAG);
1203 return (0);
1204 }
1205
1206 /*
1207 * Check the log by walking the log chain.
1208 * Checksum errors are ok as they indicate the end of the chain.
1209 * Any other error (no device or read failure) returns an error.
1210 */
1211 int
1212 zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
1213 {
1214 (void) dp;
1215 zilog_t *zilog;
1216 objset_t *os;
1217 blkptr_t *bp;
1218 int error;
1219
1220 ASSERT(tx == NULL);
1221
1222 error = dmu_objset_from_ds(ds, &os);
1223 if (error != 0) {
1224 cmn_err(CE_WARN, "can't open objset %llu, error %d",
1225 (unsigned long long)ds->ds_object, error);
1226 return (0);
1227 }
1228
1229 zilog = dmu_objset_zil(os);
1230 bp = (blkptr_t *)&zilog->zl_header->zh_log;
1231
1232 if (!BP_IS_HOLE(bp)) {
1233 vdev_t *vd;
1234 boolean_t valid = B_TRUE;
1235
1236 /*
1237 * Check the first block and determine if it's on a log device
1238 * which may have been removed or faulted prior to loading this
1239 * pool. If so, there's no point in checking the rest of the
1240 * log as its content should have already been synced to the
1241 * pool.
1242 */
1243 spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
1244 vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
1245 if (vd->vdev_islog && vdev_is_dead(vd))
1246 valid = vdev_log_state_valid(vd);
1247 spa_config_exit(os->os_spa, SCL_STATE, FTAG);
1248
1249 if (!valid)
1250 return (0);
1251
1252 /*
1253 * Check whether the current uberblock is checkpointed (e.g.
1254 * we are rewinding) and whether the current header has been
1255 * claimed or not. If it hasn't then skip verifying it. We
1256 * do this because its ZIL blocks may be part of the pool's
1257 * state before the rewind, which is no longer valid.
1258 */
1259 zil_header_t *zh = zil_header_in_syncing_context(zilog);
1260 if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
1261 zh->zh_claim_txg == 0)
1262 return (0);
1263 }
1264
1265 /*
1266 * Because tx == NULL, zil_claim_log_block() will not actually claim
1267 * any blocks, but just determine whether it is possible to do so.
1268 * In addition to checking the log chain, zil_claim_log_block()
1269 * will invoke zio_claim() with a done func of spa_claim_notify(),
1270 * which will update spa_max_claim_txg. See spa_load() for details.
1271 */
1272 error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
1273 zilog->zl_header->zh_claim_txg ? -1ULL :
1274 spa_min_claim_txg(os->os_spa), B_FALSE);
1275
1276 return ((error == ECKSUM || error == ENOENT) ? 0 : error);
1277 }
1278
1279 /*
1280 * When an itx is "skipped", this function is used to properly mark the
1281 * waiter as "done", and signal any thread(s) waiting on it. An itx can
1282 * be skipped (and not committed to an lwb) for a variety of reasons,
1283 * one of them being that the itx was committed via spa_sync(), prior to
1284 * it being committed to an lwb; this can happen if a thread calling
1285 * zil_commit() is racing with spa_sync().
1286 */
1287 static void
1288 zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
1289 {
1290 mutex_enter(&zcw->zcw_lock);
1291 ASSERT3B(zcw->zcw_done, ==, B_FALSE);
1292 zcw->zcw_done = B_TRUE;
1293 cv_broadcast(&zcw->zcw_cv);
1294 mutex_exit(&zcw->zcw_lock);
1295 }
1296
1297 /*
1298 * This function is used when the given waiter is to be linked into an
1299 * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb.
1300 * At this point, the waiter will no longer be referenced by the itx,
1301 * and instead, will be referenced by the lwb.
1302 */
1303 static void
1304 zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
1305 {
1306 /*
1307 * The lwb_waiters field of the lwb is protected by the zilog's
1308 * zl_issuer_lock while the lwb is open and zl_lock otherwise.
1309 * zl_issuer_lock also protects leaving the open state.
1310 * zcw_lwb setting is protected by zl_issuer_lock and state !=
1311 * flush_done, which transition is protected by zl_lock.
1312 */
1313 ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock));
1314 IMPLY(lwb->lwb_state != LWB_STATE_OPENED,
1315 MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
1316 ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW);
1317 ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
1318
1319 ASSERT(!list_link_active(&zcw->zcw_node));
1320 list_insert_tail(&lwb->lwb_waiters, zcw);
1321 ASSERT3P(zcw->zcw_lwb, ==, NULL);
1322 zcw->zcw_lwb = lwb;
1323 }
1324
1325 /*
1326 * This function is used when zio_alloc_zil() fails to allocate a ZIL
1327 * block, and the given waiter must be linked to the "nolwb waiters"
1328 * list inside of zil_process_commit_list().
1329 */
1330 static void
1331 zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
1332 {
1333 ASSERT(!list_link_active(&zcw->zcw_node));
1334 list_insert_tail(nolwb, zcw);
1335 ASSERT3P(zcw->zcw_lwb, ==, NULL);
1336 }
1337
1338 void
1339 zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
1340 {
1341 avl_tree_t *t = &lwb->lwb_vdev_tree;
1342 avl_index_t where;
1343 zil_vdev_node_t *zv, zvsearch;
1344 int ndvas = BP_GET_NDVAS(bp);
1345 int i;
1346
1347 ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
1348 ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
1349
1350 if (zil_nocacheflush)
1351 return;
1352
1353 mutex_enter(&lwb->lwb_vdev_lock);
1354 for (i = 0; i < ndvas; i++) {
1355 zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
1356 if (avl_find(t, &zvsearch, &where) == NULL) {
1357 zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
1358 zv->zv_vdev = zvsearch.zv_vdev;
1359 avl_insert(t, zv, where);
1360 }
1361 }
1362 mutex_exit(&lwb->lwb_vdev_lock);
1363 }
1364
1365 static void
1366 zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
1367 {
1368 avl_tree_t *src = &lwb->lwb_vdev_tree;
1369 avl_tree_t *dst = &nlwb->lwb_vdev_tree;
1370 void *cookie = NULL;
1371 zil_vdev_node_t *zv;
1372
1373 ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
1374 ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
1375 ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
1376
1377 /*
1378 * 'lwb' is at a point in its lifetime where lwb_vdev_tree does not
1379 * need the protection of lwb_vdev_lock (it will only be modified
1380 * while holding zilog->zl_lock), as its writes and those of its
1381 * children have all completed. The younger 'nlwb', however, may be
1382 * waiting on future writes to additional vdevs.
1383 */
1384 mutex_enter(&nlwb->lwb_vdev_lock);
1385 /*
1386 * Tear down the 'lwb' vdev tree, ensuring that entries which do not
1387 * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
1388 */
1389 while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) {
1390 avl_index_t where;
1391
1392 if (avl_find(dst, zv, &where) == NULL) {
1393 avl_insert(dst, zv, where);
1394 } else {
1395 kmem_free(zv, sizeof (*zv));
1396 }
1397 }
1398 mutex_exit(&nlwb->lwb_vdev_lock);
1399 }
1400
1401 void
1402 zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
1403 {
1404 lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
1405 }
1406
1407 /*
1408 * This function is called after all vdevs associated with a given lwb
1409 * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
1410 * as the lwb write completes, if "zil_nocacheflush" is set. Further,
1411 * all "previous" lwbs will have completed before this function is
1412 * called; i.e. this function is called for all previous lwbs before
1413 * it's called for "this" lwb (enforced via the zio dependencies
1414 * configured in zil_lwb_set_zio_dependency()).
1415 *
1416 * The intention is for this function to be called as soon as the
1417 * contents of an lwb are considered "stable" on disk, and will survive
1418 * any sudden loss of power. At this point, any threads waiting for the
1419 * lwb to reach this state are signalled, and the "waiter" structures
1420 * are marked "done".
1421 */
1422 static void
1423 zil_lwb_flush_vdevs_done(zio_t *zio)
1424 {
1425 lwb_t *lwb = zio->io_private;
1426 zilog_t *zilog = lwb->lwb_zilog;
1427 zil_commit_waiter_t *zcw;
1428 itx_t *itx;
1429
1430 spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
1431
1432 hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp;
1433
1434 mutex_enter(&zilog->zl_lock);
1435
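	/*
	 * Maintain a 7/8-weighted moving average of lwb completion
	 * latency; the zil_commit_waiter() timeout is derived from this
	 * value via zfs_commit_timeout_pct.
	 */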
1436 zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8;
1437
1438 lwb->lwb_root_zio = NULL;
1439
1440 ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
1441 lwb->lwb_state = LWB_STATE_FLUSH_DONE;
1442
1443 if (zilog->zl_last_lwb_opened == lwb) {
1444 /*
1445 * Remember the highest committed log sequence number
1446 * for ztest. We only update this value when all the log
1447 * writes succeeded, because ztest wants to ASSERT that
1448 * it got the whole log chain.
1449 */
1450 zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
1451 }
1452
1453 while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
1454 zil_itx_destroy(itx);
1455
1456 while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
1457 mutex_enter(&zcw->zcw_lock);
1458
1459 ASSERT3P(zcw->zcw_lwb, ==, lwb);
1460 zcw->zcw_lwb = NULL;
1461 /*
1462 * We expect any ZIO errors from child ZIOs to have been
1463 * propagated "up" to this specific LWB's root ZIO, in
1464 * order for this error handling to work correctly. This
1465 * includes ZIO errors from either this LWB's write or
1466 * flush, as well as any errors from other dependent LWBs
1467 * (e.g. a root LWB ZIO that might be a child of this LWB).
1468 *
1469 * With that said, it's important to note that LWB flush
1470 * errors are not propagated up to the LWB root ZIO.
1471 * This is incorrect behavior, and results in VDEV flush
1472 * errors not being handled correctly here. See the
1473 * comment above the call to "zio_flush" for details.
1474 */
1475
1476 zcw->zcw_zio_error = zio->io_error;
1477
1478 ASSERT3B(zcw->zcw_done, ==, B_FALSE);
1479 zcw->zcw_done = B_TRUE;
1480 cv_broadcast(&zcw->zcw_cv);
1481
1482 mutex_exit(&zcw->zcw_lock);
1483 }
1484
1485 uint64_t txg = lwb->lwb_issued_txg;
1486
1487 /* Once we drop the lock, lwb may be freed by zil_sync(). */
1488 mutex_exit(&zilog->zl_lock);
1489
1490 mutex_enter(&zilog->zl_lwb_io_lock);
1491 ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0);
1492 zilog->zl_lwb_inflight[txg & TXG_MASK]--;
1493 if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0)
1494 cv_broadcast(&zilog->zl_lwb_io_cv);
1495 mutex_exit(&zilog->zl_lwb_io_lock);
1496 }
1497
1498 /*
1499 * Wait for the completion of all issued writes/flushes for the given txg.
1500 * This guarantees zil_lwb_flush_vdevs_done() has been called and returned.
1501 */
1502 static void
1503 zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg)
1504 {
1505 ASSERT3U(txg, ==, spa_syncing_txg(zilog->zl_spa));
1506
1507 mutex_enter(&zilog->zl_lwb_io_lock);
1508 while (zilog->zl_lwb_inflight[txg & TXG_MASK] > 0)
1509 cv_wait(&zilog->zl_lwb_io_cv, &zilog->zl_lwb_io_lock);
1510 mutex_exit(&zilog->zl_lwb_io_lock);
1511
1512 #ifdef ZFS_DEBUG
1513 mutex_enter(&zilog->zl_lock);
1514 mutex_enter(&zilog->zl_lwb_io_lock);
1515 lwb_t *lwb = list_head(&zilog->zl_lwb_list);
1516 while (lwb != NULL) {
1517 if (lwb->lwb_issued_txg <= txg) {
1518 ASSERT(lwb->lwb_state != LWB_STATE_ISSUED);
1519 ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE);
1520 IMPLY(lwb->lwb_issued_txg > 0,
1521 lwb->lwb_state == LWB_STATE_FLUSH_DONE);
1522 }
1523 IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE ||
1524 lwb->lwb_state == LWB_STATE_FLUSH_DONE,
1525 lwb->lwb_buf == NULL);
1526 lwb = list_next(&zilog->zl_lwb_list, lwb);
1527 }
1528 mutex_exit(&zilog->zl_lwb_io_lock);
1529 mutex_exit(&zilog->zl_lock);
1530 #endif
1531 }
1532
1533 /*
1534 * This is called when an lwb's write zio completes. The callback's
1535 * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
1536 * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
1537 * in writing out this specific lwb's data, and in the case that cache
1538 * flushes have been deferred, vdevs involved in writing the data for
1539 * previous lwbs. The writes corresponding to all the vdevs in the
1540 * lwb_vdev_tree will have completed by the time this is called, due to
1541 * the zio dependencies configured in zil_lwb_set_zio_dependency(),
1542 * which takes deferred flushes into account. The lwb will be "done"
1543 * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
1544 * completion callback for the lwb's root zio.
1545 */
1546 static void
1547 zil_lwb_write_done(zio_t *zio)
1548 {
1549 lwb_t *lwb = zio->io_private;
1550 spa_t *spa = zio->io_spa;
1551 zilog_t *zilog = lwb->lwb_zilog;
1552 avl_tree_t *t = &lwb->lwb_vdev_tree;
1553 void *cookie = NULL;
1554 zil_vdev_node_t *zv;
1555 lwb_t *nlwb;
1556
1557 ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
1558
1559 abd_free(zio->io_abd);
1560 zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
1561 lwb->lwb_buf = NULL;
1562
1563 mutex_enter(&zilog->zl_lock);
1564 ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
1565 lwb->lwb_state = LWB_STATE_WRITE_DONE;
1566 lwb->lwb_child_zio = NULL;
1567 lwb->lwb_write_zio = NULL;
1568
1569 /*
1570 * If nlwb is not yet issued, zil_lwb_set_zio_dependency() has not
1571 * been called for it yet, and when it is, it won't be able to make
1572 * its write ZIO a parent of this ZIO. In that case we cannot defer
1573 * our flushes, or there may be a race between the done callbacks.
1574 */
1575 nlwb = list_next(&zilog->zl_lwb_list, lwb);
1576 if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED)
1577 nlwb = NULL;
1578 mutex_exit(&zilog->zl_lock);
1579
1580 if (avl_numnodes(t) == 0)
1581 return;
1582
1583 /*
1584 * If there was an IO error, we're not going to call zio_flush()
1585 * on these vdevs, so we simply empty the tree and free the
1586 * nodes. We avoid calling zio_flush() since there isn't any
1587 * good reason for doing so, after the lwb block failed to be
1588 * written out.
1589 *
1590 * Additionally, we don't perform any further error handling at
1591 * this point (e.g. setting "zcw_zio_error" appropriately), as
1592 * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus,
1593 * we expect any error seen here, to have been propagated to
1594 * that function).
1595 */
1596 if (zio->io_error != 0) {
1597 while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
1598 kmem_free(zv, sizeof (*zv));
1599 return;
1600 }
1601
1602 /*
1603 * If this lwb does not have any threads waiting for it to
1604 * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
1605 * command to the vdevs written to by "this" lwb, and instead
1606 * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
1607 * command for those vdevs. Thus, we merge the vdev tree of
1608 * "this" lwb with the vdev tree of the "next" lwb in the list,
1609 * and assume the "next" lwb will handle flushing the vdevs (or
1610 * deferring the flush(es) again).
1611 *
1612 * This is a useful performance optimization, especially for
1613 * workloads with lots of async write activity and few sync
1614 * write and/or fsync activity, as it has the potential to
1615 * coalesce multiple flush commands to a vdev into one.
1616 */
1617 if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) {
1618 zil_lwb_flush_defer(lwb, nlwb);
1619 ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
1620 return;
1621 }
1622
1623 while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
1624 vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
1625 if (vd != NULL) {
1626 /*
1627 * The "ZIO_FLAG_DONT_PROPAGATE" is currently
1628 * always used within "zio_flush". This means,
1629 * any errors when flushing the vdev(s), will
1630 * (unfortunately) not be handled correctly,
1631 * since these "zio_flush" errors will not be
1632 * propagated up to "zil_lwb_flush_vdevs_done".
1633 */
1634 zio_flush(lwb->lwb_root_zio, vd);
1635 }
1636 kmem_free(zv, sizeof (*zv));
1637 }
1638 }
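
/*
 * Deferral example (hypothetical): lwb1 has no waiters, so on write
 * completion its vdev set {A, B} is merged into lwb2's lwb_vdev_tree
 * rather than flushed.  When lwb2 (which has waiters) completes its
 * write, a single zio_flush() per vdev covers {A, B} plus lwb2's own
 * vdevs.
 */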
1639
1640 /*
1641 * Build the zio dependency chain, which is used to preserve the ordering of
1642 * lwb completions that is required by the semantics of the ZIL. Each new lwb
1643 * zio becomes a parent of the previous lwb zio, such that the new lwb's zio
1644 * cannot complete until the previous lwb's zio completes.
1645 *
1646 * This is required by the semantics of zil_commit(): the commit waiters
1647 * attached to the lwbs will be woken in the lwb zio's completion callback,
1648 * so this zio dependency graph ensures the waiters are woken in the correct
1649 * order (the same order the lwbs were created).
1650 */
1651 static void
1652 zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
1653 {
1654 ASSERT(MUTEX_HELD(&zilog->zl_lock));
1655
1656 lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb);
1657 if (prev_lwb == NULL ||
1658 prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE)
1659 return;
1660
1661 /*
1662 * If the previous lwb's write hasn't already completed, we also want
1663 * to order the completion of the lwb write zios (above, we only order
1664 * the completion of the lwb root zios). This is required because of
1665 * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb.
1666 *
1667 * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous
1668 * lwb will rely on this lwb to flush the vdevs written to by that
1669 * previous lwb. Thus, we need to ensure this lwb doesn't issue the
1670 * flush until after the previous lwb's write completes. We ensure
1671 * this ordering by setting the zio parent/child relationship here.
1672 *
1673 * Without this relationship on the lwb's write zio, it's possible
1674 * for this lwb's write to complete prior to the previous lwb's write
1675 * completing; and thus, the vdevs for the previous lwb would be
1676 * flushed prior to that lwb's data being written to those vdevs (the
1677 * vdevs are flushed in the lwb write zio's completion handler,
1678 * zil_lwb_write_done()).
1679 */
1680 if (prev_lwb->lwb_state == LWB_STATE_ISSUED) {
1681 ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL);
1682 zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio);
1683 } else {
1684 ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
1685 }
1686
1687 ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL);
1688 zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio);
1689 }
1690
1691
1692 /*
1693 * This function's purpose is to "open" an lwb such that it is ready to
1694 * accept new itxs being committed to it. This function is idempotent; if
1695 * the passed in lwb has already been opened, it is essentially a no-op.
1696 */
1697 static void
1698 zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
1699 {
1700 ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
1701
1702 if (lwb->lwb_state != LWB_STATE_NEW) {
1703 ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
1704 return;
1705 }
1706
1707 mutex_enter(&zilog->zl_lock);
1708 lwb->lwb_state = LWB_STATE_OPENED;
1709 zilog->zl_last_lwb_opened = lwb;
1710 mutex_exit(&zilog->zl_lock);
1711 }
1712
1713 /*
1714 * Define a limited set of intent log block sizes.
1715 *
1716 * These must be a multiple of 4KB. Note only the amount used (again
1717 * aligned to 4KB) actually gets written. However, we can't always just
1718 * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
1719 */
1720 static const struct {
1721 uint64_t limit;
1722 uint64_t blksz;
1723 } zil_block_buckets[] = {
1724 { 4096, 4096 }, /* non TX_WRITE */
1725 { 8192 + 4096, 8192 + 4096 }, /* database */
1726 { 32768 + 4096, 32768 + 4096 }, /* NFS writes */
1727 { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */
1728 { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */
1729 };
1730
1731 /*
1732 * Maximum block size used by the ZIL. This is picked up when the ZIL is
1733 * initialized. Otherwise this should not be used directly; see
1734 * zl_max_block_size instead.
1735 */
1736 static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
1737
1738 /*
1739 * Close the log block so it can be issued, and allocate the next one.
1740 * Has to be called under zl_issuer_lock to chain more lwbs.
1741 */
1742 static lwb_t *
1743 zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
1744 {
1745 int i;
1746
1747 ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
1748 ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
1749 lwb->lwb_state = LWB_STATE_CLOSED;
1750
1751 /*
1752 * If there was an allocation failure, the NULL returned here will
1753 * trigger zil_commit_writer_stall() in the caller. This is inherently
1754 * racy, since the allocation may not have happened yet.
1755 */
1756 if (lwb->lwb_error != 0)
1757 return (NULL);
1758
1759 /*
1760 * Log blocks are pre-allocated. Here we select the size of the next
1761 * block, based on the size used in the last block.
1762 * - First, find the smallest bucket from a limited set of block
1763 * sizes that will fit the block. A limited set is used because
1764 * blocks allocated from the same metaslab are faster to write,
1765 * being adjacent or close to each other.
1766 * - Next, take the maximum of the new suggested size and an array of
1767 * previous sizes. This lessens the picket-fence effect of wrongly
1768 * guessing the size for a stream of, say, 2k, 64k, 2k, 64k
1769 * requests.
1770 *
1771 * Note we only write what is used, but we can't just allocate
1772 * the maximum block size because we can exhaust the available
1773 * pool log space.
1774 */
1775 uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
1776 for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++)
1777 continue;
1778 zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size);
1779 zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
1780 for (i = 0; i < ZIL_PREV_BLKS; i++)
1781 zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
1782 DTRACE_PROBE3(zil__block__size, zilog_t *, zilog,
1783 uint64_t, zil_blksz,
1784 uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]);
1785 zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
1786
1787 return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state));
1788 }
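
/*
 * Illustrative sketch (not part of the build): the size-selection logic
 * above as a standalone function. PREV_BLKS is an assumed stand-in for
 * ZIL_PREV_BLKS; the bucket values mirror zil_block_buckets[] above.
 * With a stream of 2k, 64k, 2k, 64k requests, the history keeps the
 * suggestion pinned near the 68K bucket instead of oscillating.
 */
#if 0
#include <stdint.h>

#define PREV_BLKS 16	/* assumed history depth; must be a power of 2 */

static const struct { uint64_t limit, blksz; } buckets[] = {
	{ 4096, 4096 },
	{ 12288, 12288 },
	{ 36864, 36864 },
	{ 69632, 69632 },
	{ UINT64_MAX, 131072 },
};

static uint64_t prev[PREV_BLKS];
static int rotor;

/* Suggest the next log block size given "used" bytes in the last one. */
static uint64_t
next_blksz(uint64_t used, uint64_t max_blksz)
{
	uint64_t blksz;
	int i;

	for (i = 0; used > buckets[i].limit; i++)
		continue;
	blksz = buckets[i].blksz < max_blksz ? buckets[i].blksz : max_blksz;

	/* Take the max over recent suggestions to damp the picket fence. */
	prev[rotor] = blksz;
	for (i = 0; i < PREV_BLKS; i++)
		if (prev[i] > blksz)
			blksz = prev[i];
	rotor = (rotor + 1) & (PREV_BLKS - 1);
	return (blksz);
}
#endif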
1789
1790 /*
1791 * Finalize previously closed block and issue the write zio.
1792 */
1793 static void
1794 zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
1795 {
1796 spa_t *spa = zilog->zl_spa;
1797 zil_chain_t *zilc;
1798 boolean_t slog;
1799 zbookmark_phys_t zb;
1800 zio_priority_t prio;
1801 int error;
1802
1803 ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
1804
1805 /* Actually fill the lwb with the data. */
1806 for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
1807 itx = list_next(&lwb->lwb_itxs, itx))
1808 zil_lwb_commit(zilog, lwb, itx);
1809 lwb->lwb_nused = lwb->lwb_nfilled;
1810 ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
1811
1812 lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb,
1813 ZIO_FLAG_CANFAIL);
1814
1815 /*
1816 * The lwb is now ready to be issued, but only if it has already had
1817 * its block pointer allocated, or the allocation has failed.
1818 * Otherwise leave it as-is, relying on some other thread to issue it
1819 * after allocating its block pointer by calling zil_lwb_write_issue()
1820 * for the previous lwb(s) in the chain.
1821 */
1822 mutex_enter(&zilog->zl_lock);
1823 lwb->lwb_state = LWB_STATE_READY;
1824 if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) {
1825 mutex_exit(&zilog->zl_lock);
1826 return;
1827 }
1828 mutex_exit(&zilog->zl_lock);
1829
1830 next_lwb:
1831 if (lwb->lwb_slim)
1832 zilc = (zil_chain_t *)lwb->lwb_buf;
1833 else
1834 zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax);
1835 int wsz = lwb->lwb_sz;
1836 if (lwb->lwb_error == 0) {
1837 abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz);
1838 if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
1839 prio = ZIO_PRIORITY_SYNC_WRITE;
1840 else
1841 prio = ZIO_PRIORITY_ASYNC_WRITE;
1842 SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
1843 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
1844 lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
1845 lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0,
1846 &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done,
1847 lwb, prio, ZIO_FLAG_CANFAIL, &zb);
1848 zil_lwb_add_block(lwb, &lwb->lwb_blk);
1849
1850 if (lwb->lwb_slim) {
1851 /* For Slim ZIL only write what is used. */
1852 wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ,
1853 int);
1854 ASSERT3S(wsz, <=, lwb->lwb_sz);
1855 zio_shrink(lwb->lwb_write_zio, wsz);
1856 wsz = lwb->lwb_write_zio->io_size;
1857 }
1858 memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
1859 zilc->zc_pad = 0;
1860 zilc->zc_nused = lwb->lwb_nused;
1861 zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
1862 } else {
1863 /*
1864 * We can't write the lwb if there was an allocation failure,
1865 * so create a null zio instead just to maintain dependencies.
1866 */
1867 lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL,
1868 zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL);
1869 lwb->lwb_write_zio->io_error = lwb->lwb_error;
1870 }
1871 if (lwb->lwb_child_zio)
1872 zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio);
1873
1874 /*
1875 * Open transaction to allocate the next block pointer.
1876 */
1877 dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
1878 VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
1879 dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
1880 uint64_t txg = dmu_tx_get_txg(tx);
1881
1882 /*
1883 * Allocate the next block pointer unless we are already in error.
1884 */
1885 lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb);
1886 blkptr_t *bp = &zilc->zc_next_blk;
1887 BP_ZERO(bp);
1888 error = lwb->lwb_error;
1889 if (error == 0) {
1890 error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz,
1891 &slog);
1892 }
1893 if (error == 0) {
1894 ASSERT3U(bp->blk_birth, ==, txg);
1895 BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
1896 ZIO_CHECKSUM_ZILOG);
1897 bp->blk_cksum = lwb->lwb_blk.blk_cksum;
1898 bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
1899 }
1900
1901 /*
1902 * Reduce TXG open time by incrementing the inflight counter and committing
1903 * the transaction. zil_sync() will wait for it to return to zero.
1904 */
1905 mutex_enter(&zilog->zl_lwb_io_lock);
1906 lwb->lwb_issued_txg = txg;
1907 zilog->zl_lwb_inflight[txg & TXG_MASK]++;
1908 zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg);
1909 mutex_exit(&zilog->zl_lwb_io_lock);
1910 dmu_tx_commit(tx);
1911
1912 spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
1913
1914 /*
1915 * We've completed all potentially blocking operations. Update the
1916 * nlwb and allow it to proceed without possible lock order reversals.
1917 */
1918 mutex_enter(&zilog->zl_lock);
1919 zil_lwb_set_zio_dependency(zilog, lwb);
1920 lwb->lwb_state = LWB_STATE_ISSUED;
1921
1922 if (nlwb) {
1923 nlwb->lwb_blk = *bp;
1924 nlwb->lwb_error = error;
1925 nlwb->lwb_slog = slog;
1926 nlwb->lwb_alloc_txg = txg;
1927 if (nlwb->lwb_state != LWB_STATE_READY)
1928 nlwb = NULL;
1929 }
1930 mutex_exit(&zilog->zl_lock);
1931
1932 if (lwb->lwb_slog) {
1933 ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
1934 ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
1935 lwb->lwb_nused);
1936 ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write,
1937 wsz);
1938 ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc,
1939 BP_GET_LSIZE(&lwb->lwb_blk));
1940 } else {
1941 ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count);
1942 ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes,
1943 lwb->lwb_nused);
1944 ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write,
1945 wsz);
1946 ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc,
1947 BP_GET_LSIZE(&lwb->lwb_blk));
1948 }
1949 lwb->lwb_issued_timestamp = gethrtime();
1950 if (lwb->lwb_child_zio)
1951 zio_nowait(lwb->lwb_child_zio);
1952 zio_nowait(lwb->lwb_write_zio);
1953 zio_nowait(lwb->lwb_root_zio);
1954
1955 /*
1956 * If nlwb was ready when we gave it the block pointer,
1957 * it is on us to issue it, and possibly the following ones.
1958 */
1959 lwb = nlwb;
1960 if (lwb)
1961 goto next_lwb;
1962 }
1963
1964 /*
1965 * Maximum amount of data that can be put into single log block.
1966 */
1967 uint64_t
1968 zil_max_log_data(zilog_t *zilog, size_t hdrsize)
1969 {
1970 return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize);
1971 }
1972
1973 /*
1974 * Maximum amount of log space we agree to waste in order to reduce
1975 * the number of WR_NEED_COPY chunks and thus zl_get_data() overhead (~6%).
1976 */
1977 static inline uint64_t
1978 zil_max_waste_space(zilog_t *zilog)
1979 {
1980 return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 16);
1981 }
1982
1983 /*
1984 * Maximum amount of write data for WR_COPIED. For correctness, consumers
1985 * must fall back to WR_NEED_COPY if we can't fit the entire record into one
1986 * maximum sized log block, because each WR_COPIED record must fit in a
1987 * single log block. Below that, it is a tradeoff of an additional memory
1988 * copy and possibly worse log space efficiency vs. additional range lock/unlock.
1989 */
1990 static uint_t zil_maxcopied = 7680;
1991
1992 uint64_t
1993 zil_max_copied_data(zilog_t *zilog)
1994 {
1995 uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t));
1996 return (MIN(max_data, zil_maxcopied));
1997 }
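
/*
 * Worked example (not part of the build): how the three limits above
 * relate for a 128K log block. CHAIN_SZ and LR_WRITE_SZ are made-up
 * stand-ins for sizeof (zil_chain_t) and sizeof (lr_write_t); the
 * real values come from the ZIL headers.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define CHAIN_SZ	64U	/* hypothetical sizeof (zil_chain_t) */
#define LR_WRITE_SZ	192U	/* hypothetical sizeof (lr_write_t) */

int
main(void)
{
	uint64_t max_blk = 131072;	/* zl_max_block_size */
	uint64_t max_data = max_blk - CHAIN_SZ - LR_WRITE_SZ;
	uint64_t max_waste = max_data / 16;
	uint64_t max_copied = max_data < 7680 ? max_data : 7680;

	printf("max log data  %llu\n", (unsigned long long)max_data);
	printf("max waste     %llu\n", (unsigned long long)max_waste);
	printf("WR_COPIED cap %llu\n", (unsigned long long)max_copied);
	return (0);
}
#endif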
1998
1999 /*
2000 * Estimate space needed in the lwb for the itx. Allocate more lwbs or
2001 * split the itx as needed, but don't touch the actual transaction data.
2002 * Has to be called under zl_issuer_lock to call zil_lwb_write_close()
2003 * to chain more lwbs.
2004 */
2005 static lwb_t *
2006 zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
2007 {
2008 itx_t *citx;
2009 lr_t *lr, *clr;
2010 lr_write_t *lrw;
2011 uint64_t dlen, dnow, lwb_sp, reclen, max_log_data;
2012
2013 ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
2014 ASSERT3P(lwb, !=, NULL);
2015 ASSERT3P(lwb->lwb_buf, !=, NULL);
2016
2017 zil_lwb_write_open(zilog, lwb);
2018
2019 lr = &itx->itx_lr;
2020 lrw = (lr_write_t *)lr;
2021
2022 /*
2023 * A commit itx doesn't represent any on-disk state; instead
2024 * it's simply used as a placeholder on the commit list, and
2025 * provides a mechanism for attaching a "commit waiter" onto the
2026 * correct lwb (such that the waiter can be signalled upon
2027 * completion of that lwb). Thus, we don't process this itx's
2028 * log record if it's a commit itx (these itx's don't have log
2029 * records), and instead link the itx's waiter onto the lwb's
2030 * list of waiters.
2031 *
2032 * For more details, see the comment above zil_commit().
2033 */
2034 if (lr->lrc_txtype == TX_COMMIT) {
2035 zil_commit_waiter_link_lwb(itx->itx_private, lwb);
2036 list_insert_tail(&lwb->lwb_itxs, itx);
2037 return (lwb);
2038 }
2039
2040 reclen = lr->lrc_reclen;
2041 if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
2042 ASSERT3U(reclen, ==, sizeof (lr_write_t));
2043 dlen = P2ROUNDUP_TYPED(
2044 lrw->lr_length, sizeof (uint64_t), uint64_t);
2045 } else {
2046 ASSERT3U(reclen, >=, sizeof (lr_t));
2047 dlen = 0;
2048 }
2049 ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0));
2050 zilog->zl_cur_used += (reclen + dlen);
2051
2052 cont:
2053 /*
2054 * If this record won't fit in the current log block, start a new one.
2055 * For WR_NEED_COPY optimize layout for minimal number of chunks.
2056 */
2057 lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
2058 max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t));
2059 if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
2060 lwb_sp < zil_max_waste_space(zilog) &&
2061 (dlen % max_log_data == 0 ||
2062 lwb_sp < reclen + dlen % max_log_data))) {
2063 list_insert_tail(ilwbs, lwb);
2064 lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED);
2065 if (lwb == NULL)
2066 return (NULL);
2067 lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
2068 }
2069
2070 /*
2071 * There must be enough space in the log block to hold reclen.
2072 * For WR_COPIED, we need to fit the whole record in one block,
2073 * and reclen is the write record header size + the data size.
2074 * For WR_NEED_COPY, we can create multiple records, splitting
2075 * the data into multiple blocks, so we only need to fit one
2076 * word of data per block; in this case reclen is just the header
2077 * size (no data).
2078 */
2079 ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
2080
2081 dnow = MIN(dlen, lwb_sp - reclen);
2082 if (dlen > dnow) {
2083 ASSERT3U(lr->lrc_txtype, ==, TX_WRITE);
2084 ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY);
2085 citx = zil_itx_clone(itx);
2086 clr = &citx->itx_lr;
2087 lr_write_t *clrw = (lr_write_t *)clr;
2088 clrw->lr_length = dnow;
2089 lrw->lr_offset += dnow;
2090 lrw->lr_length -= dnow;
2091 } else {
2092 citx = itx;
2093 clr = lr;
2094 }
2095
2096 /*
2097 * We're actually making an entry, so update lrc_seq to be the
2098 * log record sequence number. Note that this is generally not
2099 * equal to the itx sequence number because not all transactions
2100 * are synchronous, and sometimes spa_sync() gets there first.
2101 */
2102 clr->lrc_seq = ++zilog->zl_lr_seq;
2103
2104 lwb->lwb_nused += reclen + dnow;
2105 ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
2106 ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
2107
2108 zil_lwb_add_txg(lwb, lr->lrc_txg);
2109 list_insert_tail(&lwb->lwb_itxs, citx);
2110
2111 dlen -= dnow;
2112 if (dlen > 0) {
2113 zilog->zl_cur_used += reclen;
2114 goto cont;
2115 }
2116
2117 if (lr->lrc_txtype == TX_WRITE &&
2118 lr->lrc_txg > spa_freeze_txg(zilog->zl_spa))
2119 txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg);
2120
2121 return (lwb);
2122 }
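
/*
 * Illustrative sketch (not part of the build): the WR_NEED_COPY
 * splitting performed by the cont: loop above, modeled as plain
 * arithmetic. "hdr" stands in for the lr_write_t record header paid
 * per chunk and "space" for the usable bytes per log block; both
 * values in main() are assumptions for the example.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static int
split_chunks(uint64_t dlen, uint64_t hdr, uint64_t space)
{
	int chunks = 0;

	while (dlen > 0) {
		uint64_t avail = space - hdr;	/* data bytes per chunk */
		uint64_t dnow = dlen < avail ? dlen : avail;

		dlen -= dnow;
		chunks++;
	}
	return (chunks);
}

int
main(void)
{
	/* 1 MiB write, 192-byte headers, ~68K usable per block. */
	printf("%d chunks\n", split_chunks(1048576, 192, 69632));
	return (0);
}
#endif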
2123
2124 /*
2125 * Fill the actual transaction data into the lwb, following zil_lwb_assign().
2126 * Does not require locking.
2127 */
2128 static void
2129 zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
2130 {
2131 lr_t *lr, *lrb;
2132 lr_write_t *lrw, *lrwb;
2133 char *lr_buf;
2134 uint64_t dlen, reclen;
2135
2136 lr = &itx->itx_lr;
2137 lrw = (lr_write_t *)lr;
2138
2139 if (lr->lrc_txtype == TX_COMMIT)
2140 return;
2141
2142 if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
2143 dlen = P2ROUNDUP_TYPED(
2144 lrw->lr_length, sizeof (uint64_t), uint64_t);
2145 } else {
2146 dlen = 0;
2147 }
2148 reclen = lr->lrc_reclen;
2149 ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled);
2150
2151 lr_buf = lwb->lwb_buf + lwb->lwb_nfilled;
2152 memcpy(lr_buf, lr, reclen);
2153 lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */
2154 lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */
2155
2156 ZIL_STAT_BUMP(zilog, zil_itx_count);
2157
2158 /*
2159 * If it's a write, fetch the data or get its blkptr as appropriate.
2160 */
2161 if (lr->lrc_txtype == TX_WRITE) {
2162 if (itx->itx_wr_state == WR_COPIED) {
2163 ZIL_STAT_BUMP(zilog, zil_itx_copied_count);
2164 ZIL_STAT_INCR(zilog, zil_itx_copied_bytes,
2165 lrw->lr_length);
2166 } else {
2167 char *dbuf;
2168 int error;
2169
2170 if (itx->itx_wr_state == WR_NEED_COPY) {
2171 dbuf = lr_buf + reclen;
2172 lrb->lrc_reclen += dlen;
2173 ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count);
2174 ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes,
2175 dlen);
2176 } else {
2177 ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT);
2178 dbuf = NULL;
2179 ZIL_STAT_BUMP(zilog, zil_itx_indirect_count);
2180 ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes,
2181 lrw->lr_length);
2182 if (lwb->lwb_child_zio == NULL) {
2183 lwb->lwb_child_zio = zio_null(NULL,
2184 zilog->zl_spa, NULL, NULL, NULL,
2185 ZIO_FLAG_CANFAIL);
2186 }
2187 }
2188
2189 /*
2190 * The "lwb_child_zio" we pass in will become a child of
2191 * "lwb_write_zio" when one is created, and it will be
2192 * a parent of any zios created by "zl_get_data".
2193 * This way "lwb_write_zio" first waits for the children's
2194 * block pointers before doing its own write, and then
2195 * for their write completion before flushing the vdev caches.
2196 */
2197 error = zilog->zl_get_data(itx->itx_private,
2198 itx->itx_gen, lrwb, dbuf, lwb,
2199 lwb->lwb_child_zio);
2200 if (dbuf != NULL && error == 0) {
2201 /* Zero any padding bytes in the last block. */
2202 memset((char *)dbuf + lrwb->lr_length, 0,
2203 dlen - lrwb->lr_length);
2204 }
2205
2206 /*
2207 * Typically, the only return values we should see from
2208 * ->zl_get_data() are 0, EIO, ENOENT, EEXIST or
2209 * EALREADY. However, it is also possible to see other
2210 * error values such as ENOSPC or EINVAL from
2211 * dmu_read() -> dnode_hold() -> dnode_hold_impl() or
2212 * ENXIO as well as a multitude of others from the
2213 * block layer through dmu_buf_hold() -> dbuf_read()
2214 * -> zio_wait(), as well as through dmu_read() ->
2215 * dnode_hold() -> dnode_hold_impl() -> dbuf_read() ->
2216 * zio_wait(). When these errors happen, we can assume
2217 * that neither an immediate write nor an indirect
2218 * write occurred, so we need to fall back to
2219 * txg_wait_synced(). This is unusual, so we print to
2220 * dmesg whenever one of these errors occurs.
2221 */
2222 switch (error) {
2223 case 0:
2224 break;
2225 default:
2226 cmn_err(CE_WARN, "zil_lwb_commit() received "
2227 "unexpected error %d from ->zl_get_data()"
2228 ". Falling back to txg_wait_synced().",
2229 error);
2230 zfs_fallthrough;
2231 case EIO:
2232 txg_wait_synced(zilog->zl_dmu_pool,
2233 lr->lrc_txg);
2234 zfs_fallthrough;
2235 case ENOENT:
2236 zfs_fallthrough;
2237 case EEXIST:
2238 zfs_fallthrough;
2239 case EALREADY:
2240 return;
2241 }
2242 }
2243 }
2244
2245 lwb->lwb_nfilled += reclen + dlen;
2246 ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused);
2247 ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t)));
2248 }
2249
2250 itx_t *
2251 zil_itx_create(uint64_t txtype, size_t olrsize)
2252 {
2253 size_t itxsize, lrsize;
2254 itx_t *itx;
2255
2256 ASSERT3U(olrsize, >=, sizeof (lr_t));
2257 lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t);
2258 ASSERT3U(lrsize, >=, olrsize);
2259 itxsize = offsetof(itx_t, itx_lr) + lrsize;
2260
2261 itx = zio_data_buf_alloc(itxsize);
2262 itx->itx_lr.lrc_txtype = txtype;
2263 itx->itx_lr.lrc_reclen = lrsize;
2264 itx->itx_lr.lrc_seq = 0; /* defensive */
2265 memset((char *)&itx->itx_lr + olrsize, 0, lrsize - olrsize);
2266 itx->itx_sync = B_TRUE; /* default is synchronous */
2267 itx->itx_callback = NULL;
2268 itx->itx_callback_data = NULL;
2269 itx->itx_size = itxsize;
2270
2271 return (itx);
2272 }
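
/*
 * Worked example (not part of the build): the size arithmetic used by
 * zil_itx_create() above, with a toy struct standing in for itx_t and
 * ROUNDUP standing in for P2ROUNDUP_TYPED.
 */
#if 0
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define ROUNDUP(x, a) (((x) + (a) - 1) & ~((size_t)(a) - 1))

struct lr { uint64_t txtype, reclen, txg, seq; };
struct toy_itx { uint64_t meta[4]; struct lr itx_lr; };

int
main(void)
{
	size_t olrsize = sizeof (struct lr) + 5;	/* odd-sized record */
	size_t lrsize = ROUNDUP(olrsize, sizeof (uint64_t));
	size_t itxsize = offsetof(struct toy_itx, itx_lr) + lrsize;

	/* 37 bytes round up to 40; the itx header is added on top. */
	printf("olr %zu -> lr %zu -> itx %zu\n", olrsize, lrsize, itxsize);
	return (0);
}
#endif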
2273
2274 static itx_t *
2275 zil_itx_clone(itx_t *oitx)
2276 {
2277 ASSERT3U(oitx->itx_size, >=, sizeof (itx_t));
2278 ASSERT3U(oitx->itx_size, ==,
2279 offsetof(itx_t, itx_lr) + oitx->itx_lr.lrc_reclen);
2280
2281 itx_t *itx = zio_data_buf_alloc(oitx->itx_size);
2282 memcpy(itx, oitx, oitx->itx_size);
2283 itx->itx_callback = NULL;
2284 itx->itx_callback_data = NULL;
2285 return (itx);
2286 }
2287
2288 void
2289 zil_itx_destroy(itx_t *itx)
2290 {
2291 ASSERT3U(itx->itx_size, >=, sizeof (itx_t));
2292 ASSERT3U(itx->itx_lr.lrc_reclen, ==,
2293 itx->itx_size - offsetof(itx_t, itx_lr));
2294 IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL);
2295 IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
2296
2297 if (itx->itx_callback != NULL)
2298 itx->itx_callback(itx->itx_callback_data);
2299
2300 zio_data_buf_free(itx, itx->itx_size);
2301 }
2302
2303 /*
2304 * Free up the sync and async itxs. The itxs_t has already been detached
2305 * so no locks are needed.
2306 */
2307 static void
2308 zil_itxg_clean(void *arg)
2309 {
2310 itx_t *itx;
2311 list_t *list;
2312 avl_tree_t *t;
2313 void *cookie;
2314 itxs_t *itxs = arg;
2315 itx_async_node_t *ian;
2316
2317 list = &itxs->i_sync_list;
2318 while ((itx = list_remove_head(list)) != NULL) {
2319 /*
2320 * In the general case, commit itxs will not be found
2321 * here, as they'll be committed to an lwb via
2322 * zil_lwb_assign(), and free'd in that function. Having
2323 * said that, it is still possible for commit itxs to be
2324 * found here, due to the following race:
2325 *
2326 * - a thread calls zil_commit() which assigns the
2327 * commit itx to a per-txg i_sync_list
2328 * - zil_itxg_clean() is called (e.g. via spa_sync())
2329 * while the waiter is still on the i_sync_list
2330 *
2331 * There's nothing to prevent syncing the txg while the
2332 * waiter is on the i_sync_list. This normally doesn't
2333 * happen because spa_sync() is slower than zil_commit(),
2334 * but if zil_commit() calls txg_wait_synced() (e.g.
2335 * because zil_create() or zil_commit_writer_stall() is
2336 * called) we will hit this case.
2337 */
2338 if (itx->itx_lr.lrc_txtype == TX_COMMIT)
2339 zil_commit_waiter_skip(itx->itx_private);
2340
2341 zil_itx_destroy(itx);
2342 }
2343
2344 cookie = NULL;
2345 t = &itxs->i_async_tree;
2346 while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
2347 list = &ian->ia_list;
2348 while ((itx = list_remove_head(list)) != NULL) {
2349 /* commit itxs should never be on the async lists. */
2350 ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
2351 zil_itx_destroy(itx);
2352 }
2353 list_destroy(list);
2354 kmem_free(ian, sizeof (itx_async_node_t));
2355 }
2356 avl_destroy(t);
2357
2358 kmem_free(itxs, sizeof (itxs_t));
2359 }
2360
2361 static int
2362 zil_aitx_compare(const void *x1, const void *x2)
2363 {
2364 const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
2365 const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
2366
2367 return (TREE_CMP(o1, o2));
2368 }
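
/*
 * Side note (not part of the build): TREE_CMP must return -1, 0, or
 * +1. A minimal sketch of a safe three-way compare for uint64_t keys;
 * plain subtraction could overflow and is not usable here.
 */
#if 0
#include <stdint.h>

static int
cmp_u64(uint64_t a, uint64_t b)
{
	/* (a > b) and (a < b) are each 0 or 1, so this yields -1/0/+1. */
	return ((a > b) - (a < b));
}
#endif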
2369
2370 /*
2371 * Remove all async itx with the given oid.
2372 */
2373 void
2374 zil_remove_async(zilog_t *zilog, uint64_t oid)
2375 {
2376 uint64_t otxg, txg;
2377 itx_async_node_t *ian, ian_search;
2378 avl_tree_t *t;
2379 avl_index_t where;
2380 list_t clean_list;
2381 itx_t *itx;
2382
2383 ASSERT(oid != 0);
2384 list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
2385
2386 if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
2387 otxg = ZILTEST_TXG;
2388 else
2389 otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
2390
2391 for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
2392 itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
2393
2394 mutex_enter(&itxg->itxg_lock);
2395 if (itxg->itxg_txg != txg) {
2396 mutex_exit(&itxg->itxg_lock);
2397 continue;
2398 }
2399
2400 /*
2401 * Locate the object node and append its list.
2402 */
2403 t = &itxg->itxg_itxs->i_async_tree;
2404 ian_search.ia_foid = oid;
2405 ian = avl_find(t, &ian_search, &where);
2406 if (ian != NULL)
2407 list_move_tail(&clean_list, &ian->ia_list);
2408 mutex_exit(&itxg->itxg_lock);
2409 }
2410 while ((itx = list_remove_head(&clean_list)) != NULL) {
2411 /* commit itxs should never be on the async lists. */
2412 ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
2413 zil_itx_destroy(itx);
2414 }
2415 list_destroy(&clean_list);
2416 }
2417
2418 void
2419 zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
2420 {
2421 uint64_t txg;
2422 itxg_t *itxg;
2423 itxs_t *itxs, *clean = NULL;
2424
2425 /*
2426 * Ensure the data of a renamed file is committed before the rename.
2427 */
2428 if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
2429 zil_async_to_sync(zilog, itx->itx_oid);
2430
2431 if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
2432 txg = ZILTEST_TXG;
2433 else
2434 txg = dmu_tx_get_txg(tx);
2435
2436 itxg = &zilog->zl_itxg[txg & TXG_MASK];
2437 mutex_enter(&itxg->itxg_lock);
2438 itxs = itxg->itxg_itxs;
2439 if (itxg->itxg_txg != txg) {
2440 if (itxs != NULL) {
2441 /*
2442 * The zil_clean callback hasn't got around to cleaning
2443 * this itxg. Save the itxs for release below.
2444 * This should be rare.
2445 */
2446 zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
2447 "txg %llu", (u_longlong_t)itxg->itxg_txg);
2448 clean = itxg->itxg_itxs;
2449 }
2450 itxg->itxg_txg = txg;
2451 itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t),
2452 KM_SLEEP);
2453
2454 list_create(&itxs->i_sync_list, sizeof (itx_t),
2455 offsetof(itx_t, itx_node));
2456 avl_create(&itxs->i_async_tree, zil_aitx_compare,
2457 sizeof (itx_async_node_t),
2458 offsetof(itx_async_node_t, ia_node));
2459 }
2460 if (itx->itx_sync) {
2461 list_insert_tail(&itxs->i_sync_list, itx);
2462 } else {
2463 avl_tree_t *t = &itxs->i_async_tree;
2464 uint64_t foid =
2465 LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
2466 itx_async_node_t *ian;
2467 avl_index_t where;
2468
2469 ian = avl_find(t, &foid, &where);
2470 if (ian == NULL) {
2471 ian = kmem_alloc(sizeof (itx_async_node_t),
2472 KM_SLEEP);
2473 list_create(&ian->ia_list, sizeof (itx_t),
2474 offsetof(itx_t, itx_node));
2475 ian->ia_foid = foid;
2476 avl_insert(t, ian, where);
2477 }
2478 list_insert_tail(&ian->ia_list, itx);
2479 }
2480
2481 itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
2482
2483 /*
2484 * We don't want to dirty the ZIL using ZILTEST_TXG, because
2485 * zil_clean() will never be called using ZILTEST_TXG. Thus, we
2486 * need to be careful to always dirty the ZIL using the "real"
2487 * TXG (not itxg_txg) even when the SPA is frozen.
2488 */
2489 zilog_dirty(zilog, dmu_tx_get_txg(tx));
2490 mutex_exit(&itxg->itxg_lock);
2491
2492 /* Release the old itxs now we've dropped the lock */
2493 if (clean != NULL)
2494 zil_itxg_clean(clean);
2495 }
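
/*
 * Illustrative sketch (not part of the build): the "txg & TXG_MASK"
 * indexing above maps consecutive txgs onto a small ring of itxg
 * slots. RING is an assumed stand-in for TXG_SIZE.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define RING 4
#define RING_MASK (RING - 1)

int
main(void)
{
	/* Consecutive txgs reuse the small ring of itxg slots. */
	for (uint64_t txg = 100; txg < 106; txg++)
		printf("txg %llu -> slot %llu\n",
		    (unsigned long long)txg,
		    (unsigned long long)(txg & RING_MASK));
	return (0);
}
#endif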
2496
2497 /*
2498 * If there are any in-memory intent log transactions which have now been
2499 * synced then start up a taskq to free them. We should only do this after we
2500 * have written out the uberblocks (i.e. txg has been committed) so that we
2501 * don't inadvertently clean out in-memory log records that would be required
2502 * by zil_commit().
2503 */
2504 void
2505 zil_clean(zilog_t *zilog, uint64_t synced_txg)
2506 {
2507 itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
2508 itxs_t *clean_me;
2509
2510 ASSERT3U(synced_txg, <, ZILTEST_TXG);
2511
2512 mutex_enter(&itxg->itxg_lock);
2513 if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
2514 mutex_exit(&itxg->itxg_lock);
2515 return;
2516 }
2517 ASSERT3U(itxg->itxg_txg, <=, synced_txg);
2518 ASSERT3U(itxg->itxg_txg, !=, 0);
2519 clean_me = itxg->itxg_itxs;
2520 itxg->itxg_itxs = NULL;
2521 itxg->itxg_txg = 0;
2522 mutex_exit(&itxg->itxg_lock);
2523 /*
2524 * Preferably start a task queue to free up the old itxs, but
2525 * if taskq_dispatch can't allocate resources to do that, then
2526 * free them in-line. This should be rare. Note that using
2527 * TQ_SLEEP here created a bad performance problem.
2528 */
2529 ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
2530 ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
2531 taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
2532 zil_itxg_clean, clean_me, TQ_NOSLEEP);
2533 if (id == TASKQID_INVALID)
2534 zil_itxg_clean(clean_me);
2535 }
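
/*
 * Illustrative sketch (not part of the build): the dispatch-or-inline
 * fallback used by zil_clean() above, with a hypothetical toy
 * dispatcher in place of taskq_dispatch().
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

typedef void (task_func_t)(void *);

/* Toy dispatcher: pretend a TQ_NOSLEEP resource allocation failed. */
static bool
try_dispatch(task_func_t *func, void *arg)
{
	(void) func;
	(void) arg;
	return (false);
}

/* Prefer async execution; on failure run in-line rather than sleep. */
static void
dispatch_or_inline(task_func_t *func, void *arg)
{
	if (!try_dispatch(func, arg))
		func(arg);
}

static void
cleanup(void *arg)
{
	printf("cleaning %s in-line\n", (const char *)arg);
}

int
main(void)
{
	dispatch_or_inline(cleanup, "itxg");
	return (0);
}
#endif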
2536
2537 /*
2538 * This function will traverse the queue of itxs that need to be
2539 * committed, and move them onto the ZIL's zl_itx_commit_list.
2540 */
2541 static uint64_t
2542 zil_get_commit_list(zilog_t *zilog)
2543 {
2544 uint64_t otxg, txg, wtxg = 0;
2545 list_t *commit_list = &zilog->zl_itx_commit_list;
2546
2547 ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
2548
2549 if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
2550 otxg = ZILTEST_TXG;
2551 else
2552 otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
2553
2554 /*
2555 * This is inherently racy, since there is nothing to prevent
2556 * the last synced txg from changing. That's okay since we'll
2557 * only commit things in the future.
2558 */
2559 for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
2560 itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
2561
2562 mutex_enter(&itxg->itxg_lock);
2563 if (itxg->itxg_txg != txg) {
2564 mutex_exit(&itxg->itxg_lock);
2565 continue;
2566 }
2567
2568 /*
2569 * If we're adding itx records to the zl_itx_commit_list,
2570 * then the zil better be dirty in this "txg". We can assert
2571 * that here since we're holding the itxg_lock which will
2572 * prevent spa_sync from cleaning it. Once we add the itxs
2573 * to the zl_itx_commit_list we must commit it to disk even
2574 * if it's unnecessary (i.e. the txg was synced).
2575 */
2576 ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
2577 spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
2578 list_t *sync_list = &itxg->itxg_itxs->i_sync_list;
2579 if (unlikely(zilog->zl_suspend > 0)) {
2580 /*
2581 * ZIL was just suspended, but we lost the race.
2582 * Allow all earlier itxs to be committed, but ask
2583 * caller to do txg_wait_synced(txg) for any new.
2584 */
2585 if (!list_is_empty(sync_list))
2586 wtxg = MAX(wtxg, txg);
2587 } else {
2588 list_move_tail(commit_list, sync_list);
2589 }
2590
2591 mutex_exit(&itxg->itxg_lock);
2592 }
2593 return (wtxg);
2594 }
2595
2596 /*
2597 * Move the async itxs for a specified object to commit into sync lists.
2598 */
2599 void
2600 zil_async_to_sync(zilog_t *zilog, uint64_t foid)
2601 {
2602 uint64_t otxg, txg;
2603 itx_async_node_t *ian, ian_search;
2604 avl_tree_t *t;
2605 avl_index_t where;
2606
2607 if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
2608 otxg = ZILTEST_TXG;
2609 else
2610 otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
2611
2612 /*
2613 * This is inherently racy, since there is nothing to prevent
2614 * the last synced txg from changing.
2615 */
2616 for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
2617 itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
2618
2619 mutex_enter(&itxg->itxg_lock);
2620 if (itxg->itxg_txg != txg) {
2621 mutex_exit(&itxg->itxg_lock);
2622 continue;
2623 }
2624
2625 /*
2626 * If a foid is specified then find that node and append its
2627 * list. Otherwise walk the tree appending all the lists
2628 * to the sync list. We add to the end rather than the
2629 * beginning to ensure the create has happened.
2630 */
2631 t = &itxg->itxg_itxs->i_async_tree;
2632 if (foid != 0) {
2633 ian_search.ia_foid = foid;
2634 ian = avl_find(t, &ian_search, &where);
2635 if (ian != NULL) {
2636 list_move_tail(&itxg->itxg_itxs->i_sync_list,
2637 &ian->ia_list);
2638 }
2639 } else {
2640 void *cookie = NULL;
2641
2642 while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
2643 list_move_tail(&itxg->itxg_itxs->i_sync_list,
2644 &ian->ia_list);
2645 list_destroy(&ian->ia_list);
2646 kmem_free(ian, sizeof (itx_async_node_t));
2647 }
2648 }
2649 mutex_exit(&itxg->itxg_lock);
2650 }
2651 }
2652
2653 /*
2654 * This function will prune commit itxs that are at the head of the
2655 * commit list (it won't prune past the first non-commit itx), and
2656 * either: a) attach them to the last lwb that's still pending
2657 * completion, or b) skip them altogether.
2658 *
2659 * This is used as a performance optimization to prevent commit itxs
2660 * from generating new lwbs when it's unnecessary to do so.
2661 */
2662 static void
2663 zil_prune_commit_list(zilog_t *zilog)
2664 {
2665 itx_t *itx;
2666
2667 ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
2668
2669 while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
2670 lr_t *lrc = &itx->itx_lr;
2671 if (lrc->lrc_txtype != TX_COMMIT)
2672 break;
2673
2674 mutex_enter(&zilog->zl_lock);
2675
2676 lwb_t *last_lwb = zilog->zl_last_lwb_opened;
2677 if (last_lwb == NULL ||
2678 last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
2679 /*
2680 * All of the itxs this waiter was waiting on
2681 * must have already completed (or there were
2682 * never any itx's for it to wait on), so it's
2683 * safe to skip this waiter and mark it done.
2684 */
2685 zil_commit_waiter_skip(itx->itx_private);
2686 } else {
2687 zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
2688 }
2689
2690 mutex_exit(&zilog->zl_lock);
2691
2692 list_remove(&zilog->zl_itx_commit_list, itx);
2693 zil_itx_destroy(itx);
2694 }
2695
2696 IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
2697 }
2698
2699 static void
2700 zil_commit_writer_stall(zilog_t *zilog)
2701 {
2702 /*
2703 * When zio_alloc_zil() fails to allocate the next lwb block on
2704 * disk, we must call txg_wait_synced() to ensure all of the
2705 * lwbs in the zilog's zl_lwb_list are synced and then freed (in
2706 * zil_sync()), such that any subsequent ZIL writer (i.e. a call
2707 * to zil_process_commit_list()) will have to call zil_create(),
2708 * and start a new ZIL chain.
2709 *
2710 * Since zio_alloc_zil() failed, the lwb that was previously
2711 * issued does not have a pointer to the "next" lwb on disk.
2712 * Thus, if another ZIL writer thread was to allocate the "next"
2713 * on-disk lwb, that block could be leaked in the event of a
2714 * crash (because the previous lwb on-disk would not point to
2715 * it).
2716 *
2717 * We must hold the zilog's zl_issuer_lock while we do this, to
2718 * ensure no new threads enter zil_process_commit_list() until
2719 * all lwb's in the zl_lwb_list have been synced and freed
2720 * (which is achieved via the txg_wait_synced() call).
2721 */
2722 ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
2723 txg_wait_synced(zilog->zl_dmu_pool, 0);
2724 ASSERT(list_is_empty(&zilog->zl_lwb_list));
2725 }
2726
2727 static void
2728 zil_burst_done(zilog_t *zilog)
2729 {
2730 if (!list_is_empty(&zilog->zl_itx_commit_list) ||
2731 zilog->zl_cur_used == 0)
2732 return;
2733
2734 if (zilog->zl_parallel)
2735 zilog->zl_parallel--;
2736
2737 zilog->zl_cur_used = 0;
2738 }
2739
2740 /*
2741 * This function will traverse the commit list, creating new lwbs as
2742 * needed, and committing the itxs from the commit list to these newly
2743 * created lwbs. Additionally, as a new lwb is created, the previous
2744 * lwb will be issued to the zio layer to be written to disk.
2745 */
2746 static void
2747 zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
2748 {
2749 spa_t *spa = zilog->zl_spa;
2750 list_t nolwb_itxs;
2751 list_t nolwb_waiters;
2752 lwb_t *lwb, *plwb;
2753 itx_t *itx;
2754
2755 ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
2756
2757 /*
2758 * Return if there's nothing to commit before we dirty the fs by
2759 * calling zil_create().
2760 */
2761 if (list_is_empty(&zilog->zl_itx_commit_list))
2762 return;
2763
2764 list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
2765 list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
2766 offsetof(zil_commit_waiter_t, zcw_node));
2767
2768 lwb = list_tail(&zilog->zl_lwb_list);
2769 if (lwb == NULL) {
2770 lwb = zil_create(zilog);
2771 } else {
2772 /*
2773 * Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will
2774 * have already been created (zl_lwb_list not empty).
2775 */
2776 zil_commit_activate_saxattr_feature(zilog);
2777 ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
2778 lwb->lwb_state == LWB_STATE_OPENED);
2779
2780 /*
2781 * If the lwb is still opened, the workload is really
2782 * multi-threaded and we have a good chance of write aggregation.
2783 * If it is not opened yet, but the previous lwb is still not
2784 * flushed, the workload is still multi-threaded, but there was
2785 * too much time between the commits to aggregate, so we keep
2786 * trying to aggregate on later passes, with lowered expectations.
2787 */
2788 if (lwb->lwb_state == LWB_STATE_OPENED) {
2789 zilog->zl_parallel = ZIL_BURSTS;
2790 } else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb))
2791 != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) {
2792 zilog->zl_parallel = MAX(zilog->zl_parallel,
2793 ZIL_BURSTS / 2);
2794 }
2795 }
2796
2797 while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
2798 lr_t *lrc = &itx->itx_lr;
2799 uint64_t txg = lrc->lrc_txg;
2800
2801 ASSERT3U(txg, !=, 0);
2802
2803 if (lrc->lrc_txtype == TX_COMMIT) {
2804 DTRACE_PROBE2(zil__process__commit__itx,
2805 zilog_t *, zilog, itx_t *, itx);
2806 } else {
2807 DTRACE_PROBE2(zil__process__normal__itx,
2808 zilog_t *, zilog, itx_t *, itx);
2809 }
2810
2811 boolean_t synced = txg <= spa_last_synced_txg(spa);
2812 boolean_t frozen = txg > spa_freeze_txg(spa);
2813
2814 /*
2815 * If the txg of this itx has already been synced out, then
2816 * we don't need to commit this itx to an lwb. This is
2817 * because the data of this itx will have already been
2818 * written to the main pool. This is inherently racy, and
2819 * it's still ok to commit an itx whose txg has already
2820 * been synced; this will result in a write that's
2821 * unnecessary, but will do no harm.
2822 *
2823 * With that said, we always want to commit TX_COMMIT itxs
2824 * to an lwb, regardless of whether or not that itx's txg
2825 * has been synced out. We do this to ensure any OPENED lwb
2826 * will always have at least one zil_commit_waiter_t linked
2827 * to the lwb.
2828 *
2829 * As a counter-example, if we skipped TX_COMMIT itx's
2830 * whose txg had already been synced, the following
2831 * situation could occur if we happened to be racing with
2832 * spa_sync:
2833 *
2834 * 1. We commit a non-TX_COMMIT itx to an lwb, where the
2835 * itx's txg is 10 and the last synced txg is 9.
2836 * 2. spa_sync finishes syncing out txg 10.
2837 * 3. We move to the next itx in the list, it's a TX_COMMIT
2838 * whose txg is 10, so we skip it rather than committing
2839 * it to the lwb used in (1).
2840 *
2841 * If the itx that is skipped in (3) is the last TX_COMMIT
2842 * itx in the commit list, then it's possible for the lwb
2843 * used in (1) to remain in the OPENED state indefinitely.
2844 *
2845 * To prevent the above scenario from occurring, ensuring
2846 * that once an lwb is OPENED it will transition to ISSUED
2847 * and eventually DONE, we always commit TX_COMMIT itx's to
2848 * an lwb here, even if that itx's txg has already been
2849 * synced.
2850 *
2851 * Finally, if the pool is frozen, we _always_ commit the
2852 * itx. The point of freezing the pool is to prevent data
2853 * from being written to the main pool via spa_sync, and
2854 * instead rely solely on the ZIL to persistently store the
2855 * data; i.e. when the pool is frozen, the last synced txg
2856 * value can't be trusted.
2857 */
2858 if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
2859 if (lwb != NULL) {
2860 lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs);
2861 if (lwb == NULL) {
2862 list_insert_tail(&nolwb_itxs, itx);
2863 } else if ((zcw->zcw_lwb != NULL &&
2864 zcw->zcw_lwb != lwb) || zcw->zcw_done) {
2865 /*
2866 * Our lwb is done; leave the rest of
2867 * the itx list to somebody else who cares.
2868 */
2869 zilog->zl_parallel = ZIL_BURSTS;
2870 break;
2871 }
2872 } else {
2873 if (lrc->lrc_txtype == TX_COMMIT) {
2874 zil_commit_waiter_link_nolwb(
2875 itx->itx_private, &nolwb_waiters);
2876 }
2877 list_insert_tail(&nolwb_itxs, itx);
2878 }
2879 } else {
2880 ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT);
2881 zil_itx_destroy(itx);
2882 }
2883 }
2884
2885 if (lwb == NULL) {
2886 /*
2887 * This indicates zio_alloc_zil() failed to allocate the
2888 * "next" lwb on-disk. When this happens, we must stall
2889 * the ZIL write pipeline; see the comment within
2890 * zil_commit_writer_stall() for more details.
2891 */
2892 while ((lwb = list_remove_head(ilwbs)) != NULL)
2893 zil_lwb_write_issue(zilog, lwb);
2894 zil_commit_writer_stall(zilog);
2895
2896 /*
2897 * Additionally, we have to signal and mark the "nolwb"
2898 * waiters as "done" here, since without an lwb, we
2899 * can't do this via zil_lwb_flush_vdevs_done() like
2900 * normal.
2901 */
2902 zil_commit_waiter_t *zcw;
2903 while ((zcw = list_remove_head(&nolwb_waiters)) != NULL)
2904 zil_commit_waiter_skip(zcw);
2905
2906 /*
2907 * And finally, we have to destroy the itx's that
2908 * couldn't be committed to an lwb; this will also call
2909 * the itx's callback if one exists for the itx.
2910 */
2911 while ((itx = list_remove_head(&nolwb_itxs)) != NULL)
2912 zil_itx_destroy(itx);
2913 } else {
2914 ASSERT(list_is_empty(&nolwb_waiters));
2915 ASSERT3P(lwb, !=, NULL);
2916 ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
2917 lwb->lwb_state == LWB_STATE_OPENED);
2918
2919 /*
2920 * At this point, the ZIL block pointed at by the "lwb"
2921 * variable is in "new" or "opened" state.
2922 *
2923 * If it's "new", then no itxs have been committed to it, so
2924 * there's no point in issuing its zio (i.e. it's "empty").
2925 *
2926 * If it's "opened", then it contains one or more itxs that
2927 * eventually need to be committed to stable storage. In
2928 * this case we intentionally do not issue the lwb's zio
2929 * to disk yet, and instead rely on one of the following
2930 * two mechanisms for issuing the zio:
2931 *
2932 * 1. Ideally, there will be more ZIL activity occurring on
2933 * the system, such that this function will be immediately
2934 * called again by a different thread and this lwb will be
2935 * closed by zil_lwb_assign(). This way, the lwb will be
2936 * "full" when it is issued to disk, and we'll make use of
2937 * the lwb's size as best we can.
2938 *
2939 * 2. If there isn't sufficient ZIL activity occurring on
2940 * the system, zil_commit_waiter() will close it and issue
2941 * the zio. If this occurs, the lwb is not guaranteed
2942 * to be "full" by the time its zio is issued, which means
2943 * the size of the lwb was "too large" given the amount
2944 * of ZIL activity occurring on the system at that time.
2945 *
2946 * We do this for a couple of reasons:
2947 *
2948 * 1. To try and reduce the number of IOPs needed to
2949 * write the same number of itxs. If an lwb has space
2950 * available in its buffer for more itxs, and more itxs
2951 * will be committed relatively soon (relative to the
2952 * latency of performing a write), then it's beneficial
2953 * to wait for these "next" itxs. This way, more itxs
2954 * can be committed to stable storage with fewer writes.
2955 *
2956 * 2. To try and use the largest lwb block size that the
2957 * incoming rate of itxs can support. Again, this is to
2958 * try and pack as many itxs into as few lwbs as
2959 * possible, without significantly impacting the latency
2960 * of each individual itx.
2961 */
2962 if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
2963 list_insert_tail(ilwbs, lwb);
2964 lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
2965 zil_burst_done(zilog);
2966 if (lwb == NULL) {
2967 while ((lwb = list_remove_head(ilwbs)) != NULL)
2968 zil_lwb_write_issue(zilog, lwb);
2969 zil_commit_writer_stall(zilog);
2970 }
2971 }
2972 }
2973 }
2974
2975 /*
2976 * This function is responsible for ensuring the passed in commit waiter
2977 * (and associated commit itx) is committed to an lwb. If the waiter is
2978 * not already committed to an lwb, all itxs in the zilog's queue of
2979 * itxs will be processed. The assumption is the passed in waiter's
2980 * commit itx will found in the queue just like the other non-commit
2981 * itxs, such that when the entire queue is processed, the waiter will
2982 * have been committed to an lwb.
2983 *
2984 * The lwb associated with the passed in waiter is not guaranteed to
2985 * have been issued by the time this function completes. If the lwb is
2986 * not issued, we rely on future calls to zil_commit_writer() to issue
2987 * the lwb, or the timeout mechanism found in zil_commit_waiter().
2988 */
2989 static uint64_t
2990 zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
2991 {
2992 list_t ilwbs;
2993 lwb_t *lwb;
2994 uint64_t wtxg = 0;
2995
2996 ASSERT(!MUTEX_HELD(&zilog->zl_lock));
2997 ASSERT(spa_writeable(zilog->zl_spa));
2998
2999 list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node));
3000 mutex_enter(&zilog->zl_issuer_lock);
3001
3002 if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
3003 /*
3004 * It's possible that, while we were waiting to acquire
3005 * the "zl_issuer_lock", another thread committed this
3006 * waiter to an lwb. If that occurs, we bail out early,
3007 * without processing any of the zilog's queue of itxs.
3008 *
3009 * On certain workloads and system configurations, the
3010 * "zl_issuer_lock" can become highly contended. In an
3011 * attempt to reduce this contention, we immediately drop
3012 * the lock if the waiter has already been processed.
3013 *
3014 * We've measured this optimization to reduce CPU spent
3015 * contending on this lock by up to 5%, using a system
3016 * with 32 CPUs, low latency storage (~50 usec writes),
3017 * and 1024 threads performing sync writes.
3018 */
3019 goto out;
3020 }
3021
3022 ZIL_STAT_BUMP(zilog, zil_commit_writer_count);
3023
3024 wtxg = zil_get_commit_list(zilog);
3025 zil_prune_commit_list(zilog);
3026 zil_process_commit_list(zilog, zcw, &ilwbs);
3027
3028 out:
3029 mutex_exit(&zilog->zl_issuer_lock);
3030 while ((lwb = list_remove_head(&ilwbs)) != NULL)
3031 zil_lwb_write_issue(zilog, lwb);
3032 list_destroy(&ilwbs);
3033 return (wtxg);
3034 }
3035
3036 static void
3037 zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
3038 {
3039 ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
3040 ASSERT(MUTEX_HELD(&zcw->zcw_lock));
3041 ASSERT3B(zcw->zcw_done, ==, B_FALSE);
3042
3043 lwb_t *lwb = zcw->zcw_lwb;
3044 ASSERT3P(lwb, !=, NULL);
3045 ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW);
3046
3047 /*
3048 * If the lwb has already been issued by another thread, we can
3049 * immediately return since there's no work to be done (the
3050 * point of this function is to issue the lwb). Additionally, we
3051 * do this prior to acquiring the zl_issuer_lock, to avoid
3052 * acquiring it when it's not necessary to do so.
3053 */
3054 if (lwb->lwb_state != LWB_STATE_OPENED)
3055 return;
3056
3057 /*
3058 * In order to call zil_lwb_write_close() we must hold the
3059 * zilog's "zl_issuer_lock". We can't simply acquire that lock,
3060 * since we're already holding the commit waiter's "zcw_lock",
3061 * and those two locks are acquired in the opposite order
3062 * elsewhere.
3063 */
3064 mutex_exit(&zcw->zcw_lock);
3065 mutex_enter(&zilog->zl_issuer_lock);
3066 mutex_enter(&zcw->zcw_lock);
3067
3068 /*
3069 * Since we just dropped and re-acquired the commit waiter's
3070 * lock, we have to re-check to see if the waiter was marked
3071 * "done" during that process. If the waiter was marked "done",
3072 * the "lwb" pointer is no longer valid (it can be free'd after
3073 * the waiter is marked "done"), so without this check we could
3074 * wind up with a use-after-free error below.
3075 */
3076 if (zcw->zcw_done) {
3077 mutex_exit(&zilog->zl_issuer_lock);
3078 return;
3079 }
3080
3081 ASSERT3P(lwb, ==, zcw->zcw_lwb);
3082
3083 /*
3084 * We've already checked this above, but since we hadn't acquired
3085 * the zilog's zl_issuer_lock, we have to perform this check a
3086 * second time while holding the lock.
3087 *
3088 * We don't need to hold the zl_lock since the lwb cannot transition
3089 * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb
3090 * _can_ transition from CLOSED to DONE, but it's OK to race with
3091 * that transition since we treat the lwb the same, whether it's in
3092 * the CLOSED, ISSUED or DONE states.
3093 *
3094 * The important thing is that we treat the lwb differently depending
3095 * on whether it's OPENED or CLOSED, and block any other threads that
3096 * might attempt to close/issue this lwb. For that reason we hold the
3097 * zl_issuer_lock when checking the lwb_state; we must not call
3098 * zil_lwb_write_close() if the lwb had already been closed/issued.
3099 *
3100 * See the comment above the lwb_state_t structure definition for
3101 * more details on the lwb states, and locking requirements.
3102 */
3103 if (lwb->lwb_state != LWB_STATE_OPENED) {
3104 mutex_exit(&zilog->zl_issuer_lock);
3105 return;
3106 }
3107
3108 /*
3109 * We do not need zcw_lock once we hold zl_issuer_lock and know the lwb
3110 * is still open. But we have to drop it to avoid a deadlock in case a
3111 * callback of a zio issued by zil_lwb_write_issue() tries to get it
3112 * while zil_lwb_write_issue() is blocked attempting to issue the next
3113 * lwb it found in the LWB_STATE_READY state.
3114 */
3115 mutex_exit(&zcw->zcw_lock);
3116
3117 /*
3118 * As described in the comments above zil_commit_waiter() and
3119 * zil_process_commit_list(), we need to issue this lwb's zio
3120 * since we've reached the commit waiter's timeout and it still
3121 * hasn't been issued.
3122 */
3123 lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
3124
3125 ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
3126
3127 zil_burst_done(zilog);
3128
3129 if (nlwb == NULL) {
3130 /*
3131 * When zil_lwb_write_close() returns NULL, this
3132 * indicates zio_alloc_zil() failed to allocate the
3133 * "next" lwb on-disk. When this occurs, the ZIL write
3134 * pipeline must be stalled; see the comment within the
3135 * zil_commit_writer_stall() function for more details.
3136 */
3137 zil_lwb_write_issue(zilog, lwb);
3138 zil_commit_writer_stall(zilog);
3139 mutex_exit(&zilog->zl_issuer_lock);
3140 } else {
3141 mutex_exit(&zilog->zl_issuer_lock);
3142 zil_lwb_write_issue(zilog, lwb);
3143 }
3144 mutex_enter(&zcw->zcw_lock);
3145 }
3146
3147 /*
3148 * This function is responsible for performing the following two tasks:
3149 *
3150 * 1. its primary responsibility is to block until the given "commit
3151 * waiter" is considered "done".
3152 *
3153 * 2. its secondary responsibility is to issue the zio for the lwb that
3154 * the given "commit waiter" is waiting on, if this function has
3155 * waited "long enough" and the lwb is still in the "open" state.
3156 *
3157 * Given a sufficient amount of itxs being generated and written using
3158 * the ZIL, the lwb's zio will be issued via the zil_lwb_assign()
3159 * function. If this does not occur, this secondary responsibility will
3160 * ensure the lwb is issued even if there is no other synchronous
3161 * activity on the system.
3162 *
3163 * For more details, see zil_process_commit_list(); more specifically,
3164 * the comment at the bottom of that function.
3165 */
3166 static void
3167 zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
3168 {
3169 ASSERT(!MUTEX_HELD(&zilog->zl_lock));
3170 ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
3171 ASSERT(spa_writeable(zilog->zl_spa));
3172
3173 mutex_enter(&zcw->zcw_lock);
3174
3175 /*
3176 * The timeout is scaled based on the lwb latency to avoid
3177 * significantly impacting the latency of each individual itx.
3178 * For more details, see the comment at the bottom of the
3179 * zil_process_commit_list() function.
3180 */
3181 int pct = MAX(zfs_commit_timeout_pct, 1);
3182 hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
3183 hrtime_t wakeup = gethrtime() + sleep;
3184 boolean_t timedout = B_FALSE;
3185
3186 while (!zcw->zcw_done) {
3187 ASSERT(MUTEX_HELD(&zcw->zcw_lock));
3188
3189 lwb_t *lwb = zcw->zcw_lwb;
3190
3191 /*
3192 * Usually, the waiter will have a non-NULL lwb field here,
3193 * but it's possible for it to be NULL as a result of
3194 * zil_commit() racing with spa_sync().
3195 *
3196 * When zil_clean() is called, it's possible for the itxg
3197 * list (which may be cleaned via a taskq) to contain
3198 * commit itxs. When this occurs, the commit waiters linked
3199 * off of these commit itxs will not be committed to an
3200 * lwb. Additionally, these commit waiters will not be
3201 * marked done until zil_commit_waiter_skip() is called via
3202 * zil_itxg_clean().
3203 *
3204 * Thus, it's possible for this commit waiter (i.e. the
3205 * "zcw" variable) to be found in this "in between" state;
3206 * where its "zcw_lwb" field is NULL, and it hasn't yet
3207 * been skipped, so its "zcw_done" field is still B_FALSE.
3208 */
3209 IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW);
3210
3211 if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
3212 ASSERT3B(timedout, ==, B_FALSE);
3213
3214 /*
3215 * If the lwb hasn't been issued yet, then we
3216 * need to wait with a timeout, in case this
3217 * function needs to issue the lwb after the
3218 * timeout is reached; responsibility (2) from
3219 * the comment above this function.
3220 */
3221 int rc = cv_timedwait_hires(&zcw->zcw_cv,
3222 &zcw->zcw_lock, wakeup, USEC2NSEC(1),
3223 CALLOUT_FLAG_ABSOLUTE);
3224
3225 if (rc != -1 || zcw->zcw_done)
3226 continue;
3227
3228 timedout = B_TRUE;
3229 zil_commit_waiter_timeout(zilog, zcw);
3230
3231 if (!zcw->zcw_done) {
3232 /*
3233 * If the commit waiter has already been
3234 * marked "done", it's possible for the
3235 * waiter's lwb structure to have already
3236 * been freed. Thus, we can only reliably
3237 * make these assertions if the waiter
3238 * isn't done.
3239 */
3240 ASSERT3P(lwb, ==, zcw->zcw_lwb);
3241 ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
3242 }
3243 } else {
3244 /*
3245 * If the lwb isn't open, then it must have already
3246 * been issued. In that case, there's no need to
3247 * use a timeout when waiting for the lwb to
3248 * complete.
3249 *
3250 * Additionally, if the lwb is NULL, the waiter
3251 * will soon be signaled and marked done via
3252 * zil_clean() and zil_itxg_clean(), so no timeout
3253 * is required.
3254 */
3255
3256 IMPLY(lwb != NULL,
3257 lwb->lwb_state == LWB_STATE_CLOSED ||
3258 lwb->lwb_state == LWB_STATE_READY ||
3259 lwb->lwb_state == LWB_STATE_ISSUED ||
3260 lwb->lwb_state == LWB_STATE_WRITE_DONE ||
3261 lwb->lwb_state == LWB_STATE_FLUSH_DONE);
3262 cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
3263 }
3264 }
3265
3266 mutex_exit(&zcw->zcw_lock);
3267 }
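
/*
 * Worked example (not part of the build): the latency-scaled timeout
 * computed at the top of zil_commit_waiter(). The 5% figure below is
 * only an assumed value for zfs_commit_timeout_pct.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

typedef int64_t hrtime_t;	/* nanoseconds */

static hrtime_t
commit_timeout(hrtime_t last_lwb_latency, int timeout_pct)
{
	int pct = timeout_pct > 1 ? timeout_pct : 1;	/* MAX(pct, 1) */

	return ((last_lwb_latency * pct) / 100);
}

int
main(void)
{
	/* A 200 usec lwb latency at 5% gives a 10 usec wait timeout. */
	printf("%lld ns\n", (long long)commit_timeout(200000, 5));
	return (0);
}
#endif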
3268
3269 static zil_commit_waiter_t *
3270 zil_alloc_commit_waiter(void)
3271 {
3272 zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);
3273
3274 cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL);
3275 mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
3276 list_link_init(&zcw->zcw_node);
3277 zcw->zcw_lwb = NULL;
3278 zcw->zcw_done = B_FALSE;
3279 zcw->zcw_zio_error = 0;
3280
3281 return (zcw);
3282 }
3283
3284 static void
3285 zil_free_commit_waiter(zil_commit_waiter_t *zcw)
3286 {
3287 ASSERT(!list_link_active(&zcw->zcw_node));
3288 ASSERT3P(zcw->zcw_lwb, ==, NULL);
3289 ASSERT3B(zcw->zcw_done, ==, B_TRUE);
3290 mutex_destroy(&zcw->zcw_lock);
3291 cv_destroy(&zcw->zcw_cv);
3292 kmem_cache_free(zil_zcw_cache, zcw);
3293 }
3294
3295 /*
3296 * This function is used to create a TX_COMMIT itx and assign it. This
3297 * way, it will be linked into the ZIL's list of synchronous itxs, and
3298 * then later committed to an lwb (or skipped) when
3299 * zil_process_commit_list() is called.
3300 */
3301 static void
3302 zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
3303 {
3304 dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
3305
3306 /*
3307 * Since we are not going to create any new dirty data, and we
3308 * can even help with clearing the existing dirty data, we
3309 * should not be subject to the dirty data based delays. We
3310 * use TXG_NOTHROTTLE to bypass the delay mechanism.
3311 */
3312 VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
3313
3314 itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
3315 itx->itx_sync = B_TRUE;
3316 itx->itx_private = zcw;
3317
3318 zil_itx_assign(zilog, itx, tx);
3319
3320 dmu_tx_commit(tx);
3321 }
3322
3323 /*
3324 * Commit ZFS Intent Log transactions (itxs) to stable storage.
3325 *
3326 * When writing ZIL transactions to the on-disk representation of the
3327 * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
3328 * itxs can be committed to a single lwb. Once a lwb is written and
3329 * committed to stable storage (i.e. the lwb is written, and vdevs have
3330 * been flushed), each itx that was committed to that lwb is also
3331 * considered to be committed to stable storage.
3332 *
3333 * When an itx is committed to an lwb, the log record (lr_t) contained
3334 * by the itx is copied into the lwb's zio buffer, and once this buffer
3335 * is written to disk, it becomes an on-disk ZIL block.
3336 *
3337 * As itxs are generated, they're inserted into the ZIL's queue of
3338 * uncommitted itxs. The semantics of zil_commit() are such that it will
3339 * block until all itxs that were in the queue when it was called, are
3340 * committed to stable storage.
3341 *
3342 * If "foid" is zero, this means all "synchronous" and "asynchronous"
3343 * itxs, for all objects in the dataset, will be committed to stable
3344 * storage prior to zil_commit() returning. If "foid" is non-zero, all
3345 * "synchronous" itxs for all objects, but only "asynchronous" itxs
3346 * that correspond to the foid passed in, will be committed to stable
3347 * storage prior to zil_commit() returning.
3348 *
3349 * Generally speaking, when zil_commit() is called, the consumer doesn't
3350 * actually care about _all_ of the uncommitted itxs. Instead, they're
3351 * simply trying to wait for a specific itx to be committed to disk,
3352 * but the interface(s) for interacting with the ZIL don't allow such
3353 * fine-grained communication. A better interface would allow a consumer
3354 * to create and assign an itx, and then pass a reference to this itx to
3355 * zil_commit(); such that zil_commit() would return as soon as that
3356 * specific itx was committed to disk (instead of waiting for _all_
3357 * itxs to be committed).
3358 *
3359 * When a thread calls zil_commit() a special "commit itx" will be
3360 * generated, along with a corresponding "waiter" for this commit itx.
3361 * zil_commit() will wait on this waiter's CV, such that when the waiter
3362 * is marked done, and signaled, zil_commit() will return.
3363 *
3364 * This commit itx is inserted into the queue of uncommitted itxs. This
3365 * provides an easy mechanism for determining which itxs were in the
3366 * queue prior to zil_commit() having been called, and which itxs were
3367 * added after zil_commit() was called.
3368 *
3369 * The commit itx is special; it doesn't have any on-disk representation.
3370 * When a commit itx is "committed" to an lwb, the waiter associated
3371 * with it is linked onto the lwb's list of waiters. Then, when that lwb
3372 * completes, each waiter on the lwb's list is marked done and signaled
3373 * -- allowing the thread waiting on the waiter to return from zil_commit().
3374 *
3375 * It's important to point out a few critical factors that allow us
3376 * to make use of the commit itxs, commit waiters, per-lwb lists of
3377 * commit waiters, and zio completion callbacks like we're doing:
3378 *
3379 * 1. The list of waiters for each lwb is traversed, and each commit
3380 * waiter is marked "done" and signaled, in the zio completion
3381 * callback of the lwb's zio[*].
3382 *
3383 * * Actually, the waiters are signaled in the zio completion
3384 * callback of the root zio for the DKIOCFLUSHWRITECACHE commands
3385 * that are sent to the vdevs upon completion of the lwb zio.
3386 *
3387 * 2. When the itxs are inserted into the ZIL's queue of uncommitted
3388 * itxs, the order in which they are inserted is preserved[*]; as
3389 * itxs are added to the queue, they are added to the tail of
3390 * in-memory linked lists.
3391 *
3392 * When committing the itxs to lwbs (to be written to disk), they
3393 * are committed in the same order in which the itxs were added to
3394 * the uncommitted queue's linked list(s); i.e. the linked list of
3395 * itxs to commit is traversed from head to tail, and each itx is
3396 * committed to an lwb in that order.
3397 *
3398 * * To clarify:
3399 *
3400 * - the order of "sync" itxs is preserved w.r.t. other
3401 * "sync" itxs, regardless of the corresponding objects.
3402 * - the order of "async" itxs is preserved w.r.t. other
3403 * "async" itxs corresponding to the same object.
3404 * - the order of "async" itxs is *not* preserved w.r.t. other
3405 * "async" itxs corresponding to different objects.
3406 * - the order of "sync" itxs w.r.t. "async" itxs (or vice
3407 * versa) is *not* preserved, even for itxs that correspond
3408 * to the same object.
3409 *
3410 * For more details, see: zil_itx_assign(), zil_async_to_sync(),
3411 * zil_get_commit_list(), and zil_process_commit_list().
3412 *
3413 * 3. The lwbs represent a linked list of blocks on disk. Thus, no
3414 * lwb can be considered committed to stable storage until its
3415 * "previous" lwb is also committed to stable storage. This fact,
3416 * coupled with the fact described above, means that itxs are
3417 * committed in (roughly) the order in which they were generated.
3418 * This is essential because itxs are dependent on prior itxs.
3419 * Thus, we *must not* deem an itx as being committed to stable
3420 * storage, until *all* prior itxs have also been committed to
3421 * stable storage.
3422 *
3423 * To enforce this ordering of lwb zio's, while still leveraging as
3424 * much of the underlying storage performance as possible, we rely
3425 * on two fundamental concepts:
3426 *
3427 * 1. The creation and issuance of lwb zio's is protected by
3428 * the zilog's "zl_issuer_lock", which ensures only a single
3429 * thread is creating and/or issuing lwb's at a time
3430 * 2. The "previous" lwb is a child of the "current" lwb
3431 * (leveraging the zio parent-child dependency graph)
3432 *
3433 * By relying on this parent-child zio relationship, we can have
3434 * many lwb zio's concurrently issued to the underlying storage,
3435 * but the order in which they complete will be the same order in
3436 * which they were created.
3437 */
3438 void
3439 zil_commit(zilog_t *zilog, uint64_t foid)
3440 {
3441 /*
3442 * We should never attempt to call zil_commit on a snapshot for
3443 * a couple of reasons:
3444 *
3445 * 1. A snapshot may never be modified, thus it cannot have any
3446 * in-flight itxs that would have modified the dataset.
3447 *
3448 * 2. By design, when zil_commit() is called, a commit itx will
3449 * be assigned to this zilog; as a result, the zilog will be
3450 * dirtied. We must not dirty the zilog of a snapshot; there are
3451 * checks in the code that enforce this invariant, and they will
3452 * cause a panic if it's not upheld.
3453 */
3454 ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
3455
3456 if (zilog->zl_sync == ZFS_SYNC_DISABLED)
3457 return;
3458
3459 if (!spa_writeable(zilog->zl_spa)) {
3460 /*
3461 * If the SPA is not writable, there should never be any
3462 * pending itxs waiting to be committed to disk. If that
3463 * weren't true, we'd skip writing those itxs out, and
3464 * would break the semantics of zil_commit(); thus, we're
3465 * verifying that truth before we return to the caller.
3466 */
3467 ASSERT(list_is_empty(&zilog->zl_lwb_list));
3468 ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
3469 for (int i = 0; i < TXG_SIZE; i++)
3470 ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
3471 return;
3472 }
3473
3474 /*
3475 * If the ZIL is suspended, we don't want to dirty it by calling
3476 * zil_commit_itx_assign() below, nor can we write out
3477 * lwbs as would be done in zil_commit_writer(). Thus, we
3478 * simply rely on txg_wait_synced() to maintain the necessary
3479 * semantics, and avoid calling those functions altogether.
3480 */
3481 if (zilog->zl_suspend > 0) {
3482 txg_wait_synced(zilog->zl_dmu_pool, 0);
3483 return;
3484 }
3485
3486 zil_commit_impl(zilog, foid);
3487 }
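
/*
 * Usage sketch (hypothetical caller, shown only to illustrate the
 * "foid" semantics documented above): an fsync-style path commits all
 * "sync" itxs plus the "async" itxs of one object, while a sync(2)-style
 * path passes 0 to commit everything. The object id "obj" is an
 * assumption; real callers pass e.g. a znode's object number.
 */
static void
example_zil_commit_usage(zilog_t *zilog, uint64_t obj)
{
        /* Commit sync itxs for all objects, plus async itxs for "obj". */
        zil_commit(zilog, obj);

        /* Commit every outstanding itx, sync and async alike. */
        zil_commit(zilog, 0);
}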
3488
3489 void
3490 zil_commit_impl(zilog_t *zilog, uint64_t foid)
3491 {
3492 ZIL_STAT_BUMP(zilog, zil_commit_count);
3493
3494 /*
3495 * Move the "async" itxs for the specified foid to the "sync"
3496 * queues, such that they will later be committed to an lwb (or
3497 * skipped) when zil_process_commit_list() is called.
3498 *
3499 * Since these "async" itxs must be committed prior to this
3500 * call to zil_commit() returning, we must perform this operation
3501 * before we call zil_commit_itx_assign().
3502 */
3503 zil_async_to_sync(zilog, foid);
3504
3505 /*
3506 * We allocate a new "waiter" structure which will initially be
3507 * linked to the commit itx using the itx's "itx_private" field.
3508 * Since the commit itx doesn't represent any on-disk state,
3509 * when it's committed to an lwb, rather than copying its
3510 * lr_t into the lwb's buffer, the commit itx's "waiter" will be
3511 * added to the lwb's list of waiters. Then, when the lwb is
3512 * committed to stable storage, each waiter in the lwb's list of
3513 * waiters will be marked "done", and signalled.
3514 *
3515 * We must create the waiter and assign the commit itx prior to
3516 * calling zil_commit_writer(), or else our specific commit itx
3517 * is not guaranteed to be committed to an lwb prior to calling
3518 * zil_commit_waiter().
3519 */
3520 zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
3521 zil_commit_itx_assign(zilog, zcw);
3522
3523 uint64_t wtxg = zil_commit_writer(zilog, zcw);
3524 zil_commit_waiter(zilog, zcw);
3525
3526 if (zcw->zcw_zio_error != 0) {
3527 /*
3528 * If there was an error writing out the ZIL blocks that
3529 * this thread is waiting on, then we fall back to
3530 * relying on spa_sync() to write out the data this
3531 * thread is waiting on. Obviously this has performance
3532 * implications, but the expectation is that this is an
3533 * exceptional case that shouldn't occur often.
3534 */
3535 DTRACE_PROBE2(zil__commit__io__error,
3536 zilog_t *, zilog, zil_commit_waiter_t *, zcw);
3537 txg_wait_synced(zilog->zl_dmu_pool, 0);
3538 } else if (wtxg != 0) {
3539 txg_wait_synced(zilog->zl_dmu_pool, wtxg);
3540 }
3541
3542 zil_free_commit_waiter(zcw);
3543 }
3544
3545 /*
3546 * Called in syncing context to free committed log blocks and update log header.
3547 */
3548 void
3549 zil_sync(zilog_t *zilog, dmu_tx_t *tx)
3550 {
3551 zil_header_t *zh = zil_header_in_syncing_context(zilog);
3552 uint64_t txg = dmu_tx_get_txg(tx);
3553 spa_t *spa = zilog->zl_spa;
3554 uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
3555 lwb_t *lwb;
3556
3557 /*
3558 * We don't zero out zl_destroy_txg, so make sure we don't try
3559 * to destroy it twice.
3560 */
3561 if (spa_sync_pass(spa) != 1)
3562 return;
3563
3564 zil_lwb_flush_wait_all(zilog, txg);
3565
3566 mutex_enter(&zilog->zl_lock);
3567
3568 ASSERT(zilog->zl_stop_sync == 0);
3569
3570 if (*replayed_seq != 0) {
3571 ASSERT(zh->zh_replay_seq < *replayed_seq);
3572 zh->zh_replay_seq = *replayed_seq;
3573 *replayed_seq = 0;
3574 }
3575
3576 if (zilog->zl_destroy_txg == txg) {
3577 blkptr_t blk = zh->zh_log;
3578 dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
3579
3580 ASSERT(list_is_empty(&zilog->zl_lwb_list));
3581
3582 memset(zh, 0, sizeof (zil_header_t));
3583 memset(zilog->zl_replayed_seq, 0,
3584 sizeof (zilog->zl_replayed_seq));
3585
3586 if (zilog->zl_keep_first) {
3587 /*
3588 * If this block was part of log chain that couldn't
3589 * be claimed because a device was missing during
3590 * zil_claim(), but that device later returns,
3591 * then this block could erroneously appear valid.
3592 * To guard against this, assign a new GUID to the new
3593 * log chain so it doesn't matter what blk points to.
3594 */
3595 zil_init_log_chain(zilog, &blk);
3596 zh->zh_log = blk;
3597 } else {
3598 /*
3599 * A destroyed ZIL chain can't contain any TX_SETSAXATTR
3600 * records. So, deactivate the feature for this dataset.
3601 * We activate it again when we start a new ZIL chain.
3602 */
3603 if (dsl_dataset_feature_is_active(ds,
3604 SPA_FEATURE_ZILSAXATTR))
3605 dsl_dataset_deactivate_feature(ds,
3606 SPA_FEATURE_ZILSAXATTR, tx);
3607 }
3608 }
3609
3610 while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
3611 zh->zh_log = lwb->lwb_blk;
3612 if (lwb->lwb_state != LWB_STATE_FLUSH_DONE ||
3613 lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg)
3614 break;
3615 list_remove(&zilog->zl_lwb_list, lwb);
3616 if (!BP_IS_HOLE(&lwb->lwb_blk))
3617 zio_free(spa, txg, &lwb->lwb_blk);
3618 zil_free_lwb(zilog, lwb);
3619
3620 /*
3621 * If we don't have anything left in the lwb list then
3622 * we've had an allocation failure and we need to zero
3623 * out the zil_header blkptr so that we don't end
3624 * up freeing the same block twice.
3625 */
3626 if (list_is_empty(&zilog->zl_lwb_list))
3627 BP_ZERO(&zh->zh_log);
3628 }
3629
3630 mutex_exit(&zilog->zl_lock);
3631 }
3632
3633 static int
3634 zil_lwb_cons(void *vbuf, void *unused, int kmflag)
3635 {
3636 (void) unused, (void) kmflag;
3637 lwb_t *lwb = vbuf;
3638 list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
3639 list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
3640 offsetof(zil_commit_waiter_t, zcw_node));
3641 avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
3642 sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
3643 mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
3644 return (0);
3645 }
3646
3647 static void
3648 zil_lwb_dest(void *vbuf, void *unused)
3649 {
3650 (void) unused;
3651 lwb_t *lwb = vbuf;
3652 mutex_destroy(&lwb->lwb_vdev_lock);
3653 avl_destroy(&lwb->lwb_vdev_tree);
3654 list_destroy(&lwb->lwb_waiters);
3655 list_destroy(&lwb->lwb_itxs);
3656 }
3657
3658 void
3659 zil_init(void)
3660 {
3661 zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
3662 sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0);
3663
3664 zil_zcw_cache = kmem_cache_create("zil_zcw_cache",
3665 sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
3666
3667 zil_sums_init(&zil_sums_global);
3668 zil_kstats_global = kstat_create("zfs", 0, "zil", "misc",
3669 KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t),
3670 KSTAT_FLAG_VIRTUAL);
3671
3672 if (zil_kstats_global != NULL) {
3673 zil_kstats_global->ks_data = &zil_stats;
3674 zil_kstats_global->ks_update = zil_kstats_global_update;
3675 zil_kstats_global->ks_private = NULL;
3676 kstat_install(zil_kstats_global);
3677 }
3678 }
3679
3680 void
3681 zil_fini(void)
3682 {
3683 kmem_cache_destroy(zil_zcw_cache);
3684 kmem_cache_destroy(zil_lwb_cache);
3685
3686 if (zil_kstats_global != NULL) {
3687 kstat_delete(zil_kstats_global);
3688 zil_kstats_global = NULL;
3689 }
3690
3691 zil_sums_fini(&zil_sums_global);
3692 }
3693
3694 void
3695 zil_set_sync(zilog_t *zilog, uint64_t sync)
3696 {
3697 zilog->zl_sync = sync;
3698 }
3699
3700 void
3701 zil_set_logbias(zilog_t *zilog, uint64_t logbias)
3702 {
3703 zilog->zl_logbias = logbias;
3704 }
3705
3706 zilog_t *
3707 zil_alloc(objset_t *os, zil_header_t *zh_phys)
3708 {
3709 zilog_t *zilog;
3710
3711 zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
3712
3713 zilog->zl_header = zh_phys;
3714 zilog->zl_os = os;
3715 zilog->zl_spa = dmu_objset_spa(os);
3716 zilog->zl_dmu_pool = dmu_objset_pool(os);
3717 zilog->zl_destroy_txg = TXG_INITIAL - 1;
3718 zilog->zl_logbias = dmu_objset_logbias(os);
3719 zilog->zl_sync = dmu_objset_syncprop(os);
3720 zilog->zl_dirty_max_txg = 0;
3721 zilog->zl_last_lwb_opened = NULL;
3722 zilog->zl_last_lwb_latency = 0;
3723 zilog->zl_max_block_size = zil_maxblocksize;
3724
3725 mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
3726 mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
3727 mutex_init(&zilog->zl_lwb_io_lock, NULL, MUTEX_DEFAULT, NULL);
3728
3729 for (int i = 0; i < TXG_SIZE; i++) {
3730 mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
3731 MUTEX_DEFAULT, NULL);
3732 }
3733
3734 list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
3735 offsetof(lwb_t, lwb_node));
3736
3737 list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
3738 offsetof(itx_t, itx_node));
3739
3740 cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
3741 cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL);
3742
3743 return (zilog);
3744 }
3745
3746 void
3747 zil_free(zilog_t *zilog)
3748 {
3749 int i;
3750
3751 zilog->zl_stop_sync = 1;
3752
3753 ASSERT0(zilog->zl_suspend);
3754 ASSERT0(zilog->zl_suspending);
3755
3756 ASSERT(list_is_empty(&zilog->zl_lwb_list));
3757 list_destroy(&zilog->zl_lwb_list);
3758
3759 ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
3760 list_destroy(&zilog->zl_itx_commit_list);
3761
3762 for (i = 0; i < TXG_SIZE; i++) {
3763 /*
3764 * It's possible for an itx to be generated that doesn't dirty
3765 * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean()
3766 * callback to remove the entry. We remove those here.
3767 *
3768 * Also free up the ziltest itxs.
3769 */
3770 if (zilog->zl_itxg[i].itxg_itxs)
3771 zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
3772 mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
3773 }
3774
3775 mutex_destroy(&zilog->zl_issuer_lock);
3776 mutex_destroy(&zilog->zl_lock);
3777 mutex_destroy(&zilog->zl_lwb_io_lock);
3778
3779 cv_destroy(&zilog->zl_cv_suspend);
3780 cv_destroy(&zilog->zl_lwb_io_cv);
3781
3782 kmem_free(zilog, sizeof (zilog_t));
3783 }
3784
3785 /*
3786 * Open an intent log.
3787 */
3788 zilog_t *
3789 zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums)
3790 {
3791 zilog_t *zilog = dmu_objset_zil(os);
3792
3793 ASSERT3P(zilog->zl_get_data, ==, NULL);
3794 ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
3795 ASSERT(list_is_empty(&zilog->zl_lwb_list));
3796
3797 zilog->zl_get_data = get_data;
3798 zilog->zl_sums = zil_sums;
3799
3800 return (zilog);
3801 }
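
/*
 * Open/close sketch. The names example_get_data and example_sums are
 * assumptions standing in for a consumer's callback and stats (compare
 * zfs_get_data() and the zil_sums_t owned by a mount); zil_open() only
 * wires these up, while the on-disk log is replayed separately via
 * zil_replay().
 */
static zil_get_data_t example_get_data;  /* declaration only; hypothetical */
static zil_sums_t example_sums;          /* would be zil_sums_init()'d first */

static void
example_open_close(objset_t *os)
{
        zilog_t *zilog = zil_open(os, example_get_data, &example_sums);

        /* ... create and assign itxs, zil_commit(), etc. ... */

        zil_close(zilog);
}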
3802
3803 /*
3804 * Close an intent log.
3805 */
3806 void
3807 zil_close(zilog_t *zilog)
3808 {
3809 lwb_t *lwb;
3810 uint64_t txg;
3811
3812 if (!dmu_objset_is_snapshot(zilog->zl_os)) {
3813 zil_commit(zilog, 0);
3814 } else {
3815 ASSERT(list_is_empty(&zilog->zl_lwb_list));
3816 ASSERT0(zilog->zl_dirty_max_txg);
3817 ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
3818 }
3819
3820 mutex_enter(&zilog->zl_lock);
3821 txg = zilog->zl_dirty_max_txg;
3822 lwb = list_tail(&zilog->zl_lwb_list);
3823 if (lwb != NULL) {
3824 txg = MAX(txg, lwb->lwb_alloc_txg);
3825 txg = MAX(txg, lwb->lwb_max_txg);
3826 }
3827 mutex_exit(&zilog->zl_lock);
3828
3829 /*
3830 * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends
3831 * on the time when the dmu_tx transaction is assigned in
3832 * zil_lwb_write_issue().
3833 */
3834 mutex_enter(&zilog->zl_lwb_io_lock);
3835 txg = MAX(zilog->zl_lwb_max_issued_txg, txg);
3836 mutex_exit(&zilog->zl_lwb_io_lock);
3837
3838 /*
3839 * We need to use txg_wait_synced() to wait until that txg is synced.
3840 * zil_sync() will guarantee all lwbs up to that txg have been
3841 * written out, flushed, and cleaned.
3842 */
3843 if (txg != 0)
3844 txg_wait_synced(zilog->zl_dmu_pool, txg);
3845
3846 if (zilog_is_dirty(zilog))
3847 zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog,
3848 (u_longlong_t)txg);
3849 if (txg < spa_freeze_txg(zilog->zl_spa))
3850 VERIFY(!zilog_is_dirty(zilog));
3851
3852 zilog->zl_get_data = NULL;
3853
3854 /*
3855 * We should have only one lwb left on the list; remove it now.
3856 */
3857 mutex_enter(&zilog->zl_lock);
3858 lwb = list_remove_head(&zilog->zl_lwb_list);
3859 if (lwb != NULL) {
3860 ASSERT(list_is_empty(&zilog->zl_lwb_list));
3861 ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW);
3862 zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
3863 zil_free_lwb(zilog, lwb);
3864 }
3865 mutex_exit(&zilog->zl_lock);
3866 }
3867
3868 static const char *suspend_tag = "zil suspending";
3869
3870 /*
3871 * Suspend an intent log. While in suspended mode, we still honor
3872 * synchronous semantics, but we rely on txg_wait_synced() to do it.
3873 * On old version pools, we suspend the log briefly when taking a
3874 * snapshot so that it will have an empty intent log.
3875 *
3876 * Long holds are not really intended to be used the way we do here --
3877 * held for such a short time. A concurrent caller of dsl_dataset_long_held()
3878 * could spuriously fail. Therefore we take pains to only put a long hold if it is
3879 * actually necessary. Fortunately, it will only be necessary if the
3880 * objset is currently mounted (or the ZVOL equivalent). In that case it
3881 * will already have a long hold, so we are not really making things any worse.
3882 *
3883 * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
3884 * zvol_state_t), and use their mechanism to prevent their hold from being
3885 * dropped (e.g. VFS_HOLD()). However, that would be even more pain for
3886 * very little gain.
3887 *
3888 * If cookiep == NULL, this does both the suspend & resume.
3889 * Otherwise, it returns with the dataset "long held", and the cookie
3890 * should be passed into zil_resume().
3891 */
3892 int
3893 zil_suspend(const char *osname, void **cookiep)
3894 {
3895 objset_t *os;
3896 zilog_t *zilog;
3897 const zil_header_t *zh;
3898 int error;
3899
3900 error = dmu_objset_hold(osname, suspend_tag, &os);
3901 if (error != 0)
3902 return (error);
3903 zilog = dmu_objset_zil(os);
3904
3905 mutex_enter(&zilog->zl_lock);
3906 zh = zilog->zl_header;
3907
3908 if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */
3909 mutex_exit(&zilog->zl_lock);
3910 dmu_objset_rele(os, suspend_tag);
3911 return (SET_ERROR(EBUSY));
3912 }
3913
3914 /*
3915 * Don't put a long hold in the cases where we can avoid it. This
3916 * is when there is no cookie so we are doing a suspend & resume
3917 * (i.e. called from zil_vdev_offline()), and there's nothing to do
3918 * for the suspend because it's already suspended, or there's no ZIL.
3919 */
3920 if (cookiep == NULL && !zilog->zl_suspending &&
3921 (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
3922 mutex_exit(&zilog->zl_lock);
3923 dmu_objset_rele(os, suspend_tag);
3924 return (0);
3925 }
3926
3927 dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
3928 dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
3929
3930 zilog->zl_suspend++;
3931
3932 if (zilog->zl_suspend > 1) {
3933 /*
3934 * Someone else is already suspending it.
3935 * Just wait for them to finish.
3936 */
3937
3938 while (zilog->zl_suspending)
3939 cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
3940 mutex_exit(&zilog->zl_lock);
3941
3942 if (cookiep == NULL)
3943 zil_resume(os);
3944 else
3945 *cookiep = os;
3946 return (0);
3947 }
3948
3949 /*
3950 * If there is no pointer to an on-disk block, this ZIL must not
3951 * be active (e.g. filesystem not mounted), so there's nothing
3952 * to clean up.
3953 */
3954 if (BP_IS_HOLE(&zh->zh_log)) {
3955 ASSERT(cookiep != NULL); /* fast path already handled */
3956
3957 *cookiep = os;
3958 mutex_exit(&zilog->zl_lock);
3959 return (0);
3960 }
3961
3962 /*
3963 * The ZIL has work to do. Ensure that the associated encryption
3964 * key will remain mapped while we are committing the log by
3965 * grabbing a reference to it. If the key isn't loaded we have no
3966 * choice but to return an error until the wrapping key is loaded.
3967 */
3968 if (os->os_encrypted &&
3969 dsl_dataset_create_key_mapping(dmu_objset_ds(os)) != 0) {
3970 zilog->zl_suspend--;
3971 mutex_exit(&zilog->zl_lock);
3972 dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
3973 dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
3974 return (SET_ERROR(EACCES));
3975 }
3976
3977 zilog->zl_suspending = B_TRUE;
3978 mutex_exit(&zilog->zl_lock);
3979
3980 /*
3981 * We need to use zil_commit_impl to ensure we wait for all
3982 * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed
3983 * to disk before proceeding. If we used zil_commit instead, it
3984 * would just call txg_wait_synced(), because zl_suspend is set.
3985 * txg_wait_synced() doesn't wait for these lwb's to be
3986 * LWB_STATE_FLUSH_DONE before returning.
3987 */
3988 zil_commit_impl(zilog, 0);
3989
3990 /*
3991 * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we
3992 * use txg_wait_synced() to ensure the data from the zilog has
3993 * migrated to the main pool before calling zil_destroy().
3994 */
3995 txg_wait_synced(zilog->zl_dmu_pool, 0);
3996
3997 zil_destroy(zilog, B_FALSE);
3998
3999 mutex_enter(&zilog->zl_lock);
4000 zilog->zl_suspending = B_FALSE;
4001 cv_broadcast(&zilog->zl_cv_suspend);
4002 mutex_exit(&zilog->zl_lock);
4003
4004 if (os->os_encrypted)
4005 dsl_dataset_remove_key_mapping(dmu_objset_ds(os));
4006
4007 if (cookiep == NULL)
4008 zil_resume(os);
4009 else
4010 *cookiep = os;
4011 return (0);
4012 }
4013
4014 void
4015 zil_resume(void *cookie)
4016 {
4017 objset_t *os = cookie;
4018 zilog_t *zilog = dmu_objset_zil(os);
4019
4020 mutex_enter(&zilog->zl_lock);
4021 ASSERT(zilog->zl_suspend != 0);
4022 zilog->zl_suspend--;
4023 mutex_exit(&zilog->zl_lock);
4024 dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
4025 dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
4026 }
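
/*
 * Suspend/resume sketch (hypothetical wrapper): per the contract above,
 * passing a cookiep leaves the dataset long-held and the ZIL suspended
 * until the cookie is handed to zil_resume(); passing NULL performs
 * both the suspend and the resume internally.
 */
static int
example_with_zil_suspended(const char *osname)
{
        void *cookie;
        int error = zil_suspend(osname, &cookie);

        if (error != 0)
                return (error);

        /* ... the ZIL stays empty and suspended while we hold it ... */

        zil_resume(cookie);
        return (0);
}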
4027
4028 typedef struct zil_replay_arg {
4029 zil_replay_func_t *const *zr_replay;
4030 void *zr_arg;
4031 boolean_t zr_byteswap;
4032 char *zr_lr;
4033 } zil_replay_arg_t;
4034
4035 static int
4036 zil_replay_error(zilog_t *zilog, const lr_t *lr, int error)
4037 {
4038 char name[ZFS_MAX_DATASET_NAME_LEN];
4039
4040 zilog->zl_replaying_seq--; /* didn't actually replay this one */
4041
4042 dmu_objset_name(zilog->zl_os, name);
4043
4044 cmn_err(CE_WARN, "ZFS replay transaction error %d, "
4045 "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
4046 (u_longlong_t)lr->lrc_seq,
4047 (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
4048 (lr->lrc_txtype & TX_CI) ? "CI" : "");
4049
4050 return (error);
4051 }
4052
4053 static int
4054 zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra,
4055 uint64_t claim_txg)
4056 {
4057 zil_replay_arg_t *zr = zra;
4058 const zil_header_t *zh = zilog->zl_header;
4059 uint64_t reclen = lr->lrc_reclen;
4060 uint64_t txtype = lr->lrc_txtype;
4061 int error = 0;
4062
4063 zilog->zl_replaying_seq = lr->lrc_seq;
4064
4065 if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
4066 return (0);
4067
4068 if (lr->lrc_txg < claim_txg) /* already committed */
4069 return (0);
4070
4071 /* Strip case-insensitive bit, still present in log record */
4072 txtype &= ~TX_CI;
4073
4074 if (txtype == 0 || txtype >= TX_MAX_TYPE)
4075 return (zil_replay_error(zilog, lr, EINVAL));
4076
4077 /*
4078 * If this record type can be logged out of order, the object
4079 * (lr_foid) may no longer exist. That's legitimate, not an error.
4080 */
4081 if (TX_OOO(txtype)) {
4082 error = dmu_object_info(zilog->zl_os,
4083 LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL);
4084 if (error == ENOENT || error == EEXIST)
4085 return (0);
4086 }
4087
4088 /*
4089 * Make a copy of the data so we can revise and extend it.
4090 */
4091 memcpy(zr->zr_lr, lr, reclen);
4092
4093 /*
4094 * If this is a TX_WRITE with a blkptr, suck in the data.
4095 */
4096 if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
4097 error = zil_read_log_data(zilog, (lr_write_t *)lr,
4098 zr->zr_lr + reclen);
4099 if (error != 0)
4100 return (zil_replay_error(zilog, lr, error));
4101 }
4102
4103 /*
4104 * The log block containing this lr may have been byteswapped
4105 * so that we can easily examine common fields like lrc_txtype.
4106 * However, the log is a mix of different record types, and only the
4107 * replay vectors know how to byteswap their records. Therefore, if
4108 * the lr was byteswapped, undo it before invoking the replay vector.
4109 */
4110 if (zr->zr_byteswap)
4111 byteswap_uint64_array(zr->zr_lr, reclen);
4112
4113 /*
4114 * We must now do two things atomically: replay this log record,
4115 * and update the log header sequence number to reflect the fact that
4116 * we did so. At the end of each replay function the sequence number
4117 * is updated if we are in replay mode.
4118 */
4119 error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
4120 if (error != 0) {
4121 /*
4122 * The DMU's dnode layer doesn't see removes until the txg
4123 * commits, so a subsequent claim can spuriously fail with
4124 * EEXIST. So if we receive any error, we try syncing out
4125 * any removes, then retry the transaction. Note that we
4126 * specify B_FALSE for byteswap now, so we don't do it twice.
4127 */
4128 txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
4129 error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
4130 if (error != 0)
4131 return (zil_replay_error(zilog, lr, error));
4132 }
4133 return (0);
4134 }
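
/*
 * Replay-vector sketch (hypothetical function; compare the vectors in
 * zfs_replay.c): each entry follows zil_replay_func_t and receives the
 * copied record plus the byteswap flag discussed above, so it must swap
 * the record itself before applying it. lr_create_t is used here purely
 * as an example record type.
 */
static int
example_replay_create(void *arg, void *data, boolean_t byteswap)
{
        lr_create_t *lr = data;

        if (byteswap)
                byteswap_uint64_array(lr, sizeof (*lr));

        /* ... recreate the object through the DMU using "arg" ... */
        return (0);
}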
4135
4136 static int
4137 zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg)
4138 {
4139 (void) bp, (void) arg, (void) claim_txg;
4140
4141 zilog->zl_replay_blks++;
4142
4143 return (0);
4144 }
4145
4146 /*
4147 * If this dataset has a non-empty intent log, replay it and destroy it.
4148 * Return B_TRUE if there were any entries to replay.
4149 */
4150 boolean_t
4151 zil_replay(objset_t *os, void *arg,
4152 zil_replay_func_t *const replay_func[TX_MAX_TYPE])
4153 {
4154 zilog_t *zilog = dmu_objset_zil(os);
4155 const zil_header_t *zh = zilog->zl_header;
4156 zil_replay_arg_t zr;
4157
4158 if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
4159 return (zil_destroy(zilog, B_TRUE));
4160 }
4161
4162 zr.zr_replay = replay_func;
4163 zr.zr_arg = arg;
4164 zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
4165 zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
4166
4167 /*
4168 * Wait for in-progress removes to sync before starting replay.
4169 */
4170 txg_wait_synced(zilog->zl_dmu_pool, 0);
4171
4172 zilog->zl_replay = B_TRUE;
4173 zilog->zl_replay_time = ddi_get_lbolt();
4174 ASSERT(zilog->zl_replay_blks == 0);
4175 (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
4176 zh->zh_claim_txg, B_TRUE);
4177 vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
4178
4179 zil_destroy(zilog, B_FALSE);
4180 txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
4181 zilog->zl_replay = B_FALSE;
4182
4183 return (B_TRUE);
4184 }
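
/*
 * Mount-time sketch (hypothetical consumer): replay is driven by a
 * table indexed by txtype whose entries follow zil_replay_func_t;
 * real filesystems pass a fully populated table like zfs_replay_vector
 * (every slot filled, typically with an error vector for unsupported
 * types). example_replay_create is the sketch defined above.
 */
static zil_replay_func_t *const example_replay_vector[TX_MAX_TYPE] = {
        [TX_CREATE] = example_replay_create,
        /* ... a real table fills every remaining txtype slot ... */
};

static void
example_mount_replay(objset_t *os, void *fs_private)
{
        if (zil_replay(os, fs_private, example_replay_vector))
                cmn_err(CE_NOTE, "example: replayed intent log entries");
}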
4185
4186 boolean_t
4187 zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
4188 {
4189 if (zilog->zl_sync == ZFS_SYNC_DISABLED)
4190 return (B_TRUE);
4191
4192 if (zilog->zl_replay) {
4193 dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
4194 zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
4195 zilog->zl_replaying_seq;
4196 return (B_TRUE);
4197 }
4198
4199 return (B_FALSE);
4200 }
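
/*
 * Caller-pattern sketch (hypothetical function; compare the
 * zfs_log_*() routines): log-producing paths consult zil_replaying()
 * so no new itxs are generated while replay is applying records, and
 * so the replayed sequence number is recorded against the passed tx.
 */
static void
example_log_op(zilog_t *zilog, dmu_tx_t *tx)
{
        if (zil_replaying(zilog, tx))
                return; /* replaying (or sync disabled): don't log again */

        /* ... zil_itx_create() + zil_itx_assign() for the operation ... */
}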
4201
4202 int
4203 zil_reset(const char *osname, void *arg)
4204 {
4205 (void) arg;
4206
4207 int error = zil_suspend(osname, NULL);
4208 /* EACCES means crypto key not loaded */
4209 if ((error == EACCES) || (error == EBUSY))
4210 return (SET_ERROR(error));
4211 if (error != 0)
4212 return (SET_ERROR(EEXIST));
4213 return (0);
4214 }
4215
4216 EXPORT_SYMBOL(zil_alloc);
4217 EXPORT_SYMBOL(zil_free);
4218 EXPORT_SYMBOL(zil_open);
4219 EXPORT_SYMBOL(zil_close);
4220 EXPORT_SYMBOL(zil_replay);
4221 EXPORT_SYMBOL(zil_replaying);
4222 EXPORT_SYMBOL(zil_destroy);
4223 EXPORT_SYMBOL(zil_destroy_sync);
4224 EXPORT_SYMBOL(zil_itx_create);
4225 EXPORT_SYMBOL(zil_itx_destroy);
4226 EXPORT_SYMBOL(zil_itx_assign);
4227 EXPORT_SYMBOL(zil_commit);
4228 EXPORT_SYMBOL(zil_claim);
4229 EXPORT_SYMBOL(zil_check_log_chain);
4230 EXPORT_SYMBOL(zil_sync);
4231 EXPORT_SYMBOL(zil_clean);
4232 EXPORT_SYMBOL(zil_suspend);
4233 EXPORT_SYMBOL(zil_resume);
4234 EXPORT_SYMBOL(zil_lwb_add_block);
4235 EXPORT_SYMBOL(zil_bp_tree_add);
4236 EXPORT_SYMBOL(zil_set_sync);
4237 EXPORT_SYMBOL(zil_set_logbias);
4238 EXPORT_SYMBOL(zil_sums_init);
4239 EXPORT_SYMBOL(zil_sums_fini);
4240 EXPORT_SYMBOL(zil_kstat_values_update);
4241
4242 ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW,
4243 "ZIL block open timeout percentage");
4244
4245 ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
4246 "Disable intent logging replay");
4247
4248 ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW,
4249 "Disable ZIL cache flushes");
4250
4251 ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW,
4252 "Limit in bytes slog sync writes per commit");
4253
4254 ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW,
4255 "Limit in bytes of ZIL log block size");
4256
4257 ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW,
4258 "Limit in bytes WR_COPIED size");