module/zfs/dmu_tx.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  25  */
  26
  27 #include <sys/dmu.h>
  28 #include <sys/dmu_impl.h>
  29 #include <sys/dbuf.h>
  30 #include <sys/dmu_tx.h>
  31 #include <sys/dmu_objset.h>
  32 #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
  33 #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
  34 #include <sys/dsl_pool.h>
  35 #include <sys/zap_impl.h> /* for fzap_default_block_shift */
  36 #include <sys/spa.h>
  37 #include <sys/sa.h>
  38 #include <sys/sa_impl.h>
  39 #include <sys/zfs_context.h>
  40 #include <sys/varargs.h>
  41 #include <sys/trace_dmu.h>
  42
/*
 * Function-pointer type taking a transaction, a dnode and two uint64_t
 * arguments.  It is not referenced in the portion of the file visible
 * here.  NOTE(review): presumably a per-hold-type accounting callback --
 * confirm against callers.
 */
typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
    uint64_t arg1, uint64_t arg2);

/*
 * Table of named 64-bit kstat entries for the DMU transaction code.
 * Each initializer pairs an exported counter name with its kstat data
 * type; the entry order must match the field order of dmu_tx_stats_t.
 */
dmu_tx_stats_t dmu_tx_stats = {
        { "dmu_tx_assigned",            KSTAT_DATA_UINT64 },
        { "dmu_tx_delay",               KSTAT_DATA_UINT64 },
        { "dmu_tx_error",               KSTAT_DATA_UINT64 },
        { "dmu_tx_suspended",           KSTAT_DATA_UINT64 },
        { "dmu_tx_group",               KSTAT_DATA_UINT64 },
        { "dmu_tx_memory_reserve",      KSTAT_DATA_UINT64 },
        { "dmu_tx_memory_reclaim",      KSTAT_DATA_UINT64 },
        { "dmu_tx_dirty_throttle",      KSTAT_DATA_UINT64 },
        { "dmu_tx_dirty_delay",         KSTAT_DATA_UINT64 },
        { "dmu_tx_dirty_over_max",      KSTAT_DATA_UINT64 },
        { "dmu_tx_quota",               KSTAT_DATA_UINT64 },
};

/*
 * File-local kstat handle.  NOTE(review): presumably the kstat through
 * which dmu_tx_stats is published; its setup is outside this view.
 */
static kstat_t *dmu_tx_ksp;
  61
  62 dmu_tx_t *
  63 dmu_tx_create_dd(dsl_dir_t *dd)
  64 {
  65         dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
  66         tx->tx_dir = dd;
  67         if (dd != NULL)
  68                 tx->tx_pool = dd->dd_pool;
  69         list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
  70             offsetof(dmu_tx_hold_t, txh_node));
  71         list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
  72             offsetof(dmu_tx_callback_t, dcb_node));
  73         tx->tx_start = gethrtime();
  74 #ifdef DEBUG_DMU_TX
  75         refcount_create(&tx->tx_space_written);
  76         refcount_create(&tx->tx_space_freed);
  77 #endif
  78         return (tx);
  79 }
  80
  81 dmu_tx_t *
  82 dmu_tx_create(objset_t *os)
  83 {
  84         dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
  85         tx->tx_objset = os;
  86         tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
  87         return (tx);
  88 }
  89
  90 dmu_tx_t *
  91 dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
  92 {
  93         dmu_tx_t *tx = dmu_tx_create_dd(NULL);
  94
  95         ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
  96         tx->tx_pool = dp;
  97         tx->tx_txg = txg;
  98         tx->tx_anyobj = TRUE;
  99
 100         return (tx);
 101 }
 102
 103 int
 104 dmu_tx_is_syncing(dmu_tx_t *tx)
 105 {
 106         return (tx->tx_anyobj);
 107 }
 108
 109 int
 110 dmu_tx_private_ok(dmu_tx_t *tx)
 111 {
 112         return (tx->tx_anyobj);
 113 }
 114
 115 static dmu_tx_hold_t *
 116 dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
 117     enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
 118 {
 119         dmu_tx_hold_t *txh;
 120         dnode_t *dn = NULL;
 121         int err;
 122
 123         if (object != DMU_NEW_OBJECT) {
 124                 err = dnode_hold(os, object, tx, &dn);
 125                 if (err) {
 126                         tx->tx_err = err;
 127                         return (NULL);
 128                 }
 129
 130                 if (err == 0 && tx->tx_txg != 0) {
 131                         mutex_enter(&dn->dn_mtx);
 132                         /*
 133                          * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
 134                          * problem, but there's no way for it to happen (for
 135                          * now, at least).
 136                          */
 137                         ASSERT(dn->dn_assigned_txg == 0);
 138                         dn->dn_assigned_txg = tx->tx_txg;
 139                         (void) refcount_add(&dn->dn_tx_holds, tx);
 140                         mutex_exit(&dn->dn_mtx);
 141                 }
 142         }
 143
 144         txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
 145         txh->txh_tx = tx;
 146         txh->txh_dnode = dn;
 147 #ifdef DEBUG_DMU_TX
 148         txh->txh_type = type;
 149         txh->txh_arg1 = arg1;
 150         txh->txh_arg2 = arg2;
 151 #endif
 152         list_insert_tail(&tx->tx_holds, txh);
 153
 154         return (txh);
 155 }
 156
 157 void
 158 dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
 159 {
 160         /*
 161          * If we're syncing, they can manipulate any object anyhow, and
 162          * the hold on the dnode_t can cause problems.
 163          */
 164         if (!dmu_tx_is_syncing(tx)) {
 165                 (void) dmu_tx_hold_object_impl(tx, os,
 166                     object, THT_NEWOBJECT, 0, 0);
 167         }
 168 }
 169
 170 static int
 171 dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 172 {
 173         int err;
 174         dmu_buf_impl_t *db;
 175
 176         rw_enter(&dn->dn_struct_rwlock, RW_READER);
 177         db = dbuf_hold_level(dn, level, blkid, FTAG);
 178         rw_exit(&dn->dn_struct_rwlock);
 179         if (db == NULL)
 180                 return (SET_ERROR(EIO));
 181         err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 182         dbuf_rele(db, FTAG);
 183         return (err);
 184 }
 185
 186 static void
 187 dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
 188     int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 189 {
 190         objset_t *os = dn->dn_objset;
 191         dsl_dataset_t *ds = os->os_dsl_dataset;
 192         int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 193         dmu_buf_impl_t *parent = NULL;
 194         blkptr_t *bp = NULL;
 195         uint64_t space;
 196
 197         if (level >= dn->dn_nlevels || history[level] == blkid)
 198                 return;
 199
 200         history[level] = blkid;
 201
 202         space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
 203
 204         if (db == NULL || db == dn->dn_dbuf) {
 205                 ASSERT(level != 0);
 206                 db = NULL;
 207         } else {
 208                 ASSERT(DB_DNODE(db) == dn);
 209                 ASSERT(db->db_level == level);
 210                 ASSERT(db->db.db_size == space);
 211                 ASSERT(db->db_blkid == blkid);
 212                 bp = db->db_blkptr;
 213                 parent = db->db_parent;
 214         }
 215
 216         freeable = (bp && (freeable ||
 217             dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
 218
 219         if (freeable)
 220                 txh->txh_space_tooverwrite += space;
 221         else
 222                 txh->txh_space_towrite += space;
 223         if (bp)
 224                 txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
 225
 226         dmu_tx_count_twig(txh, dn, parent, level + 1,
 227             blkid >> epbs, freeable, history);
 228 }
 229
 230 /* ARGSUSED */
 231 static void
 232 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 233 {
 234         dnode_t *dn = txh->txh_dnode;
 235         uint64_t start, end, i;
 236         int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
 237         int err = 0;
 238         int l;
 239
 240         if (len == 0)
 241                 return;
 242
 243         min_bs = SPA_MINBLOCKSHIFT;
 244         max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
 245         min_ibs = DN_MIN_INDBLKSHIFT;
 246         max_ibs = DN_MAX_INDBLKSHIFT;
 247
 248         if (dn) {
 249                 uint64_t history[DN_MAX_LEVELS];
 250                 int nlvls = dn->dn_nlevels;
 251                 int delta;
 252
 253                 /*
 254                  * For i/o error checking, read the first and last level-0
 255                  * blocks (if they are not aligned), and all the level-1 blocks.
 256                  */
 257                 if (dn->dn_maxblkid == 0) {
 258                         delta = dn->dn_datablksz;
 259                         start = (off < dn->dn_datablksz) ? 0 : 1;
 260                         end = (off+len <= dn->dn_datablksz) ? 0 : 1;
 261                         if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
 262                                 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 263                                 if (err)
 264                                         goto out;
 265                                 delta -= off;
 266                         }
 267                 } else {
 268                         zio_t *zio = zio_root(dn->dn_objset->os_spa,
 269                             NULL, NULL, ZIO_FLAG_CANFAIL);
 270
 271                         /* first level-0 block */
 272                         start = off >> dn->dn_datablkshift;
 273                         if (P2PHASE(off, dn->dn_datablksz) ||
 274                             len < dn->dn_datablksz) {
 275                                 err = dmu_tx_check_ioerr(zio, dn, 0, start);
 276                                 if (err)
 277                                         goto out;
 278                         }
 279
 280                         /* last level-0 block */
 281                         end = (off+len-1) >> dn->dn_datablkshift;
 282                         if (end != start && end <= dn->dn_maxblkid &&
 283                             P2PHASE(off+len, dn->dn_datablksz)) {
 284                                 err = dmu_tx_check_ioerr(zio, dn, 0, end);
 285                                 if (err)
 286                                         goto out;
 287                         }
 288
 289                         /* level-1 blocks */
 290                         if (nlvls > 1) {
 291                                 int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 292                                 for (i = (start>>shft)+1; i < end>>shft; i++) {
 293                                         err = dmu_tx_check_ioerr(zio, dn, 1, i);
 294                                         if (err)
 295                                                 goto out;
 296                                 }
 297                         }
 298
 299                         err = zio_wait(zio);
 300                         if (err)
 301                                 goto out;
 302                         delta = P2NPHASE(off, dn->dn_datablksz);
 303                 }
 304
 305                 min_ibs = max_ibs = dn->dn_indblkshift;
 306                 if (dn->dn_maxblkid > 0) {
 307                         /*
 308                          * The blocksize can't change,
 309                          * so we can make a more precise estimate.
 310                          */
 311                         ASSERT(dn->dn_datablkshift != 0);
 312                         min_bs = max_bs = dn->dn_datablkshift;
 313                 } else {
 314                         /*
 315                          * The blocksize can increase up to the recordsize,
 316                          * or if it is already more than the recordsize,
 317                          * up to the next power of 2.
 318                          */
 319                         min_bs = highbit64(dn->dn_datablksz - 1);
 320                         max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
 321                 }
 322
 323                 /*
 324                  * If this write is not off the end of the file
 325                  * we need to account for overwrites/unref.
 326                  */
 327                 if (start <= dn->dn_maxblkid) {
 328                         for (l = 0; l < DN_MAX_LEVELS; l++)
 329                                 history[l] = -1ULL;
 330                 }
 331                 while (start <= dn->dn_maxblkid) {
 332                         dmu_buf_impl_t *db;
 333
 334                         rw_enter(&dn->dn_struct_rwlock, RW_READER);
 335                         err = dbuf_hold_impl(dn, 0, start,
 336                             FALSE, FALSE, FTAG, &db);
 337                         rw_exit(&dn->dn_struct_rwlock);
 338
 339                         if (err) {
 340                                 txh->txh_tx->tx_err = err;
 341                                 return;
 342                         }
 343
 344                         dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
 345                             history);
 346                         dbuf_rele(db, FTAG);
 347                         if (++start > end) {
 348                                 /*
 349                                  * Account for new indirects appearing
 350                                  * before this IO gets assigned into a txg.
 351                                  */
 352                                 bits = 64 - min_bs;
 353                                 epbs = min_ibs - SPA_BLKPTRSHIFT;
 354                                 for (bits -= epbs * (nlvls - 1);
 355                                     bits >= 0; bits -= epbs)
 356                                         txh->txh_fudge += 1ULL << max_ibs;
 357                                 goto out;
 358                         }
 359                         off += delta;
 360                         if (len >= delta)
 361                                 len -= delta;
 362                         delta = dn->dn_datablksz;
 363                 }
 364         }
 365
 366         /*
 367          * 'end' is the last thing we will access, not one past.
 368          * This way we won't overflow when accessing the last byte.
 369          */
 370         start = P2ALIGN(off, 1ULL << max_bs);
 371         end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
 372         txh->txh_space_towrite += end - start + 1;
 373
 374         start >>= min_bs;
 375         end >>= min_bs;
 376
 377         epbs = min_ibs - SPA_BLKPTRSHIFT;
 378
 379         /*
 380          * The object contains at most 2^(64 - min_bs) blocks,
 381          * and each indirect level maps 2^epbs.
 382          */
 383         for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
 384                 start >>= epbs;
 385                 end >>= epbs;
 386                 ASSERT3U(end, >=, start);
 387                 txh->txh_space_towrite += (end - start + 1) << max_ibs;
 388                 if (start != 0) {
 389                         /*
 390                          * We also need a new blkid=0 indirect block
 391                          * to reference any existing file data.
 392                          */
 393                         txh->txh_space_towrite += 1ULL << max_ibs;
 394                 }
 395         }
 396
 397 out:
 398         if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
 399             2 * DMU_MAX_ACCESS)
 400                 err = SET_ERROR(EFBIG);
 401
 402         if (err)
 403                 txh->txh_tx->tx_err = err;
 404 }
 405
 406 static void
 407 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 408 {
 409         dnode_t *dn = txh->txh_dnode;
 410         dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
 411         uint64_t space = mdn->dn_datablksz +
 412             ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 413
 414         if (dn && dn->dn_dbuf->db_blkptr &&
 415             dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 416             dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
 417                 txh->txh_space_tooverwrite += space;
 418                 txh->txh_space_tounref += space;
 419         } else {
 420                 txh->txh_space_towrite += space;
 421                 if (dn && dn->dn_dbuf->db_blkptr)
 422                         txh->txh_space_tounref += space;
 423         }
 424 }
 425
 426 void
 427 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 428 {
 429         dmu_tx_hold_t *txh;
 430
 431         ASSERT(tx->tx_txg == 0);
 432         ASSERT(len <= DMU_MAX_ACCESS);
 433         ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 434
 435         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 436             object, THT_WRITE, off, len);
 437         if (txh == NULL)
 438                 return;
 439
 440         dmu_tx_count_write(txh, off, len);
 441         dmu_tx_count_dnode(txh);
 442 }
 443
 444 static void
 445 dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 446 {
 447         uint64_t blkid, nblks, lastblk;
 448         uint64_t space = 0, unref = 0, skipped = 0;
 449         dnode_t *dn = txh->txh_dnode;
 450         dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 451         spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 452         int epbs;
 453         uint64_t l0span = 0, nl1blks = 0;
 454
 455         if (dn->dn_nlevels == 0)
 456                 return;
 457
 458         /*
 459          * The struct_rwlock protects us against dn_nlevels
 460          * changing, in case (against all odds) we manage to dirty &
 461          * sync out the changes after we check for being dirty.
 462          * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 463          */
 464         rw_enter(&dn->dn_struct_rwlock, RW_READER);
 465         epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 466         if (dn->dn_maxblkid == 0) {
 467                 if (off == 0 && len >= dn->dn_datablksz) {
 468                         blkid = 0;
 469                         nblks = 1;
 470                 } else {
 471                         rw_exit(&dn->dn_struct_rwlock);
 472                         return;
 473                 }
 474         } else {
 475                 blkid = off >> dn->dn_datablkshift;
 476                 nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 477
 478                 if (blkid > dn->dn_maxblkid) {
 479                         rw_exit(&dn->dn_struct_rwlock);
 480                         return;
 481                 }
 482                 if (blkid + nblks > dn->dn_maxblkid)
 483                         nblks = dn->dn_maxblkid - blkid + 1;
 484
 485         }
 486         l0span = nblks;    /* save for later use to calc level > 1 overhead */
 487         if (dn->dn_nlevels == 1) {
 488                 int i;
 489                 for (i = 0; i < nblks; i++) {
 490                         blkptr_t *bp = dn->dn_phys->dn_blkptr;
 491                         ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 492                         bp += blkid + i;
 493                         if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 494                                 dprintf_bp(bp, "can free old%s", "");
 495                                 space += bp_get_dsize(spa, bp);
 496                         }
 497                         unref += BP_GET_ASIZE(bp);
 498                 }
 499                 nl1blks = 1;
 500                 nblks = 0;
 501         }
 502
 503         lastblk = blkid + nblks - 1;
 504         while (nblks) {
 505                 dmu_buf_impl_t *dbuf;
 506                 uint64_t ibyte, new_blkid;
 507                 int epb = 1 << epbs;
 508                 int err, i, blkoff, tochk;
 509                 blkptr_t *bp;
 510
 511                 ibyte = blkid << dn->dn_datablkshift;
 512                 err = dnode_next_offset(dn,
 513                     DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 514                 new_blkid = ibyte >> dn->dn_datablkshift;
 515                 if (err == ESRCH) {
 516                         skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 517                         break;
 518                 }
 519                 if (err) {
 520                         txh->txh_tx->tx_err = err;
 521                         break;
 522                 }
 523                 if (new_blkid > lastblk) {
 524                         skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 525                         break;
 526                 }
 527
 528                 if (new_blkid > blkid) {
 529                         ASSERT((new_blkid >> epbs) > (blkid >> epbs));
 530                         skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
 531                         nblks -= new_blkid - blkid;
 532                         blkid = new_blkid;
 533                 }
 534                 blkoff = P2PHASE(blkid, epb);
 535                 tochk = MIN(epb - blkoff, nblks);
 536
 537                 err = dbuf_hold_impl(dn, 1, blkid >> epbs,
 538                     FALSE, FALSE, FTAG, &dbuf);
 539                 if (err) {
 540                         txh->txh_tx->tx_err = err;
 541                         break;
 542                 }
 543
 544                 txh->txh_memory_tohold += dbuf->db.db_size;
 545
 546                 /*
 547                  * We don't check memory_tohold against DMU_MAX_ACCESS because
 548                  * memory_tohold is an over-estimation (especially the >L1
 549                  * indirect blocks), so it could fail.  Callers should have
 550                  * already verified that they will not be holding too much
 551                  * memory.
 552                  */
 553
 554                 err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 555                 if (err != 0) {
 556                         txh->txh_tx->tx_err = err;
 557                         dbuf_rele(dbuf, FTAG);
 558                         break;
 559                 }
 560
 561                 bp = dbuf->db.db_data;
 562                 bp += blkoff;
 563
 564                 for (i = 0; i < tochk; i++) {
 565                         if (dsl_dataset_block_freeable(ds, &bp[i],
 566                             bp[i].blk_birth)) {
 567                                 dprintf_bp(&bp[i], "can free old%s", "");
 568                                 space += bp_get_dsize(spa, &bp[i]);
 569                         }
 570                         unref += BP_GET_ASIZE(bp);
 571                 }
 572                 dbuf_rele(dbuf, FTAG);
 573
 574                 ++nl1blks;
 575                 blkid += tochk;
 576                 nblks -= tochk;
 577         }
 578         rw_exit(&dn->dn_struct_rwlock);
 579
 580         /*
 581          * Add in memory requirements of higher-level indirects.
 582          * This assumes a worst-possible scenario for dn_nlevels and a
 583          * worst-possible distribution of l1-blocks over the region to free.
 584          */
 585         {
 586                 uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
 587                 int level = 2;
 588                 /*
 589                  * Here we don't use DN_MAX_LEVEL, but calculate it with the
 590                  * given datablkshift and indblkshift. This makes the
 591                  * difference between 19 and 8 on large files.
 592                  */
 593                 int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
 594                     (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
 595
 596                 while (level++ < maxlevel) {
 597                         txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
 598                             << dn->dn_indblkshift;
 599                         blkcnt = 1 + (blkcnt >> epbs);
 600                 }
 601         }
 602
 603         /* account for new level 1 indirect blocks that might show up */
 604         if (skipped > 0) {
 605                 txh->txh_fudge += skipped << dn->dn_indblkshift;
 606                 skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 607                 txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 608         }
 609         txh->txh_space_tofree += space;
 610         txh->txh_space_tounref += unref;
 611 }
 612
 613 /*
 614  * This function marks the transaction as being a "net free".  The end
 615  * result is that refquotas will be disabled for this transaction, and
 616  * this transaction will be able to use half of the pool space overhead
 617  * (see dsl_pool_adjustedsize()).  Therefore this function should only
 618  * be called for transactions that we expect will not cause a net increase
 619  * in the amount of space used (but it's OK if that is occasionally not true).
 620  */
 621 void
 622 dmu_tx_mark_netfree(dmu_tx_t *tx)
 623 {
 624         dmu_tx_hold_t *txh;
 625
 626         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 627             DMU_NEW_OBJECT, THT_FREE, 0, 0);
 628
 629         /*
 630          * Pretend that this operation will free 1GB of space.  This
 631          * should be large enough to cancel out the largest write.
 632          * We don't want to use something like UINT64_MAX, because that would
 633          * cause overflows when doing math with these values (e.g. in
 634          * dmu_tx_try_assign()).
 635          */
 636         txh->txh_space_tofree = txh->txh_space_tounref = 1024 * 1024 * 1024;
 637 }
 638
 639 void
 640 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 641 {
 642         dmu_tx_hold_t *txh;
 643         dnode_t *dn;
 644         int err;
 645         zio_t *zio;
 646
 647         ASSERT(tx->tx_txg == 0);
 648
 649         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 650             object, THT_FREE, off, len);
 651         if (txh == NULL)
 652                 return;
 653         dn = txh->txh_dnode;
 654         dmu_tx_count_dnode(txh);
 655
 656         if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 657                 return;
 658         if (len == DMU_OBJECT_END)
 659                 len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 660
 661         dmu_tx_count_dnode(txh);
 662
 663         /*
 664          * For i/o error checking, we read the first and last level-0
 665          * blocks if they are not aligned, and all the level-1 blocks.
 666          *
 667          * Note:  dbuf_free_range() assumes that we have not instantiated
 668          * any level-0 dbufs that will be completely freed.  Therefore we must
 669          * exercise care to not read or count the first and last blocks
 670          * if they are blocksize-aligned.
 671          */
 672         if (dn->dn_datablkshift == 0) {
 673                 if (off != 0 || len < dn->dn_datablksz)
 674                         dmu_tx_count_write(txh, 0, dn->dn_datablksz);
 675         } else {
 676                 /* first block will be modified if it is not aligned */
 677                 if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
 678                         dmu_tx_count_write(txh, off, 1);
 679                 /* last block will be modified if it is not aligned */
 680                 if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
 681                         dmu_tx_count_write(txh, off+len, 1);
 682         }
 683
 684         /*
 685          * Check level-1 blocks.
 686          */
 687         if (dn->dn_nlevels > 1) {
 688                 int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 689                     SPA_BLKPTRSHIFT;
 690                 uint64_t start = off >> shift;
 691                 uint64_t end = (off + len) >> shift;
 692                 uint64_t i;
 693
 694                 ASSERT(dn->dn_indblkshift != 0);
 695
 696                 /*
 697                  * dnode_reallocate() can result in an object with indirect
 698                  * blocks having an odd data block size.  In this case,
 699                  * just check the single block.
 700                  */
 701                 if (dn->dn_datablkshift == 0)
 702                         start = end = 0;
 703
 704                 zio = zio_root(tx->tx_pool->dp_spa,
 705                     NULL, NULL, ZIO_FLAG_CANFAIL);
 706                 for (i = start; i <= end; i++) {
 707                         uint64_t ibyte = i << shift;
 708                         err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 709                         i = ibyte >> shift;
 710                         if (err == ESRCH || i > end)
 711                                 break;
 712                         if (err) {
 713                                 tx->tx_err = err;
 714                                 return;
 715                         }
 716
 717                         err = dmu_tx_check_ioerr(zio, dn, 1, i);
 718                         if (err) {
 719                                 tx->tx_err = err;
 720                                 return;
 721                         }
 722                 }
 723                 err = zio_wait(zio);
 724                 if (err) {
 725                         tx->tx_err = err;
 726                         return;
 727                 }
 728         }
 729
 730         dmu_tx_count_free(txh, off, len);
 731 }
 732
 733 void
 734 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 735 {
 736         dmu_tx_hold_t *txh;
 737         dnode_t *dn;
 738         dsl_dataset_phys_t *ds_phys;
 739         uint64_t nblocks;
 740         int epbs, err;
 741
 742         ASSERT(tx->tx_txg == 0);
 743
 744         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 745             object, THT_ZAP, add, (uintptr_t)name);
 746         if (txh == NULL)
 747                 return;
 748         dn = txh->txh_dnode;
 749
 750         dmu_tx_count_dnode(txh);
 751
 752         if (dn == NULL) {
 753                 /*
 754                  * We will be able to fit a new object's entries into one leaf
 755                  * block.  So there will be at most 2 blocks total,
 756                  * including the header block.
 757                  */
 758                 dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
 759                 return;
 760         }
 761
 762         ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 763
 764         if (dn->dn_maxblkid == 0 && !add) {
 765                 blkptr_t *bp;
 766
 767                 /*
 768                  * If there is only one block  (i.e. this is a micro-zap)
 769                  * and we are not adding anything, the accounting is simple.
 770                  */
 771                 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 772                 if (err) {
 773                         tx->tx_err = err;
 774                         return;
 775                 }
 776
 777                 /*
 778                  * Use max block size here, since we don't know how much
 779                  * the size will change between now and the dbuf dirty call.
 780                  */
 781                 bp = &dn->dn_phys->dn_blkptr[0];
 782                 if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 783                     bp, bp->blk_birth))
 784                         txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ;
 785                 else
 786                         txh->txh_space_towrite += MZAP_MAX_BLKSZ;
 787                 if (!BP_IS_HOLE(bp))
 788                         txh->txh_space_tounref += MZAP_MAX_BLKSZ;
 789                 return;
 790         }
 791
 792         if (dn->dn_maxblkid > 0 && name) {
 793                 /*
 794                  * access the name in this fat-zap so that we'll check
 795                  * for i/o errors to the leaf blocks, etc.
 796                  */
 797                 err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
 798                 if (err == EIO) {
 799                         tx->tx_err = err;
 800                         return;
 801                 }
 802         }
 803
 804         err = zap_count_write_by_dnode(dn, name, add,
 805             &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 806
 807         /*
 808          * If the modified blocks are scattered to the four winds,
 809          * we'll have to modify an indirect twig for each.
 810          */
 811         epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 812         ds_phys = dsl_dataset_phys(dn->dn_objset->os_dsl_dataset);
 813         for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
 814                 if (ds_phys->ds_prev_snap_obj)
 815                         txh->txh_space_towrite += 3 << dn->dn_indblkshift;
 816                 else
 817                         txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
 818 }
 819
 820 void
 821 dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
 822 {
 823         dmu_tx_hold_t *txh;
 824
 825         ASSERT(tx->tx_txg == 0);
 826
 827         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 828             object, THT_BONUS, 0, 0);
 829         if (txh)
 830                 dmu_tx_count_dnode(txh);
 831 }
 832
 833 void
 834 dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
 835 {
 836         dmu_tx_hold_t *txh;
 837
 838         ASSERT(tx->tx_txg == 0);
 839
 840         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 841             DMU_NEW_OBJECT, THT_SPACE, space, 0);
 842         if (txh)
 843                 txh->txh_space_towrite += space;
 844 }
 845
 846 int
 847 dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
 848 {
 849         dmu_tx_hold_t *txh;
 850         int holds = 0;
 851
 852         /*
 853          * By asserting that the tx is assigned, we're counting the
 854          * number of dn_tx_holds, which is the same as the number of
 855          * dn_holds.  Otherwise, we'd be counting dn_holds, but
 856          * dn_tx_holds could be 0.
 857          */
 858         ASSERT(tx->tx_txg != 0);
 859
 860         /* if (tx->tx_anyobj == TRUE) */
 861                 /* return (0); */
 862
 863         for (txh = list_head(&tx->tx_holds); txh;
 864             txh = list_next(&tx->tx_holds, txh)) {
 865                 if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
 866                         holds++;
 867         }
 868
 869         return (holds);
 870 }
 871
 872 #ifdef DEBUG_DMU_TX
 873 void
 874 dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 875 {
 876         dmu_tx_hold_t *txh;
 877         int match_object = FALSE, match_offset = FALSE;
 878         dnode_t *dn;
 879
 880         DB_DNODE_ENTER(db);
 881         dn = DB_DNODE(db);
 882         ASSERT(dn != NULL);
 883         ASSERT(tx->tx_txg != 0);
 884         ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 885         ASSERT3U(dn->dn_object, ==, db->db.db_object);
 886
 887         if (tx->tx_anyobj) {
 888                 DB_DNODE_EXIT(db);
 889                 return;
 890         }
 891
 892         /* XXX No checking on the meta dnode for now */
 893         if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 894                 DB_DNODE_EXIT(db);
 895                 return;
 896         }
 897
 898         for (txh = list_head(&tx->tx_holds); txh;
 899             txh = list_next(&tx->tx_holds, txh)) {
 900                 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 901                 if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
 902                         match_object = TRUE;
 903                 if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
 904                         int datablkshift = dn->dn_datablkshift ?
 905                             dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
 906                         int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 907                         int shift = datablkshift + epbs * db->db_level;
 908                         uint64_t beginblk = shift >= 64 ? 0 :
 909                             (txh->txh_arg1 >> shift);
 910                         uint64_t endblk = shift >= 64 ? 0 :
 911                             ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
 912                         uint64_t blkid = db->db_blkid;
 913
 914                         /* XXX txh_arg2 better not be zero... */
 915
 916                         dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
 917                             txh->txh_type, beginblk, endblk);
 918
 919                         switch (txh->txh_type) {
 920                         case THT_WRITE:
 921                                 if (blkid >= beginblk && blkid <= endblk)
 922                                         match_offset = TRUE;
 923                                 /*
 924                                  * We will let this hold work for the bonus
 925                                  * or spill buffer so that we don't need to
 926                                  * hold it when creating a new object.
 927                                  */
 928                                 if (blkid == DMU_BONUS_BLKID ||
 929                                     blkid == DMU_SPILL_BLKID)
 930                                         match_offset = TRUE;
 931                                 /*
 932                                  * They might have to increase nlevels,
 933                                  * thus dirtying the new TLIBs.  Or the
 934                                  * might have to change the block size,
 935                                  * thus dirying the new lvl=0 blk=0.
 936                                  */
 937                                 if (blkid == 0)
 938                                         match_offset = TRUE;
 939                                 break;
 940                         case THT_FREE:
 941                                 /*
 942                                  * We will dirty all the level 1 blocks in
 943                                  * the free range and perhaps the first and
 944                                  * last level 0 block.
 945                                  */
 946                                 if (blkid >= beginblk && (blkid <= endblk ||
 947                                     txh->txh_arg2 == DMU_OBJECT_END))
 948                                         match_offset = TRUE;
 949                                 break;
 950                         case THT_SPILL:
 951                                 if (blkid == DMU_SPILL_BLKID)
 952                                         match_offset = TRUE;
 953                                 break;
 954                         case THT_BONUS:
 955                                 if (blkid == DMU_BONUS_BLKID)
 956                                         match_offset = TRUE;
 957                                 break;
 958                         case THT_ZAP:
 959                                 match_offset = TRUE;
 960                                 break;
 961                         case THT_NEWOBJECT:
 962                                 match_object = TRUE;
 963                                 break;
 964                         default:
 965                                 cmn_err(CE_PANIC, "bad txh_type %d",
 966                                     txh->txh_type);
 967                         }
 968                 }
 969                 if (match_object && match_offset) {
 970                         DB_DNODE_EXIT(db);
 971                         return;
 972                 }
 973         }
 974         DB_DNODE_EXIT(db);
 975         panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 976             (u_longlong_t)db->db.db_object, db->db_level,
 977             (u_longlong_t)db->db_blkid);
 978 }
 979 #endif
 980
/*
 * Upper bound on the delay dmu_tx_delay() computes for a single
 * transaction: 100ms per transaction corresponds to 10 iops.  If we can't
 * do 10 iops, something is wrong.  Let us go ahead and hit
 * zfs_dirty_data_max.
 */
hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */
int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
 987
 988 /*
 989  * We delay transactions when we've determined that the backend storage
 990  * isn't able to accommodate the rate of incoming writes.
 991  *
 992  * If there is already a transaction waiting, we delay relative to when
 993  * that transaction finishes waiting.  This way the calculated min_time
 994  * is independent of the number of threads concurrently executing
 995  * transactions.
 996  *
 997  * If we are the only waiter, wait relative to when the transaction
 998  * started, rather than the current time.  This credits the transaction for
 999  * "time already served", e.g. reading indirect blocks.
1000  *
1001  * The minimum time for a transaction to take is calculated as:
1002  *     min_time = scale * (dirty - min) / (max - dirty)
1003  *     min_time is then capped at zfs_delay_max_ns.
1004  *
1005  * The delay has two degrees of freedom that can be adjusted via tunables.
1006  * The percentage of dirty data at which we start to delay is defined by
1007  * zfs_delay_min_dirty_percent. This should typically be at or above
1008  * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
1009  * delay after writing at full speed has failed to keep up with the incoming
1010  * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
1011  * speaking, this variable determines the amount of delay at the midpoint of
1012  * the curve.
1013  *
1014  * delay
1015  *  10ms +-------------------------------------------------------------*+
1016  *       |                                                             *|
1017  *   9ms +                                                             *+
1018  *       |                                                             *|
1019  *   8ms +                                                             *+
1020  *       |                                                            * |
1021  *   7ms +                                                            * +
1022  *       |                                                            * |
1023  *   6ms +                                                            * +
1024  *       |                                                            * |
1025  *   5ms +                                                           *  +
1026  *       |                                                           *  |
1027  *   4ms +                                                           *  +
1028  *       |                                                           *  |
1029  *   3ms +                                                          *   +
1030  *       |                                                          *   |
1031  *   2ms +                                              (midpoint) *    +
1032  *       |                                                  |    **     |
1033  *   1ms +                                                  v ***       +
1034  *       |             zfs_delay_scale ---------->     ********         |
1035  *     0 +-------------------------------------*********----------------+
1036  *       0%                    <- zfs_dirty_data_max ->               100%
1037  *
1038  * Note that since the delay is added to the outstanding time remaining on the
1039  * most recent transaction, the delay is effectively the inverse of IOPS.
1040  * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
1041  * was chosen such that small changes in the amount of accumulated dirty data
1042  * in the first 3/4 of the curve yield relatively small differences in the
1043  * amount of delay.
1044  *
1045  * The effects can be easier to understand when the amount of delay is
1046  * represented on a log scale:
1047  *
1048  * delay
1049  * 100ms +-------------------------------------------------------------++
1050  *       +                                                              +
1051  *       |                                                              |
1052  *       +                                                             *+
1053  *  10ms +                                                             *+
1054  *       +                                                           ** +
1055  *       |                                              (midpoint)  **  |
1056  *       +                                                  |     **    +
1057  *   1ms +                                                  v ****      +
1058  *       +             zfs_delay_scale ---------->        *****         +
1059  *       |                                             ****             |
1060  *       +                                          ****                +
1061  * 100us +                                        **                    +
1062  *       +                                       *                      +
1063  *       |                                      *                       |
1064  *       +                                     *                        +
1065  *  10us +                                     *                        +
1066  *       +                                                              +
1067  *       |                                                              |
1068  *       +                                                              +
1069  *       +--------------------------------------------------------------+
1070  *       0%                    <- zfs_dirty_data_max ->               100%
1071  *
1072  * Note here that only as the amount of dirty data approaches its limit does
1073  * the delay start to increase rapidly. The goal of a properly tuned system
1074  * should be to keep the amount of dirty data out of that range by first
1075  * ensuring that the appropriate limits are set for the I/O scheduler to reach
1076  * optimal throughput on the backend storage, and then by changing the value
1077  * of zfs_delay_scale to increase the steepness of the curve.
1078  */
1079 static void
1080 dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
1081 {
1082         dsl_pool_t *dp = tx->tx_pool;
1083         uint64_t delay_min_bytes =
1084             zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
1085         hrtime_t wakeup, min_tx_time, now;
1086
1087         if (dirty <= delay_min_bytes)
1088                 return;
1089
1090         /*
1091          * The caller has already waited until we are under the max.
1092          * We make them pass us the amount of dirty data so we don't
1093          * have to handle the case of it being >= the max, which could
1094          * cause a divide-by-zero if it's == the max.
1095          */
1096         ASSERT3U(dirty, <, zfs_dirty_data_max);
1097
1098         now = gethrtime();
1099         min_tx_time = zfs_delay_scale *
1100             (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
1101         min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
1102         if (now > tx->tx_start + min_tx_time)
1103                 return;
1104
1105         DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
1106             uint64_t, min_tx_time);
1107
1108         mutex_enter(&dp->dp_lock);
1109         wakeup = MAX(tx->tx_start + min_tx_time,
1110             dp->dp_last_wakeup + min_tx_time);
1111         dp->dp_last_wakeup = wakeup;
1112         mutex_exit(&dp->dp_lock);
1113
1114         zfs_sleep_until(wakeup);
1115 }
1116
1117 static int
1118 dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
1119 {
1120         dmu_tx_hold_t *txh;
1121         spa_t *spa = tx->tx_pool->dp_spa;
1122         uint64_t memory, asize, fsize, usize;
1123         uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
1124
1125         ASSERT0(tx->tx_txg);
1126
1127         if (tx->tx_err) {
1128                 DMU_TX_STAT_BUMP(dmu_tx_error);
1129                 return (tx->tx_err);
1130         }
1131
1132         if (spa_suspended(spa)) {
1133                 DMU_TX_STAT_BUMP(dmu_tx_suspended);
1134
1135                 /*
1136                  * If the user has indicated a blocking failure mode
1137                  * then return ERESTART which will block in dmu_tx_wait().
1138                  * Otherwise, return EIO so that an error can get
1139                  * propagated back to the VOP calls.
1140                  *
1141                  * Note that we always honor the txg_how flag regardless
1142                  * of the failuremode setting.
1143                  */
1144                 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
1145                     txg_how != TXG_WAIT)
1146                         return (SET_ERROR(EIO));
1147
1148                 return (SET_ERROR(ERESTART));
1149         }
1150
1151         if (!tx->tx_waited &&
1152             dsl_pool_need_dirty_delay(tx->tx_pool)) {
1153                 tx->tx_wait_dirty = B_TRUE;
1154                 DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
1155                 return (ERESTART);
1156         }
1157
1158         tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
1159         tx->tx_needassign_txh = NULL;
1160
1161         /*
1162          * NB: No error returns are allowed after txg_hold_open, but
1163          * before processing the dnode holds, due to the
1164          * dmu_tx_unassign() logic.
1165          */
1166
1167         towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
1168         for (txh = list_head(&tx->tx_holds); txh;
1169             txh = list_next(&tx->tx_holds, txh)) {
1170                 dnode_t *dn = txh->txh_dnode;
1171                 if (dn != NULL) {
1172                         mutex_enter(&dn->dn_mtx);
1173                         if (dn->dn_assigned_txg == tx->tx_txg - 1) {
1174                                 mutex_exit(&dn->dn_mtx);
1175                                 tx->tx_needassign_txh = txh;
1176                                 DMU_TX_STAT_BUMP(dmu_tx_group);
1177                                 return (SET_ERROR(ERESTART));
1178                         }
1179                         if (dn->dn_assigned_txg == 0)
1180                                 dn->dn_assigned_txg = tx->tx_txg;
1181                         ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1182                         (void) refcount_add(&dn->dn_tx_holds, tx);
1183                         mutex_exit(&dn->dn_mtx);
1184                 }
1185                 towrite += txh->txh_space_towrite;
1186                 tofree += txh->txh_space_tofree;
1187                 tooverwrite += txh->txh_space_tooverwrite;
1188                 tounref += txh->txh_space_tounref;
1189                 tohold += txh->txh_memory_tohold;
1190                 fudge += txh->txh_fudge;
1191         }
1192
1193         /*
1194          * If a snapshot has been taken since we made our estimates,
1195          * assume that we won't be able to free or overwrite anything.
1196          */
1197         if (tx->tx_objset &&
1198             dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
1199             tx->tx_lastsnap_txg) {
1200                 towrite += tooverwrite;
1201                 tooverwrite = tofree = 0;
1202         }
1203
1204         /* needed allocation: worst-case estimate of write space */
1205         asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
1206         /* freed space estimate: worst-case overwrite + free estimate */
1207         fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
1208         /* convert unrefd space to worst-case estimate */
1209         usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
1210         /* calculate memory footprint estimate */
1211         memory = towrite + tooverwrite + tohold;
1212
1213 #ifdef DEBUG_DMU_TX
1214         /*
1215          * Add in 'tohold' to account for our dirty holds on this memory
1216          * XXX - the "fudge" factor is to account for skipped blocks that
1217          * we missed because dnode_next_offset() misses in-core-only blocks.
1218          */
1219         tx->tx_space_towrite = asize +
1220             spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
1221         tx->tx_space_tofree = tofree;
1222         tx->tx_space_tooverwrite = tooverwrite;
1223         tx->tx_space_tounref = tounref;
1224 #endif
1225
1226         if (tx->tx_dir && asize != 0) {
1227                 int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
1228                     asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
1229                 if (err)
1230                         return (err);
1231         }
1232
1233         DMU_TX_STAT_BUMP(dmu_tx_assigned);
1234
1235         return (0);
1236 }
1237
1238 static void
1239 dmu_tx_unassign(dmu_tx_t *tx)
1240 {
1241         dmu_tx_hold_t *txh;
1242
1243         if (tx->tx_txg == 0)
1244                 return;
1245
1246         txg_rele_to_quiesce(&tx->tx_txgh);
1247
1248         /*
1249          * Walk the transaction's hold list, removing the hold on the
1250          * associated dnode, and notifying waiters if the refcount drops to 0.
1251          */
1252         for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
1253             txh = list_next(&tx->tx_holds, txh)) {
1254                 dnode_t *dn = txh->txh_dnode;
1255
1256                 if (dn == NULL)
1257                         continue;
1258                 mutex_enter(&dn->dn_mtx);
1259                 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1260
1261                 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1262                         dn->dn_assigned_txg = 0;
1263                         cv_broadcast(&dn->dn_notxholds);
1264                 }
1265                 mutex_exit(&dn->dn_mtx);
1266         }
1267
1268         txg_rele_to_sync(&tx->tx_txgh);
1269
1270         tx->tx_lasttried_txg = tx->tx_txg;
1271         tx->tx_txg = 0;
1272 }
1273
1274 /*
1275  * Assign tx to a transaction group.  txg_how can be one of:
1276  *
1277  * (1)  TXG_WAIT.  If the current open txg is full, waits until there's
1278  *      a new one.  This should be used when you're not holding locks.
1279  *      It will only fail if we're truly out of space (or over quota).
1280  *
1281  * (2)  TXG_NOWAIT.  If we can't assign into the current open txg without
1282  *      blocking, returns immediately with ERESTART.  This should be used
1283  *      whenever you're holding locks.  On an ERESTART error, the caller
1284  *      should drop locks, do a dmu_tx_wait(tx), and try again.
1285  *
1286  * (3)  TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
1287  *      has already been called on behalf of this operation (though
1288  *      most likely on a different tx).
1289  */
1290 int
1291 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
1292 {
1293         int err;
1294
1295         ASSERT(tx->tx_txg == 0);
1296         ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
1297             txg_how == TXG_WAITED);
1298         ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1299
1300         if (txg_how == TXG_WAITED)
1301                 tx->tx_waited = B_TRUE;
1302
1303         /* If we might wait, we must not hold the config lock. */
1304         ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1305
1306         while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1307                 dmu_tx_unassign(tx);
1308
1309                 if (err != ERESTART || txg_how != TXG_WAIT)
1310                         return (err);
1311
1312                 dmu_tx_wait(tx);
1313         }
1314
1315         txg_rele_to_quiesce(&tx->tx_txgh);
1316
1317         return (0);
1318 }
1319
1320 void
1321 dmu_tx_wait(dmu_tx_t *tx)
1322 {
1323         spa_t *spa = tx->tx_pool->dp_spa;
1324         dsl_pool_t *dp = tx->tx_pool;
1325         hrtime_t before;
1326
1327         ASSERT(tx->tx_txg == 0);
1328         ASSERT(!dsl_pool_config_held(tx->tx_pool));
1329
1330         before = gethrtime();
1331
1332         if (tx->tx_wait_dirty) {
1333                 uint64_t dirty;
1334
1335                 /*
1336                  * dmu_tx_try_assign() has determined that we need to wait
1337                  * because we've consumed much or all of the dirty buffer
1338                  * space.
1339                  */
1340                 mutex_enter(&dp->dp_lock);
1341                 if (dp->dp_dirty_total >= zfs_dirty_data_max)
1342                         DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
1343                 while (dp->dp_dirty_total >= zfs_dirty_data_max)
1344                         cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
1345                 dirty = dp->dp_dirty_total;
1346                 mutex_exit(&dp->dp_lock);
1347
1348                 dmu_tx_delay(tx, dirty);
1349
1350                 tx->tx_wait_dirty = B_FALSE;
1351
1352                 /*
1353                  * Note: setting tx_waited only has effect if the caller
1354                  * used TX_WAIT.  Otherwise they are going to destroy
1355                  * this tx and try again.  The common case, zfs_write(),
1356                  * uses TX_WAIT.
1357                  */
1358                 tx->tx_waited = B_TRUE;
1359         } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1360                 /*
1361                  * If the pool is suspended we need to wait until it
1362                  * is resumed.  Note that it's possible that the pool
1363                  * has become active after this thread has tried to
1364                  * obtain a tx.  If that's the case then tx_lasttried_txg
1365                  * would not have been set.
1366                  */
1367                 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1368         } else if (tx->tx_needassign_txh) {
1369                 dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1370
1371                 mutex_enter(&dn->dn_mtx);
1372                 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1373                         cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1374                 mutex_exit(&dn->dn_mtx);
1375                 tx->tx_needassign_txh = NULL;
1376         } else {
1377                 /*
1378                  * A dnode is assigned to the quiescing txg.  Wait for its
1379                  * transaction to complete.
1380                  */
1381                 txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1382         }
1383
1384         spa_tx_assign_add_nsecs(spa, gethrtime() - before);
1385 }
1386
1387 void
1388 dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
1389 {
1390 #ifdef DEBUG_DMU_TX
1391         if (tx->tx_dir == NULL || delta == 0)
1392                 return;
1393
1394         if (delta > 0) {
1395                 ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
1396                     tx->tx_space_towrite);
1397                 (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
1398         } else {
1399                 (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
1400         }
1401 #endif
1402 }
1403
1404 void
1405 dmu_tx_commit(dmu_tx_t *tx)
1406 {
1407         dmu_tx_hold_t *txh;
1408
1409         ASSERT(tx->tx_txg != 0);
1410
1411         /*
1412          * Go through the transaction's hold list and remove holds on
1413          * associated dnodes, notifying waiters if no holds remain.
1414          */
1415         while ((txh = list_head(&tx->tx_holds))) {
1416                 dnode_t *dn = txh->txh_dnode;
1417
1418                 list_remove(&tx->tx_holds, txh);
1419                 kmem_free(txh, sizeof (dmu_tx_hold_t));
1420                 if (dn == NULL)
1421                         continue;
1422                 mutex_enter(&dn->dn_mtx);
1423                 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1424
1425                 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1426                         dn->dn_assigned_txg = 0;
1427                         cv_broadcast(&dn->dn_notxholds);
1428                 }
1429                 mutex_exit(&dn->dn_mtx);
1430                 dnode_rele(dn, tx);
1431         }
1432
1433         if (tx->tx_tempreserve_cookie)
1434                 dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1435
1436         if (!list_is_empty(&tx->tx_callbacks))
1437                 txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1438
1439         if (tx->tx_anyobj == FALSE)
1440                 txg_rele_to_sync(&tx->tx_txgh);
1441
1442         list_destroy(&tx->tx_callbacks);
1443         list_destroy(&tx->tx_holds);
1444 #ifdef DEBUG_DMU_TX
1445         dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
1446             tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
1447             tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
1448         refcount_destroy_many(&tx->tx_space_written,
1449             refcount_count(&tx->tx_space_written));
1450         refcount_destroy_many(&tx->tx_space_freed,
1451             refcount_count(&tx->tx_space_freed));
1452 #endif
1453         kmem_free(tx, sizeof (dmu_tx_t));
1454 }
1455
1456 void
1457 dmu_tx_abort(dmu_tx_t *tx)
1458 {
1459         dmu_tx_hold_t *txh;
1460
1461         ASSERT(tx->tx_txg == 0);
1462
1463         while ((txh = list_head(&tx->tx_holds))) {
1464                 dnode_t *dn = txh->txh_dnode;
1465
1466                 list_remove(&tx->tx_holds, txh);
1467                 kmem_free(txh, sizeof (dmu_tx_hold_t));
1468                 if (dn != NULL)
1469                         dnode_rele(dn, tx);
1470         }
1471
1472         /*
1473          * Call any registered callbacks with an error code.
1474          */
1475         if (!list_is_empty(&tx->tx_callbacks))
1476                 dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1477
1478         list_destroy(&tx->tx_callbacks);
1479         list_destroy(&tx->tx_holds);
1480 #ifdef DEBUG_DMU_TX
1481         refcount_destroy_many(&tx->tx_space_written,
1482             refcount_count(&tx->tx_space_written));
1483         refcount_destroy_many(&tx->tx_space_freed,
1484             refcount_count(&tx->tx_space_freed));
1485 #endif
1486         kmem_free(tx, sizeof (dmu_tx_t));
1487 }
1488
1489 uint64_t
1490 dmu_tx_get_txg(dmu_tx_t *tx)
1491 {
1492         ASSERT(tx->tx_txg != 0);
1493         return (tx->tx_txg);
1494 }
1495
1496 dsl_pool_t *
1497 dmu_tx_pool(dmu_tx_t *tx)
1498 {
1499         ASSERT(tx->tx_pool != NULL);
1500         return (tx->tx_pool);
1501 }
1502
1503 void
1504 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1505 {
1506         dmu_tx_callback_t *dcb;
1507
1508         dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1509
1510         dcb->dcb_func = func;
1511         dcb->dcb_data = data;
1512
1513         list_insert_tail(&tx->tx_callbacks, dcb);
1514 }
1515
1516 /*
1517  * Call all the commit callbacks on a list, with a given error code.
1518  */
1519 void
1520 dmu_tx_do_callbacks(list_t *cb_list, int error)
1521 {
1522         dmu_tx_callback_t *dcb;
1523
1524         while ((dcb = list_head(cb_list))) {
1525                 list_remove(cb_list, dcb);
1526                 dcb->dcb_func(dcb->dcb_data, error);
1527                 kmem_free(dcb, sizeof (dmu_tx_callback_t));
1528         }
1529 }
1530
/*
 * Interface for holding a group of attributes; it is used when creating
 * new files (see dmu_tx_hold_sa_create()).  attrsize is the total size of
 * all attributes to be added during object creation.
 *
 * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
 */
1539
1540 /*
1541  * hold necessary attribute name for attribute registration.
1542  * should be a very rare case where this is needed.  If it does
1543  * happen it would only happen on the first write to the file system.
1544  */
1545 static void
1546 dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1547 {
1548         int i;
1549
1550         if (!sa->sa_need_attr_registration)
1551                 return;
1552
1553         for (i = 0; i != sa->sa_num_attrs; i++) {
1554                 if (!sa->sa_attr_table[i].sa_registered) {
1555                         if (sa->sa_reg_attr_obj)
1556                                 dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1557                                     B_TRUE, sa->sa_attr_table[i].sa_name);
1558                         else
1559                                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1560                                     B_TRUE, sa->sa_attr_table[i].sa_name);
1561                 }
1562         }
1563 }
1564
1565
1566 void
1567 dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1568 {
1569         dnode_t *dn;
1570         dmu_tx_hold_t *txh;
1571
1572         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
1573             THT_SPILL, 0, 0);
1574         if (txh == NULL)
1575                 return;
1576
1577         dn = txh->txh_dnode;
1578
1579         if (dn == NULL)
1580                 return;
1581
1582         /* If blkptr doesn't exist then add space to towrite */
1583         if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
1584                 txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
1585         } else {
1586                 blkptr_t *bp;
1587
1588                 bp = DN_SPILL_BLKPTR(dn->dn_phys);
1589                 if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
1590                     bp, bp->blk_birth))
1591                         txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
1592                 else
1593                         txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
1594                 if (!BP_IS_HOLE(bp))
1595                         txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE;
1596         }
1597 }
1598
1599 void
1600 dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1601 {
1602         sa_os_t *sa = tx->tx_objset->os_sa;
1603
1604         dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1605
1606         if (tx->tx_objset->os_sa->sa_master_obj == 0)
1607                 return;
1608
1609         if (tx->tx_objset->os_sa->sa_layout_attr_obj)
1610                 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1611         else {
1612                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1613                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1614                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1615                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1616         }
1617
1618         dmu_tx_sa_registration_hold(sa, tx);
1619
1620         if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill)
1621                 return;
1622
1623         (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1624             THT_SPILL, 0, 0);
1625 }
1626
1627 /*
1628  * Hold SA attribute
1629  *
1630  * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
1631  *
1632  * variable_size is the total size of all variable sized attributes
1633  * passed to this function.  It is not the total size of all
1634  * variable size attributes that *may* exist on this object.
1635  */
1636 void
1637 dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1638 {
1639         uint64_t object;
1640         sa_os_t *sa = tx->tx_objset->os_sa;
1641
1642         ASSERT(hdl != NULL);
1643
1644         object = sa_handle_object(hdl);
1645
1646         dmu_tx_hold_bonus(tx, object);
1647
1648         if (tx->tx_objset->os_sa->sa_master_obj == 0)
1649                 return;
1650
1651         if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1652             tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1653                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1654                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1655                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1656                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1657         }
1658
1659         dmu_tx_sa_registration_hold(sa, tx);
1660
1661         if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1662                 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1663
1664         if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1665                 ASSERT(tx->tx_txg == 0);
1666                 dmu_tx_hold_spill(tx, object);
1667         } else {
1668                 dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1669                 dnode_t *dn;
1670
1671                 DB_DNODE_ENTER(db);
1672                 dn = DB_DNODE(db);
1673                 if (dn->dn_have_spill) {
1674                         ASSERT(tx->tx_txg == 0);
1675                         dmu_tx_hold_spill(tx, object);
1676                 }
1677                 DB_DNODE_EXIT(db);
1678         }
1679 }
1680
1681 void
1682 dmu_tx_init(void)
1683 {
1684         dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc",
1685             KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t),
1686             KSTAT_FLAG_VIRTUAL);
1687
1688         if (dmu_tx_ksp != NULL) {
1689                 dmu_tx_ksp->ks_data = &dmu_tx_stats;
1690                 kstat_install(dmu_tx_ksp);
1691         }
1692 }
1693
1694 void
1695 dmu_tx_fini(void)
1696 {
1697         if (dmu_tx_ksp != NULL) {
1698                 kstat_delete(dmu_tx_ksp);
1699                 dmu_tx_ksp = NULL;
1700         }
1701 }
1702
/*
 * Symbol exports for the dmu_tx interface.  These are compiled only when
 * both _KERNEL and HAVE_SPL are defined.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Transaction creation and basic holds. */
EXPORT_SYMBOL(dmu_tx_create);
EXPORT_SYMBOL(dmu_tx_hold_write);
EXPORT_SYMBOL(dmu_tx_hold_free);
EXPORT_SYMBOL(dmu_tx_hold_zap);
EXPORT_SYMBOL(dmu_tx_hold_bonus);

/* Transaction lifecycle. */
EXPORT_SYMBOL(dmu_tx_abort);
EXPORT_SYMBOL(dmu_tx_assign);
EXPORT_SYMBOL(dmu_tx_wait);
EXPORT_SYMBOL(dmu_tx_commit);
EXPORT_SYMBOL(dmu_tx_get_txg);

/* Callbacks. */
EXPORT_SYMBOL(dmu_tx_callback_register);
EXPORT_SYMBOL(dmu_tx_do_callbacks);

/* Spill block and system attribute holds. */
EXPORT_SYMBOL(dmu_tx_hold_spill);
EXPORT_SYMBOL(dmu_tx_hold_sa_create);
EXPORT_SYMBOL(dmu_tx_hold_sa);
#endif /* _KERNEL && HAVE_SPL */