module/zfs/dmu_tx.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  25  */
  26
  27 #include <sys/dmu.h>
  28 #include <sys/dmu_impl.h>
  29 #include <sys/dbuf.h>
  30 #include <sys/dmu_tx.h>
  31 #include <sys/dmu_objset.h>
  32 #include <sys/dsl_dataset.h> /* for dsl_dataset_block_freeable() */
  33 #include <sys/dsl_dir.h> /* for dsl_dir_tempreserve_*() */
  34 #include <sys/dsl_pool.h>
  35 #include <sys/zap_impl.h> /* for fzap_default_block_shift */
  36 #include <sys/spa.h>
  37 #include <sys/sa.h>
  38 #include <sys/sa_impl.h>
  39 #include <sys/zfs_context.h>
  40 #include <sys/varargs.h>
  41 #include <sys/trace_dmu.h>
  42
  43 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
  44     uint64_t arg1, uint64_t arg2);
  45
  46 dmu_tx_stats_t dmu_tx_stats = {
  47         { "dmu_tx_assigned",            KSTAT_DATA_UINT64 },
  48         { "dmu_tx_delay",               KSTAT_DATA_UINT64 },
  49         { "dmu_tx_error",               KSTAT_DATA_UINT64 },
  50         { "dmu_tx_suspended",           KSTAT_DATA_UINT64 },
  51         { "dmu_tx_group",               KSTAT_DATA_UINT64 },
  52         { "dmu_tx_memory_reserve",      KSTAT_DATA_UINT64 },
  53         { "dmu_tx_memory_reclaim",      KSTAT_DATA_UINT64 },
  54         { "dmu_tx_dirty_throttle",      KSTAT_DATA_UINT64 },
  55         { "dmu_tx_dirty_delay",         KSTAT_DATA_UINT64 },
  56         { "dmu_tx_dirty_over_max",      KSTAT_DATA_UINT64 },
  57         { "dmu_tx_quota",               KSTAT_DATA_UINT64 },
  58 };
  59
  60 static kstat_t *dmu_tx_ksp;
  61
  62 dmu_tx_t *
  63 dmu_tx_create_dd(dsl_dir_t *dd)
  64 {
  65         dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
  66         tx->tx_dir = dd;
  67         if (dd != NULL)
  68                 tx->tx_pool = dd->dd_pool;
  69         list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
  70             offsetof(dmu_tx_hold_t, txh_node));
  71         list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
  72             offsetof(dmu_tx_callback_t, dcb_node));
  73         tx->tx_start = gethrtime();
  74 #ifdef DEBUG_DMU_TX
  75         refcount_create(&tx->tx_space_written);
  76         refcount_create(&tx->tx_space_freed);
  77 #endif
  78         return (tx);
  79 }
  80
  81 dmu_tx_t *
  82 dmu_tx_create(objset_t *os)
  83 {
  84         dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
  85         tx->tx_objset = os;
  86         tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset);
  87         return (tx);
  88 }
  89
  90 dmu_tx_t *
  91 dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
  92 {
  93         dmu_tx_t *tx = dmu_tx_create_dd(NULL);
  94
  95         ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
  96         tx->tx_pool = dp;
  97         tx->tx_txg = txg;
  98         tx->tx_anyobj = TRUE;
  99
 100         return (tx);
 101 }
 102
 103 int
 104 dmu_tx_is_syncing(dmu_tx_t *tx)
 105 {
 106         return (tx->tx_anyobj);
 107 }
 108
 109 int
 110 dmu_tx_private_ok(dmu_tx_t *tx)
 111 {
 112         return (tx->tx_anyobj);
 113 }
 114
 115 static dmu_tx_hold_t *
 116 dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
 117     enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
 118 {
 119         dmu_tx_hold_t *txh;
 120         dnode_t *dn = NULL;
 121         int err;
 122
 123         if (object != DMU_NEW_OBJECT) {
 124                 err = dnode_hold(os, object, tx, &dn);
 125                 if (err) {
 126                         tx->tx_err = err;
 127                         return (NULL);
 128                 }
 129
 130                 if (err == 0 && tx->tx_txg != 0) {
 131                         mutex_enter(&dn->dn_mtx);
 132                         /*
 133                          * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
 134                          * problem, but there's no way for it to happen (for
 135                          * now, at least).
 136                          */
 137                         ASSERT(dn->dn_assigned_txg == 0);
 138                         dn->dn_assigned_txg = tx->tx_txg;
 139                         (void) refcount_add(&dn->dn_tx_holds, tx);
 140                         mutex_exit(&dn->dn_mtx);
 141                 }
 142         }
 143
 144         txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
 145         txh->txh_tx = tx;
 146         txh->txh_dnode = dn;
 147 #ifdef DEBUG_DMU_TX
 148         txh->txh_type = type;
 149         txh->txh_arg1 = arg1;
 150         txh->txh_arg2 = arg2;
 151 #endif
 152         list_insert_tail(&tx->tx_holds, txh);
 153
 154         return (txh);
 155 }
 156
 157 void
 158 dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
 159 {
 160         /*
 161          * If we're syncing, they can manipulate any object anyhow, and
 162          * the hold on the dnode_t can cause problems.
 163          */
 164         if (!dmu_tx_is_syncing(tx)) {
 165                 (void) dmu_tx_hold_object_impl(tx, os,
 166                     object, THT_NEWOBJECT, 0, 0);
 167         }
 168 }
 169
 170 static int
 171 dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
 172 {
 173         int err;
 174         dmu_buf_impl_t *db;
 175
 176         rw_enter(&dn->dn_struct_rwlock, RW_READER);
 177         db = dbuf_hold_level(dn, level, blkid, FTAG);
 178         rw_exit(&dn->dn_struct_rwlock);
 179         if (db == NULL)
 180                 return (SET_ERROR(EIO));
 181         err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
 182         dbuf_rele(db, FTAG);
 183         return (err);
 184 }
 185
 186 static void
 187 dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db,
 188     int level, uint64_t blkid, boolean_t freeable, uint64_t *history)
 189 {
 190         objset_t *os = dn->dn_objset;
 191         dsl_dataset_t *ds = os->os_dsl_dataset;
 192         int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 193         dmu_buf_impl_t *parent = NULL;
 194         blkptr_t *bp = NULL;
 195         uint64_t space;
 196
 197         if (level >= dn->dn_nlevels || history[level] == blkid)
 198                 return;
 199
 200         history[level] = blkid;
 201
 202         space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift);
 203
 204         if (db == NULL || db == dn->dn_dbuf) {
 205                 ASSERT(level != 0);
 206                 db = NULL;
 207         } else {
 208                 ASSERT(DB_DNODE(db) == dn);
 209                 ASSERT(db->db_level == level);
 210                 ASSERT(db->db.db_size == space);
 211                 ASSERT(db->db_blkid == blkid);
 212                 bp = db->db_blkptr;
 213                 parent = db->db_parent;
 214         }
 215
 216         freeable = (bp && (freeable ||
 217             dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
 218
 219         if (freeable)
 220                 txh->txh_space_tooverwrite += space;
 221         else
 222                 txh->txh_space_towrite += space;
 223         if (bp)
 224                 txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
 225
 226         dmu_tx_count_twig(txh, dn, parent, level + 1,
 227             blkid >> epbs, freeable, history);
 228 }
 229
 230 /* ARGSUSED */
 231 static void
 232 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 233 {
 234         dnode_t *dn = txh->txh_dnode;
 235         uint64_t start, end, i;
 236         int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
 237         int err = 0;
 238         int l;
 239
 240         if (len == 0)
 241                 return;
 242
 243         min_bs = SPA_MINBLOCKSHIFT;
 244         max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
 245         min_ibs = DN_MIN_INDBLKSHIFT;
 246         max_ibs = DN_MAX_INDBLKSHIFT;
 247
 248         if (dn) {
 249                 uint64_t history[DN_MAX_LEVELS];
 250                 int nlvls = dn->dn_nlevels;
 251                 int delta;
 252
 253                 /*
 254                  * For i/o error checking, read the first and last level-0
 255                  * blocks (if they are not aligned), and all the level-1 blocks.
 256                  */
 257                 if (dn->dn_maxblkid == 0) {
 258                         delta = dn->dn_datablksz;
 259                         start = (off < dn->dn_datablksz) ? 0 : 1;
 260                         end = (off+len <= dn->dn_datablksz) ? 0 : 1;
 261                         if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
 262                                 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 263                                 if (err)
 264                                         goto out;
 265                                 delta -= off;
 266                         }
 267                 } else {
 268                         zio_t *zio = zio_root(dn->dn_objset->os_spa,
 269                             NULL, NULL, ZIO_FLAG_CANFAIL);
 270
 271                         /* first level-0 block */
 272                         start = off >> dn->dn_datablkshift;
 273                         if (P2PHASE(off, dn->dn_datablksz) ||
 274                             len < dn->dn_datablksz) {
 275                                 err = dmu_tx_check_ioerr(zio, dn, 0, start);
 276                                 if (err)
 277                                         goto out;
 278                         }
 279
 280                         /* last level-0 block */
 281                         end = (off+len-1) >> dn->dn_datablkshift;
 282                         if (end != start && end <= dn->dn_maxblkid &&
 283                             P2PHASE(off+len, dn->dn_datablksz)) {
 284                                 err = dmu_tx_check_ioerr(zio, dn, 0, end);
 285                                 if (err)
 286                                         goto out;
 287                         }
 288
 289                         /* level-1 blocks */
 290                         if (nlvls > 1) {
 291                                 int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 292                                 for (i = (start>>shft)+1; i < end>>shft; i++) {
 293                                         err = dmu_tx_check_ioerr(zio, dn, 1, i);
 294                                         if (err)
 295                                                 goto out;
 296                                 }
 297                         }
 298
 299                         err = zio_wait(zio);
 300                         if (err)
 301                                 goto out;
 302                         delta = P2NPHASE(off, dn->dn_datablksz);
 303                 }
 304
 305                 min_ibs = max_ibs = dn->dn_indblkshift;
 306                 if (dn->dn_maxblkid > 0) {
 307                         /*
 308                          * The blocksize can't change,
 309                          * so we can make a more precise estimate.
 310                          */
 311                         ASSERT(dn->dn_datablkshift != 0);
 312                         min_bs = max_bs = dn->dn_datablkshift;
 313                 } else {
 314                         /*
 315                          * The blocksize can increase up to the recordsize,
 316                          * or if it is already more than the recordsize,
 317                          * up to the next power of 2.
 318                          */
 319                         min_bs = highbit64(dn->dn_datablksz - 1);
 320                         max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
 321                 }
 322
 323                 /*
 324                  * If this write is not off the end of the file
 325                  * we need to account for overwrites/unref.
 326                  */
 327                 if (start <= dn->dn_maxblkid) {
 328                         for (l = 0; l < DN_MAX_LEVELS; l++)
 329                                 history[l] = -1ULL;
 330                 }
 331                 while (start <= dn->dn_maxblkid) {
 332                         dmu_buf_impl_t *db;
 333
 334                         rw_enter(&dn->dn_struct_rwlock, RW_READER);
 335                         err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
 336                         rw_exit(&dn->dn_struct_rwlock);
 337
 338                         if (err) {
 339                                 txh->txh_tx->tx_err = err;
 340                                 return;
 341                         }
 342
 343                         dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
 344                             history);
 345                         dbuf_rele(db, FTAG);
 346                         if (++start > end) {
 347                                 /*
 348                                  * Account for new indirects appearing
 349                                  * before this IO gets assigned into a txg.
 350                                  */
 351                                 bits = 64 - min_bs;
 352                                 epbs = min_ibs - SPA_BLKPTRSHIFT;
 353                                 for (bits -= epbs * (nlvls - 1);
 354                                     bits >= 0; bits -= epbs)
 355                                         txh->txh_fudge += 1ULL << max_ibs;
 356                                 goto out;
 357                         }
 358                         off += delta;
 359                         if (len >= delta)
 360                                 len -= delta;
 361                         delta = dn->dn_datablksz;
 362                 }
 363         }
 364
 365         /*
 366          * 'end' is the last thing we will access, not one past.
 367          * This way we won't overflow when accessing the last byte.
 368          */
 369         start = P2ALIGN(off, 1ULL << max_bs);
 370         end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
 371         txh->txh_space_towrite += end - start + 1;
 372
 373         start >>= min_bs;
 374         end >>= min_bs;
 375
 376         epbs = min_ibs - SPA_BLKPTRSHIFT;
 377
 378         /*
 379          * The object contains at most 2^(64 - min_bs) blocks,
 380          * and each indirect level maps 2^epbs.
 381          */
 382         for (bits = 64 - min_bs; bits >= 0; bits -= epbs) {
 383                 start >>= epbs;
 384                 end >>= epbs;
 385                 ASSERT3U(end, >=, start);
 386                 txh->txh_space_towrite += (end - start + 1) << max_ibs;
 387                 if (start != 0) {
 388                         /*
 389                          * We also need a new blkid=0 indirect block
 390                          * to reference any existing file data.
 391                          */
 392                         txh->txh_space_towrite += 1ULL << max_ibs;
 393                 }
 394         }
 395
 396 out:
 397         if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
 398             2 * DMU_MAX_ACCESS)
 399                 err = SET_ERROR(EFBIG);
 400
 401         if (err)
 402                 txh->txh_tx->tx_err = err;
 403 }
 404
 405 static void
 406 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
 407 {
 408         dnode_t *dn = txh->txh_dnode;
 409         dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset);
 410         uint64_t space = mdn->dn_datablksz +
 411             ((mdn->dn_nlevels-1) << mdn->dn_indblkshift);
 412
 413         if (dn && dn->dn_dbuf->db_blkptr &&
 414             dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 415             dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
 416                 txh->txh_space_tooverwrite += space;
 417                 txh->txh_space_tounref += space;
 418         } else {
 419                 txh->txh_space_towrite += space;
 420                 if (dn && dn->dn_dbuf->db_blkptr)
 421                         txh->txh_space_tounref += space;
 422         }
 423 }
 424
 425 void
 426 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
 427 {
 428         dmu_tx_hold_t *txh;
 429
 430         ASSERT(tx->tx_txg == 0);
 431         ASSERT(len <= DMU_MAX_ACCESS);
 432         ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
 433
 434         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 435             object, THT_WRITE, off, len);
 436         if (txh == NULL)
 437                 return;
 438
 439         dmu_tx_count_write(txh, off, len);
 440         dmu_tx_count_dnode(txh);
 441 }
 442
 443 static void
 444 dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 445 {
 446         uint64_t blkid, nblks, lastblk;
 447         uint64_t space = 0, unref = 0, skipped = 0;
 448         dnode_t *dn = txh->txh_dnode;
 449         dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 450         spa_t *spa = txh->txh_tx->tx_pool->dp_spa;
 451         int epbs;
 452         uint64_t l0span = 0, nl1blks = 0;
 453
 454         if (dn->dn_nlevels == 0)
 455                 return;
 456
 457         /*
 458          * The struct_rwlock protects us against dn_nlevels
 459          * changing, in case (against all odds) we manage to dirty &
 460          * sync out the changes after we check for being dirty.
 461          * Also, dbuf_hold_impl() wants us to have the struct_rwlock.
 462          */
 463         rw_enter(&dn->dn_struct_rwlock, RW_READER);
 464         epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 465         if (dn->dn_maxblkid == 0) {
 466                 if (off == 0 && len >= dn->dn_datablksz) {
 467                         blkid = 0;
 468                         nblks = 1;
 469                 } else {
 470                         rw_exit(&dn->dn_struct_rwlock);
 471                         return;
 472                 }
 473         } else {
 474                 blkid = off >> dn->dn_datablkshift;
 475                 nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
 476
 477                 if (blkid > dn->dn_maxblkid) {
 478                         rw_exit(&dn->dn_struct_rwlock);
 479                         return;
 480                 }
 481                 if (blkid + nblks > dn->dn_maxblkid)
 482                         nblks = dn->dn_maxblkid - blkid + 1;
 483
 484         }
 485         l0span = nblks;    /* save for later use to calc level > 1 overhead */
 486         if (dn->dn_nlevels == 1) {
 487                 int i;
 488                 for (i = 0; i < nblks; i++) {
 489                         blkptr_t *bp = dn->dn_phys->dn_blkptr;
 490                         ASSERT3U(blkid + i, <, dn->dn_nblkptr);
 491                         bp += blkid + i;
 492                         if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) {
 493                                 dprintf_bp(bp, "can free old%s", "");
 494                                 space += bp_get_dsize(spa, bp);
 495                         }
 496                         unref += BP_GET_ASIZE(bp);
 497                 }
 498                 nl1blks = 1;
 499                 nblks = 0;
 500         }
 501
 502         lastblk = blkid + nblks - 1;
 503         while (nblks) {
 504                 dmu_buf_impl_t *dbuf;
 505                 uint64_t ibyte, new_blkid;
 506                 int epb = 1 << epbs;
 507                 int err, i, blkoff, tochk;
 508                 blkptr_t *bp;
 509
 510                 ibyte = blkid << dn->dn_datablkshift;
 511                 err = dnode_next_offset(dn,
 512                     DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 513                 new_blkid = ibyte >> dn->dn_datablkshift;
 514                 if (err == ESRCH) {
 515                         skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 516                         break;
 517                 }
 518                 if (err) {
 519                         txh->txh_tx->tx_err = err;
 520                         break;
 521                 }
 522                 if (new_blkid > lastblk) {
 523                         skipped += (lastblk >> epbs) - (blkid >> epbs) + 1;
 524                         break;
 525                 }
 526
 527                 if (new_blkid > blkid) {
 528                         ASSERT((new_blkid >> epbs) > (blkid >> epbs));
 529                         skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1;
 530                         nblks -= new_blkid - blkid;
 531                         blkid = new_blkid;
 532                 }
 533                 blkoff = P2PHASE(blkid, epb);
 534                 tochk = MIN(epb - blkoff, nblks);
 535
 536                 err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
 537                 if (err) {
 538                         txh->txh_tx->tx_err = err;
 539                         break;
 540                 }
 541
 542                 txh->txh_memory_tohold += dbuf->db.db_size;
 543
 544                 /*
 545                  * We don't check memory_tohold against DMU_MAX_ACCESS because
 546                  * memory_tohold is an over-estimation (especially the >L1
 547                  * indirect blocks), so it could fail.  Callers should have
 548                  * already verified that they will not be holding too much
 549                  * memory.
 550                  */
 551
 552                 err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
 553                 if (err != 0) {
 554                         txh->txh_tx->tx_err = err;
 555                         dbuf_rele(dbuf, FTAG);
 556                         break;
 557                 }
 558
 559                 bp = dbuf->db.db_data;
 560                 bp += blkoff;
 561
 562                 for (i = 0; i < tochk; i++) {
 563                         if (dsl_dataset_block_freeable(ds, &bp[i],
 564                             bp[i].blk_birth)) {
 565                                 dprintf_bp(&bp[i], "can free old%s", "");
 566                                 space += bp_get_dsize(spa, &bp[i]);
 567                         }
 568                         unref += BP_GET_ASIZE(bp);
 569                 }
 570                 dbuf_rele(dbuf, FTAG);
 571
 572                 ++nl1blks;
 573                 blkid += tochk;
 574                 nblks -= tochk;
 575         }
 576         rw_exit(&dn->dn_struct_rwlock);
 577
 578         /*
 579          * Add in memory requirements of higher-level indirects.
 580          * This assumes a worst-possible scenario for dn_nlevels and a
 581          * worst-possible distribution of l1-blocks over the region to free.
 582          */
 583         {
 584                 uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs);
 585                 int level = 2;
 586                 /*
 587                  * Here we don't use DN_MAX_LEVEL, but calculate it with the
 588                  * given datablkshift and indblkshift. This makes the
 589                  * difference between 19 and 8 on large files.
 590                  */
 591                 int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) /
 592                     (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
 593
 594                 while (level++ < maxlevel) {
 595                         txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
 596                             << dn->dn_indblkshift;
 597                         blkcnt = 1 + (blkcnt >> epbs);
 598                 }
 599         }
 600
 601         /* account for new level 1 indirect blocks that might show up */
 602         if (skipped > 0) {
 603                 txh->txh_fudge += skipped << dn->dn_indblkshift;
 604                 skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
 605                 txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
 606         }
 607         txh->txh_space_tofree += space;
 608         txh->txh_space_tounref += unref;
 609 }
 610
 611 void
 612 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 613 {
 614         dmu_tx_hold_t *txh;
 615         dnode_t *dn;
 616         int err;
 617         zio_t *zio;
 618
 619         ASSERT(tx->tx_txg == 0);
 620
 621         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 622             object, THT_FREE, off, len);
 623         if (txh == NULL)
 624                 return;
 625         dn = txh->txh_dnode;
 626         dmu_tx_count_dnode(txh);
 627
 628         if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 629                 return;
 630         if (len == DMU_OBJECT_END)
 631                 len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 632
 633         dmu_tx_count_dnode(txh);
 634
 635         /*
 636          * For i/o error checking, we read the first and last level-0
 637          * blocks if they are not aligned, and all the level-1 blocks.
 638          *
 639          * Note:  dbuf_free_range() assumes that we have not instantiated
 640          * any level-0 dbufs that will be completely freed.  Therefore we must
 641          * exercise care to not read or count the first and last blocks
 642          * if they are blocksize-aligned.
 643          */
 644         if (dn->dn_datablkshift == 0) {
 645                 if (off != 0 || len < dn->dn_datablksz)
 646                         dmu_tx_count_write(txh, 0, dn->dn_datablksz);
 647         } else {
 648                 /* first block will be modified if it is not aligned */
 649                 if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
 650                         dmu_tx_count_write(txh, off, 1);
 651                 /* last block will be modified if it is not aligned */
 652                 if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
 653                         dmu_tx_count_write(txh, off+len, 1);
 654         }
 655
 656         /*
 657          * Check level-1 blocks.
 658          */
 659         if (dn->dn_nlevels > 1) {
 660                 int shift = dn->dn_datablkshift + dn->dn_indblkshift -
 661                     SPA_BLKPTRSHIFT;
 662                 uint64_t start = off >> shift;
 663                 uint64_t end = (off + len) >> shift;
 664                 uint64_t i;
 665
 666                 ASSERT(dn->dn_indblkshift != 0);
 667
 668                 /*
 669                  * dnode_reallocate() can result in an object with indirect
 670                  * blocks having an odd data block size.  In this case,
 671                  * just check the single block.
 672                  */
 673                 if (dn->dn_datablkshift == 0)
 674                         start = end = 0;
 675
 676                 zio = zio_root(tx->tx_pool->dp_spa,
 677                     NULL, NULL, ZIO_FLAG_CANFAIL);
 678                 for (i = start; i <= end; i++) {
 679                         uint64_t ibyte = i << shift;
 680                         err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
 681                         i = ibyte >> shift;
 682                         if (err == ESRCH || i > end)
 683                                 break;
 684                         if (err) {
 685                                 tx->tx_err = err;
 686                                 return;
 687                         }
 688
 689                         err = dmu_tx_check_ioerr(zio, dn, 1, i);
 690                         if (err) {
 691                                 tx->tx_err = err;
 692                                 return;
 693                         }
 694                 }
 695                 err = zio_wait(zio);
 696                 if (err) {
 697                         tx->tx_err = err;
 698                         return;
 699                 }
 700         }
 701
 702         dmu_tx_count_free(txh, off, len);
 703 }
 704
 705 void
 706 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
 707 {
 708         dmu_tx_hold_t *txh;
 709         dnode_t *dn;
 710         dsl_dataset_phys_t *ds_phys;
 711         uint64_t nblocks;
 712         int epbs, err;
 713
 714         ASSERT(tx->tx_txg == 0);
 715
 716         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 717             object, THT_ZAP, add, (uintptr_t)name);
 718         if (txh == NULL)
 719                 return;
 720         dn = txh->txh_dnode;
 721
 722         dmu_tx_count_dnode(txh);
 723
 724         if (dn == NULL) {
 725                 /*
 726                  * We will be able to fit a new object's entries into one leaf
 727                  * block.  So there will be at most 2 blocks total,
 728                  * including the header block.
 729                  */
 730                 dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift);
 731                 return;
 732         }
 733
 734         ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
 735
 736         if (dn->dn_maxblkid == 0 && !add) {
 737                 blkptr_t *bp;
 738
 739                 /*
 740                  * If there is only one block  (i.e. this is a micro-zap)
 741                  * and we are not adding anything, the accounting is simple.
 742                  */
 743                 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
 744                 if (err) {
 745                         tx->tx_err = err;
 746                         return;
 747                 }
 748
 749                 /*
 750                  * Use max block size here, since we don't know how much
 751                  * the size will change between now and the dbuf dirty call.
 752                  */
 753                 bp = &dn->dn_phys->dn_blkptr[0];
 754                 if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
 755                     bp, bp->blk_birth))
 756                         txh->txh_space_tooverwrite += MZAP_MAX_BLKSZ;
 757                 else
 758                         txh->txh_space_towrite += MZAP_MAX_BLKSZ;
 759                 if (!BP_IS_HOLE(bp))
 760                         txh->txh_space_tounref += MZAP_MAX_BLKSZ;
 761                 return;
 762         }
 763
 764         if (dn->dn_maxblkid > 0 && name) {
 765                 /*
 766                  * access the name in this fat-zap so that we'll check
 767                  * for i/o errors to the leaf blocks, etc.
 768                  */
 769                 err = zap_lookup(dn->dn_objset, dn->dn_object, name,
 770                     8, 0, NULL);
 771                 if (err == EIO) {
 772                         tx->tx_err = err;
 773                         return;
 774                 }
 775         }
 776
 777         err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
 778             &txh->txh_space_towrite, &txh->txh_space_tooverwrite);
 779
 780         /*
 781          * If the modified blocks are scattered to the four winds,
 782          * we'll have to modify an indirect twig for each.
 783          */
 784         epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 785         ds_phys = dsl_dataset_phys(dn->dn_objset->os_dsl_dataset);
 786         for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
 787                 if (ds_phys->ds_prev_snap_obj)
 788                         txh->txh_space_towrite += 3 << dn->dn_indblkshift;
 789                 else
 790                         txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
 791 }
 792
 793 void
 794 dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
 795 {
 796         dmu_tx_hold_t *txh;
 797
 798         ASSERT(tx->tx_txg == 0);
 799
 800         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 801             object, THT_BONUS, 0, 0);
 802         if (txh)
 803                 dmu_tx_count_dnode(txh);
 804 }
 805
 806 void
 807 dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
 808 {
 809         dmu_tx_hold_t *txh;
 810
 811         ASSERT(tx->tx_txg == 0);
 812
 813         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
 814             DMU_NEW_OBJECT, THT_SPACE, space, 0);
 815         if (txh)
 816                 txh->txh_space_towrite += space;
 817 }
 818
 819 int
 820 dmu_tx_holds(dmu_tx_t *tx, uint64_t object)
 821 {
 822         dmu_tx_hold_t *txh;
 823         int holds = 0;
 824
 825         /*
 826          * By asserting that the tx is assigned, we're counting the
 827          * number of dn_tx_holds, which is the same as the number of
 828          * dn_holds.  Otherwise, we'd be counting dn_holds, but
 829          * dn_tx_holds could be 0.
 830          */
 831         ASSERT(tx->tx_txg != 0);
 832
 833         /* if (tx->tx_anyobj == TRUE) */
 834                 /* return (0); */
 835
 836         for (txh = list_head(&tx->tx_holds); txh;
 837             txh = list_next(&tx->tx_holds, txh)) {
 838                 if (txh->txh_dnode && txh->txh_dnode->dn_object == object)
 839                         holds++;
 840         }
 841
 842         return (holds);
 843 }
 844
 845 #ifdef DEBUG_DMU_TX
 846 void
 847 dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
 848 {
 849         dmu_tx_hold_t *txh;
 850         int match_object = FALSE, match_offset = FALSE;
 851         dnode_t *dn;
 852
 853         DB_DNODE_ENTER(db);
 854         dn = DB_DNODE(db);
 855         ASSERT(dn != NULL);
 856         ASSERT(tx->tx_txg != 0);
 857         ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
 858         ASSERT3U(dn->dn_object, ==, db->db.db_object);
 859
 860         if (tx->tx_anyobj) {
 861                 DB_DNODE_EXIT(db);
 862                 return;
 863         }
 864
 865         /* XXX No checking on the meta dnode for now */
 866         if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 867                 DB_DNODE_EXIT(db);
 868                 return;
 869         }
 870
 871         for (txh = list_head(&tx->tx_holds); txh;
 872             txh = list_next(&tx->tx_holds, txh)) {
 873                 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
 874                 if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
 875                         match_object = TRUE;
 876                 if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
 877                         int datablkshift = dn->dn_datablkshift ?
 878                             dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
 879                         int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 880                         int shift = datablkshift + epbs * db->db_level;
 881                         uint64_t beginblk = shift >= 64 ? 0 :
 882                             (txh->txh_arg1 >> shift);
 883                         uint64_t endblk = shift >= 64 ? 0 :
 884                             ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
 885                         uint64_t blkid = db->db_blkid;
 886
 887                         /* XXX txh_arg2 better not be zero... */
 888
 889                         dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
 890                             txh->txh_type, beginblk, endblk);
 891
 892                         switch (txh->txh_type) {
 893                         case THT_WRITE:
 894                                 if (blkid >= beginblk && blkid <= endblk)
 895                                         match_offset = TRUE;
 896                                 /*
 897                                  * We will let this hold work for the bonus
 898                                  * or spill buffer so that we don't need to
 899                                  * hold it when creating a new object.
 900                                  */
 901                                 if (blkid == DMU_BONUS_BLKID ||
 902                                     blkid == DMU_SPILL_BLKID)
 903                                         match_offset = TRUE;
 904                                 /*
 905                                  * They might have to increase nlevels,
 906                                  * thus dirtying the new TLIBs.  Or the
 907                                  * might have to change the block size,
 908                                  * thus dirying the new lvl=0 blk=0.
 909                                  */
 910                                 if (blkid == 0)
 911                                         match_offset = TRUE;
 912                                 break;
 913                         case THT_FREE:
 914                                 /*
 915                                  * We will dirty all the level 1 blocks in
 916                                  * the free range and perhaps the first and
 917                                  * last level 0 block.
 918                                  */
 919                                 if (blkid >= beginblk && (blkid <= endblk ||
 920                                     txh->txh_arg2 == DMU_OBJECT_END))
 921                                         match_offset = TRUE;
 922                                 break;
 923                         case THT_SPILL:
 924                                 if (blkid == DMU_SPILL_BLKID)
 925                                         match_offset = TRUE;
 926                                 break;
 927                         case THT_BONUS:
 928                                 if (blkid == DMU_BONUS_BLKID)
 929                                         match_offset = TRUE;
 930                                 break;
 931                         case THT_ZAP:
 932                                 match_offset = TRUE;
 933                                 break;
 934                         case THT_NEWOBJECT:
 935                                 match_object = TRUE;
 936                                 break;
 937                         default:
 938                                 cmn_err(CE_PANIC, "bad txh_type %d",
 939                                     txh->txh_type);
 940                         }
 941                 }
 942                 if (match_object && match_offset) {
 943                         DB_DNODE_EXIT(db);
 944                         return;
 945                 }
 946         }
 947         DB_DNODE_EXIT(db);
 948         panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
 949             (u_longlong_t)db->db.db_object, db->db_level,
 950             (u_longlong_t)db->db_blkid);
 951 }
 952 #endif
 953
 954 /*
 955  * If we can't do 10 iops, something is wrong.  Let us go ahead
 956  * and hit zfs_dirty_data_max.
 957  */
 958 hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */
 959 int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
 960
 961 /*
 962  * We delay transactions when we've determined that the backend storage
 963  * isn't able to accommodate the rate of incoming writes.
 964  *
 965  * If there is already a transaction waiting, we delay relative to when
 966  * that transaction finishes waiting.  This way the calculated min_time
 967  * is independent of the number of threads concurrently executing
 968  * transactions.
 969  *
 970  * If we are the only waiter, wait relative to when the transaction
 971  * started, rather than the current time.  This credits the transaction for
 972  * "time already served", e.g. reading indirect blocks.
 973  *
 974  * The minimum time for a transaction to take is calculated as:
 975  *     min_time = scale * (dirty - min) / (max - dirty)
 976  *     min_time is then capped at zfs_delay_max_ns.
 977  *
 978  * The delay has two degrees of freedom that can be adjusted via tunables.
 979  * The percentage of dirty data at which we start to delay is defined by
 980  * zfs_delay_min_dirty_percent. This should typically be at or above
 981  * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
 982  * delay after writing at full speed has failed to keep up with the incoming
 983  * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
 984  * speaking, this variable determines the amount of delay at the midpoint of
 985  * the curve.
 986  *
 987  * delay
 988  *  10ms +-------------------------------------------------------------*+
 989  *       |                                                             *|
 990  *   9ms +                                                             *+
 991  *       |                                                             *|
 992  *   8ms +                                                             *+
 993  *       |                                                            * |
 994  *   7ms +                                                            * +
 995  *       |                                                            * |
 996  *   6ms +                                                            * +
 997  *       |                                                            * |
 998  *   5ms +                                                           *  +
 999  *       |                                                           *  |
1000  *   4ms +                                                           *  +
1001  *       |                                                           *  |
1002  *   3ms +                                                          *   +
1003  *       |                                                          *   |
1004  *   2ms +                                              (midpoint) *    +
1005  *       |                                                  |    **     |
1006  *   1ms +                                                  v ***       +
1007  *       |             zfs_delay_scale ---------->     ********         |
1008  *     0 +-------------------------------------*********----------------+
1009  *       0%                    <- zfs_dirty_data_max ->               100%
1010  *
1011  * Note that since the delay is added to the outstanding time remaining on the
1012  * most recent transaction, the delay is effectively the inverse of IOPS.
1013  * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
1014  * was chosen such that small changes in the amount of accumulated dirty data
1015  * in the first 3/4 of the curve yield relatively small differences in the
1016  * amount of delay.
1017  *
1018  * The effects can be easier to understand when the amount of delay is
1019  * represented on a log scale:
1020  *
1021  * delay
1022  * 100ms +-------------------------------------------------------------++
1023  *       +                                                              +
1024  *       |                                                              |
1025  *       +                                                             *+
1026  *  10ms +                                                             *+
1027  *       +                                                           ** +
1028  *       |                                              (midpoint)  **  |
1029  *       +                                                  |     **    +
1030  *   1ms +                                                  v ****      +
1031  *       +             zfs_delay_scale ---------->        *****         +
1032  *       |                                             ****             |
1033  *       +                                          ****                +
1034  * 100us +                                        **                    +
1035  *       +                                       *                      +
1036  *       |                                      *                       |
1037  *       +                                     *                        +
1038  *  10us +                                     *                        +
1039  *       +                                                              +
1040  *       |                                                              |
1041  *       +                                                              +
1042  *       +--------------------------------------------------------------+
1043  *       0%                    <- zfs_dirty_data_max ->               100%
1044  *
1045  * Note here that only as the amount of dirty data approaches its limit does
1046  * the delay start to increase rapidly. The goal of a properly tuned system
1047  * should be to keep the amount of dirty data out of that range by first
1048  * ensuring that the appropriate limits are set for the I/O scheduler to reach
1049  * optimal throughput on the backend storage, and then by changing the value
1050  * of zfs_delay_scale to increase the steepness of the curve.
1051  */
1052 static void
1053 dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
1054 {
1055         dsl_pool_t *dp = tx->tx_pool;
1056         uint64_t delay_min_bytes =
1057             zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
1058         hrtime_t wakeup, min_tx_time, now;
1059
1060         if (dirty <= delay_min_bytes)
1061                 return;
1062
1063         /*
1064          * The caller has already waited until we are under the max.
1065          * We make them pass us the amount of dirty data so we don't
1066          * have to handle the case of it being >= the max, which could
1067          * cause a divide-by-zero if it's == the max.
1068          */
1069         ASSERT3U(dirty, <, zfs_dirty_data_max);
1070
1071         now = gethrtime();
1072         min_tx_time = zfs_delay_scale *
1073             (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
1074         min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
1075         if (now > tx->tx_start + min_tx_time)
1076                 return;
1077
1078         DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
1079             uint64_t, min_tx_time);
1080
1081         mutex_enter(&dp->dp_lock);
1082         wakeup = MAX(tx->tx_start + min_tx_time,
1083             dp->dp_last_wakeup + min_tx_time);
1084         dp->dp_last_wakeup = wakeup;
1085         mutex_exit(&dp->dp_lock);
1086
1087         zfs_sleep_until(wakeup);
1088 }
1089
1090 static int
1091 dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
1092 {
1093         dmu_tx_hold_t *txh;
1094         spa_t *spa = tx->tx_pool->dp_spa;
1095         uint64_t memory, asize, fsize, usize;
1096         uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge;
1097
1098         ASSERT0(tx->tx_txg);
1099
1100         if (tx->tx_err) {
1101                 DMU_TX_STAT_BUMP(dmu_tx_error);
1102                 return (tx->tx_err);
1103         }
1104
1105         if (spa_suspended(spa)) {
1106                 DMU_TX_STAT_BUMP(dmu_tx_suspended);
1107
1108                 /*
1109                  * If the user has indicated a blocking failure mode
1110                  * then return ERESTART which will block in dmu_tx_wait().
1111                  * Otherwise, return EIO so that an error can get
1112                  * propagated back to the VOP calls.
1113                  *
1114                  * Note that we always honor the txg_how flag regardless
1115                  * of the failuremode setting.
1116                  */
1117                 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
1118                     txg_how != TXG_WAIT)
1119                         return (SET_ERROR(EIO));
1120
1121                 return (SET_ERROR(ERESTART));
1122         }
1123
1124         if (!tx->tx_waited &&
1125             dsl_pool_need_dirty_delay(tx->tx_pool)) {
1126                 tx->tx_wait_dirty = B_TRUE;
1127                 DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
1128                 return (ERESTART);
1129         }
1130
1131         tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
1132         tx->tx_needassign_txh = NULL;
1133
1134         /*
1135          * NB: No error returns are allowed after txg_hold_open, but
1136          * before processing the dnode holds, due to the
1137          * dmu_tx_unassign() logic.
1138          */
1139
1140         towrite = tofree = tooverwrite = tounref = tohold = fudge = 0;
1141         for (txh = list_head(&tx->tx_holds); txh;
1142             txh = list_next(&tx->tx_holds, txh)) {
1143                 dnode_t *dn = txh->txh_dnode;
1144                 if (dn != NULL) {
1145                         mutex_enter(&dn->dn_mtx);
1146                         if (dn->dn_assigned_txg == tx->tx_txg - 1) {
1147                                 mutex_exit(&dn->dn_mtx);
1148                                 tx->tx_needassign_txh = txh;
1149                                 DMU_TX_STAT_BUMP(dmu_tx_group);
1150                                 return (SET_ERROR(ERESTART));
1151                         }
1152                         if (dn->dn_assigned_txg == 0)
1153                                 dn->dn_assigned_txg = tx->tx_txg;
1154                         ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1155                         (void) refcount_add(&dn->dn_tx_holds, tx);
1156                         mutex_exit(&dn->dn_mtx);
1157                 }
1158                 towrite += txh->txh_space_towrite;
1159                 tofree += txh->txh_space_tofree;
1160                 tooverwrite += txh->txh_space_tooverwrite;
1161                 tounref += txh->txh_space_tounref;
1162                 tohold += txh->txh_memory_tohold;
1163                 fudge += txh->txh_fudge;
1164         }
1165
1166         /*
1167          * If a snapshot has been taken since we made our estimates,
1168          * assume that we won't be able to free or overwrite anything.
1169          */
1170         if (tx->tx_objset &&
1171             dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) >
1172             tx->tx_lastsnap_txg) {
1173                 towrite += tooverwrite;
1174                 tooverwrite = tofree = 0;
1175         }
1176
1177         /* needed allocation: worst-case estimate of write space */
1178         asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite);
1179         /* freed space estimate: worst-case overwrite + free estimate */
1180         fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree;
1181         /* convert unrefd space to worst-case estimate */
1182         usize = spa_get_asize(tx->tx_pool->dp_spa, tounref);
1183         /* calculate memory footprint estimate */
1184         memory = towrite + tooverwrite + tohold;
1185
1186 #ifdef DEBUG_DMU_TX
1187         /*
1188          * Add in 'tohold' to account for our dirty holds on this memory
1189          * XXX - the "fudge" factor is to account for skipped blocks that
1190          * we missed because dnode_next_offset() misses in-core-only blocks.
1191          */
1192         tx->tx_space_towrite = asize +
1193             spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge);
1194         tx->tx_space_tofree = tofree;
1195         tx->tx_space_tooverwrite = tooverwrite;
1196         tx->tx_space_tounref = tounref;
1197 #endif
1198
1199         if (tx->tx_dir && asize != 0) {
1200                 int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
1201                     asize, fsize, usize, &tx->tx_tempreserve_cookie, tx);
1202                 if (err)
1203                         return (err);
1204         }
1205
1206         DMU_TX_STAT_BUMP(dmu_tx_assigned);
1207
1208         return (0);
1209 }
1210
1211 static void
1212 dmu_tx_unassign(dmu_tx_t *tx)
1213 {
1214         dmu_tx_hold_t *txh;
1215
1216         if (tx->tx_txg == 0)
1217                 return;
1218
1219         txg_rele_to_quiesce(&tx->tx_txgh);
1220
1221         /*
1222          * Walk the transaction's hold list, removing the hold on the
1223          * associated dnode, and notifying waiters if the refcount drops to 0.
1224          */
1225         for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
1226             txh = list_next(&tx->tx_holds, txh)) {
1227                 dnode_t *dn = txh->txh_dnode;
1228
1229                 if (dn == NULL)
1230                         continue;
1231                 mutex_enter(&dn->dn_mtx);
1232                 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1233
1234                 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1235                         dn->dn_assigned_txg = 0;
1236                         cv_broadcast(&dn->dn_notxholds);
1237                 }
1238                 mutex_exit(&dn->dn_mtx);
1239         }
1240
1241         txg_rele_to_sync(&tx->tx_txgh);
1242
1243         tx->tx_lasttried_txg = tx->tx_txg;
1244         tx->tx_txg = 0;
1245 }
1246
1247 /*
1248  * Assign tx to a transaction group.  txg_how can be one of:
1249  *
1250  * (1)  TXG_WAIT.  If the current open txg is full, waits until there's
1251  *      a new one.  This should be used when you're not holding locks.
1252  *      It will only fail if we're truly out of space (or over quota).
1253  *
1254  * (2)  TXG_NOWAIT.  If we can't assign into the current open txg without
1255  *      blocking, returns immediately with ERESTART.  This should be used
1256  *      whenever you're holding locks.  On an ERESTART error, the caller
1257  *      should drop locks, do a dmu_tx_wait(tx), and try again.
1258  *
1259  * (3)  TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
1260  *      has already been called on behalf of this operation (though
1261  *      most likely on a different tx).
1262  */
1263 int
1264 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
1265 {
1266         int err;
1267
1268         ASSERT(tx->tx_txg == 0);
1269         ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
1270             txg_how == TXG_WAITED);
1271         ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1272
1273         if (txg_how == TXG_WAITED)
1274                 tx->tx_waited = B_TRUE;
1275
1276         /* If we might wait, we must not hold the config lock. */
1277         ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1278
1279         while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1280                 dmu_tx_unassign(tx);
1281
1282                 if (err != ERESTART || txg_how != TXG_WAIT)
1283                         return (err);
1284
1285                 dmu_tx_wait(tx);
1286         }
1287
1288         txg_rele_to_quiesce(&tx->tx_txgh);
1289
1290         return (0);
1291 }
1292
1293 void
1294 dmu_tx_wait(dmu_tx_t *tx)
1295 {
1296         spa_t *spa = tx->tx_pool->dp_spa;
1297         dsl_pool_t *dp = tx->tx_pool;
1298         hrtime_t before;
1299
1300         ASSERT(tx->tx_txg == 0);
1301         ASSERT(!dsl_pool_config_held(tx->tx_pool));
1302
1303         before = gethrtime();
1304
1305         if (tx->tx_wait_dirty) {
1306                 uint64_t dirty;
1307
1308                 /*
1309                  * dmu_tx_try_assign() has determined that we need to wait
1310                  * because we've consumed much or all of the dirty buffer
1311                  * space.
1312                  */
1313                 mutex_enter(&dp->dp_lock);
1314                 if (dp->dp_dirty_total >= zfs_dirty_data_max)
1315                         DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
1316                 while (dp->dp_dirty_total >= zfs_dirty_data_max)
1317                         cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
1318                 dirty = dp->dp_dirty_total;
1319                 mutex_exit(&dp->dp_lock);
1320
1321                 dmu_tx_delay(tx, dirty);
1322
1323                 tx->tx_wait_dirty = B_FALSE;
1324
1325                 /*
1326                  * Note: setting tx_waited only has effect if the caller
1327                  * used TX_WAIT.  Otherwise they are going to destroy
1328                  * this tx and try again.  The common case, zfs_write(),
1329                  * uses TX_WAIT.
1330                  */
1331                 tx->tx_waited = B_TRUE;
1332         } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1333                 /*
1334                  * If the pool is suspended we need to wait until it
1335                  * is resumed.  Note that it's possible that the pool
1336                  * has become active after this thread has tried to
1337                  * obtain a tx.  If that's the case then tx_lasttried_txg
1338                  * would not have been set.
1339                  */
1340                 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1341         } else if (tx->tx_needassign_txh) {
1342                 dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1343
1344                 mutex_enter(&dn->dn_mtx);
1345                 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1346                         cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1347                 mutex_exit(&dn->dn_mtx);
1348                 tx->tx_needassign_txh = NULL;
1349         } else {
1350                 /*
1351                  * A dnode is assigned to the quiescing txg.  Wait for its
1352                  * transaction to complete.
1353                  */
1354                 txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1355         }
1356
1357         spa_tx_assign_add_nsecs(spa, gethrtime() - before);
1358 }
1359
1360 void
1361 dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta)
1362 {
1363 #ifdef DEBUG_DMU_TX
1364         if (tx->tx_dir == NULL || delta == 0)
1365                 return;
1366
1367         if (delta > 0) {
1368                 ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=,
1369                     tx->tx_space_towrite);
1370                 (void) refcount_add_many(&tx->tx_space_written, delta, NULL);
1371         } else {
1372                 (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL);
1373         }
1374 #endif
1375 }
1376
1377 void
1378 dmu_tx_commit(dmu_tx_t *tx)
1379 {
1380         dmu_tx_hold_t *txh;
1381
1382         ASSERT(tx->tx_txg != 0);
1383
1384         /*
1385          * Go through the transaction's hold list and remove holds on
1386          * associated dnodes, notifying waiters if no holds remain.
1387          */
1388         while ((txh = list_head(&tx->tx_holds))) {
1389                 dnode_t *dn = txh->txh_dnode;
1390
1391                 list_remove(&tx->tx_holds, txh);
1392                 kmem_free(txh, sizeof (dmu_tx_hold_t));
1393                 if (dn == NULL)
1394                         continue;
1395                 mutex_enter(&dn->dn_mtx);
1396                 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1397
1398                 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1399                         dn->dn_assigned_txg = 0;
1400                         cv_broadcast(&dn->dn_notxholds);
1401                 }
1402                 mutex_exit(&dn->dn_mtx);
1403                 dnode_rele(dn, tx);
1404         }
1405
1406         if (tx->tx_tempreserve_cookie)
1407                 dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1408
1409         if (!list_is_empty(&tx->tx_callbacks))
1410                 txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1411
1412         if (tx->tx_anyobj == FALSE)
1413                 txg_rele_to_sync(&tx->tx_txgh);
1414
1415         list_destroy(&tx->tx_callbacks);
1416         list_destroy(&tx->tx_holds);
1417 #ifdef DEBUG_DMU_TX
1418         dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
1419             tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
1420             tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
1421         refcount_destroy_many(&tx->tx_space_written,
1422             refcount_count(&tx->tx_space_written));
1423         refcount_destroy_many(&tx->tx_space_freed,
1424             refcount_count(&tx->tx_space_freed));
1425 #endif
1426         kmem_free(tx, sizeof (dmu_tx_t));
1427 }
1428
1429 void
1430 dmu_tx_abort(dmu_tx_t *tx)
1431 {
1432         dmu_tx_hold_t *txh;
1433
1434         ASSERT(tx->tx_txg == 0);
1435
1436         while ((txh = list_head(&tx->tx_holds))) {
1437                 dnode_t *dn = txh->txh_dnode;
1438
1439                 list_remove(&tx->tx_holds, txh);
1440                 kmem_free(txh, sizeof (dmu_tx_hold_t));
1441                 if (dn != NULL)
1442                         dnode_rele(dn, tx);
1443         }
1444
1445         /*
1446          * Call any registered callbacks with an error code.
1447          */
1448         if (!list_is_empty(&tx->tx_callbacks))
1449                 dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1450
1451         list_destroy(&tx->tx_callbacks);
1452         list_destroy(&tx->tx_holds);
1453 #ifdef DEBUG_DMU_TX
1454         refcount_destroy_many(&tx->tx_space_written,
1455             refcount_count(&tx->tx_space_written));
1456         refcount_destroy_many(&tx->tx_space_freed,
1457             refcount_count(&tx->tx_space_freed));
1458 #endif
1459         kmem_free(tx, sizeof (dmu_tx_t));
1460 }
1461
1462 uint64_t
1463 dmu_tx_get_txg(dmu_tx_t *tx)
1464 {
1465         ASSERT(tx->tx_txg != 0);
1466         return (tx->tx_txg);
1467 }
1468
1469 dsl_pool_t *
1470 dmu_tx_pool(dmu_tx_t *tx)
1471 {
1472         ASSERT(tx->tx_pool != NULL);
1473         return (tx->tx_pool);
1474 }
1475
1476 void
1477 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1478 {
1479         dmu_tx_callback_t *dcb;
1480
1481         dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1482
1483         dcb->dcb_func = func;
1484         dcb->dcb_data = data;
1485
1486         list_insert_tail(&tx->tx_callbacks, dcb);
1487 }
1488
1489 /*
1490  * Call all the commit callbacks on a list, with a given error code.
1491  */
1492 void
1493 dmu_tx_do_callbacks(list_t *cb_list, int error)
1494 {
1495         dmu_tx_callback_t *dcb;
1496
1497         while ((dcb = list_head(cb_list))) {
1498                 list_remove(cb_list, dcb);
1499                 dcb->dcb_func(dcb->dcb_data, error);
1500                 kmem_free(dcb, sizeof (dmu_tx_callback_t));
1501         }
1502 }
1503
1504 /*
1505  * Interface to hold a bunch of attributes.
1506  * used for creating new files.
1507  * attrsize is the total size of all attributes
1508  * to be added during object creation
1509  *
1510  * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
1511  */
1512
1513 /*
1514  * hold necessary attribute name for attribute registration.
1515  * should be a very rare case where this is needed.  If it does
1516  * happen it would only happen on the first write to the file system.
1517  */
1518 static void
1519 dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1520 {
1521         int i;
1522
1523         if (!sa->sa_need_attr_registration)
1524                 return;
1525
1526         for (i = 0; i != sa->sa_num_attrs; i++) {
1527                 if (!sa->sa_attr_table[i].sa_registered) {
1528                         if (sa->sa_reg_attr_obj)
1529                                 dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1530                                     B_TRUE, sa->sa_attr_table[i].sa_name);
1531                         else
1532                                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1533                                     B_TRUE, sa->sa_attr_table[i].sa_name);
1534                 }
1535         }
1536 }
1537
1538
1539 void
1540 dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1541 {
1542         dnode_t *dn;
1543         dmu_tx_hold_t *txh;
1544
1545         txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
1546             THT_SPILL, 0, 0);
1547         if (txh == NULL)
1548                 return;
1549
1550         dn = txh->txh_dnode;
1551
1552         if (dn == NULL)
1553                 return;
1554
1555         /* If blkptr doesn't exist then add space to towrite */
1556         if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
1557                 txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
1558         } else {
1559                 blkptr_t *bp;
1560
1561                 bp = &dn->dn_phys->dn_spill;
1562                 if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
1563                     bp, bp->blk_birth))
1564                         txh->txh_space_tooverwrite += SPA_OLD_MAXBLOCKSIZE;
1565                 else
1566                         txh->txh_space_towrite += SPA_OLD_MAXBLOCKSIZE;
1567                 if (!BP_IS_HOLE(bp))
1568                         txh->txh_space_tounref += SPA_OLD_MAXBLOCKSIZE;
1569         }
1570 }
1571
1572 void
1573 dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1574 {
1575         sa_os_t *sa = tx->tx_objset->os_sa;
1576
1577         dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1578
1579         if (tx->tx_objset->os_sa->sa_master_obj == 0)
1580                 return;
1581
1582         if (tx->tx_objset->os_sa->sa_layout_attr_obj)
1583                 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1584         else {
1585                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1586                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1587                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1588                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1589         }
1590
1591         dmu_tx_sa_registration_hold(sa, tx);
1592
1593         if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1594                 return;
1595
1596         (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1597             THT_SPILL, 0, 0);
1598 }
1599
1600 /*
1601  * Hold SA attribute
1602  *
1603  * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
1604  *
1605  * variable_size is the total size of all variable sized attributes
1606  * passed to this function.  It is not the total size of all
1607  * variable size attributes that *may* exist on this object.
1608  */
1609 void
1610 dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1611 {
1612         uint64_t object;
1613         sa_os_t *sa = tx->tx_objset->os_sa;
1614
1615         ASSERT(hdl != NULL);
1616
1617         object = sa_handle_object(hdl);
1618
1619         dmu_tx_hold_bonus(tx, object);
1620
1621         if (tx->tx_objset->os_sa->sa_master_obj == 0)
1622                 return;
1623
1624         if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1625             tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1626                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1627                 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1628                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1629                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1630         }
1631
1632         dmu_tx_sa_registration_hold(sa, tx);
1633
1634         if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1635                 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1636
1637         if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1638                 ASSERT(tx->tx_txg == 0);
1639                 dmu_tx_hold_spill(tx, object);
1640         } else {
1641                 dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1642                 dnode_t *dn;
1643
1644                 DB_DNODE_ENTER(db);
1645                 dn = DB_DNODE(db);
1646                 if (dn->dn_have_spill) {
1647                         ASSERT(tx->tx_txg == 0);
1648                         dmu_tx_hold_spill(tx, object);
1649                 }
1650                 DB_DNODE_EXIT(db);
1651         }
1652 }
1653
1654 void
1655 dmu_tx_init(void)
1656 {
1657         dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc",
1658             KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t),
1659             KSTAT_FLAG_VIRTUAL);
1660
1661         if (dmu_tx_ksp != NULL) {
1662                 dmu_tx_ksp->ks_data = &dmu_tx_stats;
1663                 kstat_install(dmu_tx_ksp);
1664         }
1665 }
1666
1667 void
1668 dmu_tx_fini(void)
1669 {
1670         if (dmu_tx_ksp != NULL) {
1671                 kstat_delete(dmu_tx_ksp);
1672                 dmu_tx_ksp = NULL;
1673         }
1674 }
1675
1676 #if defined(_KERNEL) && defined(HAVE_SPL)
1677 EXPORT_SYMBOL(dmu_tx_create);
1678 EXPORT_SYMBOL(dmu_tx_hold_write);
1679 EXPORT_SYMBOL(dmu_tx_hold_free);
1680 EXPORT_SYMBOL(dmu_tx_hold_zap);
1681 EXPORT_SYMBOL(dmu_tx_hold_bonus);
1682 EXPORT_SYMBOL(dmu_tx_abort);
1683 EXPORT_SYMBOL(dmu_tx_assign);
1684 EXPORT_SYMBOL(dmu_tx_wait);
1685 EXPORT_SYMBOL(dmu_tx_commit);
1686 EXPORT_SYMBOL(dmu_tx_get_txg);
1687 EXPORT_SYMBOL(dmu_tx_callback_register);
1688 EXPORT_SYMBOL(dmu_tx_do_callbacks);
1689 EXPORT_SYMBOL(dmu_tx_hold_spill);
1690 EXPORT_SYMBOL(dmu_tx_hold_sa_create);
1691 EXPORT_SYMBOL(dmu_tx_hold_sa);
1692 #endif