module/zfs/zvol.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
  23  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  24  * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
  25  * LLNL-CODE-403049.
  26  *
  27  * ZFS volume emulation driver.
  28  *
  29  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  30  * Volumes are accessed through the symbolic links named:
  31  *
  32  * /dev/<pool_name>/<dataset_name>
  33  *
  34  * Volumes are persistent through reboot and module load.  No user command
  35  * needs to be run before opening and using a device.
  36  *
  37  * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  38  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  39  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
  40  */
  41
  42 /*
  43  * Note on locking of zvol state structures.
  44  *
  45  * These structures are used to maintain internal state used to emulate block
  46  * devices on top of zvols. In particular, management of device minor number
  47  * operations - create, remove, rename, and set_snapdev - involves access to
  48  * these structures. The zvol_state_lock is primarily used to protect the
  49  * zvol_state_list. The zv->zv_state_lock is used to protect the contents
  50  * of the zvol_state_t structures, as well as to make sure that when the
  51  * time comes to remove the structure from the list, it is not in use, and
  52  * therefore, it can be taken off zvol_state_list and freed.
  53  *
  54  * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol,
  55  * e.g. for the duration of receive and rollback operations. This lock can be
  56  * held for significant periods of time. Given that it is undesirable to hold
  57  * mutexes for long periods of time, the following lock ordering applies:
  58  * - take zvol_state_lock if necessary, to protect zvol_state_list
  59  * - take zv_suspend_lock if necessary, by the code path in question
  60  * - take zv_state_lock to protect zvol_state_t
  61  *
  62  * The minor operations are issued to spa->spa_zvol_taskq queues, that are
  63  * single-threaded (to preserve order of minor operations), and are executed
  64  * through the zvol_task_cb that dispatches the specific operations. Therefore,
  65  * these operations are serialized per pool. Consequently, we can be certain
  66  * that for a given zvol, there is only one operation at a time in progress.
  67  * That is why one can be sure that first, zvol_state_t for a given zvol is
  68  * allocated and placed on zvol_state_list, and then other minor operations
  69  * for this zvol are going to proceed in the order of issue.
  70  *
  71  */
  72
  73 #include <sys/dataset_kstats.h>
  74 #include <sys/dbuf.h>
  75 #include <sys/dmu_traverse.h>
  76 #include <sys/dsl_dataset.h>
  77 #include <sys/dsl_prop.h>
  78 #include <sys/dsl_dir.h>
  79 #include <sys/zap.h>
  80 #include <sys/zfeature.h>
  81 #include <sys/zil_impl.h>
  82 #include <sys/dmu_tx.h>
  83 #include <sys/zio.h>
  84 #include <sys/zfs_rlock.h>
  85 #include <sys/spa_impl.h>
  86 #include <sys/zvol.h>
  87 #include <sys/zvol_impl.h>
  88
  89 unsigned int zvol_inhibit_dev = 0;
  90 unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
  91
  92 struct hlist_head *zvol_htable;
  93 list_t zvol_state_list;
  94 krwlock_t zvol_state_lock;
  95 const zvol_platform_ops_t *ops;
  96
  97 typedef enum {
  98         ZVOL_ASYNC_REMOVE_MINORS,
  99         ZVOL_ASYNC_RENAME_MINORS,
 100         ZVOL_ASYNC_SET_SNAPDEV,
 101         ZVOL_ASYNC_SET_VOLMODE,
 102         ZVOL_ASYNC_MAX
 103 } zvol_async_op_t;
 104
 105 typedef struct {
 106         zvol_async_op_t op;
 107         char name1[MAXNAMELEN];
 108         char name2[MAXNAMELEN];
 109         uint64_t value;
 110 } zvol_task_t;
 111
 112 uint64_t
 113 zvol_name_hash(const char *name)
 114 {
 115         int i;
 116         uint64_t crc = -1ULL;
 117         const uint8_t *p = (const uint8_t *)name;
 118         ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 119         for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) {
 120                 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF];
 121         }
 122         return (crc);
 123 }
 124
 125 /*
 126  * Find a zvol_state_t given the name and hash generated by zvol_name_hash.
 127  * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
 128  * return (NULL) without the taking locks. The zv_suspend_lock is always taken
 129  * before zv_state_lock. The mode argument indicates the mode (including none)
 130  * for zv_suspend_lock to be taken.
 131  */
 132 zvol_state_t *
 133 zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
 134 {
 135         zvol_state_t *zv;
 136         struct hlist_node *p = NULL;
 137
 138         rw_enter(&zvol_state_lock, RW_READER);
 139         hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
 140                 zv = hlist_entry(p, zvol_state_t, zv_hlink);
 141                 mutex_enter(&zv->zv_state_lock);
 142                 if (zv->zv_hash == hash &&
 143                     strncmp(zv->zv_name, name, MAXNAMELEN) == 0) {
 144                         /*
 145                          * this is the right zvol, take the locks in the
 146                          * right order
 147                          */
 148                         if (mode != RW_NONE &&
 149                             !rw_tryenter(&zv->zv_suspend_lock, mode)) {
 150                                 mutex_exit(&zv->zv_state_lock);
 151                                 rw_enter(&zv->zv_suspend_lock, mode);
 152                                 mutex_enter(&zv->zv_state_lock);
 153                                 /*
 154                                  * zvol cannot be renamed as we continue
 155                                  * to hold zvol_state_lock
 156                                  */
 157                                 ASSERT(zv->zv_hash == hash &&
 158                                     strncmp(zv->zv_name, name, MAXNAMELEN)
 159                                     == 0);
 160                         }
 161                         rw_exit(&zvol_state_lock);
 162                         return (zv);
 163                 }
 164                 mutex_exit(&zv->zv_state_lock);
 165         }
 166         rw_exit(&zvol_state_lock);
 167
 168         return (NULL);
 169 }
 170
 171 /*
 172  * Find a zvol_state_t given the name.
 173  * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
 174  * return (NULL) without the taking locks. The zv_suspend_lock is always taken
 175  * before zv_state_lock. The mode argument indicates the mode (including none)
 176  * for zv_suspend_lock to be taken.
 177  */
 178 static zvol_state_t *
 179 zvol_find_by_name(const char *name, int mode)
 180 {
 181         return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode));
 182 }
 183
 184 /*
 185  * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
 186  */
 187 void
 188 zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 189 {
 190         zfs_creat_t *zct = arg;
 191         nvlist_t *nvprops = zct->zct_props;
 192         int error;
 193         uint64_t volblocksize, volsize;
 194
 195         VERIFY(nvlist_lookup_uint64(nvprops,
 196             zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
 197         if (nvlist_lookup_uint64(nvprops,
 198             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
 199                 volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
 200
 201         /*
 202          * These properties must be removed from the list so the generic
 203          * property setting step won't apply to them.
 204          */
 205         VERIFY(nvlist_remove_all(nvprops,
 206             zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
 207         (void) nvlist_remove_all(nvprops,
 208             zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
 209
 210         error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
 211             DMU_OT_NONE, 0, tx);
 212         ASSERT(error == 0);
 213
 214         error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
 215             DMU_OT_NONE, 0, tx);
 216         ASSERT(error == 0);
 217
 218         error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
 219         ASSERT(error == 0);
 220 }
 221
 222 /*
 223  * ZFS_IOC_OBJSET_STATS entry point.
 224  */
 225 int
 226 zvol_get_stats(objset_t *os, nvlist_t *nv)
 227 {
 228         int error;
 229         dmu_object_info_t *doi;
 230         uint64_t val;
 231
 232         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
 233         if (error)
 234                 return (SET_ERROR(error));
 235
 236         dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
 237         doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
 238         error = dmu_object_info(os, ZVOL_OBJ, doi);
 239
 240         if (error == 0) {
 241                 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
 242                     doi->doi_data_block_size);
 243         }
 244
 245         kmem_free(doi, sizeof (dmu_object_info_t));
 246
 247         return (SET_ERROR(error));
 248 }
 249
 250 /*
 251  * Sanity check volume size.
 252  */
 253 int
 254 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
 255 {
 256         if (volsize == 0)
 257                 return (SET_ERROR(EINVAL));
 258
 259         if (volsize % blocksize != 0)
 260                 return (SET_ERROR(EINVAL));
 261
 262 #ifdef _ILP32
 263         if (volsize - 1 > SPEC_MAXOFFSET_T)
 264                 return (SET_ERROR(EOVERFLOW));
 265 #endif
 266         return (0);
 267 }
 268
 269 /*
 270  * Ensure the zap is flushed then inform the VFS of the capacity change.
 271  */
 272 static int
 273 zvol_update_volsize(uint64_t volsize, objset_t *os)
 274 {
 275         dmu_tx_t *tx;
 276         int error;
 277         uint64_t txg;
 278
 279         tx = dmu_tx_create(os);
 280         dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 281         dmu_tx_mark_netfree(tx);
 282         error = dmu_tx_assign(tx, TXG_WAIT);
 283         if (error) {
 284                 dmu_tx_abort(tx);
 285                 return (SET_ERROR(error));
 286         }
 287         txg = dmu_tx_get_txg(tx);
 288
 289         error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
 290             &volsize, tx);
 291         dmu_tx_commit(tx);
 292
 293         txg_wait_synced(dmu_objset_pool(os), txg);
 294
 295         if (error == 0)
 296                 error = dmu_free_long_range(os,
 297                     ZVOL_OBJ, volsize, DMU_OBJECT_END);
 298
 299         return (error);
 300 }
 301
 302 /*
 303  * Set ZFS_PROP_VOLSIZE set entry point.  Note that modifying the volume
 304  * size will result in a udev "change" event being generated.
 305  */
 306 int
 307 zvol_set_volsize(const char *name, uint64_t volsize)
 308 {
 309         objset_t *os = NULL;
 310         uint64_t readonly;
 311         int error;
 312         boolean_t owned = B_FALSE;
 313
 314         error = dsl_prop_get_integer(name,
 315             zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
 316         if (error != 0)
 317                 return (SET_ERROR(error));
 318         if (readonly)
 319                 return (SET_ERROR(EROFS));
 320
 321         zvol_state_t *zv = zvol_find_by_name(name, RW_READER);
 322
 323         ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
 324             RW_READ_HELD(&zv->zv_suspend_lock)));
 325
 326         if (zv == NULL || zv->zv_objset == NULL) {
 327                 if (zv != NULL)
 328                         rw_exit(&zv->zv_suspend_lock);
 329                 if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE,
 330                     FTAG, &os)) != 0) {
 331                         if (zv != NULL)
 332                                 mutex_exit(&zv->zv_state_lock);
 333                         return (SET_ERROR(error));
 334                 }
 335                 owned = B_TRUE;
 336                 if (zv != NULL)
 337                         zv->zv_objset = os;
 338         } else {
 339                 os = zv->zv_objset;
 340         }
 341
 342         dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP);
 343
 344         if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
 345             (error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
 346                 goto out;
 347
 348         error = zvol_update_volsize(volsize, os);
 349         if (error == 0 && zv != NULL) {
 350                 zv->zv_volsize = volsize;
 351                 zv->zv_changed = 1;
 352         }
 353 out:
 354         kmem_free(doi, sizeof (dmu_object_info_t));
 355
 356         if (owned) {
 357                 dmu_objset_disown(os, B_TRUE, FTAG);
 358                 if (zv != NULL)
 359                         zv->zv_objset = NULL;
 360         } else {
 361                 rw_exit(&zv->zv_suspend_lock);
 362         }
 363
 364         if (zv != NULL)
 365                 mutex_exit(&zv->zv_state_lock);
 366
 367         if (error == 0 && zv != NULL)
 368                 ops->zv_update_volsize(zv, volsize);
 369
 370         return (SET_ERROR(error));
 371 }
 372
 373 /*
 374  * Sanity check volume block size.
 375  */
 376 int
 377 zvol_check_volblocksize(const char *name, uint64_t volblocksize)
 378 {
 379         /* Record sizes above 128k need the feature to be enabled */
 380         if (volblocksize > SPA_OLD_MAXBLOCKSIZE) {
 381                 spa_t *spa;
 382                 int error;
 383
 384                 if ((error = spa_open(name, &spa, FTAG)) != 0)
 385                         return (error);
 386
 387                 if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
 388                         spa_close(spa, FTAG);
 389                         return (SET_ERROR(ENOTSUP));
 390                 }
 391
 392                 /*
 393                  * We don't allow setting the property above 1MB,
 394                  * unless the tunable has been changed.
 395                  */
 396                 if (volblocksize > zfs_max_recordsize)
 397                         return (SET_ERROR(EDOM));
 398
 399                 spa_close(spa, FTAG);
 400         }
 401
 402         if (volblocksize < SPA_MINBLOCKSIZE ||
 403             volblocksize > SPA_MAXBLOCKSIZE ||
 404             !ISP2(volblocksize))
 405                 return (SET_ERROR(EDOM));
 406
 407         return (0);
 408 }
 409
 410 /*
 411  * Set ZFS_PROP_VOLBLOCKSIZE set entry point.
 412  */
 413 int
 414 zvol_set_volblocksize(const char *name, uint64_t volblocksize)
 415 {
 416         zvol_state_t *zv;
 417         dmu_tx_t *tx;
 418         int error;
 419
 420         zv = zvol_find_by_name(name, RW_READER);
 421
 422         if (zv == NULL)
 423                 return (SET_ERROR(ENXIO));
 424
 425         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 426         ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 427
 428         if (zv->zv_flags & ZVOL_RDONLY) {
 429                 mutex_exit(&zv->zv_state_lock);
 430                 rw_exit(&zv->zv_suspend_lock);
 431                 return (SET_ERROR(EROFS));
 432         }
 433
 434         tx = dmu_tx_create(zv->zv_objset);
 435         dmu_tx_hold_bonus(tx, ZVOL_OBJ);
 436         error = dmu_tx_assign(tx, TXG_WAIT);
 437         if (error) {
 438                 dmu_tx_abort(tx);
 439         } else {
 440                 error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
 441                     volblocksize, 0, tx);
 442                 if (error == ENOTSUP)
 443                         error = SET_ERROR(EBUSY);
 444                 dmu_tx_commit(tx);
 445                 if (error == 0)
 446                         zv->zv_volblocksize = volblocksize;
 447         }
 448
 449         mutex_exit(&zv->zv_state_lock);
 450         rw_exit(&zv->zv_suspend_lock);
 451
 452         return (SET_ERROR(error));
 453 }
 454
 455 /*
 456  * Replay a TX_TRUNCATE ZIL transaction if asked.  TX_TRUNCATE is how we
 457  * implement DKIOCFREE/free-long-range.
 458  */
 459 static int
 460 zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
 461 {
 462         zvol_state_t *zv = arg1;
 463         lr_truncate_t *lr = arg2;
 464         uint64_t offset, length;
 465
 466         if (byteswap)
 467                 byteswap_uint64_array(lr, sizeof (*lr));
 468
 469         offset = lr->lr_offset;
 470         length = lr->lr_length;
 471
 472         dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
 473         dmu_tx_mark_netfree(tx);
 474         int error = dmu_tx_assign(tx, TXG_WAIT);
 475         if (error != 0) {
 476                 dmu_tx_abort(tx);
 477         } else {
 478                 zil_replaying(zv->zv_zilog, tx);
 479                 dmu_tx_commit(tx);
 480                 error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset,
 481                     length);
 482         }
 483
 484         return (error);
 485 }
 486
 487 /*
 488  * Replay a TX_WRITE ZIL transaction that didn't get committed
 489  * after a system failure
 490  */
 491 static int
 492 zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
 493 {
 494         zvol_state_t *zv = arg1;
 495         lr_write_t *lr = arg2;
 496         objset_t *os = zv->zv_objset;
 497         char *data = (char *)(lr + 1);  /* data follows lr_write_t */
 498         uint64_t offset, length;
 499         dmu_tx_t *tx;
 500         int error;
 501
 502         if (byteswap)
 503                 byteswap_uint64_array(lr, sizeof (*lr));
 504
 505         offset = lr->lr_offset;
 506         length = lr->lr_length;
 507
 508         /* If it's a dmu_sync() block, write the whole block */
 509         if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 510                 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
 511                 if (length < blocksize) {
 512                         offset -= offset % blocksize;
 513                         length = blocksize;
 514                 }
 515         }
 516
 517         tx = dmu_tx_create(os);
 518         dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
 519         error = dmu_tx_assign(tx, TXG_WAIT);
 520         if (error) {
 521                 dmu_tx_abort(tx);
 522         } else {
 523                 dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
 524                 zil_replaying(zv->zv_zilog, tx);
 525                 dmu_tx_commit(tx);
 526         }
 527
 528         return (error);
 529 }
 530
 531 static int
 532 zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
 533 {
 534         return (SET_ERROR(ENOTSUP));
 535 }
 536
 537 /*
 538  * Callback vectors for replaying records.
 539  * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
 540  */
 541 zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
 542         zvol_replay_err,        /* no such transaction type */
 543         zvol_replay_err,        /* TX_CREATE */
 544         zvol_replay_err,        /* TX_MKDIR */
 545         zvol_replay_err,        /* TX_MKXATTR */
 546         zvol_replay_err,        /* TX_SYMLINK */
 547         zvol_replay_err,        /* TX_REMOVE */
 548         zvol_replay_err,        /* TX_RMDIR */
 549         zvol_replay_err,        /* TX_LINK */
 550         zvol_replay_err,        /* TX_RENAME */
 551         zvol_replay_write,      /* TX_WRITE */
 552         zvol_replay_truncate,   /* TX_TRUNCATE */
 553         zvol_replay_err,        /* TX_SETATTR */
 554         zvol_replay_err,        /* TX_ACL */
 555         zvol_replay_err,        /* TX_CREATE_ATTR */
 556         zvol_replay_err,        /* TX_CREATE_ACL_ATTR */
 557         zvol_replay_err,        /* TX_MKDIR_ACL */
 558         zvol_replay_err,        /* TX_MKDIR_ATTR */
 559         zvol_replay_err,        /* TX_MKDIR_ACL_ATTR */
 560         zvol_replay_err,        /* TX_WRITE2 */
 561 };
 562
 563 /*
 564  * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
 565  *
 566  * We store data in the log buffers if it's small enough.
 567  * Otherwise we will later flush the data out via dmu_sync().
 568  */
 569 ssize_t zvol_immediate_write_sz = 32768;
 570
 571 void
 572 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
 573     uint64_t size, int sync)
 574 {
 575         uint32_t blocksize = zv->zv_volblocksize;
 576         zilog_t *zilog = zv->zv_zilog;
 577         itx_wr_state_t write_state;
 578         uint64_t sz = size;
 579
 580         if (zil_replaying(zilog, tx))
 581                 return;
 582
 583         if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
 584                 write_state = WR_INDIRECT;
 585         else if (!spa_has_slogs(zilog->zl_spa) &&
 586             size >= blocksize && blocksize > zvol_immediate_write_sz)
 587                 write_state = WR_INDIRECT;
 588         else if (sync)
 589                 write_state = WR_COPIED;
 590         else
 591                 write_state = WR_NEED_COPY;
 592
 593         while (size) {
 594                 itx_t *itx;
 595                 lr_write_t *lr;
 596                 itx_wr_state_t wr_state = write_state;
 597                 ssize_t len = size;
 598
 599                 if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog))
 600                         wr_state = WR_NEED_COPY;
 601                 else if (wr_state == WR_INDIRECT)
 602                         len = MIN(blocksize - P2PHASE(offset, blocksize), size);
 603
 604                 itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
 605                     (wr_state == WR_COPIED ? len : 0));
 606                 lr = (lr_write_t *)&itx->itx_lr;
 607                 if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn,
 608                     offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) {
 609                         zil_itx_destroy(itx);
 610                         itx = zil_itx_create(TX_WRITE, sizeof (*lr));
 611                         lr = (lr_write_t *)&itx->itx_lr;
 612                         wr_state = WR_NEED_COPY;
 613                 }
 614
 615                 itx->itx_wr_state = wr_state;
 616                 lr->lr_foid = ZVOL_OBJ;
 617                 lr->lr_offset = offset;
 618                 lr->lr_length = len;
 619                 lr->lr_blkoff = 0;
 620                 BP_ZERO(&lr->lr_blkptr);
 621
 622                 itx->itx_private = zv;
 623                 itx->itx_sync = sync;
 624
 625                 (void) zil_itx_assign(zilog, itx, tx);
 626
 627                 offset += len;
 628                 size -= len;
 629         }
 630
 631         if (write_state == WR_COPIED || write_state == WR_NEED_COPY) {
 632                 dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg);
 633         }
 634 }
 635
 636 /*
 637  * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
 638  */
 639 void
 640 zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
 641     boolean_t sync)
 642 {
 643         itx_t *itx;
 644         lr_truncate_t *lr;
 645         zilog_t *zilog = zv->zv_zilog;
 646
 647         if (zil_replaying(zilog, tx))
 648                 return;
 649
 650         itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
 651         lr = (lr_truncate_t *)&itx->itx_lr;
 652         lr->lr_foid = ZVOL_OBJ;
 653         lr->lr_offset = off;
 654         lr->lr_length = len;
 655
 656         itx->itx_sync = sync;
 657         zil_itx_assign(zilog, itx, tx);
 658 }
 659
 660
 661 /* ARGSUSED */
 662 static void
 663 zvol_get_done(zgd_t *zgd, int error)
 664 {
 665         if (zgd->zgd_db)
 666                 dmu_buf_rele(zgd->zgd_db, zgd);
 667
 668         zfs_rangelock_exit(zgd->zgd_lr);
 669
 670         kmem_free(zgd, sizeof (zgd_t));
 671 }
 672
 673 /*
 674  * Get data to generate a TX_WRITE intent log record.
 675  */
 676 int
 677 zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf,
 678     struct lwb *lwb, zio_t *zio)
 679 {
 680         zvol_state_t *zv = arg;
 681         uint64_t offset = lr->lr_offset;
 682         uint64_t size = lr->lr_length;
 683         dmu_buf_t *db;
 684         zgd_t *zgd;
 685         int error;
 686
 687         ASSERT3P(lwb, !=, NULL);
 688         ASSERT3P(zio, !=, NULL);
 689         ASSERT3U(size, !=, 0);
 690
 691         zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 692         zgd->zgd_lwb = lwb;
 693
 694         /*
 695          * Write records come in two flavors: immediate and indirect.
 696          * For small writes it's cheaper to store the data with the
 697          * log record (immediate); for large writes it's cheaper to
 698          * sync the data and get a pointer to it (indirect) so that
 699          * we don't have to write the data twice.
 700          */
 701         if (buf != NULL) { /* immediate write */
 702                 zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
 703                     size, RL_READER);
 704                 error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
 705                     DMU_READ_NO_PREFETCH);
 706         } else { /* indirect write */
 707                 /*
 708                  * Have to lock the whole block to ensure when it's written out
 709                  * and its checksum is being calculated that no one can change
 710                  * the data. Contrarily to zfs_get_data we need not re-check
 711                  * blocksize after we get the lock because it cannot be changed.
 712                  */
 713                 size = zv->zv_volblocksize;
 714                 offset = P2ALIGN_TYPED(offset, size, uint64_t);
 715                 zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
 716                     size, RL_READER);
 717                 error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
 718                     DMU_READ_NO_PREFETCH);
 719                 if (error == 0) {
 720                         blkptr_t *bp = &lr->lr_blkptr;
 721
 722                         zgd->zgd_db = db;
 723                         zgd->zgd_bp = bp;
 724
 725                         ASSERT(db != NULL);
 726                         ASSERT(db->db_offset == offset);
 727                         ASSERT(db->db_size == size);
 728
 729                         error = dmu_sync(zio, lr->lr_common.lrc_txg,
 730                             zvol_get_done, zgd);
 731
 732                         if (error == 0)
 733                                 return (0);
 734                 }
 735         }
 736
 737         zvol_get_done(zgd, error);
 738
 739         return (SET_ERROR(error));
 740 }
 741
 742 /*
 743  * The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
 744  */
 745
 746 void
 747 zvol_insert(zvol_state_t *zv)
 748 {
 749         ASSERT(RW_WRITE_HELD(&zvol_state_lock));
 750         list_insert_head(&zvol_state_list, zv);
 751         hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
 752 }
 753
 754 /*
 755  * Simply remove the zvol from to list of zvols.
 756  */
 757 static void
 758 zvol_remove(zvol_state_t *zv)
 759 {
 760         ASSERT(RW_WRITE_HELD(&zvol_state_lock));
 761         list_remove(&zvol_state_list, zv);
 762         hlist_del(&zv->zv_hlink);
 763 }
 764
 765 /*
 766  * Setup zv after we just own the zv->objset
 767  */
 768 static int
 769 zvol_setup_zv(zvol_state_t *zv)
 770 {
 771         uint64_t volsize;
 772         int error;
 773         uint64_t ro;
 774         objset_t *os = zv->zv_objset;
 775
 776         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 777         ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));
 778
 779         zv->zv_zilog = NULL;
 780         zv->zv_flags &= ~ZVOL_WRITTEN_TO;
 781
 782         error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
 783         if (error)
 784                 return (SET_ERROR(error));
 785
 786         error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 787         if (error)
 788                 return (SET_ERROR(error));
 789
 790         error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn);
 791         if (error)
 792                 return (SET_ERROR(error));
 793
 794         ops->zv_set_capacity(zv, volsize >> 9);
 795         zv->zv_volsize = volsize;
 796
 797         if (ro || dmu_objset_is_snapshot(os) ||
 798             !spa_writeable(dmu_objset_spa(os))) {
 799                 ops->zv_set_disk_ro(zv, 1);
 800                 zv->zv_flags |= ZVOL_RDONLY;
 801         } else {
 802                 ops->zv_set_disk_ro(zv, 0);
 803                 zv->zv_flags &= ~ZVOL_RDONLY;
 804         }
 805         return (0);
 806 }
 807
 808 /*
 809  * Shutdown every zv_objset related stuff except zv_objset itself.
 810  * The is the reverse of zvol_setup_zv.
 811  */
 812 static void
 813 zvol_shutdown_zv(zvol_state_t *zv)
 814 {
 815         ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
 816             RW_LOCK_HELD(&zv->zv_suspend_lock));
 817
 818         if (zv->zv_flags & ZVOL_WRITTEN_TO) {
 819                 ASSERT(zv->zv_zilog != NULL);
 820                 zil_close(zv->zv_zilog);
 821         }
 822
 823         zv->zv_zilog = NULL;
 824
 825         dnode_rele(zv->zv_dn, zv);
 826         zv->zv_dn = NULL;
 827
 828         /*
 829          * Evict cached data. We must write out any dirty data before
 830          * disowning the dataset.
 831          */
 832         if (zv->zv_flags & ZVOL_WRITTEN_TO)
 833                 txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 834         (void) dmu_objset_evict_dbufs(zv->zv_objset);
 835 }
 836
 837 /*
 838  * return the proper tag for rollback and recv
 839  */
 840 void *
 841 zvol_tag(zvol_state_t *zv)
 842 {
 843         ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
 844         return (zv->zv_open_count > 0 ? zv : NULL);
 845 }
 846
 847 /*
 848  * Suspend the zvol for recv and rollback.
 849  */
 850 zvol_state_t *
 851 zvol_suspend(const char *name)
 852 {
 853         zvol_state_t *zv;
 854
 855         zv = zvol_find_by_name(name, RW_WRITER);
 856
 857         if (zv == NULL)
 858                 return (NULL);
 859
 860         /* block all I/O, release in zvol_resume. */
 861         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 862         ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
 863
 864         atomic_inc(&zv->zv_suspend_ref);
 865
 866         if (zv->zv_open_count > 0)
 867                 zvol_shutdown_zv(zv);
 868
 869         /*
 870          * do not hold zv_state_lock across suspend/resume to
 871          * avoid locking up zvol lookups
 872          */
 873         mutex_exit(&zv->zv_state_lock);
 874
 875         /* zv_suspend_lock is released in zvol_resume() */
 876         return (zv);
 877 }
 878
 879 int
 880 zvol_resume(zvol_state_t *zv)
 881 {
 882         int error = 0;
 883
 884         ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
 885
 886         mutex_enter(&zv->zv_state_lock);
 887
 888         if (zv->zv_open_count > 0) {
 889                 VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset));
 890                 VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv);
 891                 VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset));
 892                 dmu_objset_rele(zv->zv_objset, zv);
 893
 894                 error = zvol_setup_zv(zv);
 895         }
 896
 897         mutex_exit(&zv->zv_state_lock);
 898
 899         rw_exit(&zv->zv_suspend_lock);
 900         /*
 901          * We need this because we don't hold zvol_state_lock while releasing
 902          * zv_suspend_lock. zvol_remove_minors_impl thus cannot check
 903          * zv_suspend_lock to determine it is safe to free because rwlock is
 904          * not inherent atomic.
 905          */
 906         atomic_dec(&zv->zv_suspend_ref);
 907
 908         return (SET_ERROR(error));
 909 }
 910
 911 int
 912 zvol_first_open(zvol_state_t *zv, boolean_t readonly)
 913 {
 914         objset_t *os;
 915         int error, locked = 0;
 916         boolean_t ro;
 917
 918         ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 919         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 920
 921         /*
 922          * In all other cases the spa_namespace_lock is taken before the
 923          * bdev->bd_mutex lock.  But in this case the Linux __blkdev_get()
 924          * function calls fops->open() with the bdev->bd_mutex lock held.
 925          * This deadlock can be easily observed with zvols used as vdevs.
 926          *
 927          * To avoid a potential lock inversion deadlock we preemptively
 928          * try to take the spa_namespace_lock().  Normally it will not
 929          * be contended and this is safe because spa_open_common() handles
 930          * the case where the caller already holds the spa_namespace_lock.
 931          *
 932          * When it is contended we risk a lock inversion if we were to
 933          * block waiting for the lock.  Luckily, the __blkdev_get()
 934          * function allows us to return -ERESTARTSYS which will result in
 935          * bdev->bd_mutex being dropped, reacquired, and fops->open() being
 936          * called again.  This process can be repeated safely until both
 937          * locks are acquired.
 938          */
 939         if (!mutex_owned(&spa_namespace_lock)) {
 940                 locked = mutex_tryenter(&spa_namespace_lock);
 941                 if (!locked)
 942                         return (SET_ERROR(EINTR));
 943         }
 944
 945         ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
 946         error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os);
 947         if (error)
 948                 goto out_mutex;
 949
 950         zv->zv_objset = os;
 951
 952         error = zvol_setup_zv(zv);
 953
 954         if (error) {
 955                 dmu_objset_disown(os, 1, zv);
 956                 zv->zv_objset = NULL;
 957         }
 958
 959 out_mutex:
 960         if (locked)
 961                 mutex_exit(&spa_namespace_lock);
 962         return (SET_ERROR(error));
 963 }
 964
 965 void
 966 zvol_last_close(zvol_state_t *zv)
 967 {
 968         ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 969         ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 970
 971         zvol_shutdown_zv(zv);
 972
 973         dmu_objset_disown(zv->zv_objset, 1, zv);
 974         zv->zv_objset = NULL;
 975 }
 976
 977 typedef struct minors_job {
 978         list_t *list;
 979         list_node_t link;
 980         /* input */
 981         char *name;
 982         /* output */
 983         int error;
 984 } minors_job_t;
 985
 986 /*
 987  * Prefetch zvol dnodes for the minors_job
 988  */
 989 static void
 990 zvol_prefetch_minors_impl(void *arg)
 991 {
 992         minors_job_t *job = arg;
 993         char *dsname = job->name;
 994         objset_t *os = NULL;
 995
 996         job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
 997             FTAG, &os);
 998         if (job->error == 0) {
 999                 dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
1000                 dmu_objset_disown(os, B_TRUE, FTAG);
1001         }
1002 }
1003
1004 /*
1005  * Mask errors to continue dmu_objset_find() traversal
1006  */
1007 static int
1008 zvol_create_snap_minor_cb(const char *dsname, void *arg)
1009 {
1010         minors_job_t *j = arg;
1011         list_t *minors_list = j->list;
1012         const char *name = j->name;
1013
1014         ASSERT0(MUTEX_HELD(&spa_namespace_lock));
1015
1016         /* skip the designated dataset */
1017         if (name && strcmp(dsname, name) == 0)
1018                 return (0);
1019
1020         /* at this point, the dsname should name a snapshot */
1021         if (strchr(dsname, '@') == 0) {
1022                 dprintf("zvol_create_snap_minor_cb(): "
1023                     "%s is not a snapshot name\n", dsname);
1024         } else {
1025                 minors_job_t *job;
1026                 char *n = kmem_strdup(dsname);
1027                 if (n == NULL)
1028                         return (0);
1029
1030                 job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
1031                 job->name = n;
1032                 job->list = minors_list;
1033                 job->error = 0;
1034                 list_insert_tail(minors_list, job);
1035                 /* don't care if dispatch fails, because job->error is 0 */
1036                 taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
1037                     TQ_SLEEP);
1038         }
1039
1040         return (0);
1041 }
1042
1043 /*
1044  * Mask errors to continue dmu_objset_find() traversal
1045  */
1046 static int
1047 zvol_create_minors_cb(const char *dsname, void *arg)
1048 {
1049         uint64_t snapdev;
1050         int error;
1051         list_t *minors_list = arg;
1052
1053         ASSERT0(MUTEX_HELD(&spa_namespace_lock));
1054
1055         error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL);
1056         if (error)
1057                 return (0);
1058
1059         /*
1060          * Given the name and the 'snapdev' property, create device minor nodes
1061          * with the linkages to zvols/snapshots as needed.
1062          * If the name represents a zvol, create a minor node for the zvol, then
1063          * check if its snapshots are 'visible', and if so, iterate over the
1064          * snapshots and create device minor nodes for those.
1065          */
1066         if (strchr(dsname, '@') == 0) {
1067                 minors_job_t *job;
1068                 char *n = kmem_strdup(dsname);
1069                 if (n == NULL)
1070                         return (0);
1071
1072                 job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
1073                 job->name = n;
1074                 job->list = minors_list;
1075                 job->error = 0;
1076                 list_insert_tail(minors_list, job);
1077                 /* don't care if dispatch fails, because job->error is 0 */
1078                 taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
1079                     TQ_SLEEP);
1080
1081                 if (snapdev == ZFS_SNAPDEV_VISIBLE) {
1082                         /*
1083                          * traverse snapshots only, do not traverse children,
1084                          * and skip the 'dsname'
1085                          */
1086                         error = dmu_objset_find(dsname,
1087                             zvol_create_snap_minor_cb, (void *)job,
1088                             DS_FIND_SNAPSHOTS);
1089                 }
1090         } else {
1091                 dprintf("zvol_create_minors_cb(): %s is not a zvol name\n",
1092                     dsname);
1093         }
1094
1095         return (0);
1096 }
1097
1098 /*
1099  * Create minors for the specified dataset, including children and snapshots.
1100  * Pay attention to the 'snapdev' property and iterate over the snapshots
1101  * only if they are 'visible'. This approach allows one to assure that the
1102  * snapshot metadata is read from disk only if it is needed.
1103  *
1104  * The name can represent a dataset to be recursively scanned for zvols and
1105  * their snapshots, or a single zvol snapshot. If the name represents a
1106  * dataset, the scan is performed in two nested stages:
1107  * - scan the dataset for zvols, and
1108  * - for each zvol, create a minor node, then check if the zvol's snapshots
1109  *   are 'visible', and only then iterate over the snapshots if needed
1110  *
1111  * If the name represents a snapshot, a check is performed if the snapshot is
1112  * 'visible' (which also verifies that the parent is a zvol), and if so,
1113  * a minor node for that snapshot is created.
1114  */
1115 void
1116 zvol_create_minors_recursive(const char *name)
1117 {
1118         list_t minors_list;
1119         minors_job_t *job;
1120
1121         if (zvol_inhibit_dev)
1122                 return;
1123
1124         /*
1125          * This is the list for prefetch jobs. Whenever we found a match
1126          * during dmu_objset_find, we insert a minors_job to the list and do
1127          * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need
1128          * any lock because all list operation is done on the current thread.
1129          *
1130          * We will use this list to do zvol_create_minor_impl after prefetch
1131          * so we don't have to traverse using dmu_objset_find again.
1132          */
1133         list_create(&minors_list, sizeof (minors_job_t),
1134             offsetof(minors_job_t, link));
1135
1136
1137         if (strchr(name, '@') != NULL) {
1138                 uint64_t snapdev;
1139
1140                 int error = dsl_prop_get_integer(name, "snapdev",
1141                     &snapdev, NULL);
1142
1143                 if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
1144                         (void) ops->zv_create_minor(name);
1145         } else {
1146                 fstrans_cookie_t cookie = spl_fstrans_mark();
1147                 (void) dmu_objset_find(name, zvol_create_minors_cb,
1148                     &minors_list, DS_FIND_CHILDREN);
1149                 spl_fstrans_unmark(cookie);
1150         }
1151
1152         taskq_wait_outstanding(system_taskq, 0);
1153
1154         /*
1155          * Prefetch is completed, we can do zvol_create_minor_impl
1156          * sequentially.
1157          */
1158         while ((job = list_head(&minors_list)) != NULL) {
1159                 list_remove(&minors_list, job);
1160                 if (!job->error)
1161                         (void) ops->zv_create_minor(job->name);
1162                 kmem_strfree(job->name);
1163                 kmem_free(job, sizeof (minors_job_t));
1164         }
1165
1166         list_destroy(&minors_list);
1167 }
1168
1169 void
1170 zvol_create_minor(const char *name)
1171 {
1172         /*
1173          * Note: the dsl_pool_config_lock must not be held.
1174          * Minor node creation needs to obtain the zvol_state_lock.
1175          * zvol_open() obtains the zvol_state_lock and then the dsl pool
1176          * config lock.  Therefore, we can't have the config lock now if
1177          * we are going to wait for the zvol_state_lock, because it
1178          * would be a lock order inversion which could lead to deadlock.
1179          */
1180
1181         if (zvol_inhibit_dev)
1182                 return;
1183
1184         if (strchr(name, '@') != NULL) {
1185                 uint64_t snapdev;
1186
1187                 int error = dsl_prop_get_integer(name,
1188                     "snapdev", &snapdev, NULL);
1189
1190                 if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
1191                         (void) ops->zv_create_minor(name);
1192         } else {
1193                 (void) ops->zv_create_minor(name);
1194         }
1195 }
1196
/*
 * Remove minors for specified dataset including children and snapshots
 * (see zvol_remove_minors_impl() below).
 */
1200
1201 static void
1202 zvol_free_task(void *arg)
1203 {
1204         ops->zv_free(arg);
1205 }
1206
1207 void
1208 zvol_remove_minors_impl(const char *name)
1209 {
1210         zvol_state_t *zv, *zv_next;
1211         int namelen = ((name) ? strlen(name) : 0);
1212         taskqid_t t;
1213         list_t free_list;
1214
1215         if (zvol_inhibit_dev)
1216                 return;
1217
1218         list_create(&free_list, sizeof (zvol_state_t),
1219             offsetof(zvol_state_t, zv_next));
1220
1221         rw_enter(&zvol_state_lock, RW_WRITER);
1222
1223         for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
1224                 zv_next = list_next(&zvol_state_list, zv);
1225
1226                 mutex_enter(&zv->zv_state_lock);
1227                 if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
1228                     (strncmp(zv->zv_name, name, namelen) == 0 &&
1229                     (zv->zv_name[namelen] == '/' ||
1230                     zv->zv_name[namelen] == '@'))) {
1231                         /*
1232                          * By holding zv_state_lock here, we guarantee that no
1233                          * one is currently using this zv
1234                          */
1235
1236                         /* If in use, leave alone */
1237                         if (zv->zv_open_count > 0 ||
1238                             atomic_read(&zv->zv_suspend_ref)) {
1239                                 mutex_exit(&zv->zv_state_lock);
1240                                 continue;
1241                         }
1242
1243                         zvol_remove(zv);
1244
1245                         /*
1246                          * Cleared while holding zvol_state_lock as a writer
1247                          * which will prevent zvol_open() from opening it.
1248                          */
1249                         ops->zv_clear_private(zv);
1250
1251                         /* Drop zv_state_lock before zvol_free() */
1252                         mutex_exit(&zv->zv_state_lock);
1253
1254                         /* Try parallel zv_free, if failed do it in place */
1255                         t = taskq_dispatch(system_taskq, zvol_free_task, zv,
1256                             TQ_SLEEP);
1257                         if (t == TASKQID_INVALID)
1258                                 list_insert_head(&free_list, zv);
1259                 } else {
1260                         mutex_exit(&zv->zv_state_lock);
1261                 }
1262         }
1263         rw_exit(&zvol_state_lock);
1264
1265         /* Drop zvol_state_lock before calling zvol_free() */
1266         while ((zv = list_head(&free_list)) != NULL) {
1267                 list_remove(&free_list, zv);
1268                 ops->zv_free(zv);
1269         }
1270 }
1271
1272 /* Remove minor for this specific volume only */
1273 static void
1274 zvol_remove_minor_impl(const char *name)
1275 {
1276         zvol_state_t *zv = NULL, *zv_next;
1277
1278         if (zvol_inhibit_dev)
1279                 return;
1280
1281         rw_enter(&zvol_state_lock, RW_WRITER);
1282
1283         for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
1284                 zv_next = list_next(&zvol_state_list, zv);
1285
1286                 mutex_enter(&zv->zv_state_lock);
1287                 if (strcmp(zv->zv_name, name) == 0) {
1288                         /*
1289                          * By holding zv_state_lock here, we guarantee that no
1290                          * one is currently using this zv
1291                          */
1292
1293                         /* If in use, leave alone */
1294                         if (zv->zv_open_count > 0 ||
1295                             atomic_read(&zv->zv_suspend_ref)) {
1296                                 mutex_exit(&zv->zv_state_lock);
1297                                 continue;
1298                         }
1299                         zvol_remove(zv);
1300
1301                         ops->zv_clear_private(zv);
1302                         mutex_exit(&zv->zv_state_lock);
1303                         break;
1304                 } else {
1305                         mutex_exit(&zv->zv_state_lock);
1306                 }
1307         }
1308
1309         /* Drop zvol_state_lock before calling zvol_free() */
1310         rw_exit(&zvol_state_lock);
1311
1312         if (zv != NULL)
1313                 ops->zv_free(zv);
1314 }
1315
1316 /*
1317  * Rename minors for specified dataset including children and snapshots.
1318  */
1319 static void
1320 zvol_rename_minors_impl(const char *oldname, const char *newname)
1321 {
1322         zvol_state_t *zv, *zv_next;
1323         int oldnamelen, newnamelen;
1324
1325         if (zvol_inhibit_dev)
1326                 return;
1327
1328         oldnamelen = strlen(oldname);
1329         newnamelen = strlen(newname);
1330
1331         rw_enter(&zvol_state_lock, RW_READER);
1332
1333         for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
1334                 zv_next = list_next(&zvol_state_list, zv);
1335
1336                 mutex_enter(&zv->zv_state_lock);
1337
1338                 if (strcmp(zv->zv_name, oldname) == 0) {
1339                         ops->zv_rename_minor(zv, newname);
1340                 } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
1341                     (zv->zv_name[oldnamelen] == '/' ||
1342                     zv->zv_name[oldnamelen] == '@')) {
1343                         char *name = kmem_asprintf("%s%c%s", newname,
1344                             zv->zv_name[oldnamelen],
1345                             zv->zv_name + oldnamelen + 1);
1346                         ops->zv_rename_minor(zv, name);
1347                         kmem_strfree(name);
1348                 }
1349
1350                 mutex_exit(&zv->zv_state_lock);
1351         }
1352
1353         rw_exit(&zvol_state_lock);
1354 }
1355
/* Argument handed to zvol_set_snapdev_cb() through dmu_objset_find(). */
struct zvol_snapdev_cb_arg {
        uint64_t snapdev;       /* snapdev value to apply */
};
typedef struct zvol_snapdev_cb_arg zvol_snapdev_cb_arg_t;
1359
1360 static int
1361 zvol_set_snapdev_cb(const char *dsname, void *param)
1362 {
1363         zvol_snapdev_cb_arg_t *arg = param;
1364
1365         if (strchr(dsname, '@') == NULL)
1366                 return (0);
1367
1368         switch (arg->snapdev) {
1369                 case ZFS_SNAPDEV_VISIBLE:
1370                         (void) ops->zv_create_minor(dsname);
1371                         break;
1372                 case ZFS_SNAPDEV_HIDDEN:
1373                         (void) zvol_remove_minor_impl(dsname);
1374                         break;
1375         }
1376
1377         return (0);
1378 }
1379
1380 static void
1381 zvol_set_snapdev_impl(char *name, uint64_t snapdev)
1382 {
1383         zvol_snapdev_cb_arg_t arg = {snapdev};
1384         fstrans_cookie_t cookie = spl_fstrans_mark();
1385         /*
1386          * The zvol_set_snapdev_sync() sets snapdev appropriately
1387          * in the dataset hierarchy. Here, we only scan snapshots.
1388          */
1389         dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS);
1390         spl_fstrans_unmark(cookie);
1391 }
1392
1393 static void
1394 zvol_set_volmode_impl(char *name, uint64_t volmode)
1395 {
1396         fstrans_cookie_t cookie;
1397         uint64_t old_volmode;
1398         zvol_state_t *zv;
1399
1400         if (strchr(name, '@') != NULL)
1401                 return;
1402
1403         /*
1404          * It's unfortunate we need to remove minors before we create new ones:
1405          * this is necessary because our backing gendisk (zvol_state->zv_disk)
1406          * could be different when we set, for instance, volmode from "geom"
1407          * to "dev" (or vice versa).
1408          */
1409         zv = zvol_find_by_name(name, RW_NONE);
1410         if (zv == NULL && volmode == ZFS_VOLMODE_NONE)
1411                         return;
1412         if (zv != NULL) {
1413                 old_volmode = zv->zv_volmode;
1414                 mutex_exit(&zv->zv_state_lock);
1415                 if (old_volmode == volmode)
1416                         return;
1417                 zvol_wait_close(zv);
1418         }
1419         cookie = spl_fstrans_mark();
1420         switch (volmode) {
1421                 case ZFS_VOLMODE_NONE:
1422                         (void) zvol_remove_minor_impl(name);
1423                         break;
1424                 case ZFS_VOLMODE_GEOM:
1425                 case ZFS_VOLMODE_DEV:
1426                         (void) zvol_remove_minor_impl(name);
1427                         (void) ops->zv_create_minor(name);
1428                         break;
1429                 case ZFS_VOLMODE_DEFAULT:
1430                         (void) zvol_remove_minor_impl(name);
1431                         if (zvol_volmode == ZFS_VOLMODE_NONE)
1432                                 break;
1433                         else /* if zvol_volmode is invalid defaults to "geom" */
1434                                 (void) ops->zv_create_minor(name);
1435                         break;
1436         }
1437         spl_fstrans_unmark(cookie);
1438 }
1439
1440 static zvol_task_t *
1441 zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
1442     uint64_t value)
1443 {
1444         zvol_task_t *task;
1445
1446         /* Never allow tasks on hidden names. */
1447         if (name1[0] == '$')
1448                 return (NULL);
1449
1450         task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
1451         task->op = op;
1452         task->value = value;
1453
1454         strlcpy(task->name1, name1, MAXNAMELEN);
1455         if (name2 != NULL)
1456                 strlcpy(task->name2, name2, MAXNAMELEN);
1457
1458         return (task);
1459 }
1460
1461 static void
1462 zvol_task_free(zvol_task_t *task)
1463 {
1464         kmem_free(task, sizeof (zvol_task_t));
1465 }
1466
1467 /*
1468  * The worker thread function performed asynchronously.
1469  */
1470 static void
1471 zvol_task_cb(void *arg)
1472 {
1473         zvol_task_t *task = arg;
1474
1475         switch (task->op) {
1476         case ZVOL_ASYNC_REMOVE_MINORS:
1477                 zvol_remove_minors_impl(task->name1);
1478                 break;
1479         case ZVOL_ASYNC_RENAME_MINORS:
1480                 zvol_rename_minors_impl(task->name1, task->name2);
1481                 break;
1482         case ZVOL_ASYNC_SET_SNAPDEV:
1483                 zvol_set_snapdev_impl(task->name1, task->value);
1484                 break;
1485         case ZVOL_ASYNC_SET_VOLMODE:
1486                 zvol_set_volmode_impl(task->name1, task->value);
1487                 break;
1488         default:
1489                 VERIFY(0);
1490                 break;
1491         }
1492
1493         zvol_task_free(task);
1494 }
1495
1496 typedef struct zvol_set_prop_int_arg {
1497         const char *zsda_name;
1498         uint64_t zsda_value;
1499         zprop_source_t zsda_source;
1500         dmu_tx_t *zsda_tx;
1501 } zvol_set_prop_int_arg_t;
1502
1503 /*
1504  * Sanity check the dataset for safe use by the sync task.  No additional
1505  * conditions are imposed.
1506  */
1507 static int
1508 zvol_set_snapdev_check(void *arg, dmu_tx_t *tx)
1509 {
1510         zvol_set_prop_int_arg_t *zsda = arg;
1511         dsl_pool_t *dp = dmu_tx_pool(tx);
1512         dsl_dir_t *dd;
1513         int error;
1514
1515         error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
1516         if (error != 0)
1517                 return (error);
1518
1519         dsl_dir_rele(dd, FTAG);
1520
1521         return (error);
1522 }
1523
1524 /* ARGSUSED */
1525 static int
1526 zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
1527 {
1528         char dsname[MAXNAMELEN];
1529         zvol_task_t *task;
1530         uint64_t snapdev;
1531
1532         dsl_dataset_name(ds, dsname);
1533         if (dsl_prop_get_int_ds(ds, "snapdev", &snapdev) != 0)
1534                 return (0);
1535         task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev);
1536         if (task == NULL)
1537                 return (0);
1538
1539         (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
1540             task, TQ_SLEEP);
1541         return (0);
1542 }
1543
1544 /*
1545  * Traverse all child datasets and apply snapdev appropriately.
1546  * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
1547  * dataset and read the effective "snapdev" on every child in the callback
1548  * function: this is because the value is not guaranteed to be the same in the
1549  * whole dataset hierarchy.
1550  */
1551 static void
1552 zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx)
1553 {
1554         zvol_set_prop_int_arg_t *zsda = arg;
1555         dsl_pool_t *dp = dmu_tx_pool(tx);
1556         dsl_dir_t *dd;
1557         dsl_dataset_t *ds;
1558         int error;
1559
1560         VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
1561         zsda->zsda_tx = tx;
1562
1563         error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
1564         if (error == 0) {
1565                 dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV),
1566                     zsda->zsda_source, sizeof (zsda->zsda_value), 1,
1567                     &zsda->zsda_value, zsda->zsda_tx);
1568                 dsl_dataset_rele(ds, FTAG);
1569         }
1570         dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb,
1571             zsda, DS_FIND_CHILDREN);
1572
1573         dsl_dir_rele(dd, FTAG);
1574 }
1575
1576 int
1577 zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev)
1578 {
1579         zvol_set_prop_int_arg_t zsda;
1580
1581         zsda.zsda_name = ddname;
1582         zsda.zsda_source = source;
1583         zsda.zsda_value = snapdev;
1584
1585         return (dsl_sync_task(ddname, zvol_set_snapdev_check,
1586             zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
1587 }
1588
1589 /*
1590  * Sanity check the dataset for safe use by the sync task.  No additional
1591  * conditions are imposed.
1592  */
1593 static int
1594 zvol_set_volmode_check(void *arg, dmu_tx_t *tx)
1595 {
1596         zvol_set_prop_int_arg_t *zsda = arg;
1597         dsl_pool_t *dp = dmu_tx_pool(tx);
1598         dsl_dir_t *dd;
1599         int error;
1600
1601         error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
1602         if (error != 0)
1603                 return (error);
1604
1605         dsl_dir_rele(dd, FTAG);
1606
1607         return (error);
1608 }
1609
1610 /* ARGSUSED */
1611 static int
1612 zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
1613 {
1614         char dsname[MAXNAMELEN];
1615         zvol_task_t *task;
1616         uint64_t volmode;
1617
1618         dsl_dataset_name(ds, dsname);
1619         if (dsl_prop_get_int_ds(ds, "volmode", &volmode) != 0)
1620                 return (0);
1621         task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, volmode);
1622         if (task == NULL)
1623                 return (0);
1624
1625         (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
1626             task, TQ_SLEEP);
1627         return (0);
1628 }
1629
1630 /*
1631  * Traverse all child datasets and apply volmode appropriately.
1632  * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
1633  * dataset and read the effective "volmode" on every child in the callback
1634  * function: this is because the value is not guaranteed to be the same in the
1635  * whole dataset hierarchy.
1636  */
1637 static void
1638 zvol_set_volmode_sync(void *arg, dmu_tx_t *tx)
1639 {
1640         zvol_set_prop_int_arg_t *zsda = arg;
1641         dsl_pool_t *dp = dmu_tx_pool(tx);
1642         dsl_dir_t *dd;
1643         dsl_dataset_t *ds;
1644         int error;
1645
1646         VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
1647         zsda->zsda_tx = tx;
1648
1649         error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
1650         if (error == 0) {
1651                 dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_VOLMODE),
1652                     zsda->zsda_source, sizeof (zsda->zsda_value), 1,
1653                     &zsda->zsda_value, zsda->zsda_tx);
1654                 dsl_dataset_rele(ds, FTAG);
1655         }
1656
1657         dmu_objset_find_dp(dp, dd->dd_object, zvol_set_volmode_sync_cb,
1658             zsda, DS_FIND_CHILDREN);
1659
1660         dsl_dir_rele(dd, FTAG);
1661 }
1662
1663 int
1664 zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode)
1665 {
1666         zvol_set_prop_int_arg_t zsda;
1667
1668         zsda.zsda_name = ddname;
1669         zsda.zsda_source = source;
1670         zsda.zsda_value = volmode;
1671
1672         return (dsl_sync_task(ddname, zvol_set_volmode_check,
1673             zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
1674 }
1675
1676 void
1677 zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
1678 {
1679         zvol_task_t *task;
1680         taskqid_t id;
1681
1682         task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL);
1683         if (task == NULL)
1684                 return;
1685
1686         id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
1687         if ((async == B_FALSE) && (id != TASKQID_INVALID))
1688                 taskq_wait_id(spa->spa_zvol_taskq, id);
1689 }
1690
1691 void
1692 zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
1693     boolean_t async)
1694 {
1695         zvol_task_t *task;
1696         taskqid_t id;
1697
1698         task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL);
1699         if (task == NULL)
1700                 return;
1701
1702         id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
1703         if ((async == B_FALSE) && (id != TASKQID_INVALID))
1704                 taskq_wait_id(spa->spa_zvol_taskq, id);
1705 }
1706
1707 boolean_t
1708 zvol_is_zvol(const char *name)
1709 {
1710
1711         return (ops->zv_is_zvol(name));
1712 }
1713
1714 void
1715 zvol_register_ops(const zvol_platform_ops_t *zvol_ops)
1716 {
1717         ops = zvol_ops;
1718 }
1719
1720 int
1721 zvol_init_impl(void)
1722 {
1723         int i;
1724
1725         list_create(&zvol_state_list, sizeof (zvol_state_t),
1726             offsetof(zvol_state_t, zv_next));
1727         rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL);
1728
1729         zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head),
1730             KM_SLEEP);
1731         for (i = 0; i < ZVOL_HT_SIZE; i++)
1732                 INIT_HLIST_HEAD(&zvol_htable[i]);
1733
1734         return (0);
1735 }
1736
1737 void
1738 zvol_fini_impl(void)
1739 {
1740         zvol_remove_minors_impl(NULL);
1741
1742         /*
1743          * The call to "zvol_remove_minors_impl" may dispatch entries to
1744          * the system_taskq, but it doesn't wait for those entries to
1745          * complete before it returns. Thus, we must wait for all of the
1746          * removals to finish, before we can continue.
1747          */
1748         taskq_wait_outstanding(system_taskq, 0);
1749
1750         kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
1751         list_destroy(&zvol_state_list);
1752         rw_destroy(&zvol_state_lock);
1753 }