4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
24 * All rights reserved.
25 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
27 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
30 /* Portions Copyright 2010 Robert Milkowski */
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/sysmacros.h>
39 #include <sys/vnode.h>
41 #include <sys/mntent.h>
42 #include <sys/mount.h>
43 #include <sys/cmn_err.h>
44 #include <sys/zfs_znode.h>
45 #include <sys/zfs_vnops.h>
46 #include <sys/zfs_dir.h>
48 #include <sys/fs/zfs.h>
50 #include <sys/dsl_prop.h>
51 #include <sys/dsl_dataset.h>
52 #include <sys/dsl_deleg.h>
56 #include <sys/sa_impl.h>
57 #include <sys/policy.h>
58 #include <sys/atomic.h>
59 #include <sys/zfs_ioctl.h>
60 #include <sys/zfs_ctldir.h>
61 #include <sys/zfs_fuid.h>
62 #include <sys/sunddi.h>
63 #include <sys/dmu_objset.h>
64 #include <sys/dsl_dir.h>
67 #include <ufs/ufs/quota.h>
68 #include <sys/zfs_quota.h>
70 #include "zfs_comutil.h"
/*
 * Compatibility fallbacks: define mount kernel flags that are absent
 * on older FreeBSD versions so the code below compiles unchanged.
 */
#ifndef MNTK_VMSETSIZE_BUG
#define	MNTK_VMSETSIZE_BUG	0
#endif
#ifndef MNTK_NOMSYNC
#define	MNTK_NOMSYNC	8
#endif
79 struct mtx zfs_debug_mtx
;
80 MTX_SYSINIT(zfs_debug_mtx
, &zfs_debug_mtx
, "zfs_debug", MTX_DEF
);
82 SYSCTL_NODE(_vfs
, OID_AUTO
, zfs
, CTLFLAG_RW
, 0, "ZFS file system");
85 SYSCTL_INT(_vfs_zfs
, OID_AUTO
, super_owner
, CTLFLAG_RW
, &zfs_super_owner
, 0,
86 "File system owners can perform privileged operation on file systems");
89 SYSCTL_INT(_vfs_zfs
, OID_AUTO
, debug
, CTLFLAG_RWTUN
, &zfs_debug_level
, 0,
92 int zfs_bclone_enabled
= 1;
93 SYSCTL_INT(_vfs_zfs
, OID_AUTO
, bclone_enabled
, CTLFLAG_RWTUN
,
94 &zfs_bclone_enabled
, 0, "Enable block cloning");
96 struct zfs_jailparam
{
100 static struct zfs_jailparam zfs_jailparam0
= {
/* OSD slot used to attach per-jail ZFS parameters to a prison. */
static int zfs_jailparam_slot;
106 SYSCTL_JAIL_PARAM_SYS_NODE(zfs
, CTLFLAG_RW
, "Jail ZFS parameters");
107 SYSCTL_JAIL_PARAM(_zfs
, mount_snapshot
, CTLTYPE_INT
| CTLFLAG_RW
, "I",
108 "Allow mounting snapshots in the .zfs directory for unjailed datasets");
110 SYSCTL_NODE(_vfs_zfs
, OID_AUTO
, version
, CTLFLAG_RD
, 0, "ZFS versions");
111 static int zfs_version_acl
= ZFS_ACL_VERSION
;
112 SYSCTL_INT(_vfs_zfs_version
, OID_AUTO
, acl
, CTLFLAG_RD
, &zfs_version_acl
, 0,
114 static int zfs_version_spa
= SPA_VERSION
;
115 SYSCTL_INT(_vfs_zfs_version
, OID_AUTO
, spa
, CTLFLAG_RD
, &zfs_version_spa
, 0,
117 static int zfs_version_zpl
= ZPL_VERSION
;
118 SYSCTL_INT(_vfs_zfs_version
, OID_AUTO
, zpl
, CTLFLAG_RD
, &zfs_version_zpl
, 0,
121 #if __FreeBSD_version >= 1400018
122 static int zfs_quotactl(vfs_t
*vfsp
, int cmds
, uid_t id
, void *arg
,
125 static int zfs_quotactl(vfs_t
*vfsp
, int cmds
, uid_t id
, void *arg
);
127 static int zfs_mount(vfs_t
*vfsp
);
128 static int zfs_umount(vfs_t
*vfsp
, int fflag
);
129 static int zfs_root(vfs_t
*vfsp
, int flags
, vnode_t
**vpp
);
130 static int zfs_statfs(vfs_t
*vfsp
, struct statfs
*statp
);
131 static int zfs_vget(vfs_t
*vfsp
, ino_t ino
, int flags
, vnode_t
**vpp
);
132 static int zfs_sync(vfs_t
*vfsp
, int waitfor
);
133 #if __FreeBSD_version >= 1300098
134 static int zfs_checkexp(vfs_t
*vfsp
, struct sockaddr
*nam
, uint64_t *extflagsp
,
135 struct ucred
**credanonp
, int *numsecflavors
, int *secflavors
);
137 static int zfs_checkexp(vfs_t
*vfsp
, struct sockaddr
*nam
, int *extflagsp
,
138 struct ucred
**credanonp
, int *numsecflavors
, int **secflavors
);
140 static int zfs_fhtovp(vfs_t
*vfsp
, fid_t
*fidp
, int flags
, vnode_t
**vpp
);
141 static void zfs_freevfs(vfs_t
*vfsp
);
143 struct vfsops zfs_vfsops
= {
144 .vfs_mount
= zfs_mount
,
145 .vfs_unmount
= zfs_umount
,
146 #if __FreeBSD_version >= 1300049
147 .vfs_root
= vfs_cache_root
,
148 .vfs_cachedroot
= zfs_root
,
150 .vfs_root
= zfs_root
,
152 .vfs_statfs
= zfs_statfs
,
153 .vfs_vget
= zfs_vget
,
154 .vfs_sync
= zfs_sync
,
155 .vfs_checkexp
= zfs_checkexp
,
156 .vfs_fhtovp
= zfs_fhtovp
,
157 .vfs_quotactl
= zfs_quotactl
,
160 #ifdef VFCF_CROSS_COPY_FILE_RANGE
161 VFS_SET(zfs_vfsops
, zfs
,
162 VFCF_DELEGADMIN
| VFCF_JAIL
| VFCF_CROSS_COPY_FILE_RANGE
);
164 VFS_SET(zfs_vfsops
, zfs
, VFCF_DELEGADMIN
| VFCF_JAIL
);
168 * We need to keep a count of active fs's.
169 * This is necessary to prevent our module
170 * from being unloaded after a umount -f
172 static uint32_t zfs_active_fs_count
= 0;
175 zfs_get_temporary_prop(dsl_dataset_t
*ds
, zfs_prop_t zfs_prop
, uint64_t *val
,
184 error
= dmu_objset_from_ds(ds
, &os
);
188 error
= getzfsvfs_impl(os
, &zfvp
);
196 if (vfs_optionisset(vfsp
, MNTOPT_NOATIME
, NULL
))
198 if (vfs_optionisset(vfsp
, MNTOPT_ATIME
, NULL
))
201 case ZFS_PROP_DEVICES
:
202 if (vfs_optionisset(vfsp
, MNTOPT_NODEVICES
, NULL
))
204 if (vfs_optionisset(vfsp
, MNTOPT_DEVICES
, NULL
))
208 if (vfs_optionisset(vfsp
, MNTOPT_NOEXEC
, NULL
))
210 if (vfs_optionisset(vfsp
, MNTOPT_EXEC
, NULL
))
213 case ZFS_PROP_SETUID
:
214 if (vfs_optionisset(vfsp
, MNTOPT_NOSETUID
, NULL
))
216 if (vfs_optionisset(vfsp
, MNTOPT_SETUID
, NULL
))
219 case ZFS_PROP_READONLY
:
220 if (vfs_optionisset(vfsp
, MNTOPT_RW
, NULL
))
222 if (vfs_optionisset(vfsp
, MNTOPT_RO
, NULL
))
226 if (zfvp
->z_flags
& ZSB_XATTR
)
229 case ZFS_PROP_NBMAND
:
230 if (vfs_optionisset(vfsp
, MNTOPT_NONBMAND
, NULL
))
232 if (vfs_optionisset(vfsp
, MNTOPT_NBMAND
, NULL
))
243 (void) strcpy(setpoint
, "temporary");
250 zfs_getquota(zfsvfs_t
*zfsvfs
, uid_t id
, int isgroup
, struct dqblk64
*dqp
)
254 uint64_t usedobj
, quotaobj
;
255 uint64_t quota
, used
= 0;
258 usedobj
= isgroup
? DMU_GROUPUSED_OBJECT
: DMU_USERUSED_OBJECT
;
259 quotaobj
= isgroup
? zfsvfs
->z_groupquota_obj
: zfsvfs
->z_userquota_obj
;
261 if (quotaobj
== 0 || zfsvfs
->z_replay
) {
265 (void) sprintf(buf
, "%llx", (longlong_t
)id
);
266 if ((error
= zap_lookup(zfsvfs
->z_os
, quotaobj
,
267 buf
, sizeof (quota
), 1, "a
)) != 0) {
268 dprintf("%s(%d): quotaobj lookup failed\n",
269 __FUNCTION__
, __LINE__
);
273 * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
274 * So we set them to be the same.
276 dqp
->dqb_bsoftlimit
= dqp
->dqb_bhardlimit
= btodb(quota
);
277 error
= zap_lookup(zfsvfs
->z_os
, usedobj
, buf
, sizeof (used
), 1, &used
);
278 if (error
&& error
!= ENOENT
) {
279 dprintf("%s(%d): usedobj failed; %d\n",
280 __FUNCTION__
, __LINE__
, error
);
283 dqp
->dqb_curblocks
= btodb(used
);
284 dqp
->dqb_ihardlimit
= dqp
->dqb_isoftlimit
= 0;
287 * Setting this to 0 causes FreeBSD quota(8) to print
288 * the number of days since the epoch, which isn't
289 * particularly useful.
291 dqp
->dqb_btime
= dqp
->dqb_itime
= now
.tv_sec
;
297 #if __FreeBSD_version >= 1400018
298 zfs_quotactl(vfs_t
*vfsp
, int cmds
, uid_t id
, void *arg
, bool *mp_busy
)
300 zfs_quotactl(vfs_t
*vfsp
, int cmds
, uid_t id
, void *arg
)
303 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
305 int cmd
, type
, error
= 0;
307 zfs_userquota_prop_t quota_type
;
308 struct dqblk64 dqblk
= { 0 };
311 cmd
= cmds
>> SUBCMDSHIFT
;
312 type
= cmds
& SUBCMDMASK
;
314 if ((error
= zfs_enter(zfsvfs
, FTAG
)) != 0)
319 id
= td
->td_ucred
->cr_ruid
;
322 id
= td
->td_ucred
->cr_rgid
;
326 #if __FreeBSD_version < 1400018
327 if (cmd
== Q_QUOTAON
|| cmd
== Q_QUOTAOFF
)
336 * ZFS_PROP_USERQUOTA,
337 * ZFS_PROP_GROUPUSED,
338 * ZFS_PROP_GROUPQUOTA
343 if (type
== USRQUOTA
)
344 quota_type
= ZFS_PROP_USERQUOTA
;
345 else if (type
== GRPQUOTA
)
346 quota_type
= ZFS_PROP_GROUPQUOTA
;
352 if (type
== USRQUOTA
)
353 quota_type
= ZFS_PROP_USERUSED
;
354 else if (type
== GRPQUOTA
)
355 quota_type
= ZFS_PROP_GROUPUSED
;
362 * Depending on the cmd, we may need to get
363 * the ruid and domain (see fuidstr_to_sid?),
364 * the fuid (how?), or other information.
365 * Create fuid using zfs_fuid_create(zfsvfs, id,
366 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
367 * I think I can use just the id?
369 * Look at zfs_id_overquota() to look up a quota.
370 * zap_lookup(something, quotaobj, fuidstring,
371 * sizeof (long long), 1, "a)
373 * See zfs_set_userquota() to set a quota.
375 if ((uint32_t)type
>= MAXQUOTAS
) {
383 error
= copyout(&bitsize
, arg
, sizeof (int));
386 // As far as I can tell, you can't turn quotas on or off on zfs
388 #if __FreeBSD_version < 1400018
394 #if __FreeBSD_version < 1400018
399 error
= copyin(arg
, &dqblk
, sizeof (dqblk
));
401 error
= zfs_set_userquota(zfsvfs
, quota_type
,
402 "", id
, dbtob(dqblk
.dqb_bhardlimit
));
405 error
= zfs_getquota(zfsvfs
, id
, type
== GRPQUOTA
, &dqblk
);
407 error
= copyout(&dqblk
, arg
, sizeof (dqblk
));
414 zfs_exit(zfsvfs
, FTAG
);
420 zfs_is_readonly(zfsvfs_t
*zfsvfs
)
422 return (!!(zfsvfs
->z_vfs
->vfs_flag
& VFS_RDONLY
));
426 zfs_sync(vfs_t
*vfsp
, int waitfor
)
430 * Data integrity is job one. We don't want a compromised kernel
431 * writing to the storage pool, so we never sync during panic.
437 * Ignore the system syncher. ZFS already commits async data
438 * at zfs_txg_timeout intervals.
440 if (waitfor
== MNT_LAZY
)
445 * Sync a specific filesystem.
447 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
451 if ((error
= zfs_enter(zfsvfs
, FTAG
)) != 0)
453 dp
= dmu_objset_pool(zfsvfs
->z_os
);
456 * If the system is shutting down, then skip any
457 * filesystems which may exist on a suspended pool.
459 if (rebooting
&& spa_suspended(dp
->dp_spa
)) {
460 zfs_exit(zfsvfs
, FTAG
);
464 if (zfsvfs
->z_log
!= NULL
)
465 zil_commit(zfsvfs
->z_log
, 0);
467 zfs_exit(zfsvfs
, FTAG
);
470 * Sync all ZFS filesystems. This is what happens when you
471 * run sync(8). Unlike other filesystems, ZFS honors the
472 * request by waiting for all pools to commit all dirty data.
481 atime_changed_cb(void *arg
, uint64_t newval
)
483 zfsvfs_t
*zfsvfs
= arg
;
485 if (newval
== TRUE
) {
486 zfsvfs
->z_atime
= TRUE
;
487 zfsvfs
->z_vfs
->vfs_flag
&= ~MNT_NOATIME
;
488 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_NOATIME
);
489 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_ATIME
, NULL
, 0);
491 zfsvfs
->z_atime
= FALSE
;
492 zfsvfs
->z_vfs
->vfs_flag
|= MNT_NOATIME
;
493 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_ATIME
);
494 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_NOATIME
, NULL
, 0);
499 xattr_changed_cb(void *arg
, uint64_t newval
)
501 zfsvfs_t
*zfsvfs
= arg
;
503 if (newval
== ZFS_XATTR_OFF
) {
504 zfsvfs
->z_flags
&= ~ZSB_XATTR
;
506 zfsvfs
->z_flags
|= ZSB_XATTR
;
508 if (newval
== ZFS_XATTR_SA
)
509 zfsvfs
->z_xattr_sa
= B_TRUE
;
511 zfsvfs
->z_xattr_sa
= B_FALSE
;
516 blksz_changed_cb(void *arg
, uint64_t newval
)
518 zfsvfs_t
*zfsvfs
= arg
;
519 ASSERT3U(newval
, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs
->z_os
)));
520 ASSERT3U(newval
, >=, SPA_MINBLOCKSIZE
);
521 ASSERT(ISP2(newval
));
523 zfsvfs
->z_max_blksz
= newval
;
524 zfsvfs
->z_vfs
->mnt_stat
.f_iosize
= newval
;
528 readonly_changed_cb(void *arg
, uint64_t newval
)
530 zfsvfs_t
*zfsvfs
= arg
;
533 /* XXX locking on vfs_flag? */
534 zfsvfs
->z_vfs
->vfs_flag
|= VFS_RDONLY
;
535 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_RW
);
536 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_RO
, NULL
, 0);
538 /* XXX locking on vfs_flag? */
539 zfsvfs
->z_vfs
->vfs_flag
&= ~VFS_RDONLY
;
540 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_RO
);
541 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_RW
, NULL
, 0);
546 setuid_changed_cb(void *arg
, uint64_t newval
)
548 zfsvfs_t
*zfsvfs
= arg
;
550 if (newval
== FALSE
) {
551 zfsvfs
->z_vfs
->vfs_flag
|= VFS_NOSETUID
;
552 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_SETUID
);
553 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_NOSETUID
, NULL
, 0);
555 zfsvfs
->z_vfs
->vfs_flag
&= ~VFS_NOSETUID
;
556 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_NOSETUID
);
557 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_SETUID
, NULL
, 0);
562 exec_changed_cb(void *arg
, uint64_t newval
)
564 zfsvfs_t
*zfsvfs
= arg
;
566 if (newval
== FALSE
) {
567 zfsvfs
->z_vfs
->vfs_flag
|= VFS_NOEXEC
;
568 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_EXEC
);
569 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_NOEXEC
, NULL
, 0);
571 zfsvfs
->z_vfs
->vfs_flag
&= ~VFS_NOEXEC
;
572 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_NOEXEC
);
573 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_EXEC
, NULL
, 0);
578 * The nbmand mount option can be changed at mount time.
579 * We can't allow it to be toggled on live file systems or incorrect
580 * behavior may be seen from cifs clients
582 * This property isn't registered via dsl_prop_register(), but this callback
583 * will be called when a file system is first mounted
586 nbmand_changed_cb(void *arg
, uint64_t newval
)
588 zfsvfs_t
*zfsvfs
= arg
;
589 if (newval
== FALSE
) {
590 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_NBMAND
);
591 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_NONBMAND
, NULL
, 0);
593 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_NONBMAND
);
594 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_NBMAND
, NULL
, 0);
599 snapdir_changed_cb(void *arg
, uint64_t newval
)
601 zfsvfs_t
*zfsvfs
= arg
;
603 zfsvfs
->z_show_ctldir
= newval
;
607 acl_mode_changed_cb(void *arg
, uint64_t newval
)
609 zfsvfs_t
*zfsvfs
= arg
;
611 zfsvfs
->z_acl_mode
= newval
;
615 acl_inherit_changed_cb(void *arg
, uint64_t newval
)
617 zfsvfs_t
*zfsvfs
= arg
;
619 zfsvfs
->z_acl_inherit
= newval
;
623 acl_type_changed_cb(void *arg
, uint64_t newval
)
625 zfsvfs_t
*zfsvfs
= arg
;
627 zfsvfs
->z_acl_type
= newval
;
631 zfs_register_callbacks(vfs_t
*vfsp
)
633 struct dsl_dataset
*ds
= NULL
;
635 zfsvfs_t
*zfsvfs
= NULL
;
637 boolean_t readonly
= B_FALSE
;
638 boolean_t do_readonly
= B_FALSE
;
639 boolean_t setuid
= B_FALSE
;
640 boolean_t do_setuid
= B_FALSE
;
641 boolean_t exec
= B_FALSE
;
642 boolean_t do_exec
= B_FALSE
;
643 boolean_t xattr
= B_FALSE
;
644 boolean_t atime
= B_FALSE
;
645 boolean_t do_atime
= B_FALSE
;
646 boolean_t do_xattr
= B_FALSE
;
649 ASSERT3P(vfsp
, !=, NULL
);
650 zfsvfs
= vfsp
->vfs_data
;
651 ASSERT3P(zfsvfs
, !=, NULL
);
655 * This function can be called for a snapshot when we update snapshot's
656 * mount point, which isn't really supported.
658 if (dmu_objset_is_snapshot(os
))
662 * The act of registering our callbacks will destroy any mount
663 * options we may have. In order to enable temporary overrides
664 * of mount options, we stash away the current values and
665 * restore them after we register the callbacks.
667 if (vfs_optionisset(vfsp
, MNTOPT_RO
, NULL
) ||
668 !spa_writeable(dmu_objset_spa(os
))) {
670 do_readonly
= B_TRUE
;
671 } else if (vfs_optionisset(vfsp
, MNTOPT_RW
, NULL
)) {
673 do_readonly
= B_TRUE
;
675 if (vfs_optionisset(vfsp
, MNTOPT_NOSETUID
, NULL
)) {
678 } else if (vfs_optionisset(vfsp
, MNTOPT_SETUID
, NULL
)) {
682 if (vfs_optionisset(vfsp
, MNTOPT_NOEXEC
, NULL
)) {
685 } else if (vfs_optionisset(vfsp
, MNTOPT_EXEC
, NULL
)) {
689 if (vfs_optionisset(vfsp
, MNTOPT_NOXATTR
, NULL
)) {
690 zfsvfs
->z_xattr
= xattr
= ZFS_XATTR_OFF
;
692 } else if (vfs_optionisset(vfsp
, MNTOPT_XATTR
, NULL
)) {
693 zfsvfs
->z_xattr
= xattr
= ZFS_XATTR_DIR
;
695 } else if (vfs_optionisset(vfsp
, MNTOPT_DIRXATTR
, NULL
)) {
696 zfsvfs
->z_xattr
= xattr
= ZFS_XATTR_DIR
;
698 } else if (vfs_optionisset(vfsp
, MNTOPT_SAXATTR
, NULL
)) {
699 zfsvfs
->z_xattr
= xattr
= ZFS_XATTR_SA
;
702 if (vfs_optionisset(vfsp
, MNTOPT_NOATIME
, NULL
)) {
705 } else if (vfs_optionisset(vfsp
, MNTOPT_ATIME
, NULL
)) {
711 * We need to enter pool configuration here, so that we can use
712 * dsl_prop_get_int_ds() to handle the special nbmand property below.
713 * dsl_prop_get_integer() can not be used, because it has to acquire
714 * spa_namespace_lock and we can not do that because we already hold
715 * z_teardown_lock. The problem is that spa_write_cachefile() is called
716 * with spa_namespace_lock held and the function calls ZFS vnode
717 * operations to write the cache file and thus z_teardown_lock is
718 * acquired after spa_namespace_lock.
720 ds
= dmu_objset_ds(os
);
721 dsl_pool_config_enter(dmu_objset_pool(os
), FTAG
);
724 * nbmand is a special property. It can only be changed at
727 * This is weird, but it is documented to only be changeable
730 if (vfs_optionisset(vfsp
, MNTOPT_NONBMAND
, NULL
)) {
732 } else if (vfs_optionisset(vfsp
, MNTOPT_NBMAND
, NULL
)) {
734 } else if ((error
= dsl_prop_get_int_ds(ds
, "nbmand", &nbmand
)) != 0) {
735 dsl_pool_config_exit(dmu_objset_pool(os
), FTAG
);
740 * Register property callbacks.
742 * It would probably be fine to just check for i/o error from
743 * the first prop_register(), but I guess I like to go
746 error
= dsl_prop_register(ds
,
747 zfs_prop_to_name(ZFS_PROP_ATIME
), atime_changed_cb
, zfsvfs
);
748 error
= error
? error
: dsl_prop_register(ds
,
749 zfs_prop_to_name(ZFS_PROP_XATTR
), xattr_changed_cb
, zfsvfs
);
750 error
= error
? error
: dsl_prop_register(ds
,
751 zfs_prop_to_name(ZFS_PROP_RECORDSIZE
), blksz_changed_cb
, zfsvfs
);
752 error
= error
? error
: dsl_prop_register(ds
,
753 zfs_prop_to_name(ZFS_PROP_READONLY
), readonly_changed_cb
, zfsvfs
);
754 error
= error
? error
: dsl_prop_register(ds
,
755 zfs_prop_to_name(ZFS_PROP_SETUID
), setuid_changed_cb
, zfsvfs
);
756 error
= error
? error
: dsl_prop_register(ds
,
757 zfs_prop_to_name(ZFS_PROP_EXEC
), exec_changed_cb
, zfsvfs
);
758 error
= error
? error
: dsl_prop_register(ds
,
759 zfs_prop_to_name(ZFS_PROP_SNAPDIR
), snapdir_changed_cb
, zfsvfs
);
760 error
= error
? error
: dsl_prop_register(ds
,
761 zfs_prop_to_name(ZFS_PROP_ACLTYPE
), acl_type_changed_cb
, zfsvfs
);
762 error
= error
? error
: dsl_prop_register(ds
,
763 zfs_prop_to_name(ZFS_PROP_ACLMODE
), acl_mode_changed_cb
, zfsvfs
);
764 error
= error
? error
: dsl_prop_register(ds
,
765 zfs_prop_to_name(ZFS_PROP_ACLINHERIT
), acl_inherit_changed_cb
,
767 dsl_pool_config_exit(dmu_objset_pool(os
), FTAG
);
772 * Invoke our callbacks to restore temporary mount options.
775 readonly_changed_cb(zfsvfs
, readonly
);
777 setuid_changed_cb(zfsvfs
, setuid
);
779 exec_changed_cb(zfsvfs
, exec
);
781 xattr_changed_cb(zfsvfs
, xattr
);
783 atime_changed_cb(zfsvfs
, atime
);
785 nbmand_changed_cb(zfsvfs
, nbmand
);
790 dsl_prop_unregister_all(ds
, zfsvfs
);
795 * Associate this zfsvfs with the given objset, which must be owned.
796 * This will cache a bunch of on-disk state from the objset in the
800 zfsvfs_init(zfsvfs_t
*zfsvfs
, objset_t
*os
)
805 zfsvfs
->z_max_blksz
= SPA_OLD_MAXBLOCKSIZE
;
806 zfsvfs
->z_show_ctldir
= ZFS_SNAPDIR_VISIBLE
;
809 error
= zfs_get_zplprop(os
, ZFS_PROP_VERSION
, &zfsvfs
->z_version
);
812 if (zfsvfs
->z_version
>
813 zfs_zpl_version_map(spa_version(dmu_objset_spa(os
)))) {
814 (void) printf("Can't mount a version %lld file system "
815 "on a version %lld pool\n. Pool must be upgraded to mount "
816 "this file system.", (u_longlong_t
)zfsvfs
->z_version
,
817 (u_longlong_t
)spa_version(dmu_objset_spa(os
)));
818 return (SET_ERROR(ENOTSUP
));
820 error
= zfs_get_zplprop(os
, ZFS_PROP_NORMALIZE
, &val
);
823 zfsvfs
->z_norm
= (int)val
;
825 error
= zfs_get_zplprop(os
, ZFS_PROP_UTF8ONLY
, &val
);
828 zfsvfs
->z_utf8
= (val
!= 0);
830 error
= zfs_get_zplprop(os
, ZFS_PROP_CASE
, &val
);
833 zfsvfs
->z_case
= (uint_t
)val
;
835 error
= zfs_get_zplprop(os
, ZFS_PROP_ACLTYPE
, &val
);
838 zfsvfs
->z_acl_type
= (uint_t
)val
;
841 * Fold case on file systems that are always or sometimes case
844 if (zfsvfs
->z_case
== ZFS_CASE_INSENSITIVE
||
845 zfsvfs
->z_case
== ZFS_CASE_MIXED
)
846 zfsvfs
->z_norm
|= U8_TEXTPREP_TOUPPER
;
848 zfsvfs
->z_use_fuids
= USE_FUIDS(zfsvfs
->z_version
, zfsvfs
->z_os
);
849 zfsvfs
->z_use_sa
= USE_SA(zfsvfs
->z_version
, zfsvfs
->z_os
);
852 if (zfsvfs
->z_use_sa
) {
853 /* should either have both of these objects or none */
854 error
= zap_lookup(os
, MASTER_NODE_OBJ
, ZFS_SA_ATTRS
, 8, 1,
859 error
= zfs_get_zplprop(os
, ZFS_PROP_XATTR
, &val
);
860 if (error
== 0 && val
== ZFS_XATTR_SA
)
861 zfsvfs
->z_xattr_sa
= B_TRUE
;
864 error
= sa_setup(os
, sa_obj
, zfs_attr_table
, ZPL_END
,
865 &zfsvfs
->z_attr_table
);
869 if (zfsvfs
->z_version
>= ZPL_VERSION_SA
)
870 sa_register_update_callback(os
, zfs_sa_upgrade
);
872 error
= zap_lookup(os
, MASTER_NODE_OBJ
, ZFS_ROOT_OBJ
, 8, 1,
876 ASSERT3U(zfsvfs
->z_root
, !=, 0);
878 error
= zap_lookup(os
, MASTER_NODE_OBJ
, ZFS_UNLINKED_SET
, 8, 1,
879 &zfsvfs
->z_unlinkedobj
);
883 error
= zap_lookup(os
, MASTER_NODE_OBJ
,
884 zfs_userquota_prop_prefixes
[ZFS_PROP_USERQUOTA
],
885 8, 1, &zfsvfs
->z_userquota_obj
);
887 zfsvfs
->z_userquota_obj
= 0;
891 error
= zap_lookup(os
, MASTER_NODE_OBJ
,
892 zfs_userquota_prop_prefixes
[ZFS_PROP_GROUPQUOTA
],
893 8, 1, &zfsvfs
->z_groupquota_obj
);
895 zfsvfs
->z_groupquota_obj
= 0;
899 error
= zap_lookup(os
, MASTER_NODE_OBJ
,
900 zfs_userquota_prop_prefixes
[ZFS_PROP_PROJECTQUOTA
],
901 8, 1, &zfsvfs
->z_projectquota_obj
);
903 zfsvfs
->z_projectquota_obj
= 0;
907 error
= zap_lookup(os
, MASTER_NODE_OBJ
,
908 zfs_userquota_prop_prefixes
[ZFS_PROP_USEROBJQUOTA
],
909 8, 1, &zfsvfs
->z_userobjquota_obj
);
911 zfsvfs
->z_userobjquota_obj
= 0;
915 error
= zap_lookup(os
, MASTER_NODE_OBJ
,
916 zfs_userquota_prop_prefixes
[ZFS_PROP_GROUPOBJQUOTA
],
917 8, 1, &zfsvfs
->z_groupobjquota_obj
);
919 zfsvfs
->z_groupobjquota_obj
= 0;
923 error
= zap_lookup(os
, MASTER_NODE_OBJ
,
924 zfs_userquota_prop_prefixes
[ZFS_PROP_PROJECTOBJQUOTA
],
925 8, 1, &zfsvfs
->z_projectobjquota_obj
);
927 zfsvfs
->z_projectobjquota_obj
= 0;
931 error
= zap_lookup(os
, MASTER_NODE_OBJ
, ZFS_FUID_TABLES
, 8, 1,
932 &zfsvfs
->z_fuid_obj
);
934 zfsvfs
->z_fuid_obj
= 0;
938 error
= zap_lookup(os
, MASTER_NODE_OBJ
, ZFS_SHARES_DIR
, 8, 1,
939 &zfsvfs
->z_shares_dir
);
941 zfsvfs
->z_shares_dir
= 0;
946 * Only use the name cache if we are looking for a
947 * name on a file system that does not require normalization
948 * or case folding. We can also look there if we happen to be
949 * on a non-normalizing, mixed sensitivity file system IF we
950 * are looking for the exact name (which is always the case on
953 zfsvfs
->z_use_namecache
= !zfsvfs
->z_norm
||
954 ((zfsvfs
->z_case
== ZFS_CASE_MIXED
) &&
955 !(zfsvfs
->z_norm
& ~U8_TEXTPREP_TOUPPER
));
960 taskq_t
*zfsvfs_taskq
;
963 zfsvfs_task_unlinked_drain(void *context
, int pending __unused
)
966 zfs_unlinked_drain((zfsvfs_t
*)context
);
970 zfsvfs_create(const char *osname
, boolean_t readonly
, zfsvfs_t
**zfvp
)
975 boolean_t ro
= (readonly
|| (strchr(osname
, '@') != NULL
));
978 * XXX: Fix struct statfs so this isn't necessary!
980 * The 'osname' is used as the filesystem's special node, which means
981 * it must fit in statfs.f_mntfromname, or else it can't be
982 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
983 * 'zfs unmount' to think it's not mounted when it is.
985 if (strlen(osname
) >= MNAMELEN
)
986 return (SET_ERROR(ENAMETOOLONG
));
988 zfsvfs
= kmem_zalloc(sizeof (zfsvfs_t
), KM_SLEEP
);
990 error
= dmu_objset_own(osname
, DMU_OST_ZFS
, ro
, B_TRUE
, zfsvfs
,
993 kmem_free(zfsvfs
, sizeof (zfsvfs_t
));
997 error
= zfsvfs_create_impl(zfvp
, zfsvfs
, os
);
1004 zfsvfs_create_impl(zfsvfs_t
**zfvp
, zfsvfs_t
*zfsvfs
, objset_t
*os
)
1008 zfsvfs
->z_vfs
= NULL
;
1009 zfsvfs
->z_parent
= zfsvfs
;
1011 mutex_init(&zfsvfs
->z_znodes_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1012 mutex_init(&zfsvfs
->z_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1013 list_create(&zfsvfs
->z_all_znodes
, sizeof (znode_t
),
1014 offsetof(znode_t
, z_link_node
));
1015 TASK_INIT(&zfsvfs
->z_unlinked_drain_task
, 0,
1016 zfsvfs_task_unlinked_drain
, zfsvfs
);
1017 ZFS_TEARDOWN_INIT(zfsvfs
);
1018 ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs
);
1019 rw_init(&zfsvfs
->z_fuid_lock
, NULL
, RW_DEFAULT
, NULL
);
1020 for (int i
= 0; i
!= ZFS_OBJ_MTX_SZ
; i
++)
1021 mutex_init(&zfsvfs
->z_hold_mtx
[i
], NULL
, MUTEX_DEFAULT
, NULL
);
1023 error
= zfsvfs_init(zfsvfs
, os
);
1025 dmu_objset_disown(os
, B_TRUE
, zfsvfs
);
1027 kmem_free(zfsvfs
, sizeof (zfsvfs_t
));
1036 zfsvfs_setup(zfsvfs_t
*zfsvfs
, boolean_t mounting
)
1041 * Check for a bad on-disk format version now since we
1042 * lied about owning the dataset readonly before.
1044 if (!(zfsvfs
->z_vfs
->vfs_flag
& VFS_RDONLY
) &&
1045 dmu_objset_incompatible_encryption_version(zfsvfs
->z_os
))
1046 return (SET_ERROR(EROFS
));
1048 error
= zfs_register_callbacks(zfsvfs
->z_vfs
);
1053 * If we are not mounting (ie: online recv), then we don't
1054 * have to worry about replaying the log as we blocked all
1055 * operations out since we closed the ZIL.
1060 ASSERT3P(zfsvfs
->z_kstat
.dk_kstats
, ==, NULL
);
1061 error
= dataset_kstats_create(&zfsvfs
->z_kstat
, zfsvfs
->z_os
);
1064 zfsvfs
->z_log
= zil_open(zfsvfs
->z_os
, zfs_get_data
,
1065 &zfsvfs
->z_kstat
.dk_zil_sums
);
1068 * During replay we remove the read only flag to
1069 * allow replays to succeed.
1071 readonly
= zfsvfs
->z_vfs
->vfs_flag
& VFS_RDONLY
;
1072 if (readonly
!= 0) {
1073 zfsvfs
->z_vfs
->vfs_flag
&= ~VFS_RDONLY
;
1078 if (zap_get_stats(zfsvfs
->z_os
, zfsvfs
->z_unlinkedobj
,
1080 dataset_kstats_update_nunlinks_kstat(
1081 &zfsvfs
->z_kstat
, zs
.zs_num_entries
);
1082 dprintf_ds(zfsvfs
->z_os
->os_dsl_dataset
,
1083 "num_entries in unlinked set: %llu",
1084 (u_longlong_t
)zs
.zs_num_entries
);
1087 zfs_unlinked_drain(zfsvfs
);
1088 dd
= zfsvfs
->z_os
->os_dsl_dataset
->ds_dir
;
1089 dd
->dd_activity_cancelled
= B_FALSE
;
1093 * Parse and replay the intent log.
1095 * Because of ziltest, this must be done after
1096 * zfs_unlinked_drain(). (Further note: ziltest
1097 * doesn't use readonly mounts, where
1098 * zfs_unlinked_drain() isn't called.) This is because
1099 * ziltest causes spa_sync() to think it's committed,
1100 * but actually it is not, so the intent log contains
1101 * many txg's worth of changes.
1103 * In particular, if object N is in the unlinked set in
1104 * the last txg to actually sync, then it could be
1105 * actually freed in a later txg and then reallocated
1106 * in a yet later txg. This would write a "create
1107 * object N" record to the intent log. Normally, this
1108 * would be fine because the spa_sync() would have
1109 * written out the fact that object N is free, before
1110 * we could write the "create object N" intent log
1113 * But when we are in ziltest mode, we advance the "open
1114 * txg" without actually spa_sync()-ing the changes to
1115 * disk. So we would see that object N is still
1116 * allocated and in the unlinked set, and there is an
1117 * intent log record saying to allocate it.
1119 if (spa_writeable(dmu_objset_spa(zfsvfs
->z_os
))) {
1120 if (zil_replay_disable
) {
1121 zil_destroy(zfsvfs
->z_log
, B_FALSE
);
1123 boolean_t use_nc
= zfsvfs
->z_use_namecache
;
1124 zfsvfs
->z_use_namecache
= B_FALSE
;
1125 zfsvfs
->z_replay
= B_TRUE
;
1126 zil_replay(zfsvfs
->z_os
, zfsvfs
,
1128 zfsvfs
->z_replay
= B_FALSE
;
1129 zfsvfs
->z_use_namecache
= use_nc
;
1133 /* restore readonly bit */
1135 zfsvfs
->z_vfs
->vfs_flag
|= VFS_RDONLY
;
1137 ASSERT3P(zfsvfs
->z_kstat
.dk_kstats
, !=, NULL
);
1138 zfsvfs
->z_log
= zil_open(zfsvfs
->z_os
, zfs_get_data
,
1139 &zfsvfs
->z_kstat
.dk_zil_sums
);
1143 * Set the objset user_ptr to track its zfsvfs.
1145 mutex_enter(&zfsvfs
->z_os
->os_user_ptr_lock
);
1146 dmu_objset_set_user(zfsvfs
->z_os
, zfsvfs
);
1147 mutex_exit(&zfsvfs
->z_os
->os_user_ptr_lock
);
1153 zfsvfs_free(zfsvfs_t
*zfsvfs
)
1157 zfs_fuid_destroy(zfsvfs
);
1159 mutex_destroy(&zfsvfs
->z_znodes_lock
);
1160 mutex_destroy(&zfsvfs
->z_lock
);
1161 list_destroy(&zfsvfs
->z_all_znodes
);
1162 ZFS_TEARDOWN_DESTROY(zfsvfs
);
1163 ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs
);
1164 rw_destroy(&zfsvfs
->z_fuid_lock
);
1165 for (i
= 0; i
!= ZFS_OBJ_MTX_SZ
; i
++)
1166 mutex_destroy(&zfsvfs
->z_hold_mtx
[i
]);
1167 dataset_kstats_destroy(&zfsvfs
->z_kstat
);
1168 kmem_free(zfsvfs
, sizeof (zfsvfs_t
));
1172 zfs_set_fuid_feature(zfsvfs_t
*zfsvfs
)
1174 zfsvfs
->z_use_fuids
= USE_FUIDS(zfsvfs
->z_version
, zfsvfs
->z_os
);
1175 zfsvfs
->z_use_sa
= USE_SA(zfsvfs
->z_version
, zfsvfs
->z_os
);
1179 zfs_domount(vfs_t
*vfsp
, char *osname
)
1181 uint64_t recordsize
, fsid_guid
;
1185 ASSERT3P(vfsp
, !=, NULL
);
1186 ASSERT3P(osname
, !=, NULL
);
1188 error
= zfsvfs_create(osname
, vfsp
->mnt_flag
& MNT_RDONLY
, &zfsvfs
);
1191 zfsvfs
->z_vfs
= vfsp
;
1193 if ((error
= dsl_prop_get_integer(osname
,
1194 "recordsize", &recordsize
, NULL
)))
1196 zfsvfs
->z_vfs
->vfs_bsize
= SPA_MINBLOCKSIZE
;
1197 zfsvfs
->z_vfs
->mnt_stat
.f_iosize
= recordsize
;
1199 vfsp
->vfs_data
= zfsvfs
;
1200 vfsp
->mnt_flag
|= MNT_LOCAL
;
1201 vfsp
->mnt_kern_flag
|= MNTK_LOOKUP_SHARED
;
1202 vfsp
->mnt_kern_flag
|= MNTK_SHARED_WRITES
;
1203 vfsp
->mnt_kern_flag
|= MNTK_EXTENDED_SHARED
;
1205 * This can cause a loss of coherence between ARC and page cache
1206 * on ZoF - unclear if the problem is in FreeBSD or ZoF
1208 vfsp
->mnt_kern_flag
|= MNTK_NO_IOPF
; /* vn_io_fault can be used */
1209 vfsp
->mnt_kern_flag
|= MNTK_NOMSYNC
;
1210 vfsp
->mnt_kern_flag
|= MNTK_VMSETSIZE_BUG
;
1212 #if defined(_KERNEL) && !defined(KMEM_DEBUG)
1213 vfsp
->mnt_kern_flag
|= MNTK_FPLOOKUP
;
1216 * The fsid is 64 bits, composed of an 8-bit fs type, which
1217 * separates our fsid from any other filesystem types, and a
1218 * 56-bit objset unique ID. The objset unique ID is unique to
1219 * all objsets open on this system, provided by unique_create().
1220 * The 8-bit fs type must be put in the low bits of fsid[1]
1221 * because that's where other Solaris filesystems put it.
1223 fsid_guid
= dmu_objset_fsid_guid(zfsvfs
->z_os
);
1224 ASSERT3U((fsid_guid
& ~((1ULL << 56) - 1)), ==, 0);
1225 vfsp
->vfs_fsid
.val
[0] = fsid_guid
;
1226 vfsp
->vfs_fsid
.val
[1] = ((fsid_guid
>> 32) << 8) |
1227 (vfsp
->mnt_vfc
->vfc_typenum
& 0xFF);
1230 * Set features for file system.
1232 zfs_set_fuid_feature(zfsvfs
);
1234 if (dmu_objset_is_snapshot(zfsvfs
->z_os
)) {
1237 atime_changed_cb(zfsvfs
, B_FALSE
);
1238 readonly_changed_cb(zfsvfs
, B_TRUE
);
1239 if ((error
= dsl_prop_get_integer(osname
,
1240 "xattr", &pval
, NULL
)))
1242 xattr_changed_cb(zfsvfs
, pval
);
1243 if ((error
= dsl_prop_get_integer(osname
,
1244 "acltype", &pval
, NULL
)))
1246 acl_type_changed_cb(zfsvfs
, pval
);
1247 zfsvfs
->z_issnap
= B_TRUE
;
1248 zfsvfs
->z_os
->os_sync
= ZFS_SYNC_DISABLED
;
1250 mutex_enter(&zfsvfs
->z_os
->os_user_ptr_lock
);
1251 dmu_objset_set_user(zfsvfs
->z_os
, zfsvfs
);
1252 mutex_exit(&zfsvfs
->z_os
->os_user_ptr_lock
);
1254 if ((error
= zfsvfs_setup(zfsvfs
, B_TRUE
)))
1258 vfs_mountedfrom(vfsp
, osname
);
1260 if (!zfsvfs
->z_issnap
)
1261 zfsctl_create(zfsvfs
);
1264 dmu_objset_disown(zfsvfs
->z_os
, B_TRUE
, zfsvfs
);
1265 zfsvfs_free(zfsvfs
);
1267 atomic_inc_32(&zfs_active_fs_count
);
1274 zfs_unregister_callbacks(zfsvfs_t
*zfsvfs
)
1276 objset_t
*os
= zfsvfs
->z_os
;
1278 if (!dmu_objset_is_snapshot(os
))
1279 dsl_prop_unregister_all(dmu_objset_ds(os
), zfsvfs
);
1283 getpoolname(const char *osname
, char *poolname
)
1287 p
= strchr(osname
, '/');
1289 if (strlen(osname
) >= MAXNAMELEN
)
1290 return (ENAMETOOLONG
);
1291 (void) strcpy(poolname
, osname
);
1293 if (p
- osname
>= MAXNAMELEN
)
1294 return (ENAMETOOLONG
);
1295 (void) strlcpy(poolname
, osname
, p
- osname
+ 1);
1301 fetch_osname_options(char *name
, bool *checkpointrewind
)
1304 if (name
[0] == '!') {
1305 *checkpointrewind
= true;
1306 memmove(name
, name
+ 1, strlen(name
));
1308 *checkpointrewind
= false;
1313 zfs_mount(vfs_t
*vfsp
)
1315 kthread_t
*td
= curthread
;
1316 vnode_t
*mvp
= vfsp
->mnt_vnodecovered
;
1317 cred_t
*cr
= td
->td_ucred
;
1321 bool checkpointrewind
, isctlsnap
= false;
1323 if (vfs_getopt(vfsp
->mnt_optnew
, "from", (void **)&osname
, NULL
))
1324 return (SET_ERROR(EINVAL
));
1327 * If full-owner-access is enabled and delegated administration is
1328 * turned on, we must set nosuid.
1330 if (zfs_super_owner
&&
1331 dsl_deleg_access(osname
, ZFS_DELEG_PERM_MOUNT
, cr
) != ECANCELED
) {
1332 secpolicy_fs_mount_clearopts(cr
, vfsp
);
1335 fetch_osname_options(osname
, &checkpointrewind
);
1336 isctlsnap
= (mvp
!= NULL
&& zfsctl_is_node(mvp
) &&
1337 strchr(osname
, '@') != NULL
);
1340 * Check for mount privilege?
1342 * If we don't have privilege then see if
1343 * we have local permission to allow it
1345 error
= secpolicy_fs_mount(cr
, mvp
, vfsp
);
1346 if (error
&& isctlsnap
) {
1347 secpolicy_fs_mount_clearopts(cr
, vfsp
);
1349 if (dsl_deleg_access(osname
, ZFS_DELEG_PERM_MOUNT
, cr
) != 0)
1352 if (!(vfsp
->vfs_flag
& MS_REMOUNT
)) {
1356 * Make sure user is the owner of the mount point
1357 * or has sufficient privileges.
1360 vattr
.va_mask
= AT_UID
;
1362 vn_lock(mvp
, LK_SHARED
| LK_RETRY
);
1363 if (VOP_GETATTR(mvp
, &vattr
, cr
)) {
1368 if (secpolicy_vnode_owner(mvp
, cr
, vattr
.va_uid
) != 0 &&
1369 VOP_ACCESS(mvp
, VWRITE
, cr
, td
) != 0) {
1376 secpolicy_fs_mount_clearopts(cr
, vfsp
);
1380 * Refuse to mount a filesystem if we are in a local zone and the
1381 * dataset is not visible.
1383 if (!INGLOBALZONE(curproc
) &&
1384 (!zone_dataset_visible(osname
, &canwrite
) || !canwrite
)) {
1385 boolean_t mount_snapshot
= B_FALSE
;
1388 * Snapshots may be mounted in .zfs for unjailed datasets
1389 * if allowed by the jail param zfs.mount_snapshot.
1393 struct zfs_jailparam
*zjp
;
1395 pr
= curthread
->td_ucred
->cr_prison
;
1396 mtx_lock(&pr
->pr_mtx
);
1397 zjp
= osd_jail_get(pr
, zfs_jailparam_slot
);
1398 mtx_unlock(&pr
->pr_mtx
);
1399 if (zjp
&& zjp
->mount_snapshot
)
1400 mount_snapshot
= B_TRUE
;
1402 if (!mount_snapshot
) {
1403 error
= SET_ERROR(EPERM
);
1408 vfsp
->vfs_flag
|= MNT_NFS4ACLS
;
1411 * When doing a remount, we simply refresh our temporary properties
1412 * according to those options set in the current VFS options.
1414 if (vfsp
->vfs_flag
& MS_REMOUNT
) {
1415 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1418 * Refresh mount options with z_teardown_lock blocking I/O while
1419 * the filesystem is in an inconsistent state.
1420 * The lock also serializes this code with filesystem
1421 * manipulations between entry to zfs_suspend_fs() and return
1422 * from zfs_resume_fs().
1424 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs
, FTAG
);
1425 zfs_unregister_callbacks(zfsvfs
);
1426 error
= zfs_register_callbacks(vfsp
);
1427 ZFS_TEARDOWN_EXIT(zfsvfs
, FTAG
);
1431 /* Initial root mount: try hard to import the requested root pool. */
1432 if ((vfsp
->vfs_flag
& MNT_ROOTFS
) != 0 &&
1433 (vfsp
->vfs_flag
& MNT_UPDATE
) == 0) {
1434 char pname
[MAXNAMELEN
];
1436 error
= getpoolname(osname
, pname
);
1438 error
= spa_import_rootpool(pname
, checkpointrewind
);
1443 error
= zfs_domount(vfsp
, osname
);
1451 zfs_statfs(vfs_t
*vfsp
, struct statfs
*statp
)
1453 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1454 uint64_t refdbytes
, availbytes
, usedobjs
, availobjs
;
1457 statp
->f_version
= STATFS_VERSION
;
1459 if ((error
= zfs_enter(zfsvfs
, FTAG
)) != 0)
1462 dmu_objset_space(zfsvfs
->z_os
,
1463 &refdbytes
, &availbytes
, &usedobjs
, &availobjs
);
1466 * The underlying storage pool actually uses multiple block sizes.
1467 * We report the fragsize as the smallest block size we support,
1468 * and we report our blocksize as the filesystem's maximum blocksize.
1470 statp
->f_bsize
= SPA_MINBLOCKSIZE
;
1471 statp
->f_iosize
= zfsvfs
->z_vfs
->mnt_stat
.f_iosize
;
1474 * The following report "total" blocks of various kinds in the
1475 * file system, but reported in terms of f_frsize - the
1479 statp
->f_blocks
= (refdbytes
+ availbytes
) >> SPA_MINBLOCKSHIFT
;
1480 statp
->f_bfree
= availbytes
/ statp
->f_bsize
;
1481 statp
->f_bavail
= statp
->f_bfree
; /* no root reservation */
1484 * statvfs() should really be called statufs(), because it assumes
1485 * static metadata. ZFS doesn't preallocate files, so the best
1486 * we can do is report the max that could possibly fit in f_files,
1487 * and that minus the number actually used in f_ffree.
1488 * For f_ffree, report the smaller of the number of object available
1489 * and the number of blocks (each object will take at least a block).
1491 statp
->f_ffree
= MIN(availobjs
, statp
->f_bfree
);
1492 statp
->f_files
= statp
->f_ffree
+ usedobjs
;
1495 * We're a zfs filesystem.
1497 strlcpy(statp
->f_fstypename
, "zfs",
1498 sizeof (statp
->f_fstypename
));
1500 strlcpy(statp
->f_mntfromname
, vfsp
->mnt_stat
.f_mntfromname
,
1501 sizeof (statp
->f_mntfromname
));
1502 strlcpy(statp
->f_mntonname
, vfsp
->mnt_stat
.f_mntonname
,
1503 sizeof (statp
->f_mntonname
));
1505 statp
->f_namemax
= MAXNAMELEN
- 1;
1507 zfs_exit(zfsvfs
, FTAG
);
1512 zfs_root(vfs_t
*vfsp
, int flags
, vnode_t
**vpp
)
1514 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1518 if ((error
= zfs_enter(zfsvfs
, FTAG
)) != 0)
1521 error
= zfs_zget(zfsvfs
, zfsvfs
->z_root
, &rootzp
);
1523 *vpp
= ZTOV(rootzp
);
1525 zfs_exit(zfsvfs
, FTAG
);
1528 error
= vn_lock(*vpp
, flags
);
1538 * Teardown the zfsvfs::z_os.
1540 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
1541 * and 'z_teardown_inactive_lock' held.
1544 zfsvfs_teardown(zfsvfs_t
*zfsvfs
, boolean_t unmounting
)
1550 * If someone has not already unmounted this file system,
1551 * drain the zrele_taskq to ensure all active references to the
1552 * zfsvfs_t have been handled only then can it be safely destroyed.
1556 * If we're unmounting we have to wait for the list to
1559 * If we're not unmounting there's no guarantee the list
1560 * will drain completely, but zreles run from the taskq
1561 * may add the parents of dir-based xattrs to the taskq
1562 * so we want to wait for these.
1564 * We can safely check z_all_znodes for being empty because the
1565 * VFS has already blocked operations which add to it.
1568 while (!list_is_empty(&zfsvfs
->z_all_znodes
)) {
1569 taskq_wait_outstanding(dsl_pool_zrele_taskq(
1570 dmu_objset_pool(zfsvfs
->z_os
)), 0);
1571 if (++round
> 1 && !unmounting
)
1575 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs
, FTAG
);
1579 * We purge the parent filesystem's vfsp as the parent
1580 * filesystem and all of its snapshots have their vnode's
1581 * v_vfsp set to the parent's filesystem's vfsp. Note,
1582 * 'z_parent' is self referential for non-snapshots.
1584 #ifdef FREEBSD_NAMECACHE
1585 #if __FreeBSD_version >= 1300117
1586 cache_purgevfs(zfsvfs
->z_parent
->z_vfs
);
1588 cache_purgevfs(zfsvfs
->z_parent
->z_vfs
, true);
1594 * Close the zil. NB: Can't close the zil while zfs_inactive
1595 * threads are blocked as zil_close can call zfs_inactive.
1597 if (zfsvfs
->z_log
) {
1598 zil_close(zfsvfs
->z_log
);
1599 zfsvfs
->z_log
= NULL
;
1602 ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs
);
1605 * If we are not unmounting (ie: online recv) and someone already
1606 * unmounted this file system while we were doing the switcheroo,
1607 * or a reopen of z_os failed then just bail out now.
1609 if (!unmounting
&& (zfsvfs
->z_unmounted
|| zfsvfs
->z_os
== NULL
)) {
1610 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs
);
1611 ZFS_TEARDOWN_EXIT(zfsvfs
, FTAG
);
1612 return (SET_ERROR(EIO
));
1616 * At this point there are no vops active, and any new vops will
1617 * fail with EIO since we have z_teardown_lock for writer (only
1618 * relevant for forced unmount).
1620 * Release all holds on dbufs.
1622 mutex_enter(&zfsvfs
->z_znodes_lock
);
1623 for (zp
= list_head(&zfsvfs
->z_all_znodes
); zp
!= NULL
;
1624 zp
= list_next(&zfsvfs
->z_all_znodes
, zp
)) {
1625 if (zp
->z_sa_hdl
!= NULL
) {
1626 zfs_znode_dmu_fini(zp
);
1629 mutex_exit(&zfsvfs
->z_znodes_lock
);
1632 * If we are unmounting, set the unmounted flag and let new vops
1633 * unblock. zfs_inactive will have the unmounted behavior, and all
1634 * other vops will fail with EIO.
1637 zfsvfs
->z_unmounted
= B_TRUE
;
1638 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs
);
1639 ZFS_TEARDOWN_EXIT(zfsvfs
, FTAG
);
1643 * z_os will be NULL if there was an error in attempting to reopen
1644 * zfsvfs, so just return as the properties had already been
1645 * unregistered and cached data had been evicted before.
1647 if (zfsvfs
->z_os
== NULL
)
1651 * Unregister properties.
1653 zfs_unregister_callbacks(zfsvfs
);
1658 if (!zfs_is_readonly(zfsvfs
))
1659 txg_wait_synced(dmu_objset_pool(zfsvfs
->z_os
), 0);
1660 dmu_objset_evict_dbufs(zfsvfs
->z_os
);
1661 dd
= zfsvfs
->z_os
->os_dsl_dataset
->ds_dir
;
1662 dsl_dir_cancel_waiters(dd
);
1668 zfs_umount(vfs_t
*vfsp
, int fflag
)
1670 kthread_t
*td
= curthread
;
1671 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1673 cred_t
*cr
= td
->td_ucred
;
1676 ret
= secpolicy_fs_unmount(cr
, vfsp
);
1678 if (dsl_deleg_access((char *)vfsp
->vfs_resource
,
1679 ZFS_DELEG_PERM_MOUNT
, cr
))
1684 * Unmount any snapshots mounted under .zfs before unmounting the
1687 if (zfsvfs
->z_ctldir
!= NULL
) {
1688 if ((ret
= zfsctl_umount_snapshots(vfsp
, fflag
, cr
)) != 0)
1692 if (fflag
& MS_FORCE
) {
1694 * Mark file system as unmounted before calling
1695 * vflush(FORCECLOSE). This way we ensure no future vnops
1696 * will be called and risk operating on DOOMED vnodes.
1698 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs
, FTAG
);
1699 zfsvfs
->z_unmounted
= B_TRUE
;
1700 ZFS_TEARDOWN_EXIT(zfsvfs
, FTAG
);
1704 * Flush all the files.
1706 ret
= vflush(vfsp
, 0, (fflag
& MS_FORCE
) ? FORCECLOSE
: 0, td
);
1709 while (taskqueue_cancel(zfsvfs_taskq
->tq_queue
,
1710 &zfsvfs
->z_unlinked_drain_task
, NULL
) != 0)
1711 taskqueue_drain(zfsvfs_taskq
->tq_queue
,
1712 &zfsvfs
->z_unlinked_drain_task
);
1714 VERIFY0(zfsvfs_teardown(zfsvfs
, B_TRUE
));
1718 * z_os will be NULL if there was an error in
1719 * attempting to reopen zfsvfs.
1723 * Unset the objset user_ptr.
1725 mutex_enter(&os
->os_user_ptr_lock
);
1726 dmu_objset_set_user(os
, NULL
);
1727 mutex_exit(&os
->os_user_ptr_lock
);
1730 * Finally release the objset
1732 dmu_objset_disown(os
, B_TRUE
, zfsvfs
);
1736 * We can now safely destroy the '.zfs' directory node.
1738 if (zfsvfs
->z_ctldir
!= NULL
)
1739 zfsctl_destroy(zfsvfs
);
1746 zfs_vget(vfs_t
*vfsp
, ino_t ino
, int flags
, vnode_t
**vpp
)
1748 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1753 * zfs_zget() can't operate on virtual entries like .zfs/ or
1754 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
1755 * This will make NFS to switch to LOOKUP instead of using VGET.
1757 if (ino
== ZFSCTL_INO_ROOT
|| ino
== ZFSCTL_INO_SNAPDIR
||
1758 (zfsvfs
->z_shares_dir
!= 0 && ino
== zfsvfs
->z_shares_dir
))
1759 return (EOPNOTSUPP
);
1761 if ((err
= zfs_enter(zfsvfs
, FTAG
)) != 0)
1763 err
= zfs_zget(zfsvfs
, ino
, &zp
);
1764 if (err
== 0 && zp
->z_unlinked
) {
1770 zfs_exit(zfsvfs
, FTAG
);
1772 err
= vn_lock(*vpp
, flags
);
1782 #if __FreeBSD_version >= 1300098
1783 zfs_checkexp(vfs_t
*vfsp
, struct sockaddr
*nam
, uint64_t *extflagsp
,
1784 struct ucred
**credanonp
, int *numsecflavors
, int *secflavors
)
1786 zfs_checkexp(vfs_t
*vfsp
, struct sockaddr
*nam
, int *extflagsp
,
1787 struct ucred
**credanonp
, int *numsecflavors
, int **secflavors
)
1790 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1793 * If this is regular file system vfsp is the same as
1794 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1795 * zfsvfs->z_parent->z_vfs represents parent file system
1796 * which we have to use here, because only this file system
1797 * has mnt_export configured.
1799 return (vfs_stdcheckexp(zfsvfs
->z_parent
->z_vfs
, nam
, extflagsp
,
1800 credanonp
, numsecflavors
, secflavors
));
1803 _Static_assert(sizeof (struct fid
) >= SHORT_FID_LEN
,
1804 "struct fid bigger than SHORT_FID_LEN");
1805 _Static_assert(sizeof (struct fid
) >= LONG_FID_LEN
,
1806 "struct fid bigger than LONG_FID_LEN");
1809 zfs_fhtovp(vfs_t
*vfsp
, fid_t
*fidp
, int flags
, vnode_t
**vpp
)
1811 struct componentname cn
;
1812 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1815 uint64_t object
= 0;
1816 uint64_t fid_gen
= 0;
1817 uint64_t setgen
= 0;
1824 if ((err
= zfs_enter(zfsvfs
, FTAG
)) != 0)
1828 * On FreeBSD we can get snapshot's mount point or its parent file
1829 * system mount point depending if snapshot is already mounted or not.
1831 if (zfsvfs
->z_parent
== zfsvfs
&& fidp
->fid_len
== LONG_FID_LEN
) {
1832 zfid_long_t
*zlfid
= (zfid_long_t
*)fidp
;
1833 uint64_t objsetid
= 0;
1835 for (i
= 0; i
< sizeof (zlfid
->zf_setid
); i
++)
1836 objsetid
|= ((uint64_t)zlfid
->zf_setid
[i
]) << (8 * i
);
1838 for (i
= 0; i
< sizeof (zlfid
->zf_setgen
); i
++)
1839 setgen
|= ((uint64_t)zlfid
->zf_setgen
[i
]) << (8 * i
);
1841 zfs_exit(zfsvfs
, FTAG
);
1843 err
= zfsctl_lookup_objset(vfsp
, objsetid
, &zfsvfs
);
1845 return (SET_ERROR(EINVAL
));
1846 if ((err
= zfs_enter(zfsvfs
, FTAG
)) != 0)
1850 if (fidp
->fid_len
== SHORT_FID_LEN
|| fidp
->fid_len
== LONG_FID_LEN
) {
1851 zfid_short_t
*zfid
= (zfid_short_t
*)fidp
;
1853 for (i
= 0; i
< sizeof (zfid
->zf_object
); i
++)
1854 object
|= ((uint64_t)zfid
->zf_object
[i
]) << (8 * i
);
1856 for (i
= 0; i
< sizeof (zfid
->zf_gen
); i
++)
1857 fid_gen
|= ((uint64_t)zfid
->zf_gen
[i
]) << (8 * i
);
1859 zfs_exit(zfsvfs
, FTAG
);
1860 return (SET_ERROR(EINVAL
));
1863 if (fidp
->fid_len
== LONG_FID_LEN
&& setgen
!= 0) {
1864 zfs_exit(zfsvfs
, FTAG
);
1865 dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n",
1866 (u_longlong_t
)fid_gen
, (u_longlong_t
)setgen
);
1867 return (SET_ERROR(EINVAL
));
1871 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
1872 * directory tree. If the object == zfsvfs->z_shares_dir, then
1873 * we are in the .zfs/shares directory tree.
1875 if ((fid_gen
== 0 &&
1876 (object
== ZFSCTL_INO_ROOT
|| object
== ZFSCTL_INO_SNAPDIR
)) ||
1877 (zfsvfs
->z_shares_dir
!= 0 && object
== zfsvfs
->z_shares_dir
)) {
1878 zfs_exit(zfsvfs
, FTAG
);
1879 VERIFY0(zfsctl_root(zfsvfs
, LK_SHARED
, &dvp
));
1880 if (object
== ZFSCTL_INO_SNAPDIR
) {
1881 cn
.cn_nameptr
= "snapshot";
1882 cn
.cn_namelen
= strlen(cn
.cn_nameptr
);
1883 cn
.cn_nameiop
= LOOKUP
;
1884 cn
.cn_flags
= ISLASTCN
| LOCKLEAF
;
1885 cn
.cn_lkflags
= flags
;
1886 VERIFY0(VOP_LOOKUP(dvp
, vpp
, &cn
));
1888 } else if (object
== zfsvfs
->z_shares_dir
) {
1890 * XXX This branch must not be taken,
1891 * if it is, then the lookup below will
1894 cn
.cn_nameptr
= "shares";
1895 cn
.cn_namelen
= strlen(cn
.cn_nameptr
);
1896 cn
.cn_nameiop
= LOOKUP
;
1897 cn
.cn_flags
= ISLASTCN
;
1898 cn
.cn_lkflags
= flags
;
1899 VERIFY0(VOP_LOOKUP(dvp
, vpp
, &cn
));
1907 gen_mask
= -1ULL >> (64 - 8 * i
);
1909 dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t
)object
,
1910 (u_longlong_t
)fid_gen
,
1911 (u_longlong_t
)gen_mask
);
1912 if ((err
= zfs_zget(zfsvfs
, object
, &zp
))) {
1913 zfs_exit(zfsvfs
, FTAG
);
1916 (void) sa_lookup(zp
->z_sa_hdl
, SA_ZPL_GEN(zfsvfs
), &zp_gen
,
1918 zp_gen
= zp_gen
& gen_mask
;
1921 if (zp
->z_unlinked
|| zp_gen
!= fid_gen
) {
1922 dprintf("znode gen (%llu) != fid gen (%llu)\n",
1923 (u_longlong_t
)zp_gen
, (u_longlong_t
)fid_gen
);
1925 zfs_exit(zfsvfs
, FTAG
);
1926 return (SET_ERROR(EINVAL
));
1930 zfs_exit(zfsvfs
, FTAG
);
1931 err
= vn_lock(*vpp
, flags
);
1933 vnode_create_vobject(*vpp
, zp
->z_size
, curthread
);
1940 * Block out VOPs and close zfsvfs_t::z_os
1942 * Note, if successful, then we return with the 'z_teardown_lock' and
1943 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
1944 * dataset and objset intact so that they can be atomically handed off during
1945 * a subsequent rollback or recv operation and the resume thereafter.
1948 zfs_suspend_fs(zfsvfs_t
*zfsvfs
)
1952 if ((error
= zfsvfs_teardown(zfsvfs
, B_FALSE
)) != 0)
1959 * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
1960 * is an invariant across any of the operations that can be performed while the
1961 * filesystem was suspended. Whether it succeeded or failed, the preconditions
1962 * are the same: the relevant objset and associated dataset are owned by
1963 * zfsvfs, held, and long held on entry.
1966 zfs_resume_fs(zfsvfs_t
*zfsvfs
, dsl_dataset_t
*ds
)
1971 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs
));
1972 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs
));
1975 * We already own this, so just update the objset_t, as the one we
1976 * had before may have been evicted.
1979 VERIFY3P(ds
->ds_owner
, ==, zfsvfs
);
1980 VERIFY(dsl_dataset_long_held(ds
));
1981 dsl_pool_t
*dp
= spa_get_dsl(dsl_dataset_get_spa(ds
));
1982 dsl_pool_config_enter(dp
, FTAG
);
1983 VERIFY0(dmu_objset_from_ds(ds
, &os
));
1984 dsl_pool_config_exit(dp
, FTAG
);
1986 err
= zfsvfs_init(zfsvfs
, os
);
1990 ds
->ds_dir
->dd_activity_cancelled
= B_FALSE
;
1991 VERIFY0(zfsvfs_setup(zfsvfs
, B_FALSE
));
1993 zfs_set_fuid_feature(zfsvfs
);
1996 * Attempt to re-establish all the active znodes with
1997 * their dbufs. If a zfs_rezget() fails, then we'll let
1998 * any potential callers discover that via zfs_enter_verify_zp
1999 * when they try to use their znode.
2001 mutex_enter(&zfsvfs
->z_znodes_lock
);
2002 for (zp
= list_head(&zfsvfs
->z_all_znodes
); zp
;
2003 zp
= list_next(&zfsvfs
->z_all_znodes
, zp
)) {
2004 (void) zfs_rezget(zp
);
2006 mutex_exit(&zfsvfs
->z_znodes_lock
);
2009 /* release the VOPs */
2010 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs
);
2011 ZFS_TEARDOWN_EXIT(zfsvfs
, FTAG
);
2015 * Since we couldn't setup the sa framework, try to force
2016 * unmount this file system.
2018 if (vn_vfswlock(zfsvfs
->z_vfs
->vfs_vnodecovered
) == 0) {
2019 vfs_ref(zfsvfs
->z_vfs
);
2020 (void) dounmount(zfsvfs
->z_vfs
, MS_FORCE
, curthread
);
2027 zfs_freevfs(vfs_t
*vfsp
)
2029 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
2031 zfsvfs_free(zfsvfs
);
2033 atomic_dec_32(&zfs_active_fs_count
);
2037 static int desiredvnodes_backup
;
2038 #include <sys/vmmeter.h>
2041 #include <vm/vm_page.h>
2042 #include <vm/vm_object.h>
2043 #include <vm/vm_kern.h>
2044 #include <vm/vm_map.h>
2048 zfs_vnodes_adjust(void)
2051 int newdesiredvnodes
;
2053 desiredvnodes_backup
= desiredvnodes
;
2056 * We calculate newdesiredvnodes the same way it is done in
2057 * vntblinit(). If it is equal to desiredvnodes, it means that
2058 * it wasn't tuned by the administrator and we can tune it down.
2060 newdesiredvnodes
= min(maxproc
+ vm_cnt
.v_page_count
/ 4, 2 *
2061 vm_kmem_size
/ (5 * (sizeof (struct vm_object
) +
2062 sizeof (struct vnode
))));
2063 if (newdesiredvnodes
== desiredvnodes
)
2064 desiredvnodes
= (3 * newdesiredvnodes
) / 4;
2069 zfs_vnodes_adjust_back(void)
2073 desiredvnodes
= desiredvnodes_backup
;
2077 #if __FreeBSD_version >= 1300139
2078 static struct sx zfs_vnlru_lock
;
2079 static struct vnode
*zfs_vnlru_marker
;
2081 static arc_prune_t
*zfs_prune
;
2084 zfs_prune_task(uint64_t nr_to_scan
, void *arg __unused
)
2086 if (nr_to_scan
> INT_MAX
)
2087 nr_to_scan
= INT_MAX
;
2088 #if __FreeBSD_version >= 1300139
2089 sx_xlock(&zfs_vnlru_lock
);
2090 vnlru_free_vfsops(nr_to_scan
, &zfs_vfsops
, zfs_vnlru_marker
);
2091 sx_xunlock(&zfs_vnlru_lock
);
2093 vnlru_free(nr_to_scan
, &zfs_vfsops
);
2101 printf("ZFS filesystem version: " ZPL_VERSION_STRING
"\n");
2104 * Initialize .zfs directory structures
2109 * Initialize znode cache, vnode ops, etc...
2114 * Reduce number of vnodes. Originally number of vnodes is calculated
2115 * with UFS inode in mind. We reduce it here, because it's too big for
2118 zfs_vnodes_adjust();
2120 dmu_objset_register_type(DMU_OST_ZFS
, zpl_get_file_info
);
2122 zfsvfs_taskq
= taskq_create("zfsvfs", 1, minclsyspri
, 0, 0, 0);
2124 #if __FreeBSD_version >= 1300139
2125 zfs_vnlru_marker
= vnlru_alloc_marker();
2126 sx_init(&zfs_vnlru_lock
, "zfs vnlru lock");
2128 zfs_prune
= arc_add_prune_callback(zfs_prune_task
, NULL
);
2134 arc_remove_prune_callback(zfs_prune
);
2135 #if __FreeBSD_version >= 1300139
2136 vnlru_free_marker(zfs_vnlru_marker
);
2137 sx_destroy(&zfs_vnlru_lock
);
2140 taskq_destroy(zfsvfs_taskq
);
2143 zfs_vnodes_adjust_back();
2149 return (zfs_active_fs_count
!= 0);
2153 * Release VOPs and unmount a suspended filesystem.
2156 zfs_end_fs(zfsvfs_t
*zfsvfs
, dsl_dataset_t
*ds
)
2158 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs
));
2159 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs
));
2162 * We already own this, so just hold and rele it to update the
2163 * objset_t, as the one we had before may have been evicted.
2166 VERIFY3P(ds
->ds_owner
, ==, zfsvfs
);
2167 VERIFY(dsl_dataset_long_held(ds
));
2168 dsl_pool_t
*dp
= spa_get_dsl(dsl_dataset_get_spa(ds
));
2169 dsl_pool_config_enter(dp
, FTAG
);
2170 VERIFY0(dmu_objset_from_ds(ds
, &os
));
2171 dsl_pool_config_exit(dp
, FTAG
);
2174 /* release the VOPs */
2175 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs
);
2176 ZFS_TEARDOWN_EXIT(zfsvfs
, FTAG
);
2179 * Try to force unmount this file system.
2181 (void) zfs_umount(zfsvfs
->z_vfs
, 0);
2182 zfsvfs
->z_unmounted
= B_TRUE
;
2187 zfs_set_version(zfsvfs_t
*zfsvfs
, uint64_t newvers
)
2190 objset_t
*os
= zfsvfs
->z_os
;
2193 if (newvers
< ZPL_VERSION_INITIAL
|| newvers
> ZPL_VERSION
)
2194 return (SET_ERROR(EINVAL
));
2196 if (newvers
< zfsvfs
->z_version
)
2197 return (SET_ERROR(EINVAL
));
2199 if (zfs_spa_version_map(newvers
) >
2200 spa_version(dmu_objset_spa(zfsvfs
->z_os
)))
2201 return (SET_ERROR(ENOTSUP
));
2203 tx
= dmu_tx_create(os
);
2204 dmu_tx_hold_zap(tx
, MASTER_NODE_OBJ
, B_FALSE
, ZPL_VERSION_STR
);
2205 if (newvers
>= ZPL_VERSION_SA
&& !zfsvfs
->z_use_sa
) {
2206 dmu_tx_hold_zap(tx
, MASTER_NODE_OBJ
, B_TRUE
,
2208 dmu_tx_hold_zap(tx
, DMU_NEW_OBJECT
, FALSE
, NULL
);
2210 error
= dmu_tx_assign(tx
, TXG_WAIT
);
2216 error
= zap_update(os
, MASTER_NODE_OBJ
, ZPL_VERSION_STR
,
2217 8, 1, &newvers
, tx
);
2224 if (newvers
>= ZPL_VERSION_SA
&& !zfsvfs
->z_use_sa
) {
2227 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs
->z_os
)), >=,
2229 sa_obj
= zap_create(os
, DMU_OT_SA_MASTER_NODE
,
2230 DMU_OT_NONE
, 0, tx
);
2232 error
= zap_add(os
, MASTER_NODE_OBJ
,
2233 ZFS_SA_ATTRS
, 8, 1, &sa_obj
, tx
);
2236 VERIFY0(sa_set_sa_object(os
, sa_obj
));
2237 sa_register_update_callback(os
, zfs_sa_upgrade
);
2240 spa_history_log_internal_ds(dmu_objset_ds(os
), "upgrade", tx
,
2241 "from %ju to %ju", (uintmax_t)zfsvfs
->z_version
,
2242 (uintmax_t)newvers
);
2245 zfsvfs
->z_version
= newvers
;
2246 os
->os_version
= newvers
;
2248 zfs_set_fuid_feature(zfsvfs
);
2254 * Return true if the corresponding vfs's unmounted flag is set.
2255 * Otherwise return false.
2256 * If this function returns true we know VFS unmount has been initiated.
2259 zfs_get_vfs_flag_unmounted(objset_t
*os
)
2262 boolean_t unmounted
= B_FALSE
;
2264 ASSERT3U(dmu_objset_type(os
), ==, DMU_OST_ZFS
);
2266 mutex_enter(&os
->os_user_ptr_lock
);
2267 zfvp
= dmu_objset_get_user(os
);
2268 if (zfvp
!= NULL
&& zfvp
->z_vfs
!= NULL
&&
2269 (zfvp
->z_vfs
->mnt_kern_flag
& MNTK_UNMOUNT
))
2271 mutex_exit(&os
->os_user_ptr_lock
);
2278 zfsvfs_update_fromname(const char *oldname
, const char *newname
)
2280 char tmpbuf
[MAXPATHLEN
];
2285 oldlen
= strlen(oldname
);
2287 mtx_lock(&mountlist_mtx
);
2288 TAILQ_FOREACH(mp
, &mountlist
, mnt_list
) {
2289 fromname
= mp
->mnt_stat
.f_mntfromname
;
2290 if (strcmp(fromname
, oldname
) == 0) {
2291 (void) strlcpy(fromname
, newname
,
2292 sizeof (mp
->mnt_stat
.f_mntfromname
));
2295 if (strncmp(fromname
, oldname
, oldlen
) == 0 &&
2296 (fromname
[oldlen
] == '/' || fromname
[oldlen
] == '@')) {
2297 (void) snprintf(tmpbuf
, sizeof (tmpbuf
), "%s%s",
2298 newname
, fromname
+ oldlen
);
2299 (void) strlcpy(fromname
, tmpbuf
,
2300 sizeof (mp
->mnt_stat
.f_mntfromname
));
2304 mtx_unlock(&mountlist_mtx
);
2309 * Find a prison with ZFS info.
2310 * Return the ZFS info and the (locked) prison.
2312 static struct zfs_jailparam
*
2313 zfs_jailparam_find(struct prison
*spr
, struct prison
**prp
)
2316 struct zfs_jailparam
*zjp
;
2318 for (pr
= spr
; ; pr
= pr
->pr_parent
) {
2319 mtx_lock(&pr
->pr_mtx
);
2320 if (pr
== &prison0
) {
2321 zjp
= &zfs_jailparam0
;
2324 zjp
= osd_jail_get(pr
, zfs_jailparam_slot
);
2327 mtx_unlock(&pr
->pr_mtx
);
2335 * Ensure a prison has its own ZFS info. If zjpp is non-null, point it to the
2336 * ZFS info and lock the prison.
2339 zfs_jailparam_alloc(struct prison
*pr
, struct zfs_jailparam
**zjpp
)
2342 struct zfs_jailparam
*zjp
, *nzjp
;
2345 /* If this prison already has ZFS info, return that. */
2346 zjp
= zfs_jailparam_find(pr
, &ppr
);
2351 * Allocate a new info record. Then check again, in case something
2352 * changed during the allocation.
2354 mtx_unlock(&ppr
->pr_mtx
);
2355 nzjp
= malloc(sizeof (struct zfs_jailparam
), M_PRISON
, M_WAITOK
);
2356 rsv
= osd_reserve(zfs_jailparam_slot
);
2357 zjp
= zfs_jailparam_find(pr
, &ppr
);
2359 free(nzjp
, M_PRISON
);
2360 osd_free_reserved(rsv
);
2363 /* Inherit the initial values from the ancestor. */
2364 mtx_lock(&pr
->pr_mtx
);
2365 (void) osd_jail_set_reserved(pr
, zfs_jailparam_slot
, rsv
, nzjp
);
2366 (void) memcpy(nzjp
, zjp
, sizeof (*zjp
));
2368 mtx_unlock(&ppr
->pr_mtx
);
2373 mtx_unlock(&pr
->pr_mtx
);
2377 * Jail OSD methods for ZFS VFS info.
2380 zfs_jailparam_create(void *obj
, void *data
)
2382 struct prison
*pr
= obj
;
2383 struct vfsoptlist
*opts
= data
;
2386 if (vfs_copyopt(opts
, "zfs", &jsys
, sizeof (jsys
)) == 0 &&
2387 jsys
== JAIL_SYS_INHERIT
)
2390 * Inherit a prison's initial values from its parent
2391 * (different from JAIL_SYS_INHERIT which also inherits changes).
2393 zfs_jailparam_alloc(pr
, NULL
);
2398 zfs_jailparam_get(void *obj
, void *data
)
2400 struct prison
*ppr
, *pr
= obj
;
2401 struct vfsoptlist
*opts
= data
;
2402 struct zfs_jailparam
*zjp
;
2405 zjp
= zfs_jailparam_find(pr
, &ppr
);
2406 jsys
= (ppr
== pr
) ? JAIL_SYS_NEW
: JAIL_SYS_INHERIT
;
2407 error
= vfs_setopt(opts
, "zfs", &jsys
, sizeof (jsys
));
2408 if (error
!= 0 && error
!= ENOENT
)
2410 if (jsys
== JAIL_SYS_NEW
) {
2411 error
= vfs_setopt(opts
, "zfs.mount_snapshot",
2412 &zjp
->mount_snapshot
, sizeof (zjp
->mount_snapshot
));
2413 if (error
!= 0 && error
!= ENOENT
)
2417 * If this prison is inheriting its ZFS info, report
2418 * empty/zero parameters.
2420 static int mount_snapshot
= 0;
2422 error
= vfs_setopt(opts
, "zfs.mount_snapshot",
2423 &mount_snapshot
, sizeof (mount_snapshot
));
2424 if (error
!= 0 && error
!= ENOENT
)
2429 mtx_unlock(&ppr
->pr_mtx
);
2434 zfs_jailparam_set(void *obj
, void *data
)
2436 struct prison
*pr
= obj
;
2438 struct vfsoptlist
*opts
= data
;
2439 int error
, jsys
, mount_snapshot
;
2441 /* Set the parameters, which should be correct. */
2442 error
= vfs_copyopt(opts
, "zfs", &jsys
, sizeof (jsys
));
2443 if (error
== ENOENT
)
2445 error
= vfs_copyopt(opts
, "zfs.mount_snapshot", &mount_snapshot
,
2446 sizeof (mount_snapshot
));
2447 if (error
== ENOENT
)
2448 mount_snapshot
= -1;
2450 jsys
= JAIL_SYS_NEW
;
2454 /* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */
2455 struct zfs_jailparam
*zjp
;
2458 * A child jail cannot have more permissions than its parent
2460 if (pr
->pr_parent
!= &prison0
) {
2461 zjp
= zfs_jailparam_find(pr
->pr_parent
, &ppr
);
2462 mtx_unlock(&ppr
->pr_mtx
);
2463 if (zjp
->mount_snapshot
< mount_snapshot
) {
2467 zfs_jailparam_alloc(pr
, &zjp
);
2468 if (mount_snapshot
!= -1)
2469 zjp
->mount_snapshot
= mount_snapshot
;
2470 mtx_unlock(&pr
->pr_mtx
);
2473 case JAIL_SYS_INHERIT
:
2474 /* "zfs=inherit": inherit the parent's ZFS info. */
2475 mtx_lock(&pr
->pr_mtx
);
2476 osd_jail_del(pr
, zfs_jailparam_slot
);
2477 mtx_unlock(&pr
->pr_mtx
);
2481 * If the setting being changed is not ZFS related
2491 zfs_jailparam_check(void *obj __unused
, void *data
)
2493 struct vfsoptlist
*opts
= data
;
2494 int error
, jsys
, mount_snapshot
;
2496 /* Check that the parameters are correct. */
2497 error
= vfs_copyopt(opts
, "zfs", &jsys
, sizeof (jsys
));
2498 if (error
!= ENOENT
) {
2501 if (jsys
!= JAIL_SYS_NEW
&& jsys
!= JAIL_SYS_INHERIT
)
2504 error
= vfs_copyopt(opts
, "zfs.mount_snapshot", &mount_snapshot
,
2505 sizeof (mount_snapshot
));
2506 if (error
!= ENOENT
) {
2509 if (mount_snapshot
!= 0 && mount_snapshot
!= 1)
2516 zfs_jailparam_destroy(void *data
)
2519 free(data
, M_PRISON
);
2523 zfs_jailparam_sysinit(void *arg __unused
)
2526 osd_method_t methods
[PR_MAXMETHOD
] = {
2527 [PR_METHOD_CREATE
] = zfs_jailparam_create
,
2528 [PR_METHOD_GET
] = zfs_jailparam_get
,
2529 [PR_METHOD_SET
] = zfs_jailparam_set
,
2530 [PR_METHOD_CHECK
] = zfs_jailparam_check
,
2533 zfs_jailparam_slot
= osd_jail_register(zfs_jailparam_destroy
, methods
);
2534 /* Copy the defaults to any existing prisons. */
2535 sx_slock(&allprison_lock
);
2536 TAILQ_FOREACH(pr
, &allprison
, pr_list
)
2537 zfs_jailparam_alloc(pr
, NULL
);
2538 sx_sunlock(&allprison_lock
);
2542 zfs_jailparam_sysuninit(void *arg __unused
)
2545 osd_jail_deregister(zfs_jailparam_slot
);
2548 SYSINIT(zfs_jailparam_sysinit
, SI_SUB_DRIVERS
, SI_ORDER_ANY
,
2549 zfs_jailparam_sysinit
, NULL
);
2550 SYSUNINIT(zfs_jailparam_sysuninit
, SI_SUB_DRIVERS
, SI_ORDER_ANY
,
2551 zfs_jailparam_sysuninit
, NULL
);