module/os/freebsd/zfs/zfs_vfsops.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
  24  * All rights reserved.
  25  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  26  * Copyright (c) 2014 Integros [integros.com]
  27  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
  28  */
  29
  30 /* Portions Copyright 2010 Robert Milkowski */
  31
  32 #include <sys/types.h>
  33 #include <sys/param.h>
  34 #include <sys/systm.h>
  35 #include <sys/kernel.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/kmem.h>
  38 #include <sys/acl.h>
  39 #include <sys/vnode.h>
  40 #include <sys/vfs.h>
  41 #include <sys/mntent.h>
  42 #include <sys/mount.h>
  43 #include <sys/cmn_err.h>
  44 #include <sys/zfs_znode.h>
  45 #include <sys/zfs_vnops.h>
  46 #include <sys/zfs_dir.h>
  47 #include <sys/zil.h>
  48 #include <sys/fs/zfs.h>
  49 #include <sys/dmu.h>
  50 #include <sys/dsl_prop.h>
  51 #include <sys/dsl_dataset.h>
  52 #include <sys/dsl_deleg.h>
  53 #include <sys/spa.h>
  54 #include <sys/zap.h>
  55 #include <sys/sa.h>
  56 #include <sys/sa_impl.h>
  57 #include <sys/policy.h>
  58 #include <sys/atomic.h>
  59 #include <sys/zfs_ioctl.h>
  60 #include <sys/zfs_ctldir.h>
  61 #include <sys/zfs_fuid.h>
  62 #include <sys/sunddi.h>
  63 #include <sys/dmu_objset.h>
  64 #include <sys/dsl_dir.h>
  65 #include <sys/jail.h>
  66 #include <ufs/ufs/quota.h>
  67 #include <sys/zfs_quota.h>
  68
  69 #include "zfs_comutil.h"
  70
  71 #ifndef MNTK_VMSETSIZE_BUG
  72 #define MNTK_VMSETSIZE_BUG      0
  73 #endif
  74 #ifndef MNTK_NOMSYNC
  75 #define MNTK_NOMSYNC    8
  76 #endif
  77
  78 struct mtx zfs_debug_mtx;
  79 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
  80
  81 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
  82
  83 int zfs_super_owner;
  84 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
  85         "File system owners can perform privileged operation on file systems");
  86
  87 int zfs_debug_level;
  88 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
  89         "Debug level");
  90
  91 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
  92 static int zfs_version_acl = ZFS_ACL_VERSION;
  93 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
  94         "ZFS_ACL_VERSION");
  95 static int zfs_version_spa = SPA_VERSION;
  96 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
  97         "SPA_VERSION");
  98 static int zfs_version_zpl = ZPL_VERSION;
  99 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
 100         "ZPL_VERSION");
 101
 102 #if __FreeBSD_version >= 1400018
 103 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg,
 104     bool *mp_busy);
 105 #else
 106 static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
 107 #endif
 108 static int zfs_mount(vfs_t *vfsp);
 109 static int zfs_umount(vfs_t *vfsp, int fflag);
 110 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
 111 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
 112 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
 113 static int zfs_sync(vfs_t *vfsp, int waitfor);
 114 #if __FreeBSD_version >= 1300098
 115 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
 116     struct ucred **credanonp, int *numsecflavors, int *secflavors);
 117 #else
 118 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
 119     struct ucred **credanonp, int *numsecflavors, int **secflavors);
 120 #endif
 121 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
 122 static void zfs_freevfs(vfs_t *vfsp);
 123
 124 struct vfsops zfs_vfsops = {
 125         .vfs_mount =            zfs_mount,
 126         .vfs_unmount =          zfs_umount,
 127 #if __FreeBSD_version >= 1300049
 128         .vfs_root =             vfs_cache_root,
 129         .vfs_cachedroot = zfs_root,
 130 #else
 131         .vfs_root =             zfs_root,
 132 #endif
 133         .vfs_statfs =           zfs_statfs,
 134         .vfs_vget =             zfs_vget,
 135         .vfs_sync =             zfs_sync,
 136         .vfs_checkexp =         zfs_checkexp,
 137         .vfs_fhtovp =           zfs_fhtovp,
 138         .vfs_quotactl =         zfs_quotactl,
 139 };
 140
 141 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
 142
 143 /*
 144  * We need to keep a count of active fs's.
 145  * This is necessary to prevent our module
 146  * from being unloaded after a umount -f
 147  */
 148 static uint32_t zfs_active_fs_count = 0;
 149
 150 int
 151 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
 152     char *setpoint)
 153 {
 154         int error;
 155         zfsvfs_t *zfvp;
 156         vfs_t *vfsp;
 157         objset_t *os;
 158         uint64_t tmp = *val;
 159
 160         error = dmu_objset_from_ds(ds, &os);
 161         if (error != 0)
 162                 return (error);
 163
 164         error = getzfsvfs_impl(os, &zfvp);
 165         if (error != 0)
 166                 return (error);
 167         if (zfvp == NULL)
 168                 return (ENOENT);
 169         vfsp = zfvp->z_vfs;
 170         switch (zfs_prop) {
 171         case ZFS_PROP_ATIME:
 172                 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
 173                         tmp = 0;
 174                 if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
 175                         tmp = 1;
 176                 break;
 177         case ZFS_PROP_DEVICES:
 178                 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
 179                         tmp = 0;
 180                 if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
 181                         tmp = 1;
 182                 break;
 183         case ZFS_PROP_EXEC:
 184                 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
 185                         tmp = 0;
 186                 if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
 187                         tmp = 1;
 188                 break;
 189         case ZFS_PROP_SETUID:
 190                 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
 191                         tmp = 0;
 192                 if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
 193                         tmp = 1;
 194                 break;
 195         case ZFS_PROP_READONLY:
 196                 if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
 197                         tmp = 0;
 198                 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
 199                         tmp = 1;
 200                 break;
 201         case ZFS_PROP_XATTR:
 202                 if (zfvp->z_flags & ZSB_XATTR)
 203                         tmp = zfvp->z_xattr;
 204                 break;
 205         case ZFS_PROP_NBMAND:
 206                 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
 207                         tmp = 0;
 208                 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
 209                         tmp = 1;
 210                 break;
 211         default:
 212                 vfs_unbusy(vfsp);
 213                 return (ENOENT);
 214         }
 215
 216         vfs_unbusy(vfsp);
 217         if (tmp != *val) {
 218                 (void) strcpy(setpoint, "temporary");
 219                 *val = tmp;
 220         }
 221         return (0);
 222 }
 223
 224 static int
 225 zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
 226 {
 227         int error = 0;
 228         char buf[32];
 229         uint64_t usedobj, quotaobj;
 230         uint64_t quota, used = 0;
 231         timespec_t now;
 232
 233         usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
 234         quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 235
 236         if (quotaobj == 0 || zfsvfs->z_replay) {
 237                 error = ENOENT;
 238                 goto done;
 239         }
 240         (void) sprintf(buf, "%llx", (longlong_t)id);
 241         if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
 242             buf, sizeof (quota), 1, &quota)) != 0) {
 243                 dprintf("%s(%d): quotaobj lookup failed\n",
 244                     __FUNCTION__, __LINE__);
 245                 goto done;
 246         }
 247         /*
 248          * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
 249          * So we set them to be the same.
 250          */
 251         dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
 252         error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
 253         if (error && error != ENOENT) {
 254                 dprintf("%s(%d):  usedobj failed; %d\n",
 255                     __FUNCTION__, __LINE__, error);
 256                 goto done;
 257         }
 258         dqp->dqb_curblocks = btodb(used);
 259         dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
 260         vfs_timestamp(&now);
 261         /*
 262          * Setting this to 0 causes FreeBSD quota(8) to print
 263          * the number of days since the epoch, which isn't
 264          * particularly useful.
 265          */
 266         dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
 267 done:
 268         return (error);
 269 }
 270
 271 static int
 272 #if __FreeBSD_version >= 1400018
 273 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy)
 274 #else
 275 zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
 276 #endif
 277 {
 278         zfsvfs_t *zfsvfs = vfsp->vfs_data;
 279         struct thread *td;
 280         int cmd, type, error = 0;
 281         int bitsize;
 282         zfs_userquota_prop_t quota_type;
 283         struct dqblk64 dqblk = { 0 };
 284
 285         td = curthread;
 286         cmd = cmds >> SUBCMDSHIFT;
 287         type = cmds & SUBCMDMASK;
 288
 289         ZFS_ENTER(zfsvfs);
 290         if (id == -1) {
 291                 switch (type) {
 292                 case USRQUOTA:
 293                         id = td->td_ucred->cr_ruid;
 294                         break;
 295                 case GRPQUOTA:
 296                         id = td->td_ucred->cr_rgid;
 297                         break;
 298                 default:
 299                         error = EINVAL;
 300 #if __FreeBSD_version < 1400018
 301                         if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
 302                                 vfs_unbusy(vfsp);
 303 #endif
 304                         goto done;
 305                 }
 306         }
 307         /*
 308          * Map BSD type to:
 309          * ZFS_PROP_USERUSED,
 310          * ZFS_PROP_USERQUOTA,
 311          * ZFS_PROP_GROUPUSED,
 312          * ZFS_PROP_GROUPQUOTA
 313          */
 314         switch (cmd) {
 315         case Q_SETQUOTA:
 316         case Q_SETQUOTA32:
 317                 if (type == USRQUOTA)
 318                         quota_type = ZFS_PROP_USERQUOTA;
 319                 else if (type == GRPQUOTA)
 320                         quota_type = ZFS_PROP_GROUPQUOTA;
 321                 else
 322                         error = EINVAL;
 323                 break;
 324         case Q_GETQUOTA:
 325         case Q_GETQUOTA32:
 326                 if (type == USRQUOTA)
 327                         quota_type = ZFS_PROP_USERUSED;
 328                 else if (type == GRPQUOTA)
 329                         quota_type = ZFS_PROP_GROUPUSED;
 330                 else
 331                         error = EINVAL;
 332                 break;
 333         }
 334
 335         /*
 336          * Depending on the cmd, we may need to get
 337          * the ruid and domain (see fuidstr_to_sid?),
 338          * the fuid (how?), or other information.
 339          * Create fuid using zfs_fuid_create(zfsvfs, id,
 340          * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
 341          * I think I can use just the id?
 342          *
 343          * Look at zfs_id_overquota() to look up a quota.
 344          * zap_lookup(something, quotaobj, fuidstring,
 345          *     sizeof (long long), 1, &quota)
 346          *
 347          * See zfs_set_userquota() to set a quota.
 348          */
 349         if ((uint32_t)type >= MAXQUOTAS) {
 350                 error = EINVAL;
 351                 goto done;
 352         }
 353
 354         switch (cmd) {
 355         case Q_GETQUOTASIZE:
 356                 bitsize = 64;
 357                 error = copyout(&bitsize, arg, sizeof (int));
 358                 break;
 359         case Q_QUOTAON:
 360                 // As far as I can tell, you can't turn quotas on or off on zfs
 361                 error = 0;
 362 #if __FreeBSD_version < 1400018
 363                 vfs_unbusy(vfsp);
 364 #endif
 365                 break;
 366         case Q_QUOTAOFF:
 367                 error = ENOTSUP;
 368 #if __FreeBSD_version < 1400018
 369                 vfs_unbusy(vfsp);
 370 #endif
 371                 break;
 372         case Q_SETQUOTA:
 373                 error = copyin(arg, &dqblk, sizeof (dqblk));
 374                 if (error == 0)
 375                         error = zfs_set_userquota(zfsvfs, quota_type,
 376                             "", id, dbtob(dqblk.dqb_bhardlimit));
 377                 break;
 378         case Q_GETQUOTA:
 379                 error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
 380                 if (error == 0)
 381                         error = copyout(&dqblk, arg, sizeof (dqblk));
 382                 break;
 383         default:
 384                 error = EINVAL;
 385                 break;
 386         }
 387 done:
 388         ZFS_EXIT(zfsvfs);
 389         return (error);
 390 }
 391
 392
 393 boolean_t
 394 zfs_is_readonly(zfsvfs_t *zfsvfs)
 395 {
 396         return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
 397 }
 398
 399 static int
 400 zfs_sync(vfs_t *vfsp, int waitfor)
 401 {
 402
 403         /*
 404          * Data integrity is job one.  We don't want a compromised kernel
 405          * writing to the storage pool, so we never sync during panic.
 406          */
 407         if (panicstr)
 408                 return (0);
 409
 410         /*
 411          * Ignore the system syncher.  ZFS already commits async data
 412          * at zfs_txg_timeout intervals.
 413          */
 414         if (waitfor == MNT_LAZY)
 415                 return (0);
 416
 417         if (vfsp != NULL) {
 418                 /*
 419                  * Sync a specific filesystem.
 420                  */
 421                 zfsvfs_t *zfsvfs = vfsp->vfs_data;
 422                 dsl_pool_t *dp;
 423                 int error;
 424
 425                 error = vfs_stdsync(vfsp, waitfor);
 426                 if (error != 0)
 427                         return (error);
 428
 429                 ZFS_ENTER(zfsvfs);
 430                 dp = dmu_objset_pool(zfsvfs->z_os);
 431
 432                 /*
 433                  * If the system is shutting down, then skip any
 434                  * filesystems which may exist on a suspended pool.
 435                  */
 436                 if (rebooting && spa_suspended(dp->dp_spa)) {
 437                         ZFS_EXIT(zfsvfs);
 438                         return (0);
 439                 }
 440
 441                 if (zfsvfs->z_log != NULL)
 442                         zil_commit(zfsvfs->z_log, 0);
 443
 444                 ZFS_EXIT(zfsvfs);
 445         } else {
 446                 /*
 447                  * Sync all ZFS filesystems.  This is what happens when you
 448                  * run sync(8).  Unlike other filesystems, ZFS honors the
 449                  * request by waiting for all pools to commit all dirty data.
 450                  */
 451                 spa_sync_allpools();
 452         }
 453
 454         return (0);
 455 }
 456
 457 static void
 458 atime_changed_cb(void *arg, uint64_t newval)
 459 {
 460         zfsvfs_t *zfsvfs = arg;
 461
 462         if (newval == TRUE) {
 463                 zfsvfs->z_atime = TRUE;
 464                 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
 465                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 466                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 467         } else {
 468                 zfsvfs->z_atime = FALSE;
 469                 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
 470                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 471                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 472         }
 473 }
 474
 475 static void
 476 xattr_changed_cb(void *arg, uint64_t newval)
 477 {
 478         zfsvfs_t *zfsvfs = arg;
 479
 480         if (newval == ZFS_XATTR_OFF) {
 481                 zfsvfs->z_flags &= ~ZSB_XATTR;
 482         } else {
 483                 zfsvfs->z_flags |= ZSB_XATTR;
 484
 485                 if (newval == ZFS_XATTR_SA)
 486                         zfsvfs->z_xattr_sa = B_TRUE;
 487                 else
 488                         zfsvfs->z_xattr_sa = B_FALSE;
 489         }
 490 }
 491
 492 static void
 493 blksz_changed_cb(void *arg, uint64_t newval)
 494 {
 495         zfsvfs_t *zfsvfs = arg;
 496         ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
 497         ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
 498         ASSERT(ISP2(newval));
 499
 500         zfsvfs->z_max_blksz = newval;
 501         zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
 502 }
 503
 504 static void
 505 readonly_changed_cb(void *arg, uint64_t newval)
 506 {
 507         zfsvfs_t *zfsvfs = arg;
 508
 509         if (newval) {
 510                 /* XXX locking on vfs_flag? */
 511                 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
 512                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
 513                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
 514         } else {
 515                 /* XXX locking on vfs_flag? */
 516                 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 517                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
 518                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
 519         }
 520 }
 521
 522 static void
 523 setuid_changed_cb(void *arg, uint64_t newval)
 524 {
 525         zfsvfs_t *zfsvfs = arg;
 526
 527         if (newval == FALSE) {
 528                 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
 529                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
 530                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 531         } else {
 532                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 533                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
 534                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
 535         }
 536 }
 537
 538 static void
 539 exec_changed_cb(void *arg, uint64_t newval)
 540 {
 541         zfsvfs_t *zfsvfs = arg;
 542
 543         if (newval == FALSE) {
 544                 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
 545                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
 546                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 547         } else {
 548                 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 549                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
 550                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
 551         }
 552 }
 553
 554 /*
 555  * The nbmand mount option can be changed at mount time.
 556  * We can't allow it to be toggled on live file systems or incorrect
 557  * behavior may be seen from cifs clients
 558  *
 559  * This property isn't registered via dsl_prop_register(), but this callback
 560  * will be called when a file system is first mounted
 561  */
 562 static void
 563 nbmand_changed_cb(void *arg, uint64_t newval)
 564 {
 565         zfsvfs_t *zfsvfs = arg;
 566         if (newval == FALSE) {
 567                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
 568                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
 569         } else {
 570                 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
 571                 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
 572         }
 573 }
 574
 575 static void
 576 snapdir_changed_cb(void *arg, uint64_t newval)
 577 {
 578         zfsvfs_t *zfsvfs = arg;
 579
 580         zfsvfs->z_show_ctldir = newval;
 581 }
 582
 583 static void
 584 acl_mode_changed_cb(void *arg, uint64_t newval)
 585 {
 586         zfsvfs_t *zfsvfs = arg;
 587
 588         zfsvfs->z_acl_mode = newval;
 589 }
 590
 591 static void
 592 acl_inherit_changed_cb(void *arg, uint64_t newval)
 593 {
 594         zfsvfs_t *zfsvfs = arg;
 595
 596         zfsvfs->z_acl_inherit = newval;
 597 }
 598
 599 static void
 600 acl_type_changed_cb(void *arg, uint64_t newval)
 601 {
 602         zfsvfs_t *zfsvfs = arg;
 603
 604         zfsvfs->z_acl_type = newval;
 605 }
 606
 607 static int
 608 zfs_register_callbacks(vfs_t *vfsp)
 609 {
 610         struct dsl_dataset *ds = NULL;
 611         objset_t *os = NULL;
 612         zfsvfs_t *zfsvfs = NULL;
 613         uint64_t nbmand;
 614         boolean_t readonly = B_FALSE;
 615         boolean_t do_readonly = B_FALSE;
 616         boolean_t setuid = B_FALSE;
 617         boolean_t do_setuid = B_FALSE;
 618         boolean_t exec = B_FALSE;
 619         boolean_t do_exec = B_FALSE;
 620         boolean_t xattr = B_FALSE;
 621         boolean_t atime = B_FALSE;
 622         boolean_t do_atime = B_FALSE;
 623         boolean_t do_xattr = B_FALSE;
 624         int error = 0;
 625
 626         ASSERT3P(vfsp, !=, NULL);
 627         zfsvfs = vfsp->vfs_data;
 628         ASSERT3P(zfsvfs, !=, NULL);
 629         os = zfsvfs->z_os;
 630
 631         /*
 632          * This function can be called for a snapshot when we update snapshot's
 633          * mount point, which isn't really supported.
 634          */
 635         if (dmu_objset_is_snapshot(os))
 636                 return (EOPNOTSUPP);
 637
 638         /*
 639          * The act of registering our callbacks will destroy any mount
 640          * options we may have.  In order to enable temporary overrides
 641          * of mount options, we stash away the current values and
 642          * restore them after we register the callbacks.
 643          */
 644         if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
 645             !spa_writeable(dmu_objset_spa(os))) {
 646                 readonly = B_TRUE;
 647                 do_readonly = B_TRUE;
 648         } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 649                 readonly = B_FALSE;
 650                 do_readonly = B_TRUE;
 651         }
 652         if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 653                 setuid = B_FALSE;
 654                 do_setuid = B_TRUE;
 655         } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 656                 setuid = B_TRUE;
 657                 do_setuid = B_TRUE;
 658         }
 659         if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 660                 exec = B_FALSE;
 661                 do_exec = B_TRUE;
 662         } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 663                 exec = B_TRUE;
 664                 do_exec = B_TRUE;
 665         }
 666         if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 667                 zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
 668                 do_xattr = B_TRUE;
 669         } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 670                 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
 671                 do_xattr = B_TRUE;
 672         } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
 673                 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
 674                 do_xattr = B_TRUE;
 675         } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
 676                 zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
 677                 do_xattr = B_TRUE;
 678         }
 679         if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
 680                 atime = B_FALSE;
 681                 do_atime = B_TRUE;
 682         } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
 683                 atime = B_TRUE;
 684                 do_atime = B_TRUE;
 685         }
 686
 687         /*
 688          * We need to enter pool configuration here, so that we can use
 689          * dsl_prop_get_int_ds() to handle the special nbmand property below.
 690          * dsl_prop_get_integer() can not be used, because it has to acquire
 691          * spa_namespace_lock and we can not do that because we already hold
 692          * z_teardown_lock.  The problem is that spa_write_cachefile() is called
 693          * with spa_namespace_lock held and the function calls ZFS vnode
 694          * operations to write the cache file and thus z_teardown_lock is
 695          * acquired after spa_namespace_lock.
 696          */
 697         ds = dmu_objset_ds(os);
 698         dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 699
 700         /*
 701          * nbmand is a special property.  It can only be changed at
 702          * mount time.
 703          *
 704          * This is weird, but it is documented to only be changeable
 705          * at mount time.
 706          */
 707         if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
 708                 nbmand = B_FALSE;
 709         } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
 710                 nbmand = B_TRUE;
 711         } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) {
 712                 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 713                 return (error);
 714         }
 715
 716         /*
 717          * Register property callbacks.
 718          *
 719          * It would probably be fine to just check for i/o error from
 720          * the first prop_register(), but I guess I like to go
 721          * overboard...
 722          */
 723         error = dsl_prop_register(ds,
 724             zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
 725         error = error ? error : dsl_prop_register(ds,
 726             zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
 727         error = error ? error : dsl_prop_register(ds,
 728             zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
 729         error = error ? error : dsl_prop_register(ds,
 730             zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
 731         error = error ? error : dsl_prop_register(ds,
 732             zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
 733         error = error ? error : dsl_prop_register(ds,
 734             zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
 735         error = error ? error : dsl_prop_register(ds,
 736             zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
 737         error = error ? error : dsl_prop_register(ds,
 738             zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
 739         error = error ? error : dsl_prop_register(ds,
 740             zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
 741         error = error ? error : dsl_prop_register(ds,
 742             zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
 743             zfsvfs);
 744         dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 745         if (error)
 746                 goto unregister;
 747
 748         /*
 749          * Invoke our callbacks to restore temporary mount options.
 750          */
 751         if (do_readonly)
 752                 readonly_changed_cb(zfsvfs, readonly);
 753         if (do_setuid)
 754                 setuid_changed_cb(zfsvfs, setuid);
 755         if (do_exec)
 756                 exec_changed_cb(zfsvfs, exec);
 757         if (do_xattr)
 758                 xattr_changed_cb(zfsvfs, xattr);
 759         if (do_atime)
 760                 atime_changed_cb(zfsvfs, atime);
 761
 762         nbmand_changed_cb(zfsvfs, nbmand);
 763
 764         return (0);
 765
 766 unregister:
 767         dsl_prop_unregister_all(ds, zfsvfs);
 768         return (error);
 769 }
 770
 771 /*
 772  * Associate this zfsvfs with the given objset, which must be owned.
 773  * This will cache a bunch of on-disk state from the objset in the
 774  * zfsvfs.
 775  */
 776 static int
 777 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
 778 {
 779         int error;
 780         uint64_t val;
 781
 782         zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
 783         zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 784         zfsvfs->z_os = os;
 785
 786         error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
 787         if (error != 0)
 788                 return (error);
 789         if (zfsvfs->z_version >
 790             zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
 791                 (void) printf("Can't mount a version %lld file system "
 792                     "on a version %lld pool\n. Pool must be upgraded to mount "
 793                     "this file system.", (u_longlong_t)zfsvfs->z_version,
 794                     (u_longlong_t)spa_version(dmu_objset_spa(os)));
 795                 return (SET_ERROR(ENOTSUP));
 796         }
 797         error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
 798         if (error != 0)
 799                 return (error);
 800         zfsvfs->z_norm = (int)val;
 801
 802         error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
 803         if (error != 0)
 804                 return (error);
 805         zfsvfs->z_utf8 = (val != 0);
 806
 807         error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
 808         if (error != 0)
 809                 return (error);
 810         zfsvfs->z_case = (uint_t)val;
 811
 812         error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
 813         if (error != 0)
 814                 return (error);
 815         zfsvfs->z_acl_type = (uint_t)val;
 816
 817         /*
 818          * Fold case on file systems that are always or sometimes case
 819          * insensitive.
 820          */
 821         if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 822             zfsvfs->z_case == ZFS_CASE_MIXED)
 823                 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 824
 825         zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 826         zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 827
 828         uint64_t sa_obj = 0;
 829         if (zfsvfs->z_use_sa) {
 830                 /* should either have both of these objects or none */
 831                 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
 832                     &sa_obj);
 833                 if (error != 0)
 834                         return (error);
 835
 836                 error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
 837                 if (error == 0 && val == ZFS_XATTR_SA)
 838                         zfsvfs->z_xattr_sa = B_TRUE;
 839         }
 840
 841         error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 842             &zfsvfs->z_attr_table);
 843         if (error != 0)
 844                 return (error);
 845
 846         if (zfsvfs->z_version >= ZPL_VERSION_SA)
 847                 sa_register_update_callback(os, zfs_sa_upgrade);
 848
 849         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 850             &zfsvfs->z_root);
 851         if (error != 0)
 852                 return (error);
 853         ASSERT3U(zfsvfs->z_root, !=, 0);
 854
 855         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 856             &zfsvfs->z_unlinkedobj);
 857         if (error != 0)
 858                 return (error);
 859
 860         error = zap_lookup(os, MASTER_NODE_OBJ,
 861             zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
 862             8, 1, &zfsvfs->z_userquota_obj);
 863         if (error == ENOENT)
 864                 zfsvfs->z_userquota_obj = 0;
 865         else if (error != 0)
 866                 return (error);
 867
 868         error = zap_lookup(os, MASTER_NODE_OBJ,
 869             zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
 870             8, 1, &zfsvfs->z_groupquota_obj);
 871         if (error == ENOENT)
 872                 zfsvfs->z_groupquota_obj = 0;
 873         else if (error != 0)
 874                 return (error);
 875
 876         error = zap_lookup(os, MASTER_NODE_OBJ,
 877             zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
 878             8, 1, &zfsvfs->z_projectquota_obj);
 879         if (error == ENOENT)
 880                 zfsvfs->z_projectquota_obj = 0;
 881         else if (error != 0)
 882                 return (error);
 883
 884         error = zap_lookup(os, MASTER_NODE_OBJ,
 885             zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
 886             8, 1, &zfsvfs->z_userobjquota_obj);
 887         if (error == ENOENT)
 888                 zfsvfs->z_userobjquota_obj = 0;
 889         else if (error != 0)
 890                 return (error);
 891
 892         error = zap_lookup(os, MASTER_NODE_OBJ,
 893             zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
 894             8, 1, &zfsvfs->z_groupobjquota_obj);
 895         if (error == ENOENT)
 896                 zfsvfs->z_groupobjquota_obj = 0;
 897         else if (error != 0)
 898                 return (error);
 899
 900         error = zap_lookup(os, MASTER_NODE_OBJ,
 901             zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
 902             8, 1, &zfsvfs->z_projectobjquota_obj);
 903         if (error == ENOENT)
 904                 zfsvfs->z_projectobjquota_obj = 0;
 905         else if (error != 0)
 906                 return (error);
 907
 908         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
 909             &zfsvfs->z_fuid_obj);
 910         if (error == ENOENT)
 911                 zfsvfs->z_fuid_obj = 0;
 912         else if (error != 0)
 913                 return (error);
 914
 915         error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
 916             &zfsvfs->z_shares_dir);
 917         if (error == ENOENT)
 918                 zfsvfs->z_shares_dir = 0;
 919         else if (error != 0)
 920                 return (error);
 921
 922         /*
 923          * Only use the name cache if we are looking for a
 924          * name on a file system that does not require normalization
 925          * or case folding.  We can also look there if we happen to be
 926          * on a non-normalizing, mixed sensitivity file system IF we
 927          * are looking for the exact name (which is always the case on
 928          * FreeBSD).
 929          */
 930         zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
 931             ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
 932             !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
 933
 934         return (0);
 935 }
 936
/*
 * Task queue whose tq_queue zfs_umount() uses to cancel or drain a
 * filesystem's z_unlinked_drain_task before tearing the filesystem down.
 */
taskq_t *zfsvfs_taskq;
 938
 939 static void
 940 zfsvfs_task_unlinked_drain(void *context, int pending __unused)
 941 {
 942
 943         zfs_unlinked_drain((zfsvfs_t *)context);
 944 }
 945
 946 int
 947 zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
 948 {
 949         objset_t *os;
 950         zfsvfs_t *zfsvfs;
 951         int error;
 952         boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
 953
 954         /*
 955          * XXX: Fix struct statfs so this isn't necessary!
 956          *
 957          * The 'osname' is used as the filesystem's special node, which means
 958          * it must fit in statfs.f_mntfromname, or else it can't be
 959          * enumerated, so libzfs_mnttab_find() returns NULL, which causes
 960          * 'zfs unmount' to think it's not mounted when it is.
 961          */
 962         if (strlen(osname) >= MNAMELEN)
 963                 return (SET_ERROR(ENAMETOOLONG));
 964
 965         zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 966
 967         error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
 968             &os);
 969         if (error != 0) {
 970                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
 971                 return (error);
 972         }
 973
 974         error = zfsvfs_create_impl(zfvp, zfsvfs, os);
 975
 976         return (error);
 977 }
 978
 979
 980 int
 981 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
 982 {
 983         int error;
 984
 985         zfsvfs->z_vfs = NULL;
 986         zfsvfs->z_parent = zfsvfs;
 987
 988         mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 989         mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
 990         list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 991             offsetof(znode_t, z_link_node));
 992         TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
 993             zfsvfs_task_unlinked_drain, zfsvfs);
 994         ZFS_TEARDOWN_INIT(zfsvfs);
 995         ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
 996         rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 997         for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 998                 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 999
1000         error = zfsvfs_init(zfsvfs, os);
1001         if (error != 0) {
1002                 dmu_objset_disown(os, B_TRUE, zfsvfs);
1003                 *zfvp = NULL;
1004                 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1005                 return (error);
1006         }
1007
1008         *zfvp = zfsvfs;
1009         return (0);
1010 }
1011
/*
 * Bring a zfsvfs into service on its objset: register the property
 * callbacks, open the ZIL and point the objset's user pointer at the
 * zfsvfs.
 *
 * When 'mounting' is true this additionally creates the dataset kstats,
 * drains the unlinked set (only if the mount is not read-only) and, if
 * the pool is writeable, either replays or destroys the intent log.
 * When 'mounting' is false the kstats are asserted to exist already and
 * only the ZIL is reopened.
 *
 * Returns 0 on success, EROFS for a non-read-only mount when
 * dmu_objset_incompatible_encryption_version() reports true, or the
 * error from zfs_register_callbacks() or dataset_kstats_create().
 */
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
        int error;

        /*
         * Check for a bad on-disk format version now since we
         * lied about owning the dataset readonly before.
         */
        if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
            dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
                return (SET_ERROR(EROFS));

        error = zfs_register_callbacks(zfsvfs->z_vfs);
        if (error)
                return (error);

        /*
         * If we are not mounting (ie: online recv), then we don't
         * have to worry about replaying the log as we blocked all
         * operations out since we closed the ZIL.
         */
        if (mounting) {
                boolean_t readonly;

                ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
                error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
                /*
                 * NOTE(review): this return leaves the callbacks registered
                 * above in place; confirm the caller's cleanup covers them.
                 */
                if (error)
                        return (error);
                zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
                    &zfsvfs->z_kstat.dk_zil_sums);

                /*
                 * During replay we remove the read only flag to
                 * allow replays to succeed.
                 */
                readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
                if (readonly != 0) {
                        zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
                } else {
                        dsl_dir_t *dd;
                        zap_stats_t zs;

                        /*
                         * Publish the current size of the unlinked set in
                         * the kstats; a zap_get_stats() failure is ignored.
                         */
                        if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
                            &zs) == 0) {
                                dataset_kstats_update_nunlinks_kstat(
                                    &zfsvfs->z_kstat, zs.zs_num_entries);
                                dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
                                    "num_entries in unlinked set: %llu",
                                    (u_longlong_t)zs.zs_num_entries);
                        }

                        zfs_unlinked_drain(zfsvfs);
                        dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
                        dd->dd_activity_cancelled = B_FALSE;
                }

                /*
                 * Parse and replay the intent log.
                 *
                 * Because of ziltest, this must be done after
                 * zfs_unlinked_drain().  (Further note: ziltest
                 * doesn't use readonly mounts, where
                 * zfs_unlinked_drain() isn't called.)  This is because
                 * ziltest causes spa_sync() to think it's committed,
                 * but actually it is not, so the intent log contains
                 * many txg's worth of changes.
                 *
                 * In particular, if object N is in the unlinked set in
                 * the last txg to actually sync, then it could be
                 * actually freed in a later txg and then reallocated
                 * in a yet later txg.  This would write a "create
                 * object N" record to the intent log.  Normally, this
                 * would be fine because the spa_sync() would have
                 * written out the fact that object N is free, before
                 * we could write the "create object N" intent log
                 * record.
                 *
                 * But when we are in ziltest mode, we advance the "open
                 * txg" without actually spa_sync()-ing the changes to
                 * disk.  So we would see that object N is still
                 * allocated and in the unlinked set, and there is an
                 * intent log record saying to allocate it.
                 */
                if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
                        if (zil_replay_disable) {
                                zil_destroy(zfsvfs->z_log, B_FALSE);
                        } else {
                                /*
                                 * The name cache is switched off and
                                 * z_replay is set only for the duration of
                                 * the replay; both are put back afterwards.
                                 */
                                boolean_t use_nc = zfsvfs->z_use_namecache;
                                zfsvfs->z_use_namecache = B_FALSE;
                                zfsvfs->z_replay = B_TRUE;
                                zil_replay(zfsvfs->z_os, zfsvfs,
                                    zfs_replay_vector);
                                zfsvfs->z_replay = B_FALSE;
                                zfsvfs->z_use_namecache = use_nc;
                        }
                }

                /* restore readonly bit */
                if (readonly != 0)
                        zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
        } else {
                ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL);
                zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
                    &zfsvfs->z_kstat.dk_zil_sums);
        }

        /*
         * Set the objset user_ptr to track its zfsvfs.
         */
        mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
        dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
        mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

        return (0);
}
1128
1129 void
1130 zfsvfs_free(zfsvfs_t *zfsvfs)
1131 {
1132         int i;
1133
1134         zfs_fuid_destroy(zfsvfs);
1135
1136         mutex_destroy(&zfsvfs->z_znodes_lock);
1137         mutex_destroy(&zfsvfs->z_lock);
1138         ASSERT3U(zfsvfs->z_nr_znodes, ==, 0);
1139         list_destroy(&zfsvfs->z_all_znodes);
1140         ZFS_TEARDOWN_DESTROY(zfsvfs);
1141         ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
1142         rw_destroy(&zfsvfs->z_fuid_lock);
1143         for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1144                 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1145         dataset_kstats_destroy(&zfsvfs->z_kstat);
1146         kmem_free(zfsvfs, sizeof (zfsvfs_t));
1147 }
1148
1149 static void
1150 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1151 {
1152         zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1153         zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1154 }
1155
1156 static int
1157 zfs_domount(vfs_t *vfsp, char *osname)
1158 {
1159         uint64_t recordsize, fsid_guid;
1160         int error = 0;
1161         zfsvfs_t *zfsvfs;
1162
1163         ASSERT3P(vfsp, !=, NULL);
1164         ASSERT3P(osname, !=, NULL);
1165
1166         error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
1167         if (error)
1168                 return (error);
1169         zfsvfs->z_vfs = vfsp;
1170
1171         if ((error = dsl_prop_get_integer(osname,
1172             "recordsize", &recordsize, NULL)))
1173                 goto out;
1174         zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
1175         zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
1176
1177         vfsp->vfs_data = zfsvfs;
1178         vfsp->mnt_flag |= MNT_LOCAL;
1179         vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
1180         vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
1181         vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
1182         /*
1183          * This can cause a loss of coherence between ARC and page cache
1184          * on ZoF - unclear if the problem is in FreeBSD or ZoF
1185          */
1186         vfsp->mnt_kern_flag |= MNTK_NO_IOPF;    /* vn_io_fault can be used */
1187         vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
1188         vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;
1189
1190 #if defined(_KERNEL) && !defined(KMEM_DEBUG)
1191         vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
1192 #endif
1193         /*
1194          * The fsid is 64 bits, composed of an 8-bit fs type, which
1195          * separates our fsid from any other filesystem types, and a
1196          * 56-bit objset unique ID.  The objset unique ID is unique to
1197          * all objsets open on this system, provided by unique_create().
1198          * The 8-bit fs type must be put in the low bits of fsid[1]
1199          * because that's where other Solaris filesystems put it.
1200          */
1201         fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1202         ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0);
1203         vfsp->vfs_fsid.val[0] = fsid_guid;
1204         vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) |
1205             (vfsp->mnt_vfc->vfc_typenum & 0xFF);
1206
1207         /*
1208          * Set features for file system.
1209          */
1210         zfs_set_fuid_feature(zfsvfs);
1211
1212         if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1213                 uint64_t pval;
1214
1215                 atime_changed_cb(zfsvfs, B_FALSE);
1216                 readonly_changed_cb(zfsvfs, B_TRUE);
1217                 if ((error = dsl_prop_get_integer(osname,
1218                     "xattr", &pval, NULL)))
1219                         goto out;
1220                 xattr_changed_cb(zfsvfs, pval);
1221                 if ((error = dsl_prop_get_integer(osname,
1222                     "acltype", &pval, NULL)))
1223                         goto out;
1224                 acl_type_changed_cb(zfsvfs, pval);
1225                 zfsvfs->z_issnap = B_TRUE;
1226                 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
1227
1228                 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1229                 dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1230                 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1231         } else {
1232                 if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
1233                         goto out;
1234         }
1235
1236         vfs_mountedfrom(vfsp, osname);
1237
1238         if (!zfsvfs->z_issnap)
1239                 zfsctl_create(zfsvfs);
1240 out:
1241         if (error) {
1242                 dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
1243                 zfsvfs_free(zfsvfs);
1244         } else {
1245                 atomic_inc_32(&zfs_active_fs_count);
1246         }
1247
1248         return (error);
1249 }
1250
1251 static void
1252 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1253 {
1254         objset_t *os = zfsvfs->z_os;
1255
1256         if (!dmu_objset_is_snapshot(os))
1257                 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1258 }
1259
1260 static int
1261 getpoolname(const char *osname, char *poolname)
1262 {
1263         char *p;
1264
1265         p = strchr(osname, '/');
1266         if (p == NULL) {
1267                 if (strlen(osname) >= MAXNAMELEN)
1268                         return (ENAMETOOLONG);
1269                 (void) strcpy(poolname, osname);
1270         } else {
1271                 if (p - osname >= MAXNAMELEN)
1272                         return (ENAMETOOLONG);
1273                 (void) strncpy(poolname, osname, p - osname);
1274                 poolname[p - osname] = '\0';
1275         }
1276         return (0);
1277 }
1278
/*
 * Strip the option prefix from a dataset name in place.
 *
 * A leading '!' sets *checkpointrewind to true and is removed from 'name'
 * by shifting the rest of the string (terminator included) down by one
 * byte.  Without the prefix *checkpointrewind is set to false and 'name'
 * is left unchanged.
 */
static void
fetch_osname_options(char *name, bool *checkpointrewind)
{
        const bool has_bang = (name[0] == '!');

        *checkpointrewind = has_bang;
        if (!has_bang)
                return;

        /*
         * strlen(name) still counts the '!', so it equals the number of
         * remaining characters plus the terminating NUL.
         */
        memmove(name, name + 1, strlen(name));
}
1290
1291 static int
1292 zfs_mount(vfs_t *vfsp)
1293 {
1294         kthread_t       *td = curthread;
1295         vnode_t         *mvp = vfsp->mnt_vnodecovered;
1296         cred_t          *cr = td->td_ucred;
1297         char            *osname;
1298         int             error = 0;
1299         int             canwrite;
1300         bool            checkpointrewind;
1301
1302         if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
1303                 return (SET_ERROR(EINVAL));
1304
1305         /*
1306          * If full-owner-access is enabled and delegated administration is
1307          * turned on, we must set nosuid.
1308          */
1309         if (zfs_super_owner &&
1310             dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
1311                 secpolicy_fs_mount_clearopts(cr, vfsp);
1312         }
1313
1314         fetch_osname_options(osname, &checkpointrewind);
1315
1316         /*
1317          * Check for mount privilege?
1318          *
1319          * If we don't have privilege then see if
1320          * we have local permission to allow it
1321          */
1322         error = secpolicy_fs_mount(cr, mvp, vfsp);
1323         if (error) {
1324                 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
1325                         goto out;
1326
1327                 if (!(vfsp->vfs_flag & MS_REMOUNT)) {
1328                         vattr_t         vattr;
1329
1330                         /*
1331                          * Make sure user is the owner of the mount point
1332                          * or has sufficient privileges.
1333                          */
1334
1335                         vattr.va_mask = AT_UID;
1336
1337                         vn_lock(mvp, LK_SHARED | LK_RETRY);
1338                         if (VOP_GETATTR(mvp, &vattr, cr)) {
1339                                 VOP_UNLOCK1(mvp);
1340                                 goto out;
1341                         }
1342
1343                         if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
1344                             VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
1345                                 VOP_UNLOCK1(mvp);
1346                                 goto out;
1347                         }
1348                         VOP_UNLOCK1(mvp);
1349                 }
1350
1351                 secpolicy_fs_mount_clearopts(cr, vfsp);
1352         }
1353
1354         /*
1355          * Refuse to mount a filesystem if we are in a local zone and the
1356          * dataset is not visible.
1357          */
1358         if (!INGLOBALZONE(curproc) &&
1359             (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1360                 error = SET_ERROR(EPERM);
1361                 goto out;
1362         }
1363
1364         vfsp->vfs_flag |= MNT_NFS4ACLS;
1365
1366         /*
1367          * When doing a remount, we simply refresh our temporary properties
1368          * according to those options set in the current VFS options.
1369          */
1370         if (vfsp->vfs_flag & MS_REMOUNT) {
1371                 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1372
1373                 /*
1374                  * Refresh mount options with z_teardown_lock blocking I/O while
1375                  * the filesystem is in an inconsistent state.
1376                  * The lock also serializes this code with filesystem
1377                  * manipulations between entry to zfs_suspend_fs() and return
1378                  * from zfs_resume_fs().
1379                  */
1380                 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1381                 zfs_unregister_callbacks(zfsvfs);
1382                 error = zfs_register_callbacks(vfsp);
1383                 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1384                 goto out;
1385         }
1386
1387         /* Initial root mount: try hard to import the requested root pool. */
1388         if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
1389             (vfsp->vfs_flag & MNT_UPDATE) == 0) {
1390                 char pname[MAXNAMELEN];
1391
1392                 error = getpoolname(osname, pname);
1393                 if (error == 0)
1394                         error = spa_import_rootpool(pname, checkpointrewind);
1395                 if (error)
1396                         goto out;
1397         }
1398         DROP_GIANT();
1399         error = zfs_domount(vfsp, osname);
1400         PICKUP_GIANT();
1401
1402 out:
1403         return (error);
1404 }
1405
1406 static int
1407 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
1408 {
1409         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1410         uint64_t refdbytes, availbytes, usedobjs, availobjs;
1411
1412         statp->f_version = STATFS_VERSION;
1413
1414         ZFS_ENTER(zfsvfs);
1415
1416         dmu_objset_space(zfsvfs->z_os,
1417             &refdbytes, &availbytes, &usedobjs, &availobjs);
1418
1419         /*
1420          * The underlying storage pool actually uses multiple block sizes.
1421          * We report the fragsize as the smallest block size we support,
1422          * and we report our blocksize as the filesystem's maximum blocksize.
1423          */
1424         statp->f_bsize = SPA_MINBLOCKSIZE;
1425         statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
1426
1427         /*
1428          * The following report "total" blocks of various kinds in the
1429          * file system, but reported in terms of f_frsize - the
1430          * "fragment" size.
1431          */
1432
1433         statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1434         statp->f_bfree = availbytes / statp->f_bsize;
1435         statp->f_bavail = statp->f_bfree; /* no root reservation */
1436
1437         /*
1438          * statvfs() should really be called statufs(), because it assumes
1439          * static metadata.  ZFS doesn't preallocate files, so the best
1440          * we can do is report the max that could possibly fit in f_files,
1441          * and that minus the number actually used in f_ffree.
1442          * For f_ffree, report the smaller of the number of object available
1443          * and the number of blocks (each object will take at least a block).
1444          */
1445         statp->f_ffree = MIN(availobjs, statp->f_bfree);
1446         statp->f_files = statp->f_ffree + usedobjs;
1447
1448         /*
1449          * We're a zfs filesystem.
1450          */
1451         strlcpy(statp->f_fstypename, "zfs",
1452             sizeof (statp->f_fstypename));
1453
1454         strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1455             sizeof (statp->f_mntfromname));
1456         strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1457             sizeof (statp->f_mntonname));
1458
1459         statp->f_namemax = MAXNAMELEN - 1;
1460
1461         ZFS_EXIT(zfsvfs);
1462         return (0);
1463 }
1464
1465 static int
1466 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1467 {
1468         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1469         znode_t *rootzp;
1470         int error;
1471
1472         ZFS_ENTER(zfsvfs);
1473
1474         error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1475         if (error == 0)
1476                 *vpp = ZTOV(rootzp);
1477
1478         ZFS_EXIT(zfsvfs);
1479
1480         if (error == 0) {
1481                 error = vn_lock(*vpp, flags);
1482                 if (error != 0) {
1483                         VN_RELE(*vpp);
1484                         *vpp = NULL;
1485                 }
1486         }
1487         return (error);
1488 }
1489
/*
 * Teardown the zfsvfs::z_os.
 *
 * Waits for outstanding zrele work, takes the teardown locks for writing,
 * closes the ZIL, releases the SA handles of all znodes, unregisters the
 * property callbacks and evicts the objset's cached dbufs.
 *
 * Note, if 'unmounting' is FALSE and 0 is returned, we return with the
 * 'z_teardown_lock' and 'z_teardown_inactive_lock' held.  If 'unmounting'
 * is FALSE and the filesystem was already unmounted or has no z_os, both
 * locks are dropped again and EIO is returned.  If 'unmounting' is TRUE,
 * z_unmounted is set and both locks are dropped before returning 0.
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
        znode_t *zp;
        dsl_dir_t *dd;

        /*
         * If someone has not already unmounted this file system,
         * drain the zrele_taskq to ensure all active references to the
         * zfsvfs_t have been handled; only then can it be safely destroyed.
         */
        if (zfsvfs->z_os) {
                /*
                 * If we're unmounting we have to wait for the list to
                 * drain completely.
                 *
                 * If we're not unmounting there's no guarantee the list
                 * will drain completely, but zreles run from the taskq
                 * may add the parents of dir-based xattrs to the taskq
                 * so we want to wait for these.
                 *
                 * We can safely read z_nr_znodes without locking because the
                 * VFS has already blocked operations which add to the
                 * z_all_znodes list and thus increment z_nr_znodes.
                 */
                int round = 0;
                while (zfsvfs->z_nr_znodes > 0) {
                        taskq_wait_outstanding(dsl_pool_zrele_taskq(
                            dmu_objset_pool(zfsvfs->z_os)), 0);
                        /* When not unmounting, give up after two rounds. */
                        if (++round > 1 && !unmounting)
                                break;
                }
        }
        ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);

        if (!unmounting) {
                /*
                 * We purge the parent filesystem's vfsp as the parent
                 * filesystem and all of its snapshots have their vnode's
                 * v_vfsp set to the parent's filesystem's vfsp.  Note,
                 * 'z_parent' is self referential for non-snapshots.
                 */
#ifdef FREEBSD_NAMECACHE
#if __FreeBSD_version >= 1300117
                cache_purgevfs(zfsvfs->z_parent->z_vfs);
#else
                cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
#endif
#endif
        }

        /*
         * Close the zil. NB: Can't close the zil while zfs_inactive
         * threads are blocked as zil_close can call zfs_inactive.
         */
        if (zfsvfs->z_log) {
                zil_close(zfsvfs->z_log);
                zfsvfs->z_log = NULL;
        }

        ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);

        /*
         * If we are not unmounting (ie: online recv) and someone already
         * unmounted this file system while we were doing the switcheroo,
         * or a reopen of z_os failed then just bail out now.
         */
        if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
                ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
                ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
                return (SET_ERROR(EIO));
        }

        /*
         * At this point there are no vops active, and any new vops will
         * fail with EIO since we have z_teardown_lock for writer (only
         * relevant for forced unmount).
         *
         * Release all holds on dbufs.
         */
        mutex_enter(&zfsvfs->z_znodes_lock);
        for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
            zp = list_next(&zfsvfs->z_all_znodes, zp)) {
                if (zp->z_sa_hdl != NULL) {
                        zfs_znode_dmu_fini(zp);
                }
        }
        mutex_exit(&zfsvfs->z_znodes_lock);

        /*
         * If we are unmounting, set the unmounted flag and let new vops
         * unblock.  zfs_inactive will have the unmounted behavior, and all
         * other vops will fail with EIO.
         */
        if (unmounting) {
                zfsvfs->z_unmounted = B_TRUE;
                ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
                ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
        }

        /*
         * z_os will be NULL if there was an error in attempting to reopen
         * zfsvfs, so just return as the properties had already been
         * unregistered and cached data had been evicted before.
         */
        if (zfsvfs->z_os == NULL)
                return (0);

        /*
         * Unregister properties.
         */
        zfs_unregister_callbacks(zfsvfs);

        /*
         * Evict cached data.  A filesystem that is not read-only first
         * waits for the pool to sync.
         */
        if (!zfs_is_readonly(zfsvfs))
                txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
        dmu_objset_evict_dbufs(zfsvfs->z_os);
        dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
        dsl_dir_cancel_waiters(dd);

        return (0);
}
1620
1621 static int
1622 zfs_umount(vfs_t *vfsp, int fflag)
1623 {
1624         kthread_t *td = curthread;
1625         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1626         objset_t *os;
1627         cred_t *cr = td->td_ucred;
1628         int ret;
1629
1630         ret = secpolicy_fs_unmount(cr, vfsp);
1631         if (ret) {
1632                 if (dsl_deleg_access((char *)vfsp->vfs_resource,
1633                     ZFS_DELEG_PERM_MOUNT, cr))
1634                         return (ret);
1635         }
1636
1637         /*
1638          * Unmount any snapshots mounted under .zfs before unmounting the
1639          * dataset itself.
1640          */
1641         if (zfsvfs->z_ctldir != NULL) {
1642                 if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
1643                         return (ret);
1644         }
1645
1646         if (fflag & MS_FORCE) {
1647                 /*
1648                  * Mark file system as unmounted before calling
1649                  * vflush(FORCECLOSE). This way we ensure no future vnops
1650                  * will be called and risk operating on DOOMED vnodes.
1651                  */
1652                 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1653                 zfsvfs->z_unmounted = B_TRUE;
1654                 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1655         }
1656
1657         /*
1658          * Flush all the files.
1659          */
1660         ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
1661         if (ret != 0)
1662                 return (ret);
1663         while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
1664             &zfsvfs->z_unlinked_drain_task, NULL) != 0)
1665                 taskqueue_drain(zfsvfs_taskq->tq_queue,
1666                     &zfsvfs->z_unlinked_drain_task);
1667
1668         VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
1669         os = zfsvfs->z_os;
1670
1671         /*
1672          * z_os will be NULL if there was an error in
1673          * attempting to reopen zfsvfs.
1674          */
1675         if (os != NULL) {
1676                 /*
1677                  * Unset the objset user_ptr.
1678                  */
1679                 mutex_enter(&os->os_user_ptr_lock);
1680                 dmu_objset_set_user(os, NULL);
1681                 mutex_exit(&os->os_user_ptr_lock);
1682
1683                 /*
1684                  * Finally release the objset
1685                  */
1686                 dmu_objset_disown(os, B_TRUE, zfsvfs);
1687         }
1688
1689         /*
1690          * We can now safely destroy the '.zfs' directory node.
1691          */
1692         if (zfsvfs->z_ctldir != NULL)
1693                 zfsctl_destroy(zfsvfs);
1694         zfs_freevfs(vfsp);
1695
1696         return (0);
1697 }
1698
1699 static int
1700 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
1701 {
1702         zfsvfs_t        *zfsvfs = vfsp->vfs_data;
1703         znode_t         *zp;
1704         int             err;
1705
1706         /*
1707          * zfs_zget() can't operate on virtual entries like .zfs/ or
1708          * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
1709          * This will make NFS to switch to LOOKUP instead of using VGET.
1710          */
1711         if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
1712             (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
1713                 return (EOPNOTSUPP);
1714
1715         ZFS_ENTER(zfsvfs);
1716         err = zfs_zget(zfsvfs, ino, &zp);
1717         if (err == 0 && zp->z_unlinked) {
1718                 vrele(ZTOV(zp));
1719                 err = EINVAL;
1720         }
1721         if (err == 0)
1722                 *vpp = ZTOV(zp);
1723         ZFS_EXIT(zfsvfs);
1724         if (err == 0) {
1725                 err = vn_lock(*vpp, flags);
1726                 if (err != 0)
1727                         vrele(*vpp);
1728         }
1729         if (err != 0)
1730                 *vpp = NULL;
1731         return (err);
1732 }
1733
1734 static int
1735 #if __FreeBSD_version >= 1300098
1736 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
1737     struct ucred **credanonp, int *numsecflavors, int *secflavors)
1738 #else
1739 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
1740     struct ucred **credanonp, int *numsecflavors, int **secflavors)
1741 #endif
1742 {
1743         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1744
1745         /*
1746          * If this is regular file system vfsp is the same as
1747          * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1748          * zfsvfs->z_parent->z_vfs represents parent file system
1749          * which we have to use here, because only this file system
1750          * has mnt_export configured.
1751          */
1752         return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
1753             credanonp, numsecflavors, secflavors));
1754 }
1755
1756 _Static_assert(sizeof (struct fid) >= SHORT_FID_LEN,
1757         "struct fid bigger than SHORT_FID_LEN");
1758 _Static_assert(sizeof (struct fid) >= LONG_FID_LEN,
1759         "struct fid bigger than LONG_FID_LEN");
1760
1761 static int
1762 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
1763 {
1764         struct componentname cn;
1765         zfsvfs_t        *zfsvfs = vfsp->vfs_data;
1766         znode_t         *zp;
1767         vnode_t         *dvp;
1768         uint64_t        object = 0;
1769         uint64_t        fid_gen = 0;
1770         uint64_t        setgen = 0;
1771         uint64_t        gen_mask;
1772         uint64_t        zp_gen;
1773         int             i, err;
1774
1775         *vpp = NULL;
1776
1777         ZFS_ENTER(zfsvfs);
1778
1779         /*
1780          * On FreeBSD we can get snapshot's mount point or its parent file
1781          * system mount point depending if snapshot is already mounted or not.
1782          */
1783         if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
1784                 zfid_long_t     *zlfid = (zfid_long_t *)fidp;
1785                 uint64_t        objsetid = 0;
1786
1787                 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1788                         objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1789
1790                 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1791                         setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1792
1793                 ZFS_EXIT(zfsvfs);
1794
1795                 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1796                 if (err)
1797                         return (SET_ERROR(EINVAL));
1798                 ZFS_ENTER(zfsvfs);
1799         }
1800
1801         if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1802                 zfid_short_t    *zfid = (zfid_short_t *)fidp;
1803
1804                 for (i = 0; i < sizeof (zfid->zf_object); i++)
1805                         object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1806
1807                 for (i = 0; i < sizeof (zfid->zf_gen); i++)
1808                         fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1809         } else {
1810                 ZFS_EXIT(zfsvfs);
1811                 return (SET_ERROR(EINVAL));
1812         }
1813
1814         if (fidp->fid_len == LONG_FID_LEN && (fid_gen > 1 || setgen != 0)) {
1815                 dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n",
1816                     (u_longlong_t)fid_gen, (u_longlong_t)setgen);
1817                 return (SET_ERROR(EINVAL));
1818         }
1819
1820         /*
1821          * A zero fid_gen means we are in .zfs or the .zfs/snapshot
1822          * directory tree. If the object == zfsvfs->z_shares_dir, then
1823          * we are in the .zfs/shares directory tree.
1824          */
1825         if ((fid_gen == 0 &&
1826             (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
1827             (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
1828                 ZFS_EXIT(zfsvfs);
1829                 VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
1830                 if (object == ZFSCTL_INO_SNAPDIR) {
1831                         cn.cn_nameptr = "snapshot";
1832                         cn.cn_namelen = strlen(cn.cn_nameptr);
1833                         cn.cn_nameiop = LOOKUP;
1834                         cn.cn_flags = ISLASTCN | LOCKLEAF;
1835                         cn.cn_lkflags = flags;
1836                         VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1837                         vput(dvp);
1838                 } else if (object == zfsvfs->z_shares_dir) {
1839                         /*
1840                          * XXX This branch must not be taken,
1841                          * if it is, then the lookup below will
1842                          * explode.
1843                          */
1844                         cn.cn_nameptr = "shares";
1845                         cn.cn_namelen = strlen(cn.cn_nameptr);
1846                         cn.cn_nameiop = LOOKUP;
1847                         cn.cn_flags = ISLASTCN;
1848                         cn.cn_lkflags = flags;
1849                         VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1850                         vput(dvp);
1851                 } else {
1852                         *vpp = dvp;
1853                 }
1854                 return (err);
1855         }
1856
1857         gen_mask = -1ULL >> (64 - 8 * i);
1858
1859         dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object,
1860             (u_longlong_t)fid_gen,
1861             (u_longlong_t)gen_mask);
1862         if ((err = zfs_zget(zfsvfs, object, &zp))) {
1863                 ZFS_EXIT(zfsvfs);
1864                 return (err);
1865         }
1866         (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1867             sizeof (uint64_t));
1868         zp_gen = zp_gen & gen_mask;
1869         if (zp_gen == 0)
1870                 zp_gen = 1;
1871         if (zp->z_unlinked || zp_gen != fid_gen) {
1872                 dprintf("znode gen (%llu) != fid gen (%llu)\n",
1873                     (u_longlong_t)zp_gen, (u_longlong_t)fid_gen);
1874                 vrele(ZTOV(zp));
1875                 ZFS_EXIT(zfsvfs);
1876                 return (SET_ERROR(EINVAL));
1877         }
1878
1879         *vpp = ZTOV(zp);
1880         ZFS_EXIT(zfsvfs);
1881         err = vn_lock(*vpp, flags);
1882         if (err == 0)
1883                 vnode_create_vobject(*vpp, zp->z_size, curthread);
1884         else
1885                 *vpp = NULL;
1886         return (err);
1887 }
1888
1889 /*
1890  * Block out VOPs and close zfsvfs_t::z_os
1891  *
1892  * Note, if successful, then we return with the 'z_teardown_lock' and
1893  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
1894  * dataset and objset intact so that they can be atomically handed off during
1895  * a subsequent rollback or recv operation and the resume thereafter.
1896  */
1897 int
1898 zfs_suspend_fs(zfsvfs_t *zfsvfs)
1899 {
1900         int error;
1901
1902         if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1903                 return (error);
1904
1905         return (0);
1906 }
1907
1908 /*
1909  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
1910  * is an invariant across any of the operations that can be performed while the
1911  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
1912  * are the same: the relevant objset and associated dataset are owned by
1913  * zfsvfs, held, and long held on entry.
1914  */
1915 int
1916 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
1917 {
1918         int err;
1919         znode_t *zp;
1920
1921         ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
1922         ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
1923
1924         /*
1925          * We already own this, so just update the objset_t, as the one we
1926          * had before may have been evicted.
1927          */
1928         objset_t *os;
1929         VERIFY3P(ds->ds_owner, ==, zfsvfs);
1930         VERIFY(dsl_dataset_long_held(ds));
1931         dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
1932         dsl_pool_config_enter(dp, FTAG);
1933         VERIFY0(dmu_objset_from_ds(ds, &os));
1934         dsl_pool_config_exit(dp, FTAG);
1935
1936         err = zfsvfs_init(zfsvfs, os);
1937         if (err != 0)
1938                 goto bail;
1939
1940         ds->ds_dir->dd_activity_cancelled = B_FALSE;
1941         VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));
1942
1943         zfs_set_fuid_feature(zfsvfs);
1944
1945         /*
1946          * Attempt to re-establish all the active znodes with
1947          * their dbufs.  If a zfs_rezget() fails, then we'll let
1948          * any potential callers discover that via ZFS_ENTER_VERIFY_VP
1949          * when they try to use their znode.
1950          */
1951         mutex_enter(&zfsvfs->z_znodes_lock);
1952         for (zp = list_head(&zfsvfs->z_all_znodes); zp;
1953             zp = list_next(&zfsvfs->z_all_znodes, zp)) {
1954                 (void) zfs_rezget(zp);
1955         }
1956         mutex_exit(&zfsvfs->z_znodes_lock);
1957
1958 bail:
1959         /* release the VOPs */
1960         ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
1961         ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1962
1963         if (err) {
1964                 /*
1965                  * Since we couldn't setup the sa framework, try to force
1966                  * unmount this file system.
1967                  */
1968                 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
1969                         vfs_ref(zfsvfs->z_vfs);
1970                         (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
1971                 }
1972         }
1973         return (err);
1974 }
1975
1976 static void
1977 zfs_freevfs(vfs_t *vfsp)
1978 {
1979         zfsvfs_t *zfsvfs = vfsp->vfs_data;
1980
1981         zfsvfs_free(zfsvfs);
1982
1983         atomic_dec_32(&zfs_active_fs_count);
1984 }
1985
#ifdef __i386__
/* Headers needed only by the i386 vnode-limit tuning below. */
#include <sys/vmmeter.h>

#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>

/*
 * Copy of desiredvnodes saved by zfs_vnodes_adjust() and written back by
 * zfs_vnodes_adjust_back().
 */
static int desiredvnodes_backup;
#endif
1996
/*
 * i386 only: remember the current desiredvnodes and, if it still has the
 * value computed below, lower it to three quarters of that value.
 * Compiles to an empty function on every other architecture.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
        int untuned;

        desiredvnodes_backup = desiredvnodes;

        /*
         * Recompute the value the same way vntblinit() does.  If the
         * result differs from desiredvnodes, the administrator has tuned
         * it and we leave it alone.
         */
        untuned = min(maxproc + vm_cnt.v_page_count / 4,
            2 * vm_kmem_size /
            (5 * (sizeof (struct vm_object) + sizeof (struct vnode))));
        if (untuned != desiredvnodes)
                return;

        desiredvnodes = (3 * untuned) / 4;
#endif
}
2017
/*
 * i386 only: put back the desiredvnodes value that zfs_vnodes_adjust()
 * saved.  Compiles to an empty function elsewhere.
 */
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
        desiredvnodes = desiredvnodes_backup;
#endif
}
2026
2027 void
2028 zfs_init(void)
2029 {
2030
2031         printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
2032
2033         /*
2034          * Initialize .zfs directory structures
2035          */
2036         zfsctl_init();
2037
2038         /*
2039          * Initialize znode cache, vnode ops, etc...
2040          */
2041         zfs_znode_init();
2042
2043         /*
2044          * Reduce number of vnodes. Originally number of vnodes is calculated
2045          * with UFS inode in mind. We reduce it here, because it's too big for
2046          * ZFS/i386.
2047          */
2048         zfs_vnodes_adjust();
2049
2050         dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
2051
2052         zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
2053 }
2054
2055 void
2056 zfs_fini(void)
2057 {
2058         taskq_destroy(zfsvfs_taskq);
2059         zfsctl_fini();
2060         zfs_znode_fini();
2061         zfs_vnodes_adjust_back();
2062 }
2063
2064 int
2065 zfs_busy(void)
2066 {
2067         return (zfs_active_fs_count != 0);
2068 }
2069
2070 /*
2071  * Release VOPs and unmount a suspended filesystem.
2072  */
2073 int
2074 zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2075 {
2076         ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
2077         ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
2078
2079         /*
2080          * We already own this, so just hold and rele it to update the
2081          * objset_t, as the one we had before may have been evicted.
2082          */
2083         objset_t *os;
2084         VERIFY3P(ds->ds_owner, ==, zfsvfs);
2085         VERIFY(dsl_dataset_long_held(ds));
2086         dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
2087         dsl_pool_config_enter(dp, FTAG);
2088         VERIFY0(dmu_objset_from_ds(ds, &os));
2089         dsl_pool_config_exit(dp, FTAG);
2090         zfsvfs->z_os = os;
2091
2092         /* release the VOPs */
2093         ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
2094         ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
2095
2096         /*
2097          * Try to force unmount this file system.
2098          */
2099         (void) zfs_umount(zfsvfs->z_vfs, 0);
2100         zfsvfs->z_unmounted = B_TRUE;
2101         return (0);
2102 }
2103
2104 int
2105 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2106 {
2107         int error;
2108         objset_t *os = zfsvfs->z_os;
2109         dmu_tx_t *tx;
2110
2111         if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2112                 return (SET_ERROR(EINVAL));
2113
2114         if (newvers < zfsvfs->z_version)
2115                 return (SET_ERROR(EINVAL));
2116
2117         if (zfs_spa_version_map(newvers) >
2118             spa_version(dmu_objset_spa(zfsvfs->z_os)))
2119                 return (SET_ERROR(ENOTSUP));
2120
2121         tx = dmu_tx_create(os);
2122         dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2123         if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2124                 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2125                     ZFS_SA_ATTRS);
2126                 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2127         }
2128         error = dmu_tx_assign(tx, TXG_WAIT);
2129         if (error) {
2130                 dmu_tx_abort(tx);
2131                 return (error);
2132         }
2133
2134         error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2135             8, 1, &newvers, tx);
2136
2137         if (error) {
2138                 dmu_tx_commit(tx);
2139                 return (error);
2140         }
2141
2142         if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2143                 uint64_t sa_obj;
2144
2145                 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2146                     SPA_VERSION_SA);
2147                 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2148                     DMU_OT_NONE, 0, tx);
2149
2150                 error = zap_add(os, MASTER_NODE_OBJ,
2151                     ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2152                 ASSERT0(error);
2153
2154                 VERIFY0(sa_set_sa_object(os, sa_obj));
2155                 sa_register_update_callback(os, zfs_sa_upgrade);
2156         }
2157
2158         spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
2159             "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
2160             (uintmax_t)newvers);
2161         dmu_tx_commit(tx);
2162
2163         zfsvfs->z_version = newvers;
2164         os->os_version = newvers;
2165
2166         zfs_set_fuid_feature(zfsvfs);
2167
2168         return (0);
2169 }
2170
2171 /*
2172  * Read a property stored within the master node.
2173  */
2174 int
2175 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2176 {
2177         uint64_t *cached_copy = NULL;
2178
2179         /*
2180          * Figure out where in the objset_t the cached copy would live, if it
2181          * is available for the requested property.
2182          */
2183         if (os != NULL) {
2184                 switch (prop) {
2185                 case ZFS_PROP_VERSION:
2186                         cached_copy = &os->os_version;
2187                         break;
2188                 case ZFS_PROP_NORMALIZE:
2189                         cached_copy = &os->os_normalization;
2190                         break;
2191                 case ZFS_PROP_UTF8ONLY:
2192                         cached_copy = &os->os_utf8only;
2193                         break;
2194                 case ZFS_PROP_CASE:
2195                         cached_copy = &os->os_casesensitivity;
2196                         break;
2197                 default:
2198                         break;
2199                 }
2200         }
2201         if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
2202                 *value = *cached_copy;
2203                 return (0);
2204         }
2205
2206         /*
2207          * If the property wasn't cached, look up the file system's value for
2208          * the property. For the version property, we look up a slightly
2209          * different string.
2210          */
2211         const char *pname;
2212         int error = ENOENT;
2213         if (prop == ZFS_PROP_VERSION) {
2214                 pname = ZPL_VERSION_STR;
2215         } else {
2216                 pname = zfs_prop_to_name(prop);
2217         }
2218
2219         if (os != NULL) {
2220                 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2221                 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2222         }
2223
2224         if (error == ENOENT) {
2225                 /* No value set, use the default value */
2226                 switch (prop) {
2227                 case ZFS_PROP_VERSION:
2228                         *value = ZPL_VERSION;
2229                         break;
2230                 case ZFS_PROP_NORMALIZE:
2231                 case ZFS_PROP_UTF8ONLY:
2232                         *value = 0;
2233                         break;
2234                 case ZFS_PROP_CASE:
2235                         *value = ZFS_CASE_SENSITIVE;
2236                         break;
2237                 case ZFS_PROP_ACLTYPE:
2238                         *value = ZFS_ACLTYPE_NFSV4;
2239                         break;
2240                 default:
2241                         return (error);
2242                 }
2243                 error = 0;
2244         }
2245
2246         /*
2247          * If one of the methods for getting the property value above worked,
2248          * copy it into the objset_t's cache.
2249          */
2250         if (error == 0 && cached_copy != NULL) {
2251                 *cached_copy = *value;
2252         }
2253
2254         return (error);
2255 }
2256
2257 /*
2258  * Return true if the corresponding vfs's unmounted flag is set.
2259  * Otherwise return false.
2260  * If this function returns true we know VFS unmount has been initiated.
2261  */
2262 boolean_t
2263 zfs_get_vfs_flag_unmounted(objset_t *os)
2264 {
2265         zfsvfs_t *zfvp;
2266         boolean_t unmounted = B_FALSE;
2267
2268         ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS);
2269
2270         mutex_enter(&os->os_user_ptr_lock);
2271         zfvp = dmu_objset_get_user(os);
2272         if (zfvp != NULL && zfvp->z_vfs != NULL &&
2273             (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
2274                 unmounted = B_TRUE;
2275         mutex_exit(&os->os_user_ptr_lock);
2276
2277         return (unmounted);
2278 }
2279
#ifdef _KERNEL
/*
 * Rewrite f_mntfromname of every entry on the mount list after a rename
 * from 'oldname' to 'newname'.  An entry is updated when its name equals
 * oldname, or when it starts with oldname immediately followed by '/' or
 * '@'; in the latter case the remainder of the name is kept.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
        char renamed[MAXPATHLEN];
        struct mount *mp;
        size_t oldlen = strlen(oldname);

        mtx_lock(&mountlist_mtx);
        TAILQ_FOREACH(mp, &mountlist, mnt_list) {
                char *from = mp->mnt_stat.f_mntfromname;

                if (strcmp(from, oldname) == 0) {
                        /* Exact match: plain replacement. */
                        (void) strlcpy(from, newname,
                            sizeof (mp->mnt_stat.f_mntfromname));
                } else if (strncmp(from, oldname, oldlen) == 0 &&
                    (from[oldlen] == '/' || from[oldlen] == '@')) {
                        /*
                         * Prefix match: build the new name in a scratch
                         * buffer first, because the suffix being appended
                         * lives in the destination itself.
                         */
                        (void) snprintf(renamed, sizeof (renamed), "%s%s",
                            newname, from + oldlen);
                        (void) strlcpy(from, renamed,
                            sizeof (mp->mnt_stat.f_mntfromname));
                }
        }
        mtx_unlock(&mountlist_mtx);
}
#endif