4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
24 * All rights reserved.
25 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
27 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
30 /* Portions Copyright 2010 Robert Milkowski */
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/kernel.h>
36 #include <sys/sysmacros.h>
39 #include <sys/vnode.h>
41 #include <sys/mntent.h>
42 #include <sys/mount.h>
43 #include <sys/cmn_err.h>
44 #include <sys/zfs_znode.h>
45 #include <sys/zfs_vnops.h>
46 #include <sys/zfs_dir.h>
48 #include <sys/fs/zfs.h>
50 #include <sys/dsl_prop.h>
51 #include <sys/dsl_dataset.h>
52 #include <sys/dsl_deleg.h>
56 #include <sys/sa_impl.h>
57 #include <sys/policy.h>
58 #include <sys/atomic.h>
59 #include <sys/zfs_ioctl.h>
60 #include <sys/zfs_ctldir.h>
61 #include <sys/zfs_fuid.h>
62 #include <sys/sunddi.h>
63 #include <sys/dmu_objset.h>
64 #include <sys/dsl_dir.h>
67 #include <ufs/ufs/quota.h>
68 #include <sys/zfs_quota.h>
70 #include "zfs_comutil.h"
/*
 * Compatibility fallbacks: define mount kernel flags that are absent
 * on older FreeBSD versions so the code below compiles unchanged.
 */
#ifndef MNTK_VMSETSIZE_BUG
#define	MNTK_VMSETSIZE_BUG	0
#endif
#ifndef MNTK_NOMSYNC
#define	MNTK_NOMSYNC	8
#endif
79 struct mtx zfs_debug_mtx
;
80 MTX_SYSINIT(zfs_debug_mtx
, &zfs_debug_mtx
, "zfs_debug", MTX_DEF
);
82 SYSCTL_NODE(_vfs
, OID_AUTO
, zfs
, CTLFLAG_RW
, 0, "ZFS file system");
85 SYSCTL_INT(_vfs_zfs
, OID_AUTO
, super_owner
, CTLFLAG_RW
, &zfs_super_owner
, 0,
86 "File system owners can perform privileged operation on file systems");
89 SYSCTL_INT(_vfs_zfs
, OID_AUTO
, debug
, CTLFLAG_RWTUN
, &zfs_debug_level
, 0,
92 int zfs_bclone_enabled
= 1;
93 SYSCTL_INT(_vfs_zfs
, OID_AUTO
, bclone_enabled
, CTLFLAG_RWTUN
,
94 &zfs_bclone_enabled
, 0, "Enable block cloning");
96 struct zfs_jailparam
{
100 static struct zfs_jailparam zfs_jailparam0
= {
/* OSD slot used to attach per-jail ZFS parameters to a prison. */
static int zfs_jailparam_slot;
106 SYSCTL_JAIL_PARAM_SYS_NODE(zfs
, CTLFLAG_RW
, "Jail ZFS parameters");
107 SYSCTL_JAIL_PARAM(_zfs
, mount_snapshot
, CTLTYPE_INT
| CTLFLAG_RW
, "I",
108 "Allow mounting snapshots in the .zfs directory for unjailed datasets");
110 SYSCTL_NODE(_vfs_zfs
, OID_AUTO
, version
, CTLFLAG_RD
, 0, "ZFS versions");
111 static int zfs_version_acl
= ZFS_ACL_VERSION
;
112 SYSCTL_INT(_vfs_zfs_version
, OID_AUTO
, acl
, CTLFLAG_RD
, &zfs_version_acl
, 0,
114 static int zfs_version_spa
= SPA_VERSION
;
115 SYSCTL_INT(_vfs_zfs_version
, OID_AUTO
, spa
, CTLFLAG_RD
, &zfs_version_spa
, 0,
117 static int zfs_version_zpl
= ZPL_VERSION
;
118 SYSCTL_INT(_vfs_zfs_version
, OID_AUTO
, zpl
, CTLFLAG_RD
, &zfs_version_zpl
, 0,
121 #if __FreeBSD_version >= 1400018
122 static int zfs_quotactl(vfs_t
*vfsp
, int cmds
, uid_t id
, void *arg
,
125 static int zfs_quotactl(vfs_t
*vfsp
, int cmds
, uid_t id
, void *arg
);
127 static int zfs_mount(vfs_t
*vfsp
);
128 static int zfs_umount(vfs_t
*vfsp
, int fflag
);
129 static int zfs_root(vfs_t
*vfsp
, int flags
, vnode_t
**vpp
);
130 static int zfs_statfs(vfs_t
*vfsp
, struct statfs
*statp
);
131 static int zfs_vget(vfs_t
*vfsp
, ino_t ino
, int flags
, vnode_t
**vpp
);
132 static int zfs_sync(vfs_t
*vfsp
, int waitfor
);
133 #if __FreeBSD_version >= 1300098
134 static int zfs_checkexp(vfs_t
*vfsp
, struct sockaddr
*nam
, uint64_t *extflagsp
,
135 struct ucred
**credanonp
, int *numsecflavors
, int *secflavors
);
137 static int zfs_checkexp(vfs_t
*vfsp
, struct sockaddr
*nam
, int *extflagsp
,
138 struct ucred
**credanonp
, int *numsecflavors
, int **secflavors
);
140 static int zfs_fhtovp(vfs_t
*vfsp
, fid_t
*fidp
, int flags
, vnode_t
**vpp
);
141 static void zfs_freevfs(vfs_t
*vfsp
);
143 struct vfsops zfs_vfsops
= {
144 .vfs_mount
= zfs_mount
,
145 .vfs_unmount
= zfs_umount
,
146 #if __FreeBSD_version >= 1300049
147 .vfs_root
= vfs_cache_root
,
148 .vfs_cachedroot
= zfs_root
,
150 .vfs_root
= zfs_root
,
152 .vfs_statfs
= zfs_statfs
,
153 .vfs_vget
= zfs_vget
,
154 .vfs_sync
= zfs_sync
,
155 .vfs_checkexp
= zfs_checkexp
,
156 .vfs_fhtovp
= zfs_fhtovp
,
157 .vfs_quotactl
= zfs_quotactl
,
160 #ifdef VFCF_CROSS_COPY_FILE_RANGE
161 VFS_SET(zfs_vfsops
, zfs
,
162 VFCF_DELEGADMIN
| VFCF_JAIL
| VFCF_CROSS_COPY_FILE_RANGE
);
164 VFS_SET(zfs_vfsops
, zfs
, VFCF_DELEGADMIN
| VFCF_JAIL
);
168 * We need to keep a count of active fs's.
169 * This is necessary to prevent our module
170 * from being unloaded after a umount -f
172 static uint32_t zfs_active_fs_count
= 0;
175 zfs_get_temporary_prop(dsl_dataset_t
*ds
, zfs_prop_t zfs_prop
, uint64_t *val
,
184 error
= dmu_objset_from_ds(ds
, &os
);
188 error
= getzfsvfs_impl(os
, &zfvp
);
196 if (vfs_optionisset(vfsp
, MNTOPT_NOATIME
, NULL
))
198 if (vfs_optionisset(vfsp
, MNTOPT_ATIME
, NULL
))
201 case ZFS_PROP_DEVICES
:
202 if (vfs_optionisset(vfsp
, MNTOPT_NODEVICES
, NULL
))
204 if (vfs_optionisset(vfsp
, MNTOPT_DEVICES
, NULL
))
208 if (vfs_optionisset(vfsp
, MNTOPT_NOEXEC
, NULL
))
210 if (vfs_optionisset(vfsp
, MNTOPT_EXEC
, NULL
))
213 case ZFS_PROP_SETUID
:
214 if (vfs_optionisset(vfsp
, MNTOPT_NOSETUID
, NULL
))
216 if (vfs_optionisset(vfsp
, MNTOPT_SETUID
, NULL
))
219 case ZFS_PROP_READONLY
:
220 if (vfs_optionisset(vfsp
, MNTOPT_RW
, NULL
))
222 if (vfs_optionisset(vfsp
, MNTOPT_RO
, NULL
))
226 if (zfvp
->z_flags
& ZSB_XATTR
)
229 case ZFS_PROP_NBMAND
:
230 if (vfs_optionisset(vfsp
, MNTOPT_NONBMAND
, NULL
))
232 if (vfs_optionisset(vfsp
, MNTOPT_NBMAND
, NULL
))
243 (void) strcpy(setpoint
, "temporary");
250 zfs_getquota(zfsvfs_t
*zfsvfs
, uid_t id
, int isgroup
, struct dqblk64
*dqp
)
254 uint64_t usedobj
, quotaobj
;
255 uint64_t quota
, used
= 0;
258 usedobj
= isgroup
? DMU_GROUPUSED_OBJECT
: DMU_USERUSED_OBJECT
;
259 quotaobj
= isgroup
? zfsvfs
->z_groupquota_obj
: zfsvfs
->z_userquota_obj
;
261 if (quotaobj
== 0 || zfsvfs
->z_replay
) {
265 (void) sprintf(buf
, "%llx", (longlong_t
)id
);
266 if ((error
= zap_lookup(zfsvfs
->z_os
, quotaobj
,
267 buf
, sizeof (quota
), 1, "a
)) != 0) {
268 dprintf("%s(%d): quotaobj lookup failed\n",
269 __FUNCTION__
, __LINE__
);
273 * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
274 * So we set them to be the same.
276 dqp
->dqb_bsoftlimit
= dqp
->dqb_bhardlimit
= btodb(quota
);
277 error
= zap_lookup(zfsvfs
->z_os
, usedobj
, buf
, sizeof (used
), 1, &used
);
278 if (error
&& error
!= ENOENT
) {
279 dprintf("%s(%d): usedobj failed; %d\n",
280 __FUNCTION__
, __LINE__
, error
);
283 dqp
->dqb_curblocks
= btodb(used
);
284 dqp
->dqb_ihardlimit
= dqp
->dqb_isoftlimit
= 0;
287 * Setting this to 0 causes FreeBSD quota(8) to print
288 * the number of days since the epoch, which isn't
289 * particularly useful.
291 dqp
->dqb_btime
= dqp
->dqb_itime
= now
.tv_sec
;
297 #if __FreeBSD_version >= 1400018
298 zfs_quotactl(vfs_t
*vfsp
, int cmds
, uid_t id
, void *arg
, bool *mp_busy
)
300 zfs_quotactl(vfs_t
*vfsp
, int cmds
, uid_t id
, void *arg
)
303 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
305 int cmd
, type
, error
= 0;
307 zfs_userquota_prop_t quota_type
;
308 struct dqblk64 dqblk
= { 0 };
311 cmd
= cmds
>> SUBCMDSHIFT
;
312 type
= cmds
& SUBCMDMASK
;
314 if ((error
= zfs_enter(zfsvfs
, FTAG
)) != 0)
319 id
= td
->td_ucred
->cr_ruid
;
322 id
= td
->td_ucred
->cr_rgid
;
326 #if __FreeBSD_version < 1400018
327 if (cmd
== Q_QUOTAON
|| cmd
== Q_QUOTAOFF
)
336 * ZFS_PROP_USERQUOTA,
337 * ZFS_PROP_GROUPUSED,
338 * ZFS_PROP_GROUPQUOTA
343 if (type
== USRQUOTA
)
344 quota_type
= ZFS_PROP_USERQUOTA
;
345 else if (type
== GRPQUOTA
)
346 quota_type
= ZFS_PROP_GROUPQUOTA
;
352 if (type
== USRQUOTA
)
353 quota_type
= ZFS_PROP_USERUSED
;
354 else if (type
== GRPQUOTA
)
355 quota_type
= ZFS_PROP_GROUPUSED
;
362 * Depending on the cmd, we may need to get
363 * the ruid and domain (see fuidstr_to_sid?),
364 * the fuid (how?), or other information.
365 * Create fuid using zfs_fuid_create(zfsvfs, id,
366 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
367 * I think I can use just the id?
369 * Look at zfs_id_overquota() to look up a quota.
370 * zap_lookup(something, quotaobj, fuidstring,
371 * sizeof (long long), 1, "a)
373 * See zfs_set_userquota() to set a quota.
375 if ((uint32_t)type
>= MAXQUOTAS
) {
383 error
= copyout(&bitsize
, arg
, sizeof (int));
386 // As far as I can tell, you can't turn quotas on or off on zfs
388 #if __FreeBSD_version < 1400018
394 #if __FreeBSD_version < 1400018
399 error
= copyin(arg
, &dqblk
, sizeof (dqblk
));
401 error
= zfs_set_userquota(zfsvfs
, quota_type
,
402 "", id
, dbtob(dqblk
.dqb_bhardlimit
));
405 error
= zfs_getquota(zfsvfs
, id
, type
== GRPQUOTA
, &dqblk
);
407 error
= copyout(&dqblk
, arg
, sizeof (dqblk
));
414 zfs_exit(zfsvfs
, FTAG
);
420 zfs_is_readonly(zfsvfs_t
*zfsvfs
)
422 return (!!(zfsvfs
->z_vfs
->vfs_flag
& VFS_RDONLY
));
426 zfs_sync(vfs_t
*vfsp
, int waitfor
)
430 * Data integrity is job one. We don't want a compromised kernel
431 * writing to the storage pool, so we never sync during panic.
437 * Ignore the system syncher. ZFS already commits async data
438 * at zfs_txg_timeout intervals.
440 if (waitfor
== MNT_LAZY
)
445 * Sync a specific filesystem.
447 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
451 if ((error
= zfs_enter(zfsvfs
, FTAG
)) != 0)
453 dp
= dmu_objset_pool(zfsvfs
->z_os
);
456 * If the system is shutting down, then skip any
457 * filesystems which may exist on a suspended pool.
459 if (rebooting
&& spa_suspended(dp
->dp_spa
)) {
460 zfs_exit(zfsvfs
, FTAG
);
464 if (zfsvfs
->z_log
!= NULL
)
465 zil_commit(zfsvfs
->z_log
, 0);
467 zfs_exit(zfsvfs
, FTAG
);
470 * Sync all ZFS filesystems. This is what happens when you
471 * run sync(8). Unlike other filesystems, ZFS honors the
472 * request by waiting for all pools to commit all dirty data.
481 atime_changed_cb(void *arg
, uint64_t newval
)
483 zfsvfs_t
*zfsvfs
= arg
;
485 if (newval
== TRUE
) {
486 zfsvfs
->z_atime
= TRUE
;
487 zfsvfs
->z_vfs
->vfs_flag
&= ~MNT_NOATIME
;
488 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_NOATIME
);
489 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_ATIME
, NULL
, 0);
491 zfsvfs
->z_atime
= FALSE
;
492 zfsvfs
->z_vfs
->vfs_flag
|= MNT_NOATIME
;
493 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_ATIME
);
494 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_NOATIME
, NULL
, 0);
499 xattr_changed_cb(void *arg
, uint64_t newval
)
501 zfsvfs_t
*zfsvfs
= arg
;
503 if (newval
== ZFS_XATTR_OFF
) {
504 zfsvfs
->z_flags
&= ~ZSB_XATTR
;
506 zfsvfs
->z_flags
|= ZSB_XATTR
;
508 if (newval
== ZFS_XATTR_SA
)
509 zfsvfs
->z_xattr_sa
= B_TRUE
;
511 zfsvfs
->z_xattr_sa
= B_FALSE
;
516 blksz_changed_cb(void *arg
, uint64_t newval
)
518 zfsvfs_t
*zfsvfs
= arg
;
519 ASSERT3U(newval
, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs
->z_os
)));
520 ASSERT3U(newval
, >=, SPA_MINBLOCKSIZE
);
521 ASSERT(ISP2(newval
));
523 zfsvfs
->z_max_blksz
= newval
;
524 zfsvfs
->z_vfs
->mnt_stat
.f_iosize
= newval
;
528 readonly_changed_cb(void *arg
, uint64_t newval
)
530 zfsvfs_t
*zfsvfs
= arg
;
533 /* XXX locking on vfs_flag? */
534 zfsvfs
->z_vfs
->vfs_flag
|= VFS_RDONLY
;
535 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_RW
);
536 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_RO
, NULL
, 0);
538 /* XXX locking on vfs_flag? */
539 zfsvfs
->z_vfs
->vfs_flag
&= ~VFS_RDONLY
;
540 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_RO
);
541 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_RW
, NULL
, 0);
546 setuid_changed_cb(void *arg
, uint64_t newval
)
548 zfsvfs_t
*zfsvfs
= arg
;
550 if (newval
== FALSE
) {
551 zfsvfs
->z_vfs
->vfs_flag
|= VFS_NOSETUID
;
552 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_SETUID
);
553 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_NOSETUID
, NULL
, 0);
555 zfsvfs
->z_vfs
->vfs_flag
&= ~VFS_NOSETUID
;
556 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_NOSETUID
);
557 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_SETUID
, NULL
, 0);
562 exec_changed_cb(void *arg
, uint64_t newval
)
564 zfsvfs_t
*zfsvfs
= arg
;
566 if (newval
== FALSE
) {
567 zfsvfs
->z_vfs
->vfs_flag
|= VFS_NOEXEC
;
568 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_EXEC
);
569 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_NOEXEC
, NULL
, 0);
571 zfsvfs
->z_vfs
->vfs_flag
&= ~VFS_NOEXEC
;
572 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_NOEXEC
);
573 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_EXEC
, NULL
, 0);
578 * The nbmand mount option can be changed at mount time.
579 * We can't allow it to be toggled on live file systems or incorrect
580 * behavior may be seen from cifs clients
582 * This property isn't registered via dsl_prop_register(), but this callback
583 * will be called when a file system is first mounted
586 nbmand_changed_cb(void *arg
, uint64_t newval
)
588 zfsvfs_t
*zfsvfs
= arg
;
589 if (newval
== FALSE
) {
590 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_NBMAND
);
591 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_NONBMAND
, NULL
, 0);
593 vfs_clearmntopt(zfsvfs
->z_vfs
, MNTOPT_NONBMAND
);
594 vfs_setmntopt(zfsvfs
->z_vfs
, MNTOPT_NBMAND
, NULL
, 0);
599 snapdir_changed_cb(void *arg
, uint64_t newval
)
601 zfsvfs_t
*zfsvfs
= arg
;
603 zfsvfs
->z_show_ctldir
= newval
;
607 acl_mode_changed_cb(void *arg
, uint64_t newval
)
609 zfsvfs_t
*zfsvfs
= arg
;
611 zfsvfs
->z_acl_mode
= newval
;
615 acl_inherit_changed_cb(void *arg
, uint64_t newval
)
617 zfsvfs_t
*zfsvfs
= arg
;
619 zfsvfs
->z_acl_inherit
= newval
;
623 acl_type_changed_cb(void *arg
, uint64_t newval
)
625 zfsvfs_t
*zfsvfs
= arg
;
627 zfsvfs
->z_acl_type
= newval
;
631 zfs_register_callbacks(vfs_t
*vfsp
)
633 struct dsl_dataset
*ds
= NULL
;
635 zfsvfs_t
*zfsvfs
= NULL
;
637 boolean_t readonly
= B_FALSE
;
638 boolean_t do_readonly
= B_FALSE
;
639 boolean_t setuid
= B_FALSE
;
640 boolean_t do_setuid
= B_FALSE
;
641 boolean_t exec
= B_FALSE
;
642 boolean_t do_exec
= B_FALSE
;
643 boolean_t xattr
= B_FALSE
;
644 boolean_t atime
= B_FALSE
;
645 boolean_t do_atime
= B_FALSE
;
646 boolean_t do_xattr
= B_FALSE
;
649 ASSERT3P(vfsp
, !=, NULL
);
650 zfsvfs
= vfsp
->vfs_data
;
651 ASSERT3P(zfsvfs
, !=, NULL
);
655 * This function can be called for a snapshot when we update snapshot's
656 * mount point, which isn't really supported.
658 if (dmu_objset_is_snapshot(os
))
662 * The act of registering our callbacks will destroy any mount
663 * options we may have. In order to enable temporary overrides
664 * of mount options, we stash away the current values and
665 * restore them after we register the callbacks.
667 if (vfs_optionisset(vfsp
, MNTOPT_RO
, NULL
) ||
668 !spa_writeable(dmu_objset_spa(os
))) {
670 do_readonly
= B_TRUE
;
671 } else if (vfs_optionisset(vfsp
, MNTOPT_RW
, NULL
)) {
673 do_readonly
= B_TRUE
;
675 if (vfs_optionisset(vfsp
, MNTOPT_NOSETUID
, NULL
)) {
678 } else if (vfs_optionisset(vfsp
, MNTOPT_SETUID
, NULL
)) {
682 if (vfs_optionisset(vfsp
, MNTOPT_NOEXEC
, NULL
)) {
685 } else if (vfs_optionisset(vfsp
, MNTOPT_EXEC
, NULL
)) {
689 if (vfs_optionisset(vfsp
, MNTOPT_NOXATTR
, NULL
)) {
690 zfsvfs
->z_xattr
= xattr
= ZFS_XATTR_OFF
;
692 } else if (vfs_optionisset(vfsp
, MNTOPT_XATTR
, NULL
)) {
693 zfsvfs
->z_xattr
= xattr
= ZFS_XATTR_DIR
;
695 } else if (vfs_optionisset(vfsp
, MNTOPT_DIRXATTR
, NULL
)) {
696 zfsvfs
->z_xattr
= xattr
= ZFS_XATTR_DIR
;
698 } else if (vfs_optionisset(vfsp
, MNTOPT_SAXATTR
, NULL
)) {
699 zfsvfs
->z_xattr
= xattr
= ZFS_XATTR_SA
;
702 if (vfs_optionisset(vfsp
, MNTOPT_NOATIME
, NULL
)) {
705 } else if (vfs_optionisset(vfsp
, MNTOPT_ATIME
, NULL
)) {
711 * We need to enter pool configuration here, so that we can use
712 * dsl_prop_get_int_ds() to handle the special nbmand property below.
713 * dsl_prop_get_integer() can not be used, because it has to acquire
714 * spa_namespace_lock and we can not do that because we already hold
715 * z_teardown_lock. The problem is that spa_write_cachefile() is called
716 * with spa_namespace_lock held and the function calls ZFS vnode
717 * operations to write the cache file and thus z_teardown_lock is
718 * acquired after spa_namespace_lock.
720 ds
= dmu_objset_ds(os
);
721 dsl_pool_config_enter(dmu_objset_pool(os
), FTAG
);
724 * nbmand is a special property. It can only be changed at
727 * This is weird, but it is documented to only be changeable
730 if (vfs_optionisset(vfsp
, MNTOPT_NONBMAND
, NULL
)) {
732 } else if (vfs_optionisset(vfsp
, MNTOPT_NBMAND
, NULL
)) {
734 } else if ((error
= dsl_prop_get_int_ds(ds
, "nbmand", &nbmand
)) != 0) {
735 dsl_pool_config_exit(dmu_objset_pool(os
), FTAG
);
740 * Register property callbacks.
742 * It would probably be fine to just check for i/o error from
743 * the first prop_register(), but I guess I like to go
746 error
= dsl_prop_register(ds
,
747 zfs_prop_to_name(ZFS_PROP_ATIME
), atime_changed_cb
, zfsvfs
);
748 error
= error
? error
: dsl_prop_register(ds
,
749 zfs_prop_to_name(ZFS_PROP_XATTR
), xattr_changed_cb
, zfsvfs
);
750 error
= error
? error
: dsl_prop_register(ds
,
751 zfs_prop_to_name(ZFS_PROP_RECORDSIZE
), blksz_changed_cb
, zfsvfs
);
752 error
= error
? error
: dsl_prop_register(ds
,
753 zfs_prop_to_name(ZFS_PROP_READONLY
), readonly_changed_cb
, zfsvfs
);
754 error
= error
? error
: dsl_prop_register(ds
,
755 zfs_prop_to_name(ZFS_PROP_SETUID
), setuid_changed_cb
, zfsvfs
);
756 error
= error
? error
: dsl_prop_register(ds
,
757 zfs_prop_to_name(ZFS_PROP_EXEC
), exec_changed_cb
, zfsvfs
);
758 error
= error
? error
: dsl_prop_register(ds
,
759 zfs_prop_to_name(ZFS_PROP_SNAPDIR
), snapdir_changed_cb
, zfsvfs
);
760 error
= error
? error
: dsl_prop_register(ds
,
761 zfs_prop_to_name(ZFS_PROP_ACLTYPE
), acl_type_changed_cb
, zfsvfs
);
762 error
= error
? error
: dsl_prop_register(ds
,
763 zfs_prop_to_name(ZFS_PROP_ACLMODE
), acl_mode_changed_cb
, zfsvfs
);
764 error
= error
? error
: dsl_prop_register(ds
,
765 zfs_prop_to_name(ZFS_PROP_ACLINHERIT
), acl_inherit_changed_cb
,
767 dsl_pool_config_exit(dmu_objset_pool(os
), FTAG
);
772 * Invoke our callbacks to restore temporary mount options.
775 readonly_changed_cb(zfsvfs
, readonly
);
777 setuid_changed_cb(zfsvfs
, setuid
);
779 exec_changed_cb(zfsvfs
, exec
);
781 xattr_changed_cb(zfsvfs
, xattr
);
783 atime_changed_cb(zfsvfs
, atime
);
785 nbmand_changed_cb(zfsvfs
, nbmand
);
790 dsl_prop_unregister_all(ds
, zfsvfs
);
795 * Associate this zfsvfs with the given objset, which must be owned.
796 * This will cache a bunch of on-disk state from the objset in the
800 zfsvfs_init(zfsvfs_t
*zfsvfs
, objset_t
*os
)
805 zfsvfs
->z_max_blksz
= SPA_OLD_MAXBLOCKSIZE
;
806 zfsvfs
->z_show_ctldir
= ZFS_SNAPDIR_VISIBLE
;
809 error
= zfs_get_zplprop(os
, ZFS_PROP_VERSION
, &zfsvfs
->z_version
);
812 if (zfsvfs
->z_version
>
813 zfs_zpl_version_map(spa_version(dmu_objset_spa(os
)))) {
814 (void) printf("Can't mount a version %lld file system "
815 "on a version %lld pool\n. Pool must be upgraded to mount "
816 "this file system.", (u_longlong_t
)zfsvfs
->z_version
,
817 (u_longlong_t
)spa_version(dmu_objset_spa(os
)));
818 return (SET_ERROR(ENOTSUP
));
820 error
= zfs_get_zplprop(os
, ZFS_PROP_NORMALIZE
, &val
);
823 zfsvfs
->z_norm
= (int)val
;
825 error
= zfs_get_zplprop(os
, ZFS_PROP_UTF8ONLY
, &val
);
828 zfsvfs
->z_utf8
= (val
!= 0);
830 error
= zfs_get_zplprop(os
, ZFS_PROP_CASE
, &val
);
833 zfsvfs
->z_case
= (uint_t
)val
;
835 error
= zfs_get_zplprop(os
, ZFS_PROP_ACLTYPE
, &val
);
838 zfsvfs
->z_acl_type
= (uint_t
)val
;
841 * Fold case on file systems that are always or sometimes case
844 if (zfsvfs
->z_case
== ZFS_CASE_INSENSITIVE
||
845 zfsvfs
->z_case
== ZFS_CASE_MIXED
)
846 zfsvfs
->z_norm
|= U8_TEXTPREP_TOUPPER
;
848 zfsvfs
->z_use_fuids
= USE_FUIDS(zfsvfs
->z_version
, zfsvfs
->z_os
);
849 zfsvfs
->z_use_sa
= USE_SA(zfsvfs
->z_version
, zfsvfs
->z_os
);
852 if (zfsvfs
->z_use_sa
) {
853 /* should either have both of these objects or none */
854 error
= zap_lookup(os
, MASTER_NODE_OBJ
, ZFS_SA_ATTRS
, 8, 1,
859 error
= zfs_get_zplprop(os
, ZFS_PROP_XATTR
, &val
);
860 if (error
== 0 && val
== ZFS_XATTR_SA
)
861 zfsvfs
->z_xattr_sa
= B_TRUE
;
864 error
= sa_setup(os
, sa_obj
, zfs_attr_table
, ZPL_END
,
865 &zfsvfs
->z_attr_table
);
869 if (zfsvfs
->z_version
>= ZPL_VERSION_SA
)
870 sa_register_update_callback(os
, zfs_sa_upgrade
);
872 error
= zap_lookup(os
, MASTER_NODE_OBJ
, ZFS_ROOT_OBJ
, 8, 1,
876 ASSERT3U(zfsvfs
->z_root
, !=, 0);
878 error
= zap_lookup(os
, MASTER_NODE_OBJ
, ZFS_UNLINKED_SET
, 8, 1,
879 &zfsvfs
->z_unlinkedobj
);
883 error
= zap_lookup(os
, MASTER_NODE_OBJ
,
884 zfs_userquota_prop_prefixes
[ZFS_PROP_USERQUOTA
],
885 8, 1, &zfsvfs
->z_userquota_obj
);
887 zfsvfs
->z_userquota_obj
= 0;
891 error
= zap_lookup(os
, MASTER_NODE_OBJ
,
892 zfs_userquota_prop_prefixes
[ZFS_PROP_GROUPQUOTA
],
893 8, 1, &zfsvfs
->z_groupquota_obj
);
895 zfsvfs
->z_groupquota_obj
= 0;
899 error
= zap_lookup(os
, MASTER_NODE_OBJ
,
900 zfs_userquota_prop_prefixes
[ZFS_PROP_PROJECTQUOTA
],
901 8, 1, &zfsvfs
->z_projectquota_obj
);
903 zfsvfs
->z_projectquota_obj
= 0;
907 error
= zap_lookup(os
, MASTER_NODE_OBJ
,
908 zfs_userquota_prop_prefixes
[ZFS_PROP_USEROBJQUOTA
],
909 8, 1, &zfsvfs
->z_userobjquota_obj
);
911 zfsvfs
->z_userobjquota_obj
= 0;
915 error
= zap_lookup(os
, MASTER_NODE_OBJ
,
916 zfs_userquota_prop_prefixes
[ZFS_PROP_GROUPOBJQUOTA
],
917 8, 1, &zfsvfs
->z_groupobjquota_obj
);
919 zfsvfs
->z_groupobjquota_obj
= 0;
923 error
= zap_lookup(os
, MASTER_NODE_OBJ
,
924 zfs_userquota_prop_prefixes
[ZFS_PROP_PROJECTOBJQUOTA
],
925 8, 1, &zfsvfs
->z_projectobjquota_obj
);
927 zfsvfs
->z_projectobjquota_obj
= 0;
931 error
= zap_lookup(os
, MASTER_NODE_OBJ
, ZFS_FUID_TABLES
, 8, 1,
932 &zfsvfs
->z_fuid_obj
);
934 zfsvfs
->z_fuid_obj
= 0;
938 error
= zap_lookup(os
, MASTER_NODE_OBJ
, ZFS_SHARES_DIR
, 8, 1,
939 &zfsvfs
->z_shares_dir
);
941 zfsvfs
->z_shares_dir
= 0;
946 * Only use the name cache if we are looking for a
947 * name on a file system that does not require normalization
948 * or case folding. We can also look there if we happen to be
949 * on a non-normalizing, mixed sensitivity file system IF we
950 * are looking for the exact name (which is always the case on
953 zfsvfs
->z_use_namecache
= !zfsvfs
->z_norm
||
954 ((zfsvfs
->z_case
== ZFS_CASE_MIXED
) &&
955 !(zfsvfs
->z_norm
& ~U8_TEXTPREP_TOUPPER
));
960 taskq_t
*zfsvfs_taskq
;
963 zfsvfs_task_unlinked_drain(void *context
, int pending __unused
)
966 zfs_unlinked_drain((zfsvfs_t
*)context
);
970 zfsvfs_create(const char *osname
, boolean_t readonly
, zfsvfs_t
**zfvp
)
975 boolean_t ro
= (readonly
|| (strchr(osname
, '@') != NULL
));
978 * XXX: Fix struct statfs so this isn't necessary!
980 * The 'osname' is used as the filesystem's special node, which means
981 * it must fit in statfs.f_mntfromname, or else it can't be
982 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
983 * 'zfs unmount' to think it's not mounted when it is.
985 if (strlen(osname
) >= MNAMELEN
)
986 return (SET_ERROR(ENAMETOOLONG
));
988 zfsvfs
= kmem_zalloc(sizeof (zfsvfs_t
), KM_SLEEP
);
990 error
= dmu_objset_own(osname
, DMU_OST_ZFS
, ro
, B_TRUE
, zfsvfs
,
993 kmem_free(zfsvfs
, sizeof (zfsvfs_t
));
997 error
= zfsvfs_create_impl(zfvp
, zfsvfs
, os
);
1004 zfsvfs_create_impl(zfsvfs_t
**zfvp
, zfsvfs_t
*zfsvfs
, objset_t
*os
)
1008 zfsvfs
->z_vfs
= NULL
;
1009 zfsvfs
->z_parent
= zfsvfs
;
1011 mutex_init(&zfsvfs
->z_znodes_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1012 mutex_init(&zfsvfs
->z_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1013 list_create(&zfsvfs
->z_all_znodes
, sizeof (znode_t
),
1014 offsetof(znode_t
, z_link_node
));
1015 TASK_INIT(&zfsvfs
->z_unlinked_drain_task
, 0,
1016 zfsvfs_task_unlinked_drain
, zfsvfs
);
1017 ZFS_TEARDOWN_INIT(zfsvfs
);
1018 ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs
);
1019 rw_init(&zfsvfs
->z_fuid_lock
, NULL
, RW_DEFAULT
, NULL
);
1020 for (int i
= 0; i
!= ZFS_OBJ_MTX_SZ
; i
++)
1021 mutex_init(&zfsvfs
->z_hold_mtx
[i
], NULL
, MUTEX_DEFAULT
, NULL
);
1023 error
= zfsvfs_init(zfsvfs
, os
);
1025 dmu_objset_disown(os
, B_TRUE
, zfsvfs
);
1027 kmem_free(zfsvfs
, sizeof (zfsvfs_t
));
1036 zfsvfs_setup(zfsvfs_t
*zfsvfs
, boolean_t mounting
)
1041 * Check for a bad on-disk format version now since we
1042 * lied about owning the dataset readonly before.
1044 if (!(zfsvfs
->z_vfs
->vfs_flag
& VFS_RDONLY
) &&
1045 dmu_objset_incompatible_encryption_version(zfsvfs
->z_os
))
1046 return (SET_ERROR(EROFS
));
1048 error
= zfs_register_callbacks(zfsvfs
->z_vfs
);
1053 * If we are not mounting (ie: online recv), then we don't
1054 * have to worry about replaying the log as we blocked all
1055 * operations out since we closed the ZIL.
1060 ASSERT3P(zfsvfs
->z_kstat
.dk_kstats
, ==, NULL
);
1061 error
= dataset_kstats_create(&zfsvfs
->z_kstat
, zfsvfs
->z_os
);
1064 zfsvfs
->z_log
= zil_open(zfsvfs
->z_os
, zfs_get_data
,
1065 &zfsvfs
->z_kstat
.dk_zil_sums
);
1068 * During replay we remove the read only flag to
1069 * allow replays to succeed.
1071 readonly
= zfsvfs
->z_vfs
->vfs_flag
& VFS_RDONLY
;
1072 if (readonly
!= 0) {
1073 zfsvfs
->z_vfs
->vfs_flag
&= ~VFS_RDONLY
;
1078 if (zap_get_stats(zfsvfs
->z_os
, zfsvfs
->z_unlinkedobj
,
1080 dataset_kstats_update_nunlinks_kstat(
1081 &zfsvfs
->z_kstat
, zs
.zs_num_entries
);
1082 dprintf_ds(zfsvfs
->z_os
->os_dsl_dataset
,
1083 "num_entries in unlinked set: %llu",
1084 (u_longlong_t
)zs
.zs_num_entries
);
1087 zfs_unlinked_drain(zfsvfs
);
1088 dd
= zfsvfs
->z_os
->os_dsl_dataset
->ds_dir
;
1089 dd
->dd_activity_cancelled
= B_FALSE
;
1093 * Parse and replay the intent log.
1095 * Because of ziltest, this must be done after
1096 * zfs_unlinked_drain(). (Further note: ziltest
1097 * doesn't use readonly mounts, where
1098 * zfs_unlinked_drain() isn't called.) This is because
1099 * ziltest causes spa_sync() to think it's committed,
1100 * but actually it is not, so the intent log contains
1101 * many txg's worth of changes.
1103 * In particular, if object N is in the unlinked set in
1104 * the last txg to actually sync, then it could be
1105 * actually freed in a later txg and then reallocated
1106 * in a yet later txg. This would write a "create
1107 * object N" record to the intent log. Normally, this
1108 * would be fine because the spa_sync() would have
1109 * written out the fact that object N is free, before
1110 * we could write the "create object N" intent log
1113 * But when we are in ziltest mode, we advance the "open
1114 * txg" without actually spa_sync()-ing the changes to
1115 * disk. So we would see that object N is still
1116 * allocated and in the unlinked set, and there is an
1117 * intent log record saying to allocate it.
1119 if (spa_writeable(dmu_objset_spa(zfsvfs
->z_os
))) {
1120 if (zil_replay_disable
) {
1121 zil_destroy(zfsvfs
->z_log
, B_FALSE
);
1123 boolean_t use_nc
= zfsvfs
->z_use_namecache
;
1124 zfsvfs
->z_use_namecache
= B_FALSE
;
1125 zfsvfs
->z_replay
= B_TRUE
;
1126 zil_replay(zfsvfs
->z_os
, zfsvfs
,
1128 zfsvfs
->z_replay
= B_FALSE
;
1129 zfsvfs
->z_use_namecache
= use_nc
;
1133 /* restore readonly bit */
1135 zfsvfs
->z_vfs
->vfs_flag
|= VFS_RDONLY
;
1137 ASSERT3P(zfsvfs
->z_kstat
.dk_kstats
, !=, NULL
);
1138 zfsvfs
->z_log
= zil_open(zfsvfs
->z_os
, zfs_get_data
,
1139 &zfsvfs
->z_kstat
.dk_zil_sums
);
1143 * Set the objset user_ptr to track its zfsvfs.
1145 mutex_enter(&zfsvfs
->z_os
->os_user_ptr_lock
);
1146 dmu_objset_set_user(zfsvfs
->z_os
, zfsvfs
);
1147 mutex_exit(&zfsvfs
->z_os
->os_user_ptr_lock
);
1153 zfsvfs_free(zfsvfs_t
*zfsvfs
)
1157 zfs_fuid_destroy(zfsvfs
);
1159 mutex_destroy(&zfsvfs
->z_znodes_lock
);
1160 mutex_destroy(&zfsvfs
->z_lock
);
1161 list_destroy(&zfsvfs
->z_all_znodes
);
1162 ZFS_TEARDOWN_DESTROY(zfsvfs
);
1163 ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs
);
1164 rw_destroy(&zfsvfs
->z_fuid_lock
);
1165 for (i
= 0; i
!= ZFS_OBJ_MTX_SZ
; i
++)
1166 mutex_destroy(&zfsvfs
->z_hold_mtx
[i
]);
1167 dataset_kstats_destroy(&zfsvfs
->z_kstat
);
1168 kmem_free(zfsvfs
, sizeof (zfsvfs_t
));
1172 zfs_set_fuid_feature(zfsvfs_t
*zfsvfs
)
1174 zfsvfs
->z_use_fuids
= USE_FUIDS(zfsvfs
->z_version
, zfsvfs
->z_os
);
1175 zfsvfs
->z_use_sa
= USE_SA(zfsvfs
->z_version
, zfsvfs
->z_os
);
1179 zfs_domount(vfs_t
*vfsp
, char *osname
)
1181 uint64_t recordsize
, fsid_guid
;
1185 ASSERT3P(vfsp
, !=, NULL
);
1186 ASSERT3P(osname
, !=, NULL
);
1188 error
= zfsvfs_create(osname
, vfsp
->mnt_flag
& MNT_RDONLY
, &zfsvfs
);
1191 zfsvfs
->z_vfs
= vfsp
;
1193 if ((error
= dsl_prop_get_integer(osname
,
1194 "recordsize", &recordsize
, NULL
)))
1196 zfsvfs
->z_vfs
->vfs_bsize
= SPA_MINBLOCKSIZE
;
1197 zfsvfs
->z_vfs
->mnt_stat
.f_iosize
= recordsize
;
1199 vfsp
->vfs_data
= zfsvfs
;
1200 vfsp
->mnt_flag
|= MNT_LOCAL
;
1201 vfsp
->mnt_kern_flag
|= MNTK_LOOKUP_SHARED
;
1202 vfsp
->mnt_kern_flag
|= MNTK_SHARED_WRITES
;
1203 vfsp
->mnt_kern_flag
|= MNTK_EXTENDED_SHARED
;
1205 * This can cause a loss of coherence between ARC and page cache
1206 * on ZoF - unclear if the problem is in FreeBSD or ZoF
1208 vfsp
->mnt_kern_flag
|= MNTK_NO_IOPF
; /* vn_io_fault can be used */
1209 vfsp
->mnt_kern_flag
|= MNTK_NOMSYNC
;
1210 vfsp
->mnt_kern_flag
|= MNTK_VMSETSIZE_BUG
;
1212 #if defined(_KERNEL) && !defined(KMEM_DEBUG)
1213 vfsp
->mnt_kern_flag
|= MNTK_FPLOOKUP
;
1216 * The fsid is 64 bits, composed of an 8-bit fs type, which
1217 * separates our fsid from any other filesystem types, and a
1218 * 56-bit objset unique ID. The objset unique ID is unique to
1219 * all objsets open on this system, provided by unique_create().
1220 * The 8-bit fs type must be put in the low bits of fsid[1]
1221 * because that's where other Solaris filesystems put it.
1223 fsid_guid
= dmu_objset_fsid_guid(zfsvfs
->z_os
);
1224 ASSERT3U((fsid_guid
& ~((1ULL << 56) - 1)), ==, 0);
1225 vfsp
->vfs_fsid
.val
[0] = fsid_guid
;
1226 vfsp
->vfs_fsid
.val
[1] = ((fsid_guid
>> 32) << 8) |
1227 (vfsp
->mnt_vfc
->vfc_typenum
& 0xFF);
1230 * Set features for file system.
1232 zfs_set_fuid_feature(zfsvfs
);
1234 if (dmu_objset_is_snapshot(zfsvfs
->z_os
)) {
1237 atime_changed_cb(zfsvfs
, B_FALSE
);
1238 readonly_changed_cb(zfsvfs
, B_TRUE
);
1239 if ((error
= dsl_prop_get_integer(osname
,
1240 "xattr", &pval
, NULL
)))
1242 xattr_changed_cb(zfsvfs
, pval
);
1243 if ((error
= dsl_prop_get_integer(osname
,
1244 "acltype", &pval
, NULL
)))
1246 acl_type_changed_cb(zfsvfs
, pval
);
1247 zfsvfs
->z_issnap
= B_TRUE
;
1248 zfsvfs
->z_os
->os_sync
= ZFS_SYNC_DISABLED
;
1250 mutex_enter(&zfsvfs
->z_os
->os_user_ptr_lock
);
1251 dmu_objset_set_user(zfsvfs
->z_os
, zfsvfs
);
1252 mutex_exit(&zfsvfs
->z_os
->os_user_ptr_lock
);
1254 if ((error
= zfsvfs_setup(zfsvfs
, B_TRUE
)))
1258 vfs_mountedfrom(vfsp
, osname
);
1260 if (!zfsvfs
->z_issnap
)
1261 zfsctl_create(zfsvfs
);
1264 dmu_objset_disown(zfsvfs
->z_os
, B_TRUE
, zfsvfs
);
1265 zfsvfs_free(zfsvfs
);
1267 atomic_inc_32(&zfs_active_fs_count
);
1274 zfs_unregister_callbacks(zfsvfs_t
*zfsvfs
)
1276 objset_t
*os
= zfsvfs
->z_os
;
1278 if (!dmu_objset_is_snapshot(os
))
1279 dsl_prop_unregister_all(dmu_objset_ds(os
), zfsvfs
);
1283 getpoolname(const char *osname
, char *poolname
)
1287 p
= strchr(osname
, '/');
1289 if (strlen(osname
) >= MAXNAMELEN
)
1290 return (ENAMETOOLONG
);
1291 (void) strcpy(poolname
, osname
);
1293 if (p
- osname
>= MAXNAMELEN
)
1294 return (ENAMETOOLONG
);
1295 (void) strlcpy(poolname
, osname
, p
- osname
+ 1);
1301 fetch_osname_options(char *name
, bool *checkpointrewind
)
1304 if (name
[0] == '!') {
1305 *checkpointrewind
= true;
1306 memmove(name
, name
+ 1, strlen(name
));
1308 *checkpointrewind
= false;
1313 zfs_mount(vfs_t
*vfsp
)
1315 kthread_t
*td
= curthread
;
1316 vnode_t
*mvp
= vfsp
->mnt_vnodecovered
;
1317 cred_t
*cr
= td
->td_ucred
;
1321 bool checkpointrewind
, isctlsnap
= false;
1323 if (vfs_getopt(vfsp
->mnt_optnew
, "from", (void **)&osname
, NULL
))
1324 return (SET_ERROR(EINVAL
));
1327 * If full-owner-access is enabled and delegated administration is
1328 * turned on, we must set nosuid.
1330 if (zfs_super_owner
&&
1331 dsl_deleg_access(osname
, ZFS_DELEG_PERM_MOUNT
, cr
) != ECANCELED
) {
1332 secpolicy_fs_mount_clearopts(cr
, vfsp
);
1335 fetch_osname_options(osname
, &checkpointrewind
);
1336 isctlsnap
= (mvp
!= NULL
&& zfsctl_is_node(mvp
) &&
1337 strchr(osname
, '@') != NULL
);
1340 * Check for mount privilege?
1342 * If we don't have privilege then see if
1343 * we have local permission to allow it
1345 error
= secpolicy_fs_mount(cr
, mvp
, vfsp
);
1346 if (error
&& isctlsnap
) {
1347 secpolicy_fs_mount_clearopts(cr
, vfsp
);
1349 if (dsl_deleg_access(osname
, ZFS_DELEG_PERM_MOUNT
, cr
) != 0)
1352 if (!(vfsp
->vfs_flag
& MS_REMOUNT
)) {
1356 * Make sure user is the owner of the mount point
1357 * or has sufficient privileges.
1360 vattr
.va_mask
= AT_UID
;
1362 vn_lock(mvp
, LK_SHARED
| LK_RETRY
);
1363 if (VOP_GETATTR(mvp
, &vattr
, cr
)) {
1368 if (secpolicy_vnode_owner(mvp
, cr
, vattr
.va_uid
) != 0 &&
1369 VOP_ACCESS(mvp
, VWRITE
, cr
, td
) != 0) {
1376 secpolicy_fs_mount_clearopts(cr
, vfsp
);
1380 * Refuse to mount a filesystem if we are in a local zone and the
1381 * dataset is not visible.
1383 if (!INGLOBALZONE(curproc
) &&
1384 (!zone_dataset_visible(osname
, &canwrite
) || !canwrite
)) {
1385 boolean_t mount_snapshot
= B_FALSE
;
1388 * Snapshots may be mounted in .zfs for unjailed datasets
1389 * if allowed by the jail param zfs.mount_snapshot.
1393 struct zfs_jailparam
*zjp
;
1395 pr
= curthread
->td_ucred
->cr_prison
;
1396 mtx_lock(&pr
->pr_mtx
);
1397 zjp
= osd_jail_get(pr
, zfs_jailparam_slot
);
1398 mtx_unlock(&pr
->pr_mtx
);
1399 if (zjp
&& zjp
->mount_snapshot
)
1400 mount_snapshot
= B_TRUE
;
1402 if (!mount_snapshot
) {
1403 error
= SET_ERROR(EPERM
);
1408 vfsp
->vfs_flag
|= MNT_NFS4ACLS
;
1411 * When doing a remount, we simply refresh our temporary properties
1412 * according to those options set in the current VFS options.
1414 if (vfsp
->vfs_flag
& MS_REMOUNT
) {
1415 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1418 * Refresh mount options with z_teardown_lock blocking I/O while
1419 * the filesystem is in an inconsistent state.
1420 * The lock also serializes this code with filesystem
1421 * manipulations between entry to zfs_suspend_fs() and return
1422 * from zfs_resume_fs().
1424 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs
, FTAG
);
1425 zfs_unregister_callbacks(zfsvfs
);
1426 error
= zfs_register_callbacks(vfsp
);
1427 ZFS_TEARDOWN_EXIT(zfsvfs
, FTAG
);
1431 /* Initial root mount: try hard to import the requested root pool. */
1432 if ((vfsp
->vfs_flag
& MNT_ROOTFS
) != 0 &&
1433 (vfsp
->vfs_flag
& MNT_UPDATE
) == 0) {
1434 char pname
[MAXNAMELEN
];
1436 error
= getpoolname(osname
, pname
);
1438 error
= spa_import_rootpool(pname
, checkpointrewind
);
1443 error
= zfs_domount(vfsp
, osname
);
1451 zfs_statfs(vfs_t
*vfsp
, struct statfs
*statp
)
1453 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1454 uint64_t refdbytes
, availbytes
, usedobjs
, availobjs
;
1457 statp
->f_version
= STATFS_VERSION
;
1459 if ((error
= zfs_enter(zfsvfs
, FTAG
)) != 0)
1462 dmu_objset_space(zfsvfs
->z_os
,
1463 &refdbytes
, &availbytes
, &usedobjs
, &availobjs
);
1466 * The underlying storage pool actually uses multiple block sizes.
1467 * We report the fragsize as the smallest block size we support,
1468 * and we report our blocksize as the filesystem's maximum blocksize.
1470 statp
->f_bsize
= SPA_MINBLOCKSIZE
;
1471 statp
->f_iosize
= zfsvfs
->z_vfs
->mnt_stat
.f_iosize
;
1474 * The following report "total" blocks of various kinds in the
1475 * file system, but reported in terms of f_frsize - the
1479 statp
->f_blocks
= (refdbytes
+ availbytes
) >> SPA_MINBLOCKSHIFT
;
1480 statp
->f_bfree
= availbytes
/ statp
->f_bsize
;
1481 statp
->f_bavail
= statp
->f_bfree
; /* no root reservation */
1484 * statvfs() should really be called statufs(), because it assumes
1485 * static metadata. ZFS doesn't preallocate files, so the best
1486 * we can do is report the max that could possibly fit in f_files,
1487 * and that minus the number actually used in f_ffree.
1488 * For f_ffree, report the smaller of the number of object available
1489 * and the number of blocks (each object will take at least a block).
1491 statp
->f_ffree
= MIN(availobjs
, statp
->f_bfree
);
1492 statp
->f_files
= statp
->f_ffree
+ usedobjs
;
1495 * We're a zfs filesystem.
1497 strlcpy(statp
->f_fstypename
, "zfs",
1498 sizeof (statp
->f_fstypename
));
1500 strlcpy(statp
->f_mntfromname
, vfsp
->mnt_stat
.f_mntfromname
,
1501 sizeof (statp
->f_mntfromname
));
1502 strlcpy(statp
->f_mntonname
, vfsp
->mnt_stat
.f_mntonname
,
1503 sizeof (statp
->f_mntonname
));
1505 statp
->f_namemax
= MAXNAMELEN
- 1;
1507 zfs_exit(zfsvfs
, FTAG
);
1512 zfs_root(vfs_t
*vfsp
, int flags
, vnode_t
**vpp
)
1514 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1518 if ((error
= zfs_enter(zfsvfs
, FTAG
)) != 0)
1521 error
= zfs_zget(zfsvfs
, zfsvfs
->z_root
, &rootzp
);
1523 *vpp
= ZTOV(rootzp
);
1525 zfs_exit(zfsvfs
, FTAG
);
1528 error
= vn_lock(*vpp
, flags
);
1538 * Teardown the zfsvfs::z_os.
1540 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
1541 * and 'z_teardown_inactive_lock' held.
1544 zfsvfs_teardown(zfsvfs_t
*zfsvfs
, boolean_t unmounting
)
1550 * If someone has not already unmounted this file system,
1551 * drain the zrele_taskq to ensure all active references to the
1552 * zfsvfs_t have been handled only then can it be safely destroyed.
1556 * If we're unmounting we have to wait for the list to
1559 * If we're not unmounting there's no guarantee the list
1560 * will drain completely, but zreles run from the taskq
1561 * may add the parents of dir-based xattrs to the taskq
1562 * so we want to wait for these.
1564 * We can safely check z_all_znodes for being empty because the
1565 * VFS has already blocked operations which add to it.
1568 while (!list_is_empty(&zfsvfs
->z_all_znodes
)) {
1569 taskq_wait_outstanding(dsl_pool_zrele_taskq(
1570 dmu_objset_pool(zfsvfs
->z_os
)), 0);
1571 if (++round
> 1 && !unmounting
)
1575 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs
, FTAG
);
1579 * We purge the parent filesystem's vfsp as the parent
1580 * filesystem and all of its snapshots have their vnode's
1581 * v_vfsp set to the parent's filesystem's vfsp. Note,
1582 * 'z_parent' is self referential for non-snapshots.
1584 #ifdef FREEBSD_NAMECACHE
1585 #if __FreeBSD_version >= 1300117
1586 cache_purgevfs(zfsvfs
->z_parent
->z_vfs
);
1588 cache_purgevfs(zfsvfs
->z_parent
->z_vfs
, true);
1594 * Close the zil. NB: Can't close the zil while zfs_inactive
1595 * threads are blocked as zil_close can call zfs_inactive.
1597 if (zfsvfs
->z_log
) {
1598 zil_close(zfsvfs
->z_log
);
1599 zfsvfs
->z_log
= NULL
;
1602 ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs
);
1605 * If we are not unmounting (ie: online recv) and someone already
1606 * unmounted this file system while we were doing the switcheroo,
1607 * or a reopen of z_os failed then just bail out now.
1609 if (!unmounting
&& (zfsvfs
->z_unmounted
|| zfsvfs
->z_os
== NULL
)) {
1610 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs
);
1611 ZFS_TEARDOWN_EXIT(zfsvfs
, FTAG
);
1612 return (SET_ERROR(EIO
));
1616 * At this point there are no vops active, and any new vops will
1617 * fail with EIO since we have z_teardown_lock for writer (only
1618 * relevant for forced unmount).
1620 * Release all holds on dbufs.
1622 mutex_enter(&zfsvfs
->z_znodes_lock
);
1623 for (zp
= list_head(&zfsvfs
->z_all_znodes
); zp
!= NULL
;
1624 zp
= list_next(&zfsvfs
->z_all_znodes
, zp
)) {
1625 if (zp
->z_sa_hdl
!= NULL
) {
1626 zfs_znode_dmu_fini(zp
);
1629 mutex_exit(&zfsvfs
->z_znodes_lock
);
1632 * If we are unmounting, set the unmounted flag and let new vops
1633 * unblock. zfs_inactive will have the unmounted behavior, and all
1634 * other vops will fail with EIO.
1637 zfsvfs
->z_unmounted
= B_TRUE
;
1638 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs
);
1639 ZFS_TEARDOWN_EXIT(zfsvfs
, FTAG
);
1643 * z_os will be NULL if there was an error in attempting to reopen
1644 * zfsvfs, so just return as the properties had already been
1645 * unregistered and cached data had been evicted before.
1647 if (zfsvfs
->z_os
== NULL
)
1651 * Unregister properties.
1653 zfs_unregister_callbacks(zfsvfs
);
1658 if (!zfs_is_readonly(zfsvfs
))
1659 txg_wait_synced(dmu_objset_pool(zfsvfs
->z_os
), 0);
1660 dmu_objset_evict_dbufs(zfsvfs
->z_os
);
1661 dd
= zfsvfs
->z_os
->os_dsl_dataset
->ds_dir
;
1662 dsl_dir_cancel_waiters(dd
);
1668 zfs_umount(vfs_t
*vfsp
, int fflag
)
1670 kthread_t
*td
= curthread
;
1671 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1673 cred_t
*cr
= td
->td_ucred
;
1676 ret
= secpolicy_fs_unmount(cr
, vfsp
);
1678 if (dsl_deleg_access((char *)vfsp
->vfs_resource
,
1679 ZFS_DELEG_PERM_MOUNT
, cr
))
1684 * Unmount any snapshots mounted under .zfs before unmounting the
1687 if (zfsvfs
->z_ctldir
!= NULL
) {
1688 if ((ret
= zfsctl_umount_snapshots(vfsp
, fflag
, cr
)) != 0)
1692 if (fflag
& MS_FORCE
) {
1694 * Mark file system as unmounted before calling
1695 * vflush(FORCECLOSE). This way we ensure no future vnops
1696 * will be called and risk operating on DOOMED vnodes.
1698 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs
, FTAG
);
1699 zfsvfs
->z_unmounted
= B_TRUE
;
1700 ZFS_TEARDOWN_EXIT(zfsvfs
, FTAG
);
1704 * Flush all the files.
1706 ret
= vflush(vfsp
, 0, (fflag
& MS_FORCE
) ? FORCECLOSE
: 0, td
);
1709 while (taskqueue_cancel(zfsvfs_taskq
->tq_queue
,
1710 &zfsvfs
->z_unlinked_drain_task
, NULL
) != 0)
1711 taskqueue_drain(zfsvfs_taskq
->tq_queue
,
1712 &zfsvfs
->z_unlinked_drain_task
);
1714 VERIFY0(zfsvfs_teardown(zfsvfs
, B_TRUE
));
1718 * z_os will be NULL if there was an error in
1719 * attempting to reopen zfsvfs.
1723 * Unset the objset user_ptr.
1725 mutex_enter(&os
->os_user_ptr_lock
);
1726 dmu_objset_set_user(os
, NULL
);
1727 mutex_exit(&os
->os_user_ptr_lock
);
1730 * Finally release the objset
1732 dmu_objset_disown(os
, B_TRUE
, zfsvfs
);
1736 * We can now safely destroy the '.zfs' directory node.
1738 if (zfsvfs
->z_ctldir
!= NULL
)
1739 zfsctl_destroy(zfsvfs
);
1746 zfs_vget(vfs_t
*vfsp
, ino_t ino
, int flags
, vnode_t
**vpp
)
1748 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1753 * zfs_zget() can't operate on virtual entries like .zfs/ or
1754 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
1755 * This will make NFS to switch to LOOKUP instead of using VGET.
1757 if (ino
== ZFSCTL_INO_ROOT
|| ino
== ZFSCTL_INO_SNAPDIR
||
1758 (zfsvfs
->z_shares_dir
!= 0 && ino
== zfsvfs
->z_shares_dir
))
1759 return (EOPNOTSUPP
);
1761 if ((err
= zfs_enter(zfsvfs
, FTAG
)) != 0)
1763 err
= zfs_zget(zfsvfs
, ino
, &zp
);
1764 if (err
== 0 && zp
->z_unlinked
) {
1770 zfs_exit(zfsvfs
, FTAG
);
1772 err
= vn_lock(*vpp
, flags
);
1782 #if __FreeBSD_version >= 1300098
1783 zfs_checkexp(vfs_t
*vfsp
, struct sockaddr
*nam
, uint64_t *extflagsp
,
1784 struct ucred
**credanonp
, int *numsecflavors
, int *secflavors
)
1786 zfs_checkexp(vfs_t
*vfsp
, struct sockaddr
*nam
, int *extflagsp
,
1787 struct ucred
**credanonp
, int *numsecflavors
, int **secflavors
)
1790 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1793 * If this is regular file system vfsp is the same as
1794 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1795 * zfsvfs->z_parent->z_vfs represents parent file system
1796 * which we have to use here, because only this file system
1797 * has mnt_export configured.
1799 return (vfs_stdcheckexp(zfsvfs
->z_parent
->z_vfs
, nam
, extflagsp
,
1800 credanonp
, numsecflavors
, secflavors
));
1803 _Static_assert(sizeof (struct fid
) >= SHORT_FID_LEN
,
1804 "struct fid bigger than SHORT_FID_LEN");
1805 _Static_assert(sizeof (struct fid
) >= LONG_FID_LEN
,
1806 "struct fid bigger than LONG_FID_LEN");
1809 zfs_fhtovp(vfs_t
*vfsp
, fid_t
*fidp
, int flags
, vnode_t
**vpp
)
1811 struct componentname cn
;
1812 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
1815 uint64_t object
= 0;
1816 uint64_t fid_gen
= 0;
1817 uint64_t setgen
= 0;
1824 if ((err
= zfs_enter(zfsvfs
, FTAG
)) != 0)
1828 * On FreeBSD we can get snapshot's mount point or its parent file
1829 * system mount point depending if snapshot is already mounted or not.
1831 if (zfsvfs
->z_parent
== zfsvfs
&& fidp
->fid_len
== LONG_FID_LEN
) {
1832 zfid_long_t
*zlfid
= (zfid_long_t
*)fidp
;
1833 uint64_t objsetid
= 0;
1835 for (i
= 0; i
< sizeof (zlfid
->zf_setid
); i
++)
1836 objsetid
|= ((uint64_t)zlfid
->zf_setid
[i
]) << (8 * i
);
1838 for (i
= 0; i
< sizeof (zlfid
->zf_setgen
); i
++)
1839 setgen
|= ((uint64_t)zlfid
->zf_setgen
[i
]) << (8 * i
);
1841 zfs_exit(zfsvfs
, FTAG
);
1843 err
= zfsctl_lookup_objset(vfsp
, objsetid
, &zfsvfs
);
1845 return (SET_ERROR(EINVAL
));
1846 if ((err
= zfs_enter(zfsvfs
, FTAG
)) != 0)
1850 if (fidp
->fid_len
== SHORT_FID_LEN
|| fidp
->fid_len
== LONG_FID_LEN
) {
1851 zfid_short_t
*zfid
= (zfid_short_t
*)fidp
;
1853 for (i
= 0; i
< sizeof (zfid
->zf_object
); i
++)
1854 object
|= ((uint64_t)zfid
->zf_object
[i
]) << (8 * i
);
1856 for (i
= 0; i
< sizeof (zfid
->zf_gen
); i
++)
1857 fid_gen
|= ((uint64_t)zfid
->zf_gen
[i
]) << (8 * i
);
1859 zfs_exit(zfsvfs
, FTAG
);
1860 return (SET_ERROR(EINVAL
));
1863 if (fidp
->fid_len
== LONG_FID_LEN
&& setgen
!= 0) {
1864 zfs_exit(zfsvfs
, FTAG
);
1865 dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n",
1866 (u_longlong_t
)fid_gen
, (u_longlong_t
)setgen
);
1867 return (SET_ERROR(EINVAL
));
1871 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
1872 * directory tree. If the object == zfsvfs->z_shares_dir, then
1873 * we are in the .zfs/shares directory tree.
1875 if ((fid_gen
== 0 &&
1876 (object
== ZFSCTL_INO_ROOT
|| object
== ZFSCTL_INO_SNAPDIR
)) ||
1877 (zfsvfs
->z_shares_dir
!= 0 && object
== zfsvfs
->z_shares_dir
)) {
1878 zfs_exit(zfsvfs
, FTAG
);
1879 VERIFY0(zfsctl_root(zfsvfs
, LK_SHARED
, &dvp
));
1880 if (object
== ZFSCTL_INO_SNAPDIR
) {
1881 cn
.cn_nameptr
= "snapshot";
1882 cn
.cn_namelen
= strlen(cn
.cn_nameptr
);
1883 cn
.cn_nameiop
= LOOKUP
;
1884 cn
.cn_flags
= ISLASTCN
| LOCKLEAF
;
1885 cn
.cn_lkflags
= flags
;
1886 VERIFY0(VOP_LOOKUP(dvp
, vpp
, &cn
));
1888 } else if (object
== zfsvfs
->z_shares_dir
) {
1890 * XXX This branch must not be taken,
1891 * if it is, then the lookup below will
1894 cn
.cn_nameptr
= "shares";
1895 cn
.cn_namelen
= strlen(cn
.cn_nameptr
);
1896 cn
.cn_nameiop
= LOOKUP
;
1897 cn
.cn_flags
= ISLASTCN
;
1898 cn
.cn_lkflags
= flags
;
1899 VERIFY0(VOP_LOOKUP(dvp
, vpp
, &cn
));
1907 gen_mask
= -1ULL >> (64 - 8 * i
);
1909 dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t
)object
,
1910 (u_longlong_t
)fid_gen
,
1911 (u_longlong_t
)gen_mask
);
1912 if ((err
= zfs_zget(zfsvfs
, object
, &zp
))) {
1913 zfs_exit(zfsvfs
, FTAG
);
1916 (void) sa_lookup(zp
->z_sa_hdl
, SA_ZPL_GEN(zfsvfs
), &zp_gen
,
1918 zp_gen
= zp_gen
& gen_mask
;
1921 if (zp
->z_unlinked
|| zp_gen
!= fid_gen
) {
1922 dprintf("znode gen (%llu) != fid gen (%llu)\n",
1923 (u_longlong_t
)zp_gen
, (u_longlong_t
)fid_gen
);
1925 zfs_exit(zfsvfs
, FTAG
);
1926 return (SET_ERROR(EINVAL
));
1930 zfs_exit(zfsvfs
, FTAG
);
1931 err
= vn_lock(*vpp
, flags
);
1933 vnode_create_vobject(*vpp
, zp
->z_size
, curthread
);
1940 * Block out VOPs and close zfsvfs_t::z_os
1942 * Note, if successful, then we return with the 'z_teardown_lock' and
1943 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
1944 * dataset and objset intact so that they can be atomically handed off during
1945 * a subsequent rollback or recv operation and the resume thereafter.
1948 zfs_suspend_fs(zfsvfs_t
*zfsvfs
)
1952 if ((error
= zfsvfs_teardown(zfsvfs
, B_FALSE
)) != 0)
1959 * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
1960 * is an invariant across any of the operations that can be performed while the
1961 * filesystem was suspended. Whether it succeeded or failed, the preconditions
1962 * are the same: the relevant objset and associated dataset are owned by
1963 * zfsvfs, held, and long held on entry.
1966 zfs_resume_fs(zfsvfs_t
*zfsvfs
, dsl_dataset_t
*ds
)
1971 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs
));
1972 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs
));
1975 * We already own this, so just update the objset_t, as the one we
1976 * had before may have been evicted.
1979 VERIFY3P(ds
->ds_owner
, ==, zfsvfs
);
1980 VERIFY(dsl_dataset_long_held(ds
));
1981 dsl_pool_t
*dp
= spa_get_dsl(dsl_dataset_get_spa(ds
));
1982 dsl_pool_config_enter(dp
, FTAG
);
1983 VERIFY0(dmu_objset_from_ds(ds
, &os
));
1984 dsl_pool_config_exit(dp
, FTAG
);
1986 err
= zfsvfs_init(zfsvfs
, os
);
1990 ds
->ds_dir
->dd_activity_cancelled
= B_FALSE
;
1991 VERIFY0(zfsvfs_setup(zfsvfs
, B_FALSE
));
1993 zfs_set_fuid_feature(zfsvfs
);
1996 * Attempt to re-establish all the active znodes with
1997 * their dbufs. If a zfs_rezget() fails, then we'll let
1998 * any potential callers discover that via zfs_enter_verify_zp
1999 * when they try to use their znode.
2001 mutex_enter(&zfsvfs
->z_znodes_lock
);
2002 for (zp
= list_head(&zfsvfs
->z_all_znodes
); zp
;
2003 zp
= list_next(&zfsvfs
->z_all_znodes
, zp
)) {
2004 (void) zfs_rezget(zp
);
2006 mutex_exit(&zfsvfs
->z_znodes_lock
);
2009 /* release the VOPs */
2010 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs
);
2011 ZFS_TEARDOWN_EXIT(zfsvfs
, FTAG
);
2015 * Since we couldn't setup the sa framework, try to force
2016 * unmount this file system.
2018 if (vn_vfswlock(zfsvfs
->z_vfs
->vfs_vnodecovered
) == 0) {
2019 vfs_ref(zfsvfs
->z_vfs
);
2020 (void) dounmount(zfsvfs
->z_vfs
, MS_FORCE
, curthread
);
2027 zfs_freevfs(vfs_t
*vfsp
)
2029 zfsvfs_t
*zfsvfs
= vfsp
->vfs_data
;
2031 zfsvfs_free(zfsvfs
);
2033 atomic_dec_32(&zfs_active_fs_count
);
2037 static int desiredvnodes_backup
;
2038 #include <sys/vmmeter.h>
2041 #include <vm/vm_page.h>
2042 #include <vm/vm_object.h>
2043 #include <vm/vm_kern.h>
2044 #include <vm/vm_map.h>
2048 zfs_vnodes_adjust(void)
2051 int newdesiredvnodes
;
2053 desiredvnodes_backup
= desiredvnodes
;
2056 * We calculate newdesiredvnodes the same way it is done in
2057 * vntblinit(). If it is equal to desiredvnodes, it means that
2058 * it wasn't tuned by the administrator and we can tune it down.
2060 newdesiredvnodes
= min(maxproc
+ vm_cnt
.v_page_count
/ 4, 2 *
2061 vm_kmem_size
/ (5 * (sizeof (struct vm_object
) +
2062 sizeof (struct vnode
))));
2063 if (newdesiredvnodes
== desiredvnodes
)
2064 desiredvnodes
= (3 * newdesiredvnodes
) / 4;
2069 zfs_vnodes_adjust_back(void)
2073 desiredvnodes
= desiredvnodes_backup
;
2077 #if __FreeBSD_version >= 1300139
2078 static struct sx zfs_vnlru_lock
;
2079 static struct vnode
*zfs_vnlru_marker
;
2081 static arc_prune_t
*zfs_prune
;
2084 zfs_prune_task(uint64_t nr_to_scan
, void *arg __unused
)
2086 if (nr_to_scan
> INT_MAX
)
2087 nr_to_scan
= INT_MAX
;
2088 #if __FreeBSD_version >= 1300139
2089 sx_xlock(&zfs_vnlru_lock
);
2090 vnlru_free_vfsops(nr_to_scan
, &zfs_vfsops
, zfs_vnlru_marker
);
2091 sx_xunlock(&zfs_vnlru_lock
);
2093 vnlru_free(nr_to_scan
, &zfs_vfsops
);
2101 printf("ZFS filesystem version: " ZPL_VERSION_STRING
"\n");
2104 * Initialize .zfs directory structures
2109 * Initialize znode cache, vnode ops, etc...
2114 * Reduce number of vnodes. Originally number of vnodes is calculated
2115 * with UFS inode in mind. We reduce it here, because it's too big for
2118 zfs_vnodes_adjust();
2120 dmu_objset_register_type(DMU_OST_ZFS
, zpl_get_file_info
);
2122 zfsvfs_taskq
= taskq_create("zfsvfs", 1, minclsyspri
, 0, 0, 0);
2124 #if __FreeBSD_version >= 1300139
2125 zfs_vnlru_marker
= vnlru_alloc_marker();
2126 sx_init(&zfs_vnlru_lock
, "zfs vnlru lock");
2128 zfs_prune
= arc_add_prune_callback(zfs_prune_task
, NULL
);
2134 arc_remove_prune_callback(zfs_prune
);
2135 #if __FreeBSD_version >= 1300139
2136 vnlru_free_marker(zfs_vnlru_marker
);
2137 sx_destroy(&zfs_vnlru_lock
);
2140 taskq_destroy(zfsvfs_taskq
);
2143 zfs_vnodes_adjust_back();
2149 return (zfs_active_fs_count
!= 0);
2153 * Release VOPs and unmount a suspended filesystem.
2156 zfs_end_fs(zfsvfs_t
*zfsvfs
, dsl_dataset_t
*ds
)
2158 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs
));
2159 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs
));
2162 * We already own this, so just hold and rele it to update the
2163 * objset_t, as the one we had before may have been evicted.
2166 VERIFY3P(ds
->ds_owner
, ==, zfsvfs
);
2167 VERIFY(dsl_dataset_long_held(ds
));
2168 dsl_pool_t
*dp
= spa_get_dsl(dsl_dataset_get_spa(ds
));
2169 dsl_pool_config_enter(dp
, FTAG
);
2170 VERIFY0(dmu_objset_from_ds(ds
, &os
));
2171 dsl_pool_config_exit(dp
, FTAG
);
2174 /* release the VOPs */
2175 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs
);
2176 ZFS_TEARDOWN_EXIT(zfsvfs
, FTAG
);
2179 * Try to force unmount this file system.
2181 (void) zfs_umount(zfsvfs
->z_vfs
, 0);
2182 zfsvfs
->z_unmounted
= B_TRUE
;
2187 zfs_set_version(zfsvfs_t
*zfsvfs
, uint64_t newvers
)
2190 objset_t
*os
= zfsvfs
->z_os
;
2193 if (newvers
< ZPL_VERSION_INITIAL
|| newvers
> ZPL_VERSION
)
2194 return (SET_ERROR(EINVAL
));
2196 if (newvers
< zfsvfs
->z_version
)
2197 return (SET_ERROR(EINVAL
));
2199 if (zfs_spa_version_map(newvers
) >
2200 spa_version(dmu_objset_spa(zfsvfs
->z_os
)))
2201 return (SET_ERROR(ENOTSUP
));
2203 tx
= dmu_tx_create(os
);
2204 dmu_tx_hold_zap(tx
, MASTER_NODE_OBJ
, B_FALSE
, ZPL_VERSION_STR
);
2205 if (newvers
>= ZPL_VERSION_SA
&& !zfsvfs
->z_use_sa
) {
2206 dmu_tx_hold_zap(tx
, MASTER_NODE_OBJ
, B_TRUE
,
2208 dmu_tx_hold_zap(tx
, DMU_NEW_OBJECT
, FALSE
, NULL
);
2210 error
= dmu_tx_assign(tx
, TXG_WAIT
);
2216 error
= zap_update(os
, MASTER_NODE_OBJ
, ZPL_VERSION_STR
,
2217 8, 1, &newvers
, tx
);
2224 if (newvers
>= ZPL_VERSION_SA
&& !zfsvfs
->z_use_sa
) {
2227 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs
->z_os
)), >=,
2229 sa_obj
= zap_create(os
, DMU_OT_SA_MASTER_NODE
,
2230 DMU_OT_NONE
, 0, tx
);
2232 error
= zap_add(os
, MASTER_NODE_OBJ
,
2233 ZFS_SA_ATTRS
, 8, 1, &sa_obj
, tx
);
2236 VERIFY0(sa_set_sa_object(os
, sa_obj
));
2237 sa_register_update_callback(os
, zfs_sa_upgrade
);
2240 spa_history_log_internal_ds(dmu_objset_ds(os
), "upgrade", tx
,
2241 "from %ju to %ju", (uintmax_t)zfsvfs
->z_version
,
2242 (uintmax_t)newvers
);
2245 zfsvfs
->z_version
= newvers
;
2246 os
->os_version
= newvers
;
2248 zfs_set_fuid_feature(zfsvfs
);
2254 * Return true if the corresponding vfs's unmounted flag is set.
2255 * Otherwise return false.
2256 * If this function returns true we know VFS unmount has been initiated.
2259 zfs_get_vfs_flag_unmounted(objset_t
*os
)
2262 boolean_t unmounted
= B_FALSE
;
2264 ASSERT3U(dmu_objset_type(os
), ==, DMU_OST_ZFS
);
2266 mutex_enter(&os
->os_user_ptr_lock
);
2267 zfvp
= dmu_objset_get_user(os
);
2268 if (zfvp
!= NULL
&& zfvp
->z_vfs
!= NULL
&&
2269 (zfvp
->z_vfs
->mnt_kern_flag
& MNTK_UNMOUNT
))
2271 mutex_exit(&os
->os_user_ptr_lock
);
2278 zfsvfs_update_fromname(const char *oldname
, const char *newname
)
2280 char tmpbuf
[MAXPATHLEN
];
2285 oldlen
= strlen(oldname
);
2287 mtx_lock(&mountlist_mtx
);
2288 TAILQ_FOREACH(mp
, &mountlist
, mnt_list
) {
2289 fromname
= mp
->mnt_stat
.f_mntfromname
;
2290 if (strcmp(fromname
, oldname
) == 0) {
2291 (void) strlcpy(fromname
, newname
,
2292 sizeof (mp
->mnt_stat
.f_mntfromname
));
2295 if (strncmp(fromname
, oldname
, oldlen
) == 0 &&
2296 (fromname
[oldlen
] == '/' || fromname
[oldlen
] == '@')) {
2297 (void) snprintf(tmpbuf
, sizeof (tmpbuf
), "%s%s",
2298 newname
, fromname
+ oldlen
);
2299 (void) strlcpy(fromname
, tmpbuf
,
2300 sizeof (mp
->mnt_stat
.f_mntfromname
));
2304 mtx_unlock(&mountlist_mtx
);
2309 * Find a prison with ZFS info.
2310 * Return the ZFS info and the (locked) prison.
2312 static struct zfs_jailparam
*
2313 zfs_jailparam_find(struct prison
*spr
, struct prison
**prp
)
2316 struct zfs_jailparam
*zjp
;
2318 for (pr
= spr
; ; pr
= pr
->pr_parent
) {
2319 mtx_lock(&pr
->pr_mtx
);
2320 if (pr
== &prison0
) {
2321 zjp
= &zfs_jailparam0
;
2324 zjp
= osd_jail_get(pr
, zfs_jailparam_slot
);
2327 mtx_unlock(&pr
->pr_mtx
);
2335 * Ensure a prison has its own ZFS info. If zjpp is non-null, point it to the
2336 * ZFS info and lock the prison.
2339 zfs_jailparam_alloc(struct prison
*pr
, struct zfs_jailparam
**zjpp
)
2342 struct zfs_jailparam
*zjp
, *nzjp
;
2345 /* If this prison already has ZFS info, return that. */
2346 zjp
= zfs_jailparam_find(pr
, &ppr
);
2351 * Allocate a new info record. Then check again, in case something
2352 * changed during the allocation.
2354 mtx_unlock(&ppr
->pr_mtx
);
2355 nzjp
= malloc(sizeof (struct zfs_jailparam
), M_PRISON
, M_WAITOK
);
2356 rsv
= osd_reserve(zfs_jailparam_slot
);
2357 zjp
= zfs_jailparam_find(pr
, &ppr
);
2359 free(nzjp
, M_PRISON
);
2360 osd_free_reserved(rsv
);
2363 /* Inherit the initial values from the ancestor. */
2364 mtx_lock(&pr
->pr_mtx
);
2365 (void) osd_jail_set_reserved(pr
, zfs_jailparam_slot
, rsv
, nzjp
);
2366 (void) memcpy(nzjp
, zjp
, sizeof (*zjp
));
2368 mtx_unlock(&ppr
->pr_mtx
);
2373 mtx_unlock(&pr
->pr_mtx
);
2377 * Jail OSD methods for ZFS VFS info.
2380 zfs_jailparam_create(void *obj
, void *data
)
2382 struct prison
*pr
= obj
;
2383 struct vfsoptlist
*opts
= data
;
2386 if (vfs_copyopt(opts
, "zfs", &jsys
, sizeof (jsys
)) == 0 &&
2387 jsys
== JAIL_SYS_INHERIT
)
2390 * Inherit a prison's initial values from its parent
2391 * (different from JAIL_SYS_INHERIT which also inherits changes).
2393 zfs_jailparam_alloc(pr
, NULL
);
2398 zfs_jailparam_get(void *obj
, void *data
)
2400 struct prison
*ppr
, *pr
= obj
;
2401 struct vfsoptlist
*opts
= data
;
2402 struct zfs_jailparam
*zjp
;
2405 zjp
= zfs_jailparam_find(pr
, &ppr
);
2406 jsys
= (ppr
== pr
) ? JAIL_SYS_NEW
: JAIL_SYS_INHERIT
;
2407 error
= vfs_setopt(opts
, "zfs", &jsys
, sizeof (jsys
));
2408 if (error
!= 0 && error
!= ENOENT
)
2410 if (jsys
== JAIL_SYS_NEW
) {
2411 error
= vfs_setopt(opts
, "zfs.mount_snapshot",
2412 &zjp
->mount_snapshot
, sizeof (zjp
->mount_snapshot
));
2413 if (error
!= 0 && error
!= ENOENT
)
2417 * If this prison is inheriting its ZFS info, report
2418 * empty/zero parameters.
2420 static int mount_snapshot
= 0;
2422 error
= vfs_setopt(opts
, "zfs.mount_snapshot",
2423 &mount_snapshot
, sizeof (mount_snapshot
));
2424 if (error
!= 0 && error
!= ENOENT
)
2429 mtx_unlock(&ppr
->pr_mtx
);
2434 zfs_jailparam_set(void *obj
, void *data
)
2436 struct prison
*pr
= obj
;
2438 struct vfsoptlist
*opts
= data
;
2439 int error
, jsys
, mount_snapshot
;
2441 /* Set the parameters, which should be correct. */
2442 error
= vfs_copyopt(opts
, "zfs", &jsys
, sizeof (jsys
));
2443 if (error
== ENOENT
)
2445 error
= vfs_copyopt(opts
, "zfs.mount_snapshot", &mount_snapshot
,
2446 sizeof (mount_snapshot
));
2447 if (error
== ENOENT
)
2448 mount_snapshot
= -1;
2450 jsys
= JAIL_SYS_NEW
;
2454 /* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */
2455 struct zfs_jailparam
*zjp
;
2458 * A child jail cannot have more permissions than its parent
2460 if (pr
->pr_parent
!= &prison0
) {
2461 zjp
= zfs_jailparam_find(pr
->pr_parent
, &ppr
);
2462 mtx_unlock(&ppr
->pr_mtx
);
2463 if (zjp
->mount_snapshot
< mount_snapshot
) {
2467 zfs_jailparam_alloc(pr
, &zjp
);
2468 if (mount_snapshot
!= -1)
2469 zjp
->mount_snapshot
= mount_snapshot
;
2470 mtx_unlock(&pr
->pr_mtx
);
2473 case JAIL_SYS_INHERIT
:
2474 /* "zfs=inherit": inherit the parent's ZFS info. */
2475 mtx_lock(&pr
->pr_mtx
);
2476 osd_jail_del(pr
, zfs_jailparam_slot
);
2477 mtx_unlock(&pr
->pr_mtx
);
2481 * If the setting being changed is not ZFS related
2491 zfs_jailparam_check(void *obj __unused
, void *data
)
2493 struct vfsoptlist
*opts
= data
;
2494 int error
, jsys
, mount_snapshot
;
2496 /* Check that the parameters are correct. */
2497 error
= vfs_copyopt(opts
, "zfs", &jsys
, sizeof (jsys
));
2498 if (error
!= ENOENT
) {
2501 if (jsys
!= JAIL_SYS_NEW
&& jsys
!= JAIL_SYS_INHERIT
)
2504 error
= vfs_copyopt(opts
, "zfs.mount_snapshot", &mount_snapshot
,
2505 sizeof (mount_snapshot
));
2506 if (error
!= ENOENT
) {
2509 if (mount_snapshot
!= 0 && mount_snapshot
!= 1)
2516 zfs_jailparam_destroy(void *data
)
2519 free(data
, M_PRISON
);
2523 zfs_jailparam_sysinit(void *arg __unused
)
2526 osd_method_t methods
[PR_MAXMETHOD
] = {
2527 [PR_METHOD_CREATE
] = zfs_jailparam_create
,
2528 [PR_METHOD_GET
] = zfs_jailparam_get
,
2529 [PR_METHOD_SET
] = zfs_jailparam_set
,
2530 [PR_METHOD_CHECK
] = zfs_jailparam_check
,
2533 zfs_jailparam_slot
= osd_jail_register(zfs_jailparam_destroy
, methods
);
2534 /* Copy the defaults to any existing prisons. */
2535 sx_slock(&allprison_lock
);
2536 TAILQ_FOREACH(pr
, &allprison
, pr_list
)
2537 zfs_jailparam_alloc(pr
, NULL
);
2538 sx_sunlock(&allprison_lock
);
2542 zfs_jailparam_sysuninit(void *arg __unused
)
2545 osd_jail_deregister(zfs_jailparam_slot
);
2548 SYSINIT(zfs_jailparam_sysinit
, SI_SUB_DRIVERS
, SI_ORDER_ANY
,
2549 zfs_jailparam_sysinit
, NULL
);
2550 SYSUNINIT(zfs_jailparam_sysuninit
, SI_SUB_DRIVERS
, SI_ORDER_ANY
,
2551 zfs_jailparam_sysuninit
, NULL
);