1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
24 * All rights reserved.
25 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
27 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
28 */
29
30/* Portions Copyright 2010 Robert Milkowski */
31
32#include <sys/types.h>
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/kernel.h>
36#include <sys/sysmacros.h>
37#include <sys/kmem.h>
38#include <sys/acl.h>
39#include <sys/vnode.h>
40#include <sys/vfs.h>
41#include <sys/mntent.h>
42#include <sys/mount.h>
43#include <sys/cmn_err.h>
44#include <sys/zfs_znode.h>
45#include <sys/zfs_vnops.h>
46#include <sys/zfs_dir.h>
47#include <sys/zil.h>
48#include <sys/fs/zfs.h>
49#include <sys/dmu.h>
50#include <sys/dsl_prop.h>
51#include <sys/dsl_dataset.h>
52#include <sys/dsl_deleg.h>
53#include <sys/spa.h>
54#include <sys/zap.h>
55#include <sys/sa.h>
56#include <sys/sa_impl.h>
57#include <sys/policy.h>
58#include <sys/atomic.h>
59#include <sys/zfs_ioctl.h>
60#include <sys/zfs_ctldir.h>
61#include <sys/zfs_fuid.h>
62#include <sys/sunddi.h>
63#include <sys/dmu_objset.h>
64#include <sys/dsl_dir.h>
65#include <sys/jail.h>
66#include <sys/osd.h>
67#include <ufs/ufs/quota.h>
68#include <sys/zfs_quota.h>
69
70#include "zfs_comutil.h"
71
72#ifndef MNTK_VMSETSIZE_BUG
73#define MNTK_VMSETSIZE_BUG 0
74#endif
75#ifndef MNTK_NOMSYNC
76#define MNTK_NOMSYNC 8
77#endif
78
79struct mtx zfs_debug_mtx;
80MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
81
82SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
83
84int zfs_super_owner;
85SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
86 "File system owners can perform privileged operation on file systems");
87
88int zfs_debug_level;
89SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
90 "Debug level");
91
92struct zfs_jailparam {
93 int mount_snapshot;
94};
95
96static struct zfs_jailparam zfs_jailparam0 = {
97 .mount_snapshot = 0,
98};
99
100static int zfs_jailparam_slot;
101
102SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters");
103SYSCTL_JAIL_PARAM(_zfs, mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I",
104 "Allow mounting snapshots in the .zfs directory for unjailed datasets");
105
106SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
107static int zfs_version_acl = ZFS_ACL_VERSION;
108SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
109 "ZFS_ACL_VERSION");
110static int zfs_version_spa = SPA_VERSION;
111SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
112 "SPA_VERSION");
113static int zfs_version_zpl = ZPL_VERSION;
114SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
115 "ZPL_VERSION");
116
117#if __FreeBSD_version >= 1400018
118static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg,
119 bool *mp_busy);
120#else
121static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
122#endif
123static int zfs_mount(vfs_t *vfsp);
124static int zfs_umount(vfs_t *vfsp, int fflag);
125static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
126static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
127static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
128static int zfs_sync(vfs_t *vfsp, int waitfor);
129#if __FreeBSD_version >= 1300098
130static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
131 struct ucred **credanonp, int *numsecflavors, int *secflavors);
132#else
133static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
134 struct ucred **credanonp, int *numsecflavors, int **secflavors);
135#endif
136static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
137static void zfs_freevfs(vfs_t *vfsp);
138
139struct vfsops zfs_vfsops = {
140 .vfs_mount = zfs_mount,
141 .vfs_unmount = zfs_umount,
142#if __FreeBSD_version >= 1300049
143 .vfs_root = vfs_cache_root,
144 .vfs_cachedroot = zfs_root,
145#else
146 .vfs_root = zfs_root,
147#endif
148 .vfs_statfs = zfs_statfs,
149 .vfs_vget = zfs_vget,
150 .vfs_sync = zfs_sync,
151 .vfs_checkexp = zfs_checkexp,
152 .vfs_fhtovp = zfs_fhtovp,
153 .vfs_quotactl = zfs_quotactl,
154};
155
156#ifdef VFCF_CROSS_COPY_FILE_RANGE
157VFS_SET(zfs_vfsops, zfs,
158 VFCF_DELEGADMIN | VFCF_JAIL | VFCF_CROSS_COPY_FILE_RANGE);
159#else
160VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL);
161#endif
162
163/*
164 * We need to keep a count of active fs's.
165 * This is necessary to prevent our module
166 * from being unloaded after a umount -f
167 */
168static uint32_t zfs_active_fs_count = 0;
169
170int
171zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
172 char *setpoint)
173{
174 int error;
175 zfsvfs_t *zfvp;
176 vfs_t *vfsp;
177 objset_t *os;
178 uint64_t tmp = *val;
179
180 error = dmu_objset_from_ds(ds, &os);
181 if (error != 0)
182 return (error);
183
184 error = getzfsvfs_impl(os, &zfvp);
185 if (error != 0)
186 return (error);
187 if (zfvp == NULL)
188 return (ENOENT);
189 vfsp = zfvp->z_vfs;
190 switch (zfs_prop) {
191 case ZFS_PROP_ATIME:
192 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
193 tmp = 0;
194 if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
195 tmp = 1;
196 break;
197 case ZFS_PROP_DEVICES:
198 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
199 tmp = 0;
200 if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
201 tmp = 1;
202 break;
203 case ZFS_PROP_EXEC:
204 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
205 tmp = 0;
206 if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
207 tmp = 1;
208 break;
209 case ZFS_PROP_SETUID:
210 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
211 tmp = 0;
212 if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
213 tmp = 1;
214 break;
215 case ZFS_PROP_READONLY:
216 if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
217 tmp = 0;
218 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
219 tmp = 1;
220 break;
221 case ZFS_PROP_XATTR:
222 if (zfvp->z_flags & ZSB_XATTR)
223 tmp = zfvp->z_xattr;
224 break;
225 case ZFS_PROP_NBMAND:
226 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
227 tmp = 0;
228 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
229 tmp = 1;
230 break;
231 default:
232 vfs_unbusy(vfsp);
233 return (ENOENT);
234 }
235
236 vfs_unbusy(vfsp);
237 if (tmp != *val) {
238 if (setpoint)
239 (void) strcpy(setpoint, "temporary");
240 *val = tmp;
241 }
242 return (0);
243}
244
245static int
246zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
247{
248 int error = 0;
249 char buf[32];
250 uint64_t usedobj, quotaobj;
251 uint64_t quota, used = 0;
252 timespec_t now;
253
254 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
255 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
256
257 if (quotaobj == 0 || zfsvfs->z_replay) {
258 error = ENOENT;
259 goto done;
260 }
261 (void) sprintf(buf, "%llx", (longlong_t)id);
262 if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
263 buf, sizeof (quota), 1, &quota)) != 0) {
264 dprintf("%s(%d): quotaobj lookup failed\n",
265 __FUNCTION__, __LINE__);
266 goto done;
267 }
268 /*
269 * quota(8) uses bsoftlimit as "quota", and hardlimit as "limit".
270 * So we set them to be the same.
271 */
272 dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
273 error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
274 if (error && error != ENOENT) {
275 dprintf("%s(%d): usedobj failed; %d\n",
276 __FUNCTION__, __LINE__, error);
277 goto done;
278 }
279 dqp->dqb_curblocks = btodb(used);
280 dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
281 vfs_timestamp(&now);
282 /*
283 * Setting this to 0 causes FreeBSD quota(8) to print
284 * the number of days since the epoch, which isn't
285 * particularly useful.
286 */
287 dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
288done:
289 return (error);
290}
291
292static int
293#if __FreeBSD_version >= 1400018
294zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy)
295#else
296zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
297#endif
298{
299 zfsvfs_t *zfsvfs = vfsp->vfs_data;
300 struct thread *td;
301 int cmd, type, error = 0;
302 int bitsize;
303 zfs_userquota_prop_t quota_type;
304 struct dqblk64 dqblk = { 0 };
305
306 td = curthread;
307 cmd = cmds >> SUBCMDSHIFT;
308 type = cmds & SUBCMDMASK;
309
310 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
311 return (error);
312 if (id == -1) {
313 switch (type) {
314 case USRQUOTA:
315 id = td->td_ucred->cr_ruid;
316 break;
317 case GRPQUOTA:
318 id = td->td_ucred->cr_rgid;
319 break;
320 default:
321 error = EINVAL;
322#if __FreeBSD_version < 1400018
323 if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
324 vfs_unbusy(vfsp);
325#endif
326 goto done;
327 }
328 }
329 /*
330 * Map BSD type to:
331 * ZFS_PROP_USERUSED,
332 * ZFS_PROP_USERQUOTA,
333 * ZFS_PROP_GROUPUSED,
334 * ZFS_PROP_GROUPQUOTA
335 */
336 switch (cmd) {
337 case Q_SETQUOTA:
338 case Q_SETQUOTA32:
339 if (type == USRQUOTA)
340 quota_type = ZFS_PROP_USERQUOTA;
341 else if (type == GRPQUOTA)
342 quota_type = ZFS_PROP_GROUPQUOTA;
343 else
344 error = EINVAL;
345 break;
346 case Q_GETQUOTA:
347 case Q_GETQUOTA32:
348 if (type == USRQUOTA)
349 quota_type = ZFS_PROP_USERUSED;
350 else if (type == GRPQUOTA)
351 quota_type = ZFS_PROP_GROUPUSED;
352 else
353 error = EINVAL;
354 break;
355 }
356
357 /*
358 * Depending on the cmd, we may need to get
359 * the ruid and domain (see fuidstr_to_sid?),
360 * the fuid (how?), or other information.
361 * Create fuid using zfs_fuid_create(zfsvfs, id,
362 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
363 * I think I can use just the id?
364 *
365 * Look at zfs_id_overquota() to look up a quota.
366 * zap_lookup(something, quotaobj, fuidstring,
367 * sizeof (long long), 1, &quota)
368 *
369 * See zfs_set_userquota() to set a quota.
370 */
371 if ((uint32_t)type >= MAXQUOTAS) {
372 error = EINVAL;
373 goto done;
374 }
375
376 switch (cmd) {
377 case Q_GETQUOTASIZE:
378 bitsize = 64;
379 error = copyout(&bitsize, arg, sizeof (int));
380 break;
381 case Q_QUOTAON:
382 // As far as I can tell, you can't turn quotas on or off on zfs
383 error = 0;
384#if __FreeBSD_version < 1400018
385 vfs_unbusy(vfsp);
386#endif
387 break;
388 case Q_QUOTAOFF:
389 error = ENOTSUP;
390#if __FreeBSD_version < 1400018
391 vfs_unbusy(vfsp);
392#endif
393 break;
394 case Q_SETQUOTA:
395 error = copyin(arg, &dqblk, sizeof (dqblk));
396 if (error == 0)
397 error = zfs_set_userquota(zfsvfs, quota_type,
398 "", id, dbtob(dqblk.dqb_bhardlimit));
399 break;
400 case Q_GETQUOTA:
401 error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
402 if (error == 0)
403 error = copyout(&dqblk, arg, sizeof (dqblk));
404 break;
405 default:
406 error = EINVAL;
407 break;
408 }
409done:
410 zfs_exit(zfsvfs, FTAG);
411 return (error);
412}
413
414
415boolean_t
416zfs_is_readonly(zfsvfs_t *zfsvfs)
417{
418 return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
419}
420
421static int
422zfs_sync(vfs_t *vfsp, int waitfor)
423{
424
425 /*
426 * Data integrity is job one. We don't want a compromised kernel
427 * writing to the storage pool, so we never sync during panic.
428 */
429 if (panicstr)
430 return (0);
431
432 /*
433 * Ignore the system syncher. ZFS already commits async data
434 * at zfs_txg_timeout intervals.
435 */
436 if (waitfor == MNT_LAZY)
437 return (0);
438
439 if (vfsp != NULL) {
440 /*
441 * Sync a specific filesystem.
442 */
443 zfsvfs_t *zfsvfs = vfsp->vfs_data;
444 dsl_pool_t *dp;
445 int error;
446
447 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
448 return (error);
449 dp = dmu_objset_pool(zfsvfs->z_os);
450
451 /*
452 * If the system is shutting down, then skip any
453 * filesystems which may exist on a suspended pool.
454 */
455 if (rebooting && spa_suspended(dp->dp_spa)) {
456 zfs_exit(zfsvfs, FTAG);
457 return (0);
458 }
459
460 if (zfsvfs->z_log != NULL)
461 zil_commit(zfsvfs->z_log, 0);
462
463 zfs_exit(zfsvfs, FTAG);
464 } else {
465 /*
466 * Sync all ZFS filesystems. This is what happens when you
467 * run sync(8). Unlike other filesystems, ZFS honors the
468 * request by waiting for all pools to commit all dirty data.
469 */
470 spa_sync_allpools();
471 }
472
473 return (0);
474}
475
476static void
477atime_changed_cb(void *arg, uint64_t newval)
478{
479 zfsvfs_t *zfsvfs = arg;
480
481 if (newval == TRUE) {
482 zfsvfs->z_atime = TRUE;
483 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
484 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
485 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
486 } else {
487 zfsvfs->z_atime = FALSE;
488 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
489 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
490 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
491 }
492}
493
494static void
495xattr_changed_cb(void *arg, uint64_t newval)
496{
497 zfsvfs_t *zfsvfs = arg;
498
499 if (newval == ZFS_XATTR_OFF) {
500 zfsvfs->z_flags &= ~ZSB_XATTR;
501 } else {
502 zfsvfs->z_flags |= ZSB_XATTR;
503
504 if (newval == ZFS_XATTR_SA)
505 zfsvfs->z_xattr_sa = B_TRUE;
506 else
507 zfsvfs->z_xattr_sa = B_FALSE;
508 }
509}
510
511static void
512blksz_changed_cb(void *arg, uint64_t newval)
513{
514 zfsvfs_t *zfsvfs = arg;
515 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
516 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
517 ASSERT(ISP2(newval));
518
519 zfsvfs->z_max_blksz = newval;
520 zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
521}
522
523static void
524readonly_changed_cb(void *arg, uint64_t newval)
525{
526 zfsvfs_t *zfsvfs = arg;
527
528 if (newval) {
529 /* XXX locking on vfs_flag? */
530 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
531 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
532 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
533 } else {
534 /* XXX locking on vfs_flag? */
535 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
536 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
537 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
538 }
539}
540
541static void
542setuid_changed_cb(void *arg, uint64_t newval)
543{
544 zfsvfs_t *zfsvfs = arg;
545
546 if (newval == FALSE) {
547 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
548 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
549 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
550 } else {
551 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
552 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
553 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
554 }
555}
556
557static void
558exec_changed_cb(void *arg, uint64_t newval)
559{
560 zfsvfs_t *zfsvfs = arg;
561
562 if (newval == FALSE) {
563 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
564 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
565 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
566 } else {
567 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
568 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
569 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
570 }
571}
572
573/*
574 * The nbmand mount option can be changed at mount time.
575 * We can't allow it to be toggled on live file systems or incorrect
576 * behavior may be seen from cifs clients
577 *
578 * This property isn't registered via dsl_prop_register(), but this callback
579 * will be called when a file system is first mounted
580 */
581static void
582nbmand_changed_cb(void *arg, uint64_t newval)
583{
584 zfsvfs_t *zfsvfs = arg;
585 if (newval == FALSE) {
586 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
587 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
588 } else {
589 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
590 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
591 }
592}
593
594static void
595snapdir_changed_cb(void *arg, uint64_t newval)
596{
597 zfsvfs_t *zfsvfs = arg;
598
599 zfsvfs->z_show_ctldir = newval;
600}
601
602static void
603acl_mode_changed_cb(void *arg, uint64_t newval)
604{
605 zfsvfs_t *zfsvfs = arg;
606
607 zfsvfs->z_acl_mode = newval;
608}
609
610static void
611acl_inherit_changed_cb(void *arg, uint64_t newval)
612{
613 zfsvfs_t *zfsvfs = arg;
614
615 zfsvfs->z_acl_inherit = newval;
616}
617
618static void
619acl_type_changed_cb(void *arg, uint64_t newval)
620{
621 zfsvfs_t *zfsvfs = arg;
622
623 zfsvfs->z_acl_type = newval;
624}
625
626static int
627zfs_register_callbacks(vfs_t *vfsp)
628{
629 struct dsl_dataset *ds = NULL;
630 objset_t *os = NULL;
631 zfsvfs_t *zfsvfs = NULL;
632 uint64_t nbmand;
633 boolean_t readonly = B_FALSE;
634 boolean_t do_readonly = B_FALSE;
635 boolean_t setuid = B_FALSE;
636 boolean_t do_setuid = B_FALSE;
637 boolean_t exec = B_FALSE;
638 boolean_t do_exec = B_FALSE;
639 boolean_t xattr = B_FALSE;
640 boolean_t atime = B_FALSE;
641 boolean_t do_atime = B_FALSE;
642 boolean_t do_xattr = B_FALSE;
643 int error = 0;
644
645 ASSERT3P(vfsp, !=, NULL);
646 zfsvfs = vfsp->vfs_data;
647 ASSERT3P(zfsvfs, !=, NULL);
648 os = zfsvfs->z_os;
649
650 /*
651 * This function can be called for a snapshot when we update snapshot's
652 * mount point, which isn't really supported.
653 */
654 if (dmu_objset_is_snapshot(os))
655 return (EOPNOTSUPP);
656
657 /*
658 * The act of registering our callbacks will destroy any mount
659 * options we may have. In order to enable temporary overrides
660 * of mount options, we stash away the current values and
661 * restore them after we register the callbacks.
662 */
663 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
664 !spa_writeable(dmu_objset_spa(os))) {
665 readonly = B_TRUE;
666 do_readonly = B_TRUE;
667 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
668 readonly = B_FALSE;
669 do_readonly = B_TRUE;
670 }
671 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
672 setuid = B_FALSE;
673 do_setuid = B_TRUE;
674 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
675 setuid = B_TRUE;
676 do_setuid = B_TRUE;
677 }
678 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
679 exec = B_FALSE;
680 do_exec = B_TRUE;
681 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
682 exec = B_TRUE;
683 do_exec = B_TRUE;
684 }
685 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
686 zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
687 do_xattr = B_TRUE;
688 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
689 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
690 do_xattr = B_TRUE;
691 } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
692 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
693 do_xattr = B_TRUE;
694 } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
695 zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
696 do_xattr = B_TRUE;
697 }
698 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
699 atime = B_FALSE;
700 do_atime = B_TRUE;
701 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
702 atime = B_TRUE;
703 do_atime = B_TRUE;
704 }
705
706 /*
707 * We need to enter pool configuration here, so that we can use
708 * dsl_prop_get_int_ds() to handle the special nbmand property below.
709 * dsl_prop_get_integer() can not be used, because it has to acquire
710 * spa_namespace_lock and we can not do that because we already hold
711 * z_teardown_lock. The problem is that spa_write_cachefile() is called
712 * with spa_namespace_lock held and the function calls ZFS vnode
713 * operations to write the cache file and thus z_teardown_lock is
714 * acquired after spa_namespace_lock.
715 */
716 ds = dmu_objset_ds(os);
717 dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
718
719 /*
720 * nbmand is a special property. It can only be changed at
721 * mount time.
722 *
723 * This is weird, but it is documented to only be changeable
724 * at mount time.
725 */
726 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
727 nbmand = B_FALSE;
728 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
729 nbmand = B_TRUE;
730 } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand)) != 0) {
731 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
732 return (error);
733 }
734
735 /*
736 * Register property callbacks.
737 *
738 * It would probably be fine to just check for i/o error from
739 * the first prop_register(), but I guess I like to go
740 * overboard...
741 */
742 error = dsl_prop_register(ds,
743 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
744 error = error ? error : dsl_prop_register(ds,
745 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
746 error = error ? error : dsl_prop_register(ds,
747 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
748 error = error ? error : dsl_prop_register(ds,
749 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
750 error = error ? error : dsl_prop_register(ds,
751 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
752 error = error ? error : dsl_prop_register(ds,
753 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
754 error = error ? error : dsl_prop_register(ds,
755 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
756 error = error ? error : dsl_prop_register(ds,
757 zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
758 error = error ? error : dsl_prop_register(ds,
759 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
760 error = error ? error : dsl_prop_register(ds,
761 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
762 zfsvfs);
763 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
764 if (error)
765 goto unregister;
766
767 /*
768 * Invoke our callbacks to restore temporary mount options.
769 */
770 if (do_readonly)
771 readonly_changed_cb(zfsvfs, readonly);
772 if (do_setuid)
773 setuid_changed_cb(zfsvfs, setuid);
774 if (do_exec)
775 exec_changed_cb(zfsvfs, exec);
776 if (do_xattr)
777 xattr_changed_cb(zfsvfs, xattr);
778 if (do_atime)
779 atime_changed_cb(zfsvfs, atime);
780
781 nbmand_changed_cb(zfsvfs, nbmand);
782
783 return (0);
784
785unregister:
786 dsl_prop_unregister_all(ds, zfsvfs);
787 return (error);
788}
789
790/*
791 * Associate this zfsvfs with the given objset, which must be owned.
792 * This will cache a bunch of on-disk state from the objset in the
793 * zfsvfs.
794 */
795static int
796zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
797{
798 int error;
799 uint64_t val;
800
801 zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
802 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
803 zfsvfs->z_os = os;
804
805 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
806 if (error != 0)
807 return (error);
808 if (zfsvfs->z_version >
809 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
810 (void) printf("Can't mount a version %lld file system "
811 "on a version %lld pool\n. Pool must be upgraded to mount "
812 "this file system.", (u_longlong_t)zfsvfs->z_version,
813 (u_longlong_t)spa_version(dmu_objset_spa(os)));
814 return (SET_ERROR(ENOTSUP));
815 }
816 error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
817 if (error != 0)
818 return (error);
819 zfsvfs->z_norm = (int)val;
820
821 error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
822 if (error != 0)
823 return (error);
824 zfsvfs->z_utf8 = (val != 0);
825
826 error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
827 if (error != 0)
828 return (error);
829 zfsvfs->z_case = (uint_t)val;
830
831 error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
832 if (error != 0)
833 return (error);
834 zfsvfs->z_acl_type = (uint_t)val;
835
836 /*
837 * Fold case on file systems that are always or sometimes case
838 * insensitive.
839 */
840 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
841 zfsvfs->z_case == ZFS_CASE_MIXED)
842 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
843
844 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
845 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
846
847 uint64_t sa_obj = 0;
848 if (zfsvfs->z_use_sa) {
849 /* should either have both of these objects or none */
850 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
851 &sa_obj);
852 if (error != 0)
853 return (error);
854
855 error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
856 if (error == 0 && val == ZFS_XATTR_SA)
857 zfsvfs->z_xattr_sa = B_TRUE;
858 }
859
860 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
861 &zfsvfs->z_attr_table);
862 if (error != 0)
863 return (error);
864
865 if (zfsvfs->z_version >= ZPL_VERSION_SA)
866 sa_register_update_callback(os, zfs_sa_upgrade);
867
868 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
869 &zfsvfs->z_root);
870 if (error != 0)
871 return (error);
872 ASSERT3U(zfsvfs->z_root, !=, 0);
873
874 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
875 &zfsvfs->z_unlinkedobj);
876 if (error != 0)
877 return (error);
878
879 error = zap_lookup(os, MASTER_NODE_OBJ,
880 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
881 8, 1, &zfsvfs->z_userquota_obj);
882 if (error == ENOENT)
883 zfsvfs->z_userquota_obj = 0;
884 else if (error != 0)
885 return (error);
886
887 error = zap_lookup(os, MASTER_NODE_OBJ,
888 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
889 8, 1, &zfsvfs->z_groupquota_obj);
890 if (error == ENOENT)
891 zfsvfs->z_groupquota_obj = 0;
892 else if (error != 0)
893 return (error);
894
895 error = zap_lookup(os, MASTER_NODE_OBJ,
896 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
897 8, 1, &zfsvfs->z_projectquota_obj);
898 if (error == ENOENT)
899 zfsvfs->z_projectquota_obj = 0;
900 else if (error != 0)
901 return (error);
902
903 error = zap_lookup(os, MASTER_NODE_OBJ,
904 zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
905 8, 1, &zfsvfs->z_userobjquota_obj);
906 if (error == ENOENT)
907 zfsvfs->z_userobjquota_obj = 0;
908 else if (error != 0)
909 return (error);
910
911 error = zap_lookup(os, MASTER_NODE_OBJ,
912 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
913 8, 1, &zfsvfs->z_groupobjquota_obj);
914 if (error == ENOENT)
915 zfsvfs->z_groupobjquota_obj = 0;
916 else if (error != 0)
917 return (error);
918
919 error = zap_lookup(os, MASTER_NODE_OBJ,
920 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
921 8, 1, &zfsvfs->z_projectobjquota_obj);
922 if (error == ENOENT)
923 zfsvfs->z_projectobjquota_obj = 0;
924 else if (error != 0)
925 return (error);
926
927 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
928 &zfsvfs->z_fuid_obj);
929 if (error == ENOENT)
930 zfsvfs->z_fuid_obj = 0;
931 else if (error != 0)
932 return (error);
933
934 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
935 &zfsvfs->z_shares_dir);
936 if (error == ENOENT)
937 zfsvfs->z_shares_dir = 0;
938 else if (error != 0)
939 return (error);
940
941 /*
942 * Only use the name cache if we are looking for a
943 * name on a file system that does not require normalization
944 * or case folding. We can also look there if we happen to be
945 * on a non-normalizing, mixed sensitivity file system IF we
946 * are looking for the exact name (which is always the case on
947 * FreeBSD).
948 */
949 zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
950 ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
951 !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
952
953 return (0);
954}
955
956taskq_t *zfsvfs_taskq;
957
958static void
959zfsvfs_task_unlinked_drain(void *context, int pending __unused)
960{
961
962 zfs_unlinked_drain((zfsvfs_t *)context);
963}
964
965int
966zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
967{
968 objset_t *os;
969 zfsvfs_t *zfsvfs;
970 int error;
971 boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
972
973 /*
974 * XXX: Fix struct statfs so this isn't necessary!
975 *
976 * The 'osname' is used as the filesystem's special node, which means
977 * it must fit in statfs.f_mntfromname, or else it can't be
978 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
979 * 'zfs unmount' to think it's not mounted when it is.
980 */
981 if (strlen(osname) >= MNAMELEN)
982 return (SET_ERROR(ENAMETOOLONG));
983
984 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
985
986 error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
987 &os);
988 if (error != 0) {
989 kmem_free(zfsvfs, sizeof (zfsvfs_t));
990 return (error);
991 }
992
993 error = zfsvfs_create_impl(zfvp, zfsvfs, os);
994
995 return (error);
996}
997
998
999int
1000zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
1001{
1002 int error;
1003
1004 zfsvfs->z_vfs = NULL;
1005 zfsvfs->z_parent = zfsvfs;
1006
1007 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1008 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1009 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1010 offsetof(znode_t, z_link_node));
1011 TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
1012 zfsvfs_task_unlinked_drain, zfsvfs);
1013 ZFS_TEARDOWN_INIT(zfsvfs);
1014 ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
1015 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1016 for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1017 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1018
1019 error = zfsvfs_init(zfsvfs, os);
1020 if (error != 0) {
1021 dmu_objset_disown(os, B_TRUE, zfsvfs);
1022 *zfvp = NULL;
1023 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1024 return (error);
1025 }
1026
1027 *zfvp = zfsvfs;
1028 return (0);
1029}
1030
1031static int
1032zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
1033{
1034 int error;
1035
1036 /*
1037 * Check for a bad on-disk format version now since we
1038 * lied about owning the dataset readonly before.
1039 */
1040 if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
1041 dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
1042 return (SET_ERROR(EROFS));
1043
1044 error = zfs_register_callbacks(zfsvfs->z_vfs);
1045 if (error)
1046 return (error);
1047
1048 /*
1049 * If we are not mounting (ie: online recv), then we don't
1050 * have to worry about replaying the log as we blocked all
1051 * operations out since we closed the ZIL.
1052 */
1053 if (mounting) {
1054 boolean_t readonly;
1055
1056 ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
1057 error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
1058 if (error)
1059 return (error);
1060 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
1061 &zfsvfs->z_kstat.dk_zil_sums);
1062
1063 /*
1064 * During replay we remove the read only flag to
1065 * allow replays to succeed.
1066 */
1067 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
1068 if (readonly != 0) {
1069 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
1070 } else {
1071 dsl_dir_t *dd;
1072 zap_stats_t zs;
1073
1074 if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
1075 &zs) == 0) {
1076 dataset_kstats_update_nunlinks_kstat(
1077 &zfsvfs->z_kstat, zs.zs_num_entries);
1078 dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
1079 "num_entries in unlinked set: %llu",
1080 (u_longlong_t)zs.zs_num_entries);
1081 }
1082
1083 zfs_unlinked_drain(zfsvfs);
1084 dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
1085 dd->dd_activity_cancelled = B_FALSE;
1086 }
1087
1088 /*
1089 * Parse and replay the intent log.
1090 *
1091 * Because of ziltest, this must be done after
1092 * zfs_unlinked_drain(). (Further note: ziltest
1093 * doesn't use readonly mounts, where
1094 * zfs_unlinked_drain() isn't called.) This is because
1095 * ziltest causes spa_sync() to think it's committed,
1096 * but actually it is not, so the intent log contains
1097 * many txg's worth of changes.
1098 *
1099 * In particular, if object N is in the unlinked set in
1100 * the last txg to actually sync, then it could be
1101 * actually freed in a later txg and then reallocated
1102 * in a yet later txg. This would write a "create
1103 * object N" record to the intent log. Normally, this
1104 * would be fine because the spa_sync() would have
1105 * written out the fact that object N is free, before
1106 * we could write the "create object N" intent log
1107 * record.
1108 *
1109 * But when we are in ziltest mode, we advance the "open
1110 * txg" without actually spa_sync()-ing the changes to
1111 * disk. So we would see that object N is still
1112 * allocated and in the unlinked set, and there is an
1113 * intent log record saying to allocate it.
1114 */
1115 if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
1116 if (zil_replay_disable) {
1117 zil_destroy(zfsvfs->z_log, B_FALSE);
1118 } else {
1119 boolean_t use_nc = zfsvfs->z_use_namecache;
1120 zfsvfs->z_use_namecache = B_FALSE;
1121 zfsvfs->z_replay = B_TRUE;
1122 zil_replay(zfsvfs->z_os, zfsvfs,
1123 zfs_replay_vector);
1124 zfsvfs->z_replay = B_FALSE;
1125 zfsvfs->z_use_namecache = use_nc;
1126 }
1127 }
1128
1129 /* restore readonly bit */
1130 if (readonly != 0)
1131 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
1132 } else {
1133 ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL);
1134 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
1135 &zfsvfs->z_kstat.dk_zil_sums);
1136 }
1137
1138 /*
1139 * Set the objset user_ptr to track its zfsvfs.
1140 */
1141 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1142 dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1143 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1144
1145 return (0);
1146}
1147
1148void
1149zfsvfs_free(zfsvfs_t *zfsvfs)
1150{
1151 int i;
1152
1153 zfs_fuid_destroy(zfsvfs);
1154
1155 mutex_destroy(&zfsvfs->z_znodes_lock);
1156 mutex_destroy(&zfsvfs->z_lock);
1157 list_destroy(&zfsvfs->z_all_znodes);
1158 ZFS_TEARDOWN_DESTROY(zfsvfs);
1159 ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
1160 rw_destroy(&zfsvfs->z_fuid_lock);
1161 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1162 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1163 dataset_kstats_destroy(&zfsvfs->z_kstat);
1164 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1165}
1166
1167static void
1168zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1169{
1170 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
1171 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1172}
1173
1174static int
1175zfs_domount(vfs_t *vfsp, char *osname)
1176{
1177 uint64_t recordsize, fsid_guid;
1178 int error = 0;
1179 zfsvfs_t *zfsvfs;
1180
1181 ASSERT3P(vfsp, !=, NULL);
1182 ASSERT3P(osname, !=, NULL);
1183
1184 error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
1185 if (error)
1186 return (error);
1187 zfsvfs->z_vfs = vfsp;
1188
1189 if ((error = dsl_prop_get_integer(osname,
1190 "recordsize", &recordsize, NULL)))
1191 goto out;
1192 zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
1193 zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
1194
1195 vfsp->vfs_data = zfsvfs;
1196 vfsp->mnt_flag |= MNT_LOCAL;
1197 vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
1198 vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
1199 vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
1200 /*
1201 * This can cause a loss of coherence between ARC and page cache
1202 * on ZoF - unclear if the problem is in FreeBSD or ZoF
1203 */
1204 vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */
1205 vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
1206 vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;
1207
1208#if defined(_KERNEL) && !defined(KMEM_DEBUG)
1209 vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
1210#endif
1211 /*
1212 * The fsid is 64 bits, composed of an 8-bit fs type, which
1213 * separates our fsid from any other filesystem types, and a
1214 * 56-bit objset unique ID. The objset unique ID is unique to
1215 * all objsets open on this system, provided by unique_create().
1216 * The 8-bit fs type must be put in the low bits of fsid[1]
1217 * because that's where other Solaris filesystems put it.
1218 */
1219 fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
1220 ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0);
1221 vfsp->vfs_fsid.val[0] = fsid_guid;
1222 vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) |
1223 (vfsp->mnt_vfc->vfc_typenum & 0xFF);
1224
1225 /*
1226 * Set features for file system.
1227 */
1228 zfs_set_fuid_feature(zfsvfs);
1229
1230 if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
1231 uint64_t pval;
1232
1233 atime_changed_cb(zfsvfs, B_FALSE);
1234 readonly_changed_cb(zfsvfs, B_TRUE);
1235 if ((error = dsl_prop_get_integer(osname,
1236 "xattr", &pval, NULL)))
1237 goto out;
1238 xattr_changed_cb(zfsvfs, pval);
1239 if ((error = dsl_prop_get_integer(osname,
1240 "acltype", &pval, NULL)))
1241 goto out;
1242 acl_type_changed_cb(zfsvfs, pval);
1243 zfsvfs->z_issnap = B_TRUE;
1244 zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
1245
1246 mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
1247 dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
1248 mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
1249 } else {
1250 if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
1251 goto out;
1252 }
1253
1254 vfs_mountedfrom(vfsp, osname);
1255
1256 if (!zfsvfs->z_issnap)
1257 zfsctl_create(zfsvfs);
1258out:
1259 if (error) {
1260 dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
1261 zfsvfs_free(zfsvfs);
1262 } else {
1263 atomic_inc_32(&zfs_active_fs_count);
1264 }
1265
1266 return (error);
1267}
1268
1269static void
1270zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1271{
1272 objset_t *os = zfsvfs->z_os;
1273
1274 if (!dmu_objset_is_snapshot(os))
1275 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1276}
1277
1278static int
1279getpoolname(const char *osname, char *poolname)
1280{
1281 char *p;
1282
1283 p = strchr(osname, '/');
1284 if (p == NULL) {
1285 if (strlen(osname) >= MAXNAMELEN)
1286 return (ENAMETOOLONG);
1287 (void) strcpy(poolname, osname);
1288 } else {
1289 if (p - osname >= MAXNAMELEN)
1290 return (ENAMETOOLONG);
1291 (void) strlcpy(poolname, osname, p - osname + 1);
1292 }
1293 return (0);
1294}
1295
1296static void
1297fetch_osname_options(char *name, bool *checkpointrewind)
1298{
1299
1300 if (name[0] == '!') {
1301 *checkpointrewind = true;
1302 memmove(name, name + 1, strlen(name));
1303 } else {
1304 *checkpointrewind = false;
1305 }
1306}
1307
1308static int
1309zfs_mount(vfs_t *vfsp)
1310{
1311 kthread_t *td = curthread;
1312 vnode_t *mvp = vfsp->mnt_vnodecovered;
1313 cred_t *cr = td->td_ucred;
1314 char *osname;
1315 int error = 0;
1316 int canwrite;
1317 bool checkpointrewind, isctlsnap = false;
1318
1319 if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
1320 return (SET_ERROR(EINVAL));
1321
1322 /*
1323 * If full-owner-access is enabled and delegated administration is
1324 * turned on, we must set nosuid.
1325 */
1326 if (zfs_super_owner &&
1327 dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
1328 secpolicy_fs_mount_clearopts(cr, vfsp);
1329 }
1330
1331 fetch_osname_options(osname, &checkpointrewind);
1332 isctlsnap = (mvp != NULL && zfsctl_is_node(mvp) &&
1333 strchr(osname, '@') != NULL);
1334
1335 /*
1336 * Check for mount privilege?
1337 *
1338 * If we don't have privilege then see if
1339 * we have local permission to allow it
1340 */
1341 error = secpolicy_fs_mount(cr, mvp, vfsp);
1342 if (error && isctlsnap) {
1343 secpolicy_fs_mount_clearopts(cr, vfsp);
1344 } else if (error) {
1345 if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
1346 goto out;
1347
1348 if (!(vfsp->vfs_flag & MS_REMOUNT)) {
1349 vattr_t vattr;
1350
1351 /*
1352 * Make sure user is the owner of the mount point
1353 * or has sufficient privileges.
1354 */
1355
1356 vattr.va_mask = AT_UID;
1357
1358 vn_lock(mvp, LK_SHARED | LK_RETRY);
1359 if (VOP_GETATTR(mvp, &vattr, cr)) {
1360 VOP_UNLOCK1(mvp);
1361 goto out;
1362 }
1363
1364 if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
1365 VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
1366 VOP_UNLOCK1(mvp);
1367 goto out;
1368 }
1369 VOP_UNLOCK1(mvp);
1370 }
1371
1372 secpolicy_fs_mount_clearopts(cr, vfsp);
1373 }
1374
1375 /*
1376 * Refuse to mount a filesystem if we are in a local zone and the
1377 * dataset is not visible.
1378 */
1379 if (!INGLOBALZONE(curproc) &&
1380 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1381 boolean_t mount_snapshot = B_FALSE;
1382
1383 /*
1384 * Snapshots may be mounted in .zfs for unjailed datasets
1385 * if allowed by the jail param zfs.mount_snapshot.
1386 */
1387 if (isctlsnap) {
1388 struct prison *pr;
1389 struct zfs_jailparam *zjp;
1390
1391 pr = curthread->td_ucred->cr_prison;
1392 mtx_lock(&pr->pr_mtx);
1393 zjp = osd_jail_get(pr, zfs_jailparam_slot);
1394 mtx_unlock(&pr->pr_mtx);
1395 if (zjp && zjp->mount_snapshot)
1396 mount_snapshot = B_TRUE;
1397 }
1398 if (!mount_snapshot) {
1399 error = SET_ERROR(EPERM);
1400 goto out;
1401 }
1402 }
1403
1404 vfsp->vfs_flag |= MNT_NFS4ACLS;
1405
1406 /*
1407 * When doing a remount, we simply refresh our temporary properties
1408 * according to those options set in the current VFS options.
1409 */
1410 if (vfsp->vfs_flag & MS_REMOUNT) {
1411 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1412
1413 /*
1414 * Refresh mount options with z_teardown_lock blocking I/O while
1415 * the filesystem is in an inconsistent state.
1416 * The lock also serializes this code with filesystem
1417 * manipulations between entry to zfs_suspend_fs() and return
1418 * from zfs_resume_fs().
1419 */
1420 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1421 zfs_unregister_callbacks(zfsvfs);
1422 error = zfs_register_callbacks(vfsp);
1423 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1424 goto out;
1425 }
1426
1427 /* Initial root mount: try hard to import the requested root pool. */
1428 if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
1429 (vfsp->vfs_flag & MNT_UPDATE) == 0) {
1430 char pname[MAXNAMELEN];
1431
1432 error = getpoolname(osname, pname);
1433 if (error == 0)
1434 error = spa_import_rootpool(pname, checkpointrewind);
1435 if (error)
1436 goto out;
1437 }
1438 DROP_GIANT();
1439 error = zfs_domount(vfsp, osname);
1440 PICKUP_GIANT();
1441
1442out:
1443 return (error);
1444}
1445
1446static int
1447zfs_statfs(vfs_t *vfsp, struct statfs *statp)
1448{
1449 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1450 uint64_t refdbytes, availbytes, usedobjs, availobjs;
1451 int error;
1452
1453 statp->f_version = STATFS_VERSION;
1454
1455 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1456 return (error);
1457
1458 dmu_objset_space(zfsvfs->z_os,
1459 &refdbytes, &availbytes, &usedobjs, &availobjs);
1460
1461 /*
1462 * The underlying storage pool actually uses multiple block sizes.
1463 * We report the fragsize as the smallest block size we support,
1464 * and we report our blocksize as the filesystem's maximum blocksize.
1465 */
1466 statp->f_bsize = SPA_MINBLOCKSIZE;
1467 statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
1468
1469 /*
1470 * The following report "total" blocks of various kinds in the
1471 * file system, but reported in terms of f_frsize - the
1472 * "fragment" size.
1473 */
1474
1475 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1476 statp->f_bfree = availbytes / statp->f_bsize;
1477 statp->f_bavail = statp->f_bfree; /* no root reservation */
1478
1479 /*
1480 * statvfs() should really be called statufs(), because it assumes
1481 * static metadata. ZFS doesn't preallocate files, so the best
1482 * we can do is report the max that could possibly fit in f_files,
1483 * and that minus the number actually used in f_ffree.
1484 * For f_ffree, report the smaller of the number of object available
1485 * and the number of blocks (each object will take at least a block).
1486 */
1487 statp->f_ffree = MIN(availobjs, statp->f_bfree);
1488 statp->f_files = statp->f_ffree + usedobjs;
1489
1490 /*
1491 * We're a zfs filesystem.
1492 */
1493 strlcpy(statp->f_fstypename, "zfs",
1494 sizeof (statp->f_fstypename));
1495
1496 strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1497 sizeof (statp->f_mntfromname));
1498 strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1499 sizeof (statp->f_mntonname));
1500
1501 statp->f_namemax = MAXNAMELEN - 1;
1502
1503 zfs_exit(zfsvfs, FTAG);
1504 return (0);
1505}
1506
1507static int
1508zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1509{
1510 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1511 znode_t *rootzp;
1512 int error;
1513
1514 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1515 return (error);
1516
1517 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1518 if (error == 0)
1519 *vpp = ZTOV(rootzp);
1520
1521 zfs_exit(zfsvfs, FTAG);
1522
1523 if (error == 0) {
1524 error = vn_lock(*vpp, flags);
1525 if (error != 0) {
1526 VN_RELE(*vpp);
1527 *vpp = NULL;
1528 }
1529 }
1530 return (error);
1531}
1532
1533/*
1534 * Teardown the zfsvfs::z_os.
1535 *
1536 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
1537 * and 'z_teardown_inactive_lock' held.
1538 */
1539static int
1540zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
1541{
1542 znode_t *zp;
1543 dsl_dir_t *dd;
1544
1545 /*
1546 * If someone has not already unmounted this file system,
1547 * drain the zrele_taskq to ensure all active references to the
1548 * zfsvfs_t have been handled; only then can it be safely destroyed.
1549 */
1550 if (zfsvfs->z_os) {
1551 /*
1552 * If we're unmounting we have to wait for the list to
1553 * drain completely.
1554 *
1555 * If we're not unmounting there's no guarantee the list
1556 * will drain completely, but zreles run from the taskq
1557 * may add the parents of dir-based xattrs to the taskq
1558 * so we want to wait for these.
1559 *
1560 * We can safely check z_all_znodes for being empty because the
1561 * VFS has already blocked operations which add to it.
1562 */
1563 int round = 0;
1564 while (!list_is_empty(&zfsvfs->z_all_znodes)) {
1565 taskq_wait_outstanding(dsl_pool_zrele_taskq(
1566 dmu_objset_pool(zfsvfs->z_os)), 0);
1567 if (++round > 1 && !unmounting)
1568 break;
1569 }
1570 }
1571 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1572
1573 if (!unmounting) {
1574 /*
1575 * We purge the parent filesystem's vfsp as the parent
1576 * filesystem and all of its snapshots have their vnode's
1577 * v_vfsp set to the parent's filesystem's vfsp. Note,
1578 * 'z_parent' is self referential for non-snapshots.
1579 */
1580#ifdef FREEBSD_NAMECACHE
1581#if __FreeBSD_version >= 1300117
1582 cache_purgevfs(zfsvfs->z_parent->z_vfs);
1583#else
1584 cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
1585#endif
1586#endif
1587 }
1588
1589 /*
1590 * Close the zil. NB: Can't close the zil while zfs_inactive
1591 * threads are blocked as zil_close can call zfs_inactive.
1592 */
1593 if (zfsvfs->z_log) {
1594 zil_close(zfsvfs->z_log);
1595 zfsvfs->z_log = NULL;
1596 }
1597
1598 ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);
1599
1600 /*
1601 * If we are not unmounting (ie: online recv) and someone already
1602 * unmounted this file system while we were doing the switcheroo,
1603 * or a reopen of z_os failed then just bail out now.
1604 */
1605 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
1606 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
1607 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1608 return (SET_ERROR(EIO));
1609 }
1610
1611 /*
1612 * At this point there are no vops active, and any new vops will
1613 * fail with EIO since we have z_teardown_lock for writer (only
1614 * relevant for forced unmount).
1615 *
1616 * Release all holds on dbufs.
1617 */
1618 mutex_enter(&zfsvfs->z_znodes_lock);
1619 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
1620 zp = list_next(&zfsvfs->z_all_znodes, zp)) {
1621 if (zp->z_sa_hdl != NULL) {
1622 zfs_znode_dmu_fini(zp);
1623 }
1624 }
1625 mutex_exit(&zfsvfs->z_znodes_lock);
1626
1627 /*
1628 * If we are unmounting, set the unmounted flag and let new vops
1629 * unblock. zfs_inactive will have the unmounted behavior, and all
1630 * other vops will fail with EIO.
1631 */
1632 if (unmounting) {
1633 zfsvfs->z_unmounted = B_TRUE;
1634 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
1635 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1636 }
1637
1638 /*
1639 * z_os will be NULL if there was an error in attempting to reopen
1640 * zfsvfs, so just return as the properties had already been
1641 * unregistered and cached data had been evicted before.
1642 */
1643 if (zfsvfs->z_os == NULL)
1644 return (0);
1645
1646 /*
1647 * Unregister properties.
1648 */
1649 zfs_unregister_callbacks(zfsvfs);
1650
1651 /*
1652 * Evict cached data
1653 */
1654 if (!zfs_is_readonly(zfsvfs))
1655 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1656 dmu_objset_evict_dbufs(zfsvfs->z_os);
1657 dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
1658 dsl_dir_cancel_waiters(dd);
1659
1660 return (0);
1661}
1662
1663static int
1664zfs_umount(vfs_t *vfsp, int fflag)
1665{
1666 kthread_t *td = curthread;
1667 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1668 objset_t *os;
1669 cred_t *cr = td->td_ucred;
1670 int ret;
1671
1672 ret = secpolicy_fs_unmount(cr, vfsp);
1673 if (ret) {
1674 if (dsl_deleg_access((char *)vfsp->vfs_resource,
1675 ZFS_DELEG_PERM_MOUNT, cr))
1676 return (ret);
1677 }
1678
1679 /*
1680 * Unmount any snapshots mounted under .zfs before unmounting the
1681 * dataset itself.
1682 */
1683 if (zfsvfs->z_ctldir != NULL) {
1684 if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
1685 return (ret);
1686 }
1687
1688 if (fflag & MS_FORCE) {
1689 /*
1690 * Mark file system as unmounted before calling
1691 * vflush(FORCECLOSE). This way we ensure no future vnops
1692 * will be called and risk operating on DOOMED vnodes.
1693 */
1694 ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
1695 zfsvfs->z_unmounted = B_TRUE;
1696 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
1697 }
1698
1699 /*
1700 * Flush all the files.
1701 */
1702 ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
1703 if (ret != 0)
1704 return (ret);
1705 while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
1706 &zfsvfs->z_unlinked_drain_task, NULL) != 0)
1707 taskqueue_drain(zfsvfs_taskq->tq_queue,
1708 &zfsvfs->z_unlinked_drain_task);
1709
1710 VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
1711 os = zfsvfs->z_os;
1712
1713 /*
1714 * z_os will be NULL if there was an error in
1715 * attempting to reopen zfsvfs.
1716 */
1717 if (os != NULL) {
1718 /*
1719 * Unset the objset user_ptr.
1720 */
1721 mutex_enter(&os->os_user_ptr_lock);
1722 dmu_objset_set_user(os, NULL);
1723 mutex_exit(&os->os_user_ptr_lock);
1724
1725 /*
1726 * Finally release the objset
1727 */
1728 dmu_objset_disown(os, B_TRUE, zfsvfs);
1729 }
1730
1731 /*
1732 * We can now safely destroy the '.zfs' directory node.
1733 */
1734 if (zfsvfs->z_ctldir != NULL)
1735 zfsctl_destroy(zfsvfs);
1736 zfs_freevfs(vfsp);
1737
1738 return (0);
1739}
1740
1741static int
1742zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
1743{
1744 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1745 znode_t *zp;
1746 int err;
1747
1748 /*
1749 * zfs_zget() can't operate on virtual entries like .zfs/ or
1750 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
1751 * This will make NFS switch to LOOKUP instead of using VGET.
1752 */
1753 if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
1754 (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
1755 return (EOPNOTSUPP);
1756
1757 if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1758 return (err);
1759 err = zfs_zget(zfsvfs, ino, &zp);
1760 if (err == 0 && zp->z_unlinked) {
1761 vrele(ZTOV(zp));
1762 err = EINVAL;
1763 }
1764 if (err == 0)
1765 *vpp = ZTOV(zp);
1766 zfs_exit(zfsvfs, FTAG);
1767 if (err == 0) {
1768 err = vn_lock(*vpp, flags);
1769 if (err != 0)
1770 vrele(*vpp);
1771 }
1772 if (err != 0)
1773 *vpp = NULL;
1774 return (err);
1775}
1776
1777static int
1778#if __FreeBSD_version >= 1300098
1779zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
1780 struct ucred **credanonp, int *numsecflavors, int *secflavors)
1781#else
1782zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
1783 struct ucred **credanonp, int *numsecflavors, int **secflavors)
1784#endif
1785{
1786 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1787
1788 /*
1789 * If this is a regular file system, vfsp is the same as
1790 * zfsvfs->z_parent->z_vfs, but if it is a snapshot,
1791 * zfsvfs->z_parent->z_vfs represents parent file system
1792 * which we have to use here, because only this file system
1793 * has mnt_export configured.
1794 */
1795 return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
1796 credanonp, numsecflavors, secflavors));
1797}
1798
1799_Static_assert(sizeof (struct fid) >= SHORT_FID_LEN,
1800 "struct fid bigger than SHORT_FID_LEN");
1801_Static_assert(sizeof (struct fid) >= LONG_FID_LEN,
1802 "struct fid bigger than LONG_FID_LEN");
1803
1804static int
1805zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
1806{
1807 struct componentname cn;
1808 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1809 znode_t *zp;
1810 vnode_t *dvp;
1811 uint64_t object = 0;
1812 uint64_t fid_gen = 0;
1813 uint64_t setgen = 0;
1814 uint64_t gen_mask;
1815 uint64_t zp_gen;
1816 int i, err;
1817
1818 *vpp = NULL;
1819
1820 if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1821 return (err);
1822
1823 /*
1824 * On FreeBSD we can get snapshot's mount point or its parent file
1825 * system mount point depending if snapshot is already mounted or not.
1826 */
1827 if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
1828 zfid_long_t *zlfid = (zfid_long_t *)fidp;
1829 uint64_t objsetid = 0;
1830
1831 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1832 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1833
1834 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1835 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1836
1837 zfs_exit(zfsvfs, FTAG);
1838
1839 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1840 if (err)
1841 return (SET_ERROR(EINVAL));
1842 if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1843 return (err);
1844 }
1845
1846 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1847 zfid_short_t *zfid = (zfid_short_t *)fidp;
1848
1849 for (i = 0; i < sizeof (zfid->zf_object); i++)
1850 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1851
1852 for (i = 0; i < sizeof (zfid->zf_gen); i++)
1853 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1854 } else {
1855 zfs_exit(zfsvfs, FTAG);
1856 return (SET_ERROR(EINVAL));
1857 }
1858
1859 if (fidp->fid_len == LONG_FID_LEN && setgen != 0) {
1860 zfs_exit(zfsvfs, FTAG);
43dbf881
AZ
1861 dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n",
1862 (u_longlong_t)fid_gen, (u_longlong_t)setgen);
1863 return (SET_ERROR(EINVAL));
1864 }
1865
9f0a21e6
MM
1866 /*
1867 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
1868 * directory tree. If the object == zfsvfs->z_shares_dir, then
1869 * we are in the .zfs/shares directory tree.
1870 */
1871 if ((fid_gen == 0 &&
1872 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
1873 (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
768eaced 1874 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
1875 VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
1876 if (object == ZFSCTL_INO_SNAPDIR) {
1877 cn.cn_nameptr = "snapshot";
1878 cn.cn_namelen = strlen(cn.cn_nameptr);
1879 cn.cn_nameiop = LOOKUP;
1880 cn.cn_flags = ISLASTCN | LOCKLEAF;
1881 cn.cn_lkflags = flags;
1882 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1883 vput(dvp);
1884 } else if (object == zfsvfs->z_shares_dir) {
1885 /*
1886			 * XXX This branch must not be taken;
1887			 * if it is, the lookup below will
1888			 * explode.
1889 */
1890 cn.cn_nameptr = "shares";
1891 cn.cn_namelen = strlen(cn.cn_nameptr);
1892 cn.cn_nameiop = LOOKUP;
1893 cn.cn_flags = ISLASTCN;
1894 cn.cn_lkflags = flags;
1895 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1896 vput(dvp);
1897 } else {
1898 *vpp = dvp;
1899 }
1900 return (err);
1901 }
1902
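	/*
	 * 'i' is left over from decoding zf_gen above, so gen_mask covers
	 * exactly the generation bits that fit in the fid; the wider on-disk
	 * generation number is compared modulo that width below.
	 */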
1903 gen_mask = -1ULL >> (64 - 8 * i);
1904
8e739b2c
RE
1905 dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object,
1906 (u_longlong_t)fid_gen,
1907 (u_longlong_t)gen_mask);
9f0a21e6 1908 if ((err = zfs_zget(zfsvfs, object, &zp))) {
768eaced 1909 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
1910 return (err);
1911 }
1912 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1913 sizeof (uint64_t));
1914 zp_gen = zp_gen & gen_mask;
1915 if (zp_gen == 0)
1916 zp_gen = 1;
1917 if (zp->z_unlinked || zp_gen != fid_gen) {
8e739b2c
RE
1918 dprintf("znode gen (%llu) != fid gen (%llu)\n",
1919 (u_longlong_t)zp_gen, (u_longlong_t)fid_gen);
9f0a21e6 1920 vrele(ZTOV(zp));
768eaced 1921 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
1922 return (SET_ERROR(EINVAL));
1923 }
1924
1925 *vpp = ZTOV(zp);
768eaced 1926 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
1927 err = vn_lock(*vpp, flags);
1928 if (err == 0)
1929 vnode_create_vobject(*vpp, zp->z_size, curthread);
1930 else
1931 *vpp = NULL;
1932 return (err);
1933}
1934
1935/*
1936 * Block out VOPs and close zfsvfs_t::z_os
1937 *
1938 * Note that, if successful, we return with the 'z_teardown_lock' and
1939 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
1940 * dataset and objset intact so that they can be atomically handed off during
1941 * a subsequent rollback or recv operation and the resume thereafter.
1942 */
1943int
1944zfs_suspend_fs(zfsvfs_t *zfsvfs)
1945{
1946 int error;
1947
1948 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1949 return (error);
1950
1951 return (0);
1952}
1953
1954/*
1955 * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
1956 * is an invariant across any of the operations that can be performed while the
1957 * filesystem was suspended. Whether it succeeded or failed, the preconditions
1958 * are the same: the relevant objset and associated dataset are owned by
1959 * zfsvfs, held, and long held on entry.
1960 */
1961int
1962zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
1963{
1964 int err;
1965 znode_t *zp;
1966
5ebe425a 1967 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
9847f77f 1968 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
9f0a21e6
MM
1969
1970 /*
1971 * We already own this, so just update the objset_t, as the one we
1972 * had before may have been evicted.
1973 */
1974 objset_t *os;
1975 VERIFY3P(ds->ds_owner, ==, zfsvfs);
1976 VERIFY(dsl_dataset_long_held(ds));
1977 dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
1978 dsl_pool_config_enter(dp, FTAG);
1979 VERIFY0(dmu_objset_from_ds(ds, &os));
1980 dsl_pool_config_exit(dp, FTAG);
1981
1982 err = zfsvfs_init(zfsvfs, os);
1983 if (err != 0)
1984 goto bail;
1985
1986 ds->ds_dir->dd_activity_cancelled = B_FALSE;
e4efb709 1987 VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));
9f0a21e6
MM
1988
1989 zfs_set_fuid_feature(zfsvfs);
1990
1991 /*
1992 * Attempt to re-establish all the active znodes with
1993 * their dbufs. If a zfs_rezget() fails, then we'll let
768eaced 1994 * any potential callers discover that via zfs_enter_verify_zp
9f0a21e6
MM
1995 * when they try to use their znode.
1996 */
1997 mutex_enter(&zfsvfs->z_znodes_lock);
1998 for (zp = list_head(&zfsvfs->z_all_znodes); zp;
1999 zp = list_next(&zfsvfs->z_all_znodes, zp)) {
2000 (void) zfs_rezget(zp);
2001 }
2002 mutex_exit(&zfsvfs->z_znodes_lock);
2003
2004bail:
2005 /* release the VOPs */
9847f77f 2006 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
5ebe425a 2007 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
9f0a21e6
MM
2008
2009 if (err) {
2010 /*
2011 * Since we couldn't setup the sa framework, try to force
2012 * unmount this file system.
2013 */
2014 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
2015 vfs_ref(zfsvfs->z_vfs);
2016 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
2017 }
2018 }
2019 return (err);
2020}
2021
2022static void
2023zfs_freevfs(vfs_t *vfsp)
2024{
2025 zfsvfs_t *zfsvfs = vfsp->vfs_data;
2026
2027 zfsvfs_free(zfsvfs);
2028
2029 atomic_dec_32(&zfs_active_fs_count);
2030}
2031
2032#ifdef __i386__
2033static int desiredvnodes_backup;
47ed79ff
MM
2034#include <sys/vmmeter.h>
2035
2036
2037#include <vm/vm_page.h>
2038#include <vm/vm_object.h>
2039#include <vm/vm_kern.h>
2040#include <vm/vm_map.h>
9f0a21e6
MM
2041#endif
2042
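/*
 * Lower desiredvnodes on i386, remembering the previous value so that
 * zfs_vnodes_adjust_back() can restore it at module unload; a no-op on
 * all other platforms.
 */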
2043static void
2044zfs_vnodes_adjust(void)
2045{
2046#ifdef __i386__
2047 int newdesiredvnodes;
2048
2049 desiredvnodes_backup = desiredvnodes;
2050
2051 /*
2052 * We calculate newdesiredvnodes the same way it is done in
2053 * vntblinit(). If it is equal to desiredvnodes, it means that
2054 * it wasn't tuned by the administrator and we can tune it down.
2055 */
2056 newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
2057 vm_kmem_size / (5 * (sizeof (struct vm_object) +
2058 sizeof (struct vnode))));
2059 if (newdesiredvnodes == desiredvnodes)
2060 desiredvnodes = (3 * newdesiredvnodes) / 4;
2061#endif
2062}
2063
2064static void
2065zfs_vnodes_adjust_back(void)
2066{
2067
2068#ifdef __i386__
2069 desiredvnodes = desiredvnodes_backup;
2070#endif
2071}
2072
799e09f7
AM
2073#if __FreeBSD_version >= 1300139
2074static struct sx zfs_vnlru_lock;
2075static struct vnode *zfs_vnlru_marker;
2076#endif
2077static arc_prune_t *zfs_prune;
2078
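/*
 * ARC prune callback: ask the vnode LRU to reclaim up to nr_to_scan
 * vnodes backed by ZFS.  On FreeBSD >= 1300139 vnlru_free_vfsops() takes
 * a dedicated marker vnode, and zfs_vnlru_lock serializes its use.
 */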
2079static void
2080zfs_prune_task(uint64_t nr_to_scan, void *arg __unused)
2081{
2082 if (nr_to_scan > INT_MAX)
2083 nr_to_scan = INT_MAX;
2084#if __FreeBSD_version >= 1300139
2085 sx_xlock(&zfs_vnlru_lock);
2086 vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker);
2087 sx_xunlock(&zfs_vnlru_lock);
2088#else
2089 vnlru_free(nr_to_scan, &zfs_vfsops);
2090#endif
2091}
2092
9f0a21e6
MM
2093void
2094zfs_init(void)
2095{
2096
2097 printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
2098
2099 /*
2100 * Initialize .zfs directory structures
2101 */
2102 zfsctl_init();
2103
2104 /*
2105 * Initialize znode cache, vnode ops, etc...
2106 */
2107 zfs_znode_init();
2108
2109 /*
2110	 * Reduce the number of vnodes. The default number of vnodes is
2111	 * calculated with UFS inodes in mind, which is too big for ZFS/i386,
2112	 * so we reduce it here.
2113 */
2114 zfs_vnodes_adjust();
2115
7bcb7f08 2116 dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
9f0a21e6
MM
2117
2118 zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
799e09f7
AM
2119
2120#if __FreeBSD_version >= 1300139
2121 zfs_vnlru_marker = vnlru_alloc_marker();
2122 sx_init(&zfs_vnlru_lock, "zfs vnlru lock");
2123#endif
2124 zfs_prune = arc_add_prune_callback(zfs_prune_task, NULL);
9f0a21e6
MM
2125}
2126
2127void
2128zfs_fini(void)
2129{
799e09f7
AM
2130 arc_remove_prune_callback(zfs_prune);
2131#if __FreeBSD_version >= 1300139
2132 vnlru_free_marker(zfs_vnlru_marker);
2133 sx_destroy(&zfs_vnlru_lock);
2134#endif
2135
9f0a21e6
MM
2136 taskq_destroy(zfsvfs_taskq);
2137 zfsctl_fini();
2138 zfs_znode_fini();
2139 zfs_vnodes_adjust_back();
2140}
2141
2142int
2143zfs_busy(void)
2144{
2145 return (zfs_active_fs_count != 0);
2146}
2147
2148/*
2149 * Release VOPs and unmount a suspended filesystem.
2150 */
2151int
2152zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2153{
5ebe425a 2154 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
9847f77f 2155 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
9f0a21e6
MM
2156
2157 /*
2158 * We already own this, so just hold and rele it to update the
2159 * objset_t, as the one we had before may have been evicted.
2160 */
2161 objset_t *os;
2162 VERIFY3P(ds->ds_owner, ==, zfsvfs);
2163 VERIFY(dsl_dataset_long_held(ds));
2164 dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
2165 dsl_pool_config_enter(dp, FTAG);
2166 VERIFY0(dmu_objset_from_ds(ds, &os));
2167 dsl_pool_config_exit(dp, FTAG);
2168 zfsvfs->z_os = os;
2169
2170 /* release the VOPs */
9847f77f 2171 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
5ebe425a 2172 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
9f0a21e6
MM
2173
2174 /*
2175 * Try to force unmount this file system.
2176 */
2177 (void) zfs_umount(zfsvfs->z_vfs, 0);
2178 zfsvfs->z_unmounted = B_TRUE;
2179 return (0);
2180}
2181
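/*
 * Upgrade the ZPL (on-disk file system) version of this dataset.  Reached
 * from user space when the "version" property is raised; for example (a
 * sketch, assuming the standard CLI): "zfs upgrade -V 5 <dataset>".
 */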
2182int
2183zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2184{
2185 int error;
2186 objset_t *os = zfsvfs->z_os;
2187 dmu_tx_t *tx;
2188
2189 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2190 return (SET_ERROR(EINVAL));
2191
2192 if (newvers < zfsvfs->z_version)
2193 return (SET_ERROR(EINVAL));
2194
2195 if (zfs_spa_version_map(newvers) >
2196 spa_version(dmu_objset_spa(zfsvfs->z_os)))
2197 return (SET_ERROR(ENOTSUP));
2198
2199 tx = dmu_tx_create(os);
2200 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2201 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2202 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2203 ZFS_SA_ATTRS);
2204 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2205 }
2206 error = dmu_tx_assign(tx, TXG_WAIT);
2207 if (error) {
2208 dmu_tx_abort(tx);
2209 return (error);
2210 }
2211
2212 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2213 8, 1, &newvers, tx);
2214
2215 if (error) {
2216 dmu_tx_commit(tx);
2217 return (error);
2218 }
2219
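	/*
	 * First upgrade to a version that supports system attributes:
	 * create the SA master node, record it under ZFS_SA_ATTRS, and
	 * register the callback that migrates znodes to SA layouts.
	 */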
2220 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2221 uint64_t sa_obj;
2222
2223 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2224 SPA_VERSION_SA);
2225 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2226 DMU_OT_NONE, 0, tx);
2227
2228 error = zap_add(os, MASTER_NODE_OBJ,
2229 ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2230 ASSERT0(error);
2231
e4efb709 2232 VERIFY0(sa_set_sa_object(os, sa_obj));
9f0a21e6
MM
2233 sa_register_update_callback(os, zfs_sa_upgrade);
2234 }
2235
2236 spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
47ed79ff
MM
2237 "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
2238 (uintmax_t)newvers);
9f0a21e6
MM
2239 dmu_tx_commit(tx);
2240
2241 zfsvfs->z_version = newvers;
2242 os->os_version = newvers;
2243
2244 zfs_set_fuid_feature(zfsvfs);
2245
2246 return (0);
2247}
2248
9f0a21e6 2249/*
dd4bc569 2250 * Return true if the corresponding vfs's unmounted flag is set.
9f0a21e6
MM
2251 * Otherwise return false.
2252 * If this function returns true we know VFS unmount has been initiated.
2253 */
2254boolean_t
2255zfs_get_vfs_flag_unmounted(objset_t *os)
2256{
2257 zfsvfs_t *zfvp;
2258 boolean_t unmounted = B_FALSE;
2259
e4efb709 2260 ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS);
9f0a21e6
MM
2261
2262 mutex_enter(&os->os_user_ptr_lock);
2263 zfvp = dmu_objset_get_user(os);
2264 if (zfvp != NULL && zfvp->z_vfs != NULL &&
2265 (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
2266 unmounted = B_TRUE;
2267 mutex_exit(&os->os_user_ptr_lock);
2268
2269 return (unmounted);
2270}
2271
2272#ifdef _KERNEL
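/*
 * After a dataset rename, rewrite the f_mntfromname of every mount that
 * referenced the old name: exact matches are replaced outright, and
 * descendants ("oldname/...") and snapshots ("oldname@...") have the
 * prefix substituted.
 */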
2273void
2274zfsvfs_update_fromname(const char *oldname, const char *newname)
2275{
2276 char tmpbuf[MAXPATHLEN];
2277 struct mount *mp;
2278 char *fromname;
2279 size_t oldlen;
2280
2281 oldlen = strlen(oldname);
2282
2283 mtx_lock(&mountlist_mtx);
2284 TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2285 fromname = mp->mnt_stat.f_mntfromname;
2286 if (strcmp(fromname, oldname) == 0) {
2287 (void) strlcpy(fromname, newname,
2288 sizeof (mp->mnt_stat.f_mntfromname));
2289 continue;
2290 }
2291 if (strncmp(fromname, oldname, oldlen) == 0 &&
2292 (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
2293 (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s",
2294 newname, fromname + oldlen);
2295 (void) strlcpy(fromname, tmpbuf,
2296 sizeof (mp->mnt_stat.f_mntfromname));
2297 continue;
2298 }
2299 }
2300 mtx_unlock(&mountlist_mtx);
2301}
2302#endif
595d3ac2
AJ
2303
2304/*
2305 * Find a prison with ZFS info.
2306 * Return the ZFS info and the (locked) prison.
2307 */
2308static struct zfs_jailparam *
2309zfs_jailparam_find(struct prison *spr, struct prison **prp)
2310{
2311 struct prison *pr;
2312 struct zfs_jailparam *zjp;
2313
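	/*
	 * Walk up the jail hierarchy until we hit a prison with its own ZFS
	 * info in its OSD slot; prison0 falls back to the global
	 * zfs_jailparam0 defaults.  The matching prison is returned locked.
	 */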
2314 for (pr = spr; ; pr = pr->pr_parent) {
2315 mtx_lock(&pr->pr_mtx);
2316 if (pr == &prison0) {
2317 zjp = &zfs_jailparam0;
2318 break;
2319 }
2320 zjp = osd_jail_get(pr, zfs_jailparam_slot);
2321 if (zjp != NULL)
2322 break;
2323 mtx_unlock(&pr->pr_mtx);
2324 }
2325 *prp = pr;
2326
2327 return (zjp);
2328}
2329
2330/*
2331 * Ensure a prison has its own ZFS info. If zjpp is non-null, point it to the
2332 * ZFS info and lock the prison.
2333 */
2334static void
2335zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp)
2336{
2337 struct prison *ppr;
2338 struct zfs_jailparam *zjp, *nzjp;
2339 void **rsv;
2340
2341 /* If this prison already has ZFS info, return that. */
2342 zjp = zfs_jailparam_find(pr, &ppr);
2343 if (ppr == pr)
2344 goto done;
2345
2346 /*
2347 * Allocate a new info record. Then check again, in case something
2348 * changed during the allocation.
2349 */
2350 mtx_unlock(&ppr->pr_mtx);
2351 nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK);
2352 rsv = osd_reserve(zfs_jailparam_slot);
2353 zjp = zfs_jailparam_find(pr, &ppr);
2354 if (ppr == pr) {
2355 free(nzjp, M_PRISON);
2356 osd_free_reserved(rsv);
2357 goto done;
2358 }
2359 /* Inherit the initial values from the ancestor. */
2360 mtx_lock(&pr->pr_mtx);
2361 (void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp);
2362 (void) memcpy(nzjp, zjp, sizeof (*zjp));
2363 zjp = nzjp;
2364 mtx_unlock(&ppr->pr_mtx);
2365done:
2366 if (zjpp != NULL)
2367 *zjpp = zjp;
2368 else
2369 mtx_unlock(&pr->pr_mtx);
2370}
2371
2372/*
2373 * Jail OSD methods for ZFS VFS info.
2374 */
2375static int
2376zfs_jailparam_create(void *obj, void *data)
2377{
2378 struct prison *pr = obj;
2379 struct vfsoptlist *opts = data;
2380 int jsys;
2381
2382 if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 &&
2383 jsys == JAIL_SYS_INHERIT)
2384 return (0);
2385 /*
2386 * Inherit a prison's initial values from its parent
2387 * (different from JAIL_SYS_INHERIT which also inherits changes).
2388 */
2389 zfs_jailparam_alloc(pr, NULL);
2390 return (0);
2391}
2392
2393static int
2394zfs_jailparam_get(void *obj, void *data)
2395{
2396 struct prison *ppr, *pr = obj;
2397 struct vfsoptlist *opts = data;
2398 struct zfs_jailparam *zjp;
2399 int jsys, error;
2400
2401 zjp = zfs_jailparam_find(pr, &ppr);
2402 jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
2403 error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys));
2404 if (error != 0 && error != ENOENT)
2405 goto done;
2406 if (jsys == JAIL_SYS_NEW) {
2407 error = vfs_setopt(opts, "zfs.mount_snapshot",
2408 &zjp->mount_snapshot, sizeof (zjp->mount_snapshot));
2409 if (error != 0 && error != ENOENT)
2410 goto done;
2411 } else {
2412 /*
2413 * If this prison is inheriting its ZFS info, report
2414 * empty/zero parameters.
2415 */
2416 static int mount_snapshot = 0;
2417
2418 error = vfs_setopt(opts, "zfs.mount_snapshot",
2419 &mount_snapshot, sizeof (mount_snapshot));
2420 if (error != 0 && error != ENOENT)
2421 goto done;
2422 }
2423 error = 0;
2424done:
2425 mtx_unlock(&ppr->pr_mtx);
2426 return (error);
2427}
2428
2429static int
2430zfs_jailparam_set(void *obj, void *data)
2431{
2432 struct prison *pr = obj;
2433 struct prison *ppr;
2434 struct vfsoptlist *opts = data;
2435 int error, jsys, mount_snapshot;
2436
2437 /* Set the parameters, which should be correct. */
2438 error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2439 if (error == ENOENT)
2440 jsys = -1;
2441 error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2442 sizeof (mount_snapshot));
2443 if (error == ENOENT)
2444 mount_snapshot = -1;
2445 else
2446 jsys = JAIL_SYS_NEW;
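	/*
	 * Note that supplying any zfs.* parameter without an explicit "zfs"
	 * policy implies "zfs=new", i.e. the prison gets its own ZFS info.
	 */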
1d56c6d0
AJ
2447 switch (jsys) {
2448 case JAIL_SYS_NEW:
2449 {
595d3ac2
AJ
2450 /* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */
2451 struct zfs_jailparam *zjp;
2452
2453 /*
2454 * A child jail cannot have more permissions than its parent
2455 */
2456 if (pr->pr_parent != &prison0) {
2457 zjp = zfs_jailparam_find(pr->pr_parent, &ppr);
2458 mtx_unlock(&ppr->pr_mtx);
2459 if (zjp->mount_snapshot < mount_snapshot) {
2460 return (EPERM);
2461 }
2462 }
2463 zfs_jailparam_alloc(pr, &zjp);
2464 if (mount_snapshot != -1)
2465 zjp->mount_snapshot = mount_snapshot;
2466 mtx_unlock(&pr->pr_mtx);
1d56c6d0
AJ
2467 break;
2468 }
2469 case JAIL_SYS_INHERIT:
595d3ac2
AJ
2470 /* "zfs=inherit": inherit the parent's ZFS info. */
2471 mtx_lock(&pr->pr_mtx);
2472 osd_jail_del(pr, zfs_jailparam_slot);
2473 mtx_unlock(&pr->pr_mtx);
1d56c6d0
AJ
2474 break;
2475 case -1:
2476 /*
2477 * If the setting being changed is not ZFS related
2478 * then do nothing.
2479 */
2480 break;
595d3ac2 2481 }
1d56c6d0 2482
595d3ac2
AJ
2483 return (0);
2484}
2485
2486static int
2487zfs_jailparam_check(void *obj __unused, void *data)
2488{
2489 struct vfsoptlist *opts = data;
2490 int error, jsys, mount_snapshot;
2491
2492 /* Check that the parameters are correct. */
2493 error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2494 if (error != ENOENT) {
2495 if (error != 0)
2496 return (error);
2497 if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT)
2498 return (EINVAL);
2499 }
2500 error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2501 sizeof (mount_snapshot));
2502 if (error != ENOENT) {
2503 if (error != 0)
2504 return (error);
2505 if (mount_snapshot != 0 && mount_snapshot != 1)
2506 return (EINVAL);
2507 }
2508 return (0);
2509}
2510
2511static void
2512zfs_jailparam_destroy(void *data)
2513{
2514
2515 free(data, M_PRISON);
2516}
2517
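/*
 * Register the jail OSD methods at module load and seed every existing
 * prison with the defaults.  Once registered, the parameters can be set
 * through jail(8); for example (a sketch, exact syntax may vary):
 * "jail -c name=j1 path=/j1 persist zfs.mount_snapshot=1", or
 * "zfs=inherit" to track the parent prison's settings.
 */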
2518static void
2519zfs_jailparam_sysinit(void *arg __unused)
2520{
2521 struct prison *pr;
2522 osd_method_t methods[PR_MAXMETHOD] = {
2523 [PR_METHOD_CREATE] = zfs_jailparam_create,
2524 [PR_METHOD_GET] = zfs_jailparam_get,
2525 [PR_METHOD_SET] = zfs_jailparam_set,
2526 [PR_METHOD_CHECK] = zfs_jailparam_check,
2527 };
2528
2529 zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods);
2530 /* Copy the defaults to any existing prisons. */
2531 sx_slock(&allprison_lock);
2532 TAILQ_FOREACH(pr, &allprison, pr_list)
2533 zfs_jailparam_alloc(pr, NULL);
2534 sx_sunlock(&allprison_lock);
2535}
2536
2537static void
2538zfs_jailparam_sysuninit(void *arg __unused)
2539{
2540
2541 osd_jail_deregister(zfs_jailparam_slot);
2542}
2543
2544SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY,
2545 zfs_jailparam_sysinit, NULL);
2546SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY,
2547 zfs_jailparam_sysuninit, NULL);