]> git.proxmox.com Git - mirror_zfs.git/blame - module/os/freebsd/zfs/zfs_vfsops.c
zio can deadlock during device removal
[mirror_zfs.git] / module / os / freebsd / zfs / zfs_vfsops.c
CommitLineData
9f0a21e6
MM
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
1d3ba0bf 9 * or https://opensource.org/licenses/CDDL-1.0.
9f0a21e6
MM
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
24 * All rights reserved.
25 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
26 * Copyright (c) 2014 Integros [integros.com]
27 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
28 */
29
30/* Portions Copyright 2010 Robert Milkowski */
31
32#include <sys/types.h>
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/kernel.h>
36#include <sys/sysmacros.h>
37#include <sys/kmem.h>
38#include <sys/acl.h>
39#include <sys/vnode.h>
40#include <sys/vfs.h>
41#include <sys/mntent.h>
42#include <sys/mount.h>
43#include <sys/cmn_err.h>
44#include <sys/zfs_znode.h>
ab8c935e 45#include <sys/zfs_vnops.h>
9f0a21e6
MM
46#include <sys/zfs_dir.h>
47#include <sys/zil.h>
48#include <sys/fs/zfs.h>
49#include <sys/dmu.h>
50#include <sys/dsl_prop.h>
51#include <sys/dsl_dataset.h>
52#include <sys/dsl_deleg.h>
53#include <sys/spa.h>
54#include <sys/zap.h>
55#include <sys/sa.h>
56#include <sys/sa_impl.h>
57#include <sys/policy.h>
58#include <sys/atomic.h>
59#include <sys/zfs_ioctl.h>
60#include <sys/zfs_ctldir.h>
61#include <sys/zfs_fuid.h>
62#include <sys/sunddi.h>
63#include <sys/dmu_objset.h>
64#include <sys/dsl_dir.h>
9f0a21e6 65#include <sys/jail.h>
595d3ac2 66#include <sys/osd.h>
9f0a21e6
MM
67#include <ufs/ufs/quota.h>
68#include <sys/zfs_quota.h>
69
70#include "zfs_comutil.h"
71
/*
 * Fallback values, used only when the headers included above do not
 * already provide these macros.
 */
#if !defined(MNTK_VMSETSIZE_BUG)
#define	MNTK_VMSETSIZE_BUG	0
#endif
#if !defined(MNTK_NOMSYNC)
#define	MNTK_NOMSYNC	8
#endif
78
9f0a21e6
MM
79struct mtx zfs_debug_mtx;
80MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
81
82SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
83
84int zfs_super_owner;
85SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
7ada752a 86 "File system owners can perform privileged operation on file systems");
9f0a21e6
MM
87
88int zfs_debug_level;
89SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
90 "Debug level");
91
595d3ac2
AJ
92struct zfs_jailparam {
93 int mount_snapshot;
94};
95
96static struct zfs_jailparam zfs_jailparam0 = {
97 .mount_snapshot = 0,
98};
99
100static int zfs_jailparam_slot;
101
102SYSCTL_JAIL_PARAM_SYS_NODE(zfs, CTLFLAG_RW, "Jail ZFS parameters");
103SYSCTL_JAIL_PARAM(_zfs, mount_snapshot, CTLTYPE_INT | CTLFLAG_RW, "I",
104 "Allow mounting snapshots in the .zfs directory for unjailed datasets");
105
9f0a21e6
MM
106SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
107static int zfs_version_acl = ZFS_ACL_VERSION;
108SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
7ada752a 109 "ZFS_ACL_VERSION");
9f0a21e6
MM
110static int zfs_version_spa = SPA_VERSION;
111SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
7ada752a 112 "SPA_VERSION");
9f0a21e6
MM
113static int zfs_version_zpl = ZPL_VERSION;
114SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
7ada752a 115 "ZPL_VERSION");
9f0a21e6 116
8dddb25d
JH
117#if __FreeBSD_version >= 1400018
118static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg,
119 bool *mp_busy);
120#else
9f0a21e6 121static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
8dddb25d 122#endif
9f0a21e6
MM
123static int zfs_mount(vfs_t *vfsp);
124static int zfs_umount(vfs_t *vfsp, int fflag);
125static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
126static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
127static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
128static int zfs_sync(vfs_t *vfsp, int waitfor);
2e6af52b
RM
129#if __FreeBSD_version >= 1300098
130static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
131 struct ucred **credanonp, int *numsecflavors, int *secflavors);
132#else
9f0a21e6
MM
133static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
134 struct ucred **credanonp, int *numsecflavors, int **secflavors);
2e6af52b 135#endif
9f0a21e6
MM
136static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
137static void zfs_freevfs(vfs_t *vfsp);
138
139struct vfsops zfs_vfsops = {
140 .vfs_mount = zfs_mount,
141 .vfs_unmount = zfs_umount,
142#if __FreeBSD_version >= 1300049
143 .vfs_root = vfs_cache_root,
144 .vfs_cachedroot = zfs_root,
145#else
146 .vfs_root = zfs_root,
147#endif
148 .vfs_statfs = zfs_statfs,
149 .vfs_vget = zfs_vget,
150 .vfs_sync = zfs_sync,
151 .vfs_checkexp = zfs_checkexp,
152 .vfs_fhtovp = zfs_fhtovp,
153 .vfs_quotactl = zfs_quotactl,
154};
155
156VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
157
158/*
159 * We need to keep a count of active fs's.
160 * This is necessary to prevent our module
161 * from being unloaded after a umount -f
162 */
163static uint32_t zfs_active_fs_count = 0;
164
165int
166zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
167 char *setpoint)
168{
169 int error;
170 zfsvfs_t *zfvp;
171 vfs_t *vfsp;
172 objset_t *os;
173 uint64_t tmp = *val;
174
175 error = dmu_objset_from_ds(ds, &os);
176 if (error != 0)
177 return (error);
178
179 error = getzfsvfs_impl(os, &zfvp);
180 if (error != 0)
181 return (error);
182 if (zfvp == NULL)
183 return (ENOENT);
184 vfsp = zfvp->z_vfs;
185 switch (zfs_prop) {
186 case ZFS_PROP_ATIME:
187 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
188 tmp = 0;
189 if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
190 tmp = 1;
191 break;
192 case ZFS_PROP_DEVICES:
193 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
194 tmp = 0;
195 if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
196 tmp = 1;
197 break;
198 case ZFS_PROP_EXEC:
199 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
200 tmp = 0;
201 if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
202 tmp = 1;
203 break;
204 case ZFS_PROP_SETUID:
205 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
206 tmp = 0;
207 if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
208 tmp = 1;
209 break;
210 case ZFS_PROP_READONLY:
211 if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
212 tmp = 0;
213 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
214 tmp = 1;
215 break;
216 case ZFS_PROP_XATTR:
217 if (zfvp->z_flags & ZSB_XATTR)
218 tmp = zfvp->z_xattr;
219 break;
220 case ZFS_PROP_NBMAND:
221 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
222 tmp = 0;
223 if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
224 tmp = 1;
225 break;
226 default:
227 vfs_unbusy(vfsp);
228 return (ENOENT);
229 }
230
231 vfs_unbusy(vfsp);
232 if (tmp != *val) {
233 (void) strcpy(setpoint, "temporary");
234 *val = tmp;
235 }
236 return (0);
237}
238
239static int
240zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
241{
242 int error = 0;
243 char buf[32];
244 uint64_t usedobj, quotaobj;
245 uint64_t quota, used = 0;
246 timespec_t now;
247
248 usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
249 quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
250
251 if (quotaobj == 0 || zfsvfs->z_replay) {
252 error = ENOENT;
253 goto done;
254 }
255 (void) sprintf(buf, "%llx", (longlong_t)id);
256 if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
257 buf, sizeof (quota), 1, &quota)) != 0) {
258 dprintf("%s(%d): quotaobj lookup failed\n",
259 __FUNCTION__, __LINE__);
260 goto done;
261 }
262 /*
263 * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit".
264 * So we set them to be the same.
265 */
266 dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
267 error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
268 if (error && error != ENOENT) {
269 dprintf("%s(%d): usedobj failed; %d\n",
270 __FUNCTION__, __LINE__, error);
271 goto done;
272 }
273 dqp->dqb_curblocks = btodb(used);
274 dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
275 vfs_timestamp(&now);
276 /*
277 * Setting this to 0 causes FreeBSD quota(8) to print
278 * the number of days since the epoch, which isn't
279 * particularly useful.
280 */
281 dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
282done:
283 return (error);
284}
285
286static int
8dddb25d
JH
287#if __FreeBSD_version >= 1400018
288zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg, bool *mp_busy)
289#else
9f0a21e6 290zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
8dddb25d 291#endif
9f0a21e6
MM
292{
293 zfsvfs_t *zfsvfs = vfsp->vfs_data;
294 struct thread *td;
295 int cmd, type, error = 0;
296 int bitsize;
297 zfs_userquota_prop_t quota_type;
298 struct dqblk64 dqblk = { 0 };
299
300 td = curthread;
301 cmd = cmds >> SUBCMDSHIFT;
302 type = cmds & SUBCMDMASK;
303
768eaced
CC
304 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
305 return (error);
9f0a21e6
MM
306 if (id == -1) {
307 switch (type) {
308 case USRQUOTA:
309 id = td->td_ucred->cr_ruid;
310 break;
311 case GRPQUOTA:
312 id = td->td_ucred->cr_rgid;
313 break;
314 default:
315 error = EINVAL;
8dddb25d 316#if __FreeBSD_version < 1400018
9f0a21e6
MM
317 if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
318 vfs_unbusy(vfsp);
8dddb25d 319#endif
9f0a21e6
MM
320 goto done;
321 }
322 }
323 /*
324 * Map BSD type to:
325 * ZFS_PROP_USERUSED,
326 * ZFS_PROP_USERQUOTA,
327 * ZFS_PROP_GROUPUSED,
328 * ZFS_PROP_GROUPQUOTA
329 */
330 switch (cmd) {
331 case Q_SETQUOTA:
332 case Q_SETQUOTA32:
333 if (type == USRQUOTA)
334 quota_type = ZFS_PROP_USERQUOTA;
335 else if (type == GRPQUOTA)
336 quota_type = ZFS_PROP_GROUPQUOTA;
337 else
338 error = EINVAL;
339 break;
340 case Q_GETQUOTA:
341 case Q_GETQUOTA32:
342 if (type == USRQUOTA)
343 quota_type = ZFS_PROP_USERUSED;
344 else if (type == GRPQUOTA)
345 quota_type = ZFS_PROP_GROUPUSED;
346 else
347 error = EINVAL;
348 break;
349 }
350
351 /*
352 * Depending on the cmd, we may need to get
353 * the ruid and domain (see fuidstr_to_sid?),
354 * the fuid (how?), or other information.
355 * Create fuid using zfs_fuid_create(zfsvfs, id,
356 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
357 * I think I can use just the id?
358 *
359 * Look at zfs_id_overquota() to look up a quota.
360 * zap_lookup(something, quotaobj, fuidstring,
361 * sizeof (long long), 1, &quota)
362 *
363 * See zfs_set_userquota() to set a quota.
364 */
365 if ((uint32_t)type >= MAXQUOTAS) {
366 error = EINVAL;
367 goto done;
368 }
369
370 switch (cmd) {
371 case Q_GETQUOTASIZE:
372 bitsize = 64;
373 error = copyout(&bitsize, arg, sizeof (int));
374 break;
375 case Q_QUOTAON:
376 // As far as I can tell, you can't turn quotas on or off on zfs
377 error = 0;
8dddb25d 378#if __FreeBSD_version < 1400018
9f0a21e6 379 vfs_unbusy(vfsp);
8dddb25d 380#endif
9f0a21e6
MM
381 break;
382 case Q_QUOTAOFF:
383 error = ENOTSUP;
8dddb25d 384#if __FreeBSD_version < 1400018
9f0a21e6 385 vfs_unbusy(vfsp);
8dddb25d 386#endif
9f0a21e6
MM
387 break;
388 case Q_SETQUOTA:
7b0e3903 389 error = copyin(arg, &dqblk, sizeof (dqblk));
9f0a21e6
MM
390 if (error == 0)
391 error = zfs_set_userquota(zfsvfs, quota_type,
392 "", id, dbtob(dqblk.dqb_bhardlimit));
393 break;
394 case Q_GETQUOTA:
395 error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
396 if (error == 0)
397 error = copyout(&dqblk, arg, sizeof (dqblk));
398 break;
399 default:
400 error = EINVAL;
401 break;
402 }
403done:
768eaced 404 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
405 return (error);
406}
407
408
409boolean_t
410zfs_is_readonly(zfsvfs_t *zfsvfs)
411{
412 return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
413}
414
9f0a21e6
MM
415static int
416zfs_sync(vfs_t *vfsp, int waitfor)
417{
418
419 /*
420 * Data integrity is job one. We don't want a compromised kernel
421 * writing to the storage pool, so we never sync during panic.
422 */
423 if (panicstr)
424 return (0);
425
426 /*
427 * Ignore the system syncher. ZFS already commits async data
428 * at zfs_txg_timeout intervals.
429 */
430 if (waitfor == MNT_LAZY)
431 return (0);
432
433 if (vfsp != NULL) {
434 /*
435 * Sync a specific filesystem.
436 */
437 zfsvfs_t *zfsvfs = vfsp->vfs_data;
438 dsl_pool_t *dp;
439 int error;
440
768eaced
CC
441 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
442 return (error);
9f0a21e6
MM
443 dp = dmu_objset_pool(zfsvfs->z_os);
444
445 /*
446 * If the system is shutting down, then skip any
447 * filesystems which may exist on a suspended pool.
448 */
449 if (rebooting && spa_suspended(dp->dp_spa)) {
768eaced 450 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
451 return (0);
452 }
453
454 if (zfsvfs->z_log != NULL)
455 zil_commit(zfsvfs->z_log, 0);
456
768eaced 457 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
458 } else {
459 /*
460 * Sync all ZFS filesystems. This is what happens when you
76d04993 461 * run sync(8). Unlike other filesystems, ZFS honors the
9f0a21e6
MM
462 * request by waiting for all pools to commit all dirty data.
463 */
464 spa_sync_allpools();
465 }
466
467 return (0);
468}
469
470static void
471atime_changed_cb(void *arg, uint64_t newval)
472{
473 zfsvfs_t *zfsvfs = arg;
474
475 if (newval == TRUE) {
476 zfsvfs->z_atime = TRUE;
477 zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
478 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
479 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
480 } else {
481 zfsvfs->z_atime = FALSE;
482 zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
483 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
484 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
485 }
486}
487
488static void
489xattr_changed_cb(void *arg, uint64_t newval)
490{
491 zfsvfs_t *zfsvfs = arg;
492
493 if (newval == ZFS_XATTR_OFF) {
494 zfsvfs->z_flags &= ~ZSB_XATTR;
495 } else {
496 zfsvfs->z_flags |= ZSB_XATTR;
497
498 if (newval == ZFS_XATTR_SA)
499 zfsvfs->z_xattr_sa = B_TRUE;
500 else
501 zfsvfs->z_xattr_sa = B_FALSE;
502 }
503}
504
505static void
506blksz_changed_cb(void *arg, uint64_t newval)
507{
508 zfsvfs_t *zfsvfs = arg;
509 ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
510 ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
511 ASSERT(ISP2(newval));
512
513 zfsvfs->z_max_blksz = newval;
514 zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
515}
516
517static void
518readonly_changed_cb(void *arg, uint64_t newval)
519{
520 zfsvfs_t *zfsvfs = arg;
521
522 if (newval) {
523 /* XXX locking on vfs_flag? */
524 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
525 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
526 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
527 } else {
528 /* XXX locking on vfs_flag? */
529 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
530 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
531 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
532 }
533}
534
535static void
536setuid_changed_cb(void *arg, uint64_t newval)
537{
538 zfsvfs_t *zfsvfs = arg;
539
540 if (newval == FALSE) {
541 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
542 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
543 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
544 } else {
545 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
546 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
547 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
548 }
549}
550
551static void
552exec_changed_cb(void *arg, uint64_t newval)
553{
554 zfsvfs_t *zfsvfs = arg;
555
556 if (newval == FALSE) {
557 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
558 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
559 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
560 } else {
561 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
562 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
563 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
564 }
565}
566
567/*
568 * The nbmand mount option can be changed at mount time.
569 * We can't allow it to be toggled on live file systems or incorrect
570 * behavior may be seen from cifs clients
571 *
572 * This property isn't registered via dsl_prop_register(), but this callback
573 * will be called when a file system is first mounted
574 */
575static void
576nbmand_changed_cb(void *arg, uint64_t newval)
577{
578 zfsvfs_t *zfsvfs = arg;
579 if (newval == FALSE) {
580 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
581 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
582 } else {
583 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
584 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
585 }
586}
587
588static void
589snapdir_changed_cb(void *arg, uint64_t newval)
590{
591 zfsvfs_t *zfsvfs = arg;
592
593 zfsvfs->z_show_ctldir = newval;
594}
595
9f0a21e6
MM
596static void
597acl_mode_changed_cb(void *arg, uint64_t newval)
598{
599 zfsvfs_t *zfsvfs = arg;
600
601 zfsvfs->z_acl_mode = newval;
602}
603
604static void
605acl_inherit_changed_cb(void *arg, uint64_t newval)
606{
607 zfsvfs_t *zfsvfs = arg;
608
609 zfsvfs->z_acl_inherit = newval;
610}
611
485b50bb
RM
612static void
613acl_type_changed_cb(void *arg, uint64_t newval)
614{
615 zfsvfs_t *zfsvfs = arg;
616
617 zfsvfs->z_acl_type = newval;
618}
619
9f0a21e6
MM
620static int
621zfs_register_callbacks(vfs_t *vfsp)
622{
623 struct dsl_dataset *ds = NULL;
624 objset_t *os = NULL;
625 zfsvfs_t *zfsvfs = NULL;
626 uint64_t nbmand;
627 boolean_t readonly = B_FALSE;
628 boolean_t do_readonly = B_FALSE;
629 boolean_t setuid = B_FALSE;
630 boolean_t do_setuid = B_FALSE;
631 boolean_t exec = B_FALSE;
632 boolean_t do_exec = B_FALSE;
633 boolean_t xattr = B_FALSE;
634 boolean_t atime = B_FALSE;
635 boolean_t do_atime = B_FALSE;
636 boolean_t do_xattr = B_FALSE;
637 int error = 0;
638
e4efb709 639 ASSERT3P(vfsp, !=, NULL);
9f0a21e6 640 zfsvfs = vfsp->vfs_data;
e4efb709 641 ASSERT3P(zfsvfs, !=, NULL);
9f0a21e6
MM
642 os = zfsvfs->z_os;
643
644 /*
645 * This function can be called for a snapshot when we update snapshot's
646 * mount point, which isn't really supported.
647 */
648 if (dmu_objset_is_snapshot(os))
649 return (EOPNOTSUPP);
650
651 /*
652 * The act of registering our callbacks will destroy any mount
653 * options we may have. In order to enable temporary overrides
654 * of mount options, we stash away the current values and
655 * restore them after we register the callbacks.
656 */
657 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
658 !spa_writeable(dmu_objset_spa(os))) {
659 readonly = B_TRUE;
660 do_readonly = B_TRUE;
661 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
662 readonly = B_FALSE;
663 do_readonly = B_TRUE;
664 }
665 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
666 setuid = B_FALSE;
667 do_setuid = B_TRUE;
668 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
669 setuid = B_TRUE;
670 do_setuid = B_TRUE;
671 }
672 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
673 exec = B_FALSE;
674 do_exec = B_TRUE;
675 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
676 exec = B_TRUE;
677 do_exec = B_TRUE;
678 }
679 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
680 zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
681 do_xattr = B_TRUE;
682 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
683 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
684 do_xattr = B_TRUE;
685 } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
686 zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
687 do_xattr = B_TRUE;
688 } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
689 zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
690 do_xattr = B_TRUE;
691 }
692 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
693 atime = B_FALSE;
694 do_atime = B_TRUE;
695 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
696 atime = B_TRUE;
697 do_atime = B_TRUE;
698 }
699
700 /*
701 * We need to enter pool configuration here, so that we can use
702 * dsl_prop_get_int_ds() to handle the special nbmand property below.
703 * dsl_prop_get_integer() can not be used, because it has to acquire
704 * spa_namespace_lock and we can not do that because we already hold
705 * z_teardown_lock. The problem is that spa_write_cachefile() is called
706 * with spa_namespace_lock held and the function calls ZFS vnode
707 * operations to write the cache file and thus z_teardown_lock is
708 * acquired after spa_namespace_lock.
709 */
710 ds = dmu_objset_ds(os);
711 dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
712
713 /*
714 * nbmand is a special property. It can only be changed at
715 * mount time.
716 *
717 * This is weird, but it is documented to only be changeable
718 * at mount time.
719 */
720 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
721 nbmand = B_FALSE;
722 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
723 nbmand = B_TRUE;
724 } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) {
725 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
726 return (error);
727 }
728
729 /*
730 * Register property callbacks.
731 *
732 * It would probably be fine to just check for i/o error from
733 * the first prop_register(), but I guess I like to go
734 * overboard...
735 */
736 error = dsl_prop_register(ds,
737 zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
738 error = error ? error : dsl_prop_register(ds,
739 zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
740 error = error ? error : dsl_prop_register(ds,
741 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
742 error = error ? error : dsl_prop_register(ds,
743 zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
744 error = error ? error : dsl_prop_register(ds,
745 zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
746 error = error ? error : dsl_prop_register(ds,
747 zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
748 error = error ? error : dsl_prop_register(ds,
749 zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
485b50bb
RM
750 error = error ? error : dsl_prop_register(ds,
751 zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
9f0a21e6
MM
752 error = error ? error : dsl_prop_register(ds,
753 zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
754 error = error ? error : dsl_prop_register(ds,
755 zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
756 zfsvfs);
9f0a21e6
MM
757 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
758 if (error)
759 goto unregister;
760
761 /*
762 * Invoke our callbacks to restore temporary mount options.
763 */
764 if (do_readonly)
765 readonly_changed_cb(zfsvfs, readonly);
766 if (do_setuid)
767 setuid_changed_cb(zfsvfs, setuid);
768 if (do_exec)
769 exec_changed_cb(zfsvfs, exec);
770 if (do_xattr)
771 xattr_changed_cb(zfsvfs, xattr);
772 if (do_atime)
773 atime_changed_cb(zfsvfs, atime);
774
775 nbmand_changed_cb(zfsvfs, nbmand);
776
777 return (0);
778
779unregister:
780 dsl_prop_unregister_all(ds, zfsvfs);
781 return (error);
782}
783
784/*
785 * Associate this zfsvfs with the given objset, which must be owned.
786 * This will cache a bunch of on-disk state from the objset in the
787 * zfsvfs.
788 */
789static int
790zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
791{
792 int error;
793 uint64_t val;
794
795 zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
796 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
797 zfsvfs->z_os = os;
798
799 error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
800 if (error != 0)
801 return (error);
802 if (zfsvfs->z_version >
803 zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
804 (void) printf("Can't mount a version %lld file system "
805 "on a version %lld pool\n. Pool must be upgraded to mount "
806 "this file system.", (u_longlong_t)zfsvfs->z_version,
807 (u_longlong_t)spa_version(dmu_objset_spa(os)));
808 return (SET_ERROR(ENOTSUP));
809 }
810 error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
811 if (error != 0)
812 return (error);
813 zfsvfs->z_norm = (int)val;
814
815 error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
816 if (error != 0)
817 return (error);
818 zfsvfs->z_utf8 = (val != 0);
819
820 error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
821 if (error != 0)
822 return (error);
823 zfsvfs->z_case = (uint_t)val;
824
485b50bb
RM
825 error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
826 if (error != 0)
827 return (error);
828 zfsvfs->z_acl_type = (uint_t)val;
829
9f0a21e6
MM
830 /*
831 * Fold case on file systems that are always or sometimes case
832 * insensitive.
833 */
834 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
835 zfsvfs->z_case == ZFS_CASE_MIXED)
836 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
837
838 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
839 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
840
841 uint64_t sa_obj = 0;
842 if (zfsvfs->z_use_sa) {
843 /* should either have both of these objects or none */
844 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
845 &sa_obj);
846 if (error != 0)
847 return (error);
210231ed
RM
848
849 error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
850 if (error == 0 && val == ZFS_XATTR_SA)
851 zfsvfs->z_xattr_sa = B_TRUE;
9f0a21e6
MM
852 }
853
854 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
855 &zfsvfs->z_attr_table);
856 if (error != 0)
857 return (error);
858
859 if (zfsvfs->z_version >= ZPL_VERSION_SA)
860 sa_register_update_callback(os, zfs_sa_upgrade);
861
862 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
863 &zfsvfs->z_root);
864 if (error != 0)
865 return (error);
e4efb709 866 ASSERT3U(zfsvfs->z_root, !=, 0);
9f0a21e6
MM
867
868 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
869 &zfsvfs->z_unlinkedobj);
870 if (error != 0)
871 return (error);
872
873 error = zap_lookup(os, MASTER_NODE_OBJ,
874 zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
875 8, 1, &zfsvfs->z_userquota_obj);
876 if (error == ENOENT)
877 zfsvfs->z_userquota_obj = 0;
878 else if (error != 0)
879 return (error);
880
881 error = zap_lookup(os, MASTER_NODE_OBJ,
882 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
883 8, 1, &zfsvfs->z_groupquota_obj);
884 if (error == ENOENT)
885 zfsvfs->z_groupquota_obj = 0;
886 else if (error != 0)
887 return (error);
888
889 error = zap_lookup(os, MASTER_NODE_OBJ,
890 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
891 8, 1, &zfsvfs->z_projectquota_obj);
892 if (error == ENOENT)
893 zfsvfs->z_projectquota_obj = 0;
894 else if (error != 0)
895 return (error);
896
897 error = zap_lookup(os, MASTER_NODE_OBJ,
898 zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
899 8, 1, &zfsvfs->z_userobjquota_obj);
900 if (error == ENOENT)
901 zfsvfs->z_userobjquota_obj = 0;
902 else if (error != 0)
903 return (error);
904
905 error = zap_lookup(os, MASTER_NODE_OBJ,
906 zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
907 8, 1, &zfsvfs->z_groupobjquota_obj);
908 if (error == ENOENT)
909 zfsvfs->z_groupobjquota_obj = 0;
910 else if (error != 0)
911 return (error);
912
913 error = zap_lookup(os, MASTER_NODE_OBJ,
914 zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
915 8, 1, &zfsvfs->z_projectobjquota_obj);
916 if (error == ENOENT)
917 zfsvfs->z_projectobjquota_obj = 0;
918 else if (error != 0)
919 return (error);
920
921 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
922 &zfsvfs->z_fuid_obj);
923 if (error == ENOENT)
924 zfsvfs->z_fuid_obj = 0;
925 else if (error != 0)
926 return (error);
927
928 error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
929 &zfsvfs->z_shares_dir);
930 if (error == ENOENT)
931 zfsvfs->z_shares_dir = 0;
932 else if (error != 0)
933 return (error);
934
935 /*
936 * Only use the name cache if we are looking for a
937 * name on a file system that does not require normalization
938 * or case folding. We can also look there if we happen to be
939 * on a non-normalizing, mixed sensitivity file system IF we
940 * are looking for the exact name (which is always the case on
941 * FreeBSD).
942 */
943 zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
944 ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
945 !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
946
947 return (0);
948}
949
950taskq_t *zfsvfs_taskq;
951
952static void
953zfsvfs_task_unlinked_drain(void *context, int pending __unused)
954{
955
956 zfs_unlinked_drain((zfsvfs_t *)context);
957}
958
959int
960zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
961{
962 objset_t *os;
963 zfsvfs_t *zfsvfs;
964 int error;
965 boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
966
967 /*
968 * XXX: Fix struct statfs so this isn't necessary!
969 *
970 * The 'osname' is used as the filesystem's special node, which means
971 * it must fit in statfs.f_mntfromname, or else it can't be
972 * enumerated, so libzfs_mnttab_find() returns NULL, which causes
973 * 'zfs unmount' to think it's not mounted when it is.
974 */
975 if (strlen(osname) >= MNAMELEN)
976 return (SET_ERROR(ENAMETOOLONG));
977
978 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
979
980 error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
981 &os);
982 if (error != 0) {
983 kmem_free(zfsvfs, sizeof (zfsvfs_t));
984 return (error);
985 }
986
987 error = zfsvfs_create_impl(zfvp, zfsvfs, os);
988
989 return (error);
990}
991
992
993int
994zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
995{
996 int error;
997
998 zfsvfs->z_vfs = NULL;
999 zfsvfs->z_parent = zfsvfs;
1000
1001 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1002 mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
1003 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1004 offsetof(znode_t, z_link_node));
1005 TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
1006 zfsvfs_task_unlinked_drain, zfsvfs);
5ebe425a 1007 ZFS_TEARDOWN_INIT(zfsvfs);
9847f77f 1008 ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
9f0a21e6
MM
1009 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
1010 for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1011 mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1012
1013 error = zfsvfs_init(zfsvfs, os);
1014 if (error != 0) {
1015 dmu_objset_disown(os, B_TRUE, zfsvfs);
1016 *zfvp = NULL;
1017 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1018 return (error);
1019 }
1020
1021 *zfvp = zfsvfs;
1022 return (0);
1023}
1024
/*
 * Finish bringing a zfsvfs online after its objset has been opened.
 *
 * Registers the property callbacks, opens the ZIL and finally points the
 * objset's user pointer at this zfsvfs.  When 'mounting' is true the
 * dataset kstats are created first and, once the ZIL is open, the unlinked
 * set is drained (writable mounts only) and the intent log is either
 * replayed or destroyed.  When 'mounting' is false (e.g. online recv) only
 * the ZIL is reopened; the kstats are asserted to exist already.
 *
 * Returns 0 on success, EROFS for a writable mount of a dataset with an
 * incompatible encryption version, or the error returned by
 * zfs_register_callbacks() or dataset_kstats_create().
 */
static int
zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
{
	int error;

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
	    dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
		return (SET_ERROR(EROFS));

	error = zfs_register_callbacks(zfsvfs->z_vfs);
	if (error)
		return (error);

	/*
	 * If we are not mounting (ie: online recv), then we don't
	 * have to worry about replaying the log as we blocked all
	 * operations out since we closed the ZIL.
	 */
	if (mounting) {
		boolean_t readonly;

		/* A fresh mount must not have kstats attached yet. */
		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
		error = dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
		if (error)
			return (error);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);

		/*
		 * During replay we remove the read only flag to
		 * allow replays to succeed.
		 */
		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
		if (readonly != 0) {
			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
		} else {
			dsl_dir_t *dd;
			zap_stats_t zs;

			/*
			 * Seed the "nunlinks" kstat with the current size
			 * of the unlinked set; a failed zap_get_stats() is
			 * simply ignored.
			 */
			if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
			    &zs) == 0) {
				dataset_kstats_update_nunlinks_kstat(
				    &zfsvfs->z_kstat, zs.zs_num_entries);
				dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
				    "num_entries in unlinked set: %llu",
				    (u_longlong_t)zs.zs_num_entries);
			}

			zfs_unlinked_drain(zfsvfs);
			dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
			dd->dd_activity_cancelled = B_FALSE;
		}

		/*
		 * Parse and replay the intent log.
		 *
		 * Because of ziltest, this must be done after
		 * zfs_unlinked_drain().  (Further note: ziltest
		 * doesn't use readonly mounts, where
		 * zfs_unlinked_drain() isn't called.)  This is because
		 * ziltest causes spa_sync() to think it's committed,
		 * but actually it is not, so the intent log contains
		 * many txg's worth of changes.
		 *
		 * In particular, if object N is in the unlinked set in
		 * the last txg to actually sync, then it could be
		 * actually freed in a later txg and then reallocated
		 * in a yet later txg.  This would write a "create
		 * object N" record to the intent log.  Normally, this
		 * would be fine because the spa_sync() would have
		 * written out the fact that object N is free, before
		 * we could write the "create object N" intent log
		 * record.
		 *
		 * But when we are in ziltest mode, we advance the "open
		 * txg" without actually spa_sync()-ing the changes to
		 * disk.  So we would see that object N is still
		 * allocated and in the unlinked set, and there is an
		 * intent log record saying to allocate it.
		 */
		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
			if (zil_replay_disable) {
				zil_destroy(zfsvfs->z_log, B_FALSE);
			} else {
				/*
				 * The name cache is switched off for the
				 * duration of the replay and the previous
				 * setting is put back afterwards.
				 */
				boolean_t use_nc = zfsvfs->z_use_namecache;
				zfsvfs->z_use_namecache = B_FALSE;
				zfsvfs->z_replay = B_TRUE;
				zil_replay(zfsvfs->z_os, zfsvfs,
				    zfs_replay_vector);
				zfsvfs->z_replay = B_FALSE;
				zfsvfs->z_use_namecache = use_nc;
			}
		}

		/* restore readonly bit */
		if (readonly != 0)
			zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
	} else {
		/* Not mounting: the kstats from the original mount persist. */
		ASSERT3P(zfsvfs->z_kstat.dk_kstats, !=, NULL);
		zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data,
		    &zfsvfs->z_kstat.dk_zil_sums);
	}

	/*
	 * Set the objset user_ptr to track its zfsvfs.
	 */
	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);

	return (0);
}
1141
9f0a21e6
MM
1142void
1143zfsvfs_free(zfsvfs_t *zfsvfs)
1144{
1145 int i;
1146
9f0a21e6
MM
1147 zfs_fuid_destroy(zfsvfs);
1148
1149 mutex_destroy(&zfsvfs->z_znodes_lock);
1150 mutex_destroy(&zfsvfs->z_lock);
e4efb709 1151 ASSERT3U(zfsvfs->z_nr_znodes, ==, 0);
9f0a21e6 1152 list_destroy(&zfsvfs->z_all_znodes);
5ebe425a 1153 ZFS_TEARDOWN_DESTROY(zfsvfs);
9847f77f 1154 ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
9f0a21e6
MM
1155 rw_destroy(&zfsvfs->z_fuid_lock);
1156 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1157 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
4547fc4e 1158 dataset_kstats_destroy(&zfsvfs->z_kstat);
9f0a21e6
MM
1159 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1160}
1161
1162static void
1163zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
1164{
1165 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
9f0a21e6
MM
1166 zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
1167}
1168
/*
 * Perform the filesystem-specific part of mounting dataset 'osname' on
 * 'vfsp': create the zfsvfs_t, set up the mount flags, block sizes and
 * fsid, and then either configure a (read-only) snapshot directly or run
 * zfsvfs_setup() for a regular filesystem.
 *
 * Returns 0 on success.  On any failure after zfsvfs_create() succeeded the
 * objset is disowned and the zfsvfs_t is freed before the error is
 * returned.
 */
static int
zfs_domount(vfs_t *vfsp, char *osname)
{
	uint64_t recordsize, fsid_guid;
	int error = 0;
	zfsvfs_t *zfsvfs;

	ASSERT3P(vfsp, !=, NULL);
	ASSERT3P(osname, !=, NULL);

	error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
	if (error)
		return (error);
	zfsvfs->z_vfs = vfsp;

	if ((error = dsl_prop_get_integer(osname,
	    "recordsize", &recordsize, NULL)))
		goto out;
	/* Smallest supported block as block size, recordsize as I/O size. */
	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;

	vfsp->vfs_data = zfsvfs;
	vfsp->mnt_flag |= MNT_LOCAL;
	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
	/*
	 * This can cause a loss of coherence between ARC and page cache
	 * on ZoF - unclear if the problem is in FreeBSD or ZoF
	 */
	vfsp->mnt_kern_flag |= MNTK_NO_IOPF;	/* vn_io_fault can be used */
	vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
	vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;

#if defined(_KERNEL) && !defined(KMEM_DEBUG)
	vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
#endif
	/*
	 * The fsid is 64 bits, composed of an 8-bit fs type, which
	 * separates our fsid from any other filesystem types, and a
	 * 56-bit objset unique ID.  The objset unique ID is unique to
	 * all objsets open on this system, provided by unique_create().
	 * The 8-bit fs type must be put in the low bits of fsid[1]
	 * because that's where other Solaris filesystems put it.
	 */
	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
	/* The top 8 bits must be clear so the type byte fits. */
	ASSERT3U((fsid_guid & ~((1ULL << 56) - 1)), ==, 0);
	vfsp->vfs_fsid.val[0] = fsid_guid;
	vfsp->vfs_fsid.val[1] = ((fsid_guid >> 32) << 8) |
	    (vfsp->mnt_vfc->vfc_typenum & 0xFF);

	/*
	 * Set features for file system.
	 */
	zfs_set_fuid_feature(zfsvfs);

	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
		uint64_t pval;

		/*
		 * Snapshots skip zfsvfs_setup(): the property callbacks are
		 * invoked by hand (atime off, readonly on, xattr and acltype
		 * from the dataset properties) and the objset user pointer
		 * is set here instead.
		 */
		atime_changed_cb(zfsvfs, B_FALSE);
		readonly_changed_cb(zfsvfs, B_TRUE);
		if ((error = dsl_prop_get_integer(osname,
		    "xattr", &pval, NULL)))
			goto out;
		xattr_changed_cb(zfsvfs, pval);
		if ((error = dsl_prop_get_integer(osname,
		    "acltype", &pval, NULL)))
			goto out;
		acl_type_changed_cb(zfsvfs, pval);
		zfsvfs->z_issnap = B_TRUE;
		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;

		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
	} else {
		if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
			goto out;
	}

	vfs_mountedfrom(vfsp, osname);

	/* Only regular filesystems get a '.zfs' control directory. */
	if (!zfsvfs->z_issnap)
		zfsctl_create(zfsvfs);
out:
	if (error) {
		dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
		zfsvfs_free(zfsvfs);
	} else {
		/* Counted here, uncounted in zfs_freevfs(). */
		atomic_inc_32(&zfs_active_fs_count);
	}

	return (error);
}
1263
65c7cc49 1264static void
9f0a21e6
MM
1265zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
1266{
1267 objset_t *os = zfsvfs->z_os;
1268
1269 if (!dmu_objset_is_snapshot(os))
1270 dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
1271}
1272
9f0a21e6
MM
1273static int
1274getpoolname(const char *osname, char *poolname)
1275{
1276 char *p;
1277
1278 p = strchr(osname, '/');
1279 if (p == NULL) {
1280 if (strlen(osname) >= MAXNAMELEN)
1281 return (ENAMETOOLONG);
1282 (void) strcpy(poolname, osname);
1283 } else {
1284 if (p - osname >= MAXNAMELEN)
1285 return (ENAMETOOLONG);
7584fbe8 1286 (void) strlcpy(poolname, osname, p - osname + 1);
9f0a21e6
MM
1287 }
1288 return (0);
1289}
1290
e464f7c7
MZ
/*
 * Parse option markers embedded in a dataset name.
 *
 * A leading '!' requests a checkpoint rewind: it is stripped from 'name'
 * in place and '*checkpointrewind' is set to true.  Otherwise 'name' is
 * left untouched and '*checkpointrewind' is set to false.
 */
static void
fetch_osname_options(char *name, bool *checkpointrewind)
{
	bool has_marker = (name[0] == '!');

	if (has_marker) {
		/*
		 * strlen(name) counts the '!' but not the NUL, which is
		 * exactly the remainder of the string plus its terminator.
		 */
		memmove(name, name + 1, strlen(name));
	}
	*checkpointrewind = has_marker;
}
1302
9f0a21e6
MM
/*
 * VFS mount entry point.
 *
 * Reads the dataset name from the "from" mount option, performs the
 * privilege / delegation / jail visibility checks, and then either
 * refreshes the property callbacks (remount), or imports the root pool if
 * needed and calls zfs_domount() for a new mount.
 *
 * Returns 0 on success or an errno value.
 */
static int
zfs_mount(vfs_t *vfsp)
{
	kthread_t *td = curthread;
	vnode_t *mvp = vfsp->mnt_vnodecovered;
	cred_t *cr = td->td_ucred;
	char *osname;
	int error = 0;
	int canwrite;
	bool checkpointrewind, isctlsnap = false;

	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
		return (SET_ERROR(EINVAL));

	/*
	 * If full-owner-access is enabled and delegated administration is
	 * turned on, we must set nosuid.
	 */
	if (zfs_super_owner &&
	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/* Strips a leading '!' from osname in place. */
	fetch_osname_options(osname, &checkpointrewind);
	/* A snapshot name being mounted on top of a '.zfs' control node. */
	isctlsnap = (mvp != NULL && zfsctl_is_node(mvp) &&
	    strchr(osname, '@') != NULL);

	/*
	 * Check for mount privilege?
	 *
	 * If we don't have privilege then see if
	 * we have local permission to allow it
	 */
	error = secpolicy_fs_mount(cr, mvp, vfsp);
	if (error && isctlsnap) {
		/*
		 * Unprivileged control-directory snapshot mounts proceed
		 * with the mount options cleared; 'error' is overwritten
		 * by whichever path is taken below.
		 */
		secpolicy_fs_mount_clearopts(cr, vfsp);
	} else if (error) {
		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
			goto out;

		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
			vattr_t vattr;

			/*
			 * Make sure user is the owner of the mount point
			 * or has sufficient privileges.
			 */

			vattr.va_mask = AT_UID;

			vn_lock(mvp, LK_SHARED | LK_RETRY);
			if (VOP_GETATTR(mvp, &vattr, cr)) {
				VOP_UNLOCK1(mvp);
				goto out;
			}

			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
				VOP_UNLOCK1(mvp);
				goto out;
			}
			VOP_UNLOCK1(mvp);
		}

		secpolicy_fs_mount_clearopts(cr, vfsp);
	}

	/*
	 * Refuse to mount a filesystem if we are in a local zone and the
	 * dataset is not visible.
	 */
	if (!INGLOBALZONE(curproc) &&
	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
		boolean_t mount_snapshot = B_FALSE;

		/*
		 * Snapshots may be mounted in .zfs for unjailed datasets
		 * if allowed by the jail param zfs.mount_snapshot.
		 */
		if (isctlsnap) {
			struct prison *pr;
			struct zfs_jailparam *zjp;

			pr = curthread->td_ucred->cr_prison;
			mtx_lock(&pr->pr_mtx);
			zjp = osd_jail_get(pr, zfs_jailparam_slot);
			mtx_unlock(&pr->pr_mtx);
			if (zjp && zjp->mount_snapshot)
				mount_snapshot = B_TRUE;
		}
		if (!mount_snapshot) {
			error = SET_ERROR(EPERM);
			goto out;
		}
	}

	vfsp->vfs_flag |= MNT_NFS4ACLS;

	/*
	 * When doing a remount, we simply refresh our temporary properties
	 * according to those options set in the current VFS options.
	 */
	if (vfsp->vfs_flag & MS_REMOUNT) {
		zfsvfs_t *zfsvfs = vfsp->vfs_data;

		/*
		 * Refresh mount options with z_teardown_lock blocking I/O while
		 * the filesystem is in an inconsistent state.
		 * The lock also serializes this code with filesystem
		 * manipulations between entry to zfs_suspend_fs() and return
		 * from zfs_resume_fs().
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfs_unregister_callbacks(zfsvfs);
		error = zfs_register_callbacks(vfsp);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		goto out;
	}

	/* Initial root mount: try hard to import the requested root pool. */
	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
		char pname[MAXNAMELEN];

		error = getpoolname(osname, pname);
		if (error == 0)
			error = spa_import_rootpool(pname, checkpointrewind);
		if (error)
			goto out;
	}
	DROP_GIANT();
	error = zfs_domount(vfsp, osname);
	PICKUP_GIANT();

out:
	return (error);
}
1440
1441static int
1442zfs_statfs(vfs_t *vfsp, struct statfs *statp)
1443{
1444 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1445 uint64_t refdbytes, availbytes, usedobjs, availobjs;
768eaced 1446 int error;
9f0a21e6
MM
1447
1448 statp->f_version = STATFS_VERSION;
1449
768eaced
CC
1450 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1451 return (error);
9f0a21e6
MM
1452
1453 dmu_objset_space(zfsvfs->z_os,
1454 &refdbytes, &availbytes, &usedobjs, &availobjs);
1455
1456 /*
1457 * The underlying storage pool actually uses multiple block sizes.
1458 * We report the fragsize as the smallest block size we support,
1459 * and we report our blocksize as the filesystem's maximum blocksize.
1460 */
1461 statp->f_bsize = SPA_MINBLOCKSIZE;
1462 statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
1463
1464 /*
1465 * The following report "total" blocks of various kinds in the
1466 * file system, but reported in terms of f_frsize - the
1467 * "fragment" size.
1468 */
1469
1470 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1471 statp->f_bfree = availbytes / statp->f_bsize;
1472 statp->f_bavail = statp->f_bfree; /* no root reservation */
1473
1474 /*
1475 * statvfs() should really be called statufs(), because it assumes
1476 * static metadata. ZFS doesn't preallocate files, so the best
1477 * we can do is report the max that could possibly fit in f_files,
1478 * and that minus the number actually used in f_ffree.
1479 * For f_ffree, report the smaller of the number of object available
1480 * and the number of blocks (each object will take at least a block).
1481 */
1482 statp->f_ffree = MIN(availobjs, statp->f_bfree);
1483 statp->f_files = statp->f_ffree + usedobjs;
1484
1485 /*
1486 * We're a zfs filesystem.
1487 */
1488 strlcpy(statp->f_fstypename, "zfs",
1489 sizeof (statp->f_fstypename));
1490
1491 strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
1492 sizeof (statp->f_mntfromname));
1493 strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
1494 sizeof (statp->f_mntonname));
1495
1496 statp->f_namemax = MAXNAMELEN - 1;
1497
768eaced 1498 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
1499 return (0);
1500}
1501
1502static int
1503zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
1504{
1505 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1506 znode_t *rootzp;
1507 int error;
1508
768eaced
CC
1509 if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
1510 return (error);
9f0a21e6
MM
1511
1512 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1513 if (error == 0)
1514 *vpp = ZTOV(rootzp);
1515
768eaced 1516 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
1517
1518 if (error == 0) {
1519 error = vn_lock(*vpp, flags);
1520 if (error != 0) {
1521 VN_RELE(*vpp);
1522 *vpp = NULL;
1523 }
1524 }
1525 return (error);
1526}
1527
/*
 * Teardown the zfsvfs::z_os.
 *
 * Waits for outstanding zrele tasks, takes the teardown locks, closes the
 * ZIL, releases every znode's dbuf hold, unregisters the property
 * callbacks and evicts cached dbufs.
 *
 * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
 * and 'z_teardown_inactive_lock' held.  The exception is the EIO return
 * below, where both locks are dropped before returning.
 *
 * Returns 0 on success, or EIO when not unmounting and the filesystem was
 * already unmounted or has no objset.
 */
static int
zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
	znode_t *zp;
	dsl_dir_t *dd;

	/*
	 * If someone has not already unmounted this file system,
	 * drain the zrele_taskq to ensure all active references to the
	 * zfsvfs_t have been handled; only then can it be safely destroyed.
	 */
	if (zfsvfs->z_os) {
		/*
		 * If we're unmounting we have to wait for the list to
		 * drain completely.
		 *
		 * If we're not unmounting there's no guarantee the list
		 * will drain completely, but zreles run from the taskq
		 * may add the parents of dir-based xattrs to the taskq
		 * so we want to wait for these.
		 *
		 * We can safely read z_nr_znodes without locking because the
		 * VFS has already blocked operations which add to the
		 * z_all_znodes list and thus increment z_nr_znodes.
		 */
		int round = 0;
		while (zfsvfs->z_nr_znodes > 0) {
			taskq_wait_outstanding(dsl_pool_zrele_taskq(
			    dmu_objset_pool(zfsvfs->z_os)), 0);
			/* When not unmounting, give up after two rounds. */
			if (++round > 1 && !unmounting)
				break;
		}
	}
	ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);

	if (!unmounting) {
		/*
		 * We purge the parent filesystem's vfsp as the parent
		 * filesystem and all of its snapshots have their vnode's
		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
		 * 'z_parent' is self referential for non-snapshots.
		 */
#ifdef FREEBSD_NAMECACHE
#if __FreeBSD_version >= 1300117
		cache_purgevfs(zfsvfs->z_parent->z_vfs);
#else
		cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
#endif
#endif
	}

	/*
	 * Close the zil. NB: Can't close the zil while zfs_inactive
	 * threads are blocked as zil_close can call zfs_inactive.
	 */
	if (zfsvfs->z_log) {
		zil_close(zfsvfs->z_log);
		zfsvfs->z_log = NULL;
	}

	ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);

	/*
	 * If we are not unmounting (ie: online recv) and someone already
	 * unmounted this file system while we were doing the switcheroo,
	 * or a reopen of z_os failed then just bail out now.
	 */
	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
		return (SET_ERROR(EIO));
	}

	/*
	 * At this point there are no vops active, and any new vops will
	 * fail with EIO since we have z_teardown_lock for writer (only
	 * relevant for forced unmount).
	 *
	 * Release all holds on dbufs.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		if (zp->z_sa_hdl != NULL) {
			zfs_znode_dmu_fini(zp);
		}
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	/*
	 * If we are unmounting, set the unmounted flag and let new vops
	 * unblock.  zfs_inactive will have the unmounted behavior, and all
	 * other vops will fail with EIO.
	 */
	if (unmounting) {
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * z_os will be NULL if there was an error in attempting to reopen
	 * zfsvfs, so just return as the properties had already been
	 * unregistered and cached data had been evicted before.
	 */
	if (zfsvfs->z_os == NULL)
		return (0);

	/*
	 * Unregister properties.
	 */
	zfs_unregister_callbacks(zfsvfs);

	/*
	 * Evict cached data
	 */
	if (!zfs_is_readonly(zfsvfs))
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	dmu_objset_evict_dbufs(zfsvfs->z_os);
	dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
	dsl_dir_cancel_waiters(dd);

	return (0);
}
1658
9f0a21e6
MM
/*
 * VFS unmount entry point.
 *
 * Checks permission, unmounts snapshots mounted under '.zfs', flushes the
 * vnodes, cancels the unlinked-drain task, tears the filesystem down,
 * disowns the objset and finally frees the zfsvfs_t via zfs_freevfs().
 *
 * Returns 0 on success; in that case the zfsvfs_t has been freed and must
 * not be touched by the caller.  A non-zero return comes from the
 * permission check, zfsctl_umount_snapshots() or vflush(), all of which
 * happen before anything is freed.
 */
static int
zfs_umount(vfs_t *vfsp, int fflag)
{
	kthread_t *td = curthread;
	zfsvfs_t *zfsvfs = vfsp->vfs_data;
	objset_t *os;
	cred_t *cr = td->td_ucred;
	int ret;

	/*
	 * Without the unmount privilege, fall back to the delegated
	 * "mount" permission on the dataset.
	 */
	ret = secpolicy_fs_unmount(cr, vfsp);
	if (ret) {
		if (dsl_deleg_access((char *)vfsp->vfs_resource,
		    ZFS_DELEG_PERM_MOUNT, cr))
			return (ret);
	}

	/*
	 * Unmount any snapshots mounted under .zfs before unmounting the
	 * dataset itself.
	 */
	if (zfsvfs->z_ctldir != NULL) {
		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
			return (ret);
	}

	if (fflag & MS_FORCE) {
		/*
		 * Mark file system as unmounted before calling
		 * vflush(FORCECLOSE).  This way we ensure no future vnops
		 * will be called and risk operating on DOOMED vnodes.
		 */
		ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
		zfsvfs->z_unmounted = B_TRUE;
		ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
	}

	/*
	 * Flush all the files.
	 */
	ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
	if (ret != 0)
		return (ret);
	/*
	 * Cancel the unlinked-drain task; while cancellation reports
	 * failure, drain the task and try again.
	 */
	while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
	    &zfsvfs->z_unlinked_drain_task, NULL) != 0)
		taskqueue_drain(zfsvfs_taskq->tq_queue,
		    &zfsvfs->z_unlinked_drain_task);

	VERIFY0(zfsvfs_teardown(zfsvfs, B_TRUE));
	os = zfsvfs->z_os;

	/*
	 * z_os will be NULL if there was an error in
	 * attempting to reopen zfsvfs.
	 */
	if (os != NULL) {
		/*
		 * Unset the objset user_ptr.
		 */
		mutex_enter(&os->os_user_ptr_lock);
		dmu_objset_set_user(os, NULL);
		mutex_exit(&os->os_user_ptr_lock);

		/*
		 * Finally release the objset
		 */
		dmu_objset_disown(os, B_TRUE, zfsvfs);
	}

	/*
	 * We can now safely destroy the '.zfs' directory node.
	 */
	if (zfsvfs->z_ctldir != NULL)
		zfsctl_destroy(zfsvfs);
	/* Frees the zfsvfs_t; it must not be dereferenced after this. */
	zfs_freevfs(vfsp);

	return (0);
}
1736
1737static int
1738zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
1739{
1740 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1741 znode_t *zp;
1742 int err;
1743
1744 /*
1745 * zfs_zget() can't operate on virtual entries like .zfs/ or
1746 * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP.
1747 * This will make NFS to switch to LOOKUP instead of using VGET.
1748 */
1749 if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
1750 (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
1751 return (EOPNOTSUPP);
1752
768eaced
CC
1753 if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1754 return (err);
9f0a21e6
MM
1755 err = zfs_zget(zfsvfs, ino, &zp);
1756 if (err == 0 && zp->z_unlinked) {
1757 vrele(ZTOV(zp));
1758 err = EINVAL;
1759 }
1760 if (err == 0)
1761 *vpp = ZTOV(zp);
768eaced 1762 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
1763 if (err == 0) {
1764 err = vn_lock(*vpp, flags);
1765 if (err != 0)
1766 vrele(*vpp);
1767 }
1768 if (err != 0)
1769 *vpp = NULL;
1770 return (err);
1771}
1772
1c08fa8b 1773static int
2e6af52b 1774#if __FreeBSD_version >= 1300098
1c08fa8b
RM
1775zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
1776 struct ucred **credanonp, int *numsecflavors, int *secflavors)
2e6af52b 1777#else
9f0a21e6
MM
1778zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
1779 struct ucred **credanonp, int *numsecflavors, int **secflavors)
2e6af52b 1780#endif
9f0a21e6
MM
1781{
1782 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1783
1784 /*
1785 * If this is regular file system vfsp is the same as
1786 * zfsvfs->z_parent->z_vfs, but if it is snapshot,
1787 * zfsvfs->z_parent->z_vfs represents parent file system
1788 * which we have to use here, because only this file system
1789 * has mnt_export configured.
1790 */
1791 return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
1792 credanonp, numsecflavors, secflavors));
1793}
1794
c70bb2f6
AZ
/*
 * zfs_fhtovp() casts its fid_t argument to zfid_short_t and zfid_long_t,
 * so refuse to build unless struct fid is at least SHORT_FID_LEN and
 * LONG_FID_LEN bytes (presumably fid_t is struct fid - confirm).
 * NOTE(review): the diagnostic text says "bigger than", yet these
 * assertions fire when struct fid is smaller than the given length.
 */
_Static_assert(sizeof (struct fid) >= SHORT_FID_LEN,
	"struct fid bigger than SHORT_FID_LEN");
_Static_assert(sizeof (struct fid) >= LONG_FID_LEN,
	"struct fid bigger than LONG_FID_LEN");
9f0a21e6
MM
1799
1800static int
1801zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
1802{
1803 struct componentname cn;
1804 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1805 znode_t *zp;
1806 vnode_t *dvp;
1807 uint64_t object = 0;
1808 uint64_t fid_gen = 0;
43dbf881 1809 uint64_t setgen = 0;
9f0a21e6
MM
1810 uint64_t gen_mask;
1811 uint64_t zp_gen;
1812 int i, err;
1813
1814 *vpp = NULL;
1815
768eaced
CC
1816 if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1817 return (err);
9f0a21e6
MM
1818
1819 /*
1820 * On FreeBSD we can get snapshot's mount point or its parent file
1821 * system mount point depending if snapshot is already mounted or not.
1822 */
1823 if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
1824 zfid_long_t *zlfid = (zfid_long_t *)fidp;
1825 uint64_t objsetid = 0;
9f0a21e6
MM
1826
1827 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1828 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1829
1830 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1831 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1832
768eaced 1833 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
1834
1835 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1836 if (err)
1837 return (SET_ERROR(EINVAL));
768eaced
CC
1838 if ((err = zfs_enter(zfsvfs, FTAG)) != 0)
1839 return (err);
9f0a21e6
MM
1840 }
1841
1842 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1843 zfid_short_t *zfid = (zfid_short_t *)fidp;
1844
1845 for (i = 0; i < sizeof (zfid->zf_object); i++)
1846 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1847
1848 for (i = 0; i < sizeof (zfid->zf_gen); i++)
1849 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1850 } else {
768eaced 1851 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
1852 return (SET_ERROR(EINVAL));
1853 }
1854
ed566bf1
MJ
1855 if (fidp->fid_len == LONG_FID_LEN && setgen != 0) {
1856 zfs_exit(zfsvfs, FTAG);
43dbf881
AZ
1857 dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n",
1858 (u_longlong_t)fid_gen, (u_longlong_t)setgen);
1859 return (SET_ERROR(EINVAL));
1860 }
1861
9f0a21e6
MM
1862 /*
1863 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
1864 * directory tree. If the object == zfsvfs->z_shares_dir, then
1865 * we are in the .zfs/shares directory tree.
1866 */
1867 if ((fid_gen == 0 &&
1868 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
1869 (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
768eaced 1870 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
1871 VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
1872 if (object == ZFSCTL_INO_SNAPDIR) {
1873 cn.cn_nameptr = "snapshot";
1874 cn.cn_namelen = strlen(cn.cn_nameptr);
1875 cn.cn_nameiop = LOOKUP;
1876 cn.cn_flags = ISLASTCN | LOCKLEAF;
1877 cn.cn_lkflags = flags;
1878 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1879 vput(dvp);
1880 } else if (object == zfsvfs->z_shares_dir) {
1881 /*
1882 * XXX This branch must not be taken,
1883 * if it is, then the lookup below will
1884 * explode.
1885 */
1886 cn.cn_nameptr = "shares";
1887 cn.cn_namelen = strlen(cn.cn_nameptr);
1888 cn.cn_nameiop = LOOKUP;
1889 cn.cn_flags = ISLASTCN;
1890 cn.cn_lkflags = flags;
1891 VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
1892 vput(dvp);
1893 } else {
1894 *vpp = dvp;
1895 }
1896 return (err);
1897 }
1898
1899 gen_mask = -1ULL >> (64 - 8 * i);
1900
8e739b2c
RE
1901 dprintf("getting %llu [%llu mask %llx]\n", (u_longlong_t)object,
1902 (u_longlong_t)fid_gen,
1903 (u_longlong_t)gen_mask);
9f0a21e6 1904 if ((err = zfs_zget(zfsvfs, object, &zp))) {
768eaced 1905 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
1906 return (err);
1907 }
1908 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
1909 sizeof (uint64_t));
1910 zp_gen = zp_gen & gen_mask;
1911 if (zp_gen == 0)
1912 zp_gen = 1;
1913 if (zp->z_unlinked || zp_gen != fid_gen) {
8e739b2c
RE
1914 dprintf("znode gen (%llu) != fid gen (%llu)\n",
1915 (u_longlong_t)zp_gen, (u_longlong_t)fid_gen);
9f0a21e6 1916 vrele(ZTOV(zp));
768eaced 1917 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
1918 return (SET_ERROR(EINVAL));
1919 }
1920
1921 *vpp = ZTOV(zp);
768eaced 1922 zfs_exit(zfsvfs, FTAG);
9f0a21e6
MM
1923 err = vn_lock(*vpp, flags);
1924 if (err == 0)
1925 vnode_create_vobject(*vpp, zp->z_size, curthread);
1926 else
1927 *vpp = NULL;
1928 return (err);
1929}
1930
1931/*
1932 * Block out VOPs and close zfsvfs_t::z_os
1933 *
1934 * Note, if successful, then we return with the 'z_teardown_lock' and
1935 * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
1936 * dataset and objset intact so that they can be atomically handed off during
1937 * a subsequent rollback or recv operation and the resume thereafter.
1938 */
1939int
1940zfs_suspend_fs(zfsvfs_t *zfsvfs)
1941{
1942 int error;
1943
1944 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1945 return (error);
1946
1947 return (0);
1948}
1949
/*
 * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
 * is an invariant across any of the operations that can be performed while the
 * filesystem was suspended.  Whether it succeeded or failed, the preconditions
 * are the same: the relevant objset and associated dataset are owned by
 * zfsvfs, held, and long held on entry.
 *
 * Both teardown locks must be write held on entry; they are released
 * before returning.  Returns 0 on success or the error from zfsvfs_init(),
 * in which case a forced unmount of the filesystem is attempted.
 */
int
zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
	int err;
	znode_t *zp;

	ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
	ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));

	/*
	 * We already own this, so just update the objset_t, as the one we
	 * had before may have been evicted.
	 */
	objset_t *os;
	VERIFY3P(ds->ds_owner, ==, zfsvfs);
	VERIFY(dsl_dataset_long_held(ds));
	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
	dsl_pool_config_enter(dp, FTAG);
	VERIFY0(dmu_objset_from_ds(ds, &os));
	dsl_pool_config_exit(dp, FTAG);

	err = zfsvfs_init(zfsvfs, os);
	if (err != 0)
		goto bail;

	ds->ds_dir->dd_activity_cancelled = B_FALSE;
	/* Not a fresh mount: reopen the ZIL without replaying it. */
	VERIFY0(zfsvfs_setup(zfsvfs, B_FALSE));

	zfs_set_fuid_feature(zfsvfs);

	/*
	 * Attempt to re-establish all the active znodes with
	 * their dbufs.  If a zfs_rezget() fails, then we'll let
	 * any potential callers discover that via zfs_enter_verify_zp
	 * when they try to use their znode.
	 */
	mutex_enter(&zfsvfs->z_znodes_lock);
	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
		(void) zfs_rezget(zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

bail:
	/* release the VOPs */
	ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
	ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);

	if (err) {
		/*
		 * Since we couldn't setup the sa framework, try to force
		 * unmount this file system.
		 */
		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
			vfs_ref(zfsvfs->z_vfs);
			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
		}
	}
	return (err);
}
2017
2018static void
2019zfs_freevfs(vfs_t *vfsp)
2020{
2021 zfsvfs_t *zfsvfs = vfsp->vfs_data;
2022
2023 zfsvfs_free(zfsvfs);
2024
2025 atomic_dec_32(&zfs_active_fs_count);
2026}
2027
#ifdef __i386__
/*
 * Value of desiredvnodes saved by zfs_vnodes_adjust() so that
 * zfs_vnodes_adjust_back() can restore it.  i386 only.
 */
static int desiredvnodes_backup;
#include <sys/vmmeter.h>


#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#endif
2038
/*
 * On i386, remember the current desiredvnodes and, if it still equals the
 * value computed below, lower it to three quarters of that value.  A no-op
 * on every other architecture.
 */
static void
zfs_vnodes_adjust(void)
{
#ifdef __i386__
	int newdesiredvnodes;

	/* Saved so zfs_vnodes_adjust_back() can undo the change. */
	desiredvnodes_backup = desiredvnodes;

	/*
	 * We calculate newdesiredvnodes the same way it is done in
	 * vntblinit().  If it is equal to desiredvnodes, it means that
	 * it wasn't tuned by the administrator and we can tune it down.
	 */
	newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
	    vm_kmem_size / (5 * (sizeof (struct vm_object) +
	    sizeof (struct vnode))));
	if (newdesiredvnodes == desiredvnodes)
		desiredvnodes = (3 * newdesiredvnodes) / 4;
#endif
}
2059
/*
 * Undo zfs_vnodes_adjust(): on i386 put back the desiredvnodes value that
 * was saved there.  A no-op on every other architecture.
 */
static void
zfs_vnodes_adjust_back(void)
{
#ifdef __i386__
	desiredvnodes = desiredvnodes_backup;
#endif
}
2068
/*
 * One-time ZPL initialization: announce the ZPL version, set up the
 * '.zfs' and znode subsystems, adjust the vnode limit (i386 only),
 * register the ZFS objset type and create the "zfsvfs" taskq.
 * zfs_fini() is the counterpart.
 */
void
zfs_init(void)
{

	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");

	/*
	 * Initialize .zfs directory structures
	 */
	zfsctl_init();

	/*
	 * Initialize znode cache, vnode ops, etc...
	 */
	zfs_znode_init();

	/*
	 * Reduce number of vnodes.  Originally number of vnodes is calculated
	 * with UFS inode in mind.  We reduce it here, because it's too big for
	 * ZFS/i386.
	 */
	zfs_vnodes_adjust();

	dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);

	/* Taskq whose queue zfs_umount() cancels the unlinked-drain task on. */
	zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
}
2096
/*
 * Undo zfs_init(): destroy the "zfsvfs" taskq, shut down the '.zfs' and
 * znode subsystems and restore the vnode limit changed by
 * zfs_vnodes_adjust().
 */
void
zfs_fini(void)
{
	taskq_destroy(zfsvfs_taskq);
	zfsctl_fini();
	zfs_znode_fini();
	zfs_vnodes_adjust_back();
}
2105
2106int
2107zfs_busy(void)
2108{
2109 return (zfs_active_fs_count != 0);
2110}
2111
2112/*
2113 * Release VOPs and unmount a suspended filesystem.
2114 */
2115int
2116zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
2117{
5ebe425a 2118 ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
9847f77f 2119 ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
9f0a21e6
MM
2120
2121 /*
2122 * We already own this, so just hold and rele it to update the
2123 * objset_t, as the one we had before may have been evicted.
2124 */
2125 objset_t *os;
2126 VERIFY3P(ds->ds_owner, ==, zfsvfs);
2127 VERIFY(dsl_dataset_long_held(ds));
2128 dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
2129 dsl_pool_config_enter(dp, FTAG);
2130 VERIFY0(dmu_objset_from_ds(ds, &os));
2131 dsl_pool_config_exit(dp, FTAG);
2132 zfsvfs->z_os = os;
2133
2134 /* release the VOPs */
9847f77f 2135 ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
5ebe425a 2136 ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
9f0a21e6
MM
2137
2138 /*
2139 * Try to force unmount this file system.
2140 */
2141 (void) zfs_umount(zfsvfs->z_vfs, 0);
2142 zfsvfs->z_unmounted = B_TRUE;
2143 return (0);
2144}
2145
2146int
2147zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
2148{
2149 int error;
2150 objset_t *os = zfsvfs->z_os;
2151 dmu_tx_t *tx;
2152
2153 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
2154 return (SET_ERROR(EINVAL));
2155
2156 if (newvers < zfsvfs->z_version)
2157 return (SET_ERROR(EINVAL));
2158
2159 if (zfs_spa_version_map(newvers) >
2160 spa_version(dmu_objset_spa(zfsvfs->z_os)))
2161 return (SET_ERROR(ENOTSUP));
2162
2163 tx = dmu_tx_create(os);
2164 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
2165 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2166 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
2167 ZFS_SA_ATTRS);
2168 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2169 }
2170 error = dmu_tx_assign(tx, TXG_WAIT);
2171 if (error) {
2172 dmu_tx_abort(tx);
2173 return (error);
2174 }
2175
2176 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
2177 8, 1, &newvers, tx);
2178
2179 if (error) {
2180 dmu_tx_commit(tx);
2181 return (error);
2182 }
2183
2184 if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
2185 uint64_t sa_obj;
2186
2187 ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
2188 SPA_VERSION_SA);
2189 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
2190 DMU_OT_NONE, 0, tx);
2191
2192 error = zap_add(os, MASTER_NODE_OBJ,
2193 ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
2194 ASSERT0(error);
2195
e4efb709 2196 VERIFY0(sa_set_sa_object(os, sa_obj));
9f0a21e6
MM
2197 sa_register_update_callback(os, zfs_sa_upgrade);
2198 }
2199
2200 spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
47ed79ff
MM
2201 "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
2202 (uintmax_t)newvers);
9f0a21e6
MM
2203 dmu_tx_commit(tx);
2204
2205 zfsvfs->z_version = newvers;
2206 os->os_version = newvers;
2207
2208 zfs_set_fuid_feature(zfsvfs);
2209
2210 return (0);
2211}
2212
2213/*
2214 * Read a property stored within the master node.
2215 */
2216int
2217zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2218{
2219 uint64_t *cached_copy = NULL;
2220
2221 /*
2222 * Figure out where in the objset_t the cached copy would live, if it
2223 * is available for the requested property.
2224 */
2225 if (os != NULL) {
2226 switch (prop) {
2227 case ZFS_PROP_VERSION:
2228 cached_copy = &os->os_version;
2229 break;
2230 case ZFS_PROP_NORMALIZE:
2231 cached_copy = &os->os_normalization;
2232 break;
2233 case ZFS_PROP_UTF8ONLY:
2234 cached_copy = &os->os_utf8only;
2235 break;
2236 case ZFS_PROP_CASE:
2237 cached_copy = &os->os_casesensitivity;
2238 break;
2239 default:
2240 break;
2241 }
2242 }
2243 if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
2244 *value = *cached_copy;
2245 return (0);
2246 }
2247
2248 /*
2249 * If the property wasn't cached, look up the file system's value for
2250 * the property. For the version property, we look up a slightly
2251 * different string.
2252 */
2253 const char *pname;
2254 int error = ENOENT;
2255 if (prop == ZFS_PROP_VERSION) {
2256 pname = ZPL_VERSION_STR;
2257 } else {
2258 pname = zfs_prop_to_name(prop);
2259 }
2260
2261 if (os != NULL) {
2262 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2263 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2264 }
2265
2266 if (error == ENOENT) {
2267 /* No value set, use the default value */
2268 switch (prop) {
2269 case ZFS_PROP_VERSION:
2270 *value = ZPL_VERSION;
2271 break;
2272 case ZFS_PROP_NORMALIZE:
2273 case ZFS_PROP_UTF8ONLY:
2274 *value = 0;
2275 break;
2276 case ZFS_PROP_CASE:
2277 *value = ZFS_CASE_SENSITIVE;
2278 break;
485b50bb
RM
2279 case ZFS_PROP_ACLTYPE:
2280 *value = ZFS_ACLTYPE_NFSV4;
2281 break;
9f0a21e6
MM
2282 default:
2283 return (error);
2284 }
2285 error = 0;
2286 }
2287
2288 /*
2289 * If one of the methods for getting the property value above worked,
2290 * copy it into the objset_t's cache.
2291 */
2292 if (error == 0 && cached_copy != NULL) {
2293 *cached_copy = *value;
2294 }
2295
2296 return (error);
2297}
2298
2299/*
dd4bc569 2300 * Return true if the corresponding vfs's unmounted flag is set.
9f0a21e6
MM
2301 * Otherwise return false.
2302 * If this function returns true we know VFS unmount has been initiated.
2303 */
2304boolean_t
2305zfs_get_vfs_flag_unmounted(objset_t *os)
2306{
2307 zfsvfs_t *zfvp;
2308 boolean_t unmounted = B_FALSE;
2309
e4efb709 2310 ASSERT3U(dmu_objset_type(os), ==, DMU_OST_ZFS);
9f0a21e6
MM
2311
2312 mutex_enter(&os->os_user_ptr_lock);
2313 zfvp = dmu_objset_get_user(os);
2314 if (zfvp != NULL && zfvp->z_vfs != NULL &&
2315 (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
2316 unmounted = B_TRUE;
2317 mutex_exit(&os->os_user_ptr_lock);
2318
2319 return (unmounted);
2320}
2321
#ifdef _KERNEL
/*
 * Walk the mount list and rewrite the "mounted from" name of every mount
 * that refers to oldname: an exact match becomes newname, and a name that
 * continues past oldname with '/' or '@' has its oldname prefix replaced
 * by newname.  Results are truncated to the size of f_mntfromname.
 */
void
zfsvfs_update_fromname(const char *oldname, const char *newname)
{
	char renamed[MAXPATHLEN];
	const size_t oldlen = strlen(oldname);
	struct mount *mp;

	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		char *from = mp->mnt_stat.f_mntfromname;
		const size_t fromsize = sizeof (mp->mnt_stat.f_mntfromname);

		if (strcmp(from, oldname) == 0) {
			/* The mount source is exactly the old name. */
			(void) strlcpy(from, newname, fromsize);
		} else if (strncmp(from, oldname, oldlen) == 0 &&
		    (from[oldlen] == '/' || from[oldlen] == '@')) {
			/*
			 * The mount source extends the old name; build the
			 * new string in a scratch buffer because the suffix
			 * lives in the destination itself.
			 */
			(void) snprintf(renamed, sizeof (renamed), "%s%s",
			    newname, from + oldlen);
			(void) strlcpy(from, renamed, fromsize);
		}
	}
	mtx_unlock(&mountlist_mtx);
}
#endif
595d3ac2
AJ
2353
2354/*
2355 * Find a prison with ZFS info.
2356 * Return the ZFS info and the (locked) prison.
2357 */
2358static struct zfs_jailparam *
2359zfs_jailparam_find(struct prison *spr, struct prison **prp)
2360{
2361 struct prison *pr;
2362 struct zfs_jailparam *zjp;
2363
2364 for (pr = spr; ; pr = pr->pr_parent) {
2365 mtx_lock(&pr->pr_mtx);
2366 if (pr == &prison0) {
2367 zjp = &zfs_jailparam0;
2368 break;
2369 }
2370 zjp = osd_jail_get(pr, zfs_jailparam_slot);
2371 if (zjp != NULL)
2372 break;
2373 mtx_unlock(&pr->pr_mtx);
2374 }
2375 *prp = pr;
2376
2377 return (zjp);
2378}
2379
2380/*
2381 * Ensure a prison has its own ZFS info. If zjpp is non-null, point it to the
2382 * ZFS info and lock the prison.
2383 */
2384static void
2385zfs_jailparam_alloc(struct prison *pr, struct zfs_jailparam **zjpp)
2386{
2387 struct prison *ppr;
2388 struct zfs_jailparam *zjp, *nzjp;
2389 void **rsv;
2390
2391 /* If this prison already has ZFS info, return that. */
2392 zjp = zfs_jailparam_find(pr, &ppr);
2393 if (ppr == pr)
2394 goto done;
2395
2396 /*
2397 * Allocate a new info record. Then check again, in case something
2398 * changed during the allocation.
2399 */
2400 mtx_unlock(&ppr->pr_mtx);
2401 nzjp = malloc(sizeof (struct zfs_jailparam), M_PRISON, M_WAITOK);
2402 rsv = osd_reserve(zfs_jailparam_slot);
2403 zjp = zfs_jailparam_find(pr, &ppr);
2404 if (ppr == pr) {
2405 free(nzjp, M_PRISON);
2406 osd_free_reserved(rsv);
2407 goto done;
2408 }
2409 /* Inherit the initial values from the ancestor. */
2410 mtx_lock(&pr->pr_mtx);
2411 (void) osd_jail_set_reserved(pr, zfs_jailparam_slot, rsv, nzjp);
2412 (void) memcpy(nzjp, zjp, sizeof (*zjp));
2413 zjp = nzjp;
2414 mtx_unlock(&ppr->pr_mtx);
2415done:
2416 if (zjpp != NULL)
2417 *zjpp = zjp;
2418 else
2419 mtx_unlock(&pr->pr_mtx);
2420}
2421
2422/*
2423 * Jail OSD methods for ZFS VFS info.
2424 */
2425static int
2426zfs_jailparam_create(void *obj, void *data)
2427{
2428 struct prison *pr = obj;
2429 struct vfsoptlist *opts = data;
2430 int jsys;
2431
2432 if (vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys)) == 0 &&
2433 jsys == JAIL_SYS_INHERIT)
2434 return (0);
2435 /*
2436 * Inherit a prison's initial values from its parent
2437 * (different from JAIL_SYS_INHERIT which also inherits changes).
2438 */
2439 zfs_jailparam_alloc(pr, NULL);
2440 return (0);
2441}
2442
2443static int
2444zfs_jailparam_get(void *obj, void *data)
2445{
2446 struct prison *ppr, *pr = obj;
2447 struct vfsoptlist *opts = data;
2448 struct zfs_jailparam *zjp;
2449 int jsys, error;
2450
2451 zjp = zfs_jailparam_find(pr, &ppr);
2452 jsys = (ppr == pr) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
2453 error = vfs_setopt(opts, "zfs", &jsys, sizeof (jsys));
2454 if (error != 0 && error != ENOENT)
2455 goto done;
2456 if (jsys == JAIL_SYS_NEW) {
2457 error = vfs_setopt(opts, "zfs.mount_snapshot",
2458 &zjp->mount_snapshot, sizeof (zjp->mount_snapshot));
2459 if (error != 0 && error != ENOENT)
2460 goto done;
2461 } else {
2462 /*
2463 * If this prison is inheriting its ZFS info, report
2464 * empty/zero parameters.
2465 */
2466 static int mount_snapshot = 0;
2467
2468 error = vfs_setopt(opts, "zfs.mount_snapshot",
2469 &mount_snapshot, sizeof (mount_snapshot));
2470 if (error != 0 && error != ENOENT)
2471 goto done;
2472 }
2473 error = 0;
2474done:
2475 mtx_unlock(&ppr->pr_mtx);
2476 return (error);
2477}
2478
2479static int
2480zfs_jailparam_set(void *obj, void *data)
2481{
2482 struct prison *pr = obj;
2483 struct prison *ppr;
2484 struct vfsoptlist *opts = data;
2485 int error, jsys, mount_snapshot;
2486
2487 /* Set the parameters, which should be correct. */
2488 error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2489 if (error == ENOENT)
2490 jsys = -1;
2491 error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2492 sizeof (mount_snapshot));
2493 if (error == ENOENT)
2494 mount_snapshot = -1;
2495 else
2496 jsys = JAIL_SYS_NEW;
2497 if (jsys == JAIL_SYS_NEW) {
2498 /* "zfs=new" or "zfs.*": the prison gets its own ZFS info. */
2499 struct zfs_jailparam *zjp;
2500
2501 /*
2502 * A child jail cannot have more permissions than its parent
2503 */
2504 if (pr->pr_parent != &prison0) {
2505 zjp = zfs_jailparam_find(pr->pr_parent, &ppr);
2506 mtx_unlock(&ppr->pr_mtx);
2507 if (zjp->mount_snapshot < mount_snapshot) {
2508 return (EPERM);
2509 }
2510 }
2511 zfs_jailparam_alloc(pr, &zjp);
2512 if (mount_snapshot != -1)
2513 zjp->mount_snapshot = mount_snapshot;
2514 mtx_unlock(&pr->pr_mtx);
2515 } else {
2516 /* "zfs=inherit": inherit the parent's ZFS info. */
2517 mtx_lock(&pr->pr_mtx);
2518 osd_jail_del(pr, zfs_jailparam_slot);
2519 mtx_unlock(&pr->pr_mtx);
2520 }
2521 return (0);
2522}
2523
2524static int
2525zfs_jailparam_check(void *obj __unused, void *data)
2526{
2527 struct vfsoptlist *opts = data;
2528 int error, jsys, mount_snapshot;
2529
2530 /* Check that the parameters are correct. */
2531 error = vfs_copyopt(opts, "zfs", &jsys, sizeof (jsys));
2532 if (error != ENOENT) {
2533 if (error != 0)
2534 return (error);
2535 if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT)
2536 return (EINVAL);
2537 }
2538 error = vfs_copyopt(opts, "zfs.mount_snapshot", &mount_snapshot,
2539 sizeof (mount_snapshot));
2540 if (error != ENOENT) {
2541 if (error != 0)
2542 return (error);
2543 if (mount_snapshot != 0 && mount_snapshot != 1)
2544 return (EINVAL);
2545 }
2546 return (0);
2547}
2548
2549static void
2550zfs_jailparam_destroy(void *data)
2551{
2552
2553 free(data, M_PRISON);
2554}
2555
2556static void
2557zfs_jailparam_sysinit(void *arg __unused)
2558{
2559 struct prison *pr;
2560 osd_method_t methods[PR_MAXMETHOD] = {
2561 [PR_METHOD_CREATE] = zfs_jailparam_create,
2562 [PR_METHOD_GET] = zfs_jailparam_get,
2563 [PR_METHOD_SET] = zfs_jailparam_set,
2564 [PR_METHOD_CHECK] = zfs_jailparam_check,
2565 };
2566
2567 zfs_jailparam_slot = osd_jail_register(zfs_jailparam_destroy, methods);
2568 /* Copy the defaults to any existing prisons. */
2569 sx_slock(&allprison_lock);
2570 TAILQ_FOREACH(pr, &allprison, pr_list)
2571 zfs_jailparam_alloc(pr, NULL);
2572 sx_sunlock(&allprison_lock);
2573}
2574
2575static void
2576zfs_jailparam_sysuninit(void *arg __unused)
2577{
2578
2579 osd_jail_deregister(zfs_jailparam_slot);
2580}
2581
2582SYSINIT(zfs_jailparam_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY,
2583 zfs_jailparam_sysinit, NULL);
2584SYSUNINIT(zfs_jailparam_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY,
2585 zfs_jailparam_sysuninit, NULL);