]> git.proxmox.com Git - mirror_zfs.git/blame - zfs/lib/libdmu-ctl/zfs_vfsops.c
Initial Linux ZFS GIT Repo
[mirror_zfs.git] / zfs / lib / libdmu-ctl / zfs_vfsops.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident "@(#)zfs_vfsops.c 1.41 08/04/11 SMI"
27
28#include <sys/types.h>
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/sysmacros.h>
32#include <sys/kmem.h>
33#include <sys/pathname.h>
34#include <sys/vnode.h>
35#include <sys/vfs.h>
36#include <sys/vfs_opreg.h>
37#include <sys/mntent.h>
38#include <sys/mount.h>
39#include <sys/cmn_err.h>
40#include "fs/fs_subr.h"
41#include <sys/zfs_znode.h>
42#include <sys/zfs_dir.h>
43#include <sys/zil.h>
44#include <sys/fs/zfs.h>
45#include <sys/dmu.h>
46#include <sys/dsl_prop.h>
47#include <sys/dsl_dataset.h>
48#include <sys/dsl_deleg.h>
49#include <sys/spa.h>
50#include <sys/zap.h>
51#include <sys/varargs.h>
52#include <sys/policy.h>
53#include <sys/atomic.h>
54#include <sys/mkdev.h>
55#include <sys/modctl.h>
56#include <sys/refstr.h>
57#include <sys/zfs_ioctl.h>
58#include <sys/zfs_ctldir.h>
59#include <sys/zfs_fuid.h>
60#include <sys/bootconf.h>
61#include <sys/sunddi.h>
62#include <sys/dnlc.h>
63#include <sys/dmu_objset.h>
64#include <sys/spa_boot.h>
65
66int zfsfstype;
67vfsops_t *zfs_vfsops = NULL;
68static major_t zfs_major;
69static minor_t zfs_minor;
70static kmutex_t zfs_dev_mtx;
71
72static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
73static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
74static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
75static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
76static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
77static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
78static void zfs_freevfs(vfs_t *vfsp);
79
80static const fs_operation_def_t zfs_vfsops_template[] = {
81 VFSNAME_MOUNT, { .vfs_mount = zfs_mount },
82 VFSNAME_MOUNTROOT, { .vfs_mountroot = zfs_mountroot },
83 VFSNAME_UNMOUNT, { .vfs_unmount = zfs_umount },
84 VFSNAME_ROOT, { .vfs_root = zfs_root },
85 VFSNAME_STATVFS, { .vfs_statvfs = zfs_statvfs },
86 VFSNAME_SYNC, { .vfs_sync = zfs_sync },
87 VFSNAME_VGET, { .vfs_vget = zfs_vget },
88 VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs },
89 NULL, NULL
90};
91
92static const fs_operation_def_t zfs_vfsops_eio_template[] = {
93 VFSNAME_FREEVFS, { .vfs_freevfs = zfs_freevfs },
94 NULL, NULL
95};
96
97/*
98 * We need to keep a count of active fs's.
99 * This is necessary to prevent our module
100 * from being unloaded after a umount -f
101 */
102static uint32_t zfs_active_fs_count = 0;
103
104static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
105static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
106static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
107static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
108
109/*
110 * MO_DEFAULT is not used since the default value is determined
111 * by the equivalent property.
112 */
113static mntopt_t mntopts[] = {
114 { MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
115 { MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
116 { MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
117 { MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
118};
119
120static mntopts_t zfs_mntopts = {
121 sizeof (mntopts) / sizeof (mntopt_t),
122 mntopts
123};
124
125/*ARGSUSED*/
126int
127zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
128{
129 /*
130 * Data integrity is job one. We don't want a compromised kernel
131 * writing to the storage pool, so we never sync during panic.
132 */
133 if (panicstr)
134 return (0);
135
136 /*
137 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
138 * to sync metadata, which they would otherwise cache indefinitely.
139 * Semantically, the only requirement is that the sync be initiated.
140 * The DMU syncs out txgs frequently, so there's nothing to do.
141 */
142 if (flag & SYNC_ATTR)
143 return (0);
144
145 if (vfsp != NULL) {
146 /*
147 * Sync a specific filesystem.
148 */
149 zfsvfs_t *zfsvfs = vfsp->vfs_data;
150
151 ZFS_ENTER(zfsvfs);
152 if (zfsvfs->z_log != NULL)
153 zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
154 else
155 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
156 ZFS_EXIT(zfsvfs);
157 } else {
158 /*
159 * Sync all ZFS filesystems. This is what happens when you
160 * run sync(1M). Unlike other filesystems, ZFS honors the
161 * request by waiting for all pools to commit all dirty data.
162 */
163 spa_sync_allpools();
164 }
165
166 return (0);
167}
168
169static int
170zfs_create_unique_device(dev_t *dev)
171{
172 major_t new_major;
173
174 do {
175 ASSERT3U(zfs_minor, <=, MAXMIN32);
176 minor_t start = zfs_minor;
177 do {
178 mutex_enter(&zfs_dev_mtx);
179 if (zfs_minor >= MAXMIN32) {
180 /*
181 * If we're still using the real major
182 * keep out of /dev/zfs and /dev/zvol minor
183 * number space. If we're using a getudev()'ed
184 * major number, we can use all of its minors.
185 */
186 if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
187 zfs_minor = ZFS_MIN_MINOR;
188 else
189 zfs_minor = 0;
190 } else {
191 zfs_minor++;
192 }
193 *dev = makedevice(zfs_major, zfs_minor);
194 mutex_exit(&zfs_dev_mtx);
195 } while (vfs_devismounted(*dev) && zfs_minor != start);
196 if (zfs_minor == start) {
197 /*
198 * We are using all ~262,000 minor numbers for the
199 * current major number. Create a new major number.
200 */
201 if ((new_major = getudev()) == (major_t)-1) {
202 cmn_err(CE_WARN,
203 "zfs_mount: Can't get unique major "
204 "device number.");
205 return (-1);
206 }
207 mutex_enter(&zfs_dev_mtx);
208 zfs_major = new_major;
209 zfs_minor = 0;
210
211 mutex_exit(&zfs_dev_mtx);
212 } else {
213 break;
214 }
215 /* CONSTANTCONDITION */
216 } while (1);
217
218 return (0);
219}
220
221static void
222atime_changed_cb(void *arg, uint64_t newval)
223{
224 zfsvfs_t *zfsvfs = arg;
225
226 if (newval == TRUE) {
227 zfsvfs->z_atime = TRUE;
228 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
229 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
230 } else {
231 zfsvfs->z_atime = FALSE;
232 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
233 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
234 }
235}
236
237static void
238xattr_changed_cb(void *arg, uint64_t newval)
239{
240 zfsvfs_t *zfsvfs = arg;
241
242 if (newval == TRUE) {
243 /* XXX locking on vfs_flag? */
244 zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
245 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
246 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
247 } else {
248 /* XXX locking on vfs_flag? */
249 zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
250 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
251 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
252 }
253}
254
255static void
256blksz_changed_cb(void *arg, uint64_t newval)
257{
258 zfsvfs_t *zfsvfs = arg;
259
260 if (newval < SPA_MINBLOCKSIZE ||
261 newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
262 newval = SPA_MAXBLOCKSIZE;
263
264 zfsvfs->z_max_blksz = newval;
265 zfsvfs->z_vfs->vfs_bsize = newval;
266}
267
268static void
269readonly_changed_cb(void *arg, uint64_t newval)
270{
271 zfsvfs_t *zfsvfs = arg;
272
273 if (newval) {
274 /* XXX locking on vfs_flag? */
275 zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
276 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
277 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
278 } else {
279 /* XXX locking on vfs_flag? */
280 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
281 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
282 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
283 }
284}
285
286static void
287devices_changed_cb(void *arg, uint64_t newval)
288{
289 zfsvfs_t *zfsvfs = arg;
290
291 if (newval == FALSE) {
292 zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
293 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
294 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
295 } else {
296 zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
297 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
298 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
299 }
300}
301
302static void
303setuid_changed_cb(void *arg, uint64_t newval)
304{
305 zfsvfs_t *zfsvfs = arg;
306
307 if (newval == FALSE) {
308 zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
309 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
310 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
311 } else {
312 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
313 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
314 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
315 }
316}
317
318static void
319exec_changed_cb(void *arg, uint64_t newval)
320{
321 zfsvfs_t *zfsvfs = arg;
322
323 if (newval == FALSE) {
324 zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
325 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
326 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
327 } else {
328 zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
329 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
330 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
331 }
332}
333
334/*
335 * The nbmand mount option can be changed at mount time.
336 * We can't allow it to be toggled on live file systems or incorrect
337 * behavior may be seen from cifs clients
338 *
339 * This property isn't registered via dsl_prop_register(), but this callback
340 * will be called when a file system is first mounted
341 */
342static void
343nbmand_changed_cb(void *arg, uint64_t newval)
344{
345 zfsvfs_t *zfsvfs = arg;
346 if (newval == FALSE) {
347 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
348 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
349 } else {
350 vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
351 vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
352 }
353}
354
355static void
356snapdir_changed_cb(void *arg, uint64_t newval)
357{
358 zfsvfs_t *zfsvfs = arg;
359
360 zfsvfs->z_show_ctldir = newval;
361}
362
363static void
364vscan_changed_cb(void *arg, uint64_t newval)
365{
366 zfsvfs_t *zfsvfs = arg;
367
368 zfsvfs->z_vscan = newval;
369}
370
371static void
372acl_mode_changed_cb(void *arg, uint64_t newval)
373{
374 zfsvfs_t *zfsvfs = arg;
375
376 zfsvfs->z_acl_mode = newval;
377}
378
379static void
380acl_inherit_changed_cb(void *arg, uint64_t newval)
381{
382 zfsvfs_t *zfsvfs = arg;
383
384 zfsvfs->z_acl_inherit = newval;
385}
386
387static int
388zfs_register_callbacks(vfs_t *vfsp)
389{
390 struct dsl_dataset *ds = NULL;
391 objset_t *os = NULL;
392 zfsvfs_t *zfsvfs = NULL;
393 uint64_t nbmand;
394 int readonly, do_readonly = B_FALSE;
395 int setuid, do_setuid = B_FALSE;
396 int exec, do_exec = B_FALSE;
397 int devices, do_devices = B_FALSE;
398 int xattr, do_xattr = B_FALSE;
399 int atime, do_atime = B_FALSE;
400 int error = 0;
401
402 ASSERT(vfsp);
403 zfsvfs = vfsp->vfs_data;
404 ASSERT(zfsvfs);
405 os = zfsvfs->z_os;
406
407 /*
408 * The act of registering our callbacks will destroy any mount
409 * options we may have. In order to enable temporary overrides
410 * of mount options, we stash away the current values and
411 * restore them after we register the callbacks.
412 */
413 if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
414 readonly = B_TRUE;
415 do_readonly = B_TRUE;
416 } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
417 readonly = B_FALSE;
418 do_readonly = B_TRUE;
419 }
420 if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
421 devices = B_FALSE;
422 setuid = B_FALSE;
423 do_devices = B_TRUE;
424 do_setuid = B_TRUE;
425 } else {
426 if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
427 devices = B_FALSE;
428 do_devices = B_TRUE;
429 } else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
430 devices = B_TRUE;
431 do_devices = B_TRUE;
432 }
433
434 if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
435 setuid = B_FALSE;
436 do_setuid = B_TRUE;
437 } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
438 setuid = B_TRUE;
439 do_setuid = B_TRUE;
440 }
441 }
442 if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
443 exec = B_FALSE;
444 do_exec = B_TRUE;
445 } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
446 exec = B_TRUE;
447 do_exec = B_TRUE;
448 }
449 if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
450 xattr = B_FALSE;
451 do_xattr = B_TRUE;
452 } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
453 xattr = B_TRUE;
454 do_xattr = B_TRUE;
455 }
456 if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
457 atime = B_FALSE;
458 do_atime = B_TRUE;
459 } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
460 atime = B_TRUE;
461 do_atime = B_TRUE;
462 }
463
464 /*
465 * nbmand is a special property. It can only be changed at
466 * mount time.
467 *
468 * This is weird, but it is documented to only be changeable
469 * at mount time.
470 */
471 if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
472 nbmand = B_FALSE;
473 } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
474 nbmand = B_TRUE;
475 } else {
476 char osname[MAXNAMELEN];
477
478 dmu_objset_name(os, osname);
479 if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
480 NULL))
481 return (error);
482 }
483
484 /*
485 * Register property callbacks.
486 *
487 * It would probably be fine to just check for i/o error from
488 * the first prop_register(), but I guess I like to go
489 * overboard...
490 */
491 ds = dmu_objset_ds(os);
492 error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
493 error = error ? error : dsl_prop_register(ds,
494 "xattr", xattr_changed_cb, zfsvfs);
495 error = error ? error : dsl_prop_register(ds,
496 "recordsize", blksz_changed_cb, zfsvfs);
497 error = error ? error : dsl_prop_register(ds,
498 "readonly", readonly_changed_cb, zfsvfs);
499 error = error ? error : dsl_prop_register(ds,
500 "devices", devices_changed_cb, zfsvfs);
501 error = error ? error : dsl_prop_register(ds,
502 "setuid", setuid_changed_cb, zfsvfs);
503 error = error ? error : dsl_prop_register(ds,
504 "exec", exec_changed_cb, zfsvfs);
505 error = error ? error : dsl_prop_register(ds,
506 "snapdir", snapdir_changed_cb, zfsvfs);
507 error = error ? error : dsl_prop_register(ds,
508 "aclmode", acl_mode_changed_cb, zfsvfs);
509 error = error ? error : dsl_prop_register(ds,
510 "aclinherit", acl_inherit_changed_cb, zfsvfs);
511 error = error ? error : dsl_prop_register(ds,
512 "vscan", vscan_changed_cb, zfsvfs);
513 if (error)
514 goto unregister;
515
516 /*
517 * Invoke our callbacks to restore temporary mount options.
518 */
519 if (do_readonly)
520 readonly_changed_cb(zfsvfs, readonly);
521 if (do_setuid)
522 setuid_changed_cb(zfsvfs, setuid);
523 if (do_exec)
524 exec_changed_cb(zfsvfs, exec);
525 if (do_devices)
526 devices_changed_cb(zfsvfs, devices);
527 if (do_xattr)
528 xattr_changed_cb(zfsvfs, xattr);
529 if (do_atime)
530 atime_changed_cb(zfsvfs, atime);
531
532 nbmand_changed_cb(zfsvfs, nbmand);
533
534 return (0);
535
536unregister:
537 /*
538 * We may attempt to unregister some callbacks that are not
539 * registered, but this is OK; it will simply return ENOMSG,
540 * which we will ignore.
541 */
542 (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
543 (void) dsl_prop_unregister(ds, "xattr", xattr_changed_cb, zfsvfs);
544 (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
545 (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
546 (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
547 (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
548 (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
549 (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
550 (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
551 (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
552 zfsvfs);
553 (void) dsl_prop_unregister(ds, "vscan", vscan_changed_cb, zfsvfs);
554 return (error);
555
556}
557
558static int
559zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
560{
561 uint_t readonly;
562 int error;
563
564 error = zfs_register_callbacks(zfsvfs->z_vfs);
565 if (error)
566 return (error);
567
568 /*
569 * Set the objset user_ptr to track its zfsvfs.
570 */
571 mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock);
572 dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
573 mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock);
574
575 /*
576 * If we are not mounting (ie: online recv), then we don't
577 * have to worry about replaying the log as we blocked all
578 * operations out since we closed the ZIL.
579 */
580 if (mounting) {
581 /*
582 * During replay we remove the read only flag to
583 * allow replays to succeed.
584 */
585 readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
586 if (readonly != 0)
587 zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
588 else
589 zfs_unlinked_drain(zfsvfs);
590
591 /*
592 * Parse and replay the intent log.
593 *
594 * Because of ziltest, this must be done after
595 * zfs_unlinked_drain(). (Further note: ziltest doesn't
596 * use readonly mounts, where zfs_unlinked_drain() isn't
597 * called.) This is because ziltest causes spa_sync()
598 * to think it's committed, but actually it is not, so
599 * the intent log contains many txg's worth of changes.
600 *
601 * In particular, if object N is in the unlinked set in
602 * the last txg to actually sync, then it could be
603 * actually freed in a later txg and then reallocated in
604 * a yet later txg. This would write a "create object
605 * N" record to the intent log. Normally, this would be
606 * fine because the spa_sync() would have written out
607 * the fact that object N is free, before we could write
608 * the "create object N" intent log record.
609 *
610 * But when we are in ziltest mode, we advance the "open
611 * txg" without actually spa_sync()-ing the changes to
612 * disk. So we would see that object N is still
613 * allocated and in the unlinked set, and there is an
614 * intent log record saying to allocate it.
615 */
616 zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
617 zfs_replay_vector);
618
619 zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
620 }
621
622 if (!zil_disable)
623 zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
624
625 return (0);
626}
627
628static void
629zfs_freezfsvfs(zfsvfs_t *zfsvfs)
630{
631 mutex_destroy(&zfsvfs->z_znodes_lock);
632 mutex_destroy(&zfsvfs->z_online_recv_lock);
633 list_destroy(&zfsvfs->z_all_znodes);
634 rrw_destroy(&zfsvfs->z_teardown_lock);
635 rw_destroy(&zfsvfs->z_teardown_inactive_lock);
636 rw_destroy(&zfsvfs->z_fuid_lock);
637 kmem_free(zfsvfs, sizeof (zfsvfs_t));
638}
639
640static int
641zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr)
642{
643 dev_t mount_dev;
644 uint64_t recordsize, readonly;
645 int error = 0;
646 int mode;
647 zfsvfs_t *zfsvfs;
648 znode_t *zp = NULL;
649
650 ASSERT(vfsp);
651 ASSERT(osname);
652
653 /*
654 * Initialize the zfs-specific filesystem structure.
655 * Should probably make this a kmem cache, shuffle fields,
656 * and just bzero up to z_hold_mtx[].
657 */
658 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
659 zfsvfs->z_vfs = vfsp;
660 zfsvfs->z_parent = zfsvfs;
661 zfsvfs->z_assign = TXG_NOWAIT;
662 zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
663 zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
664
665 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
666 mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL);
667 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
668 offsetof(znode_t, z_link_node));
669 rrw_init(&zfsvfs->z_teardown_lock);
670 rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
671 rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
672
673 /* Initialize the generic filesystem structure. */
674 vfsp->vfs_bcount = 0;
675 vfsp->vfs_data = NULL;
676
677 if (zfs_create_unique_device(&mount_dev) == -1) {
678 error = ENODEV;
679 goto out;
680 }
681 ASSERT(vfs_devismounted(mount_dev) == 0);
682
683 if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
684 NULL))
685 goto out;
686
687 vfsp->vfs_dev = mount_dev;
688 vfsp->vfs_fstype = zfsfstype;
689 vfsp->vfs_bsize = recordsize;
690 vfsp->vfs_flag |= VFS_NOTRUNC;
691 vfsp->vfs_data = zfsvfs;
692
693 if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
694 goto out;
695
696 if (readonly)
697 mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
698 else
699 mode = DS_MODE_PRIMARY;
700
701 error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
702 if (error == EROFS) {
703 mode = DS_MODE_PRIMARY | DS_MODE_READONLY;
704 error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
705 &zfsvfs->z_os);
706 }
707
708 if (error)
709 goto out;
710
711 if (error = zfs_init_fs(zfsvfs, &zp, cr))
712 goto out;
713
714 /* The call to zfs_init_fs leaves the vnode held, release it here. */
715 VN_RELE(ZTOV(zp));
716
717 /*
718 * Set features for file system.
719 */
720 zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
721 if (zfsvfs->z_use_fuids) {
722 vfs_set_feature(vfsp, VFSFT_XVATTR);
723 vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS);
724 vfs_set_feature(vfsp, VFSFT_ACLONCREATE);
725 }
726 if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
727 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
728 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
729 vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
730 } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
731 vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
732 vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
733 }
734
735 if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
736 uint64_t pval;
737
738 ASSERT(mode & DS_MODE_READONLY);
739 atime_changed_cb(zfsvfs, B_FALSE);
740 readonly_changed_cb(zfsvfs, B_TRUE);
741 if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
742 goto out;
743 xattr_changed_cb(zfsvfs, pval);
744 zfsvfs->z_issnap = B_TRUE;
745 } else {
746 error = zfsvfs_setup(zfsvfs, B_TRUE);
747 }
748
749 if (!zfsvfs->z_issnap)
750 zfsctl_create(zfsvfs);
751out:
752 if (error) {
753 if (zfsvfs->z_os)
754 dmu_objset_close(zfsvfs->z_os);
755 zfs_freezfsvfs(zfsvfs);
756 } else {
757 atomic_add_32(&zfs_active_fs_count, 1);
758 }
759
760 return (error);
761}
762
763void
764zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
765{
766 objset_t *os = zfsvfs->z_os;
767 struct dsl_dataset *ds;
768
769 /*
770 * Unregister properties.
771 */
772 if (!dmu_objset_is_snapshot(os)) {
773 ds = dmu_objset_ds(os);
774 VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
775 zfsvfs) == 0);
776
777 VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
778 zfsvfs) == 0);
779
780 VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
781 zfsvfs) == 0);
782
783 VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
784 zfsvfs) == 0);
785
786 VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
787 zfsvfs) == 0);
788
789 VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
790 zfsvfs) == 0);
791
792 VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
793 zfsvfs) == 0);
794
795 VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
796 zfsvfs) == 0);
797
798 VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
799 zfsvfs) == 0);
800
801 VERIFY(dsl_prop_unregister(ds, "aclinherit",
802 acl_inherit_changed_cb, zfsvfs) == 0);
803
804 VERIFY(dsl_prop_unregister(ds, "vscan",
805 vscan_changed_cb, zfsvfs) == 0);
806 }
807}
808
/*
 * Convert a decimal digit string to a uint64_t integer.
 *
 * str		NUL-terminated string that must consist only of '0'..'9'.
 * objnum	receives the converted value; written only on success.
 *
 * Returns 0 on success.  Returns EINVAL if the string contains a
 * character that is not a decimal digit, or if the value does not fit in
 * a uint64_t (previously such a value silently wrapped around).  An empty
 * string converts to 0.
 */
static int
str_to_uint64(char *str, uint64_t *objnum)
{
	uint64_t num = 0;

	while (*str) {
		uint64_t digit;

		if (*str < '0' || *str > '9')
			return (EINVAL);

		digit = (uint64_t)(*str++ - '0');

		/* Refuse to let num * 10 + digit overflow 64 bits. */
		if (num > (UINT64_MAX - digit) / 10)
			return (EINVAL);

		num = num * 10 + digit;
	}

	*objnum = num;
	return (0);
}
827
/*
 * The boot path handed over by the boot loader has the form
 * "rootpool-name/root-filesystem-object-number".  Convert it into a
 * dataset name of the form "rootpool-name/root-filesystem-name".
 *
 * bpath	boot path; modified temporarily but restored before return.
 * outpath	receives the resulting name; the caller must supply a
 *		buffer that is large enough.
 *
 * Returns 0 on success, EINVAL for an empty path or one that starts with
 * '/', or the error from str_to_uint64() / dsl_dsobj_to_dsname().
 */
static int
zfs_parse_bootfs(char *bpath, char *outpath)
{
	char *sep;
	uint64_t objnum;
	int error;

	if (*bpath == 0 || *bpath == '/')
		return (EINVAL);

	sep = strchr(bpath, '/');
	if (sep == NULL) {
		/* No '/' at all: the path is just the pool name. */
		(void) strcpy(outpath, bpath);
		return (0);
	}

	error = str_to_uint64(sep + 1, &objnum);
	if (error)
		return (error);

	/* Cut bpath down to the pool name for the duration of the lookup. */
	*sep = '\0';
	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
	*sep = '/';

	return (error);
}
860
861static int
862zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
863{
864 int error = 0;
865 static int zfsrootdone = 0;
866 zfsvfs_t *zfsvfs = NULL;
867 znode_t *zp = NULL;
868 vnode_t *vp = NULL;
869 char *zfs_bootfs;
870
871 ASSERT(vfsp);
872
873 /*
874 * The filesystem that we mount as root is defined in the
875 * boot property "zfs-bootfs" with a format of
876 * "poolname/root-dataset-objnum".
877 */
878 if (why == ROOT_INIT) {
879 if (zfsrootdone++)
880 return (EBUSY);
881 /*
882 * the process of doing a spa_load will require the
883 * clock to be set before we could (for example) do
884 * something better by looking at the timestamp on
885 * an uberblock, so just set it to -1.
886 */
887 clkset(-1);
888
889 if ((zfs_bootfs = spa_get_bootfs()) == NULL) {
890 cmn_err(CE_NOTE, "\nspa_get_bootfs: can not get "
891 "bootfs name \n");
892 return (EINVAL);
893 }
894
895 if (error = spa_import_rootpool(rootfs.bo_name)) {
896 spa_free_bootfs(zfs_bootfs);
897 cmn_err(CE_NOTE, "\nspa_import_rootpool: error %d\n",
898 error);
899 return (error);
900 }
901
902 if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
903 spa_free_bootfs(zfs_bootfs);
904 cmn_err(CE_NOTE, "\nzfs_parse_bootfs: error %d\n",
905 error);
906 return (error);
907 }
908
909 spa_free_bootfs(zfs_bootfs);
910
911 if (error = vfs_lock(vfsp))
912 return (error);
913
914 if (error = zfs_domount(vfsp, rootfs.bo_name, CRED())) {
915 cmn_err(CE_NOTE, "\nzfs_domount: error %d\n", error);
916 goto out;
917 }
918
919 zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
920 ASSERT(zfsvfs);
921 if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
922 cmn_err(CE_NOTE, "\nzfs_zget: error %d\n", error);
923 goto out;
924 }
925
926 vp = ZTOV(zp);
927 mutex_enter(&vp->v_lock);
928 vp->v_flag |= VROOT;
929 mutex_exit(&vp->v_lock);
930 rootvp = vp;
931
932 /*
933 * The zfs_zget call above returns with a hold on vp, we release
934 * it here.
935 */
936 VN_RELE(vp);
937
938 vfs_add((struct vnode *)0, vfsp,
939 (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
940out:
941 vfs_unlock(vfsp);
942 return (error);
943 } else if (why == ROOT_REMOUNT) {
944 readonly_changed_cb(vfsp->vfs_data, B_FALSE);
945 vfsp->vfs_flag |= VFS_REMOUNT;
946
947 /* refresh mount options */
948 zfs_unregister_callbacks(vfsp->vfs_data);
949 return (zfs_register_callbacks(vfsp));
950
951 } else if (why == ROOT_UNMOUNT) {
952 zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
953 (void) zfs_sync(vfsp, 0, 0);
954 return (0);
955 }
956
957 /*
958 * if "why" is equal to anything else other than ROOT_INIT,
959 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
960 */
961 return (ENOTSUP);
962}
963
964/*ARGSUSED*/
965static int
966zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
967{
968 char *osname;
969 pathname_t spn;
970 int error = 0;
971 uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ?
972 UIO_SYSSPACE : UIO_USERSPACE;
973 int canwrite;
974
975 if (mvp->v_type != VDIR)
976 return (ENOTDIR);
977
978 mutex_enter(&mvp->v_lock);
979 if ((uap->flags & MS_REMOUNT) == 0 &&
980 (uap->flags & MS_OVERLAY) == 0 &&
981 (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
982 mutex_exit(&mvp->v_lock);
983 return (EBUSY);
984 }
985 mutex_exit(&mvp->v_lock);
986
987 /*
988 * ZFS does not support passing unparsed data in via MS_DATA.
989 * Users should use the MS_OPTIONSTR interface; this means
990 * that all option parsing is already done and the options struct
991 * can be interrogated.
992 */
993 if ((uap->flags & MS_DATA) && uap->datalen > 0)
994 return (EINVAL);
995
996 /*
997 * Get the objset name (the "special" mount argument).
998 */
999 if (error = pn_get(uap->spec, fromspace, &spn))
1000 return (error);
1001
1002 osname = spn.pn_path;
1003
1004 /*
1005 * Check for mount privilege?
1006 *
1007 * If we don't have privilege then see if
1008 * we have local permission to allow it
1009 */
1010 error = secpolicy_fs_mount(cr, mvp, vfsp);
1011 if (error) {
1012 error = dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr);
1013 if (error == 0) {
1014 vattr_t vattr;
1015
1016 /*
1017 * Make sure user is the owner of the mount point
1018 * or has sufficient privileges.
1019 */
1020
1021 vattr.va_mask = AT_UID;
1022
1023 if (error = VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
1024 goto out;
1025 }
1026
1027 if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
1028 VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
1029 error = EPERM;
1030 goto out;
1031 }
1032
1033 secpolicy_fs_mount_clearopts(cr, vfsp);
1034 } else {
1035 goto out;
1036 }
1037 }
1038
1039 /*
1040 * Refuse to mount a filesystem if we are in a local zone and the
1041 * dataset is not visible.
1042 */
1043 if (!INGLOBALZONE(curproc) &&
1044 (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
1045 error = EPERM;
1046 goto out;
1047 }
1048
1049 /*
1050 * When doing a remount, we simply refresh our temporary properties
1051 * according to those options set in the current VFS options.
1052 */
1053 if (uap->flags & MS_REMOUNT) {
1054 /* refresh mount options */
1055 zfs_unregister_callbacks(vfsp->vfs_data);
1056 error = zfs_register_callbacks(vfsp);
1057 goto out;
1058 }
1059
1060 error = zfs_domount(vfsp, osname, cr);
1061
1062out:
1063 pn_free(&spn);
1064 return (error);
1065}
1066
1067static int
1068zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
1069{
1070 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1071 dev32_t d32;
1072 uint64_t refdbytes, availbytes, usedobjs, availobjs;
1073
1074 ZFS_ENTER(zfsvfs);
1075
1076 dmu_objset_space(zfsvfs->z_os,
1077 &refdbytes, &availbytes, &usedobjs, &availobjs);
1078
1079 /*
1080 * The underlying storage pool actually uses multiple block sizes.
1081 * We report the fragsize as the smallest block size we support,
1082 * and we report our blocksize as the filesystem's maximum blocksize.
1083 */
1084 statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
1085 statp->f_bsize = zfsvfs->z_max_blksz;
1086
1087 /*
1088 * The following report "total" blocks of various kinds in the
1089 * file system, but reported in terms of f_frsize - the
1090 * "fragment" size.
1091 */
1092
1093 statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
1094 statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
1095 statp->f_bavail = statp->f_bfree; /* no root reservation */
1096
1097 /*
1098 * statvfs() should really be called statufs(), because it assumes
1099 * static metadata. ZFS doesn't preallocate files, so the best
1100 * we can do is report the max that could possibly fit in f_files,
1101 * and that minus the number actually used in f_ffree.
1102 * For f_ffree, report the smaller of the number of object available
1103 * and the number of blocks (each object will take at least a block).
1104 */
1105 statp->f_ffree = MIN(availobjs, statp->f_bfree);
1106 statp->f_favail = statp->f_ffree; /* no "root reservation" */
1107 statp->f_files = statp->f_ffree + usedobjs;
1108
1109 (void) cmpldev(&d32, vfsp->vfs_dev);
1110 statp->f_fsid = d32;
1111
1112 /*
1113 * We're a zfs filesystem.
1114 */
1115 (void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
1116
1117 statp->f_flag = vf_to_stf(vfsp->vfs_flag);
1118
1119 statp->f_namemax = ZFS_MAXNAMELEN;
1120
1121 /*
1122 * We have all of 32 characters to stuff a string here.
1123 * Is there anything useful we could/should provide?
1124 */
1125 bzero(statp->f_fstr, sizeof (statp->f_fstr));
1126
1127 ZFS_EXIT(zfsvfs);
1128 return (0);
1129}
1130
1131static int
1132zfs_root(vfs_t *vfsp, vnode_t **vpp)
1133{
1134 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1135 znode_t *rootzp;
1136 int error;
1137
1138 ZFS_ENTER(zfsvfs);
1139
1140 error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
1141 if (error == 0)
1142 *vpp = ZTOV(rootzp);
1143
1144 ZFS_EXIT(zfsvfs);
1145 return (error);
1146}
1147
1148/*
1149 * Teardown the zfsvfs::z_os.
1150 *
1151 * Note, if 'unmounting' if FALSE, we return with the 'z_teardown_lock'
1152 * and 'z_teardown_inactive_lock' held.
1153 */
1154static int
1155zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
1156{
1157 znode_t *zp;
1158
1159 rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1160
1161 if (!unmounting) {
1162 /*
1163 * We purge the parent filesystem's vfsp as the parent
1164 * filesystem and all of its snapshots have their vnode's
1165 * v_vfsp set to the parent's filesystem's vfsp. Note,
1166 * 'z_parent' is self referential for non-snapshots.
1167 */
1168 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1169 }
1170
1171 /*
1172 * Close the zil. NB: Can't close the zil while zfs_inactive
1173 * threads are blocked as zil_close can call zfs_inactive.
1174 */
1175 if (zfsvfs->z_log) {
1176 zil_close(zfsvfs->z_log);
1177 zfsvfs->z_log = NULL;
1178 }
1179
1180 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
1181
1182 /*
1183 * If we are not unmounting (ie: online recv) and someone already
1184 * unmounted this file system while we were doing the switcheroo,
1185 * or a reopen of z_os failed then just bail out now.
1186 */
1187 if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
1188 rw_exit(&zfsvfs->z_teardown_inactive_lock);
1189 rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1190 return (EIO);
1191 }
1192
1193 /*
1194 * At this point there are no vops active, and any new vops will
1195 * fail with EIO since we have z_teardown_lock for writer (only
1196 * relavent for forced unmount).
1197 *
1198 * Release all holds on dbufs.
1199 */
1200 mutex_enter(&zfsvfs->z_znodes_lock);
1201 for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
1202 zp = list_next(&zfsvfs->z_all_znodes, zp))
1203 if (zp->z_dbuf) {
1204 ASSERT(ZTOV(zp)->v_count > 0);
1205 zfs_znode_dmu_fini(zp);
1206 }
1207 mutex_exit(&zfsvfs->z_znodes_lock);
1208
1209 /*
1210 * If we are unmounting, set the unmounted flag and let new vops
1211 * unblock. zfs_inactive will have the unmounted behavior, and all
1212 * other vops will fail with EIO.
1213 */
1214 if (unmounting) {
1215 zfsvfs->z_unmounted = B_TRUE;
1216 rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1217 rw_exit(&zfsvfs->z_teardown_inactive_lock);
1218 }
1219
1220 /*
1221 * z_os will be NULL if there was an error in attempting to reopen
1222 * zfsvfs, so just return as the properties had already been
1223 * unregistered and cached data had been evicted before.
1224 */
1225 if (zfsvfs->z_os == NULL)
1226 return (0);
1227
1228 /*
1229 * Unregister properties.
1230 */
1231 zfs_unregister_callbacks(zfsvfs);
1232
1233 /*
1234 * Evict cached data
1235 */
1236 if (dmu_objset_evict_dbufs(zfsvfs->z_os)) {
1237 txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
1238 (void) dmu_objset_evict_dbufs(zfsvfs->z_os);
1239 }
1240
1241 return (0);
1242}
1243
1244/*ARGSUSED*/
1245static int
1246zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
1247{
1248 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1249 objset_t *os;
1250 int ret;
1251
1252 ret = secpolicy_fs_unmount(cr, vfsp);
1253 if (ret) {
1254 ret = dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
1255 ZFS_DELEG_PERM_MOUNT, cr);
1256 if (ret)
1257 return (ret);
1258 }
1259
1260 /*
1261 * We purge the parent filesystem's vfsp as the parent filesystem
1262 * and all of its snapshots have their vnode's v_vfsp set to the
1263 * parent's filesystem's vfsp. Note, 'z_parent' is self
1264 * referential for non-snapshots.
1265 */
1266 (void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
1267
1268 /*
1269 * Unmount any snapshots mounted under .zfs before unmounting the
1270 * dataset itself.
1271 */
1272 if (zfsvfs->z_ctldir != NULL &&
1273 (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
1274 return (ret);
1275 }
1276
1277 if (!(fflag & MS_FORCE)) {
1278 /*
1279 * Check the number of active vnodes in the file system.
1280 * Our count is maintained in the vfs structure, but the
1281 * number is off by 1 to indicate a hold on the vfs
1282 * structure itself.
1283 *
1284 * The '.zfs' directory maintains a reference of its
1285 * own, and any active references underneath are
1286 * reflected in the vnode count.
1287 */
1288 if (zfsvfs->z_ctldir == NULL) {
1289 if (vfsp->vfs_count > 1)
1290 return (EBUSY);
1291 } else {
1292 if (vfsp->vfs_count > 2 ||
1293 zfsvfs->z_ctldir->v_count > 1)
1294 return (EBUSY);
1295 }
1296 }
1297
1298 vfsp->vfs_flag |= VFS_UNMOUNTED;
1299
1300 VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
1301 os = zfsvfs->z_os;
1302
1303 /*
1304 * z_os will be NULL if there was an error in
1305 * attempting to reopen zfsvfs.
1306 */
1307 if (os != NULL) {
1308 /*
1309 * Unset the objset user_ptr.
1310 */
1311 mutex_enter(&os->os->os_user_ptr_lock);
1312 dmu_objset_set_user(os, NULL);
1313 mutex_exit(&os->os->os_user_ptr_lock);
1314
1315 /*
1316 * Finally close the objset
1317 */
1318 dmu_objset_close(os);
1319 }
1320
1321 /*
1322 * We can now safely destroy the '.zfs' directory node.
1323 */
1324 if (zfsvfs->z_ctldir != NULL)
1325 zfsctl_destroy(zfsvfs);
1326
1327 return (0);
1328}
1329
1330static int
1331zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
1332{
1333 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1334 znode_t *zp;
1335 uint64_t object = 0;
1336 uint64_t fid_gen = 0;
1337 uint64_t gen_mask;
1338 uint64_t zp_gen;
1339 int i, err;
1340
1341 *vpp = NULL;
1342
1343 ZFS_ENTER(zfsvfs);
1344
1345 if (fidp->fid_len == LONG_FID_LEN) {
1346 zfid_long_t *zlfid = (zfid_long_t *)fidp;
1347 uint64_t objsetid = 0;
1348 uint64_t setgen = 0;
1349
1350 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
1351 objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
1352
1353 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
1354 setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
1355
1356 ZFS_EXIT(zfsvfs);
1357
1358 err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
1359 if (err)
1360 return (EINVAL);
1361 ZFS_ENTER(zfsvfs);
1362 }
1363
1364 if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
1365 zfid_short_t *zfid = (zfid_short_t *)fidp;
1366
1367 for (i = 0; i < sizeof (zfid->zf_object); i++)
1368 object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
1369
1370 for (i = 0; i < sizeof (zfid->zf_gen); i++)
1371 fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
1372 } else {
1373 ZFS_EXIT(zfsvfs);
1374 return (EINVAL);
1375 }
1376
1377 /* A zero fid_gen means we are in the .zfs control directories */
1378 if (fid_gen == 0 &&
1379 (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
1380 *vpp = zfsvfs->z_ctldir;
1381 ASSERT(*vpp != NULL);
1382 if (object == ZFSCTL_INO_SNAPDIR) {
1383 VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
1384 0, NULL, NULL, NULL, NULL, NULL) == 0);
1385 } else {
1386 VN_HOLD(*vpp);
1387 }
1388 ZFS_EXIT(zfsvfs);
1389 return (0);
1390 }
1391
1392 gen_mask = -1ULL >> (64 - 8 * i);
1393
1394 dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
1395 if (err = zfs_zget(zfsvfs, object, &zp)) {
1396 ZFS_EXIT(zfsvfs);
1397 return (err);
1398 }
1399 zp_gen = zp->z_phys->zp_gen & gen_mask;
1400 if (zp_gen == 0)
1401 zp_gen = 1;
1402 if (zp->z_unlinked || zp_gen != fid_gen) {
1403 dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
1404 VN_RELE(ZTOV(zp));
1405 ZFS_EXIT(zfsvfs);
1406 return (EINVAL);
1407 }
1408
1409 *vpp = ZTOV(zp);
1410 ZFS_EXIT(zfsvfs);
1411 return (0);
1412}
1413
1414/*
1415 * Block out VOPs and close zfsvfs_t::z_os
1416 *
1417 * Note, if successful, then we return with the 'z_teardown_lock' and
1418 * 'z_teardown_inactive_lock' write held.
1419 */
1420int
1421zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode)
1422{
1423 int error;
1424
1425 if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
1426 return (error);
1427
1428 *mode = zfsvfs->z_os->os_mode;
1429 dmu_objset_name(zfsvfs->z_os, name);
1430 dmu_objset_close(zfsvfs->z_os);
1431
1432 return (0);
1433}
1434
1435/*
1436 * Reopen zfsvfs_t::z_os and release VOPs.
1437 */
1438int
1439zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode)
1440{
1441 int err;
1442
1443 ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
1444 ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
1445
1446 err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os);
1447 if (err) {
1448 zfsvfs->z_os = NULL;
1449 } else {
1450 znode_t *zp;
1451
1452 VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
1453
1454 /*
1455 * Attempt to re-establish all the active znodes with
1456 * their dbufs. If a zfs_rezget() fails, then we'll let
1457 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
1458 * when they try to use their znode.
1459 */
1460 mutex_enter(&zfsvfs->z_znodes_lock);
1461 for (zp = list_head(&zfsvfs->z_all_znodes); zp;
1462 zp = list_next(&zfsvfs->z_all_znodes, zp)) {
1463 (void) zfs_rezget(zp);
1464 }
1465 mutex_exit(&zfsvfs->z_znodes_lock);
1466
1467 }
1468
1469 /* release the VOPs */
1470 rw_exit(&zfsvfs->z_teardown_inactive_lock);
1471 rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1472
1473 if (err) {
1474 /*
1475 * Since we couldn't reopen zfsvfs::z_os, force
1476 * unmount this file system.
1477 */
1478 if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
1479 (void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
1480 }
1481 return (err);
1482}
1483
1484static void
1485zfs_freevfs(vfs_t *vfsp)
1486{
1487 zfsvfs_t *zfsvfs = vfsp->vfs_data;
1488 int i;
1489
1490 for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1491 mutex_destroy(&zfsvfs->z_hold_mtx[i]);
1492
1493 zfs_fuid_destroy(zfsvfs);
1494 zfs_freezfsvfs(zfsvfs);
1495
1496 atomic_add_32(&zfs_active_fs_count, -1);
1497}
1498
1499/*
1500 * VFS_INIT() initialization. Note that there is no VFS_FINI(),
1501 * so we can't safely do any non-idempotent initialization here.
1502 * Leave that to zfs_init() and zfs_fini(), which are called
1503 * from the module's _init() and _fini() entry points.
1504 */
1505/*ARGSUSED*/
1506static int
1507zfs_vfsinit(int fstype, char *name)
1508{
1509 int error;
1510
1511 zfsfstype = fstype;
1512
1513 /*
1514 * Setup vfsops and vnodeops tables.
1515 */
1516 error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
1517 if (error != 0) {
1518 cmn_err(CE_WARN, "zfs: bad vfs ops template");
1519 }
1520
1521 error = zfs_create_op_tables();
1522 if (error) {
1523 zfs_remove_op_tables();
1524 cmn_err(CE_WARN, "zfs: bad vnode ops template");
1525 (void) vfs_freevfsops_by_type(zfsfstype);
1526 return (error);
1527 }
1528
1529 mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
1530
1531 /*
1532 * Unique major number for all zfs mounts.
1533 * If we run out of 32-bit minors, we'll getudev() another major.
1534 */
1535 zfs_major = ddi_name_to_major(ZFS_DRIVER);
1536 zfs_minor = ZFS_MIN_MINOR;
1537
1538 return (0);
1539}
1540
/*
 * Module-level initialization for the ZFS filesystem layer: sets up the
 * .zfs control directory support first, then the znode layer.
 */
void
zfs_init(void)
{
	/* .zfs directory structures */
	zfsctl_init();

	/* znode cache, vnode ops, etc... */
	zfs_znode_init();
}
1554
/*
 * Module-level teardown for the ZFS filesystem layer; calls the fini
 * routines matching the two init routines used by zfs_init().
 */
void
zfs_fini(void)
{
	/* .zfs directory structures */
	zfsctl_fini();

	/* znode layer */
	zfs_znode_fini();
}
1561
1562int
1563zfs_busy(void)
1564{
1565 return (zfs_active_fs_count != 0);
1566}
1567
1568int
1569zfs_set_version(const char *name, uint64_t newvers)
1570{
1571 int error;
1572 objset_t *os;
1573 dmu_tx_t *tx;
1574 uint64_t curvers;
1575
1576 /*
1577 * XXX for now, require that the filesystem be unmounted. Would
1578 * be nice to find the zfsvfs_t and just update that if
1579 * possible.
1580 */
1581
1582 if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
1583 return (EINVAL);
1584
1585 error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_PRIMARY, &os);
1586 if (error)
1587 return (error);
1588
1589 error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
1590 8, 1, &curvers);
1591 if (error)
1592 goto out;
1593 if (newvers < curvers) {
1594 error = EINVAL;
1595 goto out;
1596 }
1597
1598 tx = dmu_tx_create(os);
1599 dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR);
1600 error = dmu_tx_assign(tx, TXG_WAIT);
1601 if (error) {
1602 dmu_tx_abort(tx);
1603 goto out;
1604 }
1605 error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1,
1606 &newvers, tx);
1607
1608 spa_history_internal_log(LOG_DS_UPGRADE,
1609 dmu_objset_spa(os), tx, CRED(),
1610 "oldver=%llu newver=%llu dataset = %llu", curvers, newvers,
1611 dmu_objset_id(os));
1612 dmu_tx_commit(tx);
1613
1614out:
1615 dmu_objset_close(os);
1616 return (error);
1617}
1618
1619/*
1620 * Read a property stored within the master node.
1621 */
1622int
1623zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
1624{
1625 const char *pname;
1626 int error;
1627
1628 /*
1629 * Look up the file system's value for the property. For the
1630 * version property, we look up a slightly different string.
1631 */
1632 if (prop == ZFS_PROP_VERSION)
1633 pname = ZPL_VERSION_STR;
1634 else
1635 pname = zfs_prop_to_name(prop);
1636
1637 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
1638
1639 if (error == ENOENT) {
1640 /* No value set, use the default value */
1641 switch (prop) {
1642 case ZFS_PROP_VERSION:
1643 *value = ZPL_VERSION;
1644 break;
1645 case ZFS_PROP_NORMALIZE:
1646 case ZFS_PROP_UTF8ONLY:
1647 *value = 0;
1648 break;
1649 case ZFS_PROP_CASE:
1650 *value = ZFS_CASE_SENSITIVE;
1651 break;
1652 default:
1653 return (error);
1654 }
1655 error = 0;
1656 }
1657 return (error);
1658}
1659
1660static vfsdef_t vfw = {
1661 VFSDEF_VERSION,
1662 MNTTYPE_ZFS,
1663 zfs_vfsinit,
1664 VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
1665 VSW_XID,
1666 &zfs_mntopts
1667};
1668
1669struct modlfs zfs_modlfs = {
1670 &mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw
1671};