4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
25 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
27 * Rewritten for Linux by:
28 * Rohan Puri <rohan.puri15@gmail.com>
29 * Brian Behlendorf <behlendorf1@llnl.gov>
30 * Copyright (c) 2013 by Delphix. All rights reserved.
31 * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
32 * Copyright (c) 2018 George Melikov. All Rights Reserved.
33 * Copyright (c) 2019 Datto, Inc. All rights reserved.
37 * ZFS control directory (a.k.a. ".zfs")
39 * This directory provides a common location for all ZFS meta-objects.
40 * Currently, this is only the 'snapshot' and 'shares' directory, but this may
41 * expand in the future. The elements are built dynamically, as the hierarchy
42 * does not actually exist on disk.
44 * For 'snapshot', we don't want to have all snapshots always mounted, because
45 * this would take up a huge amount of space in /etc/mnttab. We have three
48 * ctldir ------> snapshotdir -------> snapshot
54 * The 'snapshot' node contains just enough information to lookup '..' and act
55 * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
56 * perform an automount of the underlying filesystem and return the
57 * corresponding inode.
59 * All mounts are handled automatically by a user mode helper which invokes
60 * the mount procedure. Unmounts are handled by allowing the mount
61 * point to expire so the kernel may automatically unmount it.
63 * The '.zfs', '.zfs/snapshot', and all directories created under
64 * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
65 * zfsvfs_t as the head filesystem (what '.zfs' lives under).
67 * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
68 * (ie: snapshots) are complete ZFS filesystems and have their own unique
69 * zfsvfs_t. However, the fsid reported by these mounts will be the same
70 * as that used by the parent zfsvfs_t to make NFS happy.
73 #include <sys/types.h>
74 #include <sys/param.h>
76 #include <sys/sysmacros.h>
77 #include <sys/pathname.h>
79 #include <sys/zfs_ctldir.h>
80 #include <sys/zfs_ioctl.h>
81 #include <sys/zfs_vfsops.h>
82 #include <sys/zfs_vnops.h>
85 #include <sys/dmu_objset.h>
86 #include <sys/dsl_destroy.h>
87 #include <sys/dsl_deleg.h>
89 #include <sys/mntent.h>
90 #include "zfs_namecheck.h"
93 * Two AVL trees are maintained which contain all currently automounted
94 * snapshots. Every automounted snapshot maps to a single zfs_snapentry_t
97 * - be attached to both trees, and
98 * - be unique, no duplicate entries are allowed.
100 * The zfs_snapshots_by_name tree is indexed by the full dataset name
101 * while the zfs_snapshots_by_objsetid tree is indexed by the unique
102 * objsetid. This allows for fast lookups either by name or objsetid.
104 static avl_tree_t zfs_snapshots_by_name
;
105 static avl_tree_t zfs_snapshots_by_objsetid
;
106 static krwlock_t zfs_snapshot_lock
;
109 * Control Directory Tunables (.zfs)
111 int zfs_expire_snapshot
= ZFSCTL_EXPIRE_SNAPSHOT
;
112 int zfs_admin_snapshot
= 0;
115 char *se_name
; /* full snapshot name */
116 char *se_path
; /* full mount path */
117 spa_t
*se_spa
; /* pool spa */
118 uint64_t se_objsetid
; /* snapshot objset id */
119 struct dentry
*se_root_dentry
; /* snapshot root dentry */
120 taskqid_t se_taskqid
; /* scheduled unmount taskqid */
121 avl_node_t se_node_name
; /* zfs_snapshots_by_name link */
122 avl_node_t se_node_objsetid
; /* zfs_snapshots_by_objsetid link */
123 zfs_refcount_t se_refcount
; /* reference count */
126 static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t
*se
, int delay
);
129 * Allocate a new zfs_snapentry_t being careful to make a copy of the
130 * snapshot name and provided mount point. No reference is taken.
132 static zfs_snapentry_t
*
133 zfsctl_snapshot_alloc(char *full_name
, char *full_path
, spa_t
*spa
,
134 uint64_t objsetid
, struct dentry
*root_dentry
)
138 se
= kmem_zalloc(sizeof (zfs_snapentry_t
), KM_SLEEP
);
140 se
->se_name
= strdup(full_name
);
141 se
->se_path
= strdup(full_path
);
143 se
->se_objsetid
= objsetid
;
144 se
->se_root_dentry
= root_dentry
;
145 se
->se_taskqid
= TASKQID_INVALID
;
147 zfs_refcount_create(&se
->se_refcount
);
153 * Free a zfs_snapentry_t the caller must ensure there are no active
157 zfsctl_snapshot_free(zfs_snapentry_t
*se
)
159 zfs_refcount_destroy(&se
->se_refcount
);
160 strfree(se
->se_name
);
161 strfree(se
->se_path
);
163 kmem_free(se
, sizeof (zfs_snapentry_t
));
167 * Hold a reference on the zfs_snapentry_t.
170 zfsctl_snapshot_hold(zfs_snapentry_t
*se
)
172 zfs_refcount_add(&se
->se_refcount
, NULL
);
176 * Release a reference on the zfs_snapentry_t. When the number of
177 * references drops to zero the structure will be freed.
180 zfsctl_snapshot_rele(zfs_snapentry_t
*se
)
182 if (zfs_refcount_remove(&se
->se_refcount
, NULL
) == 0)
183 zfsctl_snapshot_free(se
);
187 * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
188 * zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part
189 * of the trees a reference is held.
192 zfsctl_snapshot_add(zfs_snapentry_t
*se
)
194 ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock
));
195 zfsctl_snapshot_hold(se
);
196 avl_add(&zfs_snapshots_by_name
, se
);
197 avl_add(&zfs_snapshots_by_objsetid
, se
);
201 * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
202 * zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped,
203 * this can result in the structure being freed if that was the last
204 * remaining reference.
207 zfsctl_snapshot_remove(zfs_snapentry_t
*se
)
209 ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock
));
210 avl_remove(&zfs_snapshots_by_name
, se
);
211 avl_remove(&zfs_snapshots_by_objsetid
, se
);
212 zfsctl_snapshot_rele(se
);
216 * Snapshot name comparison function for the zfs_snapshots_by_name.
219 snapentry_compare_by_name(const void *a
, const void *b
)
221 const zfs_snapentry_t
*se_a
= a
;
222 const zfs_snapentry_t
*se_b
= b
;
225 ret
= strcmp(se_a
->se_name
, se_b
->se_name
);
236 * Snapshot objsetid comparison function for the zfs_snapshots_by_objsetid.
239 snapentry_compare_by_objsetid(const void *a
, const void *b
)
241 const zfs_snapentry_t
*se_a
= a
;
242 const zfs_snapentry_t
*se_b
= b
;
244 if (se_a
->se_spa
!= se_b
->se_spa
)
245 return ((ulong_t
)se_a
->se_spa
< (ulong_t
)se_b
->se_spa
? -1 : 1);
247 if (se_a
->se_objsetid
< se_b
->se_objsetid
)
249 else if (se_a
->se_objsetid
> se_b
->se_objsetid
)
256 * Find a zfs_snapentry_t in zfs_snapshots_by_name. If the snapname
257 * is found a pointer to the zfs_snapentry_t is returned and a reference
258 * taken on the structure. The caller is responsible for dropping the
259 * reference with zfsctl_snapshot_rele(). If the snapname is not found
260 * NULL will be returned.
262 static zfs_snapentry_t
*
263 zfsctl_snapshot_find_by_name(char *snapname
)
265 zfs_snapentry_t
*se
, search
;
267 ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock
));
269 search
.se_name
= snapname
;
270 se
= avl_find(&zfs_snapshots_by_name
, &search
, NULL
);
272 zfsctl_snapshot_hold(se
);
278 * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id
279 * rather than the snapname. In all other respects it behaves the same
280 * as zfsctl_snapshot_find_by_name().
282 static zfs_snapentry_t
*
283 zfsctl_snapshot_find_by_objsetid(spa_t
*spa
, uint64_t objsetid
)
285 zfs_snapentry_t
*se
, search
;
287 ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock
));
290 search
.se_objsetid
= objsetid
;
291 se
= avl_find(&zfs_snapshots_by_objsetid
, &search
, NULL
);
293 zfsctl_snapshot_hold(se
);
299 * Rename a zfs_snapentry_t in the zfs_snapshots_by_name. The structure is
300 * removed, renamed, and added back to the new correct location in the tree.
303 zfsctl_snapshot_rename(char *old_snapname
, char *new_snapname
)
307 ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock
));
309 se
= zfsctl_snapshot_find_by_name(old_snapname
);
311 return (SET_ERROR(ENOENT
));
313 zfsctl_snapshot_remove(se
);
314 strfree(se
->se_name
);
315 se
->se_name
= strdup(new_snapname
);
316 zfsctl_snapshot_add(se
);
317 zfsctl_snapshot_rele(se
);
323 * Delayed task responsible for unmounting an expired automounted snapshot.
326 snapentry_expire(void *data
)
328 zfs_snapentry_t
*se
= (zfs_snapentry_t
*)data
;
329 spa_t
*spa
= se
->se_spa
;
330 uint64_t objsetid
= se
->se_objsetid
;
332 if (zfs_expire_snapshot
<= 0) {
333 zfsctl_snapshot_rele(se
);
337 se
->se_taskqid
= TASKQID_INVALID
;
338 (void) zfsctl_snapshot_unmount(se
->se_name
, MNT_EXPIRE
);
339 zfsctl_snapshot_rele(se
);
342 * Reschedule the unmount if the zfs_snapentry_t wasn't removed.
343 * This can occur when the snapshot is busy.
345 rw_enter(&zfs_snapshot_lock
, RW_READER
);
346 if ((se
= zfsctl_snapshot_find_by_objsetid(spa
, objsetid
)) != NULL
) {
347 zfsctl_snapshot_unmount_delay_impl(se
, zfs_expire_snapshot
);
348 zfsctl_snapshot_rele(se
);
350 rw_exit(&zfs_snapshot_lock
);
354 * Cancel an automatic unmount of a snapname. This callback is responsible
355 * for dropping the reference on the zfs_snapentry_t which was taken when
359 zfsctl_snapshot_unmount_cancel(zfs_snapentry_t
*se
)
361 if (taskq_cancel_id(system_delay_taskq
, se
->se_taskqid
) == 0) {
362 se
->se_taskqid
= TASKQID_INVALID
;
363 zfsctl_snapshot_rele(se
);
368 * Dispatch the unmount task for delayed handling with a hold protecting it.
371 zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t
*se
, int delay
)
373 ASSERT3S(se
->se_taskqid
, ==, TASKQID_INVALID
);
378 zfsctl_snapshot_hold(se
);
379 se
->se_taskqid
= taskq_dispatch_delay(system_delay_taskq
,
380 snapentry_expire
, se
, TQ_SLEEP
, ddi_get_lbolt() + delay
* HZ
);
384 * Schedule an automatic unmount of objset id to occur in delay seconds from
385 * now. Any previous delayed unmount will be cancelled in favor of the
386 * updated deadline. A reference is taken by zfsctl_snapshot_find_by_objsetid()
387 * and held until the outstanding task is handled or cancelled.
390 zfsctl_snapshot_unmount_delay(spa_t
*spa
, uint64_t objsetid
, int delay
)
395 rw_enter(&zfs_snapshot_lock
, RW_READER
);
396 if ((se
= zfsctl_snapshot_find_by_objsetid(spa
, objsetid
)) != NULL
) {
397 zfsctl_snapshot_unmount_cancel(se
);
398 zfsctl_snapshot_unmount_delay_impl(se
, delay
);
399 zfsctl_snapshot_rele(se
);
402 rw_exit(&zfs_snapshot_lock
);
408 * Check if snapname is currently mounted. Returned non-zero when mounted
409 * and zero when unmounted.
412 zfsctl_snapshot_ismounted(char *snapname
)
415 boolean_t ismounted
= B_FALSE
;
417 rw_enter(&zfs_snapshot_lock
, RW_READER
);
418 if ((se
= zfsctl_snapshot_find_by_name(snapname
)) != NULL
) {
419 zfsctl_snapshot_rele(se
);
422 rw_exit(&zfs_snapshot_lock
);
428 * Check if the given inode is a part of the virtual .zfs directory.
431 zfsctl_is_node(struct inode
*ip
)
433 return (ITOZ(ip
)->z_is_ctldir
);
437 * Check if the given inode is a .zfs/snapshots/snapname directory.
440 zfsctl_is_snapdir(struct inode
*ip
)
442 return (zfsctl_is_node(ip
) && (ip
->i_ino
<= ZFSCTL_INO_SNAPDIRS
));
446 * Allocate a new inode with the passed id and ops.
448 static struct inode
*
449 zfsctl_inode_alloc(zfsvfs_t
*zfsvfs
, uint64_t id
,
450 const struct file_operations
*fops
, const struct inode_operations
*ops
)
452 inode_timespec_t now
;
456 ip
= new_inode(zfsvfs
->z_sb
);
460 now
= current_time(ip
);
462 ASSERT3P(zp
->z_dirlocks
, ==, NULL
);
463 ASSERT3P(zp
->z_acl_cached
, ==, NULL
);
464 ASSERT3P(zp
->z_xattr_cached
, ==, NULL
);
466 zp
->z_unlinked
= B_FALSE
;
467 zp
->z_atime_dirty
= B_FALSE
;
468 zp
->z_zn_prefetch
= B_FALSE
;
469 zp
->z_moved
= B_FALSE
;
470 zp
->z_is_sa
= B_FALSE
;
471 zp
->z_is_mapped
= B_FALSE
;
472 zp
->z_is_ctldir
= B_TRUE
;
473 zp
->z_is_stale
= B_FALSE
;
482 ip
->i_generation
= 0;
484 ip
->i_mode
= (S_IFDIR
| S_IRWXUGO
);
485 ip
->i_uid
= SUID_TO_KUID(0);
486 ip
->i_gid
= SGID_TO_KGID(0);
487 ip
->i_blkbits
= SPA_MINBLOCKSHIFT
;
493 #if defined(IOP_XATTR)
494 ip
->i_opflags
&= ~IOP_XATTR
;
497 if (insert_inode_locked(ip
)) {
498 unlock_new_inode(ip
);
503 mutex_enter(&zfsvfs
->z_znodes_lock
);
504 list_insert_tail(&zfsvfs
->z_all_znodes
, zp
);
505 zfsvfs
->z_nr_znodes
++;
507 mutex_exit(&zfsvfs
->z_znodes_lock
);
509 unlock_new_inode(ip
);
515 * Lookup the inode with given id, it will be allocated if needed.
517 static struct inode
*
518 zfsctl_inode_lookup(zfsvfs_t
*zfsvfs
, uint64_t id
,
519 const struct file_operations
*fops
, const struct inode_operations
*ops
)
521 struct inode
*ip
= NULL
;
524 ip
= ilookup(zfsvfs
->z_sb
, (unsigned long)id
);
528 /* May fail due to concurrent zfsctl_inode_alloc() */
529 ip
= zfsctl_inode_alloc(zfsvfs
, id
, fops
, ops
);
536 * Create the '.zfs' directory. This directory is cached as part of the VFS
537 * structure. This results in a hold on the zfsvfs_t. The code in zfs_umount()
538 * therefore checks against a vfs_count of 2 instead of 1. This reference
539 * is removed when the ctldir is destroyed in the unmount. All other entities
540 * under the '.zfs' directory are created dynamically as needed.
542 * Because the dynamically created '.zfs' directory entries assume the use
543 * of 64-bit inode numbers this support must be disabled on 32-bit systems.
546 zfsctl_create(zfsvfs_t
*zfsvfs
)
548 ASSERT(zfsvfs
->z_ctldir
== NULL
);
550 zfsvfs
->z_ctldir
= zfsctl_inode_alloc(zfsvfs
, ZFSCTL_INO_ROOT
,
551 &zpl_fops_root
, &zpl_ops_root
);
552 if (zfsvfs
->z_ctldir
== NULL
)
553 return (SET_ERROR(ENOENT
));
559 * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name.
560 * Only called when the filesystem is unmounted.
563 zfsctl_destroy(zfsvfs_t
*zfsvfs
)
565 if (zfsvfs
->z_issnap
) {
567 spa_t
*spa
= zfsvfs
->z_os
->os_spa
;
568 uint64_t objsetid
= dmu_objset_id(zfsvfs
->z_os
);
570 rw_enter(&zfs_snapshot_lock
, RW_WRITER
);
571 se
= zfsctl_snapshot_find_by_objsetid(spa
, objsetid
);
573 zfsctl_snapshot_remove(se
);
574 rw_exit(&zfs_snapshot_lock
);
576 zfsctl_snapshot_unmount_cancel(se
);
577 zfsctl_snapshot_rele(se
);
579 } else if (zfsvfs
->z_ctldir
) {
580 iput(zfsvfs
->z_ctldir
);
581 zfsvfs
->z_ctldir
= NULL
;
586 * Given a root znode, retrieve the associated .zfs directory.
587 * Add a hold to the vnode and return it.
590 zfsctl_root(znode_t
*zp
)
592 ASSERT(zfs_has_ctldir(zp
));
593 igrab(ZTOZSB(zp
)->z_ctldir
);
594 return (ZTOZSB(zp
)->z_ctldir
);
598 * Generate a long fid to indicate a snapdir. We encode whether snapdir is
599 * already mounted in gen field. We do this because nfsd lookup will not
600 * trigger automount. Next time the nfsd does fh_to_dentry, we will notice
601 * this and do automount and return ESTALE to force nfsd revalidate and follow
605 zfsctl_snapdir_fid(struct inode
*ip
, fid_t
*fidp
)
607 zfid_short_t
*zfid
= (zfid_short_t
*)fidp
;
608 zfid_long_t
*zlfid
= (zfid_long_t
*)fidp
;
613 struct dentry
*dentry
;
615 if (fidp
->fid_len
< LONG_FID_LEN
) {
616 fidp
->fid_len
= LONG_FID_LEN
;
617 return (SET_ERROR(ENOSPC
));
621 objsetid
= ZFSCTL_INO_SNAPDIRS
- ip
->i_ino
;
622 zfid
->zf_len
= LONG_FID_LEN
;
624 dentry
= d_obtain_alias(igrab(ip
));
625 if (!IS_ERR(dentry
)) {
626 gen
= !!d_mountpoint(dentry
);
630 for (i
= 0; i
< sizeof (zfid
->zf_object
); i
++)
631 zfid
->zf_object
[i
] = (uint8_t)(object
>> (8 * i
));
633 for (i
= 0; i
< sizeof (zfid
->zf_gen
); i
++)
634 zfid
->zf_gen
[i
] = (uint8_t)(gen
>> (8 * i
));
636 for (i
= 0; i
< sizeof (zlfid
->zf_setid
); i
++)
637 zlfid
->zf_setid
[i
] = (uint8_t)(objsetid
>> (8 * i
));
639 for (i
= 0; i
< sizeof (zlfid
->zf_setgen
); i
++)
640 zlfid
->zf_setgen
[i
] = 0;
646 * Generate an appropriate fid for an entry in the .zfs directory.
649 zfsctl_fid(struct inode
*ip
, fid_t
*fidp
)
651 znode_t
*zp
= ITOZ(ip
);
652 zfsvfs_t
*zfsvfs
= ITOZSB(ip
);
653 uint64_t object
= zp
->z_id
;
659 if (zfsctl_is_snapdir(ip
)) {
661 return (zfsctl_snapdir_fid(ip
, fidp
));
664 if (fidp
->fid_len
< SHORT_FID_LEN
) {
665 fidp
->fid_len
= SHORT_FID_LEN
;
667 return (SET_ERROR(ENOSPC
));
670 zfid
= (zfid_short_t
*)fidp
;
672 zfid
->zf_len
= SHORT_FID_LEN
;
674 for (i
= 0; i
< sizeof (zfid
->zf_object
); i
++)
675 zfid
->zf_object
[i
] = (uint8_t)(object
>> (8 * i
));
677 /* .zfs znodes always have a generation number of 0 */
678 for (i
= 0; i
< sizeof (zfid
->zf_gen
); i
++)
686 * Construct a full dataset name in full_name: "pool/dataset@snap_name"
689 zfsctl_snapshot_name(zfsvfs_t
*zfsvfs
, const char *snap_name
, int len
,
692 objset_t
*os
= zfsvfs
->z_os
;
694 if (zfs_component_namecheck(snap_name
, NULL
, NULL
) != 0)
695 return (SET_ERROR(EILSEQ
));
697 dmu_objset_name(os
, full_name
);
698 if ((strlen(full_name
) + 1 + strlen(snap_name
)) >= len
)
699 return (SET_ERROR(ENAMETOOLONG
));
701 (void) strcat(full_name
, "@");
702 (void) strcat(full_name
, snap_name
);
708 * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/"
711 zfsctl_snapshot_path_objset(zfsvfs_t
*zfsvfs
, uint64_t objsetid
,
712 int path_len
, char *full_path
)
714 objset_t
*os
= zfsvfs
->z_os
;
715 fstrans_cookie_t cookie
;
717 boolean_t case_conflict
;
718 uint64_t id
, pos
= 0;
721 if (zfsvfs
->z_vfs
->vfs_mntpoint
== NULL
)
722 return (SET_ERROR(ENOENT
));
724 cookie
= spl_fstrans_mark();
725 snapname
= kmem_alloc(ZFS_MAX_DATASET_NAME_LEN
, KM_SLEEP
);
728 dsl_pool_config_enter(dmu_objset_pool(os
), FTAG
);
729 error
= dmu_snapshot_list_next(zfsvfs
->z_os
,
730 ZFS_MAX_DATASET_NAME_LEN
, snapname
, &id
, &pos
,
732 dsl_pool_config_exit(dmu_objset_pool(os
), FTAG
);
740 snprintf(full_path
, path_len
, "%s/.zfs/snapshot/%s",
741 zfsvfs
->z_vfs
->vfs_mntpoint
, snapname
);
743 kmem_free(snapname
, ZFS_MAX_DATASET_NAME_LEN
);
744 spl_fstrans_unmark(cookie
);
750 * Special case the handling of "..".
753 zfsctl_root_lookup(struct inode
*dip
, char *name
, struct inode
**ipp
,
754 int flags
, cred_t
*cr
, int *direntflags
, pathname_t
*realpnp
)
756 zfsvfs_t
*zfsvfs
= ITOZSB(dip
);
761 if (strcmp(name
, "..") == 0) {
762 *ipp
= dip
->i_sb
->s_root
->d_inode
;
763 } else if (strcmp(name
, ZFS_SNAPDIR_NAME
) == 0) {
764 *ipp
= zfsctl_inode_lookup(zfsvfs
, ZFSCTL_INO_SNAPDIR
,
765 &zpl_fops_snapdir
, &zpl_ops_snapdir
);
766 } else if (strcmp(name
, ZFS_SHAREDIR_NAME
) == 0) {
767 *ipp
= zfsctl_inode_lookup(zfsvfs
, ZFSCTL_INO_SHARES
,
768 &zpl_fops_shares
, &zpl_ops_shares
);
774 error
= SET_ERROR(ENOENT
);
782 * Lookup entry point for the 'snapshot' directory. Try to open the
783 * snapshot if it exists, creating the pseudo filesystem inode as necessary.
786 zfsctl_snapdir_lookup(struct inode
*dip
, char *name
, struct inode
**ipp
,
787 int flags
, cred_t
*cr
, int *direntflags
, pathname_t
*realpnp
)
789 zfsvfs_t
*zfsvfs
= ITOZSB(dip
);
795 error
= dmu_snapshot_lookup(zfsvfs
->z_os
, name
, &id
);
801 *ipp
= zfsctl_inode_lookup(zfsvfs
, ZFSCTL_INO_SNAPDIRS
- id
,
802 &simple_dir_operations
, &simple_dir_inode_operations
);
804 error
= SET_ERROR(ENOENT
);
812 * Renaming a directory under '.zfs/snapshot' will automatically trigger
813 * a rename of the snapshot to the new given name. The rename is confined
814 * to the '.zfs/snapshot' directory; snapshots cannot be moved elsewhere.
817 zfsctl_snapdir_rename(struct inode
*sdip
, char *snm
,
818 struct inode
*tdip
, char *tnm
, cred_t
*cr
, int flags
)
820 zfsvfs_t
*zfsvfs
= ITOZSB(sdip
);
821 char *to
, *from
, *real
, *fsname
;
824 if (!zfs_admin_snapshot
)
825 return (SET_ERROR(EACCES
));
829 to
= kmem_alloc(ZFS_MAX_DATASET_NAME_LEN
, KM_SLEEP
);
830 from
= kmem_alloc(ZFS_MAX_DATASET_NAME_LEN
, KM_SLEEP
);
831 real
= kmem_alloc(ZFS_MAX_DATASET_NAME_LEN
, KM_SLEEP
);
832 fsname
= kmem_alloc(ZFS_MAX_DATASET_NAME_LEN
, KM_SLEEP
);
834 if (zfsvfs
->z_case
== ZFS_CASE_INSENSITIVE
) {
835 error
= dmu_snapshot_realname(zfsvfs
->z_os
, snm
, real
,
836 ZFS_MAX_DATASET_NAME_LEN
, NULL
);
839 } else if (error
!= ENOTSUP
) {
844 dmu_objset_name(zfsvfs
->z_os
, fsname
);
846 error
= zfsctl_snapshot_name(ITOZSB(sdip
), snm
,
847 ZFS_MAX_DATASET_NAME_LEN
, from
);
849 error
= zfsctl_snapshot_name(ITOZSB(tdip
), tnm
,
850 ZFS_MAX_DATASET_NAME_LEN
, to
);
852 error
= zfs_secpolicy_rename_perms(from
, to
, cr
);
857 * Cannot move snapshots out of the snapdir.
860 error
= SET_ERROR(EINVAL
);
865 * No-op when names are identical.
867 if (strcmp(snm
, tnm
) == 0) {
872 rw_enter(&zfs_snapshot_lock
, RW_WRITER
);
874 error
= dsl_dataset_rename_snapshot(fsname
, snm
, tnm
, B_FALSE
);
876 (void) zfsctl_snapshot_rename(snm
, tnm
);
878 rw_exit(&zfs_snapshot_lock
);
880 kmem_free(from
, ZFS_MAX_DATASET_NAME_LEN
);
881 kmem_free(to
, ZFS_MAX_DATASET_NAME_LEN
);
882 kmem_free(real
, ZFS_MAX_DATASET_NAME_LEN
);
883 kmem_free(fsname
, ZFS_MAX_DATASET_NAME_LEN
);
891 * Removing a directory under '.zfs/snapshot' will automatically trigger
892 * the removal of the snapshot with the given name.
895 zfsctl_snapdir_remove(struct inode
*dip
, char *name
, cred_t
*cr
, int flags
)
897 zfsvfs_t
*zfsvfs
= ITOZSB(dip
);
898 char *snapname
, *real
;
901 if (!zfs_admin_snapshot
)
902 return (SET_ERROR(EACCES
));
906 snapname
= kmem_alloc(ZFS_MAX_DATASET_NAME_LEN
, KM_SLEEP
);
907 real
= kmem_alloc(ZFS_MAX_DATASET_NAME_LEN
, KM_SLEEP
);
909 if (zfsvfs
->z_case
== ZFS_CASE_INSENSITIVE
) {
910 error
= dmu_snapshot_realname(zfsvfs
->z_os
, name
, real
,
911 ZFS_MAX_DATASET_NAME_LEN
, NULL
);
914 } else if (error
!= ENOTSUP
) {
919 error
= zfsctl_snapshot_name(ITOZSB(dip
), name
,
920 ZFS_MAX_DATASET_NAME_LEN
, snapname
);
922 error
= zfs_secpolicy_destroy_perms(snapname
, cr
);
926 error
= zfsctl_snapshot_unmount(snapname
, MNT_FORCE
);
927 if ((error
== 0) || (error
== ENOENT
))
928 error
= dsl_destroy_snapshot(snapname
, B_FALSE
);
930 kmem_free(snapname
, ZFS_MAX_DATASET_NAME_LEN
);
931 kmem_free(real
, ZFS_MAX_DATASET_NAME_LEN
);
939 * Creating a directory under '.zfs/snapshot' will automatically trigger
940 * the creation of a new snapshot with the given name.
943 zfsctl_snapdir_mkdir(struct inode
*dip
, char *dirname
, vattr_t
*vap
,
944 struct inode
**ipp
, cred_t
*cr
, int flags
)
946 zfsvfs_t
*zfsvfs
= ITOZSB(dip
);
950 if (!zfs_admin_snapshot
)
951 return (SET_ERROR(EACCES
));
953 dsname
= kmem_alloc(ZFS_MAX_DATASET_NAME_LEN
, KM_SLEEP
);
955 if (zfs_component_namecheck(dirname
, NULL
, NULL
) != 0) {
956 error
= SET_ERROR(EILSEQ
);
960 dmu_objset_name(zfsvfs
->z_os
, dsname
);
962 error
= zfs_secpolicy_snapshot_perms(dsname
, cr
);
967 error
= dmu_objset_snapshot_one(dsname
, dirname
);
971 error
= zfsctl_snapdir_lookup(dip
, dirname
, ipp
,
975 kmem_free(dsname
, ZFS_MAX_DATASET_NAME_LEN
);
981 * Attempt to unmount a snapshot by making a call to user space.
982 * There is no assurance that this can or will succeed, it is just a
983 * best effort. In the case where it does fail, perhaps because
984 * it's in use, the unmount will fail harmlessly.
987 zfsctl_snapshot_unmount(char *snapname
, int flags
)
989 char *argv
[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL
,
991 char *envp
[] = { NULL
};
995 rw_enter(&zfs_snapshot_lock
, RW_READER
);
996 if ((se
= zfsctl_snapshot_find_by_name(snapname
)) == NULL
) {
997 rw_exit(&zfs_snapshot_lock
);
998 return (SET_ERROR(ENOENT
));
1000 rw_exit(&zfs_snapshot_lock
);
1002 if (flags
& MNT_FORCE
)
1004 argv
[5] = se
->se_path
;
1005 dprintf("unmount; path=%s\n", se
->se_path
);
1006 error
= call_usermodehelper(argv
[0], argv
, envp
, UMH_WAIT_PROC
);
1007 zfsctl_snapshot_rele(se
);
1011 * The umount system utility will return 256 on error. We must
1012 * assume this error is because the file system is busy so it is
1013 * converted to the more sensible EBUSY.
1016 error
= SET_ERROR(EBUSY
);
1022 zfsctl_snapshot_mount(struct path
*path
, int flags
)
1024 struct dentry
*dentry
= path
->dentry
;
1025 struct inode
*ip
= dentry
->d_inode
;
1027 zfsvfs_t
*snap_zfsvfs
;
1028 zfs_snapentry_t
*se
;
1029 char *full_name
, *full_path
;
1030 char *argv
[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL
, NULL
,
1032 char *envp
[] = { NULL
};
1037 return (SET_ERROR(EISDIR
));
1039 zfsvfs
= ITOZSB(ip
);
1042 full_name
= kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN
, KM_SLEEP
);
1043 full_path
= kmem_zalloc(MAXPATHLEN
, KM_SLEEP
);
1045 error
= zfsctl_snapshot_name(zfsvfs
, dname(dentry
),
1046 ZFS_MAX_DATASET_NAME_LEN
, full_name
);
1051 * Construct a mount point path from sb of the ctldir inode and dirent
1052 * name, instead of from d_path(), so that chroot'd process doesn't fail
1055 snprintf(full_path
, MAXPATHLEN
, "%s/.zfs/snapshot/%s",
1056 zfsvfs
->z_vfs
->vfs_mntpoint
, dname(dentry
));
1059 * Multiple concurrent automounts of a snapshot are never allowed.
1060 * The snapshot may be manually mounted as many times as desired.
1062 if (zfsctl_snapshot_ismounted(full_name
)) {
1068 * Attempt to mount the snapshot from user space. Normally this
1069 * would be done using the vfs_kern_mount() function, however that
1070 * function is marked GPL-only and cannot be used. On error we are
1071 * careful to log the real error to the console and return EISDIR
1072 * to safely abort the automount. This should be very rare.
1074 * If the user mode helper happens to return EBUSY, a concurrent
1075 * mount is already in progress in which case the error is ignored.
1076 * Take note that if the program was executed successfully the return
1077 * value from call_usermodehelper() will be (exitcode << 8 + signal).
1079 dprintf("mount; name=%s path=%s\n", full_name
, full_path
);
1080 argv
[5] = full_name
;
1081 argv
[6] = full_path
;
1082 error
= call_usermodehelper(argv
[0], argv
, envp
, UMH_WAIT_PROC
);
1084 if (!(error
& MOUNT_BUSY
<< 8)) {
1085 zfs_dbgmsg("Unable to automount %s error=%d",
1087 error
= SET_ERROR(EISDIR
);
1090 * EBUSY, this could mean a concurrent mount, or the
1091 * snapshot has already been mounted at completely
1092 * different place. We return 0 so VFS will retry. For
1093 * the latter case the VFS will retry several times
1094 * and return ELOOP, which is probably not a very good
1103 * Follow down in to the mounted snapshot and set MNT_SHRINKABLE
1104 * to identify this as an automounted filesystem.
1108 if (zpl_follow_down_one(&spath
)) {
1109 snap_zfsvfs
= ITOZSB(spath
.dentry
->d_inode
);
1110 snap_zfsvfs
->z_parent
= zfsvfs
;
1111 dentry
= spath
.dentry
;
1112 spath
.mnt
->mnt_flags
|= MNT_SHRINKABLE
;
1114 rw_enter(&zfs_snapshot_lock
, RW_WRITER
);
1115 se
= zfsctl_snapshot_alloc(full_name
, full_path
,
1116 snap_zfsvfs
->z_os
->os_spa
, dmu_objset_id(snap_zfsvfs
->z_os
),
1118 zfsctl_snapshot_add(se
);
1119 zfsctl_snapshot_unmount_delay_impl(se
, zfs_expire_snapshot
);
1120 rw_exit(&zfs_snapshot_lock
);
1124 kmem_free(full_name
, ZFS_MAX_DATASET_NAME_LEN
);
1125 kmem_free(full_path
, MAXPATHLEN
);
1133 * Get the snapdir inode from fid
1136 zfsctl_snapdir_vget(struct super_block
*sb
, uint64_t objsetid
, int gen
,
1142 struct dentry
*dentry
;
1144 mnt
= kmem_alloc(MAXPATHLEN
, KM_SLEEP
);
1146 error
= zfsctl_snapshot_path_objset(sb
->s_fs_info
, objsetid
,
1151 /* Trigger automount */
1152 error
= -kern_path(mnt
, LOOKUP_FOLLOW
|LOOKUP_DIRECTORY
, &path
);
1158 * Get the snapdir inode. Note, we don't want to use the above
1159 * path because it contains the root of the snapshot rather
1162 *ipp
= ilookup(sb
, ZFSCTL_INO_SNAPDIRS
- objsetid
);
1164 error
= SET_ERROR(ENOENT
);
1168 /* check gen, see zfsctl_snapdir_fid */
1169 dentry
= d_obtain_alias(igrab(*ipp
));
1170 if (gen
!= (!IS_ERR(dentry
) && d_mountpoint(dentry
))) {
1173 error
= SET_ERROR(ENOENT
);
1175 if (!IS_ERR(dentry
))
1178 kmem_free(mnt
, MAXPATHLEN
);
1183 zfsctl_shares_lookup(struct inode
*dip
, char *name
, struct inode
**ipp
,
1184 int flags
, cred_t
*cr
, int *direntflags
, pathname_t
*realpnp
)
1186 zfsvfs_t
*zfsvfs
= ITOZSB(dip
);
1193 if (zfsvfs
->z_shares_dir
== 0) {
1195 return (SET_ERROR(ENOTSUP
));
1198 if ((error
= zfs_zget(zfsvfs
, zfsvfs
->z_shares_dir
, &dzp
)) == 0) {
1199 error
= zfs_lookup(ZTOI(dzp
), name
, &ip
, 0, cr
, NULL
, NULL
);
1209 * Initialize the various pieces we'll need to create and manipulate .zfs
1210 * directories. Currently this is unused but available.
1215 avl_create(&zfs_snapshots_by_name
, snapentry_compare_by_name
,
1216 sizeof (zfs_snapentry_t
), offsetof(zfs_snapentry_t
,
1218 avl_create(&zfs_snapshots_by_objsetid
, snapentry_compare_by_objsetid
,
1219 sizeof (zfs_snapentry_t
), offsetof(zfs_snapentry_t
,
1221 rw_init(&zfs_snapshot_lock
, NULL
, RW_DEFAULT
, NULL
);
1225 * Cleanup the various pieces we needed for .zfs directories. In particular
1226 * ensure the expiry timer is canceled safely.
1231 avl_destroy(&zfs_snapshots_by_name
);
1232 avl_destroy(&zfs_snapshots_by_objsetid
);
1233 rw_destroy(&zfs_snapshot_lock
);
1236 module_param(zfs_admin_snapshot
, int, 0644);
1237 MODULE_PARM_DESC(zfs_admin_snapshot
, "Enable mkdir/rmdir/mv in .zfs/snapshot");
1239 module_param(zfs_expire_snapshot
, int, 0644);
1240 MODULE_PARM_DESC(zfs_expire_snapshot
, "Seconds to expire .zfs/snapshot");