]> git.proxmox.com Git - mirror_zfs.git/blob - module/zfs/zfs_ctldir.c
Fix snapshot automount behavior when concurrent or fail
[mirror_zfs.git] / module / zfs / zfs_ctldir.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 *
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
25 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
26 * LLNL-CODE-403049.
27 * Rewritten for Linux by:
28 * Rohan Puri <rohan.puri15@gmail.com>
29 * Brian Behlendorf <behlendorf1@llnl.gov>
30 * Copyright (c) 2013 by Delphix. All rights reserved.
31 */
32
33 /*
34 * ZFS control directory (a.k.a. ".zfs")
35 *
36 * This directory provides a common location for all ZFS meta-objects.
37 * Currently, this is only the 'snapshot' and 'shares' directory, but this may
38 * expand in the future. The elements are built dynamically, as the hierarchy
39 * does not actually exist on disk.
40 *
41 * For 'snapshot', we don't want to have all snapshots always mounted, because
42 * this would take up a huge amount of space in /etc/mnttab. We have three
43 * types of objects:
44 *
45 * ctldir ------> snapshotdir -------> snapshot
46 * |
47 * |
48 * V
49 * mounted fs
50 *
51 * The 'snapshot' node contains just enough information to lookup '..' and act
52 * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
53 * perform an automount of the underlying filesystem and return the
54 * corresponding inode.
55 *
56 * All mounts are handled automatically by a user mode helper which invokes
57 * the mount procedure. Unmounts are handled by allowing the mount
58 * point to expire so the kernel may automatically unmount it.
59 *
60 * The '.zfs', '.zfs/snapshot', and all directories created under
61 * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the
62 * same zfs_sb_t as the head filesystem (what '.zfs' lives under).
63 *
64 * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
65 * (ie: snapshots) are complete ZFS filesystems and have their own unique
66 * zfs_sb_t. However, the fsid reported by these mounts will be the same
67 * as that used by the parent zfs_sb_t to make NFS happy.
68 */
69
70 #include <sys/types.h>
71 #include <sys/param.h>
72 #include <sys/time.h>
73 #include <sys/systm.h>
74 #include <sys/sysmacros.h>
75 #include <sys/pathname.h>
76 #include <sys/vfs.h>
77 #include <sys/vfs_opreg.h>
78 #include <sys/zfs_ctldir.h>
79 #include <sys/zfs_ioctl.h>
80 #include <sys/zfs_vfsops.h>
81 #include <sys/zfs_vnops.h>
82 #include <sys/stat.h>
83 #include <sys/dmu.h>
84 #include <sys/dsl_destroy.h>
85 #include <sys/dsl_deleg.h>
86 #include <sys/mount.h>
87 #include <sys/zpl.h>
88 #include "zfs_namecheck.h"
89
90 /*
91 * Two AVL trees are maintained which contain all currently automounted
92 * snapshots. Every automounted snapshots maps to a single zfs_snapentry_t
93 * entry which MUST:
94 *
95 * - be attached to both trees, and
96 * - be unique, no duplicate entries are allowed.
97 *
98 * The zfs_snapshots_by_name tree is indexed by the full dataset name
99 * while the zfs_snapshots_by_objsetid tree is indexed by the unique
100 * objsetid. This allows for fast lookups either by name or objsetid.
101 */
102 static avl_tree_t zfs_snapshots_by_name;
103 static avl_tree_t zfs_snapshots_by_objsetid;
104 static kmutex_t zfs_snapshot_lock;
105
106 /*
107 * Control Directory Tunables (.zfs)
108 */
109 int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
110 int zfs_admin_snapshot = 0;
111
112 /*
113 * Dedicated task queue for unmounting snapshots.
114 */
115 static taskq_t *zfs_expire_taskq;
116
117 typedef struct {
118 char *se_name; /* full snapshot name */
119 char *se_path; /* full mount path */
120 uint64_t se_objsetid; /* snapshot objset id */
121 struct dentry *se_root_dentry; /* snapshot root dentry */
122 taskqid_t se_taskqid; /* scheduled unmount taskqid */
123 avl_node_t se_node_name; /* zfs_snapshots_by_name link */
124 avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */
125 refcount_t se_refcount; /* reference count */
126 } zfs_snapentry_t;
127
128 static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);
129
130 /*
131 * Allocate a new zfs_snapentry_t being careful to make a copy of the
132 * the snapshot name and provided mount point. No reference is taken.
133 */
134 static zfs_snapentry_t *
135 zfsctl_snapshot_alloc(char *full_name, char *full_path, uint64_t objsetid,
136 struct dentry *root_dentry)
137 {
138 zfs_snapentry_t *se;
139
140 se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP);
141
142 se->se_name = strdup(full_name);
143 se->se_path = strdup(full_path);
144 se->se_objsetid = objsetid;
145 se->se_root_dentry = root_dentry;
146 se->se_taskqid = -1;
147
148 refcount_create(&se->se_refcount);
149
150 return (se);
151 }
152
153 /*
154 * Free a zfs_snapentry_t the called must ensure there are no active
155 * references.
156 */
157 static void
158 zfsctl_snapshot_free(zfs_snapentry_t *se)
159 {
160 refcount_destroy(&se->se_refcount);
161 strfree(se->se_name);
162 strfree(se->se_path);
163
164 kmem_free(se, sizeof (zfs_snapentry_t));
165 }
166
167 /*
168 * Hold a reference on the zfs_snapentry_t.
169 */
170 static void
171 zfsctl_snapshot_hold(zfs_snapentry_t *se)
172 {
173 refcount_add(&se->se_refcount, NULL);
174 }
175
176 /*
177 * Release a reference on the zfs_snapentry_t. When the number of
178 * references drops to zero the structure will be freed.
179 */
180 static void
181 zfsctl_snapshot_rele(zfs_snapentry_t *se)
182 {
183 if (refcount_remove(&se->se_refcount, NULL) == 0)
184 zfsctl_snapshot_free(se);
185 }
186
187 /*
188 * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
189 * zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part
190 * of the trees a reference is held.
191 */
192 static void
193 zfsctl_snapshot_add(zfs_snapentry_t *se)
194 {
195 ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
196 refcount_add(&se->se_refcount, NULL);
197 avl_add(&zfs_snapshots_by_name, se);
198 avl_add(&zfs_snapshots_by_objsetid, se);
199 }
200
201 /*
202 * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
203 * zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped,
204 * this can result in the structure being freed if that was the last
205 * remaining reference.
206 */
207 static void
208 zfsctl_snapshot_remove(zfs_snapentry_t *se)
209 {
210 ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
211 avl_remove(&zfs_snapshots_by_name, se);
212 avl_remove(&zfs_snapshots_by_objsetid, se);
213 zfsctl_snapshot_rele(se);
214 }
215
216 /*
217 * Snapshot name comparison function for the zfs_snapshots_by_name.
218 */
219 static int
220 snapentry_compare_by_name(const void *a, const void *b)
221 {
222 const zfs_snapentry_t *se_a = a;
223 const zfs_snapentry_t *se_b = b;
224 int ret;
225
226 ret = strcmp(se_a->se_name, se_b->se_name);
227
228 if (ret < 0)
229 return (-1);
230 else if (ret > 0)
231 return (1);
232 else
233 return (0);
234 }
235
236 /*
237 * Snapshot name comparison function for the zfs_snapshots_by_objsetid.
238 */
239 static int
240 snapentry_compare_by_objsetid(const void *a, const void *b)
241 {
242 const zfs_snapentry_t *se_a = a;
243 const zfs_snapentry_t *se_b = b;
244
245 if (se_a->se_objsetid < se_b->se_objsetid)
246 return (-1);
247 else if (se_a->se_objsetid > se_b->se_objsetid)
248 return (1);
249 else
250 return (0);
251 }
252
253 /*
254 * Find a zfs_snapentry_t in zfs_snapshots_by_name. If the snapname
255 * is found a pointer to the zfs_snapentry_t is returned and a reference
256 * taken on the structure. The caller is responsible for dropping the
257 * reference with zfsctl_snapshot_rele(). If the snapname is not found
258 * NULL will be returned.
259 */
260 static zfs_snapentry_t *
261 zfsctl_snapshot_find_by_name(char *snapname)
262 {
263 zfs_snapentry_t *se, search;
264
265 ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
266
267 search.se_name = snapname;
268 se = avl_find(&zfs_snapshots_by_name, &search, NULL);
269 if (se)
270 refcount_add(&se->se_refcount, NULL);
271
272 return (se);
273 }
274
275 /*
276 * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id
277 * rather than the snapname. In all other respects it behaves the same
278 * as zfsctl_snapshot_find_by_name().
279 */
280 static zfs_snapentry_t *
281 zfsctl_snapshot_find_by_objsetid(uint64_t objsetid)
282 {
283 zfs_snapentry_t *se, search;
284
285 ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
286
287 search.se_objsetid = objsetid;
288 se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL);
289 if (se)
290 refcount_add(&se->se_refcount, NULL);
291
292 return (se);
293 }
294
295 /*
296 * Rename a zfs_snapentry_t in the zfs_snapshots_by_name. The structure is
297 * removed, renamed, and added back to the new correct location in the tree.
298 */
299 static int
300 zfsctl_snapshot_rename(char *old_snapname, char *new_snapname)
301 {
302 zfs_snapentry_t *se;
303
304 ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
305
306 se = zfsctl_snapshot_find_by_name(old_snapname);
307 if (se == NULL)
308 return (ENOENT);
309
310 zfsctl_snapshot_remove(se);
311 strfree(se->se_name);
312 se->se_name = strdup(new_snapname);
313 zfsctl_snapshot_add(se);
314 zfsctl_snapshot_rele(se);
315
316 return (0);
317 }
318
319 /*
320 * Delayed task responsible for unmounting an expired automounted snapshot.
321 */
322 static void
323 snapentry_expire(void *data)
324 {
325 zfs_snapentry_t *se = (zfs_snapentry_t *)data;
326 uint64_t objsetid = se->se_objsetid;
327
328 se->se_taskqid = -1;
329 (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
330 zfsctl_snapshot_rele(se);
331
332 /*
333 * Reschedule the unmount if the zfs_snapentry_t wasn't removed.
334 * This can occur when the snapshot is busy.
335 */
336 mutex_enter(&zfs_snapshot_lock);
337 if ((se = zfsctl_snapshot_find_by_objsetid(objsetid)) != NULL) {
338 zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
339 zfsctl_snapshot_rele(se);
340 }
341 mutex_exit(&zfs_snapshot_lock);
342 }
343
344 /*
345 * Cancel an automatic unmount of a snapname. This callback is responsible
346 * for dropping the reference on the zfs_snapentry_t which was taken when
347 * during dispatch.
348 */
349 static void
350 zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
351 {
352 ASSERT(MUTEX_HELD(&zfs_snapshot_lock));
353
354 if (taskq_cancel_id(zfs_expire_taskq, se->se_taskqid) == 0) {
355 se->se_taskqid = -1;
356 zfsctl_snapshot_rele(se);
357 }
358 }
359
360 /*
361 * Dispatch the unmount task for delayed handling with a hold protecting it.
362 */
363 static void
364 zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
365 {
366 ASSERT3S(se->se_taskqid, ==, -1);
367
368 se->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq,
369 snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
370 zfsctl_snapshot_hold(se);
371 }
372
373 /*
374 * Schedule an automatic unmount of objset id to occur in delay seconds from
375 * now. Any previous delayed unmount will be cancelled in favor of the
376 * updated deadline. A reference is taken by zfsctl_snapshot_find_by_name()
377 * and held until the outstanding task is handled or cancelled.
378 */
379 int
380 zfsctl_snapshot_unmount_delay(uint64_t objsetid, int delay)
381 {
382 zfs_snapentry_t *se;
383 int error = ENOENT;
384
385 mutex_enter(&zfs_snapshot_lock);
386 if ((se = zfsctl_snapshot_find_by_objsetid(objsetid)) != NULL) {
387 zfsctl_snapshot_unmount_cancel(se);
388 zfsctl_snapshot_unmount_delay_impl(se, delay);
389 zfsctl_snapshot_rele(se);
390 error = 0;
391 }
392 mutex_exit(&zfs_snapshot_lock);
393
394 return (error);
395 }
396
397 /*
398 * Check if snapname is currently mounted. Returned non-zero when mounted
399 * and zero when unmounted.
400 */
401 static boolean_t
402 zfsctl_snapshot_ismounted(char *snapname)
403 {
404 zfs_snapentry_t *se;
405 boolean_t ismounted = B_FALSE;
406
407 mutex_enter(&zfs_snapshot_lock);
408 if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
409 zfsctl_snapshot_rele(se);
410 ismounted = B_TRUE;
411 }
412 mutex_exit(&zfs_snapshot_lock);
413
414 return (ismounted);
415 }
416
417 /*
418 * Check if the given inode is a part of the virtual .zfs directory.
419 */
420 boolean_t
421 zfsctl_is_node(struct inode *ip)
422 {
423 return (ITOZ(ip)->z_is_ctldir);
424 }
425
426 /*
427 * Check if the given inode is a .zfs/snapshots/snapname directory.
428 */
429 boolean_t
430 zfsctl_is_snapdir(struct inode *ip)
431 {
432 return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS));
433 }
434
435 /*
436 * Allocate a new inode with the passed id and ops.
437 */
438 static struct inode *
439 zfsctl_inode_alloc(zfs_sb_t *zsb, uint64_t id,
440 const struct file_operations *fops, const struct inode_operations *ops)
441 {
442 struct timespec now = current_fs_time(zsb->z_sb);
443 struct inode *ip;
444 znode_t *zp;
445
446 ip = new_inode(zsb->z_sb);
447 if (ip == NULL)
448 return (NULL);
449
450 zp = ITOZ(ip);
451 ASSERT3P(zp->z_dirlocks, ==, NULL);
452 ASSERT3P(zp->z_acl_cached, ==, NULL);
453 ASSERT3P(zp->z_xattr_cached, ==, NULL);
454 zp->z_id = id;
455 zp->z_unlinked = 0;
456 zp->z_atime_dirty = 0;
457 zp->z_zn_prefetch = 0;
458 zp->z_moved = 0;
459 zp->z_sa_hdl = NULL;
460 zp->z_blksz = 0;
461 zp->z_seq = 0;
462 zp->z_mapcnt = 0;
463 zp->z_gen = 0;
464 zp->z_size = 0;
465 zp->z_atime[0] = 0;
466 zp->z_atime[1] = 0;
467 zp->z_links = 0;
468 zp->z_pflags = 0;
469 zp->z_uid = 0;
470 zp->z_gid = 0;
471 zp->z_mode = 0;
472 zp->z_sync_cnt = 0;
473 zp->z_is_zvol = B_FALSE;
474 zp->z_is_mapped = B_FALSE;
475 zp->z_is_ctldir = B_TRUE;
476 zp->z_is_sa = B_FALSE;
477 zp->z_is_stale = B_FALSE;
478 ip->i_ino = id;
479 ip->i_mode = (S_IFDIR | S_IRUGO | S_IXUGO);
480 ip->i_uid = SUID_TO_KUID(0);
481 ip->i_gid = SGID_TO_KGID(0);
482 ip->i_blkbits = SPA_MINBLOCKSHIFT;
483 ip->i_atime = now;
484 ip->i_mtime = now;
485 ip->i_ctime = now;
486 ip->i_fop = fops;
487 ip->i_op = ops;
488
489 if (insert_inode_locked(ip)) {
490 unlock_new_inode(ip);
491 iput(ip);
492 return (NULL);
493 }
494
495 mutex_enter(&zsb->z_znodes_lock);
496 list_insert_tail(&zsb->z_all_znodes, zp);
497 zsb->z_nr_znodes++;
498 membar_producer();
499 mutex_exit(&zsb->z_znodes_lock);
500
501 unlock_new_inode(ip);
502
503 return (ip);
504 }
505
506 /*
507 * Lookup the inode with given id, it will be allocated if needed.
508 */
509 static struct inode *
510 zfsctl_inode_lookup(zfs_sb_t *zsb, uint64_t id,
511 const struct file_operations *fops, const struct inode_operations *ops)
512 {
513 struct inode *ip = NULL;
514
515 while (ip == NULL) {
516 ip = ilookup(zsb->z_sb, (unsigned long)id);
517 if (ip)
518 break;
519
520 /* May fail due to concurrent zfsctl_inode_alloc() */
521 ip = zfsctl_inode_alloc(zsb, id, fops, ops);
522 }
523
524 return (ip);
525 }
526
527 /*
528 * Create the '.zfs' directory. This directory is cached as part of the VFS
529 * structure. This results in a hold on the zfs_sb_t. The code in zfs_umount()
530 * therefore checks against a vfs_count of 2 instead of 1. This reference
531 * is removed when the ctldir is destroyed in the unmount. All other entities
532 * under the '.zfs' directory are created dynamically as needed.
533 *
534 * Because the dynamically created '.zfs' directory entries assume the use
535 * of 64-bit inode numbers this support must be disabled on 32-bit systems.
536 */
537 int
538 zfsctl_create(zfs_sb_t *zsb)
539 {
540 #if defined(CONFIG_64BIT)
541 ASSERT(zsb->z_ctldir == NULL);
542
543 zsb->z_ctldir = zfsctl_inode_alloc(zsb, ZFSCTL_INO_ROOT,
544 &zpl_fops_root, &zpl_ops_root);
545 if (zsb->z_ctldir == NULL)
546 return (SET_ERROR(ENOENT));
547
548 return (0);
549 #else
550 return (SET_ERROR(EOPNOTSUPP));
551 #endif /* CONFIG_64BIT */
552 }
553
554 /*
555 * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name.
556 * Only called when the filesystem is unmounted.
557 */
558 void
559 zfsctl_destroy(zfs_sb_t *zsb)
560 {
561 if (zsb->z_issnap) {
562 zfs_snapentry_t *se;
563 uint64_t objsetid = dmu_objset_id(zsb->z_os);
564
565 mutex_enter(&zfs_snapshot_lock);
566 if ((se = zfsctl_snapshot_find_by_objsetid(objsetid)) != NULL) {
567 zfsctl_snapshot_unmount_cancel(se);
568 zfsctl_snapshot_remove(se);
569 zfsctl_snapshot_rele(se);
570 }
571 mutex_exit(&zfs_snapshot_lock);
572 } else if (zsb->z_ctldir) {
573 iput(zsb->z_ctldir);
574 zsb->z_ctldir = NULL;
575 }
576 }
577
578 /*
579 * Given a root znode, retrieve the associated .zfs directory.
580 * Add a hold to the vnode and return it.
581 */
582 struct inode *
583 zfsctl_root(znode_t *zp)
584 {
585 ASSERT(zfs_has_ctldir(zp));
586 igrab(ZTOZSB(zp)->z_ctldir);
587 return (ZTOZSB(zp)->z_ctldir);
588 }
589 /*
590 * Generate a long fid which includes the root object and objset of a
591 * snapshot but not the generation number. For the root object the
592 * generation number is ignored when zero to avoid needing to open
593 * the dataset when generating fids for the snapshot names.
594 */
595 static int
596 zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp)
597 {
598 zfs_sb_t *zsb = ITOZSB(ip);
599 zfid_short_t *zfid = (zfid_short_t *)fidp;
600 zfid_long_t *zlfid = (zfid_long_t *)fidp;
601 uint32_t gen = 0;
602 uint64_t object;
603 uint64_t objsetid;
604 int i;
605
606 object = zsb->z_root;
607 objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino;
608 zfid->zf_len = LONG_FID_LEN;
609
610 for (i = 0; i < sizeof (zfid->zf_object); i++)
611 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
612
613 for (i = 0; i < sizeof (zfid->zf_gen); i++)
614 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
615
616 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
617 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
618
619 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
620 zlfid->zf_setgen[i] = 0;
621
622 return (0);
623 }
624
625 /*
626 * Generate an appropriate fid for an entry in the .zfs directory.
627 */
628 int
629 zfsctl_fid(struct inode *ip, fid_t *fidp)
630 {
631 znode_t *zp = ITOZ(ip);
632 zfs_sb_t *zsb = ITOZSB(ip);
633 uint64_t object = zp->z_id;
634 zfid_short_t *zfid;
635 int i;
636
637 ZFS_ENTER(zsb);
638
639 if (fidp->fid_len < SHORT_FID_LEN) {
640 fidp->fid_len = SHORT_FID_LEN;
641 ZFS_EXIT(zsb);
642 return (SET_ERROR(ENOSPC));
643 }
644
645 if (zfsctl_is_snapdir(ip)) {
646 ZFS_EXIT(zsb);
647 return (zfsctl_snapdir_fid(ip, fidp));
648 }
649
650 zfid = (zfid_short_t *)fidp;
651
652 zfid->zf_len = SHORT_FID_LEN;
653
654 for (i = 0; i < sizeof (zfid->zf_object); i++)
655 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
656
657 /* .zfs znodes always have a generation number of 0 */
658 for (i = 0; i < sizeof (zfid->zf_gen); i++)
659 zfid->zf_gen[i] = 0;
660
661 ZFS_EXIT(zsb);
662 return (0);
663 }
664
665 /*
666 * Construct a full dataset name in full_name: "pool/dataset@snap_name"
667 */
668 static int
669 zfsctl_snapshot_name(zfs_sb_t *zsb, const char *snap_name, int len,
670 char *full_name)
671 {
672 objset_t *os = zsb->z_os;
673
674 if (zfs_component_namecheck(snap_name, NULL, NULL) != 0)
675 return (SET_ERROR(EILSEQ));
676
677 dmu_objset_name(os, full_name);
678 if ((strlen(full_name) + 1 + strlen(snap_name)) >= len)
679 return (SET_ERROR(ENAMETOOLONG));
680
681 (void) strcat(full_name, "@");
682 (void) strcat(full_name, snap_name);
683
684 return (0);
685 }
686
687 /*
688 * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/"
689 */
690 static int
691 zfsctl_snapshot_path(struct path *path, int len, char *full_path)
692 {
693 char *path_buffer, *path_ptr;
694 int path_len, error = 0;
695
696 path_buffer = kmem_alloc(len, KM_SLEEP);
697
698 path_ptr = d_path(path, path_buffer, len);
699 if (IS_ERR(path_ptr)) {
700 error = -PTR_ERR(path_ptr);
701 goto out;
702 }
703
704 path_len = path_buffer + len - 1 - path_ptr;
705 if (path_len > len) {
706 error = SET_ERROR(EFAULT);
707 goto out;
708 }
709
710 memcpy(full_path, path_ptr, path_len);
711 full_path[path_len] = '\0';
712 out:
713 kmem_free(path_buffer, len);
714
715 return (error);
716 }
717
718 /*
719 * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/"
720 */
721 static int
722 zfsctl_snapshot_path_objset(zfs_sb_t *zsb, uint64_t objsetid,
723 int path_len, char *full_path)
724 {
725 objset_t *os = zsb->z_os;
726 fstrans_cookie_t cookie;
727 char *snapname;
728 boolean_t case_conflict;
729 uint64_t id, pos = 0;
730 int error = 0;
731
732 if (zsb->z_mntopts->z_mntpoint == NULL)
733 return (ENOENT);
734
735 cookie = spl_fstrans_mark();
736 snapname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
737
738 while (error == 0) {
739 dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
740 error = dmu_snapshot_list_next(zsb->z_os, MAXNAMELEN,
741 snapname, &id, &pos, &case_conflict);
742 dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
743 if (error)
744 goto out;
745
746 if (id == objsetid)
747 break;
748 }
749
750 memset(full_path, 0, path_len);
751 snprintf(full_path, path_len - 1, "%s/.zfs/snapshot/%s",
752 zsb->z_mntopts->z_mntpoint, snapname);
753 out:
754 kmem_free(snapname, MAXNAMELEN);
755 spl_fstrans_unmark(cookie);
756
757 return (error);
758 }
759
760 /*
761 * Special case the handling of "..".
762 */
763 int
764 zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp,
765 int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
766 {
767 zfs_sb_t *zsb = ITOZSB(dip);
768 int error = 0;
769
770 ZFS_ENTER(zsb);
771
772 if (strcmp(name, "..") == 0) {
773 *ipp = dip->i_sb->s_root->d_inode;
774 } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) {
775 *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIR,
776 &zpl_fops_snapdir, &zpl_ops_snapdir);
777 } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) {
778 *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SHARES,
779 &zpl_fops_shares, &zpl_ops_shares);
780 } else {
781 *ipp = NULL;
782 }
783
784 if (*ipp == NULL)
785 error = SET_ERROR(ENOENT);
786
787 ZFS_EXIT(zsb);
788
789 return (error);
790 }
791
792 /*
793 * Lookup entry point for the 'snapshot' directory. Try to open the
794 * snapshot if it exist, creating the pseudo filesystem inode as necessary.
795 * Perform a mount of the associated dataset on top of the inode.
796 */
797 int
798 zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp,
799 int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
800 {
801 zfs_sb_t *zsb = ITOZSB(dip);
802 uint64_t id;
803 int error;
804
805 ZFS_ENTER(zsb);
806
807 error = dmu_snapshot_lookup(zsb->z_os, name, &id);
808 if (error) {
809 ZFS_EXIT(zsb);
810 return (error);
811 }
812
813 *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIRS - id,
814 &simple_dir_operations, &simple_dir_inode_operations);
815 if (*ipp == NULL)
816 error = SET_ERROR(ENOENT);
817
818 ZFS_EXIT(zsb);
819
820 return (error);
821 }
822
823 /*
824 * Renaming a directory under '.zfs/snapshot' will automatically trigger
825 * a rename of the snapshot to the new given name. The rename is confined
826 * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere.
827 */
828 int
829 zfsctl_snapdir_rename(struct inode *sdip, char *snm,
830 struct inode *tdip, char *tnm, cred_t *cr, int flags)
831 {
832 zfs_sb_t *zsb = ITOZSB(sdip);
833 char *to, *from, *real, *fsname;
834 int error;
835
836 if (!zfs_admin_snapshot)
837 return (EACCES);
838
839 ZFS_ENTER(zsb);
840
841 to = kmem_alloc(MAXNAMELEN, KM_SLEEP);
842 from = kmem_alloc(MAXNAMELEN, KM_SLEEP);
843 real = kmem_alloc(MAXNAMELEN, KM_SLEEP);
844 fsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
845
846 if (zsb->z_case == ZFS_CASE_INSENSITIVE) {
847 error = dmu_snapshot_realname(zsb->z_os, snm, real,
848 MAXNAMELEN, NULL);
849 if (error == 0) {
850 snm = real;
851 } else if (error != ENOTSUP) {
852 goto out;
853 }
854 }
855
856 dmu_objset_name(zsb->z_os, fsname);
857
858 error = zfsctl_snapshot_name(ITOZSB(sdip), snm, MAXNAMELEN, from);
859 if (error == 0)
860 error = zfsctl_snapshot_name(ITOZSB(tdip), tnm, MAXNAMELEN, to);
861 if (error == 0)
862 error = zfs_secpolicy_rename_perms(from, to, cr);
863 if (error != 0)
864 goto out;
865
866 /*
867 * Cannot move snapshots out of the snapdir.
868 */
869 if (sdip != tdip) {
870 error = SET_ERROR(EINVAL);
871 goto out;
872 }
873
874 /*
875 * No-op when names are identical.
876 */
877 if (strcmp(snm, tnm) == 0) {
878 error = 0;
879 goto out;
880 }
881
882 mutex_enter(&zfs_snapshot_lock);
883
884 error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE);
885 if (error == 0)
886 (void) zfsctl_snapshot_rename(snm, tnm);
887
888 mutex_exit(&zfs_snapshot_lock);
889 out:
890 kmem_free(from, MAXNAMELEN);
891 kmem_free(to, MAXNAMELEN);
892 kmem_free(real, MAXNAMELEN);
893 kmem_free(fsname, MAXNAMELEN);
894
895 ZFS_EXIT(zsb);
896
897 return (error);
898 }
899
900 /*
901 * Removing a directory under '.zfs/snapshot' will automatically trigger
902 * the removal of the snapshot with the given name.
903 */
904 int
905 zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags)
906 {
907 zfs_sb_t *zsb = ITOZSB(dip);
908 char *snapname, *real;
909 int error;
910
911 if (!zfs_admin_snapshot)
912 return (EACCES);
913
914 ZFS_ENTER(zsb);
915
916 snapname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
917 real = kmem_alloc(MAXNAMELEN, KM_SLEEP);
918
919 if (zsb->z_case == ZFS_CASE_INSENSITIVE) {
920 error = dmu_snapshot_realname(zsb->z_os, name, real,
921 MAXNAMELEN, NULL);
922 if (error == 0) {
923 name = real;
924 } else if (error != ENOTSUP) {
925 goto out;
926 }
927 }
928
929 error = zfsctl_snapshot_name(ITOZSB(dip), name, MAXNAMELEN, snapname);
930 if (error == 0)
931 error = zfs_secpolicy_destroy_perms(snapname, cr);
932 if (error != 0)
933 goto out;
934
935 error = zfsctl_snapshot_unmount(snapname, MNT_FORCE);
936 if ((error == 0) || (error == ENOENT))
937 error = dsl_destroy_snapshot(snapname, B_FALSE);
938 out:
939 kmem_free(snapname, MAXNAMELEN);
940 kmem_free(real, MAXNAMELEN);
941
942 ZFS_EXIT(zsb);
943
944 return (error);
945 }
946
947 /*
948 * Creating a directory under '.zfs/snapshot' will automatically trigger
949 * the creation of a new snapshot with the given name.
950 */
951 int
952 zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
953 struct inode **ipp, cred_t *cr, int flags)
954 {
955 zfs_sb_t *zsb = ITOZSB(dip);
956 char *dsname;
957 int error;
958
959 if (!zfs_admin_snapshot)
960 return (EACCES);
961
962 dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
963
964 if (zfs_component_namecheck(dirname, NULL, NULL) != 0) {
965 error = SET_ERROR(EILSEQ);
966 goto out;
967 }
968
969 dmu_objset_name(zsb->z_os, dsname);
970
971 error = zfs_secpolicy_snapshot_perms(dsname, cr);
972 if (error != 0)
973 goto out;
974
975 if (error == 0) {
976 error = dmu_objset_snapshot_one(dsname, dirname);
977 if (error != 0)
978 goto out;
979
980 error = zfsctl_snapdir_lookup(dip, dirname, ipp,
981 0, cr, NULL, NULL);
982 }
983 out:
984 kmem_free(dsname, MAXNAMELEN);
985
986 return (error);
987 }
988
989 /*
990 * Attempt to unmount a snapshot by making a call to user space.
991 * There is no assurance that this can or will succeed, is just a
992 * best effort. In the case where it does fail, perhaps because
993 * it's in use, the unmount will fail harmlessly.
994 */
995 #define SET_UNMOUNT_CMD \
996 "exec 0</dev/null " \
997 " 1>/dev/null " \
998 " 2>/dev/null; " \
999 "umount -t zfs -n %s'%s'"
1000
1001 int
1002 zfsctl_snapshot_unmount(char *snapname, int flags)
1003 {
1004 char *argv[] = { "/bin/sh", "-c", NULL, NULL };
1005 char *envp[] = { NULL };
1006 zfs_snapentry_t *se;
1007 int error;
1008
1009 mutex_enter(&zfs_snapshot_lock);
1010 if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) {
1011 mutex_exit(&zfs_snapshot_lock);
1012 return (ENOENT);
1013 }
1014 mutex_exit(&zfs_snapshot_lock);
1015
1016 argv[2] = kmem_asprintf(SET_UNMOUNT_CMD,
1017 flags & MNT_FORCE ? "-f " : "", se->se_path);
1018 zfsctl_snapshot_rele(se);
1019 dprintf("unmount; path=%s\n", se->se_path);
1020 error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
1021 strfree(argv[2]);
1022
1023
1024 /*
1025 * The umount system utility will return 256 on error. We must
1026 * assume this error is because the file system is busy so it is
1027 * converted to the more sensible EBUSY.
1028 */
1029 if (error)
1030 error = SET_ERROR(EBUSY);
1031
1032 return (error);
1033 }
1034
1035 #define MOUNT_BUSY 0x80 /* Mount failed due to EBUSY (from mntent.h) */
1036 #define SET_MOUNT_CMD \
1037 "exec 0</dev/null " \
1038 " 1>/dev/null " \
1039 " 2>/dev/null; " \
1040 "mount -t zfs -n '%s' '%s'"
1041
1042 int
1043 zfsctl_snapshot_mount(struct path *path, int flags)
1044 {
1045 struct dentry *dentry = path->dentry;
1046 struct inode *ip = dentry->d_inode;
1047 zfs_sb_t *zsb;
1048 zfs_sb_t *snap_zsb;
1049 zfs_snapentry_t *se;
1050 char *full_name, *full_path;
1051 char *argv[] = { "/bin/sh", "-c", NULL, NULL };
1052 char *envp[] = { NULL };
1053 int error;
1054 struct path spath;
1055
1056 if (ip == NULL)
1057 return (EISDIR);
1058
1059 zsb = ITOZSB(ip);
1060 ZFS_ENTER(zsb);
1061
1062 full_name = kmem_zalloc(MAXNAMELEN, KM_SLEEP);
1063 full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
1064
1065 error = zfsctl_snapshot_name(zsb, dname(dentry),
1066 MAXNAMELEN, full_name);
1067 if (error)
1068 goto error;
1069
1070 error = zfsctl_snapshot_path(path, MAXPATHLEN, full_path);
1071 if (error)
1072 goto error;
1073
1074 /*
1075 * Multiple concurrent automounts of a snapshot are never allowed.
1076 * The snapshot may be manually mounted as many times as desired.
1077 */
1078 if (zfsctl_snapshot_ismounted(full_name)) {
1079 error = SET_ERROR(EISDIR);
1080 goto error;
1081 }
1082
1083 /*
1084 * Attempt to mount the snapshot from user space. Normally this
1085 * would be done using the vfs_kern_mount() function, however that
1086 * function is marked GPL-only and cannot be used. On error we
1087 * careful to log the real error to the console and return EISDIR
1088 * to safely abort the automount. This should be very rare.
1089 *
1090 * If the user mode helper happens to return EBUSY, a concurrent
1091 * mount is already in progress in which case the error is ignored.
1092 * Take note that if the program was executed successfully the return
1093 * value from call_usermodehelper() will be (exitcode << 8 + signal).
1094 */
1095 dprintf("mount; name=%s path=%s\n", full_name, full_path);
1096 argv[2] = kmem_asprintf(SET_MOUNT_CMD, full_name, full_path);
1097 error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
1098 strfree(argv[2]);
1099 if (error) {
1100 if (!(error & MOUNT_BUSY << 8)) {
1101 cmn_err(CE_WARN, "Unable to automount %s/%s: %d",
1102 full_path, full_name, error);
1103 error = SET_ERROR(EISDIR);
1104 } else {
1105 /*
1106 * EBUSY, this could mean a concurrent mount, or the
1107 * snapshot has already been mounted at completely
1108 * different place. We return 0 so VFS will retry. For
1109 * the latter case the VFS will retry several times
1110 * and return ELOOP, which is probably not a very good
1111 * behavior.
1112 */
1113 error = 0;
1114 }
1115 goto error;
1116 }
1117
1118 /*
1119 * Follow down in to the mounted snapshot and set MNT_SHRINKABLE
1120 * to identify this as an automounted filesystem.
1121 */
1122 spath = *path;
1123 path_get(&spath);
1124 if (zpl_follow_down_one(&spath)) {
1125 snap_zsb = ITOZSB(spath.dentry->d_inode);
1126 snap_zsb->z_parent = zsb;
1127 dentry = spath.dentry;
1128 spath.mnt->mnt_flags |= MNT_SHRINKABLE;
1129
1130 mutex_enter(&zfs_snapshot_lock);
1131 se = zfsctl_snapshot_alloc(full_name, full_path,
1132 dmu_objset_id(snap_zsb->z_os), dentry);
1133 zfsctl_snapshot_add(se);
1134 zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
1135 mutex_exit(&zfs_snapshot_lock);
1136 }
1137 path_put(&spath);
1138 error:
1139 kmem_free(full_name, MAXNAMELEN);
1140 kmem_free(full_path, MAXPATHLEN);
1141
1142 ZFS_EXIT(zsb);
1143
1144 return (error);
1145 }
1146
1147 /*
1148 * Given the objset id of the snapshot return its zfs_sb_t as zsbp.
1149 */
1150 int
1151 zfsctl_lookup_objset(struct super_block *sb, uint64_t objsetid, zfs_sb_t **zsbp)
1152 {
1153 zfs_snapentry_t *se;
1154 int error;
1155
1156 /*
1157 * Verify that the snapshot is mounted then lookup the mounted root
1158 * rather than the covered mount point. This may fail if the
1159 * snapshot has just been unmounted by an unrelated user space
1160 * process. This race cannot occur to an expired mount point
1161 * because we hold the zfs_snapshot_lock to prevent the race.
1162 */
1163 mutex_enter(&zfs_snapshot_lock);
1164 if ((se = zfsctl_snapshot_find_by_objsetid(objsetid)) != NULL) {
1165 zfs_sb_t *zsb;
1166
1167 zsb = ITOZSB(se->se_root_dentry->d_inode);
1168 ASSERT3U(dmu_objset_id(zsb->z_os), ==, objsetid);
1169
1170 if (time_after(jiffies, zsb->z_snap_defer_time +
1171 MAX(zfs_expire_snapshot * HZ / 2, HZ))) {
1172 zsb->z_snap_defer_time = jiffies;
1173 zfsctl_snapshot_unmount_delay(objsetid,
1174 zfs_expire_snapshot);
1175 }
1176
1177 *zsbp = zsb;
1178 zfsctl_snapshot_rele(se);
1179 error = SET_ERROR(0);
1180 } else {
1181 error = SET_ERROR(ENOENT);
1182 }
1183 mutex_exit(&zfs_snapshot_lock);
1184
1185 /*
1186 * Automount the snapshot given the objset id by constructing the
1187 * full mount point and performing a traversal.
1188 */
1189 if (error == ENOENT) {
1190 struct path path;
1191 char *mnt;
1192
1193 mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1194 error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid,
1195 MAXPATHLEN, mnt);
1196 if (error) {
1197 kmem_free(mnt, MAXPATHLEN);
1198 return (SET_ERROR(error));
1199 }
1200
1201 error = kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
1202 if (error == 0) {
1203 *zsbp = ITOZSB(path.dentry->d_inode);
1204 path_put(&path);
1205 }
1206
1207 kmem_free(mnt, MAXPATHLEN);
1208 }
1209
1210 return (error);
1211 }
1212
1213 int
1214 zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
1215 int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
1216 {
1217 zfs_sb_t *zsb = ITOZSB(dip);
1218 struct inode *ip;
1219 znode_t *dzp;
1220 int error;
1221
1222 ZFS_ENTER(zsb);
1223
1224 if (zsb->z_shares_dir == 0) {
1225 ZFS_EXIT(zsb);
1226 return (SET_ERROR(ENOTSUP));
1227 }
1228
1229 error = zfs_zget(zsb, zsb->z_shares_dir, &dzp);
1230 if (error) {
1231 ZFS_EXIT(zsb);
1232 return (error);
1233 }
1234
1235 error = zfs_lookup(ZTOI(dzp), name, &ip, 0, cr, NULL, NULL);
1236
1237 iput(ZTOI(dzp));
1238 ZFS_EXIT(zsb);
1239
1240 return (error);
1241 }
1242
1243
1244 /*
1245 * Initialize the various pieces we'll need to create and manipulate .zfs
1246 * directories. Currently this is unused but available.
1247 */
1248 void
1249 zfsctl_init(void)
1250 {
1251 avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name,
1252 sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
1253 se_node_name));
1254 avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid,
1255 sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
1256 se_node_objsetid));
1257 mutex_init(&zfs_snapshot_lock, NULL, MUTEX_DEFAULT, NULL);
1258
1259 zfs_expire_taskq = taskq_create("z_unmount", 1, defclsyspri,
1260 1, 8, TASKQ_PREPOPULATE);
1261 }
1262
1263 /*
1264 * Cleanup the various pieces we needed for .zfs directories. In particular
1265 * ensure the expiry timer is canceled safely.
1266 */
1267 void
1268 zfsctl_fini(void)
1269 {
1270 taskq_destroy(zfs_expire_taskq);
1271
1272 avl_destroy(&zfs_snapshots_by_name);
1273 avl_destroy(&zfs_snapshots_by_objsetid);
1274 mutex_destroy(&zfs_snapshot_lock);
1275 }
1276
1277 module_param(zfs_admin_snapshot, int, 0644);
1278 MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot");
1279
1280 module_param(zfs_expire_snapshot, int, 0644);
1281 MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot");