module/zfs/zfs_ctldir.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  *
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
  25  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  26  * LLNL-CODE-403049.
  27  * Rewritten for Linux by:
  28  *   Rohan Puri <rohan.puri15@gmail.com>
  29  *   Brian Behlendorf <behlendorf1@llnl.gov>
  30  */
  31
  32 /*
  33  * ZFS control directory (a.k.a. ".zfs")
  34  *
  35  * This directory provides a common location for all ZFS meta-objects.
  36  * Currently, this is only the 'snapshot' and 'shares' directory, but this may
  37  * expand in the future.  The elements are built dynamically, as the hierarchy
  38  * does not actually exist on disk.
  39  *
  40  * For 'snapshot', we don't want to have all snapshots always mounted, because
  41  * this would take up a huge amount of space in /etc/mnttab.  We have three
  42  * types of objects:
  43  *
  44  *      ctldir ------> snapshotdir -------> snapshot
  45  *                                             |
  46  *                                             |
  47  *                                             V
  48  *                                         mounted fs
  49  *
  50  * The 'snapshot' node contains just enough information to lookup '..' and act
  51  * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
  52  * perform an automount of the underlying filesystem and return the
  53  * corresponding inode.
  54  *
  55  * All mounts are handled automatically by a user mode helper which invokes
  56  * the mount procedure.  Unmounts are handled by allowing the mount
  57  * point to expire so the kernel may automatically unmount it.
  58  *
  59  * The '.zfs', '.zfs/snapshot', and all directories created under
  60  * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
  61  * zfs_sb_t as the head filesystem (what '.zfs' lives under).
  62  *
  63  * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
  64  * (ie: snapshots) are complete ZFS filesystems and have their own unique
  65  * zfs_sb_t.  However, the fsid reported by these mounts will be the same
  66  * as that used by the parent zfs_sb_t to make NFS happy.
  67  */
  68
  69 #include <sys/types.h>
  70 #include <sys/param.h>
  71 #include <sys/time.h>
  72 #include <sys/systm.h>
  73 #include <sys/sysmacros.h>
  74 #include <sys/pathname.h>
  75 #include <sys/vfs.h>
  76 #include <sys/vfs_opreg.h>
  77 #include <sys/zfs_ctldir.h>
  78 #include <sys/zfs_ioctl.h>
  79 #include <sys/zfs_vfsops.h>
  80 #include <sys/zfs_vnops.h>
  81 #include <sys/stat.h>
  82 #include <sys/dmu.h>
  83 #include <sys/dsl_deleg.h>
  84 #include <sys/mount.h>
  85 #include <sys/zpl.h>
  86 #include "zfs_namecheck.h"
  87
  88 /*
  89  * Control Directory Tunables (.zfs): seconds between snapshot expiry attempts.
  90  */
  91 int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
  92
  93 /*
  94  * Dedicated task queue for unmounting snapshots; created in zfsctl_init().
  95  */
  96 static taskq_t *zfs_expire_taskq;
  97
  98 static zfs_snapentry_t *
  99 zfsctl_sep_alloc(void)
 100 {
 101         return kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP);
 102 }
 103
 104 void
 105 zfsctl_sep_free(zfs_snapentry_t *sep)
 106 {
 107         kmem_free(sep->se_name, MAXNAMELEN);
 108         kmem_free(sep->se_path, PATH_MAX);
 109         kmem_free(sep, sizeof (zfs_snapentry_t));
 110 }
 111
 112 /*
 113  * Attempt to expire an automounted snapshot, unmounts are attempted every
 114  * 'zfs_expire_snapshot' seconds until they succeed.  The work request is
 115  * responsible for rescheduling itself and freeing the zfs_expire_snapshot_t.
 116  */
 117 static void
 118 zfsctl_expire_snapshot(void *data)
 119 {
 120         zfs_snapentry_t *sep = (zfs_snapentry_t *)data;
 121         zfs_sb_t *zsb = ITOZSB(sep->se_inode);
 122         int error;
 123
 124         error = zfsctl_unmount_snapshot(zsb, sep->se_name, MNT_EXPIRE);
 125         if (error == EBUSY)
 126                 sep->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq,
 127                     zfsctl_expire_snapshot, sep, TQ_SLEEP,
 128                     ddi_get_lbolt() + zfs_expire_snapshot * HZ);
 129 }
 130
 131 int
 132 snapentry_compare(const void *a, const void *b)
 133 {
 134         const zfs_snapentry_t *sa = a;
 135         const zfs_snapentry_t *sb = b;
 136         int ret = strcmp(sa->se_name, sb->se_name);
 137
 138         if (ret < 0)
 139                 return (-1);
 140         else if (ret > 0)
 141                 return (1);
 142         else
 143                 return (0);
 144 }
 145
 146 boolean_t
 147 zfsctl_is_node(struct inode *ip)
 148 {
 149         return (ITOZ(ip)->z_is_ctldir);
 150 }
 151
 152 boolean_t
 153 zfsctl_is_snapdir(struct inode *ip)
 154 {
 155         return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS));
 156 }
 157
 158 /*
 159  * Allocate a new inode with the passed id and ops.
 160  */
 161 static struct inode *
 162 zfsctl_inode_alloc(zfs_sb_t *zsb, uint64_t id,
 163     const struct file_operations *fops, const struct inode_operations *ops)
 164 {
 165         struct timespec now = current_fs_time(zsb->z_sb);
 166         struct inode *ip;
 167         znode_t *zp;
 168
 169         ip = new_inode(zsb->z_sb);
 170         if (ip == NULL)
 171                 return (NULL);
 172
 173         zp = ITOZ(ip);
 174         ASSERT3P(zp->z_dirlocks, ==, NULL);
 175         ASSERT3P(zp->z_acl_cached, ==, NULL);
 176         ASSERT3P(zp->z_xattr_cached, ==, NULL);
 177         zp->z_id = id;
 178         zp->z_unlinked = 0;
 179         zp->z_atime_dirty = 0;
 180         zp->z_zn_prefetch = 0;
 181         zp->z_moved = 0;
 182         zp->z_sa_hdl = NULL;
 183         zp->z_blksz = 0;
 184         zp->z_seq = 0;
 185         zp->z_mapcnt = 0;
 186         zp->z_gen = 0;
 187         zp->z_size = 0;
 188         zp->z_atime[0] = 0;
 189         zp->z_atime[1] = 0;
 190         zp->z_links = 0;
 191         zp->z_pflags = 0;
 192         zp->z_uid = 0;
 193         zp->z_gid = 0;
 194         zp->z_mode = 0;
 195         zp->z_sync_cnt = 0;
 196         zp->z_is_zvol = B_FALSE;
 197         zp->z_is_mapped = B_FALSE;
 198         zp->z_is_ctldir = B_TRUE;
 199         zp->z_is_sa = B_FALSE;
 200         zp->z_is_stale = B_FALSE;
 201         ip->i_ino = id;
 202         ip->i_mode = (S_IFDIR | S_IRUGO | S_IXUGO);
 203         ip->i_uid = SUID_TO_KUID(0);
 204         ip->i_gid = SGID_TO_KGID(0);
 205         ip->i_blkbits = SPA_MINBLOCKSHIFT;
 206         ip->i_atime = now;
 207         ip->i_mtime = now;
 208         ip->i_ctime = now;
 209         ip->i_fop = fops;
 210         ip->i_op = ops;
 211
 212         if (insert_inode_locked(ip)) {
 213                 unlock_new_inode(ip);
 214                 iput(ip);
 215                 return (NULL);
 216         }
 217
 218         mutex_enter(&zsb->z_znodes_lock);
 219         list_insert_tail(&zsb->z_all_znodes, zp);
 220         zsb->z_nr_znodes++;
 221         membar_producer();
 222         mutex_exit(&zsb->z_znodes_lock);
 223
 224         unlock_new_inode(ip);
 225
 226         return (ip);
 227 }
 228
 229 /*
 230  * Lookup the inode with given id, it will be allocated if needed.
 231  */
 232 static struct inode *
 233 zfsctl_inode_lookup(zfs_sb_t *zsb, uint64_t id,
 234     const struct file_operations *fops, const struct inode_operations *ops)
 235 {
 236         struct inode *ip = NULL;
 237
 238         while (ip == NULL) {
 239                 ip = ilookup(zsb->z_sb, (unsigned long)id);
 240                 if (ip)
 241                         break;
 242
 243                 /* May fail due to concurrent zfsctl_inode_alloc() */
 244                 ip = zfsctl_inode_alloc(zsb, id, fops, ops);
 245         }
 246
 247         return (ip);
 248 }
 249
 250 /*
 251  * Free zfsctl inode specific structures, currently there are none.
 252  */
 253 void
 254 zfsctl_inode_destroy(struct inode *ip)
 255 {
 256         return;
 257 }
 258
 259 /*
 260  * An inode is being evicted from the cache.
 261  */
 262 void
 263 zfsctl_inode_inactive(struct inode *ip)
 264 {
 265         if (zfsctl_is_snapdir(ip))
 266                 zfsctl_snapdir_inactive(ip);
 267 }
 268
 269 /*
 270  * Create the '.zfs' directory.  This directory is cached as part of the VFS
 271  * structure.  This results in a hold on the zfs_sb_t.  The code in zfs_umount()
 272  * therefore checks against a vfs_count of 2 instead of 1.  This reference
 273  * is removed when the ctldir is destroyed in the unmount.  All other entities
 274  * under the '.zfs' directory are created dynamically as needed.
 275  *
 276  * Because the dynamically created '.zfs' directory entries assume the use
 277  * of 64-bit inode numbers this support must be disabled on 32-bit systems.
 278  */
 279 int
 280 zfsctl_create(zfs_sb_t *zsb)
 281 {
 282 #if defined(CONFIG_64BIT)
 283         ASSERT(zsb->z_ctldir == NULL);
 284
 285         zsb->z_ctldir = zfsctl_inode_alloc(zsb, ZFSCTL_INO_ROOT,
 286             &zpl_fops_root, &zpl_ops_root);
 287         if (zsb->z_ctldir == NULL)
 288                 return (ENOENT);
 289
 290         return (0);
 291 #else
 292         return (EOPNOTSUPP);
 293 #endif /* CONFIG_64BIT */
 294 }
 295
 296 /*
 297  * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
 298  */
 299 void
 300 zfsctl_destroy(zfs_sb_t *zsb)
 301 {
 302         iput(zsb->z_ctldir);
 303         zsb->z_ctldir = NULL;
 304 }
 305
 306 /*
 307  * Given a root znode, retrieve the associated .zfs directory.
 308  * Add a hold to the vnode and return it.
 309  */
 310 struct inode *
 311 zfsctl_root(znode_t *zp)
 312 {
 313         ASSERT(zfs_has_ctldir(zp));
 314         igrab(ZTOZSB(zp)->z_ctldir);
 315         return (ZTOZSB(zp)->z_ctldir);
 316 }
 317
 318 /*ARGSUSED*/
 319 int
 320 zfsctl_fid(struct inode *ip, fid_t *fidp)
 321 {
 322         znode_t         *zp = ITOZ(ip);
 323         zfs_sb_t        *zsb = ITOZSB(ip);
 324         uint64_t        object = zp->z_id;
 325         zfid_short_t    *zfid;
 326         int             i;
 327
 328         ZFS_ENTER(zsb);
 329
 330         if (fidp->fid_len < SHORT_FID_LEN) {
 331                 fidp->fid_len = SHORT_FID_LEN;
 332                 ZFS_EXIT(zsb);
 333                 return (ENOSPC);
 334         }
 335
 336         zfid = (zfid_short_t *)fidp;
 337
 338         zfid->zf_len = SHORT_FID_LEN;
 339
 340         for (i = 0; i < sizeof (zfid->zf_object); i++)
 341                 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 342
 343         /* .zfs znodes always have a generation number of 0 */
 344         for (i = 0; i < sizeof (zfid->zf_gen); i++)
 345                 zfid->zf_gen[i] = 0;
 346
 347         ZFS_EXIT(zsb);
 348         return (0);
 349 }
 350
 351 static int
 352 zfsctl_snapshot_zname(struct inode *ip, const char *name, int len, char *zname)
 353 {
 354         objset_t *os = ITOZSB(ip)->z_os;
 355
 356         if (snapshot_namecheck(name, NULL, NULL) != 0)
 357                 return (EILSEQ);
 358
 359         dmu_objset_name(os, zname);
 360         if ((strlen(zname) + 1 + strlen(name)) >= len)
 361                 return (ENAMETOOLONG);
 362
 363         (void) strcat(zname, "@");
 364         (void) strcat(zname, name);
 365
 366         return (0);
 367 }
 368
 369 static int
 370 zfsctl_snapshot_zpath(struct path *path, int len, char *zpath)
 371 {
 372         char *path_buffer, *path_ptr;
 373         int path_len, error = 0;
 374
 375         path_buffer = kmem_alloc(len, KM_SLEEP);
 376
 377         path_ptr = d_path(path, path_buffer, len);
 378         if (IS_ERR(path_ptr)) {
 379                 error = -PTR_ERR(path_ptr);
 380                 goto out;
 381         }
 382
 383         path_len = path_buffer + len - 1 - path_ptr;
 384         if (path_len > len) {
 385                 error = EFAULT;
 386                 goto out;
 387         }
 388
 389         memcpy(zpath, path_ptr, path_len);
 390         zpath[path_len] = '\0';
 391 out:
 392         kmem_free(path_buffer, len);
 393
 394         return (error);
 395 }
 396
 397 /*
 398  * Special case the handling of "..".
 399  */
 400 /* ARGSUSED */
 401 int
 402 zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp,
 403     int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
 404 {
 405         zfs_sb_t *zsb = ITOZSB(dip);
 406         int error = 0;
 407
 408         ZFS_ENTER(zsb);
 409
 410         if (strcmp(name, "..") == 0) {
 411                 *ipp = dip->i_sb->s_root->d_inode;
 412         } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) {
 413                 *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIR,
 414                     &zpl_fops_snapdir, &zpl_ops_snapdir);
 415         } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) {
 416                 *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SHARES,
 417                     &zpl_fops_shares, &zpl_ops_shares);
 418         } else {
 419                 *ipp = NULL;
 420         }
 421
 422         if (*ipp == NULL)
 423                 error = ENOENT;
 424
 425         ZFS_EXIT(zsb);
 426
 427         return (error);
 428 }
 429
 430 /*
 431  * Lookup entry point for the 'snapshot' directory.  Try to open the
 432  * snapshot if it exist, creating the pseudo filesystem inode as necessary.
 433  * Perform a mount of the associated dataset on top of the inode.
 434  */
 435 /* ARGSUSED */
 436 int
 437 zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp,
 438     int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
 439 {
 440         zfs_sb_t *zsb = ITOZSB(dip);
 441         uint64_t id;
 442         int error;
 443
 444         ZFS_ENTER(zsb);
 445
 446         error = dmu_snapshot_lookup(zsb->z_os, name, &id);
 447         if (error) {
 448                 ZFS_EXIT(zsb);
 449                 return (error);
 450         }
 451
 452         *ipp = zfsctl_inode_lookup(zsb, ZFSCTL_INO_SNAPDIRS - id,
 453             &simple_dir_operations, &simple_dir_inode_operations);
 454         if (*ipp) {
 455 #ifdef HAVE_AUTOMOUNT
 456                 (*ipp)->i_flags |= S_AUTOMOUNT;
 457 #endif /* HAVE_AUTOMOUNT */
 458         } else {
 459                 error = ENOENT;
 460         }
 461
 462         ZFS_EXIT(zsb);
 463
 464         return (error);
 465 }
 466
 467 static void
 468 zfsctl_rename_snap(zfs_sb_t *zsb, zfs_snapentry_t *sep, const char *name)
 469 {
 470         avl_index_t where;
 471
 472         ASSERT(MUTEX_HELD(&zsb->z_ctldir_lock));
 473         ASSERT(sep != NULL);
 474
 475         /*
 476          * Change the name in the AVL tree.
 477          */
 478         avl_remove(&zsb->z_ctldir_snaps, sep);
 479         (void) strcpy(sep->se_name, name);
 480         VERIFY(avl_find(&zsb->z_ctldir_snaps, sep, &where) == NULL);
 481         avl_insert(&zsb->z_ctldir_snaps, sep, where);
 482 }
 483
 484 /*
 485  * Renaming a directory under '.zfs/snapshot' will automatically trigger
 486  * a rename of the snapshot to the new given name.  The rename is confined
 487  * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere.
 488  */
 489 /*ARGSUSED*/
 490 int
 491 zfsctl_snapdir_rename(struct inode *sdip, char *sname,
 492     struct inode *tdip, char *tname, cred_t *cr, int flags)
 493 {
 494         zfs_sb_t *zsb = ITOZSB(sdip);
 495         zfs_snapentry_t search, *sep;
 496         avl_index_t where;
 497         char *to, *from, *real;
 498         int error;
 499
 500         ZFS_ENTER(zsb);
 501
 502         to = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 503         from = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 504         real = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 505
 506         if (zsb->z_case == ZFS_CASE_INSENSITIVE) {
 507                 error = dmu_snapshot_realname(zsb->z_os, sname, real,
 508                     MAXNAMELEN, NULL);
 509                 if (error == 0) {
 510                         sname = real;
 511                 } else if (error != ENOTSUP) {
 512                         goto out;
 513                 }
 514         }
 515
 516         error = zfsctl_snapshot_zname(sdip, sname, MAXNAMELEN, from);
 517         if (!error)
 518                 error = zfsctl_snapshot_zname(tdip, tname, MAXNAMELEN, to);
 519         if (!error)
 520                 error = zfs_secpolicy_rename_perms(from, to, cr);
 521         if (error)
 522                 goto out;
 523
 524         /*
 525          * Cannot move snapshots out of the snapdir.
 526          */
 527         if (sdip != tdip) {
 528                 error = EINVAL;
 529                 goto out;
 530         }
 531
 532         /*
 533          * No-op when names are identical.
 534          */
 535         if (strcmp(sname, tname) == 0) {
 536                 error = 0;
 537                 goto out;
 538         }
 539
 540         mutex_enter(&zsb->z_ctldir_lock);
 541
 542         error = dmu_objset_rename(from, to, B_FALSE);
 543         if (error)
 544                 goto out_unlock;
 545
 546         search.se_name = (char *)sname;
 547         sep = avl_find(&zsb->z_ctldir_snaps, &search, &where);
 548         if (sep)
 549                 zfsctl_rename_snap(zsb, sep, tname);
 550
 551 out_unlock:
 552         mutex_exit(&zsb->z_ctldir_lock);
 553 out:
 554         kmem_free(from, MAXNAMELEN);
 555         kmem_free(to, MAXNAMELEN);
 556         kmem_free(real, MAXNAMELEN);
 557
 558         ZFS_EXIT(zsb);
 559
 560         return (error);
 561 }
 562
 563 /*
 564  * Removing a directory under '.zfs/snapshot' will automatically trigger
 565  * the removal of the snapshot with the given name.
 566  */
 567 /* ARGSUSED */
 568 int
 569 zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags)
 570 {
 571         zfs_sb_t *zsb = ITOZSB(dip);
 572         char *snapname, *real;
 573         int error;
 574
 575         ZFS_ENTER(zsb);
 576
 577         snapname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 578         real = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 579
 580         if (zsb->z_case == ZFS_CASE_INSENSITIVE) {
 581                 error = dmu_snapshot_realname(zsb->z_os, name, real,
 582                     MAXNAMELEN, NULL);
 583                 if (error == 0) {
 584                         name = real;
 585                 } else if (error != ENOTSUP) {
 586                         goto out;
 587                 }
 588         }
 589
 590         error = zfsctl_snapshot_zname(dip, name, MAXNAMELEN, snapname);
 591         if (!error)
 592                 error = zfs_secpolicy_destroy_perms(snapname, cr);
 593         if (error)
 594                 goto out;
 595
 596         error = zfsctl_unmount_snapshot(zsb, name, MNT_FORCE);
 597         if ((error == 0) || (error == ENOENT))
 598                 error = dmu_objset_destroy(snapname, B_FALSE);
 599 out:
 600         kmem_free(snapname, MAXNAMELEN);
 601         kmem_free(real, MAXNAMELEN);
 602
 603         ZFS_EXIT(zsb);
 604
 605         return (error);
 606 }
 607
 608 /*
 609  * Creating a directory under '.zfs/snapshot' will automatically trigger
 610  * the creation of a new snapshot with the given name.
 611  */
 612 /* ARGSUSED */
 613 int
 614 zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
 615         struct inode **ipp, cred_t *cr, int flags)
 616 {
 617         zfs_sb_t *zsb = ITOZSB(dip);
 618         char *dsname;
 619         int error;
 620
 621         dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
 622
 623         if (snapshot_namecheck(dirname, NULL, NULL) != 0) {
 624                 error = EILSEQ;
 625                 goto out;
 626         }
 627
 628         dmu_objset_name(zsb->z_os, dsname);
 629
 630         error = zfs_secpolicy_snapshot_perms(dsname, cr);
 631         if (error)
 632                 goto out;
 633
 634         if (error == 0) {
 635                 error = dmu_objset_snapshot(dsname, dirname,
 636                     NULL, NULL, B_FALSE, B_FALSE, -1);
 637                 if (error)
 638                         goto out;
 639
 640                 error = zfsctl_snapdir_lookup(dip, dirname, ipp,
 641                     0, cr, NULL, NULL);
 642         }
 643 out:
 644         kmem_free(dsname, MAXNAMELEN);
 645
 646         return (error);
 647 }
 648
 649 /*
 650  * When a .zfs/snapshot/<snapshot> inode is evicted they must be removed
 651  * from the snapshot list.  This will normally happen as part of the auto
 652  * unmount, however in the case of a manual snapshot unmount this will be
 653  * the only notification we receive.
 654  */
 655 void
 656 zfsctl_snapdir_inactive(struct inode *ip)
 657 {
 658         zfs_sb_t *zsb = ITOZSB(ip);
 659         zfs_snapentry_t *sep, *next;
 660
 661         mutex_enter(&zsb->z_ctldir_lock);
 662
 663         sep = avl_first(&zsb->z_ctldir_snaps);
 664         while (sep != NULL) {
 665                 next = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
 666
 667                 if (sep->se_inode == ip) {
 668                         avl_remove(&zsb->z_ctldir_snaps, sep);
 669                         taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
 670                         zfsctl_sep_free(sep);
 671                         break;
 672                 }
 673                 sep = next;
 674         }
 675
 676         mutex_exit(&zsb->z_ctldir_lock);
 677 }
 678
 679 /*
 680  * Attempt to unmount a snapshot by making a call to user space.
 681  * There is no assurance that this can or will succeed, is just a
 682  * best effort.  In the case where it does fail, perhaps because
 683  * it's in use, the unmount will fail harmlessly.
 684  */
 685 #define SET_UNMOUNT_CMD \
 686         "exec 0</dev/null " \
 687         "     1>/dev/null " \
 688         "     2>/dev/null; " \
 689         "umount -t zfs -n %s'%s'"
 690
 691 static int
 692 __zfsctl_unmount_snapshot(zfs_snapentry_t *sep, int flags)
 693 {
 694         char *argv[] = { "/bin/sh", "-c", NULL, NULL };
 695         char *envp[] = { NULL };
 696         int error;
 697
 698         argv[2] = kmem_asprintf(SET_UNMOUNT_CMD,
 699             flags & MNT_FORCE ? "-f " : "", sep->se_path);
 700         error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
 701         strfree(argv[2]);
 702
 703         /*
 704          * The umount system utility will return 256 on error.  We must
 705          * assume this error is because the file system is busy so it is
 706          * converted to the more sensible EBUSY.
 707          */
 708         if (error)
 709                 error = EBUSY;
 710
 711         /*
 712          * This was the result of a manual unmount, cancel the delayed work
 713          * to prevent zfsctl_expire_snapshot() from attempting a unmount.
 714          */
 715         if ((error == 0) && !(flags & MNT_EXPIRE))
 716                 taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
 717
 718
 719         return (error);
 720 }
 721
 722 int
 723 zfsctl_unmount_snapshot(zfs_sb_t *zsb, char *name, int flags)
 724 {
 725         zfs_snapentry_t search;
 726         zfs_snapentry_t *sep;
 727         int error = 0;
 728
 729         mutex_enter(&zsb->z_ctldir_lock);
 730
 731         search.se_name = name;
 732         sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL);
 733         if (sep) {
 734                 avl_remove(&zsb->z_ctldir_snaps, sep);
 735                 mutex_exit(&zsb->z_ctldir_lock);
 736
 737                 error = __zfsctl_unmount_snapshot(sep, flags);
 738
 739                 mutex_enter(&zsb->z_ctldir_lock);
 740                 if (error == EBUSY)
 741                         avl_add(&zsb->z_ctldir_snaps, sep);
 742                 else
 743                         zfsctl_sep_free(sep);
 744         } else {
 745                 error = ENOENT;
 746         }
 747
 748         mutex_exit(&zsb->z_ctldir_lock);
 749         ASSERT3S(error, >=, 0);
 750
 751         return (error);
 752 }
 753
 754 /*
 755  * Traverse all mounted snapshots and attempt to unmount them.  This
 756  * is best effort, on failure EEXIST is returned and count will be set
 757  * to the number of file snapshots which could not be unmounted.
 758  */
 759 int
 760 zfsctl_unmount_snapshots(zfs_sb_t *zsb, int flags, int *count)
 761 {
 762         zfs_snapentry_t *sep, *next;
 763         int error = 0;
 764
 765         *count = 0;
 766
 767         ASSERT(zsb->z_ctldir != NULL);
 768         mutex_enter(&zsb->z_ctldir_lock);
 769
 770         sep = avl_first(&zsb->z_ctldir_snaps);
 771         while (sep != NULL) {
 772                 next = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
 773                 avl_remove(&zsb->z_ctldir_snaps, sep);
 774                 mutex_exit(&zsb->z_ctldir_lock);
 775
 776                 error = __zfsctl_unmount_snapshot(sep, flags);
 777
 778                 mutex_enter(&zsb->z_ctldir_lock);
 779                 if (error == EBUSY) {
 780                         avl_add(&zsb->z_ctldir_snaps, sep);
 781                         (*count)++;
 782                 } else {
 783                         zfsctl_sep_free(sep);
 784                 }
 785
 786                 sep = next;
 787         }
 788
 789         mutex_exit(&zsb->z_ctldir_lock);
 790
 791         return ((*count > 0) ? EEXIST : 0);
 792 }
 793
 794 #define SET_MOUNT_CMD \
 795         "exec 0</dev/null " \
 796         "     1>/dev/null " \
 797         "     2>/dev/null; " \
 798         "mount -t zfs -n '%s' '%s'"
 799
 800 int
 801 zfsctl_mount_snapshot(struct path *path, int flags)
 802 {
 803         struct dentry *dentry = path->dentry;
 804         struct inode *ip = dentry->d_inode;
 805         zfs_sb_t *zsb = ITOZSB(ip);
 806         char *full_name, *full_path;
 807         zfs_snapentry_t *sep;
 808         zfs_snapentry_t search;
 809         char *argv[] = { "/bin/sh", "-c", NULL, NULL };
 810         char *envp[] = { NULL };
 811         int error;
 812
 813         ZFS_ENTER(zsb);
 814
 815         full_name = kmem_zalloc(MAXNAMELEN, KM_SLEEP);
 816         full_path = kmem_zalloc(PATH_MAX, KM_SLEEP);
 817
 818         error = zfsctl_snapshot_zname(ip, dname(dentry), MAXNAMELEN, full_name);
 819         if (error)
 820                 goto error;
 821
 822         error = zfsctl_snapshot_zpath(path, PATH_MAX, full_path);
 823         if (error)
 824                 goto error;
 825
 826         /*
 827          * Attempt to mount the snapshot from user space.  Normally this
 828          * would be done using the vfs_kern_mount() function, however that
 829          * function is marked GPL-only and cannot be used.  On error we
 830          * careful to log the real error to the console and return EISDIR
 831          * to safely abort the automount.  This should be very rare.
 832          */
 833         argv[2] = kmem_asprintf(SET_MOUNT_CMD, full_name, full_path);
 834         error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
 835         strfree(argv[2]);
 836         if (error) {
 837                 printk("ZFS: Unable to automount %s at %s: %d\n",
 838                     full_name, full_path, error);
 839                 error = EISDIR;
 840                 goto error;
 841         }
 842
 843         mutex_enter(&zsb->z_ctldir_lock);
 844
 845         /*
 846          * Ensure a previous entry does not exist, if it does safely remove
 847          * it any cancel the outstanding expiration.  This can occur when a
 848          * snapshot is manually unmounted and then an automount is triggered.
 849          */
 850         search.se_name = full_name;
 851         sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL);
 852         if (sep) {
 853                 avl_remove(&zsb->z_ctldir_snaps, sep);
 854                 taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
 855                 zfsctl_sep_free(sep);
 856         }
 857
 858         sep = zfsctl_sep_alloc();
 859         sep->se_name = full_name;
 860         sep->se_path = full_path;
 861         sep->se_inode = ip;
 862         avl_add(&zsb->z_ctldir_snaps, sep);
 863
 864         sep->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq,
 865             zfsctl_expire_snapshot, sep, TQ_SLEEP,
 866             ddi_get_lbolt() + zfs_expire_snapshot * HZ);
 867
 868         mutex_exit(&zsb->z_ctldir_lock);
 869 error:
 870         if (error) {
 871                 kmem_free(full_name, MAXNAMELEN);
 872                 kmem_free(full_path, PATH_MAX);
 873         }
 874
 875         ZFS_EXIT(zsb);
 876
 877         return (error);
 878 }
 879
 880 /*
 881  * Check if this super block has a matching objset id.
 882  */
 883 static int
 884 zfsctl_test_super(struct super_block *sb, void *objsetidp)
 885 {
 886         zfs_sb_t *zsb = sb->s_fs_info;
 887         uint64_t objsetid = *(uint64_t *)objsetidp;
 888
 889         return (dmu_objset_id(zsb->z_os) == objsetid);
 890 }
 891
 892 /*
 893  * Prevent a new super block from being allocated if an existing one
 894  * could not be located.  We only want to preform a lookup operation.
 895  */
 896 static int
 897 zfsctl_set_super(struct super_block *sb, void *objsetidp)
 898 {
 899         return (-EEXIST);
 900 }
 901
 902 int
 903 zfsctl_lookup_objset(struct super_block *sb, uint64_t objsetid, zfs_sb_t **zsbp)
 904 {
 905         zfs_sb_t *zsb = sb->s_fs_info;
 906         struct super_block *sbp;
 907         zfs_snapentry_t *sep;
 908         uint64_t id;
 909         int error;
 910
 911         ASSERT(zsb->z_ctldir != NULL);
 912
 913         mutex_enter(&zsb->z_ctldir_lock);
 914
 915         /*
 916          * Verify that the snapshot is mounted.
 917          */
 918         sep = avl_first(&zsb->z_ctldir_snaps);
 919         while (sep != NULL) {
 920                 error = dmu_snapshot_lookup(zsb->z_os, sep->se_name, &id);
 921                 if (error)
 922                         goto out;
 923
 924                 if (id == objsetid)
 925                         break;
 926
 927                 sep = AVL_NEXT(&zsb->z_ctldir_snaps, sep);
 928         }
 929
 930         if (sep != NULL) {
 931                 /*
 932                  * Lookup the mounted root rather than the covered mount
 933                  * point.  This may fail if the snapshot has just been
 934                  * unmounted by an unrelated user space process.  This
 935                  * race cannot occur to an expired mount point because
 936                  * we hold the zsb->z_ctldir_lock to prevent the race.
 937                  */
 938                 sbp = zpl_sget(&zpl_fs_type, zfsctl_test_super,
 939                     zfsctl_set_super, 0, &id);
 940                 if (IS_ERR(sbp)) {
 941                         error = -PTR_ERR(sbp);
 942                 } else {
 943                         *zsbp = sbp->s_fs_info;
 944                         deactivate_super(sbp);
 945                 }
 946         } else {
 947                 error = EINVAL;
 948         }
 949 out:
 950         mutex_exit(&zsb->z_ctldir_lock);
 951         ASSERT3S(error, >=, 0);
 952
 953         return (error);
 954 }
 955
 956 /* ARGSUSED */
 957 int
 958 zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
 959     int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
 960 {
 961         zfs_sb_t *zsb = ITOZSB(dip);
 962         struct inode *ip;
 963         znode_t *dzp;
 964         int error;
 965
 966         ZFS_ENTER(zsb);
 967
 968         if (zsb->z_shares_dir == 0) {
 969                 ZFS_EXIT(zsb);
 970                 return (ENOTSUP);
 971         }
 972
 973         error = zfs_zget(zsb, zsb->z_shares_dir, &dzp);
 974         if (error) {
 975                 ZFS_EXIT(zsb);
 976                 return (error);
 977         }
 978
 979         error = zfs_lookup(ZTOI(dzp), name, &ip, 0, cr, NULL, NULL);
 980
 981         iput(ZTOI(dzp));
 982         ZFS_EXIT(zsb);
 983
 984         return (error);
 985 }
 986
 987
 988 /*
 989  * Initialize the various pieces we'll need to create and manipulate .zfs
 990  * directories.  Currently this is unused but available.
 991  */
 992 void
 993 zfsctl_init(void)
 994 {
 995         zfs_expire_taskq = taskq_create("z_unmount", 1, maxclsyspri,
 996             1, 8, TASKQ_PREPOPULATE);
 997 }
 998
 999 /*
1000  * Cleanup the various pieces we needed for .zfs directories.  In particular
1001  * ensure the expiry timer is canceled safely.
1002  */
1003 void
1004 zfsctl_fini(void)
1005 {
1006         taskq_destroy(zfs_expire_taskq);
1007 }
1008
/* Expose the zfs_expire_snapshot tunable as a module parameter (mode 0644). */
1009 module_param(zfs_expire_snapshot, int, 0644);
1010 MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot");