module/zfs/zfs_znode.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24 */
25
26 /* Portions Copyright 2007 Jeremy Teo */
27
28 #ifdef _KERNEL
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/time.h>
32 #include <sys/sysmacros.h>
33 #include <sys/mntent.h>
34 #include <sys/u8_textprep.h>
35 #include <sys/dsl_dataset.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/file.h>
39 #include <sys/kmem.h>
40 #include <sys/errno.h>
41 #include <sys/mode.h>
42 #include <sys/atomic.h>
43 #include <sys/zfs_dir.h>
44 #include <sys/zfs_acl.h>
45 #include <sys/zfs_ioctl.h>
46 #include <sys/zfs_rlock.h>
47 #include <sys/zfs_fuid.h>
48 #include <sys/zfs_vnops.h>
49 #include <sys/zfs_ctldir.h>
50 #include <sys/dnode.h>
51 #include <sys/fs/zfs.h>
52 #include <sys/zpl.h>
53 #endif /* _KERNEL */
54
55 #include <sys/dmu.h>
56 #include <sys/dmu_objset.h>
57 #include <sys/dmu_tx.h>
58 #include <sys/refcount.h>
59 #include <sys/stat.h>
60 #include <sys/zap.h>
61 #include <sys/zfs_znode.h>
62 #include <sys/sa.h>
63 #include <sys/zfs_sa.h>
64 #include <sys/zfs_stat.h>
65
66 #include "zfs_prop.h"
67 #include "zfs_comutil.h"
68
69 /*
70 * Define ZNODE_STATS to turn on statistics gathering. By default, it is only
71 * turned on when DEBUG is also defined.
72 */
73 #ifdef DEBUG
74 #define ZNODE_STATS
75 #endif /* DEBUG */
76
77 #ifdef ZNODE_STATS
78 #define ZNODE_STAT_ADD(stat) ((stat)++)
79 #else
80 #define ZNODE_STAT_ADD(stat) /* nothing */
81 #endif /* ZNODE_STATS */
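/*
 * Editorial sketch (not upstream code): ZNODE_STAT_ADD() is intended to
 * wrap simple event counters. The names below are hypothetical; the
 * increment compiles away entirely when ZNODE_STATS is not defined.
 */
#if 0	/* example only */
static uint64_t znode_example_hits;

static void
znode_example_record_hit(void)
{
	ZNODE_STAT_ADD(znode_example_hits);	/* (stat)++ under ZNODE_STATS */
}
#endif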
82
83 /*
84 * Functions needed for userland (i.e., libzpool) are not put under
85 * #ifdef _KERNEL; the rest of the functions have dependencies
86 * (such as VFS logic) that will not compile easily in userland.
87 */
88 #ifdef _KERNEL
89
90 static kmem_cache_t *znode_cache = NULL;
91 static kmem_cache_t *znode_hold_cache = NULL;
92 unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
93
94 /*ARGSUSED*/
95 static int
96 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
97 {
98 znode_t *zp = buf;
99
100 inode_init_once(ZTOI(zp));
101 list_link_init(&zp->z_link_node);
102
103 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
104 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
105 rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
106 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
107 rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
108
109 zfs_rlock_init(&zp->z_range_lock);
110
111 zp->z_dirlocks = NULL;
112 zp->z_acl_cached = NULL;
113 zp->z_xattr_cached = NULL;
114 zp->z_xattr_parent = 0;
115 zp->z_moved = 0;
116 return (0);
117 }
118
119 /*ARGSUSED*/
120 static void
121 zfs_znode_cache_destructor(void *buf, void *arg)
122 {
123 znode_t *zp = buf;
124
125 ASSERT(!list_link_active(&zp->z_link_node));
126 mutex_destroy(&zp->z_lock);
127 rw_destroy(&zp->z_parent_lock);
128 rw_destroy(&zp->z_name_lock);
129 mutex_destroy(&zp->z_acl_lock);
130 rw_destroy(&zp->z_xattr_lock);
131 zfs_rlock_destroy(&zp->z_range_lock);
132
133 ASSERT(zp->z_dirlocks == NULL);
134 ASSERT(zp->z_acl_cached == NULL);
135 ASSERT(zp->z_xattr_cached == NULL);
136 }
137
138 static int
139 zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
140 {
141 znode_hold_t *zh = buf;
142
143 mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
144 refcount_create(&zh->zh_refcount);
145 zh->zh_obj = ZFS_NO_OBJECT;
146
147 return (0);
148 }
149
150 static void
151 zfs_znode_hold_cache_destructor(void *buf, void *arg)
152 {
153 znode_hold_t *zh = buf;
154
155 mutex_destroy(&zh->zh_lock);
156 refcount_destroy(&zh->zh_refcount);
157 }
158
159 void
160 zfs_znode_init(void)
161 {
162 /*
163 * Initialize the znode cache. The KMC_SLAB hint is used so that the
164 * cache is backed by the Linux slab (kmalloc()), which is required for
165 * wait_on_bit() operations on the embedded inode to work properly.
166 */
167 ASSERT(znode_cache == NULL);
168 znode_cache = kmem_cache_create("zfs_znode_cache",
169 sizeof (znode_t), 0, zfs_znode_cache_constructor,
170 zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
171
172 ASSERT(znode_hold_cache == NULL);
173 znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
174 sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
175 zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
176 }
177
178 void
179 zfs_znode_fini(void)
180 {
181 /*
182 * Clean up the znode caches.
183 */
184 if (znode_cache)
185 kmem_cache_destroy(znode_cache);
186 znode_cache = NULL;
187
188 if (znode_hold_cache)
189 kmem_cache_destroy(znode_hold_cache);
190 znode_hold_cache = NULL;
191 }
192
193 /*
194 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
195 * serialize access to a znode and its SA buffer while the object is being
196 * created or destroyed. This kind of locking would normally reside in the
197 * znode itself but in this case that's impossible because the znode and SA
198 * buffer may not yet exist. Therefore the locking is handled externally
199 * with an array of mutexes and AVL trees which contain per-object locks.
200 *
201 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
202 * into the correct AVL tree, and finally the per-object lock is held. In
203 * zfs_znode_hold_exit() the process is reversed. The per-object lock is
204 * released, removed from the AVL tree and destroyed if there are no waiters.
205 *
206 * This scheme has two important properties:
207 *
208 * 1) No memory allocations are performed while holding one of the z_hold_locks.
209 * This ensures evict(), which can be called from direct memory reclaim, will
210 * never block waiting on a z_hold_lock which just happens to have hashed
211 * to the same index.
212 *
213 * 2) All locks used to serialize access to an object are per-object and never
214 * shared. This minimizes lock contention without creating a large number
215 * of dedicated locks.
216 *
217 * On the downside it does require znode_hold_t structures to be frequently
218 * allocated and freed. However, because these are backed by a kmem cache
219 * and are very short-lived, this cost is minimal.
220 */
221 int
222 zfs_znode_hold_compare(const void *a, const void *b)
223 {
224 const znode_hold_t *zh_a = (const znode_hold_t *)a;
225 const znode_hold_t *zh_b = (const znode_hold_t *)b;
226
227 return (AVL_CMP(zh_a->zh_obj, zh_b->zh_obj));
228 }
229
230 boolean_t
231 zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
232 {
233 znode_hold_t *zh, search;
234 int i = ZFS_OBJ_HASH(zfsvfs, obj);
235 boolean_t held;
236
237 search.zh_obj = obj;
238
239 mutex_enter(&zfsvfs->z_hold_locks[i]);
240 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
241 held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
242 mutex_exit(&zfsvfs->z_hold_locks[i]);
243
244 return (held);
245 }
246
247 static znode_hold_t *
248 zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
249 {
250 znode_hold_t *zh, *zh_new, search;
251 int i = ZFS_OBJ_HASH(zfsvfs, obj);
252 boolean_t found = B_FALSE;
253
254 zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
255 zh_new->zh_obj = obj;
256 search.zh_obj = obj;
257
258 mutex_enter(&zfsvfs->z_hold_locks[i]);
259 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
260 if (likely(zh == NULL)) {
261 zh = zh_new;
262 avl_add(&zfsvfs->z_hold_trees[i], zh);
263 } else {
264 ASSERT3U(zh->zh_obj, ==, obj);
265 found = B_TRUE;
266 }
267 refcount_add(&zh->zh_refcount, NULL);
268 mutex_exit(&zfsvfs->z_hold_locks[i]);
269
270 if (found == B_TRUE)
271 kmem_cache_free(znode_hold_cache, zh_new);
272
273 ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
274 ASSERT3S(refcount_count(&zh->zh_refcount), >, 0);
275 mutex_enter(&zh->zh_lock);
276
277 return (zh);
278 }
279
280 static void
281 zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
282 {
283 int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
284 boolean_t remove = B_FALSE;
285
286 ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
287 ASSERT3S(refcount_count(&zh->zh_refcount), >, 0);
288 mutex_exit(&zh->zh_lock);
289
290 mutex_enter(&zfsvfs->z_hold_locks[i]);
291 if (refcount_remove(&zh->zh_refcount, NULL) == 0) {
292 avl_remove(&zfsvfs->z_hold_trees[i], zh);
293 remove = B_TRUE;
294 }
295 mutex_exit(&zfsvfs->z_hold_locks[i]);
296
297 if (remove == B_TRUE)
298 kmem_cache_free(znode_hold_cache, zh);
299 }
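/*
 * Editorial sketch (not upstream code): the caller pattern for the two
 * functions above, assuming a zfsvfs_t *zfsvfs and object number obj are
 * in scope. Real callers in this file (zfs_mknode(), zfs_zget(), etc.)
 * follow the same shape.
 */
#if 0	/* example only */
	znode_hold_t *zh;

	zh = zfs_znode_hold_enter(zfsvfs, obj);	/* per-object lock held */
	/* ... create or destroy the object and its SA buffer ... */
	zfs_znode_hold_exit(zfsvfs, zh);	/* dropped; freed if unused */
#endif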
300
301 int
302 zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
303 {
304 #ifdef HAVE_SMB_SHARE
305 zfs_acl_ids_t acl_ids;
306 vattr_t vattr;
307 znode_t *sharezp;
308 vnode_t *vp;
309 znode_t *zp;
310 int error;
311
312 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
313 vattr.va_mode = S_IFDIR | 0555;
314 vattr.va_uid = crgetuid(kcred);
315 vattr.va_gid = crgetgid(kcred);
316
317 sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
318 sharezp->z_moved = 0;
319 sharezp->z_unlinked = 0;
320 sharezp->z_atime_dirty = 0;
321 sharezp->z_zfsvfs = zfsvfs;
322 sharezp->z_is_sa = zfsvfs->z_use_sa;
323 sharezp->z_pflags = 0;
324
325 vp = ZTOV(sharezp);
326 vn_reinit(vp);
327 vp->v_type = VDIR;
328
329 VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
330 kcred, NULL, &acl_ids));
331 zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
332 ASSERT3P(zp, ==, sharezp);
333 ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
334 POINTER_INVALIDATE(&sharezp->z_zfsvfs);
335 error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
336 ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
337 zfsvfs->z_shares_dir = sharezp->z_id;
338
339 zfs_acl_ids_free(&acl_ids);
340 // ZTOV(sharezp)->v_count = 0;
341 sa_handle_destroy(sharezp->z_sa_hdl);
342 kmem_cache_free(znode_cache, sharezp);
343
344 return (error);
345 #else
346 return (0);
347 #endif /* HAVE_SMB_SHARE */
348 }
349
350 static void
351 zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
352 dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
353 {
354 ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
355
356 mutex_enter(&zp->z_lock);
357
358 ASSERT(zp->z_sa_hdl == NULL);
359 ASSERT(zp->z_acl_cached == NULL);
360 if (sa_hdl == NULL) {
361 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
362 SA_HDL_SHARED, &zp->z_sa_hdl));
363 } else {
364 zp->z_sa_hdl = sa_hdl;
365 sa_set_userp(sa_hdl, zp);
366 }
367
368 zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
369
370 mutex_exit(&zp->z_lock);
371 }
372
373 void
374 zfs_znode_dmu_fini(znode_t *zp)
375 {
376 ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked ||
377 RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));
378
379 sa_handle_destroy(zp->z_sa_hdl);
380 zp->z_sa_hdl = NULL;
381 }
382
383 /*
384 * Called by new_inode() to allocate a new inode.
385 */
386 int
387 zfs_inode_alloc(struct super_block *sb, struct inode **ip)
388 {
389 znode_t *zp;
390
391 zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
392 *ip = ZTOI(zp);
393
394 return (0);
395 }
396
397 /*
398 * Called in multiple places when an inode should be destroyed.
399 */
400 void
401 zfs_inode_destroy(struct inode *ip)
402 {
403 znode_t *zp = ITOZ(ip);
404 zfsvfs_t *zfsvfs = ZTOZSB(zp);
405
406 mutex_enter(&zfsvfs->z_znodes_lock);
407 if (list_link_active(&zp->z_link_node)) {
408 list_remove(&zfsvfs->z_all_znodes, zp);
409 zfsvfs->z_nr_znodes--;
410 }
411 mutex_exit(&zfsvfs->z_znodes_lock);
412
413 if (zp->z_acl_cached) {
414 zfs_acl_free(zp->z_acl_cached);
415 zp->z_acl_cached = NULL;
416 }
417
418 if (zp->z_xattr_cached) {
419 nvlist_free(zp->z_xattr_cached);
420 zp->z_xattr_cached = NULL;
421 }
422
423 kmem_cache_free(znode_cache, zp);
424 }
425
426 static void
427 zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
428 {
429 uint64_t rdev = 0;
430
431 switch (ip->i_mode & S_IFMT) {
432 case S_IFREG:
433 ip->i_op = &zpl_inode_operations;
434 ip->i_fop = &zpl_file_operations;
435 ip->i_mapping->a_ops = &zpl_address_space_operations;
436 break;
437
438 case S_IFDIR:
439 ip->i_op = &zpl_dir_inode_operations;
440 ip->i_fop = &zpl_dir_file_operations;
441 ITOZ(ip)->z_zn_prefetch = B_TRUE;
442 break;
443
444 case S_IFLNK:
445 ip->i_op = &zpl_symlink_inode_operations;
446 break;
447
448 /*
449 * rdev is only stored in the SA for device files.
450 */
451 case S_IFCHR:
452 case S_IFBLK:
453 (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
454 sizeof (rdev));
455 /*FALLTHROUGH*/
456 case S_IFIFO:
457 case S_IFSOCK:
458 init_special_inode(ip, ip->i_mode, rdev);
459 ip->i_op = &zpl_special_inode_operations;
460 break;
461
462 default:
463 zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
464 (u_longlong_t)ip->i_ino, ip->i_mode);
465
466 /* Assume the inode is a file and attempt to continue */
467 ip->i_mode = S_IFREG | 0644;
468 ip->i_op = &zpl_inode_operations;
469 ip->i_fop = &zpl_file_operations;
470 ip->i_mapping->a_ops = &zpl_address_space_operations;
471 break;
472 }
473 }
474
475 void
476 zfs_set_inode_flags(znode_t *zp, struct inode *ip)
477 {
478 /*
479 * Linux and Solaris have different sets of file attributes, so we
480 * restrict this conversion to the intersection of the two.
481 */
482 #ifdef HAVE_INODE_SET_FLAGS
483 unsigned int flags = 0;
484 if (zp->z_pflags & ZFS_IMMUTABLE)
485 flags |= S_IMMUTABLE;
486 if (zp->z_pflags & ZFS_APPENDONLY)
487 flags |= S_APPEND;
488
489 inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
490 #else
491 if (zp->z_pflags & ZFS_IMMUTABLE)
492 ip->i_flags |= S_IMMUTABLE;
493 else
494 ip->i_flags &= ~S_IMMUTABLE;
495
496 if (zp->z_pflags & ZFS_APPENDONLY)
497 ip->i_flags |= S_APPEND;
498 else
499 ip->i_flags &= ~S_APPEND;
500 #endif
501 }
502
503 /*
504 * Update the embedded inode given the znode. We should work toward
505 * eliminating this function as soon as possible by removing values
506 * which are duplicated between the znode and inode. If the generic
507 * inode has the correct field it should be used, and the ZFS code
508 * updated to access the inode. This can be done incrementally.
509 */
510 void
511 zfs_inode_update(znode_t *zp)
512 {
513 zfsvfs_t *zfsvfs;
514 struct inode *ip;
515 uint32_t blksize;
516 u_longlong_t i_blocks;
517
518 ASSERT(zp != NULL);
519 zfsvfs = ZTOZSB(zp);
520 ip = ZTOI(zp);
521
522 /* Skip .zfs control nodes which do not exist on disk. */
523 if (zfsctl_is_node(ip))
524 return;
525
526 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
527
528 spin_lock(&ip->i_lock);
529 ip->i_blocks = i_blocks;
530 i_size_write(ip, zp->z_size);
531 spin_unlock(&ip->i_lock);
532 }
533
534
535 /*
536 * Construct a znode+inode and initialize.
537 *
538 * This does not call dmu_set_user(); that is up to the
539 * caller, in case the znode is not going to be returned.
541 */
542 static znode_t *
543 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
544 dmu_object_type_t obj_type, uint64_t obj, sa_handle_t *hdl)
545 {
546 znode_t *zp;
547 struct inode *ip;
548 uint64_t mode;
549 uint64_t parent;
550 uint64_t tmp_gen;
551 uint64_t links;
552 uint64_t z_uid, z_gid;
553 uint64_t atime[2], mtime[2], ctime[2];
554 uint64_t projid = ZFS_DEFAULT_PROJID;
555 sa_bulk_attr_t bulk[11];
556 int count = 0;
557
558 ASSERT(zfsvfs != NULL);
559
560 ip = new_inode(zfsvfs->z_sb);
561 if (ip == NULL)
562 return (NULL);
563
564 zp = ITOZ(ip);
565 ASSERT(zp->z_dirlocks == NULL);
566 ASSERT3P(zp->z_acl_cached, ==, NULL);
567 ASSERT3P(zp->z_xattr_cached, ==, NULL);
568 zp->z_moved = 0;
569 zp->z_sa_hdl = NULL;
570 zp->z_unlinked = 0;
571 zp->z_atime_dirty = 0;
572 zp->z_mapcnt = 0;
573 zp->z_id = db->db_object;
574 zp->z_blksz = blksz;
575 zp->z_seq = 0x7A4653;
576 zp->z_sync_cnt = 0;
577 zp->z_is_mapped = B_FALSE;
578 zp->z_is_ctldir = B_FALSE;
579 zp->z_is_stale = B_FALSE;
580 zp->z_range_lock.zr_size = &zp->z_size;
581 zp->z_range_lock.zr_blksz = &zp->z_blksz;
582 zp->z_range_lock.zr_max_blksz = &ZTOZSB(zp)->z_max_blksz;
583
584 zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
585
586 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
587 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
588 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
589 &zp->z_size, 8);
590 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
591 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
592 &zp->z_pflags, 8);
593 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
594 &parent, 8);
595 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
596 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
597 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
598 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
599 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
600
601 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
602 (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
603 (zp->z_pflags & ZFS_PROJID) &&
604 sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
605 if (hdl == NULL)
606 sa_handle_destroy(zp->z_sa_hdl);
607 zp->z_sa_hdl = NULL;
608 goto error;
609 }
610
611 zp->z_projid = projid;
612 zp->z_mode = ip->i_mode = mode;
613 ip->i_generation = (uint32_t)tmp_gen;
614 ip->i_blkbits = SPA_MINBLOCKSHIFT;
615 set_nlink(ip, (uint32_t)links);
616 zfs_uid_write(ip, z_uid);
617 zfs_gid_write(ip, z_gid);
618 zfs_set_inode_flags(zp, ip);
619
620 /* Cache the xattr parent id */
621 if (zp->z_pflags & ZFS_XATTR)
622 zp->z_xattr_parent = parent;
623
624 ZFS_TIME_DECODE(&ip->i_atime, atime);
625 ZFS_TIME_DECODE(&ip->i_mtime, mtime);
626 ZFS_TIME_DECODE(&ip->i_ctime, ctime);
627
628 ip->i_ino = obj;
629 zfs_inode_update(zp);
630 zfs_inode_set_ops(zfsvfs, ip);
631
632 /*
633 * The only way insert_inode_locked() can fail is if the ip->i_ino
634 * number is already hashed for this super block. This can never
635 * happen because the inode numbers map 1:1 with the object numbers.
636 *
637 * The one exception is rolling back a mounted file system, but in
638 * this case all the active inodes are unhashed during the rollback.
639 */
640 VERIFY3S(insert_inode_locked(ip), ==, 0);
641
642 mutex_enter(&zfsvfs->z_znodes_lock);
643 list_insert_tail(&zfsvfs->z_all_znodes, zp);
644 zfsvfs->z_nr_znodes++;
645 membar_producer();
646 mutex_exit(&zfsvfs->z_znodes_lock);
647
648 unlock_new_inode(ip);
649 return (zp);
650
651 error:
652 iput(ip);
653 return (NULL);
654 }
655
656 /*
657 * Safely mark an inode dirty. Inodes which are part of a read-only
658 * file system or snapshot may not be dirtied.
659 */
660 void
661 zfs_mark_inode_dirty(struct inode *ip)
662 {
663 zfsvfs_t *zfsvfs = ITOZSB(ip);
664
665 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
666 return;
667
668 mark_inode_dirty(ip);
669 }
670
671 static uint64_t empty_xattr;
672 static uint64_t pad[4];
673 static zfs_acl_phys_t acl_phys;
674 /*
675 * Create a new DMU object to hold a zfs znode.
676 *
677 * IN: dzp - parent directory for new znode
678 * vap - file attributes for new znode
679 * tx - dmu transaction id for zap operations
680 * cr - credentials of caller
681 * flag - flags:
682 * IS_ROOT_NODE - new object will be root
683 * IS_XATTR - new object is an attribute
684 * bonuslen - length of bonus buffer
685 * setaclp - File/Dir initial ACL
686 * fuidp - Tracks fuid allocation.
687 *
688 * OUT: zpp - allocated znode
689 *
690 */
691 void
692 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
693 uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
694 {
695 uint64_t crtime[2], atime[2], mtime[2], ctime[2];
696 uint64_t mode, size, links, parent, pflags;
697 uint64_t projid = ZFS_DEFAULT_PROJID;
698 uint64_t rdev = 0;
699 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
700 dmu_buf_t *db;
701 inode_timespec_t now;
702 uint64_t gen, obj;
703 int bonuslen;
704 int dnodesize;
705 sa_handle_t *sa_hdl;
706 dmu_object_type_t obj_type;
707 sa_bulk_attr_t *sa_attrs;
708 int cnt = 0;
709 zfs_acl_locator_cb_t locate = { 0 };
710 znode_hold_t *zh;
711
712 if (zfsvfs->z_replay) {
713 obj = vap->va_nodeid;
714 now = vap->va_ctime; /* see zfs_replay_create() */
715 gen = vap->va_nblocks; /* ditto */
716 dnodesize = vap->va_fsid; /* ditto */
717 } else {
718 obj = 0;
719 gethrestime(&now);
720 gen = dmu_tx_get_txg(tx);
721 dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
722 }
723
724 if (dnodesize == 0)
725 dnodesize = DNODE_MIN_SIZE;
726
727 obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
728
729 bonuslen = (obj_type == DMU_OT_SA) ?
730 DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
731
732 /*
733 * Create a new DMU object.
734 */
735 /*
736 * There's currently no mechanism for pre-reading the blocks that will
737 * be needed to allocate a new object, so we accept the small chance
738 * that there will be an i/o error and we will fail one of the
739 * assertions below.
740 */
741 if (S_ISDIR(vap->va_mode)) {
742 if (zfsvfs->z_replay) {
743 VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
744 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
745 obj_type, bonuslen, dnodesize, tx));
746 } else {
747 obj = zap_create_norm_dnsize(zfsvfs->z_os,
748 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
749 obj_type, bonuslen, dnodesize, tx);
750 }
751 } else {
752 if (zfsvfs->z_replay) {
753 VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
754 DMU_OT_PLAIN_FILE_CONTENTS, 0,
755 obj_type, bonuslen, dnodesize, tx));
756 } else {
757 obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
758 DMU_OT_PLAIN_FILE_CONTENTS, 0,
759 obj_type, bonuslen, dnodesize, tx);
760 }
761 }
762
763 zh = zfs_znode_hold_enter(zfsvfs, obj);
764 VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
765
766 /*
767 * If this is the root, fix up the half-initialized parent pointer
768 * to reference the just-allocated physical data area.
769 */
770 if (flag & IS_ROOT_NODE) {
771 dzp->z_id = obj;
772 }
773
774 /*
775 * If parent is an xattr, so am I.
776 */
777 if (dzp->z_pflags & ZFS_XATTR) {
778 flag |= IS_XATTR;
779 }
780
781 if (zfsvfs->z_use_fuids)
782 pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
783 else
784 pflags = 0;
785
786 if (S_ISDIR(vap->va_mode)) {
787 size = 2; /* contents ("." and "..") */
788 links = 2;
789 } else {
790 size = 0;
791 links = (flag & IS_TMPFILE) ? 0 : 1;
792 }
793
794 if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
795 rdev = vap->va_rdev;
796
797 parent = dzp->z_id;
798 mode = acl_ids->z_mode;
799 if (flag & IS_XATTR)
800 pflags |= ZFS_XATTR;
801
802 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
803 /*
804 * With ZFS_PROJID flag, we can easily know whether there is
805 * project ID stored on disk or not. See zfs_space_delta_cb().
806 */
807 if (obj_type != DMU_OT_ZNODE &&
808 dmu_objset_projectquota_enabled(zfsvfs->z_os))
809 pflags |= ZFS_PROJID;
810
811 /*
812 * Inherit project ID from parent if required.
813 */
814 projid = zfs_inherit_projid(dzp);
815 if (dzp->z_pflags & ZFS_PROJINHERIT)
816 pflags |= ZFS_PROJINHERIT;
817 }
818
819 /*
820 * "No execs denied" will be determined when zfs_mode_compute() is called.
821 */
822 pflags |= acl_ids->z_aclp->z_hints &
823 (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
824 ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
825
826 ZFS_TIME_ENCODE(&now, crtime);
827 ZFS_TIME_ENCODE(&now, ctime);
828
829 if (vap->va_mask & ATTR_ATIME) {
830 ZFS_TIME_ENCODE(&vap->va_atime, atime);
831 } else {
832 ZFS_TIME_ENCODE(&now, atime);
833 }
834
835 if (vap->va_mask & ATTR_MTIME) {
836 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
837 } else {
838 ZFS_TIME_ENCODE(&now, mtime);
839 }
840
841 /* Now add in all of the "SA" attributes */
842 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
843 &sa_hdl));
844
845 /*
846 * Set up the array of attributes to be replaced/set on the new file.
847 *
848 * The order for DMU_OT_ZNODE is critical since it needs to be constructed
849 * in the old znode_phys_t format. Don't change this ordering.
850 */
851 sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
852
853 if (obj_type == DMU_OT_ZNODE) {
854 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
855 NULL, &atime, 16);
856 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
857 NULL, &mtime, 16);
858 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
859 NULL, &ctime, 16);
860 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
861 NULL, &crtime, 16);
862 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
863 NULL, &gen, 8);
864 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
865 NULL, &mode, 8);
866 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
867 NULL, &size, 8);
868 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
869 NULL, &parent, 8);
870 } else {
871 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
872 NULL, &mode, 8);
873 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
874 NULL, &size, 8);
875 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
876 NULL, &gen, 8);
877 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
878 NULL, &acl_ids->z_fuid, 8);
879 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
880 NULL, &acl_ids->z_fgid, 8);
881 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
882 NULL, &parent, 8);
883 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
884 NULL, &pflags, 8);
885 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
886 NULL, &atime, 16);
887 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
888 NULL, &mtime, 16);
889 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
890 NULL, &ctime, 16);
891 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
892 NULL, &crtime, 16);
893 }
894
895 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
896
897 if (obj_type == DMU_OT_ZNODE) {
898 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
899 &empty_xattr, 8);
900 } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
901 pflags & ZFS_PROJID) {
902 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
903 NULL, &projid, 8);
904 }
905 if (obj_type == DMU_OT_ZNODE ||
906 (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
907 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
908 NULL, &rdev, 8);
909 }
910 if (obj_type == DMU_OT_ZNODE) {
911 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
912 NULL, &pflags, 8);
913 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
914 &acl_ids->z_fuid, 8);
915 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
916 &acl_ids->z_fgid, 8);
917 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
918 sizeof (uint64_t) * 4);
919 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
920 &acl_phys, sizeof (zfs_acl_phys_t));
921 } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
922 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
923 &acl_ids->z_aclp->z_acl_count, 8);
924 locate.cb_aclp = acl_ids->z_aclp;
925 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
926 zfs_acl_data_locator, &locate,
927 acl_ids->z_aclp->z_acl_bytes);
928 mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
929 acl_ids->z_fuid, acl_ids->z_fgid);
930 }
931
932 VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
933
934 if (!(flag & IS_ROOT_NODE)) {
935 /*
936 * The call to zfs_znode_alloc() may fail if memory is low
937 * via the call path: alloc_inode() -> inode_init_always() ->
938 * security_inode_alloc() -> inode_alloc_security(). Since
939 * the existing code is written such that zfs_mknode() cannot
940 * fail, retry until sufficient memory has been reclaimed.
941 */
942 do {
943 *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, obj,
944 sa_hdl);
945 } while (*zpp == NULL);
946
947 VERIFY(*zpp != NULL);
948 VERIFY(dzp != NULL);
949 } else {
950 /*
951 * If we are creating the root node, the "parent" we
952 * passed in is the znode for the root.
953 */
954 *zpp = dzp;
955
956 (*zpp)->z_sa_hdl = sa_hdl;
957 }
958
959 (*zpp)->z_pflags = pflags;
960 (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
961 (*zpp)->z_dnodesize = dnodesize;
962 (*zpp)->z_projid = projid;
963
964 if (obj_type == DMU_OT_ZNODE ||
965 acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
966 VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
967 }
968 kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
969 zfs_znode_hold_exit(zfsvfs, zh);
970 }
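/*
 * Editorial sketch (not upstream code): the typical caller shape for
 * zfs_mknode(), loosely modeled on the create path. The directory znode
 * dzp, attributes vap, assigned transaction tx and credentials cr are
 * assumed to be in scope; transaction setup and ZIL logging are omitted.
 */
#if 0	/* example only */
	zfs_acl_ids_t acl_ids;
	znode_t *zp;

	VERIFY0(zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids));
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);	/* cannot fail */
	zfs_acl_ids_free(&acl_ids);
#endif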
971
972 /*
973 * Update in-core attributes. It is assumed the caller will be doing an
974 * sa_bulk_update to push the changes out.
975 */
976 void
977 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
978 {
979 xoptattr_t *xoap;
980 boolean_t update_inode = B_FALSE;
981
982 xoap = xva_getxoptattr(xvap);
983 ASSERT(xoap);
984
985 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
986 uint64_t times[2];
987 ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
988 (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
989 &times, sizeof (times), tx);
990 XVA_SET_RTN(xvap, XAT_CREATETIME);
991 }
992 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
993 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
994 zp->z_pflags, tx);
995 XVA_SET_RTN(xvap, XAT_READONLY);
996 }
997 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
998 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
999 zp->z_pflags, tx);
1000 XVA_SET_RTN(xvap, XAT_HIDDEN);
1001 }
1002 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1003 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
1004 zp->z_pflags, tx);
1005 XVA_SET_RTN(xvap, XAT_SYSTEM);
1006 }
1007 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1008 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
1009 zp->z_pflags, tx);
1010 XVA_SET_RTN(xvap, XAT_ARCHIVE);
1011 }
1012 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1013 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
1014 zp->z_pflags, tx);
1015 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
1016
1017 update_inode = B_TRUE;
1018 }
1019 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1020 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
1021 zp->z_pflags, tx);
1022 XVA_SET_RTN(xvap, XAT_NOUNLINK);
1023 }
1024 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1025 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
1026 zp->z_pflags, tx);
1027 XVA_SET_RTN(xvap, XAT_APPENDONLY);
1028
1029 update_inode = B_TRUE;
1030 }
1031 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1032 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
1033 zp->z_pflags, tx);
1034 XVA_SET_RTN(xvap, XAT_NODUMP);
1035 }
1036 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1037 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
1038 zp->z_pflags, tx);
1039 XVA_SET_RTN(xvap, XAT_OPAQUE);
1040 }
1041 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1042 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
1043 xoap->xoa_av_quarantined, zp->z_pflags, tx);
1044 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1045 }
1046 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1047 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
1048 zp->z_pflags, tx);
1049 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1050 }
1051 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
1052 zfs_sa_set_scanstamp(zp, xvap, tx);
1053 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
1054 }
1055 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1056 ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
1057 zp->z_pflags, tx);
1058 XVA_SET_RTN(xvap, XAT_REPARSE);
1059 }
1060 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1061 ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
1062 zp->z_pflags, tx);
1063 XVA_SET_RTN(xvap, XAT_OFFLINE);
1064 }
1065 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1066 ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
1067 zp->z_pflags, tx);
1068 XVA_SET_RTN(xvap, XAT_SPARSE);
1069 }
1070 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
1071 ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
1072 zp->z_pflags, tx);
1073 XVA_SET_RTN(xvap, XAT_PROJINHERIT);
1074 }
1075
1076 if (update_inode)
1077 zfs_set_inode_flags(zp, ZTOI(zp));
1078 }
1079
1080 int
1081 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
1082 {
1083 dmu_object_info_t doi;
1084 dmu_buf_t *db;
1085 znode_t *zp;
1086 znode_hold_t *zh;
1087 int err;
1088 sa_handle_t *hdl;
1089
1090 *zpp = NULL;
1091
1092 again:
1093 zh = zfs_znode_hold_enter(zfsvfs, obj_num);
1094
1095 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1096 if (err) {
1097 zfs_znode_hold_exit(zfsvfs, zh);
1098 return (err);
1099 }
1100
1101 dmu_object_info_from_db(db, &doi);
1102 if (doi.doi_bonus_type != DMU_OT_SA &&
1103 (doi.doi_bonus_type != DMU_OT_ZNODE ||
1104 (doi.doi_bonus_type == DMU_OT_ZNODE &&
1105 doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1106 sa_buf_rele(db, NULL);
1107 zfs_znode_hold_exit(zfsvfs, zh);
1108 return (SET_ERROR(EINVAL));
1109 }
1110
1111 hdl = dmu_buf_get_user(db);
1112 if (hdl != NULL) {
1113 zp = sa_get_userdata(hdl);
1114
1116 /*
1117 * Since "SA" does immediate eviction we
1118 * should never find a sa handle that doesn't
1119 * know about the znode.
1120 */
1121
1122 ASSERT3P(zp, !=, NULL);
1123
1124 mutex_enter(&zp->z_lock);
1125 ASSERT3U(zp->z_id, ==, obj_num);
1126 /*
1127 * If igrab() returns NULL the VFS has independently
1128 * determined the inode should be evicted and has
1129 * called iput_final() to start the eviction process.
1130 * The SA handle is still valid but because the VFS
1131 * requires that the eviction succeed we must drop
1132 * our locks and references to allow the eviction to
1133 * complete. The zfs_zget() may then be retried.
1134 *
1135 * This unlikely case could be optimized by registering
1136 * a sops->drop_inode() callback. The callback would
1137 * need to detect the active SA hold thereby informing
1138 * the VFS that this inode should not be evicted.
1139 */
1140 if (igrab(ZTOI(zp)) == NULL) {
1141 mutex_exit(&zp->z_lock);
1142 sa_buf_rele(db, NULL);
1143 zfs_znode_hold_exit(zfsvfs, zh);
1144 /* inode might need this to finish evict */
1145 cond_resched();
1146 goto again;
1147 }
1148 *zpp = zp;
1149 err = 0;
1150 mutex_exit(&zp->z_lock);
1151 sa_buf_rele(db, NULL);
1152 zfs_znode_hold_exit(zfsvfs, zh);
1153 return (err);
1154 }
1155
1156 /*
1157 * Not found; create a new znode/vnode, but only if the file exists.
1158 *
1159 * There is a small window where zfs_vget() could
1160 * find this object while a file create is still in
1161 * progress. This is checked for in zfs_znode_alloc().
1162 *
1163 * If zfs_znode_alloc() fails it will drop the hold on the
1164 * bonus buffer.
1165 */
1166 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
1167 doi.doi_bonus_type, obj_num, NULL);
1168 if (zp == NULL) {
1169 err = SET_ERROR(ENOENT);
1170 } else {
1171 *zpp = zp;
1172 }
1173 zfs_znode_hold_exit(zfsvfs, zh);
1174 return (err);
1175 }
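/*
 * Editorial sketch (not upstream code): zfs_zget() returns a znode whose
 * embedded inode holds a reference; the caller drops it with iput() when
 * done.
 */
#if 0	/* example only */
	znode_t *zp;

	if (zfs_zget(zfsvfs, obj_num, &zp) == 0) {
		/* ... use zp ... */
		iput(ZTOI(zp));
	}
#endif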
1176
1177 int
1178 zfs_rezget(znode_t *zp)
1179 {
1180 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1181 dmu_object_info_t doi;
1182 dmu_buf_t *db;
1183 uint64_t obj_num = zp->z_id;
1184 uint64_t mode;
1185 uint64_t links;
1186 sa_bulk_attr_t bulk[10];
1187 int err;
1188 int count = 0;
1189 uint64_t gen;
1190 uint64_t z_uid, z_gid;
1191 uint64_t atime[2], mtime[2], ctime[2];
1192 uint64_t projid = ZFS_DEFAULT_PROJID;
1193 znode_hold_t *zh;
1194
1195 /*
1196 * Skip ctldir znodes; otherwise they will always get invalidated. This
1197 * would cause odd behaviour for mounted snapdirs. In particular, on
1198 * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent
1199 * it from being automounted again as long as someone is still using
1200 * the detached mount.
1201 */
1202 if (zp->z_is_ctldir)
1203 return (0);
1204
1205 zh = zfs_znode_hold_enter(zfsvfs, obj_num);
1206
1207 mutex_enter(&zp->z_acl_lock);
1208 if (zp->z_acl_cached) {
1209 zfs_acl_free(zp->z_acl_cached);
1210 zp->z_acl_cached = NULL;
1211 }
1212 mutex_exit(&zp->z_acl_lock);
1213
1214 rw_enter(&zp->z_xattr_lock, RW_WRITER);
1215 if (zp->z_xattr_cached) {
1216 nvlist_free(zp->z_xattr_cached);
1217 zp->z_xattr_cached = NULL;
1218 }
1219 rw_exit(&zp->z_xattr_lock);
1220
1221 ASSERT(zp->z_sa_hdl == NULL);
1222 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1223 if (err) {
1224 zfs_znode_hold_exit(zfsvfs, zh);
1225 return (err);
1226 }
1227
1228 dmu_object_info_from_db(db, &doi);
1229 if (doi.doi_bonus_type != DMU_OT_SA &&
1230 (doi.doi_bonus_type != DMU_OT_ZNODE ||
1231 (doi.doi_bonus_type == DMU_OT_ZNODE &&
1232 doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1233 sa_buf_rele(db, NULL);
1234 zfs_znode_hold_exit(zfsvfs, zh);
1235 return (SET_ERROR(EINVAL));
1236 }
1237
1238 zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
1239
1240 /* reload cached values */
1241 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
1242 &gen, sizeof (gen));
1243 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1244 &zp->z_size, sizeof (zp->z_size));
1245 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
1246 &links, sizeof (links));
1247 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1248 &zp->z_pflags, sizeof (zp->z_pflags));
1249 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1250 &z_uid, sizeof (z_uid));
1251 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1252 &z_gid, sizeof (z_gid));
1253 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
1254 &mode, sizeof (mode));
1255 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
1256 &atime, 16);
1257 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
1258 &mtime, 16);
1259 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
1260 &ctime, 16);
1261
1262 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
1263 zfs_znode_dmu_fini(zp);
1264 zfs_znode_hold_exit(zfsvfs, zh);
1265 return (SET_ERROR(EIO));
1266 }
1267
1268 if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
1269 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
1270 &projid, 8);
1271 if (err != 0 && err != ENOENT) {
1272 zfs_znode_dmu_fini(zp);
1273 zfs_znode_hold_exit(zfsvfs, zh);
1274 return (SET_ERROR(err));
1275 }
1276 }
1277
1278 zp->z_projid = projid;
1279 zp->z_mode = ZTOI(zp)->i_mode = mode;
1280 zfs_uid_write(ZTOI(zp), z_uid);
1281 zfs_gid_write(ZTOI(zp), z_gid);
1282
1283 ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
1284 ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
1285 ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);
1286
1287 if (gen != ZTOI(zp)->i_generation) {
1288 zfs_znode_dmu_fini(zp);
1289 zfs_znode_hold_exit(zfsvfs, zh);
1290 return (SET_ERROR(EIO));
1291 }
1292
1293 set_nlink(ZTOI(zp), (uint32_t)links);
1294 zfs_set_inode_flags(zp, ZTOI(zp));
1295
1296 zp->z_blksz = doi.doi_data_block_size;
1297 zp->z_atime_dirty = 0;
1298 zfs_inode_update(zp);
1299
1300 /*
1301 * If the file has zero links, then it has been unlinked on the send
1302 * side and it must be in the received unlinked set.
1303 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
1304 * stale data and to prevent automatic removal of the file in
1305 * zfs_zinactive(). The file will be removed either when it is removed
1306 * on the send side and the next incremental stream is received or
1307 * when the unlinked set gets processed.
1308 */
1309 zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
1310 if (zp->z_unlinked)
1311 zfs_znode_dmu_fini(zp);
1312
1313 zfs_znode_hold_exit(zfsvfs, zh);
1314
1315 return (0);
1316 }
1317
1318 void
1319 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1320 {
1321 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1322 objset_t *os = zfsvfs->z_os;
1323 uint64_t obj = zp->z_id;
1324 uint64_t acl_obj = zfs_external_acl(zp);
1325 znode_hold_t *zh;
1326
1327 zh = zfs_znode_hold_enter(zfsvfs, obj);
1328 if (acl_obj) {
1329 VERIFY(!zp->z_is_sa);
1330 VERIFY(0 == dmu_object_free(os, acl_obj, tx));
1331 }
1332 VERIFY(0 == dmu_object_free(os, obj, tx));
1333 zfs_znode_dmu_fini(zp);
1334 zfs_znode_hold_exit(zfsvfs, zh);
1335 }
1336
1337 void
1338 zfs_zinactive(znode_t *zp)
1339 {
1340 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1341 uint64_t z_id = zp->z_id;
1342 znode_hold_t *zh;
1343
1344 ASSERT(zp->z_sa_hdl);
1345
1346 /*
1347 * Don't allow a zfs_zget() while we're trying to release this znode.
1348 */
1349 zh = zfs_znode_hold_enter(zfsvfs, z_id);
1350
1351 mutex_enter(&zp->z_lock);
1352
1353 /*
1354 * If this was the last reference to a file with no links, remove
1355 * the file from the file system unless the file system is mounted
1356 * read-only. That can happen, for example, if the file system was
1357 * originally read-write, the file was opened, then unlinked and
1358 * the file system was made read-only before the file was finally
1359 * closed. The file will remain in the unlinked set.
1360 */
1361 if (zp->z_unlinked) {
1362 ASSERT(!zfsvfs->z_issnap);
1363 if (!zfs_is_readonly(zfsvfs)) {
1364 mutex_exit(&zp->z_lock);
1365 zfs_znode_hold_exit(zfsvfs, zh);
1366 zfs_rmnode(zp);
1367 return;
1368 }
1369 }
1370
1371 mutex_exit(&zp->z_lock);
1372 zfs_znode_dmu_fini(zp);
1373
1374 zfs_znode_hold_exit(zfsvfs, zh);
1375 }
1376
1377 static inline int
1378 zfs_compare_timespec(struct timespec *t1, struct timespec *t2)
1379 {
1380 if (t1->tv_sec < t2->tv_sec)
1381 return (-1);
1382
1383 if (t1->tv_sec > t2->tv_sec)
1384 return (1);
1385
1386 return (t1->tv_nsec - t2->tv_nsec);
1387 }
1388
1389 /*
1390 * Prepare to update znode time stamps.
1391 *
1392 * IN: zp - znode requiring timestamp update
1393 * flag - ATTR_MTIME, ATTR_CTIME flags
1394 *
1395 * OUT: zp - z_seq
1396 * mtime - new mtime
1397 * ctime - new ctime
1398 *
1399 * Note: We don't update atime here, because we rely on the Linux VFS
1400 * to handle atime updates.
1401 */
1402 void
1403 zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
1404 uint64_t ctime[2])
1405 {
1406 inode_timespec_t now;
1407
1408 gethrestime(&now);
1409
1410 zp->z_seq++;
1411
1412 if (flag & ATTR_MTIME) {
1413 ZFS_TIME_ENCODE(&now, mtime);
1414 ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
1415 if (ZTOZSB(zp)->z_use_fuids) {
1416 zp->z_pflags |= (ZFS_ARCHIVE |
1417 ZFS_AV_MODIFIED);
1418 }
1419 }
1420
1421 if (flag & ATTR_CTIME) {
1422 ZFS_TIME_ENCODE(&now, ctime);
1423 ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime);
1424 if (ZTOZSB(zp)->z_use_fuids)
1425 zp->z_pflags |= ZFS_ARCHIVE;
1426 }
1427 }
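/*
 * Editorial sketch (not upstream code): callers pair this function with
 * an SA bulk update inside an assigned transaction, as zfs_freesp() does
 * later in this file; zp, zfsvfs and tx are assumed to be in scope.
 */
#if 0	/* example only */
	uint64_t mtime[2], ctime[2];
	sa_bulk_attr_t bulk[2];
	int count = 0;

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
	(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
#endif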
1428
1429 /*
1430 * Grow the block size for a file.
1431 *
1432 * IN: zp - znode of file whose block size is to grow.
1433 * size - requested block size
1434 * tx - open transaction.
1435 *
1436 * NOTE: this function assumes that the znode is write locked.
1437 */
1438 void
1439 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1440 {
1441 int error;
1442 u_longlong_t dummy;
1443
1444 if (size <= zp->z_blksz)
1445 return;
1446 /*
1447 * If the file size is already greater than the current blocksize,
1448 * we will not grow. If there is more than one block in a file,
1449 * the blocksize cannot change.
1450 */
1451 if (zp->z_blksz && zp->z_size > zp->z_blksz)
1452 return;
1453
1454 error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
1455 size, 0, tx);
1456
1457 if (error == ENOTSUP)
1458 return;
1459 ASSERT0(error);
1460
1461 /* What blocksize did we actually get? */
1462 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
1463 }
1464
1465 /*
1466 * Increase the file length
1467 *
1468 * IN: zp - znode of file to extend.
1469 * end - new end-of-file
1470 *
1471 * RETURN: 0 on success, error code on failure
1472 */
1473 static int
1474 zfs_extend(znode_t *zp, uint64_t end)
1475 {
1476 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1477 dmu_tx_t *tx;
1478 rl_t *rl;
1479 uint64_t newblksz;
1480 int error;
1481
1482 /*
1483 * We will change zp_size, lock the whole file.
1484 */
1485 rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
1486
1487 /*
1488 * Nothing to do if file already at desired length.
1489 */
1490 if (end <= zp->z_size) {
1491 zfs_range_unlock(rl);
1492 return (0);
1493 }
1494 tx = dmu_tx_create(zfsvfs->z_os);
1495 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1496 zfs_sa_upgrade_txholds(tx, zp);
1497 if (end > zp->z_blksz &&
1498 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1499 /*
1500 * We are growing the file past the current block size.
1501 */
1502 if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
1503 /*
1504 * File's blocksize is already larger than the
1505 * "recordsize" property. Only let it grow to
1506 * the next power of 2.
1507 */
1508 ASSERT(!ISP2(zp->z_blksz));
1509 newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
1510 } else {
1511 newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
1512 }
1513 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1514 } else {
1515 newblksz = 0;
1516 }
1517
1518 error = dmu_tx_assign(tx, TXG_WAIT);
1519 if (error) {
1520 dmu_tx_abort(tx);
1521 zfs_range_unlock(rl);
1522 return (error);
1523 }
1524
1525 if (newblksz)
1526 zfs_grow_blocksize(zp, newblksz, tx);
1527
1528 zp->z_size = end;
1529
1530 VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
1531 &zp->z_size, sizeof (zp->z_size), tx));
1532
1533 zfs_range_unlock(rl);
1534
1535 dmu_tx_commit(tx);
1536
1537 return (0);
1538 }
1539
1540 /*
1541 * zfs_zero_partial_page - Modeled after update_pages() but
1542 * with different arguments and semantics for use by zfs_freesp().
1543 *
1544 * Zeroes a piece of a single page cache entry for zp at offset
1545 * start and length len.
1546 *
1547 * Caller must acquire a range lock on the file for the region
1548 * being zeroed in order that the ARC and page cache stay in sync.
1549 */
1550 static void
1551 zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
1552 {
1553 struct address_space *mp = ZTOI(zp)->i_mapping;
1554 struct page *pp;
1555 int64_t off;
1556 void *pb;
1557
1558 ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));
1559
1560 off = start & (PAGE_SIZE - 1);
1561 start &= PAGE_MASK;
1562
1563 pp = find_lock_page(mp, start >> PAGE_SHIFT);
1564 if (pp) {
1565 if (mapping_writably_mapped(mp))
1566 flush_dcache_page(pp);
1567
1568 pb = kmap(pp);
1569 bzero(pb + off, len);
1570 kunmap(pp);
1571
1572 if (mapping_writably_mapped(mp))
1573 flush_dcache_page(pp);
1574
1575 mark_page_accessed(pp);
1576 SetPageUptodate(pp);
1577 ClearPageError(pp);
1578 unlock_page(pp);
1579 put_page(pp);
1580 }
1581 }
1582
1583 /*
1584 * Free space in a file.
1585 *
1586 * IN: zp - znode of file to free data in.
1587 * off - start of section to free.
1588 * len - length of section to free.
1589 *
1590 * RETURN: 0 on success, error code on failure
1591 */
1592 static int
1593 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1594 {
1595 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1596 rl_t *rl;
1597 int error;
1598
1599 /*
1600 * Lock the range being freed.
1601 */
1602 rl = zfs_range_lock(&zp->z_range_lock, off, len, RL_WRITER);
1603
1604 /*
1605 * Nothing to do if file already at desired length.
1606 */
1607 if (off >= zp->z_size) {
1608 zfs_range_unlock(rl);
1609 return (0);
1610 }
1611
1612 if (off + len > zp->z_size)
1613 len = zp->z_size - off;
1614
1615 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1616
1617 /*
1618 * Zero partial page cache entries. This must be done under a
1619 * range lock in order to keep the ARC and page cache in sync.
1620 */
1621 if (zp->z_is_mapped) {
1622 loff_t first_page, last_page, page_len;
1623 loff_t first_page_offset, last_page_offset;
1624
1625 /* first possible full page in hole */
1626 first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
1627 /* last page of hole */
1628 last_page = (off + len) >> PAGE_SHIFT;
1629
1630 /* offset of first_page */
1631 first_page_offset = first_page << PAGE_SHIFT;
1632 /* offset of last_page */
1633 last_page_offset = last_page << PAGE_SHIFT;
1634
1635 /* truncate whole pages */
1636 if (last_page_offset > first_page_offset) {
1637 truncate_inode_pages_range(ZTOI(zp)->i_mapping,
1638 first_page_offset, last_page_offset - 1);
1639 }
1640
1641 /* truncate sub-page ranges */
1642 if (first_page > last_page) {
1643 /* entire punched area within a single page */
1644 zfs_zero_partial_page(zp, off, len);
1645 } else {
1646 /* beginning of punched area at the end of a page */
1647 page_len = first_page_offset - off;
1648 if (page_len > 0)
1649 zfs_zero_partial_page(zp, off, page_len);
1650
1651 /* end of punched area at the beginning of a page */
1652 page_len = off + len - last_page_offset;
1653 if (page_len > 0)
1654 zfs_zero_partial_page(zp, last_page_offset,
1655 page_len);
1656 }
1657 }
1658 zfs_range_unlock(rl);
1659
1660 return (error);
1661 }
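/*
 * Editorial note (not upstream code): a worked example of the partial
 * page arithmetic above, assuming 4 KiB pages (PAGE_SHIFT == 12). For
 * off = 1000 and len = 10000:
 *
 *	first_page        = (1000 + 4095) >> 12 = 1
 *	last_page         = (1000 + 10000) >> 12 = 2
 *	first_page_offset = 1 << 12 = 4096
 *	last_page_offset  = 2 << 12 = 8192
 *
 * The whole pages in [4096, 8191] are truncated, then the head
 * [1000, 4095] and the tail [8192, 10999] are zeroed via
 * zfs_zero_partial_page().
 */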
1662
1663 /*
1664 * Truncate a file
1665 *
1666 * IN: zp - znode of file to free data in.
1667 * end - new end-of-file.
1668 *
1669 * RETURN: 0 on success, error code on failure
1670 */
1671 static int
1672 zfs_trunc(znode_t *zp, uint64_t end)
1673 {
1674 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1675 dmu_tx_t *tx;
1676 rl_t *rl;
1677 int error;
1678 sa_bulk_attr_t bulk[2];
1679 int count = 0;
1680
1681 /*
1682 * We will change zp_size, lock the whole file.
1683 */
1684 rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
1685
1686 /*
1687 * Nothing to do if file already at desired length.
1688 */
1689 if (end >= zp->z_size) {
1690 zfs_range_unlock(rl);
1691 return (0);
1692 }
1693
1694 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
1695 DMU_OBJECT_END);
1696 if (error) {
1697 zfs_range_unlock(rl);
1698 return (error);
1699 }
1700 tx = dmu_tx_create(zfsvfs->z_os);
1701 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1702 zfs_sa_upgrade_txholds(tx, zp);
1703 dmu_tx_mark_netfree(tx);
1704 error = dmu_tx_assign(tx, TXG_WAIT);
1705 if (error) {
1706 dmu_tx_abort(tx);
1707 zfs_range_unlock(rl);
1708 return (error);
1709 }
1710
1711 zp->z_size = end;
1712 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
1713 NULL, &zp->z_size, sizeof (zp->z_size));
1714
1715 if (end == 0) {
1716 zp->z_pflags &= ~ZFS_SPARSE;
1717 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1718 NULL, &zp->z_pflags, 8);
1719 }
1720 VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
1721
1722 dmu_tx_commit(tx);
1723
1724 zfs_range_unlock(rl);
1725
1726 return (0);
1727 }
1728
1729 /*
1730 * Free space in a file
1731 *
1732 * IN: zp - znode of file to free data in.
1733 * off - start of range
1734 * len - length of range (0 => free from off to EOF)
1735 * flag - current file open mode flags.
1736 * log - TRUE if this action should be logged
1737 *
1738 * RETURN: 0 on success, error code on failure
1739 */
1740 int
1741 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1742 {
1743 dmu_tx_t *tx;
1744 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1745 zilog_t *zilog = zfsvfs->z_log;
1746 uint64_t mode;
1747 uint64_t mtime[2], ctime[2];
1748 sa_bulk_attr_t bulk[3];
1749 int count = 0;
1750 int error;
1751
1752 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
1753 sizeof (mode))) != 0)
1754 return (error);
1755
1756 if (off > zp->z_size) {
1757 error = zfs_extend(zp, off+len);
1758 if (error == 0 && log)
1759 goto log;
1760 goto out;
1761 }
1762
1763 if (len == 0) {
1764 error = zfs_trunc(zp, off);
1765 } else {
1766 if ((error = zfs_free_range(zp, off, len)) == 0 &&
1767 off + len > zp->z_size)
1768 error = zfs_extend(zp, off+len);
1769 }
1770 if (error || !log)
1771 goto out;
1772 log:
1773 tx = dmu_tx_create(zfsvfs->z_os);
1774 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1775 zfs_sa_upgrade_txholds(tx, zp);
1776 error = dmu_tx_assign(tx, TXG_WAIT);
1777 if (error) {
1778 dmu_tx_abort(tx);
1779 goto out;
1780 }
1781
1782 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
1783 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
1784 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1785 NULL, &zp->z_pflags, 8);
1786 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
1787 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1788 ASSERT(error == 0);
1789
1790 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1791
1792 dmu_tx_commit(tx);
1793
1794 zfs_inode_update(zp);
1795 error = 0;
1796
1797 out:
1798 /*
1799 * Truncate the page cache - for file truncate operations, use
1800 * the purpose-built API for truncations. For punching operations,
1801 * the truncation is handled under a range lock in zfs_free_range.
1802 */
1803 if (len == 0)
1804 truncate_setsize(ZTOI(zp), off);
1805 return (error);
1806 }
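/*
 * Editorial sketch (not upstream code): how the cases above map onto
 * callers. A truncate (or extend) to 'size' passes len == 0; a hole
 * punch passes the exact range to free. The open-mode flag is passed as
 * 0 here purely for illustration.
 */
#if 0	/* example only */
	error = zfs_freesp(zp, size, 0, 0, B_TRUE);	/* truncate/extend */
	error = zfs_freesp(zp, off, len, 0, B_TRUE);	/* punch a hole */
#endif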
1807
1808 void
1809 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1810 {
1811 struct super_block *sb;
1812 zfsvfs_t *zfsvfs;
1813 uint64_t moid, obj, sa_obj, version;
1814 uint64_t sense = ZFS_CASE_SENSITIVE;
1815 uint64_t norm = 0;
1816 nvpair_t *elem;
1817 int size;
1818 int error;
1819 int i;
1820 znode_t *rootzp = NULL;
1821 vattr_t vattr;
1822 znode_t *zp;
1823 zfs_acl_ids_t acl_ids;
1824
1825 /*
1826 * First, attempt to create the master node.
1827 */
1828 /*
1829 * In an empty objset, there are no blocks to read and thus
1830 * there can be no i/o errors (which we assert below).
1831 */
1832 moid = MASTER_NODE_OBJ;
1833 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1834 DMU_OT_NONE, 0, tx);
1835 ASSERT(error == 0);
1836
1837 /*
1838 * Set starting attributes.
1839 */
1840 version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
1841 elem = NULL;
1842 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1843 /* For the moment we expect all zpl props to be uint64_ts */
1844 uint64_t val;
1845 char *name;
1846
1847 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1848 VERIFY(nvpair_value_uint64(elem, &val) == 0);
1849 name = nvpair_name(elem);
1850 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
1851 if (val < version)
1852 version = val;
1853 } else {
1854 error = zap_update(os, moid, name, 8, 1, &val, tx);
1855 }
1856 ASSERT(error == 0);
1857 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1858 norm = val;
1859 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1860 sense = val;
1861 }
1862 ASSERT(version != 0);
1863 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
1864
1865 /*
1866 * Create zap object used for SA attribute registration
1867 */
1868
1869 if (version >= ZPL_VERSION_SA) {
1870 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
1871 DMU_OT_NONE, 0, tx);
1872 error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
1873 ASSERT(error == 0);
1874 } else {
1875 sa_obj = 0;
1876 }
1877 /*
1878 * Create a delete queue.
1879 */
1880 obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
1881
1882 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
1883 ASSERT(error == 0);
1884
1885 /*
1886 * Create root znode. Create minimal znode/inode/zfsvfs/sb
1887 * to allow zfs_mknode to work.
1888 */
1889 vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
1890 vattr.va_mode = S_IFDIR|0755;
1891 vattr.va_uid = crgetuid(cr);
1892 vattr.va_gid = crgetgid(cr);
1893
1894 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1895 rootzp->z_moved = 0;
1896 rootzp->z_unlinked = 0;
1897 rootzp->z_atime_dirty = 0;
1898 rootzp->z_is_sa = USE_SA(version, os);
1899 rootzp->z_pflags = 0;
1900
1901 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1902 zfsvfs->z_os = os;
1903 zfsvfs->z_parent = zfsvfs;
1904 zfsvfs->z_version = version;
1905 zfsvfs->z_use_fuids = USE_FUIDS(version, os);
1906 zfsvfs->z_use_sa = USE_SA(version, os);
1907 zfsvfs->z_norm = norm;
1908
1909 sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
1910 sb->s_fs_info = zfsvfs;
1911
1912 ZTOI(rootzp)->i_sb = sb;
1913
1914 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
1915 &zfsvfs->z_attr_table);
1916
1917 ASSERT(error == 0);
1918
1919 /*
1920 * Fold case on file systems that are always or sometimes case
1921 * insensitive.
1922 */
1923 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
1924 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
1925
1926 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1927 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1928 offsetof(znode_t, z_link_node));
1929
1930 size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
1931 zfsvfs->z_hold_size = size;
1932 zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
1933 KM_SLEEP);
1934 zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
1935 for (i = 0; i != size; i++) {
1936 avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
1937 sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
1938 mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
1939 }
1940
1941 VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
1942 cr, NULL, &acl_ids));
1943 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
1944 ASSERT3P(zp, ==, rootzp);
1945 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1946 ASSERT(error == 0);
1947 zfs_acl_ids_free(&acl_ids);
1948
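	/*
	 * The temporary root znode was only needed so zfs_mknode() could
	 * run; drop its inode reference and free it.  A fully wired root
	 * znode is instantiated when the file system is mounted.
	 */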
1949 atomic_set(&ZTOI(rootzp)->i_count, 0);
1950 sa_handle_destroy(rootzp->z_sa_hdl);
1951 kmem_cache_free(znode_cache, rootzp);
1952
1953 /*
1954 * Create the shares directory.
1955 */
1956 error = zfs_create_share_dir(zfsvfs, tx);
1957 ASSERT(error == 0);
1958
1959 for (i = 0; i != size; i++) {
1960 avl_destroy(&zfsvfs->z_hold_trees[i]);
1961 mutex_destroy(&zfsvfs->z_hold_locks[i]);
1962 }
1963
1964 mutex_destroy(&zfsvfs->z_znodes_lock);
1965
1966 vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
1967 vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
1968 kmem_free(sb, sizeof (struct super_block));
1969 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1970 }
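/*
 * Sketch of how zfs_create_fs() is reached (illustrative; the real
 * caller lives in zfs_ioctl.c): it runs as the creation callback that
 * the ioctl path hands to dmu_objset_create():
 *
 *	static void
 *	create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 *	{
 *		zfs_create_fs(os, cr, arg, tx);
 *	}
 */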
1971 #endif /* _KERNEL */
1972
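/*
 * Look up the SA master node of an unmounted objset and build its
 * attribute table.  ENOENT is expected on pre-SA (version < 5) file
 * systems, where attributes still live in the znode_phys_t bonus
 * buffer; sa_obj then remains 0 and sa_setup() uses the legacy layout.
 */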
1973 static int
1974 zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
1975 {
1976 uint64_t sa_obj = 0;
1977 int error;
1978
1979 error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
1980 if (error != 0 && error != ENOENT)
1981 return (error);
1982
1983 error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
1984 return (error);
1985 }
1986
1987 static int
1988 zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
1989 dmu_buf_t **db, void *tag)
1990 {
1991 dmu_object_info_t doi;
1992 int error;
1993
1994 if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
1995 return (error);
1996
1997 dmu_object_info_from_db(*db, &doi);
1998 if ((doi.doi_bonus_type != DMU_OT_SA &&
1999 doi.doi_bonus_type != DMU_OT_ZNODE) ||
2000 (doi.doi_bonus_type == DMU_OT_ZNODE &&
2001 doi.doi_bonus_size < sizeof (znode_phys_t))) {
2002 sa_buf_rele(*db, tag);
2003 return (SET_ERROR(ENOTSUP));
2004 }
2005
2006 error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
2007 if (error != 0) {
2008 sa_buf_rele(*db, tag);
2009 return (error);
2010 }
2011
2012 return (0);
2013 }
2014
2015 void
2016 zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
2017 {
2018 sa_handle_destroy(hdl);
2019 sa_buf_rele(db, tag);
2020 }
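/*
 * Typical pairing of the two helpers above (illustrative; attr and val
 * are hypothetical):
 *
 *	sa_handle_t *hdl;
 *	dmu_buf_t *db;
 *
 *	if ((error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG)) != 0)
 *		return (error);
 *	error = sa_lookup(hdl, attr, &val, sizeof (val));
 *	zfs_release_sa_handle(hdl, db, FTAG);
 */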
2021
2022 /*
2023 * Given an object number, return its parent object number and whether
2024 * or not the object is an extended attribute directory.
2025 */
2026 static int
2027 zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
2028 uint64_t *pobjp, int *is_xattrdir)
2029 {
2030 uint64_t parent;
2031 uint64_t pflags;
2032 uint64_t mode;
2033 uint64_t parent_mode;
2034 sa_bulk_attr_t bulk[3];
2035 sa_handle_t *sa_hdl;
2036 dmu_buf_t *sa_db;
2037 int count = 0;
2038 int error;
2039
2040 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
2041 &parent, sizeof (parent));
2042 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
2043 &pflags, sizeof (pflags));
2044 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2045 &mode, sizeof (mode));
2046
2047 if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
2048 return (error);
2049
2050 /*
2051 * When a link is removed, its parent pointer is not changed and may
2052 * therefore be invalid. There are two cases where a link is removed
2053 * but the file stays around: when it goes to the delete queue and
2054 * when there are additional links.
2055 */
2056 error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
2057 if (error != 0)
2058 return (error);
2059
2060 error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
2061 zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2062 if (error != 0)
2063 return (error);
2064
2065 *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
2066
2067 /*
2068 * An extended attribute directory can hang off a file, directory, or
2069 * any other object type; otherwise the parent must be a directory.
2070 */
2071 if (!*is_xattrdir && !S_ISDIR(parent_mode))
2072 return (SET_ERROR(EINVAL));
2073
2074 *pobjp = parent;
2075
2076 return (0);
2077 }
2078
2079 /*
2080 * Given an object number, return some ZPL-level statistics.
2081 */
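/*
 * All four attributes are fetched with a single sa_bulk_lookup() call
 * rather than four separate sa_lookup() calls, so the object's SA
 * layout only has to be decoded once.
 */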
2082 static int
2083 zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
2084 zfs_stat_t *sb)
2085 {
2086 sa_bulk_attr_t bulk[4];
2087 int count = 0;
2088
2089 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2090 &sb->zs_mode, sizeof (sb->zs_mode));
2091 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
2092 &sb->zs_gen, sizeof (sb->zs_gen));
2093 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
2094 &sb->zs_links, sizeof (sb->zs_links));
2095 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
2096 &sb->zs_ctime, sizeof (sb->zs_ctime));
2097
2098 return (sa_bulk_lookup(hdl, bulk, count));
2099 }
2100
2101 static int
2102 zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
2103 sa_attr_type_t *sa_table, char *buf, int len)
2104 {
2105 sa_handle_t *sa_hdl;
2106 sa_handle_t *prevhdl = NULL;
2107 dmu_buf_t *prevdb = NULL;
2108 dmu_buf_t *sa_db = NULL;
2109 char *path = buf + len - 1;
2110 int error;
2111
2112 *path = '\0';
2113 sa_hdl = hdl;
2114
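	/*
	 * An object still on the delete queue (unlinked set) has no valid
	 * path, so report it as stale instead of walking a dead parent
	 * chain.
	 */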
2115 uint64_t deleteq_obj;
2116 VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
2117 ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
2118 error = zap_lookup_int(osp, deleteq_obj, obj);
2119 if (error == 0) {
2120 return (SET_ERROR(ESTALE));
2121 } else if (error != ENOENT) {
2122 return (error);
2123 }
2124 error = 0;
2125
2126 for (;;) {
2127 uint64_t pobj = 0;
2128 char component[MAXNAMELEN + 2];
2129 size_t complen;
2130 int is_xattrdir = 0;
2131
2132 if (prevdb)
2133 zfs_release_sa_handle(prevhdl, prevdb, FTAG);
2134
2135 if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
2136 &is_xattrdir)) != 0)
2137 break;
2138
2139 if (pobj == obj) {
2140 if (path[0] != '/')
2141 *--path = '/';
2142 break;
2143 }
2144
2145 component[0] = '/';
2146 if (is_xattrdir) {
2147 (void) sprintf(component + 1, "<xattrdir>");
2148 } else {
2149 error = zap_value_search(osp, pobj, obj,
2150 ZFS_DIRENT_OBJ(-1ULL), component + 1);
2151 if (error != 0)
2152 break;
2153 }
2154
2155 complen = strlen(component);
2156 path -= complen;
2157 ASSERT(path >= buf);
2158 bcopy(component, path, complen);
2159 obj = pobj;
2160
2161 if (sa_hdl != hdl) {
2162 prevhdl = sa_hdl;
2163 prevdb = sa_db;
2164 }
2165 error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
2166 if (error != 0) {
2167 sa_hdl = prevhdl;
2168 sa_db = prevdb;
2169 break;
2170 }
2171 }
2172
2173 if (sa_hdl != NULL && sa_hdl != hdl) {
2174 ASSERT(sa_db != NULL);
2175 zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2176 }
2177
2178 if (error == 0)
2179 (void) memmove(buf, path, buf + len - path);
2180
2181 return (error);
2182 }
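/*
 * The buffer is filled from the tail (names hypothetical): for an
 * object "c" living under /a/b, the loop above prepends "/c", then
 * "/b", then "/a", and the final memmove() slides the finished
 * "/a/b/c" to the front of buf.
 */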
2183
2184 int
2185 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
2186 {
2187 sa_attr_type_t *sa_table;
2188 sa_handle_t *hdl;
2189 dmu_buf_t *db;
2190 int error;
2191
2192 error = zfs_sa_setup(osp, &sa_table);
2193 if (error != 0)
2194 return (error);
2195
2196 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2197 if (error != 0)
2198 return (error);
2199
2200 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2201
2202 zfs_release_sa_handle(hdl, db, FTAG);
2203 return (error);
2204 }
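/*
 * Illustrative call: resolve an object number to a name relative to
 * the dataset root:
 *
 *	char path[MAXPATHLEN];
 *
 *	error = zfs_obj_to_path(os, obj, path, sizeof (path));
 *
 * On success, path holds a string such as "/dir/file".
 */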
2205
2206 int
2207 zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
2208 char *buf, int len)
2209 {
2210 char *path = buf + len - 1;
2211 sa_attr_type_t *sa_table;
2212 sa_handle_t *hdl;
2213 dmu_buf_t *db;
2214 int error;
2215
2216 *path = '\0';
2217
2218 error = zfs_sa_setup(osp, &sa_table);
2219 if (error != 0)
2220 return (error);
2221
2222 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2223 if (error != 0)
2224 return (error);
2225
2226 error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
2227 if (error != 0) {
2228 zfs_release_sa_handle(hdl, db, FTAG);
2229 return (error);
2230 }
2231
2232 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2233
2234 zfs_release_sa_handle(hdl, db, FTAG);
2235 return (error);
2236 }
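/*
 * Both entry points above are consumed by the object-to-name ioctls
 * (e.g. those backing "zfs diff"), which pass in a caller-sized buffer
 * and propagate any error returned here.
 */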
2237
2238 #if defined(_KERNEL)
2239 EXPORT_SYMBOL(zfs_create_fs);
2240 EXPORT_SYMBOL(zfs_obj_to_path);
2241
2242 /* CSTYLED */
2243 module_param(zfs_object_mutex_size, uint, 0644);
2244 MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
2245 #endif