/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

/* Portions Copyright 2007 Jeremy Teo */

#ifdef _KERNEL
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/mntent.h>
#include <sys/u8_textprep.h>
#include <sys/dsl_dataset.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/atomic.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/dnode.h>
#include <sys/fs/zfs.h>
#include <sys/zpl.h>
#endif /* _KERNEL */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/zfs_refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/zfs_znode.h>
#include <sys/sa.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_stat.h>

#include "zfs_prop.h"
#include "zfs_comutil.h"

/*
 * Functions needed for userland (i.e. libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
#ifdef _KERNEL

static kmem_cache_t *znode_cache = NULL;
static kmem_cache_t *znode_hold_cache = NULL;
unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;

/*
 * This is used by the test suite so that it can delay znodes from being
 * freed in order to inspect the unlinked set.
 */
static int zfs_unlink_suspend_progress = 0;

/*
 * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
 * z_rangelock. It will modify the offset and length of the lock to reflect
 * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
 * called with the rangelock_t's rl_lock held, which avoids races.
 */
static void
zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
{
	znode_t *zp = arg;

	/*
	 * If in append mode, convert to writer and lock starting at the
	 * current end of file.
	 */
	if (new->lr_type == RL_APPEND) {
		new->lr_offset = zp->z_size;
		new->lr_type = RL_WRITER;
	}

	/*
	 * If we need to grow the block size then lock the whole file range.
	 */
	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
	if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
	    zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
		new->lr_offset = 0;
		new->lr_length = UINT64_MAX;
	}
}
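
/*
 * Illustrative sketch (an assumption about typical usage, not code from
 * this file): a caller appending to the file takes an RL_APPEND lock and
 * the callback above converts it to an RL_WRITER lock anchored at the
 * current end of file:
 *
 *	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
 *	    0, nbytes, RL_APPEND);
 *	// lr->lr_type is now RL_WRITER and lr->lr_offset was set to the
 *	// file size at the time the lock was granted.
 *	zfs_rangelock_exit(lr);
 */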

static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	(void) arg, (void) kmflags;
	znode_t *zp = buf;

	inode_init_once(ZTOI(zp));
	list_link_init(&zp->z_link_node);

	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);

	zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);

	zp->z_dirlocks = NULL;
	zp->z_acl_cached = NULL;
	zp->z_xattr_cached = NULL;
	zp->z_xattr_parent = 0;
	zp->z_sync_writes_cnt = 0;
	zp->z_async_writes_cnt = 0;

	return (0);
}

static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
	(void) arg;
	znode_t *zp = buf;

	ASSERT(!list_link_active(&zp->z_link_node));
	mutex_destroy(&zp->z_lock);
	rw_destroy(&zp->z_parent_lock);
	rw_destroy(&zp->z_name_lock);
	mutex_destroy(&zp->z_acl_lock);
	rw_destroy(&zp->z_xattr_lock);
	zfs_rangelock_fini(&zp->z_rangelock);

	ASSERT3P(zp->z_dirlocks, ==, NULL);
	ASSERT3P(zp->z_acl_cached, ==, NULL);
	ASSERT3P(zp->z_xattr_cached, ==, NULL);

	ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
	ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
}

static int
zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
{
	(void) arg, (void) kmflags;
	znode_hold_t *zh = buf;

	mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
	zh->zh_refcount = 0;

	return (0);
}

static void
zfs_znode_hold_cache_destructor(void *buf, void *arg)
{
	(void) arg;
	znode_hold_t *zh = buf;

	mutex_destroy(&zh->zh_lock);
}

void
zfs_znode_init(void)
{
	/*
	 * Initialize the znode cache. The KMC_SLAB hint is used so that
	 * the cache is backed by kmalloc() on the Linux slab, which is
	 * required for wait_on_bit() operations on the embedded inode to
	 * work properly.
	 */
	ASSERT(znode_cache == NULL);
	znode_cache = kmem_cache_create("zfs_znode_cache",
	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
	    zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);

	ASSERT(znode_hold_cache == NULL);
	znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
	    sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
	    zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
}

void
zfs_znode_fini(void)
{
	/*
	 * Cleanup the znode caches.
	 */
	if (znode_cache)
		kmem_cache_destroy(znode_cache);
	znode_cache = NULL;

	if (znode_hold_cache)
		kmem_cache_destroy(znode_hold_cache);
	znode_hold_cache = NULL;
}

/*
 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
 * serialize access to a znode and its SA buffer while the object is being
 * created or destroyed. This kind of locking would normally reside in the
 * znode itself but in this case that's impossible because the znode and SA
 * buffer may not yet exist. Therefore the locking is handled externally
 * with an array of mutexes and AVL trees which contain per-object locks.
 *
 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
 * into the correct AVL tree and finally the per-object lock is held. In
 * zfs_znode_hold_exit() the process is reversed. The per-object lock is
 * released, removed from the AVL tree and destroyed if there are no waiters.
 *
 * This scheme has two important properties:
 *
 * 1) No memory allocations are performed while holding one of the
 *    z_hold_locks. This ensures evict(), which can be called from direct
 *    memory reclaim, will never block waiting on a z_hold_locks entry
 *    which just happens to have hashed to the same index.
 *
 * 2) All locks used to serialize access to an object are per-object and
 *    never shared. This minimizes lock contention without creating a large
 *    number of dedicated locks.
 *
 * On the downside it does require znode_hold_t structures to be frequently
 * allocated and freed. However, because these are backed by a kmem cache
 * and very short lived this cost is minimal.
 */
int
zfs_znode_hold_compare(const void *a, const void *b)
{
	const znode_hold_t *zh_a = (const znode_hold_t *)a;
	const znode_hold_t *zh_b = (const znode_hold_t *)b;

	return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
}

static boolean_t __maybe_unused
zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_hold_t *zh, search;
	int i = ZFS_OBJ_HASH(zfsvfs, obj);
	boolean_t held;

	search.zh_obj = obj;

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
	held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	return (held);
}

znode_hold_t *
zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_hold_t *zh, *zh_new, search;
	int i = ZFS_OBJ_HASH(zfsvfs, obj);
	boolean_t found = B_FALSE;

	zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
	search.zh_obj = obj;

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
	if (likely(zh == NULL)) {
		zh = zh_new;
		zh->zh_obj = obj;
		avl_add(&zfsvfs->z_hold_trees[i], zh);
	} else {
		ASSERT3U(zh->zh_obj, ==, obj);
		found = B_TRUE;
	}
	zh->zh_refcount++;
	ASSERT3S(zh->zh_refcount, >, 0);
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	if (found == B_TRUE)
		kmem_cache_free(znode_hold_cache, zh_new);

	ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
	mutex_enter(&zh->zh_lock);

	return (zh);
}

void
zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
{
	int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
	boolean_t remove = B_FALSE;

	ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
	mutex_exit(&zh->zh_lock);

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	ASSERT3S(zh->zh_refcount, >, 0);
	if (--zh->zh_refcount == 0) {
		avl_remove(&zfsvfs->z_hold_trees[i], zh);
		remove = B_TRUE;
	}
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	if (remove == B_TRUE)
		kmem_cache_free(znode_hold_cache, zh);
}
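
/*
 * Illustrative usage sketch (mirrors the pattern used by zfs_zget() and
 * zfs_znode_delete() later in this file):
 *
 *	znode_hold_t *zh = zfs_znode_hold_enter(zfsvfs, obj);
 *	// ... create, destroy, or re-attach the znode and its SA handle ...
 *	zfs_znode_hold_exit(zfsvfs, zh);
 */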

dev_t
zfs_cmpldev(uint64_t dev)
{
	return (dev);
}

static void
zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
    dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
{
	ASSERT(zfs_znode_held(zfsvfs, zp->z_id));

	mutex_enter(&zp->z_lock);

	ASSERT(zp->z_sa_hdl == NULL);
	ASSERT(zp->z_acl_cached == NULL);
	if (sa_hdl == NULL) {
		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
		    SA_HDL_SHARED, &zp->z_sa_hdl));
	} else {
		zp->z_sa_hdl = sa_hdl;
		sa_set_userp(sa_hdl, zp);
	}

	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;

	mutex_exit(&zp->z_lock);
}

void
zfs_znode_dmu_fini(znode_t *zp)
{
	ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) ||
	    RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));

	sa_handle_destroy(zp->z_sa_hdl);
	zp->z_sa_hdl = NULL;
}

/*
 * Called by new_inode() to allocate a new inode.
 */
int
zfs_inode_alloc(struct super_block *sb, struct inode **ip)
{
	znode_t *zp;

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	*ip = ZTOI(zp);

	return (0);
}

/*
 * Called in multiple places when an inode should be destroyed.
 */
void
zfs_inode_destroy(struct inode *ip)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	mutex_enter(&zfsvfs->z_znodes_lock);
	if (list_link_active(&zp->z_link_node)) {
		list_remove(&zfsvfs->z_all_znodes, zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}

	if (zp->z_xattr_cached) {
		nvlist_free(zp->z_xattr_cached);
		zp->z_xattr_cached = NULL;
	}

	kmem_cache_free(znode_cache, zp);
}

static void
zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
{
	uint64_t rdev = 0;

	switch (ip->i_mode & S_IFMT) {
	case S_IFREG:
		ip->i_op = &zpl_inode_operations;
#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
		ip->i_fop = &zpl_file_operations.kabi_fops;
#else
		ip->i_fop = &zpl_file_operations;
#endif
		ip->i_mapping->a_ops = &zpl_address_space_operations;
		break;

	case S_IFDIR:
#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
		ip->i_flags |= S_IOPS_WRAPPER;
		ip->i_op = &zpl_dir_inode_operations.ops;
#else
		ip->i_op = &zpl_dir_inode_operations;
#endif
		ip->i_fop = &zpl_dir_file_operations;
		ITOZ(ip)->z_zn_prefetch = B_TRUE;
		break;

	case S_IFLNK:
		ip->i_op = &zpl_symlink_inode_operations;
		break;

	/*
	 * rdev is stored in a SA only for device files.
	 */
	case S_IFCHR:
	case S_IFBLK:
		(void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
		    sizeof (rdev));
		zfs_fallthrough;
	case S_IFIFO:
	case S_IFSOCK:
		init_special_inode(ip, ip->i_mode, rdev);
		ip->i_op = &zpl_special_inode_operations;
		break;

	default:
		zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
		    (u_longlong_t)ip->i_ino, ip->i_mode);

		/* Assume the inode is a file and attempt to continue */
		ip->i_mode = S_IFREG | 0644;
		ip->i_op = &zpl_inode_operations;
#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
		ip->i_fop = &zpl_file_operations.kabi_fops;
#else
		ip->i_fop = &zpl_file_operations;
#endif
		ip->i_mapping->a_ops = &zpl_address_space_operations;
		break;
	}
}

static void
zfs_set_inode_flags(znode_t *zp, struct inode *ip)
{
	/*
	 * Linux and Solaris have different sets of file attributes, so we
	 * restrict this conversion to the intersection of the two.
	 */
#ifdef HAVE_INODE_SET_FLAGS
	unsigned int flags = 0;
	if (zp->z_pflags & ZFS_IMMUTABLE)
		flags |= S_IMMUTABLE;
	if (zp->z_pflags & ZFS_APPENDONLY)
		flags |= S_APPEND;

	inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
#else
	if (zp->z_pflags & ZFS_IMMUTABLE)
		ip->i_flags |= S_IMMUTABLE;
	else
		ip->i_flags &= ~S_IMMUTABLE;

	if (zp->z_pflags & ZFS_APPENDONLY)
		ip->i_flags |= S_APPEND;
	else
		ip->i_flags &= ~S_APPEND;
#endif
}

/*
 * Update the embedded inode given the znode.
 */
void
zfs_znode_update_vfs(znode_t *zp)
{
	struct inode *ip;
	uint32_t blksize;
	u_longlong_t i_blocks;

	ASSERT(zp != NULL);
	ip = ZTOI(zp);

	/* Skip .zfs control nodes which do not exist on disk. */
	if (zfsctl_is_node(ip))
		return;

	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);

	spin_lock(&ip->i_lock);
	ip->i_mode = zp->z_mode;
	ip->i_blocks = i_blocks;
	i_size_write(ip, zp->z_size);
	spin_unlock(&ip->i_lock);
}

/*
 * Construct a znode+inode and initialize.
 *
 * This does not call dmu_set_user(); that is left to the caller,
 * in case the znode is not going to be returned.
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
    dmu_object_type_t obj_type, sa_handle_t *hdl)
{
	znode_t *zp;
	struct inode *ip;
	uint64_t mode;
	uint64_t parent;
	uint64_t tmp_gen;
	uint64_t links;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2], btime[2];
	inode_timespec_t tmp_ctime;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	sa_bulk_attr_t bulk[12];
	int count = 0;

	ASSERT(zfsvfs != NULL);

	ip = new_inode(zfsvfs->z_sb);
	if (ip == NULL)
		return (NULL);

	zp = ITOZ(ip);
	ASSERT(zp->z_dirlocks == NULL);
	ASSERT3P(zp->z_acl_cached, ==, NULL);
	ASSERT3P(zp->z_xattr_cached, ==, NULL);
	zp->z_unlinked = B_FALSE;
	zp->z_atime_dirty = B_FALSE;
#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE)
	zp->z_is_mapped = B_FALSE;
#endif
	zp->z_is_ctldir = B_FALSE;
	zp->z_suspended = B_FALSE;
	zp->z_sa_hdl = NULL;
	zp->z_mapcnt = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;
	zp->z_sync_writes_cnt = 0;
	zp->z_async_writes_cnt = 0;

	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
	    &parent, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    (zp->z_pflags & ZFS_PROJID) &&
	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
		if (hdl == NULL)
			sa_handle_destroy(zp->z_sa_hdl);
		zp->z_sa_hdl = NULL;
		goto error;
	}

	zp->z_projid = projid;
	zp->z_mode = ip->i_mode = mode;
	ip->i_generation = (uint32_t)tmp_gen;
	ip->i_blkbits = SPA_MINBLOCKSHIFT;
	set_nlink(ip, (uint32_t)links);
	zfs_uid_write(ip, z_uid);
	zfs_gid_write(ip, z_gid);
	zfs_set_inode_flags(zp, ip);

	/* Cache the xattr parent id */
	if (zp->z_pflags & ZFS_XATTR)
		zp->z_xattr_parent = parent;

	ZFS_TIME_DECODE(&ip->i_atime, atime);
	ZFS_TIME_DECODE(&ip->i_mtime, mtime);
	ZFS_TIME_DECODE(&tmp_ctime, ctime);
	zpl_inode_set_ctime_to_ts(ip, tmp_ctime);
	ZFS_TIME_DECODE(&zp->z_btime, btime);

	ip->i_ino = zp->z_id;
	zfs_znode_update_vfs(zp);
	zfs_inode_set_ops(zfsvfs, ip);

	/*
	 * The only way insert_inode_locked() can fail is if the ip->i_ino
	 * number is already hashed for this super block. This can never
	 * happen because the inode numbers map 1:1 with the object numbers.
	 *
	 * Exceptions include rolling back a mounted file system, either
	 * from the zfs rollback or zfs recv command.
	 *
	 * Active inodes are unhashed during the rollback, but since zrele
	 * can happen asynchronously, we can't guarantee they've been
	 * unhashed. This can cause hash collisions in unlinked drain
	 * processing so do not hash unlinked znodes.
	 */
	if (links > 0)
		VERIFY3S(insert_inode_locked(ip), ==, 0);

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (links > 0)
		unlock_new_inode(ip);
	return (zp);

error:
	iput(ip);
	return (NULL);
}

/*
 * Safely mark an inode dirty. Inodes which are part of a read-only
 * file system or snapshot may not be dirtied.
 */
void
zfs_mark_inode_dirty(struct inode *ip)
{
	zfsvfs_t *zfsvfs = ITOZSB(ip);

	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
		return;

	mark_inode_dirty(ip);
}

static uint64_t empty_xattr;
static uint64_t pad[4];
static zfs_acl_phys_t acl_phys;
/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_TMPFILE	- new object is of O_TMPFILE
 *			  IS_XATTR	- new object is an attribute
 *		acl_ids	- ACL related attributes
 *
 *	OUT:	zpp	- allocated znode (set to dzp if IS_ROOT_NODE)
 *
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
{
	uint64_t crtime[2], atime[2], mtime[2], ctime[2];
	uint64_t mode, size, links, parent, pflags;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	uint64_t rdev = 0;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	dmu_buf_t *db;
	inode_timespec_t now;
	uint64_t gen, obj;
	int bonuslen;
	int dnodesize;
	sa_handle_t *sa_hdl;
	dmu_object_type_t obj_type;
	sa_bulk_attr_t *sa_attrs;
	int cnt = 0;
	zfs_acl_locator_cb_t locate = { 0 };
	znode_hold_t *zh;

	if (zfsvfs->z_replay) {
		obj = vap->va_nodeid;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
		dnodesize = vap->va_fsid;	/* ditto */
	} else {
		obj = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
	}

	if (dnodesize == 0)
		dnodesize = DNODE_MIN_SIZE;

	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;

	bonuslen = (obj_type == DMU_OT_SA) ?
	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (S_ISDIR(vap->va_mode)) {
		if (zfsvfs->z_replay) {
			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = zap_create_norm_dnsize(zfsvfs->z_os,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx);
		}
	} else {
		if (zfsvfs->z_replay) {
			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx);
		}
	}

	zh = zfs_znode_hold_enter(zfsvfs, obj);
	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_id = obj;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp->z_pflags & ZFS_XATTR) {
		flag |= IS_XATTR;
	}

	if (zfsvfs->z_use_fuids)
		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
	else
		pflags = 0;

	if (S_ISDIR(vap->va_mode)) {
		size = 2;		/* contents ("." and "..") */
		links = 2;
	} else {
		size = 0;
		links = (flag & IS_TMPFILE) ? 0 : 1;
	}

	if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
		rdev = vap->va_rdev;

	parent = dzp->z_id;
	mode = acl_ids->z_mode;
	if (flag & IS_XATTR)
		pflags |= ZFS_XATTR;

	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
		/*
		 * With ZFS_PROJID flag, we can easily know whether there is
		 * project ID stored on disk or not. See zfs_space_delta_cb().
		 */
		if (obj_type != DMU_OT_ZNODE &&
		    dmu_objset_projectquota_enabled(zfsvfs->z_os))
			pflags |= ZFS_PROJID;

		/*
		 * Inherit project ID from parent if required.
		 */
		projid = zfs_inherit_projid(dzp);
		if (dzp->z_pflags & ZFS_PROJINHERIT)
			pflags |= ZFS_PROJINHERIT;
	}

	/*
	 * No execs denied will be determined when zfs_mode_compute() is called.
	 */
	pflags |= acl_ids->z_aclp->z_hints &
	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);

	ZFS_TIME_ENCODE(&now, crtime);
	ZFS_TIME_ENCODE(&now, ctime);

	if (vap->va_mask & ATTR_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, atime);
	} else {
		ZFS_TIME_ENCODE(&now, atime);
	}

	if (vap->va_mask & ATTR_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
	} else {
		ZFS_TIME_ENCODE(&now, mtime);
	}

	/* Now add in all of the "SA" attributes */
	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
	    &sa_hdl));

	/*
	 * Setup the array of attributes to be replaced/set on the new file
	 *
	 * order for DMU_OT_ZNODE is critical since it needs to be constructed
	 * in the old znode_phys_t format. Don't change this ordering
	 */
	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
	} else {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
		    NULL, &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
		    NULL, &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
	}

	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
		    &empty_xattr, 8);
	} else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    pflags & ZFS_PROJID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
		    NULL, &projid, 8);
	}
	if (obj_type == DMU_OT_ZNODE ||
	    (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
		    NULL, &rdev, 8);
	}
	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
		    &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
		    &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
		    sizeof (uint64_t) * 4);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
		    &acl_phys, sizeof (zfs_acl_phys_t));
	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
		    &acl_ids->z_aclp->z_acl_count, 8);
		locate.cb_aclp = acl_ids->z_aclp;
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
		    zfs_acl_data_locator, &locate,
		    acl_ids->z_aclp->z_acl_bytes);
		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
		    acl_ids->z_fuid, acl_ids->z_fgid);
	}

	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);

	if (!(flag & IS_ROOT_NODE)) {
		/*
		 * The call to zfs_znode_alloc() may fail if memory is low
		 * via the call path: alloc_inode() -> inode_init_always() ->
		 * security_inode_alloc() -> inode_alloc_security(). Since
		 * the existing code is written such that zfs_mknode() cannot
		 * fail, retry until sufficient memory has been reclaimed.
		 */
		do {
			*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
		} while (*zpp == NULL);

		VERIFY(*zpp != NULL);
		VERIFY(dzp != NULL);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;

		(*zpp)->z_sa_hdl = sa_hdl;
	}

	(*zpp)->z_pflags = pflags;
	(*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
	(*zpp)->z_dnodesize = dnodesize;
	(*zpp)->z_projid = projid;

	if (obj_type == DMU_OT_ZNODE ||
	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
	}
	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
	zfs_znode_hold_exit(zfsvfs, zh);
}

/*
 * Update in-core attributes. It is assumed the caller will be doing an
 * sa_bulk_update to push the changes out.
 */
void
zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
{
	xoptattr_t *xoap;
	boolean_t update_inode = B_FALSE;

	xoap = xva_getxoptattr(xvap);
	ASSERT(xoap);

	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
		uint64_t times[2];
		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
		    &times, sizeof (times), tx);
		XVA_SET_RTN(xvap, XAT_CREATETIME);
	}
	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_READONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_HIDDEN);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_SYSTEM);
	}
	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_ARCHIVE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_IMMUTABLE);

		update_inode = B_TRUE;
	}
	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NOUNLINK);
	}
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_APPENDONLY);

		update_inode = B_TRUE;
	}
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NODUMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OPAQUE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
		zfs_sa_set_scanstamp(zp, xvap, tx);
		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_REPARSE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OFFLINE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_SPARSE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
		ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_PROJINHERIT);
	}

	if (update_inode)
		zfs_set_inode_flags(zp, ZTOI(zp));
}

int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	dmu_object_info_t doi;
	dmu_buf_t *db;
	znode_t *zp;
	znode_hold_t *zh;
	int err;
	sa_handle_t *hdl;

	*zpp = NULL;

again:
	zh = zfs_znode_hold_enter(zfsvfs, obj_num);

	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		zfs_znode_hold_exit(zfsvfs, zh);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EINVAL));
	}

	hdl = dmu_buf_get_user(db);
	if (hdl != NULL) {
		zp = sa_get_userdata(hdl);

		/*
		 * Since "SA" does immediate eviction we
		 * should never find a sa handle that doesn't
		 * know about the znode.
		 */
		ASSERT3P(zp, !=, NULL);

		mutex_enter(&zp->z_lock);
		ASSERT3U(zp->z_id, ==, obj_num);
		/*
		 * If zp->z_unlinked is set, the znode is already marked
		 * for deletion and should not be discovered. Check this
		 * after checking igrab() due to fsetxattr() & O_TMPFILE.
		 *
		 * If igrab() returns NULL the VFS has independently
		 * determined the inode should be evicted and has
		 * called iput_final() to start the eviction process.
		 * The SA handle is still valid but because the VFS
		 * requires that the eviction succeed we must drop
		 * our locks and references to allow the eviction to
		 * complete. The zfs_zget() may then be retried.
		 *
		 * This unlikely case could be optimized by registering
		 * a sops->drop_inode() callback. The callback would
		 * need to detect the active SA hold thereby informing
		 * the VFS that this inode should not be evicted.
		 */
		if (igrab(ZTOI(zp)) == NULL) {
			if (zp->z_unlinked)
				err = SET_ERROR(ENOENT);
			else
				err = SET_ERROR(EAGAIN);
		} else {
			*zpp = zp;
			err = 0;
		}

		mutex_exit(&zp->z_lock);
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);

		if (err == EAGAIN) {
			/* inode might need this to finish evict */
			cond_resched();
			goto again;
		}
		return (err);
	}

	/*
	 * Not found; create a new znode/vnode, but only if the file exists.
	 *
	 * There is a small window where zfs_vget() could
	 * find this object while a file create is still in
	 * progress. This is checked for in zfs_znode_alloc().
	 *
	 * If zfs_znode_alloc() fails it will drop the hold on the
	 * bonus buffer.
	 */
	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
	    doi.doi_bonus_type, NULL);
	if (zp == NULL) {
		err = SET_ERROR(ENOENT);
	} else {
		*zpp = zp;
	}
	zfs_znode_hold_exit(zfsvfs, zh);
	return (err);
}
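
/*
 * Illustrative usage sketch (an assumption about a typical caller, not
 * code from this file): look up a znode by object number, then release
 * the reference taken by igrab() with zrele() when done.
 *
 *	znode_t *zp;
 *	int err = zfs_zget(zfsvfs, obj_num, &zp);
 *	if (err == 0) {
 *		// ... use zp ...
 *		zrele(zp);
 *	}
 */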

int
zfs_rezget(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_object_info_t doi;
	dmu_buf_t *db;
	uint64_t obj_num = zp->z_id;
	uint64_t mode;
	uint64_t links;
	sa_bulk_attr_t bulk[11];
	int err;
	int count = 0;
	uint64_t gen;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2], btime[2];
	inode_timespec_t tmp_ctime;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	znode_hold_t *zh;

	/*
	 * Skip ctldir nodes, otherwise they will always get invalidated.
	 * That would cause odd behaviour for mounted snapdirs; in
	 * particular, on Linux >= 3.18, d_invalidate will detach the
	 * mountpoint and prevent anyone from automounting it again as
	 * long as someone is still using the detached mount.
	 */
	if (zp->z_is_ctldir)
		return (0);

	zh = zfs_znode_hold_enter(zfsvfs, obj_num);

	mutex_enter(&zp->z_acl_lock);
	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}
	mutex_exit(&zp->z_acl_lock);

	rw_enter(&zp->z_xattr_lock, RW_WRITER);
	if (zp->z_xattr_cached) {
		nvlist_free(zp->z_xattr_cached);
		zp->z_xattr_cached = NULL;
	}
	rw_exit(&zp->z_xattr_lock);

	ASSERT(zp->z_sa_hdl == NULL);
	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		zfs_znode_hold_exit(zfsvfs, zh);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EINVAL));
	}

	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);

	/* reload cached values */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
	    &gen, sizeof (gen));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, sizeof (zp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &links, sizeof (links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
	    &z_uid, sizeof (z_uid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
	    &z_gid, sizeof (z_gid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
	    &mode, sizeof (mode));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
	    &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
	    &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	    &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EIO));
	}

	if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
		    &projid, 8);
		if (err != 0 && err != ENOENT) {
			zfs_znode_dmu_fini(zp);
			zfs_znode_hold_exit(zfsvfs, zh);
			return (SET_ERROR(err));
		}
	}

	zp->z_projid = projid;
	zp->z_mode = ZTOI(zp)->i_mode = mode;
	zfs_uid_write(ZTOI(zp), z_uid);
	zfs_gid_write(ZTOI(zp), z_gid);

	ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
	ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
	ZFS_TIME_DECODE(&tmp_ctime, ctime);
	zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime);
	ZFS_TIME_DECODE(&zp->z_btime, btime);

	if ((uint32_t)gen != ZTOI(zp)->i_generation) {
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EIO));
	}

	set_nlink(ZTOI(zp), (uint32_t)links);
	zfs_set_inode_flags(zp, ZTOI(zp));

	zp->z_blksz = doi.doi_data_block_size;
	zp->z_atime_dirty = B_FALSE;
	zfs_znode_update_vfs(zp);

	/*
	 * If the file has zero links, then it has been unlinked on the send
	 * side and it must be in the received unlinked set.
	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
	 * stale data and to prevent automatic removal of the file in
	 * zfs_zinactive(). The file will be removed either when it is removed
	 * on the send side and the next incremental stream is received or
	 * when the unlinked set gets processed.
	 */
	zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
	if (zp->z_unlinked)
		zfs_znode_dmu_fini(zp);

	zfs_znode_hold_exit(zfsvfs, zh);

	return (0);
}

void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	objset_t *os = zfsvfs->z_os;
	uint64_t obj = zp->z_id;
	uint64_t acl_obj = zfs_external_acl(zp);
	znode_hold_t *zh;

	zh = zfs_znode_hold_enter(zfsvfs, obj);
	if (acl_obj) {
		VERIFY(!zp->z_is_sa);
		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
	}
	VERIFY(0 == dmu_object_free(os, obj, tx));
	zfs_znode_dmu_fini(zp);
	zfs_znode_hold_exit(zfsvfs, zh);
}

void
zfs_zinactive(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	uint64_t z_id = zp->z_id;
	znode_hold_t *zh;

	ASSERT(zp->z_sa_hdl);

	/*
	 * Don't allow a zfs_zget() while we're trying to release this znode.
	 */
	zh = zfs_znode_hold_enter(zfsvfs, z_id);

	mutex_enter(&zp->z_lock);

	/*
	 * If this was the last reference to a file with no links, remove
	 * the file from the file system unless the file system is mounted
	 * read-only. That can happen, for example, if the file system was
	 * originally read-write, the file was opened, then unlinked and
	 * the file system was made read-only before the file was finally
	 * closed. The file will remain in the unlinked set.
	 */
	if (zp->z_unlinked) {
		ASSERT(!zfsvfs->z_issnap);
		if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
			mutex_exit(&zp->z_lock);
			zfs_znode_hold_exit(zfsvfs, zh);
			zfs_rmnode(zp);
			return;
		}
	}

	mutex_exit(&zp->z_lock);
	zfs_znode_dmu_fini(zp);

	zfs_znode_hold_exit(zfsvfs, zh);
}

#if defined(HAVE_INODE_TIMESPEC64_TIMES)
#define	zfs_compare_timespec	timespec64_compare
#else
#define	zfs_compare_timespec	timespec_compare
#endif

/*
 * Determine whether the znode's atime must be updated. The logic mostly
 * duplicates the Linux kernel's relatime_need_update() functionality.
 * This function is only called if the underlying filesystem actually has
 * atime updates enabled.
 */
boolean_t
zfs_relatime_need_update(const struct inode *ip)
{
	inode_timespec_t now, tmp_ctime;

	gethrestime(&now);
	/*
	 * In relatime mode, only update the atime if the previous atime
	 * is earlier than either the ctime or mtime or if at least a day
	 * has passed since the last update of atime.
	 */
	if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0)
		return (B_TRUE);

	tmp_ctime = zpl_inode_get_ctime(ip);
	if (zfs_compare_timespec(&tmp_ctime, &ip->i_atime) >= 0)
		return (B_TRUE);

	if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60)
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * Prepare to update znode time stamps.
 *
 *	IN:	zp	- znode requiring timestamp update
 *		flag	- ATTR_MTIME, ATTR_CTIME flags
 *
 *	OUT:	zp	- z_seq
 *		mtime	- new mtime
 *		ctime	- new ctime
 *
 *	Note: We don't update atime here, because we rely on Linux VFS to do
 *	atime updating.
 */
void
zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
    uint64_t ctime[2])
{
	inode_timespec_t now, tmp_ctime;

	gethrestime(&now);

	zp->z_seq++;

	if (flag & ATTR_MTIME) {
		ZFS_TIME_ENCODE(&now, mtime);
		ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
		if (ZTOZSB(zp)->z_use_fuids) {
			zp->z_pflags |= (ZFS_ARCHIVE |
			    ZFS_AV_MODIFIED);
		}
	}

	if (flag & ATTR_CTIME) {
		ZFS_TIME_ENCODE(&now, ctime);
		ZFS_TIME_DECODE(&tmp_ctime, ctime);
		zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime);
		if (ZTOZSB(zp)->z_use_fuids)
			zp->z_pflags |= ZFS_ARCHIVE;
	}
}
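
/*
 * Illustrative usage sketch (the pattern zfs_freesp() below follows):
 * encode the new timestamps, then push them out with an SA bulk update
 * in the same transaction.
 *
 *	uint64_t mtime[2], ctime[2];
 *	sa_bulk_attr_t bulk[2];
 *	int count = 0;
 *
 *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
 *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
 *	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
 *	(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 */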

/*
 * Grow the block size for a file.
 *
 *	IN:	zp	- znode of file whose block size is to be grown.
 *		size	- requested block size
 *		tx	- open transaction.
 *
 * NOTE: this function assumes that the znode is write locked.
 */
void
zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
{
	int error;
	u_longlong_t dummy;

	if (size <= zp->z_blksz)
		return;
	/*
	 * If the file size is already greater than the current blocksize,
	 * we will not grow. If there is more than one block in a file,
	 * the blocksize cannot change.
	 */
	if (zp->z_blksz && zp->z_size > zp->z_blksz)
		return;

	error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
	    size, 0, tx);

	if (error == ENOTSUP)
		return;
	ASSERT0(error);

	/* What blocksize did we actually get? */
	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
}

/*
 * Increase the file length.
 *
 *	IN:	zp	- znode of file to extend.
 *		end	- new end-of-file
 *
 * RETURN: 0 on success, error code on failure
 */
static int
zfs_extend(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_tx_t *tx;
	zfs_locked_range_t *lr;
	uint64_t newblksz;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end <= zp->z_size) {
		zfs_rangelock_exit(lr);
		return (0);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	if (end > zp->z_blksz &&
	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
		/*
		 * We are growing the file past the current block size.
		 */
		if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
			/*
			 * File's blocksize is already larger than the
			 * "recordsize" property. Only let it grow to
			 * the next power of 2.
			 */
			ASSERT(!ISP2(zp->z_blksz));
			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
		} else {
			newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
		}
		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
	} else {
		newblksz = 0;
	}

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_rangelock_exit(lr);
		return (error);
	}

	if (newblksz)
		zfs_grow_blocksize(zp, newblksz, tx);

	zp->z_size = end;

	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
	    &zp->z_size, sizeof (zp->z_size), tx));

	zfs_rangelock_exit(lr);

	dmu_tx_commit(tx);

	return (0);
}

/*
 * zfs_zero_partial_page - Modeled after update_pages() but
 * with different arguments and semantics for use by zfs_freesp().
 *
 * Zeroes a piece of a single page cache entry for zp at offset
 * start and length len.
 *
 * Caller must acquire a range lock on the file for the region
 * being zeroed in order that the ARC and page cache stay in sync.
 */
static void
zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
{
	struct address_space *mp = ZTOI(zp)->i_mapping;
	struct page *pp;
	int64_t	off;
	void *pb;

	ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));

	off = start & (PAGE_SIZE - 1);
	start &= PAGE_MASK;

	pp = find_lock_page(mp, start >> PAGE_SHIFT);
	if (pp) {
		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		pb = kmap(pp);
		memset(pb + off, 0, len);
		kunmap(pp);

		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		mark_page_accessed(pp);
		SetPageUptodate(pp);
		ClearPageError(pp);
		unlock_page(pp);
		put_page(pp);
	}
}

/*
 * Free space in a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of section to free.
 *		len	- length of section to free.
 *
 * RETURN: 0 on success, error code on failure
 */
static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	zfs_locked_range_t *lr;
	int error;

	/*
	 * Lock the range being freed.
	 */
	lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (off >= zp->z_size) {
		zfs_rangelock_exit(lr);
		return (0);
	}

	if (off + len > zp->z_size)
		len = zp->z_size - off;

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);

	/*
	 * Zero partial page cache entries. This must be done under a
	 * range lock in order to keep the ARC and page cache in sync.
	 */
	if (zn_has_cached_data(zp, off, off + len - 1)) {
		loff_t first_page, last_page, page_len;
		loff_t first_page_offset, last_page_offset;

		/* first possible full page in hole */
		first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
		/* last page of hole */
		last_page = (off + len) >> PAGE_SHIFT;

		/* offset of first_page */
		first_page_offset = first_page << PAGE_SHIFT;
		/* offset of last_page */
		last_page_offset = last_page << PAGE_SHIFT;

		/* truncate whole pages */
		if (last_page_offset > first_page_offset) {
			truncate_inode_pages_range(ZTOI(zp)->i_mapping,
			    first_page_offset, last_page_offset - 1);
		}

		/* truncate sub-page ranges */
		if (first_page > last_page) {
			/* entire punched area within a single page */
			zfs_zero_partial_page(zp, off, len);
		} else {
			/* beginning of punched area at the end of a page */
			page_len = first_page_offset - off;
			if (page_len > 0)
				zfs_zero_partial_page(zp, off, page_len);

			/* end of punched area at the beginning of a page */
			page_len = off + len - last_page_offset;
			if (page_len > 0)
				zfs_zero_partial_page(zp, last_page_offset,
				    page_len);
		}
	}
	zfs_rangelock_exit(lr);

	return (error);
}

/*
 * Truncate a file.
 *
 *	IN:	zp	- znode of file to truncate.
 *		end	- new end-of-file.
 *
 * RETURN: 0 on success, error code on failure
 */
static int
zfs_trunc(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_tx_t *tx;
	zfs_locked_range_t *lr;
	int error;
	sa_bulk_attr_t bulk[2];
	int count = 0;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end >= zp->z_size) {
		zfs_rangelock_exit(lr);
		return (0);
	}

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
	    DMU_OBJECT_END);
	if (error) {
		zfs_rangelock_exit(lr);
		return (error);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_rangelock_exit(lr);
		return (error);
	}

	zp->z_size = end;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
	    NULL, &zp->z_size, sizeof (zp->z_size));

	if (end == 0) {
		zp->z_pflags &= ~ZFS_SPARSE;
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &zp->z_pflags, 8);
	}
	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);

	dmu_tx_commit(tx);
	zfs_rangelock_exit(lr);

	return (0);
}

/*
 * Free space in a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of range
 *		len	- end of range (0 => EOF)
 *		flag	- current file open mode flags.
 *		log	- TRUE if this action should be logged
 *
 * RETURN: 0 on success, error code on failure
 */
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
{
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	zilog_t *zilog = zfsvfs->z_log;
	uint64_t mode;
	uint64_t mtime[2], ctime[2];
	sa_bulk_attr_t bulk[3];
	int count = 0;
	int error;

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
	    sizeof (mode))) != 0)
		return (error);

	if (off > zp->z_size) {
		error = zfs_extend(zp, off+len);
		if (error == 0 && log)
			goto log;
		goto out;
	}

	if (len == 0) {
		error = zfs_trunc(zp, off);
	} else {
		if ((error = zfs_free_range(zp, off, len)) == 0 &&
		    off + len > zp->z_size)
			error = zfs_extend(zp, off+len);
	}
	if (error || !log)
		goto out;
log:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto out;
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
	    NULL, &zp->z_pflags, 8);
	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT(error == 0);

	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);

	dmu_tx_commit(tx);

	zfs_znode_update_vfs(zp);
	error = 0;

out:
	/*
	 * Truncate the page cache - for file truncate operations, use
	 * the purpose-built API for truncations. For punching operations,
	 * the truncation is handled under a range lock in zfs_free_range.
	 */
	if (len == 0)
		truncate_setsize(ZTOI(zp), off);
	return (error);
}
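
/*
 * Illustrative sketch of the zfs_freesp() semantics documented above
 * (hypothetical calls, not code from this file):
 *
 *	// Truncate the file to 'off' bytes.
 *	error = zfs_freesp(zp, off, 0, flag, B_TRUE);
 *
 *	// Punch a hole covering [off, off + len), extending EOF if the
 *	// hole ends past the current end of file.
 *	error = zfs_freesp(zp, off, len, flag, B_TRUE);
 */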
1843
1844 void
1845 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1846 {
1847 struct super_block *sb;
1848 zfsvfs_t *zfsvfs;
1849 uint64_t moid, obj, sa_obj, version;
1850 uint64_t sense = ZFS_CASE_SENSITIVE;
1851 uint64_t norm = 0;
1852 nvpair_t *elem;
1853 int size;
1854 int error;
1855 int i;
1856 znode_t *rootzp = NULL;
1857 vattr_t vattr;
1858 znode_t *zp;
1859 zfs_acl_ids_t acl_ids;
1860
1861 /*
1862 * First attempt to create master node.
1863 */
1864 /*
1865 * In an empty objset, there are no blocks to read and thus
1866 * there can be no i/o errors (which we assert below).
1867 */
1868 moid = MASTER_NODE_OBJ;
1869 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1870 DMU_OT_NONE, 0, tx);
1871 ASSERT(error == 0);
1872
1873 /*
1874 * Set starting attributes.
1875 */
1876 version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
1877 elem = NULL;
1878 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1879 /* For the moment we expect all zpl props to be uint64_ts */
1880 uint64_t val;
1881 const char *name;
1882
1883 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1884 VERIFY(nvpair_value_uint64(elem, &val) == 0);
1885 name = nvpair_name(elem);
1886 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
1887 if (val < version)
1888 version = val;
1889 } else {
1890 error = zap_update(os, moid, name, 8, 1, &val, tx);
1891 }
1892 ASSERT(error == 0);
1893 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1894 norm = val;
1895 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1896 sense = val;
1897 }
1898 ASSERT(version != 0);
1899 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
1900 ASSERT(error == 0);

	/*
	 * Create zap object used for SA attribute registration
	 */

	if (version >= ZPL_VERSION_SA) {
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);
		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		ASSERT(error == 0);
	} else {
		sa_obj = 0;
	}
	/*
	 * Create a delete queue.
	 */
	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode.  Create minimal znode/inode/zfsvfs/sb
	 * to allow zfs_mknode to work.
	 */
	vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	rootzp->z_unlinked = B_FALSE;
	rootzp->z_atime_dirty = B_FALSE;
	rootzp->z_is_sa = USE_SA(version, os);
	rootzp->z_pflags = 0;

	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
	zfsvfs->z_os = os;
	zfsvfs->z_parent = zfsvfs;
	zfsvfs->z_version = version;
	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
	zfsvfs->z_use_sa = USE_SA(version, os);
	zfsvfs->z_norm = norm;

	sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
	sb->s_fs_info = zfsvfs;

	ZTOI(rootzp)->i_sb = sb;

	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zfsvfs->z_attr_table);

	ASSERT(error == 0);

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

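	/*
	 * Worked example (illustrative): the tunable is rounded down to a
	 * power of two and capped at ZFS_OBJ_MTX_MAX.  A setting of 100
	 * gives highbit64(100) == 7, so size = 1 << 6 == 64.
	 */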
	size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
	zfsvfs->z_hold_size = size;
	zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
	    KM_SLEEP);
	zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
	for (i = 0; i != size; i++) {
		avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
		    sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
		mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
	}

	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids, zfs_init_idmap));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	zfs_acl_ids_free(&acl_ids);

	atomic_set(&ZTOI(rootzp)->i_count, 0);
	sa_handle_destroy(rootzp->z_sa_hdl);
	kmem_cache_free(znode_cache, rootzp);

	for (i = 0; i != size; i++) {
		avl_destroy(&zfsvfs->z_hold_trees[i]);
		mutex_destroy(&zfsvfs->z_hold_locks[i]);
	}

	mutex_destroy(&zfsvfs->z_znodes_lock);

	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
	kmem_free(sb, sizeof (struct super_block));
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
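
/*
 * Illustrative sketch (hypothetical caller, not part of this file; os, cr
 * and tx are assumed to be in scope): the zplprops nvlist consumed by
 * zfs_create_fs() is a flat list of uint64 properties keyed by property
 * name, e.g. as a dataset-creation path might build it.
 */
#if 0
	nvlist_t *zplprops = fnvlist_alloc();

	fnvlist_add_uint64(zplprops,
	    zfs_prop_to_name(ZFS_PROP_VERSION), ZPL_VERSION);
	fnvlist_add_uint64(zplprops,
	    zfs_prop_to_name(ZFS_PROP_CASE), ZFS_CASE_SENSITIVE);
	zfs_create_fs(os, cr, zplprops, tx);
	fnvlist_free(zplprops);
#endif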
#endif /* _KERNEL */

static int
zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
{
	uint64_t sa_obj = 0;
	int error;

	error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
	if (error != 0 && error != ENOENT)
		return (error);

	error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
	return (error);
}

static int
zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
    dmu_buf_t **db, const void *tag)
{
	dmu_object_info_t doi;
	int error;

	if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
		return (error);

	dmu_object_info_from_db(*db, &doi);
	if ((doi.doi_bonus_type != DMU_OT_SA &&
	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t))) {
		sa_buf_rele(*db, tag);
		return (SET_ERROR(ENOTSUP));
	}

	error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
	if (error != 0) {
		sa_buf_rele(*db, tag);
		return (error);
	}

	return (0);
}

static void
zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, const void *tag)
{
	sa_handle_destroy(hdl);
	sa_buf_rele(db, tag);
}
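
/*
 * Illustrative sketch (hypothetical helper, not part of this file): the
 * usual pairing of zfs_sa_setup(), zfs_grab_sa_handle() and
 * zfs_release_sa_handle(), here reading an object's generation number.
 */
#if 0
static int
zfs_example_obj_to_gen(objset_t *osp, uint64_t obj, uint64_t *genp)
{
	sa_attr_type_t *sa_table;
	sa_handle_t *hdl;
	dmu_buf_t *db;
	int error;

	/* Map the ZPL_* indexes to this objset's SA attribute numbers. */
	error = zfs_sa_setup(osp, &sa_table);
	if (error != 0)
		return (error);

	/* Hold the object's bonus buffer and get a private SA handle. */
	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
	if (error != 0)
		return (error);

	error = sa_lookup(hdl, sa_table[ZPL_GEN], genp, sizeof (*genp));

	/* Destroys the handle and drops the buffer hold, in that order. */
	zfs_release_sa_handle(hdl, db, FTAG);
	return (error);
}
#endif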

/*
 * Given an object number, return its parent object number and whether
 * or not the object is an extended attribute directory.
 */
static int
zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
    uint64_t *pobjp, int *is_xattrdir)
{
	uint64_t parent;
	uint64_t pflags;
	uint64_t mode;
	uint64_t parent_mode;
	sa_bulk_attr_t bulk[3];
	sa_handle_t *sa_hdl;
	dmu_buf_t *sa_db;
	int count = 0;
	int error;

	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
	    &parent, sizeof (parent));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
	    &pflags, sizeof (pflags));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
	    &mode, sizeof (mode));

	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
		return (error);

	/*
	 * When a link is removed, its parent pointer is not changed and
	 * will be invalid.  There are two cases where a link is removed
	 * but the file stays around: when it goes to the delete queue,
	 * and when there are additional links.
	 */
	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
	if (error != 0)
		return (error);

	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
	if (error != 0)
		return (error);

	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);

	/*
	 * Extended attributes can be applied to files, directories, etc.,
	 * so an xattr directory's parent may be any object type; otherwise
	 * the parent must be a directory.
	 */
	if (!*is_xattrdir && !S_ISDIR(parent_mode))
		return (SET_ERROR(EINVAL));

	*pobjp = parent;

	return (0);
}

/*
 * Given an object number, return some zpl level statistics
 */
static int
zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
    zfs_stat_t *sb)
{
	sa_bulk_attr_t bulk[4];
	int count = 0;

	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
	    &sb->zs_mode, sizeof (sb->zs_mode));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
	    &sb->zs_gen, sizeof (sb->zs_gen));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
	    &sb->zs_links, sizeof (sb->zs_links));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
	    &sb->zs_ctime, sizeof (sb->zs_ctime));

	return (sa_bulk_lookup(hdl, bulk, count));
}

static int
zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
    sa_attr_type_t *sa_table, char *buf, int len)
{
	sa_handle_t *sa_hdl;
	sa_handle_t *prevhdl = NULL;
	dmu_buf_t *prevdb = NULL;
	dmu_buf_t *sa_db = NULL;
	char *path = buf + len - 1;
	int error;

	*path = '\0';
	sa_hdl = hdl;

	/* An object on the delete queue has no valid path; report it stale. */
	uint64_t deleteq_obj;
	VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
	    ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
	error = zap_lookup_int(osp, deleteq_obj, obj);
	if (error == 0) {
		return (ESTALE);
	} else if (error != ENOENT) {
		return (error);
	}

	/* Walk up the parent chain, prepending one component per step. */
	for (;;) {
		uint64_t pobj = 0;
		char component[MAXNAMELEN + 2];
		size_t complen;
		int is_xattrdir = 0;

		if (prevdb) {
			ASSERT(prevhdl != NULL);
			zfs_release_sa_handle(prevhdl, prevdb, FTAG);
		}

		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
		    &is_xattrdir)) != 0)
			break;

		/* The root directory is its own parent; we are done. */
		if (pobj == obj) {
			if (path[0] != '/')
				*--path = '/';
			break;
		}

		component[0] = '/';
		if (is_xattrdir) {
			strcpy(component + 1, "<xattrdir>");
		} else {
			error = zap_value_search(osp, pobj, obj,
			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
			if (error != 0)
				break;
		}

		complen = strlen(component);
		path -= complen;
		ASSERT(path >= buf);
		memcpy(path, component, complen);
		obj = pobj;

		if (sa_hdl != hdl) {
			prevhdl = sa_hdl;
			prevdb = sa_db;
		}
		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
		if (error != 0) {
			sa_hdl = prevhdl;
			sa_db = prevdb;
			break;
		}
	}

	if (sa_hdl != NULL && sa_hdl != hdl) {
		ASSERT(sa_db != NULL);
		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
	}

	/* Shift the completed path from the end of buf to its start. */
	if (error == 0)
		(void) memmove(buf, path, buf + len - path);

	return (error);
}
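
/*
 * Worked example (illustrative): resolving an object at /a/b with len = 8
 * fills the buffer right to left -- "???????\0", then "?????/b\0", then
 * "???/a/b\0" -- and the final memmove() slides "/a/b\0" to the front.
 */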

int
zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
{
	sa_attr_type_t *sa_table;
	sa_handle_t *hdl;
	dmu_buf_t *db;
	int error;

	error = zfs_sa_setup(osp, &sa_table);
	if (error != 0)
		return (error);

	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
	if (error != 0)
		return (error);

	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);

	zfs_release_sa_handle(hdl, db, FTAG);
	return (error);
}
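
/*
 * Usage sketch (hypothetical caller; osp and obj are assumed to be in
 * scope): the path is assembled in the caller's buffer, so the
 * conventional choice is a MAXPATHLEN-sized buffer.
 */
#if 0
	char path[MAXPATHLEN];

	if (zfs_obj_to_path(osp, obj, path, sizeof (path)) == 0)
		zfs_dbgmsg("object %llu is at %s", (u_longlong_t)obj, path);
#endif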

int
zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
    char *buf, int len)
{
	char *path = buf + len - 1;
	sa_attr_type_t *sa_table;
	sa_handle_t *hdl;
	dmu_buf_t *db;
	int error;

	*path = '\0';

	error = zfs_sa_setup(osp, &sa_table);
	if (error != 0)
		return (error);

	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
	if (error != 0)
		return (error);

	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
	if (error != 0) {
		zfs_release_sa_handle(hdl, db, FTAG);
		return (error);
	}

	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);

	zfs_release_sa_handle(hdl, db, FTAG);
	return (error);
}

/*
 * Read a property stored within the master node.
 */
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
	uint64_t *cached_copy = NULL;

	/*
	 * Figure out where in the objset_t the cached copy would live, if it
	 * is available for the requested property.
	 */
	if (os != NULL) {
		switch (prop) {
		case ZFS_PROP_VERSION:
			cached_copy = &os->os_version;
			break;
		case ZFS_PROP_NORMALIZE:
			cached_copy = &os->os_normalization;
			break;
		case ZFS_PROP_UTF8ONLY:
			cached_copy = &os->os_utf8only;
			break;
		case ZFS_PROP_CASE:
			cached_copy = &os->os_casesensitivity;
			break;
		default:
			break;
		}
	}
	if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
		*value = *cached_copy;
		return (0);
	}

	/*
	 * If the property wasn't cached, look up the file system's value for
	 * the property.  For the version property, we look up a slightly
	 * different string.
	 */
	const char *pname;
	int error = ENOENT;
	if (prop == ZFS_PROP_VERSION)
		pname = ZPL_VERSION_STR;
	else
		pname = zfs_prop_to_name(prop);

	if (os != NULL) {
		ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
	}

	if (error == ENOENT) {
		/* No value set, use the default value */
		switch (prop) {
		case ZFS_PROP_VERSION:
			*value = ZPL_VERSION;
			break;
		case ZFS_PROP_NORMALIZE:
		case ZFS_PROP_UTF8ONLY:
			*value = 0;
			break;
		case ZFS_PROP_CASE:
			*value = ZFS_CASE_SENSITIVE;
			break;
		case ZFS_PROP_ACLTYPE:
			*value = ZFS_ACLTYPE_OFF;
			break;
		default:
			return (error);
		}
		error = 0;
	}

	/*
	 * If one of the methods for getting the property value above worked,
	 * copy it into the objset_t's cache.
	 */
	if (error == 0 && cached_copy != NULL) {
		*cached_copy = *value;
	}

	return (error);
}
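
/*
 * Usage sketch (illustrative; os is assumed to be in scope): querying the
 * on-disk ZPL version, which falls back to the compiled-in default when
 * the property was never written to the master node.
 */
#if 0
	uint64_t zplver;

	if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplver) == 0)
		zfs_dbgmsg("ZPL version %llu", (u_longlong_t)zplver);
#endif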

#if defined(_KERNEL)
EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);

/* CSTYLED */
module_param(zfs_object_mutex_size, uint, 0644);
MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
module_param(zfs_unlink_suspend_progress, int, 0644);
MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
	"(debug - leaks space into the unlinked set)");
#endif