module/os/linux/zfs/zfs_znode.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  24  */
  25
  26 /* Portions Copyright 2007 Jeremy Teo */
  27
  28 #ifdef _KERNEL
  29 #include <sys/types.h>
  30 #include <sys/param.h>
  31 #include <sys/time.h>
  32 #include <sys/sysmacros.h>
  33 #include <sys/mntent.h>
  34 #include <sys/u8_textprep.h>
  35 #include <sys/dsl_dataset.h>
  36 #include <sys/vfs.h>
  37 #include <sys/vnode.h>
  38 #include <sys/file.h>
  39 #include <sys/kmem.h>
  40 #include <sys/errno.h>
  41 #include <sys/atomic.h>
  42 #include <sys/zfs_dir.h>
  43 #include <sys/zfs_acl.h>
  44 #include <sys/zfs_ioctl.h>
  45 #include <sys/zfs_rlock.h>
  46 #include <sys/zfs_fuid.h>
  47 #include <sys/zfs_vnops.h>
  48 #include <sys/zfs_ctldir.h>
  49 #include <sys/dnode.h>
  50 #include <sys/fs/zfs.h>
  51 #include <sys/zpl.h>
  52 #endif /* _KERNEL */
  53
  54 #include <sys/dmu.h>
  55 #include <sys/dmu_objset.h>
  56 #include <sys/dmu_tx.h>
  57 #include <sys/zfs_refcount.h>
  58 #include <sys/stat.h>
  59 #include <sys/zap.h>
  60 #include <sys/zfs_znode.h>
  61 #include <sys/sa.h>
  62 #include <sys/zfs_sa.h>
  63 #include <sys/zfs_stat.h>
  64
  65 #include "zfs_prop.h"
  66 #include "zfs_comutil.h"
  67
/*
 * Functions needed for userland (i.e. libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
  73 #ifdef _KERNEL
  74
/*
 * Kmem caches created in zfs_znode_init() and destroyed in zfs_znode_fini().
 * znode_cache backs znode_t allocations (see zfs_inode_alloc()) and
 * znode_hold_cache backs the znode_hold_t per-object locks handed out by
 * zfs_znode_hold_enter().
 */
static kmem_cache_t *znode_cache = NULL;
static kmem_cache_t *znode_hold_cache = NULL;
/*
 * Initialized to ZFS_OBJ_MTX_SZ.  Presumably the number of z_hold_locks
 * hash buckets -- TODO confirm; it is not referenced in this part of the file.
 */
unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;

/*
 * This is used by the test suite so that it can delay znodes from being
 * freed in order to inspect the unlinked set.
 */
static int zfs_unlink_suspend_progress = 0;
  84
  85 /*
  86  * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
  87  * z_rangelock. It will modify the offset and length of the lock to reflect
  88  * znode-specific information, and convert RL_APPEND to RL_WRITER.  This is
  89  * called with the rangelock_t's rl_lock held, which avoids races.
  90  */
  91 static void
  92 zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
  93 {
  94         znode_t *zp = arg;
  95
  96         /*
  97          * If in append mode, convert to writer and lock starting at the
  98          * current end of file.
  99          */
 100         if (new->lr_type == RL_APPEND) {
 101                 new->lr_offset = zp->z_size;
 102                 new->lr_type = RL_WRITER;
 103         }
 104
 105         /*
 106          * If we need to grow the block size then lock the whole file range.
 107          */
 108         uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
 109         if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
 110             zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
 111                 new->lr_offset = 0;
 112                 new->lr_length = UINT64_MAX;
 113         }
 114 }
 115
 116 static int
 117 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
 118 {
 119         (void) arg, (void) kmflags;
 120         znode_t *zp = buf;
 121
 122         inode_init_once(ZTOI(zp));
 123         list_link_init(&zp->z_link_node);
 124
 125         mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
 126         rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
 127         rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
 128         mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
 129         rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
 130
 131         zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
 132
 133         zp->z_dirlocks = NULL;
 134         zp->z_acl_cached = NULL;
 135         zp->z_xattr_cached = NULL;
 136         zp->z_xattr_parent = 0;
 137         zp->z_sync_writes_cnt = 0;
 138         zp->z_async_writes_cnt = 0;
 139
 140         return (0);
 141 }
 142
 143 static void
 144 zfs_znode_cache_destructor(void *buf, void *arg)
 145 {
 146         (void) arg;
 147         znode_t *zp = buf;
 148
 149         ASSERT(!list_link_active(&zp->z_link_node));
 150         mutex_destroy(&zp->z_lock);
 151         rw_destroy(&zp->z_parent_lock);
 152         rw_destroy(&zp->z_name_lock);
 153         mutex_destroy(&zp->z_acl_lock);
 154         rw_destroy(&zp->z_xattr_lock);
 155         zfs_rangelock_fini(&zp->z_rangelock);
 156
 157         ASSERT3P(zp->z_dirlocks, ==, NULL);
 158         ASSERT3P(zp->z_acl_cached, ==, NULL);
 159         ASSERT3P(zp->z_xattr_cached, ==, NULL);
 160
 161         ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
 162         ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
 163 }
 164
 165 static int
 166 zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
 167 {
 168         (void) arg, (void) kmflags;
 169         znode_hold_t *zh = buf;
 170
 171         mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
 172         zh->zh_refcount = 0;
 173
 174         return (0);
 175 }
 176
 177 static void
 178 zfs_znode_hold_cache_destructor(void *buf, void *arg)
 179 {
 180         (void) arg;
 181         znode_hold_t *zh = buf;
 182
 183         mutex_destroy(&zh->zh_lock);
 184 }
 185
 186 void
 187 zfs_znode_init(void)
 188 {
 189         /*
 190          * Initialize zcache.  The KMC_SLAB hint is used in order that it be
 191          * backed by kmalloc() when on the Linux slab in order that any
 192          * wait_on_bit() operations on the related inode operate properly.
 193          */
 194         ASSERT(znode_cache == NULL);
 195         znode_cache = kmem_cache_create("zfs_znode_cache",
 196             sizeof (znode_t), 0, zfs_znode_cache_constructor,
 197             zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
 198
 199         ASSERT(znode_hold_cache == NULL);
 200         znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
 201             sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
 202             zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
 203 }
 204
 205 void
 206 zfs_znode_fini(void)
 207 {
 208         /*
 209          * Cleanup zcache
 210          */
 211         if (znode_cache)
 212                 kmem_cache_destroy(znode_cache);
 213         znode_cache = NULL;
 214
 215         if (znode_hold_cache)
 216                 kmem_cache_destroy(znode_hold_cache);
 217         znode_hold_cache = NULL;
 218 }
 219
 220 /*
 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
 * serialize access to a znode and its SA buffer while the object is being
 * created or destroyed.  This kind of locking would normally reside in the
 * znode itself but in this case that's impossible because the znode and SA
 * buffer may not yet exist.  Therefore the locking is handled externally
 * with an array of mutexes and AVL trees which contain per-object locks.
 227  *
 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
 * into the correct AVL tree and finally the per-object lock is held.  In
 * zfs_znode_hold_exit() the process is reversed.  The per-object lock is
 * released, removed from the AVL tree and destroyed if there are no waiters.
 232  *
 233  * This scheme has two important properties:
 234  *
 235  * 1) No memory allocations are performed while holding one of the z_hold_locks.
 236  *    This ensures evict(), which can be called from direct memory reclaim, will
 237  *    never block waiting on a z_hold_locks which just happens to have hashed
 238  *    to the same index.
 239  *
 240  * 2) All locks used to serialize access to an object are per-object and never
 241  *    shared.  This minimizes lock contention without creating a large number
 242  *    of dedicated locks.
 243  *
 * On the downside it does require znode_hold_t structures to be frequently
 * allocated and freed.  However, because these are backed by a kmem cache
 * and very short lived this cost is minimal.
 247  */
 248 int
 249 zfs_znode_hold_compare(const void *a, const void *b)
 250 {
 251         const znode_hold_t *zh_a = (const znode_hold_t *)a;
 252         const znode_hold_t *zh_b = (const znode_hold_t *)b;
 253
 254         return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
 255 }
 256
 257 static boolean_t __maybe_unused
 258 zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
 259 {
 260         znode_hold_t *zh, search;
 261         int i = ZFS_OBJ_HASH(zfsvfs, obj);
 262         boolean_t held;
 263
 264         search.zh_obj = obj;
 265
 266         mutex_enter(&zfsvfs->z_hold_locks[i]);
 267         zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
 268         held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
 269         mutex_exit(&zfsvfs->z_hold_locks[i]);
 270
 271         return (held);
 272 }
 273
 274 znode_hold_t *
 275 zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
 276 {
 277         znode_hold_t *zh, *zh_new, search;
 278         int i = ZFS_OBJ_HASH(zfsvfs, obj);
 279         boolean_t found = B_FALSE;
 280
 281         zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
 282         search.zh_obj = obj;
 283
 284         mutex_enter(&zfsvfs->z_hold_locks[i]);
 285         zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
 286         if (likely(zh == NULL)) {
 287                 zh = zh_new;
 288                 zh->zh_obj = obj;
 289                 avl_add(&zfsvfs->z_hold_trees[i], zh);
 290         } else {
 291                 ASSERT3U(zh->zh_obj, ==, obj);
 292                 found = B_TRUE;
 293         }
 294         zh->zh_refcount++;
 295         ASSERT3S(zh->zh_refcount, >, 0);
 296         mutex_exit(&zfsvfs->z_hold_locks[i]);
 297
 298         if (found == B_TRUE)
 299                 kmem_cache_free(znode_hold_cache, zh_new);
 300
 301         ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
 302         mutex_enter(&zh->zh_lock);
 303
 304         return (zh);
 305 }
 306
 307 void
 308 zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
 309 {
 310         int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
 311         boolean_t remove = B_FALSE;
 312
 313         ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
 314         mutex_exit(&zh->zh_lock);
 315
 316         mutex_enter(&zfsvfs->z_hold_locks[i]);
 317         ASSERT3S(zh->zh_refcount, >, 0);
 318         if (--zh->zh_refcount == 0) {
 319                 avl_remove(&zfsvfs->z_hold_trees[i], zh);
 320                 remove = B_TRUE;
 321         }
 322         mutex_exit(&zfsvfs->z_hold_locks[i]);
 323
 324         if (remove == B_TRUE)
 325                 kmem_cache_free(znode_hold_cache, zh);
 326 }
 327
 328 dev_t
 329 zfs_cmpldev(uint64_t dev)
 330 {
 331         return (dev);
 332 }
 333
 334 static void
 335 zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
 336     dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
 337 {
 338         ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
 339
 340         mutex_enter(&zp->z_lock);
 341
 342         ASSERT(zp->z_sa_hdl == NULL);
 343         ASSERT(zp->z_acl_cached == NULL);
 344         if (sa_hdl == NULL) {
 345                 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
 346                     SA_HDL_SHARED, &zp->z_sa_hdl));
 347         } else {
 348                 zp->z_sa_hdl = sa_hdl;
 349                 sa_set_userp(sa_hdl, zp);
 350         }
 351
 352         zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
 353
 354         mutex_exit(&zp->z_lock);
 355 }
 356
 357 void
 358 zfs_znode_dmu_fini(znode_t *zp)
 359 {
 360         ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) ||
 361             RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));
 362
 363         sa_handle_destroy(zp->z_sa_hdl);
 364         zp->z_sa_hdl = NULL;
 365 }
 366
 367 /*
 368  * Called by new_inode() to allocate a new inode.
 369  */
 370 int
 371 zfs_inode_alloc(struct super_block *sb, struct inode **ip)
 372 {
 373         znode_t *zp;
 374
 375         zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
 376         *ip = ZTOI(zp);
 377
 378         return (0);
 379 }
 380
 381 /*
 382  * Called in multiple places when an inode should be destroyed.
 383  */
 384 void
 385 zfs_inode_destroy(struct inode *ip)
 386 {
 387         znode_t *zp = ITOZ(ip);
 388         zfsvfs_t *zfsvfs = ZTOZSB(zp);
 389
 390         mutex_enter(&zfsvfs->z_znodes_lock);
 391         if (list_link_active(&zp->z_link_node)) {
 392                 list_remove(&zfsvfs->z_all_znodes, zp);
 393         }
 394         mutex_exit(&zfsvfs->z_znodes_lock);
 395
 396         if (zp->z_acl_cached) {
 397                 zfs_acl_free(zp->z_acl_cached);
 398                 zp->z_acl_cached = NULL;
 399         }
 400
 401         if (zp->z_xattr_cached) {
 402                 nvlist_free(zp->z_xattr_cached);
 403                 zp->z_xattr_cached = NULL;
 404         }
 405
 406         kmem_cache_free(znode_cache, zp);
 407 }
 408
 409 static void
 410 zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
 411 {
 412         uint64_t rdev = 0;
 413
 414         switch (ip->i_mode & S_IFMT) {
 415         case S_IFREG:
 416                 ip->i_op = &zpl_inode_operations;
 417 #ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
 418                 ip->i_fop = &zpl_file_operations.kabi_fops;
 419 #else
 420                 ip->i_fop = &zpl_file_operations;
 421 #endif
 422                 ip->i_mapping->a_ops = &zpl_address_space_operations;
 423                 break;
 424
 425         case S_IFDIR:
 426 #ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
 427                 ip->i_flags |= S_IOPS_WRAPPER;
 428                 ip->i_op = &zpl_dir_inode_operations.ops;
 429 #else
 430                 ip->i_op = &zpl_dir_inode_operations;
 431 #endif
 432                 ip->i_fop = &zpl_dir_file_operations;
 433                 ITOZ(ip)->z_zn_prefetch = B_TRUE;
 434                 break;
 435
 436         case S_IFLNK:
 437                 ip->i_op = &zpl_symlink_inode_operations;
 438                 break;
 439
 440         /*
 441          * rdev is only stored in a SA only for device files.
 442          */
 443         case S_IFCHR:
 444         case S_IFBLK:
 445                 (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
 446                     sizeof (rdev));
 447                 zfs_fallthrough;
 448         case S_IFIFO:
 449         case S_IFSOCK:
 450                 init_special_inode(ip, ip->i_mode, rdev);
 451                 ip->i_op = &zpl_special_inode_operations;
 452                 break;
 453
 454         default:
 455                 zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
 456                     (u_longlong_t)ip->i_ino, ip->i_mode);
 457
 458                 /* Assume the inode is a file and attempt to continue */
 459                 ip->i_mode = S_IFREG | 0644;
 460                 ip->i_op = &zpl_inode_operations;
 461 #ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
 462                 ip->i_fop = &zpl_file_operations.kabi_fops;
 463 #else
 464                 ip->i_fop = &zpl_file_operations;
 465 #endif
 466                 ip->i_mapping->a_ops = &zpl_address_space_operations;
 467                 break;
 468         }
 469 }
 470
 471 static void
 472 zfs_set_inode_flags(znode_t *zp, struct inode *ip)
 473 {
 474         /*
 475          * Linux and Solaris have different sets of file attributes, so we
 476          * restrict this conversion to the intersection of the two.
 477          */
 478 #ifdef HAVE_INODE_SET_FLAGS
 479         unsigned int flags = 0;
 480         if (zp->z_pflags & ZFS_IMMUTABLE)
 481                 flags |= S_IMMUTABLE;
 482         if (zp->z_pflags & ZFS_APPENDONLY)
 483                 flags |= S_APPEND;
 484
 485         inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
 486 #else
 487         if (zp->z_pflags & ZFS_IMMUTABLE)
 488                 ip->i_flags |= S_IMMUTABLE;
 489         else
 490                 ip->i_flags &= ~S_IMMUTABLE;
 491
 492         if (zp->z_pflags & ZFS_APPENDONLY)
 493                 ip->i_flags |= S_APPEND;
 494         else
 495                 ip->i_flags &= ~S_APPEND;
 496 #endif
 497 }
 498
 499 /*
 500  * Update the embedded inode given the znode.
 501  */
 502 void
 503 zfs_znode_update_vfs(znode_t *zp)
 504 {
 505         struct inode    *ip;
 506         uint32_t        blksize;
 507         u_longlong_t    i_blocks;
 508
 509         ASSERT(zp != NULL);
 510         ip = ZTOI(zp);
 511
 512         /* Skip .zfs control nodes which do not exist on disk. */
 513         if (zfsctl_is_node(ip))
 514                 return;
 515
 516         dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
 517
 518         spin_lock(&ip->i_lock);
 519         ip->i_mode = zp->z_mode;
 520         ip->i_blocks = i_blocks;
 521         i_size_write(ip, zp->z_size);
 522         spin_unlock(&ip->i_lock);
 523 }
 524
 525
/*
 * Construct a znode+inode and initialize it from the object's system
 * attributes.
 *
 *      IN:     zfsvfs   - file system the object belongs to
 *              db       - bonus buffer of the object; db->db_object
 *                         becomes the znode id and inode number
 *              blksz    - value stored in z_blksz
 *              obj_type - bonus type, forwarded to zfs_znode_sa_init()
 *              hdl      - existing SA handle to adopt, or NULL to have
 *                         one obtained from db
 *
 *      RETURN: the new znode, or NULL if new_inode() fails, the bulk
 *              attribute lookup fails, the stored generation is 0, or a
 *              required project ID cannot be read.
 *
 * On success the znode is appended to zfsvfs->z_all_znodes.  The inode is
 * inserted in the inode hash only when its link count is non-zero.
 *
 * This does not call dmu_set_user(); that is left to the caller, in case
 * the znode is not going to be returned.
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
    dmu_object_type_t obj_type, sa_handle_t *hdl)
{
        znode_t *zp;
        struct inode *ip;
        uint64_t mode;
        uint64_t parent;
        uint64_t tmp_gen;
        uint64_t links;
        uint64_t z_uid, z_gid;
        uint64_t atime[2], mtime[2], ctime[2], btime[2];
        inode_timespec_t tmp_ts;
        uint64_t projid = ZFS_DEFAULT_PROJID;
        /* One slot for each of the 12 SA_ADD_BULK_ATTR() calls below. */
        sa_bulk_attr_t bulk[12];
        int count = 0;

        ASSERT(zfsvfs != NULL);

        ip = new_inode(zfsvfs->z_sb);
        if (ip == NULL)
                return (NULL);

        /* Reset the per-use znode state before attaching the SA handle. */
        zp = ITOZ(ip);
        ASSERT(zp->z_dirlocks == NULL);
        ASSERT3P(zp->z_acl_cached, ==, NULL);
        ASSERT3P(zp->z_xattr_cached, ==, NULL);
        zp->z_unlinked = B_FALSE;
        zp->z_atime_dirty = B_FALSE;
#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE)
        zp->z_is_mapped = B_FALSE;
#endif
        zp->z_is_ctldir = B_FALSE;
        zp->z_suspended = B_FALSE;
        zp->z_sa_hdl = NULL;
        zp->z_mapcnt = 0;
        zp->z_id = db->db_object;
        zp->z_blksz = blksz;
        /* 0x7A4653 is "zFS" in ASCII. */
        zp->z_seq = 0x7A4653;
        zp->z_sync_cnt = 0;
        zp->z_sync_writes_cnt = 0;
        zp->z_async_writes_cnt = 0;

        zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

        /* Fetch every attribute needed to populate the znode and inode. */
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
            &zp->z_size, 8);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
            &zp->z_pflags, 8);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
            &parent, 8);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
        SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);

        /*
         * Fail if the bulk lookup fails, if the generation is 0, or if
         * project quota is enabled and the object is flagged ZFS_PROJID
         * but its project ID cannot be read.  The SA handle is destroyed
         * here only when it was obtained by this function (hdl == NULL);
         * a handle supplied by the caller is merely detached.
         */
        if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
            (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
            (zp->z_pflags & ZFS_PROJID) &&
            sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
                if (hdl == NULL)
                        sa_handle_destroy(zp->z_sa_hdl);
                zp->z_sa_hdl = NULL;
                goto error;
        }

        /* projid stays ZFS_DEFAULT_PROJID unless it was looked up above. */
        zp->z_projid = projid;
        zp->z_mode = ip->i_mode = mode;
        ip->i_generation = (uint32_t)tmp_gen;
        ip->i_blkbits = SPA_MINBLOCKSHIFT;
        set_nlink(ip, (uint32_t)links);
        zfs_uid_write(ip, z_uid);
        zfs_gid_write(ip, z_gid);
        zfs_set_inode_flags(zp, ip);

        /* Cache the xattr parent id */
        if (zp->z_pflags & ZFS_XATTR)
                zp->z_xattr_parent = parent;

        /* Decode the on-disk timestamps into the inode and z_btime. */
        ZFS_TIME_DECODE(&tmp_ts, atime);
        zpl_inode_set_atime_to_ts(ip, tmp_ts);
        ZFS_TIME_DECODE(&tmp_ts, mtime);
        zpl_inode_set_mtime_to_ts(ip, tmp_ts);
        ZFS_TIME_DECODE(&tmp_ts, ctime);
        zpl_inode_set_ctime_to_ts(ip, tmp_ts);
        ZFS_TIME_DECODE(&zp->z_btime, btime);

        ip->i_ino = zp->z_id;
        zfs_znode_update_vfs(zp);
        zfs_inode_set_ops(zfsvfs, ip);

        /*
         * The only way insert_inode_locked() can fail is if the ip->i_ino
         * number is already hashed for this super block.  This can never
         * happen because the inode numbers map 1:1 with the object numbers.
         *
         * Exceptions include rolling back a mounted file system, either
         * from the zfs rollback or zfs recv command.
         *
         * Active inodes are unhashed during the rollback, but since zrele
         * can happen asynchronously, we can't guarantee they've been
         * unhashed.  This can cause hash collisions in unlinked drain
         * processing so do not hash unlinked znodes.
         */
        if (links > 0)
                VERIFY3S(insert_inode_locked(ip), ==, 0);

        mutex_enter(&zfsvfs->z_znodes_lock);
        list_insert_tail(&zfsvfs->z_all_znodes, zp);
        mutex_exit(&zfsvfs->z_znodes_lock);

        /* Pairs with insert_inode_locked() above, same links > 0 test. */
        if (links > 0)
                unlock_new_inode(ip);
        return (zp);

error:
        /* Drop the reference taken by new_inode(). */
        iput(ip);
        return (NULL);
}
 657
 658 /*
 659  * Safely mark an inode dirty.  Inodes which are part of a read-only
 660  * file system or snapshot may not be dirtied.
 661  */
 662 void
 663 zfs_mark_inode_dirty(struct inode *ip)
 664 {
 665         zfsvfs_t *zfsvfs = ITOZSB(ip);
 666
 667         if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
 668                 return;
 669
 670         mark_inode_dirty(ip);
 671 }
 672
/*
 * Zero-initialized file-scope objects.  zfs_mknode() passes empty_xattr as
 * the SA_ZPL_XATTR value for DMU_OT_ZNODE objects.  pad and acl_phys are
 * presumably used as attribute values in the same way -- TODO confirm,
 * their uses are not visible in this part of the file.
 */
static uint64_t empty_xattr;
static uint64_t pad[4];
static zfs_acl_phys_t acl_phys;
 676 /*
 677  * Create a new DMU object to hold a zfs znode.
 678  *
 679  *      IN:     dzp     - parent directory for new znode
 680  *              vap     - file attributes for new znode
 681  *              tx      - dmu transaction id for zap operations
 682  *              cr      - credentials of caller
 683  *              flag    - flags:
 684  *                        IS_ROOT_NODE  - new object will be root
 685  *                        IS_TMPFILE    - new object is of O_TMPFILE
 686  *                        IS_XATTR      - new object is an attribute
 687  *              acl_ids - ACL related attributes
 688  *
 689  *      OUT:    zpp     - allocated znode (set to dzp if IS_ROOT_NODE)
 690  *
 691  */
 692 void
 693 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
 694     uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
 695 {
 696         uint64_t        crtime[2], atime[2], mtime[2], ctime[2];
 697         uint64_t        mode, size, links, parent, pflags;
 698         uint64_t        projid = ZFS_DEFAULT_PROJID;
 699         uint64_t        rdev = 0;
 700         zfsvfs_t        *zfsvfs = ZTOZSB(dzp);
 701         dmu_buf_t       *db;
 702         inode_timespec_t now;
 703         uint64_t        gen, obj;
 704         int             bonuslen;
 705         int             dnodesize;
 706         sa_handle_t     *sa_hdl;
 707         dmu_object_type_t obj_type;
 708         sa_bulk_attr_t  *sa_attrs;
 709         int             cnt = 0;
 710         zfs_acl_locator_cb_t locate = { 0 };
 711         znode_hold_t    *zh;
 712
 713         if (zfsvfs->z_replay) {
 714                 obj = vap->va_nodeid;
 715                 now = vap->va_ctime;            /* see zfs_replay_create() */
 716                 gen = vap->va_nblocks;          /* ditto */
 717                 dnodesize = vap->va_fsid;       /* ditto */
 718         } else {
 719                 obj = 0;
 720                 gethrestime(&now);
 721                 gen = dmu_tx_get_txg(tx);
 722                 dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
 723         }
 724
 725         if (dnodesize == 0)
 726                 dnodesize = DNODE_MIN_SIZE;
 727
 728         obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
 729
 730         bonuslen = (obj_type == DMU_OT_SA) ?
 731             DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
 732
 733         /*
 734          * Create a new DMU object.
 735          */
 736         /*
 737          * There's currently no mechanism for pre-reading the blocks that will
 738          * be needed to allocate a new object, so we accept the small chance
 739          * that there will be an i/o error and we will fail one of the
 740          * assertions below.
 741          */
 742         if (S_ISDIR(vap->va_mode)) {
 743                 if (zfsvfs->z_replay) {
 744                         VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
 745                             zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
 746                             obj_type, bonuslen, dnodesize, tx));
 747                 } else {
 748                         obj = zap_create_norm_dnsize(zfsvfs->z_os,
 749                             zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
 750                             obj_type, bonuslen, dnodesize, tx);
 751                 }
 752         } else {
 753                 if (zfsvfs->z_replay) {
 754                         VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
 755                             DMU_OT_PLAIN_FILE_CONTENTS, 0,
 756                             obj_type, bonuslen, dnodesize, tx));
 757                 } else {
 758                         obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
 759                             DMU_OT_PLAIN_FILE_CONTENTS, 0,
 760                             obj_type, bonuslen, dnodesize, tx);
 761                 }
 762         }
 763
 764         zh = zfs_znode_hold_enter(zfsvfs, obj);
 765         VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
 766
 767         /*
 768          * If this is the root, fix up the half-initialized parent pointer
 769          * to reference the just-allocated physical data area.
 770          */
 771         if (flag & IS_ROOT_NODE) {
 772                 dzp->z_id = obj;
 773         }
 774
 775         /*
 776          * If parent is an xattr, so am I.
 777          */
 778         if (dzp->z_pflags & ZFS_XATTR) {
 779                 flag |= IS_XATTR;
 780         }
 781
 782         if (zfsvfs->z_use_fuids)
 783                 pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
 784         else
 785                 pflags = 0;
 786
 787         if (S_ISDIR(vap->va_mode)) {
 788                 size = 2;               /* contents ("." and "..") */
 789                 links = 2;
 790         } else {
 791                 size = 0;
 792                 links = (flag & IS_TMPFILE) ? 0 : 1;
 793         }
 794
 795         if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
 796                 rdev = vap->va_rdev;
 797
 798         parent = dzp->z_id;
 799         mode = acl_ids->z_mode;
 800         if (flag & IS_XATTR)
 801                 pflags |= ZFS_XATTR;
 802
 803         if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
 804                 /*
 805                  * With ZFS_PROJID flag, we can easily know whether there is
 806                  * project ID stored on disk or not. See zfs_space_delta_cb().
 807                  */
 808                 if (obj_type != DMU_OT_ZNODE &&
 809                     dmu_objset_projectquota_enabled(zfsvfs->z_os))
 810                         pflags |= ZFS_PROJID;
 811
 812                 /*
 813                  * Inherit project ID from parent if required.
 814                  */
 815                 projid = zfs_inherit_projid(dzp);
 816                 if (dzp->z_pflags & ZFS_PROJINHERIT)
 817                         pflags |= ZFS_PROJINHERIT;
 818         }
 819
 820         /*
 821          * No execs denied will be determined when zfs_mode_compute() is called.
 822          */
 823         pflags |= acl_ids->z_aclp->z_hints &
 824             (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
 825             ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
 826
 827         ZFS_TIME_ENCODE(&now, crtime);
 828         ZFS_TIME_ENCODE(&now, ctime);
 829
 830         if (vap->va_mask & ATTR_ATIME) {
 831                 ZFS_TIME_ENCODE(&vap->va_atime, atime);
 832         } else {
 833                 ZFS_TIME_ENCODE(&now, atime);
 834         }
 835
 836         if (vap->va_mask & ATTR_MTIME) {
 837                 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
 838         } else {
 839                 ZFS_TIME_ENCODE(&now, mtime);
 840         }
 841
 842         /* Now add in all of the "SA" attributes */
 843         VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
 844             &sa_hdl));
 845
 846         /*
 847          * Setup the array of attributes to be replaced/set on the new file
 848          *
 849          * order for  DMU_OT_ZNODE is critical since it needs to be constructed
 850          * in the old znode_phys_t format.  Don't change this ordering
 851          */
 852         sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
 853
 854         if (obj_type == DMU_OT_ZNODE) {
 855                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
 856                     NULL, &atime, 16);
 857                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
 858                     NULL, &mtime, 16);
 859                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
 860                     NULL, &ctime, 16);
 861                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
 862                     NULL, &crtime, 16);
 863                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
 864                     NULL, &gen, 8);
 865                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
 866                     NULL, &mode, 8);
 867                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
 868                     NULL, &size, 8);
 869                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
 870                     NULL, &parent, 8);
 871         } else {
 872                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
 873                     NULL, &mode, 8);
 874                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
 875                     NULL, &size, 8);
 876                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
 877                     NULL, &gen, 8);
 878                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
 879                     NULL, &acl_ids->z_fuid, 8);
 880                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
 881                     NULL, &acl_ids->z_fgid, 8);
 882                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
 883                     NULL, &parent, 8);
 884                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
 885                     NULL, &pflags, 8);
 886                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
 887                     NULL, &atime, 16);
 888                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
 889                     NULL, &mtime, 16);
 890                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
 891                     NULL, &ctime, 16);
 892                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
 893                     NULL, &crtime, 16);
 894         }
 895
 896         SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
 897
 898         if (obj_type == DMU_OT_ZNODE) {
 899                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
 900                     &empty_xattr, 8);
 901         } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
 902             pflags & ZFS_PROJID) {
 903                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
 904                     NULL, &projid, 8);
 905         }
 906         if (obj_type == DMU_OT_ZNODE ||
 907             (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
 908                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
 909                     NULL, &rdev, 8);
 910         }
 911         if (obj_type == DMU_OT_ZNODE) {
 912                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
 913                     NULL, &pflags, 8);
 914                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
 915                     &acl_ids->z_fuid, 8);
 916                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
 917                     &acl_ids->z_fgid, 8);
 918                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
 919                     sizeof (uint64_t) * 4);
 920                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
 921                     &acl_phys, sizeof (zfs_acl_phys_t));
 922         } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
 923                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
 924                     &acl_ids->z_aclp->z_acl_count, 8);
 925                 locate.cb_aclp = acl_ids->z_aclp;
 926                 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
 927                     zfs_acl_data_locator, &locate,
 928                     acl_ids->z_aclp->z_acl_bytes);
 929                 mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
 930                     acl_ids->z_fuid, acl_ids->z_fgid);
 931         }
 932
 933         VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
 934
 935         if (!(flag & IS_ROOT_NODE)) {
 936                 /*
 937                  * The call to zfs_znode_alloc() may fail if memory is low
 938                  * via the call path: alloc_inode() -> inode_init_always() ->
 939                  * security_inode_alloc() -> inode_alloc_security().  Since
 940                  * the existing code is written such that zfs_mknode() can
 941                  * not fail retry until sufficient memory has been reclaimed.
 942                  */
 943                 do {
 944                         *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
 945                 } while (*zpp == NULL);
 946
 947                 VERIFY(*zpp != NULL);
 948                 VERIFY(dzp != NULL);
 949         } else {
 950                 /*
 951                  * If we are creating the root node, the "parent" we
 952                  * passed in is the znode for the root.
 953                  */
 954                 *zpp = dzp;
 955
 956                 (*zpp)->z_sa_hdl = sa_hdl;
 957         }
 958
 959         (*zpp)->z_pflags = pflags;
 960         (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
 961         (*zpp)->z_dnodesize = dnodesize;
 962         (*zpp)->z_projid = projid;
 963
 964         if (obj_type == DMU_OT_ZNODE ||
 965             acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
 966                 VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
 967         }
 968         kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
 969         zfs_znode_hold_exit(zfsvfs, zh);
 970 }
 971
 972 /*
 973  * Update in-core attributes.  It is assumed the caller will be doing an
 974  * sa_bulk_update to push the changes out.
 975  */
 976 void
 977 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
 978 {
 979         xoptattr_t *xoap;
 980         boolean_t update_inode = B_FALSE;
 981
 982         xoap = xva_getxoptattr(xvap);
 983         ASSERT(xoap);
 984
 985         if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
 986                 uint64_t times[2];
 987                 ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
 988                 (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
 989                     &times, sizeof (times), tx);
 990                 XVA_SET_RTN(xvap, XAT_CREATETIME);
 991         }
 992         if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
 993                 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
 994                     zp->z_pflags, tx);
 995                 XVA_SET_RTN(xvap, XAT_READONLY);
 996         }
 997         if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
 998                 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
 999                     zp->z_pflags, tx);
1000                 XVA_SET_RTN(xvap, XAT_HIDDEN);
1001         }
1002         if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1003                 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
1004                     zp->z_pflags, tx);
1005                 XVA_SET_RTN(xvap, XAT_SYSTEM);
1006         }
1007         if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1008                 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
1009                     zp->z_pflags, tx);
1010                 XVA_SET_RTN(xvap, XAT_ARCHIVE);
1011         }
1012         if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1013                 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
1014                     zp->z_pflags, tx);
1015                 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
1016
1017                 update_inode = B_TRUE;
1018         }
1019         if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1020                 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
1021                     zp->z_pflags, tx);
1022                 XVA_SET_RTN(xvap, XAT_NOUNLINK);
1023         }
1024         if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1025                 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
1026                     zp->z_pflags, tx);
1027                 XVA_SET_RTN(xvap, XAT_APPENDONLY);
1028
1029                 update_inode = B_TRUE;
1030         }
1031         if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1032                 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
1033                     zp->z_pflags, tx);
1034                 XVA_SET_RTN(xvap, XAT_NODUMP);
1035         }
1036         if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1037                 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
1038                     zp->z_pflags, tx);
1039                 XVA_SET_RTN(xvap, XAT_OPAQUE);
1040         }
1041         if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1042                 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
1043                     xoap->xoa_av_quarantined, zp->z_pflags, tx);
1044                 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1045         }
1046         if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1047                 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
1048                     zp->z_pflags, tx);
1049                 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1050         }
1051         if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
1052                 zfs_sa_set_scanstamp(zp, xvap, tx);
1053                 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
1054         }
1055         if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1056                 ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
1057                     zp->z_pflags, tx);
1058                 XVA_SET_RTN(xvap, XAT_REPARSE);
1059         }
1060         if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1061                 ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
1062                     zp->z_pflags, tx);
1063                 XVA_SET_RTN(xvap, XAT_OFFLINE);
1064         }
1065         if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1066                 ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
1067                     zp->z_pflags, tx);
1068                 XVA_SET_RTN(xvap, XAT_SPARSE);
1069         }
1070         if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
1071                 ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
1072                     zp->z_pflags, tx);
1073                 XVA_SET_RTN(xvap, XAT_PROJINHERIT);
1074         }
1075
1076         if (update_inode)
1077                 zfs_set_inode_flags(zp, ZTOI(zp));
1078 }
1079
1080 int
1081 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
1082 {
1083         dmu_object_info_t doi;
1084         dmu_buf_t       *db;
1085         znode_t         *zp;
1086         znode_hold_t    *zh;
1087         int err;
1088         sa_handle_t     *hdl;
1089
1090         *zpp = NULL;
1091
1092 again:
1093         zh = zfs_znode_hold_enter(zfsvfs, obj_num);
1094
1095         err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1096         if (err) {
1097                 zfs_znode_hold_exit(zfsvfs, zh);
1098                 return (err);
1099         }
1100
1101         dmu_object_info_from_db(db, &doi);
1102         if (doi.doi_bonus_type != DMU_OT_SA &&
1103             (doi.doi_bonus_type != DMU_OT_ZNODE ||
1104             (doi.doi_bonus_type == DMU_OT_ZNODE &&
1105             doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1106                 sa_buf_rele(db, NULL);
1107                 zfs_znode_hold_exit(zfsvfs, zh);
1108                 return (SET_ERROR(EINVAL));
1109         }
1110
1111         hdl = dmu_buf_get_user(db);
1112         if (hdl != NULL) {
1113                 zp = sa_get_userdata(hdl);
1114
1115
1116                 /*
1117                  * Since "SA" does immediate eviction we
1118                  * should never find a sa handle that doesn't
1119                  * know about the znode.
1120                  */
1121
1122                 ASSERT3P(zp, !=, NULL);
1123
1124                 mutex_enter(&zp->z_lock);
1125                 ASSERT3U(zp->z_id, ==, obj_num);
1126                 /*
1127                  * If zp->z_unlinked is set, the znode is already marked
1128                  * for deletion and should not be discovered. Check this
1129                  * after checking igrab() due to fsetxattr() & O_TMPFILE.
1130                  *
1131                  * If igrab() returns NULL the VFS has independently
1132                  * determined the inode should be evicted and has
1133                  * called iput_final() to start the eviction process.
1134                  * The SA handle is still valid but because the VFS
1135                  * requires that the eviction succeed we must drop
1136                  * our locks and references to allow the eviction to
1137                  * complete.  The zfs_zget() may then be retried.
1138                  *
1139                  * This unlikely case could be optimized by registering
1140                  * a sops->drop_inode() callback.  The callback would
1141                  * need to detect the active SA hold thereby informing
1142                  * the VFS that this inode should not be evicted.
1143                  */
1144                 if (igrab(ZTOI(zp)) == NULL) {
1145                         if (zp->z_unlinked)
1146                                 err = SET_ERROR(ENOENT);
1147                         else
1148                                 err = SET_ERROR(EAGAIN);
1149                 } else {
1150                         *zpp = zp;
1151                         err = 0;
1152                 }
1153
1154                 mutex_exit(&zp->z_lock);
1155                 sa_buf_rele(db, NULL);
1156                 zfs_znode_hold_exit(zfsvfs, zh);
1157
1158                 if (err == EAGAIN) {
1159                         /* inode might need this to finish evict */
1160                         cond_resched();
1161                         goto again;
1162                 }
1163                 return (err);
1164         }
1165
1166         /*
1167          * Not found create new znode/vnode but only if file exists.
1168          *
1169          * There is a small window where zfs_vget() could
1170          * find this object while a file create is still in
1171          * progress.  This is checked for in zfs_znode_alloc()
1172          *
1173          * if zfs_znode_alloc() fails it will drop the hold on the
1174          * bonus buffer.
1175          */
1176         zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
1177             doi.doi_bonus_type, NULL);
1178         if (zp == NULL) {
1179                 err = SET_ERROR(ENOENT);
1180         } else {
1181                 *zpp = zp;
1182         }
1183         zfs_znode_hold_exit(zfsvfs, zh);
1184         return (err);
1185 }
1186
1187 int
1188 zfs_rezget(znode_t *zp)
1189 {
1190         zfsvfs_t *zfsvfs = ZTOZSB(zp);
1191         dmu_object_info_t doi;
1192         dmu_buf_t *db;
1193         uint64_t obj_num = zp->z_id;
1194         uint64_t mode;
1195         uint64_t links;
1196         sa_bulk_attr_t bulk[11];
1197         int err;
1198         int count = 0;
1199         uint64_t gen;
1200         uint64_t z_uid, z_gid;
1201         uint64_t atime[2], mtime[2], ctime[2], btime[2];
1202         inode_timespec_t tmp_ts;
1203         uint64_t projid = ZFS_DEFAULT_PROJID;
1204         znode_hold_t *zh;
1205
1206         /*
1207          * skip ctldir, otherwise they will always get invalidated. This will
1208          * cause funny behaviour for the mounted snapdirs. Especially for
1209          * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent
1210          * anyone automount it again as long as someone is still using the
1211          * detached mount.
1212          */
1213         if (zp->z_is_ctldir)
1214                 return (0);
1215
1216         zh = zfs_znode_hold_enter(zfsvfs, obj_num);
1217
1218         mutex_enter(&zp->z_acl_lock);
1219         if (zp->z_acl_cached) {
1220                 zfs_acl_free(zp->z_acl_cached);
1221                 zp->z_acl_cached = NULL;
1222         }
1223         mutex_exit(&zp->z_acl_lock);
1224
1225         rw_enter(&zp->z_xattr_lock, RW_WRITER);
1226         if (zp->z_xattr_cached) {
1227                 nvlist_free(zp->z_xattr_cached);
1228                 zp->z_xattr_cached = NULL;
1229         }
1230         rw_exit(&zp->z_xattr_lock);
1231
1232         ASSERT(zp->z_sa_hdl == NULL);
1233         err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1234         if (err) {
1235                 zfs_znode_hold_exit(zfsvfs, zh);
1236                 return (err);
1237         }
1238
1239         dmu_object_info_from_db(db, &doi);
1240         if (doi.doi_bonus_type != DMU_OT_SA &&
1241             (doi.doi_bonus_type != DMU_OT_ZNODE ||
1242             (doi.doi_bonus_type == DMU_OT_ZNODE &&
1243             doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1244                 sa_buf_rele(db, NULL);
1245                 zfs_znode_hold_exit(zfsvfs, zh);
1246                 return (SET_ERROR(EINVAL));
1247         }
1248
1249         zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
1250
1251         /* reload cached values */
1252         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
1253             &gen, sizeof (gen));
1254         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1255             &zp->z_size, sizeof (zp->z_size));
1256         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
1257             &links, sizeof (links));
1258         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1259             &zp->z_pflags, sizeof (zp->z_pflags));
1260         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1261             &z_uid, sizeof (z_uid));
1262         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1263             &z_gid, sizeof (z_gid));
1264         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
1265             &mode, sizeof (mode));
1266         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
1267             &atime, 16);
1268         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
1269             &mtime, 16);
1270         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
1271             &ctime, 16);
1272         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);
1273
1274         if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
1275                 zfs_znode_dmu_fini(zp);
1276                 zfs_znode_hold_exit(zfsvfs, zh);
1277                 return (SET_ERROR(EIO));
1278         }
1279
1280         if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
1281                 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
1282                     &projid, 8);
1283                 if (err != 0 && err != ENOENT) {
1284                         zfs_znode_dmu_fini(zp);
1285                         zfs_znode_hold_exit(zfsvfs, zh);
1286                         return (SET_ERROR(err));
1287                 }
1288         }
1289
1290         zp->z_projid = projid;
1291         zp->z_mode = ZTOI(zp)->i_mode = mode;
1292         zfs_uid_write(ZTOI(zp), z_uid);
1293         zfs_gid_write(ZTOI(zp), z_gid);
1294
1295         ZFS_TIME_DECODE(&tmp_ts, atime);
1296         zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts);
1297         ZFS_TIME_DECODE(&tmp_ts, mtime);
1298         zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
1299         ZFS_TIME_DECODE(&tmp_ts, ctime);
1300         zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
1301         ZFS_TIME_DECODE(&zp->z_btime, btime);
1302
1303         if ((uint32_t)gen != ZTOI(zp)->i_generation) {
1304                 zfs_znode_dmu_fini(zp);
1305                 zfs_znode_hold_exit(zfsvfs, zh);
1306                 return (SET_ERROR(EIO));
1307         }
1308
1309         set_nlink(ZTOI(zp), (uint32_t)links);
1310         zfs_set_inode_flags(zp, ZTOI(zp));
1311
1312         zp->z_blksz = doi.doi_data_block_size;
1313         zp->z_atime_dirty = B_FALSE;
1314         zfs_znode_update_vfs(zp);
1315
1316         /*
1317          * If the file has zero links, then it has been unlinked on the send
1318          * side and it must be in the received unlinked set.
1319          * We call zfs_znode_dmu_fini() now to prevent any accesses to the
1320          * stale data and to prevent automatic removal of the file in
1321          * zfs_zinactive().  The file will be removed either when it is removed
1322          * on the send side and the next incremental stream is received or
1323          * when the unlinked set gets processed.
1324          */
1325         zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
1326         if (zp->z_unlinked)
1327                 zfs_znode_dmu_fini(zp);
1328
1329         zfs_znode_hold_exit(zfsvfs, zh);
1330
1331         return (0);
1332 }
1333
1334 void
1335 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1336 {
1337         zfsvfs_t *zfsvfs = ZTOZSB(zp);
1338         objset_t *os = zfsvfs->z_os;
1339         uint64_t obj = zp->z_id;
1340         uint64_t acl_obj = zfs_external_acl(zp);
1341         znode_hold_t *zh;
1342
1343         zh = zfs_znode_hold_enter(zfsvfs, obj);
1344         if (acl_obj) {
1345                 VERIFY(!zp->z_is_sa);
1346                 VERIFY(0 == dmu_object_free(os, acl_obj, tx));
1347         }
1348         VERIFY(0 == dmu_object_free(os, obj, tx));
1349         zfs_znode_dmu_fini(zp);
1350         zfs_znode_hold_exit(zfsvfs, zh);
1351 }
1352
1353 void
1354 zfs_zinactive(znode_t *zp)
1355 {
1356         zfsvfs_t *zfsvfs = ZTOZSB(zp);
1357         uint64_t z_id = zp->z_id;
1358         znode_hold_t *zh;
1359
1360         ASSERT(zp->z_sa_hdl);
1361
1362         /*
1363          * Don't allow a zfs_zget() while were trying to release this znode.
1364          */
1365         zh = zfs_znode_hold_enter(zfsvfs, z_id);
1366
1367         mutex_enter(&zp->z_lock);
1368
1369         /*
1370          * If this was the last reference to a file with no links, remove
1371          * the file from the file system unless the file system is mounted
1372          * read-only.  That can happen, for example, if the file system was
1373          * originally read-write, the file was opened, then unlinked and
1374          * the file system was made read-only before the file was finally
1375          * closed.  The file will remain in the unlinked set.
1376          */
1377         if (zp->z_unlinked) {
1378                 ASSERT(!zfsvfs->z_issnap);
1379                 if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
1380                         mutex_exit(&zp->z_lock);
1381                         zfs_znode_hold_exit(zfsvfs, zh);
1382                         zfs_rmnode(zp);
1383                         return;
1384                 }
1385         }
1386
1387         mutex_exit(&zp->z_lock);
1388         zfs_znode_dmu_fini(zp);
1389
1390         zfs_znode_hold_exit(zfsvfs, zh);
1391 }
1392
1393 #if defined(HAVE_INODE_TIMESPEC64_TIMES)
1394 #define zfs_compare_timespec timespec64_compare
1395 #else
1396 #define zfs_compare_timespec timespec_compare
1397 #endif
1398
1399 /*
1400  * Determine whether the znode's atime must be updated.  The logic mostly
1401  * duplicates the Linux kernel's relatime_need_update() functionality.
1402  * This function is only called if the underlying filesystem actually has
1403  * atime updates enabled.
1404  */
1405 boolean_t
1406 zfs_relatime_need_update(const struct inode *ip)
1407 {
1408         inode_timespec_t now, tmp_atime, tmp_ts;
1409
1410         gethrestime(&now);
1411         tmp_atime = zpl_inode_get_atime(ip);
1412         /*
1413          * In relatime mode, only update the atime if the previous atime
1414          * is earlier than either the ctime or mtime or if at least a day
1415          * has passed since the last update of atime.
1416          */
1417         tmp_ts = zpl_inode_get_mtime(ip);
1418         if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0)
1419                 return (B_TRUE);
1420
1421         tmp_ts = zpl_inode_get_ctime(ip);
1422         if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0)
1423                 return (B_TRUE);
1424
1425         if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60)
1426                 return (B_TRUE);
1427
1428         return (B_FALSE);
1429 }
1430
1431 /*
1432  * Prepare to update znode time stamps.
1433  *
1434  *      IN:     zp      - znode requiring timestamp update
1435  *              flag    - ATTR_MTIME, ATTR_CTIME flags
1436  *
1437  *      OUT:    zp      - z_seq
1438  *              mtime   - new mtime
1439  *              ctime   - new ctime
1440  *
1441  *      Note: We don't update atime here, because we rely on Linux VFS to do
1442  *      atime updating.
1443  */
1444 void
1445 zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
1446     uint64_t ctime[2])
1447 {
1448         inode_timespec_t now, tmp_ts;
1449
1450         gethrestime(&now);
1451
1452         zp->z_seq++;
1453
1454         if (flag & ATTR_MTIME) {
1455                 ZFS_TIME_ENCODE(&now, mtime);
1456                 ZFS_TIME_DECODE(&tmp_ts, mtime);
1457                 zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
1458                 if (ZTOZSB(zp)->z_use_fuids) {
1459                         zp->z_pflags |= (ZFS_ARCHIVE |
1460                             ZFS_AV_MODIFIED);
1461                 }
1462         }
1463
1464         if (flag & ATTR_CTIME) {
1465                 ZFS_TIME_ENCODE(&now, ctime);
1466                 ZFS_TIME_DECODE(&tmp_ts, ctime);
1467                 zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
1468                 if (ZTOZSB(zp)->z_use_fuids)
1469                         zp->z_pflags |= ZFS_ARCHIVE;
1470         }
1471 }
1472
1473 /*
1474  * Grow the block size for a file.
1475  *
1476  *      IN:     zp      - znode of file to free data in.
1477  *              size    - requested block size
1478  *              tx      - open transaction.
1479  *
1480  * NOTE: this function assumes that the znode is write locked.
1481  */
1482 void
1483 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1484 {
1485         int             error;
1486         u_longlong_t    dummy;
1487
1488         if (size <= zp->z_blksz)
1489                 return;
1490         /*
1491          * If the file size is already greater than the current blocksize,
1492          * we will not grow.  If there is more than one block in a file,
1493          * the blocksize cannot change.
1494          */
1495         if (zp->z_blksz && zp->z_size > zp->z_blksz)
1496                 return;
1497
1498         error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
1499             size, 0, tx);
1500
1501         if (error == ENOTSUP)
1502                 return;
1503         ASSERT0(error);
1504
1505         /* What blocksize did we actually get? */
1506         dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
1507 }
1508
1509 /*
1510  * Increase the file length
1511  *
1512  *      IN:     zp      - znode of file to free data in.
1513  *              end     - new end-of-file
1514  *
1515  *      RETURN: 0 on success, error code on failure
1516  */
1517 static int
1518 zfs_extend(znode_t *zp, uint64_t end)
1519 {
1520         zfsvfs_t *zfsvfs = ZTOZSB(zp);
1521         dmu_tx_t *tx;
1522         zfs_locked_range_t *lr;
1523         uint64_t newblksz;
1524         int error;
1525
1526         /*
1527          * We will change zp_size, lock the whole file.
1528          */
1529         lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
1530
1531         /*
1532          * Nothing to do if file already at desired length.
1533          */
1534         if (end <= zp->z_size) {
1535                 zfs_rangelock_exit(lr);
1536                 return (0);
1537         }
1538         tx = dmu_tx_create(zfsvfs->z_os);
1539         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1540         zfs_sa_upgrade_txholds(tx, zp);
1541         if (end > zp->z_blksz &&
1542             (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1543                 /*
1544                  * We are growing the file past the current block size.
1545                  */
1546                 if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
1547                         /*
1548                          * File's blocksize is already larger than the
1549                          * "recordsize" property.  Only let it grow to
1550                          * the next power of 2.
1551                          */
1552                         ASSERT(!ISP2(zp->z_blksz));
1553                         newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
1554                 } else {
1555                         newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
1556                 }
1557                 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1558         } else {
1559                 newblksz = 0;
1560         }
1561
1562         error = dmu_tx_assign(tx, TXG_WAIT);
1563         if (error) {
1564                 dmu_tx_abort(tx);
1565                 zfs_rangelock_exit(lr);
1566                 return (error);
1567         }
1568
1569         if (newblksz)
1570                 zfs_grow_blocksize(zp, newblksz, tx);
1571
1572         zp->z_size = end;
1573
1574         VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
1575             &zp->z_size, sizeof (zp->z_size), tx));
1576
1577         zfs_rangelock_exit(lr);
1578
1579         dmu_tx_commit(tx);
1580
1581         return (0);
1582 }
1583
1584 /*
1585  * zfs_zero_partial_page - Modeled after update_pages() but
1586  * with different arguments and semantics for use by zfs_freesp().
1587  *
1588  * Zeroes a piece of a single page cache entry for zp at offset
1589  * start and length len.
1590  *
1591  * Caller must acquire a range lock on the file for the region
1592  * being zeroed in order that the ARC and page cache stay in sync.
1593  */
1594 static void
1595 zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
1596 {
1597         struct address_space *mp = ZTOI(zp)->i_mapping;
1598         struct page *pp;
1599         int64_t off;
1600         void *pb;
1601
1602         ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));
1603
1604         off = start & (PAGE_SIZE - 1);
1605         start &= PAGE_MASK;
1606
1607         pp = find_lock_page(mp, start >> PAGE_SHIFT);
1608         if (pp) {
1609                 if (mapping_writably_mapped(mp))
1610                         flush_dcache_page(pp);
1611
1612                 pb = kmap(pp);
1613                 memset(pb + off, 0, len);
1614                 kunmap(pp);
1615
1616                 if (mapping_writably_mapped(mp))
1617                         flush_dcache_page(pp);
1618
1619                 mark_page_accessed(pp);
1620                 SetPageUptodate(pp);
1621                 ClearPageError(pp);
1622                 unlock_page(pp);
1623                 put_page(pp);
1624         }
1625 }
1626
1627 /*
1628  * Free space in a file.
1629  *
1630  *      IN:     zp      - znode of file to free data in.
1631  *              off     - start of section to free.
1632  *              len     - length of section to free.
1633  *
1634  *      RETURN: 0 on success, error code on failure
1635  */
1636 static int
1637 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1638 {
1639         zfsvfs_t *zfsvfs = ZTOZSB(zp);
1640         zfs_locked_range_t *lr;
1641         int error;
1642
1643         /*
1644          * Lock the range being freed.
1645          */
1646         lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
1647
1648         /*
1649          * Nothing to do if file already at desired length.
1650          */
1651         if (off >= zp->z_size) {
1652                 zfs_rangelock_exit(lr);
1653                 return (0);
1654         }
1655
1656         if (off + len > zp->z_size)
1657                 len = zp->z_size - off;
1658
1659         error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1660
1661         /*
1662          * Zero partial page cache entries.  This must be done under a
1663          * range lock in order to keep the ARC and page cache in sync.
1664          */
1665         if (zn_has_cached_data(zp, off, off + len - 1)) {
1666                 loff_t first_page, last_page, page_len;
1667                 loff_t first_page_offset, last_page_offset;
1668
1669                 /* first possible full page in hole */
1670                 first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
1671                 /* last page of hole */
1672                 last_page = (off + len) >> PAGE_SHIFT;
1673
1674                 /* offset of first_page */
1675                 first_page_offset = first_page << PAGE_SHIFT;
1676                 /* offset of last_page */
1677                 last_page_offset = last_page << PAGE_SHIFT;
1678
1679                 /* truncate whole pages */
1680                 if (last_page_offset > first_page_offset) {
1681                         truncate_inode_pages_range(ZTOI(zp)->i_mapping,
1682                             first_page_offset, last_page_offset - 1);
1683                 }
1684
1685                 /* truncate sub-page ranges */
1686                 if (first_page > last_page) {
1687                         /* entire punched area within a single page */
1688                         zfs_zero_partial_page(zp, off, len);
1689                 } else {
1690                         /* beginning of punched area at the end of a page */
1691                         page_len  = first_page_offset - off;
1692                         if (page_len > 0)
1693                                 zfs_zero_partial_page(zp, off, page_len);
1694
1695                         /* end of punched area at the beginning of a page */
1696                         page_len = off + len - last_page_offset;
1697                         if (page_len > 0)
1698                                 zfs_zero_partial_page(zp, last_page_offset,
1699                                     page_len);
1700                 }
1701         }
1702         zfs_rangelock_exit(lr);
1703
1704         return (error);
1705 }
1706
1707 /*
1708  * Truncate a file
1709  *
1710  *      IN:     zp      - znode of file to free data in.
1711  *              end     - new end-of-file.
1712  *
1713  *      RETURN: 0 on success, error code on failure
1714  */
1715 static int
1716 zfs_trunc(znode_t *zp, uint64_t end)
1717 {
1718         zfsvfs_t *zfsvfs = ZTOZSB(zp);
1719         dmu_tx_t *tx;
1720         zfs_locked_range_t *lr;
1721         int error;
1722         sa_bulk_attr_t bulk[2];
1723         int count = 0;
1724
1725         /*
1726          * We will change zp_size, lock the whole file.
1727          */
1728         lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
1729
1730         /*
1731          * Nothing to do if file already at desired length.
1732          */
1733         if (end >= zp->z_size) {
1734                 zfs_rangelock_exit(lr);
1735                 return (0);
1736         }
1737
1738         error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
1739             DMU_OBJECT_END);
1740         if (error) {
1741                 zfs_rangelock_exit(lr);
1742                 return (error);
1743         }
1744         tx = dmu_tx_create(zfsvfs->z_os);
1745         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1746         zfs_sa_upgrade_txholds(tx, zp);
1747         dmu_tx_mark_netfree(tx);
1748         error = dmu_tx_assign(tx, TXG_WAIT);
1749         if (error) {
1750                 dmu_tx_abort(tx);
1751                 zfs_rangelock_exit(lr);
1752                 return (error);
1753         }
1754
1755         zp->z_size = end;
1756         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
1757             NULL, &zp->z_size, sizeof (zp->z_size));
1758
1759         if (end == 0) {
1760                 zp->z_pflags &= ~ZFS_SPARSE;
1761                 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1762                     NULL, &zp->z_pflags, 8);
1763         }
1764         VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
1765
1766         dmu_tx_commit(tx);
1767         zfs_rangelock_exit(lr);
1768
1769         return (0);
1770 }
1771
1772 /*
1773  * Free space in a file
1774  *
1775  *      IN:     zp      - znode of file to free data in.
1776  *              off     - start of range
1777  *              len     - end of range (0 => EOF)
1778  *              flag    - current file open mode flags.
1779  *              log     - TRUE if this action should be logged
1780  *
1781  *      RETURN: 0 on success, error code on failure
1782  */
1783 int
1784 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1785 {
1786         dmu_tx_t *tx;
1787         zfsvfs_t *zfsvfs = ZTOZSB(zp);
1788         zilog_t *zilog = zfsvfs->z_log;
1789         uint64_t mode;
1790         uint64_t mtime[2], ctime[2];
1791         sa_bulk_attr_t bulk[3];
1792         int count = 0;
1793         int error;
1794
1795         if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
1796             sizeof (mode))) != 0)
1797                 return (error);
1798
1799         if (off > zp->z_size) {
1800                 error =  zfs_extend(zp, off+len);
1801                 if (error == 0 && log)
1802                         goto log;
1803                 goto out;
1804         }
1805
1806         if (len == 0) {
1807                 error = zfs_trunc(zp, off);
1808         } else {
1809                 if ((error = zfs_free_range(zp, off, len)) == 0 &&
1810                     off + len > zp->z_size)
1811                         error = zfs_extend(zp, off+len);
1812         }
1813         if (error || !log)
1814                 goto out;
1815 log:
1816         tx = dmu_tx_create(zfsvfs->z_os);
1817         dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1818         zfs_sa_upgrade_txholds(tx, zp);
1819         error = dmu_tx_assign(tx, TXG_WAIT);
1820         if (error) {
1821                 dmu_tx_abort(tx);
1822                 goto out;
1823         }
1824
1825         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
1826         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
1827         SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1828             NULL, &zp->z_pflags, 8);
1829         zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
1830         error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1831         ASSERT(error == 0);
1832
1833         zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1834
1835         dmu_tx_commit(tx);
1836
1837         zfs_znode_update_vfs(zp);
1838         error = 0;
1839
1840 out:
1841         /*
1842          * Truncate the page cache - for file truncate operations, use
1843          * the purpose-built API for truncations.  For punching operations,
1844          * the truncation is handled under a range lock in zfs_free_range.
1845          */
1846         if (len == 0)
1847                 truncate_setsize(ZTOI(zp), off);
1848         return (error);
1849 }
1850
1851 void
1852 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1853 {
1854         struct super_block *sb;
1855         zfsvfs_t        *zfsvfs;
1856         uint64_t        moid, obj, sa_obj, version;
1857         uint64_t        sense = ZFS_CASE_SENSITIVE;
1858         uint64_t        norm = 0;
1859         nvpair_t        *elem;
1860         int             size;
1861         int             error;
1862         int             i;
1863         znode_t         *rootzp = NULL;
1864         vattr_t         vattr;
1865         znode_t         *zp;
1866         zfs_acl_ids_t   acl_ids;
1867
1868         /*
1869          * First attempt to create master node.
1870          */
1871         /*
1872          * In an empty objset, there are no blocks to read and thus
1873          * there can be no i/o errors (which we assert below).
1874          */
1875         moid = MASTER_NODE_OBJ;
1876         error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1877             DMU_OT_NONE, 0, tx);
1878         ASSERT(error == 0);
1879
1880         /*
1881          * Set starting attributes.
1882          */
1883         version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
1884         elem = NULL;
1885         while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1886                 /* For the moment we expect all zpl props to be uint64_ts */
1887                 uint64_t val;
1888                 const char *name;
1889
1890                 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1891                 VERIFY(nvpair_value_uint64(elem, &val) == 0);
1892                 name = nvpair_name(elem);
1893                 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
1894                         if (val < version)
1895                                 version = val;
1896                 } else {
1897                         error = zap_update(os, moid, name, 8, 1, &val, tx);
1898                 }
1899                 ASSERT(error == 0);
1900                 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1901                         norm = val;
1902                 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1903                         sense = val;
1904         }
1905         ASSERT(version != 0);
1906         error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
1907         ASSERT(error == 0);
1908
1909         /*
1910          * Create zap object used for SA attribute registration
1911          */
1912
1913         if (version >= ZPL_VERSION_SA) {
1914                 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
1915                     DMU_OT_NONE, 0, tx);
1916                 error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
1917                 ASSERT(error == 0);
1918         } else {
1919                 sa_obj = 0;
1920         }
1921         /*
1922          * Create a delete queue.
1923          */
1924         obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
1925
1926         error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
1927         ASSERT(error == 0);
1928
1929         /*
1930          * Create root znode.  Create minimal znode/inode/zfsvfs/sb
1931          * to allow zfs_mknode to work.
1932          */
1933         vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
1934         vattr.va_mode = S_IFDIR|0755;
1935         vattr.va_uid = crgetuid(cr);
1936         vattr.va_gid = crgetgid(cr);
1937
1938         rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1939         rootzp->z_unlinked = B_FALSE;
1940         rootzp->z_atime_dirty = B_FALSE;
1941         rootzp->z_is_sa = USE_SA(version, os);
1942         rootzp->z_pflags = 0;
1943
1944         zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1945         zfsvfs->z_os = os;
1946         zfsvfs->z_parent = zfsvfs;
1947         zfsvfs->z_version = version;
1948         zfsvfs->z_use_fuids = USE_FUIDS(version, os);
1949         zfsvfs->z_use_sa = USE_SA(version, os);
1950         zfsvfs->z_norm = norm;
1951
1952         sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
1953         sb->s_fs_info = zfsvfs;
1954
1955         ZTOI(rootzp)->i_sb = sb;
1956
1957         error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
1958             &zfsvfs->z_attr_table);
1959
1960         ASSERT(error == 0);
1961
1962         /*
1963          * Fold case on file systems that are always or sometimes case
1964          * insensitive.
1965          */
1966         if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
1967                 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
1968
1969         mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1970         list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1971             offsetof(znode_t, z_link_node));
1972
1973         size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
1974         zfsvfs->z_hold_size = size;
1975         zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
1976             KM_SLEEP);
1977         zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
1978         for (i = 0; i != size; i++) {
1979                 avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
1980                     sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
1981                 mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
1982         }
1983
1984         VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
1985             cr, NULL, &acl_ids, zfs_init_idmap));
1986         zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
1987         ASSERT3P(zp, ==, rootzp);
1988         error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1989         ASSERT(error == 0);
1990         zfs_acl_ids_free(&acl_ids);
1991
1992         atomic_set(&ZTOI(rootzp)->i_count, 0);
1993         sa_handle_destroy(rootzp->z_sa_hdl);
1994         kmem_cache_free(znode_cache, rootzp);
1995
1996         for (i = 0; i != size; i++) {
1997                 avl_destroy(&zfsvfs->z_hold_trees[i]);
1998                 mutex_destroy(&zfsvfs->z_hold_locks[i]);
1999         }
2000
2001         mutex_destroy(&zfsvfs->z_znodes_lock);
2002
2003         vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
2004         vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
2005         kmem_free(sb, sizeof (struct super_block));
2006         kmem_free(zfsvfs, sizeof (zfsvfs_t));
2007 }
2008 #endif /* _KERNEL */
2009
2010 static int
2011 zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
2012 {
2013         uint64_t sa_obj = 0;
2014         int error;
2015
2016         error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
2017         if (error != 0 && error != ENOENT)
2018                 return (error);
2019
2020         error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
2021         return (error);
2022 }
2023
2024 static int
2025 zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
2026     dmu_buf_t **db, const void *tag)
2027 {
2028         dmu_object_info_t doi;
2029         int error;
2030
2031         if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
2032                 return (error);
2033
2034         dmu_object_info_from_db(*db, &doi);
2035         if ((doi.doi_bonus_type != DMU_OT_SA &&
2036             doi.doi_bonus_type != DMU_OT_ZNODE) ||
2037             (doi.doi_bonus_type == DMU_OT_ZNODE &&
2038             doi.doi_bonus_size < sizeof (znode_phys_t))) {
2039                 sa_buf_rele(*db, tag);
2040                 return (SET_ERROR(ENOTSUP));
2041         }
2042
2043         error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
2044         if (error != 0) {
2045                 sa_buf_rele(*db, tag);
2046                 return (error);
2047         }
2048
2049         return (0);
2050 }
2051
2052 static void
2053 zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, const void *tag)
2054 {
2055         sa_handle_destroy(hdl);
2056         sa_buf_rele(db, tag);
2057 }
2058
2059 /*
2060  * Given an object number, return its parent object number and whether
2061  * or not the object is an extended attribute directory.
2062  */
2063 static int
2064 zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
2065     uint64_t *pobjp, int *is_xattrdir)
2066 {
2067         uint64_t parent;
2068         uint64_t pflags;
2069         uint64_t mode;
2070         uint64_t parent_mode;
2071         sa_bulk_attr_t bulk[3];
2072         sa_handle_t *sa_hdl;
2073         dmu_buf_t *sa_db;
2074         int count = 0;
2075         int error;
2076
2077         SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
2078             &parent, sizeof (parent));
2079         SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
2080             &pflags, sizeof (pflags));
2081         SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2082             &mode, sizeof (mode));
2083
2084         if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
2085                 return (error);
2086
2087         /*
2088          * When a link is removed its parent pointer is not changed and will
2089          * be invalid.  There are two cases where a link is removed but the
2090          * file stays around, when it goes to the delete queue and when there
2091          * are additional links.
2092          */
2093         error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
2094         if (error != 0)
2095                 return (error);
2096
2097         error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
2098         zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2099         if (error != 0)
2100                 return (error);
2101
2102         *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
2103
2104         /*
2105          * Extended attributes can be applied to files, directories, etc.
2106          * Otherwise the parent must be a directory.
2107          */
2108         if (!*is_xattrdir && !S_ISDIR(parent_mode))
2109                 return (SET_ERROR(EINVAL));
2110
2111         *pobjp = parent;
2112
2113         return (0);
2114 }
2115
2116 /*
2117  * Given an object number, return some zpl level statistics
2118  */
2119 static int
2120 zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
2121     zfs_stat_t *sb)
2122 {
2123         sa_bulk_attr_t bulk[4];
2124         int count = 0;
2125
2126         SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2127             &sb->zs_mode, sizeof (sb->zs_mode));
2128         SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
2129             &sb->zs_gen, sizeof (sb->zs_gen));
2130         SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
2131             &sb->zs_links, sizeof (sb->zs_links));
2132         SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
2133             &sb->zs_ctime, sizeof (sb->zs_ctime));
2134
2135         return (sa_bulk_lookup(hdl, bulk, count));
2136 }
2137
2138 static int
2139 zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
2140     sa_attr_type_t *sa_table, char *buf, int len)
2141 {
2142         sa_handle_t *sa_hdl;
2143         sa_handle_t *prevhdl = NULL;
2144         dmu_buf_t *prevdb = NULL;
2145         dmu_buf_t *sa_db = NULL;
2146         char *path = buf + len - 1;
2147         int error;
2148
2149         *path = '\0';
2150         sa_hdl = hdl;
2151
2152         uint64_t deleteq_obj;
2153         VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
2154             ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
2155         error = zap_lookup_int(osp, deleteq_obj, obj);
2156         if (error == 0) {
2157                 return (ESTALE);
2158         } else if (error != ENOENT) {
2159                 return (error);
2160         }
2161
2162         for (;;) {
2163                 uint64_t pobj = 0;
2164                 char component[MAXNAMELEN + 2];
2165                 size_t complen;
2166                 int is_xattrdir = 0;
2167
2168                 if (prevdb) {
2169                         ASSERT(prevhdl != NULL);
2170                         zfs_release_sa_handle(prevhdl, prevdb, FTAG);
2171                 }
2172
2173                 if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
2174                     &is_xattrdir)) != 0)
2175                         break;
2176
2177                 if (pobj == obj) {
2178                         if (path[0] != '/')
2179                                 *--path = '/';
2180                         break;
2181                 }
2182
2183                 component[0] = '/';
2184                 if (is_xattrdir) {
2185                         strcpy(component + 1, "<xattrdir>");
2186                 } else {
2187                         error = zap_value_search(osp, pobj, obj,
2188                             ZFS_DIRENT_OBJ(-1ULL), component + 1);
2189                         if (error != 0)
2190                                 break;
2191                 }
2192
2193                 complen = strlen(component);
2194                 path -= complen;
2195                 ASSERT(path >= buf);
2196                 memcpy(path, component, complen);
2197                 obj = pobj;
2198
2199                 if (sa_hdl != hdl) {
2200                         prevhdl = sa_hdl;
2201                         prevdb = sa_db;
2202                 }
2203                 error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
2204                 if (error != 0) {
2205                         sa_hdl = prevhdl;
2206                         sa_db = prevdb;
2207                         break;
2208                 }
2209         }
2210
2211         if (sa_hdl != NULL && sa_hdl != hdl) {
2212                 ASSERT(sa_db != NULL);
2213                 zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2214         }
2215
2216         if (error == 0)
2217                 (void) memmove(buf, path, buf + len - path);
2218
2219         return (error);
2220 }
2221
2222 int
2223 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
2224 {
2225         sa_attr_type_t *sa_table;
2226         sa_handle_t *hdl;
2227         dmu_buf_t *db;
2228         int error;
2229
2230         error = zfs_sa_setup(osp, &sa_table);
2231         if (error != 0)
2232                 return (error);
2233
2234         error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2235         if (error != 0)
2236                 return (error);
2237
2238         error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2239
2240         zfs_release_sa_handle(hdl, db, FTAG);
2241         return (error);
2242 }
2243
2244 int
2245 zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
2246     char *buf, int len)
2247 {
2248         char *path = buf + len - 1;
2249         sa_attr_type_t *sa_table;
2250         sa_handle_t *hdl;
2251         dmu_buf_t *db;
2252         int error;
2253
2254         *path = '\0';
2255
2256         error = zfs_sa_setup(osp, &sa_table);
2257         if (error != 0)
2258                 return (error);
2259
2260         error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2261         if (error != 0)
2262                 return (error);
2263
2264         error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
2265         if (error != 0) {
2266                 zfs_release_sa_handle(hdl, db, FTAG);
2267                 return (error);
2268         }
2269
2270         error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2271
2272         zfs_release_sa_handle(hdl, db, FTAG);
2273         return (error);
2274 }
2275
2276 /*
2277  * Read a property stored within the master node.
2278  */
2279 int
2280 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2281 {
2282         uint64_t *cached_copy = NULL;
2283
2284         /*
2285          * Figure out where in the objset_t the cached copy would live, if it
2286          * is available for the requested property.
2287          */
2288         if (os != NULL) {
2289                 switch (prop) {
2290                 case ZFS_PROP_VERSION:
2291                         cached_copy = &os->os_version;
2292                         break;
2293                 case ZFS_PROP_NORMALIZE:
2294                         cached_copy = &os->os_normalization;
2295                         break;
2296                 case ZFS_PROP_UTF8ONLY:
2297                         cached_copy = &os->os_utf8only;
2298                         break;
2299                 case ZFS_PROP_CASE:
2300                         cached_copy = &os->os_casesensitivity;
2301                         break;
2302                 default:
2303                         break;
2304                 }
2305         }
2306         if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
2307                 *value = *cached_copy;
2308                 return (0);
2309         }
2310
2311         /*
2312          * If the property wasn't cached, look up the file system's value for
2313          * the property. For the version property, we look up a slightly
2314          * different string.
2315          */
2316         const char *pname;
2317         int error = ENOENT;
2318         if (prop == ZFS_PROP_VERSION)
2319                 pname = ZPL_VERSION_STR;
2320         else
2321                 pname = zfs_prop_to_name(prop);
2322
2323         if (os != NULL) {
2324                 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2325                 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2326         }
2327
2328         if (error == ENOENT) {
2329                 /* No value set, use the default value */
2330                 switch (prop) {
2331                 case ZFS_PROP_VERSION:
2332                         *value = ZPL_VERSION;
2333                         break;
2334                 case ZFS_PROP_NORMALIZE:
2335                 case ZFS_PROP_UTF8ONLY:
2336                         *value = 0;
2337                         break;
2338                 case ZFS_PROP_CASE:
2339                         *value = ZFS_CASE_SENSITIVE;
2340                         break;
2341                 case ZFS_PROP_ACLTYPE:
2342                         *value = ZFS_ACLTYPE_OFF;
2343                         break;
2344                 default:
2345                         return (error);
2346                 }
2347                 error = 0;
2348         }
2349
2350         /*
2351          * If one of the methods for getting the property value above worked,
2352          * copy it into the objset_t's cache.
2353          */
2354         if (error == 0 && cached_copy != NULL) {
2355                 *cached_copy = *value;
2356         }
2357
2358         return (error);
2359 }
2360
#if defined(_KERNEL)
/* Symbols made available outside this file via EXPORT_SYMBOL. */
EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);

/*
 * Module parameters, both registered with permissions 0644.
 * zfs_object_mutex_size determines how many znode hold trees and locks
 * zfs_create_fs() sets up.
 */
/* CSTYLED */
module_param(zfs_object_mutex_size, uint, 0644);
MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
module_param(zfs_unlink_suspend_progress, int, 0644);
MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
"(debug - leaks space into the unlinked set)");
#endif