]> git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/zfs_znode.c
OpenZFS 9689 - zfs range lock code should not be zpl-specific
[mirror_zfs.git] / module / zfs / zfs_znode.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
428870ff 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
5d43cc9a 23 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
34dc7c2f
BB
24 */
25
26/* Portions Copyright 2007 Jeremy Teo */
27
34dc7c2f
BB
28#ifdef _KERNEL
29#include <sys/types.h>
30#include <sys/param.h>
31#include <sys/time.h>
34dc7c2f 32#include <sys/sysmacros.h>
34dc7c2f 33#include <sys/mntent.h>
34dc7c2f
BB
34#include <sys/u8_textprep.h>
35#include <sys/dsl_dataset.h>
36#include <sys/vfs.h>
34dc7c2f
BB
37#include <sys/vnode.h>
38#include <sys/file.h>
39#include <sys/kmem.h>
40#include <sys/errno.h>
34dc7c2f
BB
41#include <sys/mode.h>
42#include <sys/atomic.h>
34dc7c2f
BB
43#include <sys/zfs_dir.h>
44#include <sys/zfs_acl.h>
45#include <sys/zfs_ioctl.h>
46#include <sys/zfs_rlock.h>
47#include <sys/zfs_fuid.h>
3558fd73 48#include <sys/zfs_vnops.h>
ebe7e575 49#include <sys/zfs_ctldir.h>
428870ff 50#include <sys/dnode.h>
34dc7c2f 51#include <sys/fs/zfs.h>
3558fd73 52#include <sys/zpl.h>
34dc7c2f
BB
53#endif /* _KERNEL */
54
55#include <sys/dmu.h>
f1512ee6 56#include <sys/dmu_objset.h>
50c957f7 57#include <sys/dmu_tx.h>
34dc7c2f
BB
58#include <sys/refcount.h>
59#include <sys/stat.h>
60#include <sys/zap.h>
61#include <sys/zfs_znode.h>
428870ff
BB
62#include <sys/sa.h>
63#include <sys/zfs_sa.h>
572e2857 64#include <sys/zfs_stat.h>
34dc7c2f
BB
65
66#include "zfs_prop.h"
428870ff 67#include "zfs_comutil.h"
34dc7c2f 68
b128c09f
BB
69/*
70 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
71 * turned on when DEBUG is also defined.
72 */
73#ifdef DEBUG
74#define ZNODE_STATS
75#endif /* DEBUG */
76
77#ifdef ZNODE_STATS
78#define ZNODE_STAT_ADD(stat) ((stat)++)
79#else
80#define ZNODE_STAT_ADD(stat) /* nothing */
81#endif /* ZNODE_STATS */
82
34dc7c2f
BB
/*
 * Functions needed for userland (i.e. libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
88#ifdef _KERNEL
9babb374 89
b128c09f 90static kmem_cache_t *znode_cache = NULL;
c96c36fa 91static kmem_cache_t *znode_hold_cache = NULL;
0720116d 92unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
34dc7c2f 93
5d43cc9a
MA
94/*
95 * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
96 * z_rangelock. It will modify the offset and length of the lock to reflect
97 * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
98 * called with the rangelock_t's rl_lock held, which avoids races.
99 */
100static void
101zfs_rangelock_cb(locked_range_t *new, void *arg)
102{
103 znode_t *zp = arg;
104
105 /*
106 * If in append mode, convert to writer and lock starting at the
107 * current end of file.
108 */
109 if (new->lr_type == RL_APPEND) {
110 new->lr_offset = zp->z_size;
111 new->lr_type = RL_WRITER;
112 }
113
114 /*
115 * If we need to grow the block size then lock the whole file range.
116 */
117 uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
118 if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
119 zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
120 new->lr_offset = 0;
121 new->lr_length = UINT64_MAX;
122 }
123}
124
34dc7c2f
BB
125/*ARGSUSED*/
126static int
b128c09f 127zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
34dc7c2f
BB
128{
129 znode_t *zp = buf;
130
3558fd73 131 inode_init_once(ZTOI(zp));
b128c09f
BB
132 list_link_init(&zp->z_link_node);
133
34dc7c2f 134 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f 135 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
448d7aaa 136 rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
34dc7c2f 137 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
82a37189 138 rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
34dc7c2f 139
5d43cc9a 140 rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
34dc7c2f 141
b128c09f 142 zp->z_dirlocks = NULL;
45d1cae3 143 zp->z_acl_cached = NULL;
82a37189 144 zp->z_xattr_cached = NULL;
98701490 145 zp->z_xattr_parent = 0;
572e2857 146 zp->z_moved = 0;
34dc7c2f
BB
147 return (0);
148}
149
150/*ARGSUSED*/
151static void
b128c09f 152zfs_znode_cache_destructor(void *buf, void *arg)
34dc7c2f
BB
153{
154 znode_t *zp = buf;
155
b128c09f 156 ASSERT(!list_link_active(&zp->z_link_node));
34dc7c2f 157 mutex_destroy(&zp->z_lock);
34dc7c2f
BB
158 rw_destroy(&zp->z_parent_lock);
159 rw_destroy(&zp->z_name_lock);
160 mutex_destroy(&zp->z_acl_lock);
82a37189 161 rw_destroy(&zp->z_xattr_lock);
5d43cc9a 162 rangelock_fini(&zp->z_rangelock);
34dc7c2f 163
b128c09f 164 ASSERT(zp->z_dirlocks == NULL);
45d1cae3 165 ASSERT(zp->z_acl_cached == NULL);
82a37189 166 ASSERT(zp->z_xattr_cached == NULL);
b128c09f
BB
167}
168
c96c36fa
BB
169static int
170zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
171{
172 znode_hold_t *zh = buf;
173
174 mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
424fd7c3 175 zfs_refcount_create(&zh->zh_refcount);
c96c36fa
BB
176 zh->zh_obj = ZFS_NO_OBJECT;
177
178 return (0);
179}
180
181static void
182zfs_znode_hold_cache_destructor(void *buf, void *arg)
183{
184 znode_hold_t *zh = buf;
185
186 mutex_destroy(&zh->zh_lock);
424fd7c3 187 zfs_refcount_destroy(&zh->zh_refcount);
c96c36fa
BB
188}
189
34dc7c2f
BB
190void
191zfs_znode_init(void)
192{
193 /*
5074bfe8
TC
194 * Initialize zcache. The KMC_SLAB hint is used in order that it be
195 * backed by kmalloc() when on the Linux slab in order that any
196 * wait_on_bit() operations on the related inode operate properly.
34dc7c2f
BB
197 */
198 ASSERT(znode_cache == NULL);
199 znode_cache = kmem_cache_create("zfs_znode_cache",
200 sizeof (znode_t), 0, zfs_znode_cache_constructor,
5074bfe8 201 zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
c96c36fa
BB
202
203 ASSERT(znode_hold_cache == NULL);
204 znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
205 sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
206 zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
34dc7c2f
BB
207}
208
209void
210zfs_znode_fini(void)
211{
34dc7c2f
BB
212 /*
213 * Cleanup zcache
214 */
215 if (znode_cache)
216 kmem_cache_destroy(znode_cache);
217 znode_cache = NULL;
c96c36fa
BB
218
219 if (znode_hold_cache)
220 kmem_cache_destroy(znode_hold_cache);
221 znode_hold_cache = NULL;
222}
223
224/*
225 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
226 * serialize access to a znode and its SA buffer while the object is being
227 * created or destroyed. This kind of locking would normally reside in the
228 * znode itself but in this case that's impossible because the znode and SA
229 * buffer may not yet exist. Therefore the locking is handled externally
230 * with an array of mutexs and AVLs trees which contain per-object locks.
231 *
232 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
233 * in to the correct AVL tree and finally the per-object lock is held. In
234 * zfs_znode_hold_exit() the process is reversed. The per-object lock is
235 * released, removed from the AVL tree and destroyed if there are no waiters.
236 *
237 * This scheme has two important properties:
238 *
239 * 1) No memory allocations are performed while holding one of the z_hold_locks.
240 * This ensures evict(), which can be called from direct memory reclaim, will
241 * never block waiting on a z_hold_locks which just happens to have hashed
242 * to the same index.
243 *
244 * 2) All locks used to serialize access to an object are per-object and never
245 * shared. This minimizes lock contention without creating a large number
246 * of dedicated locks.
247 *
248 * On the downside it does require znode_lock_t structures to be frequently
249 * allocated and freed. However, because these are backed by a kmem cache
250 * and very short lived this cost is minimal.
251 */
252int
253zfs_znode_hold_compare(const void *a, const void *b)
254{
ee36c709
GN
255 const znode_hold_t *zh_a = (const znode_hold_t *)a;
256 const znode_hold_t *zh_b = (const znode_hold_t *)b;
257
258 return (AVL_CMP(zh_a->zh_obj, zh_b->zh_obj));
c96c36fa
BB
259}
260
261boolean_t
0037b49e 262zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
c96c36fa
BB
263{
264 znode_hold_t *zh, search;
0037b49e 265 int i = ZFS_OBJ_HASH(zfsvfs, obj);
37c56346 266 boolean_t held;
c96c36fa
BB
267
268 search.zh_obj = obj;
269
0037b49e
BB
270 mutex_enter(&zfsvfs->z_hold_locks[i]);
271 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
37c56346 272 held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
0037b49e 273 mutex_exit(&zfsvfs->z_hold_locks[i]);
c96c36fa 274
37c56346 275 return (held);
c96c36fa
BB
276}
277
278static znode_hold_t *
0037b49e 279zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
c96c36fa
BB
280{
281 znode_hold_t *zh, *zh_new, search;
0037b49e 282 int i = ZFS_OBJ_HASH(zfsvfs, obj);
c96c36fa
BB
283 boolean_t found = B_FALSE;
284
285 zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
286 zh_new->zh_obj = obj;
287 search.zh_obj = obj;
288
0037b49e
BB
289 mutex_enter(&zfsvfs->z_hold_locks[i]);
290 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
c96c36fa
BB
291 if (likely(zh == NULL)) {
292 zh = zh_new;
0037b49e 293 avl_add(&zfsvfs->z_hold_trees[i], zh);
c96c36fa
BB
294 } else {
295 ASSERT3U(zh->zh_obj, ==, obj);
296 found = B_TRUE;
297 }
c13060e4 298 zfs_refcount_add(&zh->zh_refcount, NULL);
0037b49e 299 mutex_exit(&zfsvfs->z_hold_locks[i]);
c96c36fa
BB
300
301 if (found == B_TRUE)
302 kmem_cache_free(znode_hold_cache, zh_new);
303
304 ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
424fd7c3 305 ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
c96c36fa
BB
306 mutex_enter(&zh->zh_lock);
307
308 return (zh);
309}
310
311static void
0037b49e 312zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
c96c36fa 313{
0037b49e 314 int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
c96c36fa
BB
315 boolean_t remove = B_FALSE;
316
0037b49e 317 ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
424fd7c3 318 ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
c96c36fa
BB
319 mutex_exit(&zh->zh_lock);
320
0037b49e 321 mutex_enter(&zfsvfs->z_hold_locks[i]);
424fd7c3 322 if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) {
0037b49e 323 avl_remove(&zfsvfs->z_hold_trees[i], zh);
c96c36fa
BB
324 remove = B_TRUE;
325 }
0037b49e 326 mutex_exit(&zfsvfs->z_hold_locks[i]);
c96c36fa
BB
327
328 if (remove == B_TRUE)
329 kmem_cache_free(znode_hold_cache, zh);
34dc7c2f
BB
330}
331
34dc7c2f 332int
0037b49e 333zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
34dc7c2f 334{
3c9609b3 335#ifdef HAVE_SMB_SHARE
9babb374
BB
336 zfs_acl_ids_t acl_ids;
337 vattr_t vattr;
338 znode_t *sharezp;
339 vnode_t *vp;
340 znode_t *zp;
341 int error;
34dc7c2f 342
9babb374 343 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
3558fd73 344 vattr.va_mode = S_IFDIR | 0555;
9babb374
BB
345 vattr.va_uid = crgetuid(kcred);
346 vattr.va_gid = crgetgid(kcred);
34dc7c2f 347
79c76d5b 348 sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
572e2857 349 sharezp->z_moved = 0;
9babb374
BB
350 sharezp->z_unlinked = 0;
351 sharezp->z_atime_dirty = 0;
352 sharezp->z_zfsvfs = zfsvfs;
428870ff 353 sharezp->z_is_sa = zfsvfs->z_use_sa;
9c5167d1 354 sharezp->z_pflags = 0;
34dc7c2f 355
9babb374
BB
356 vp = ZTOV(sharezp);
357 vn_reinit(vp);
358 vp->v_type = VDIR;
34dc7c2f 359
9babb374
BB
360 VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
361 kcred, NULL, &acl_ids));
428870ff 362 zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
9babb374
BB
363 ASSERT3P(zp, ==, sharezp);
364 ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
365 POINTER_INVALIDATE(&sharezp->z_zfsvfs);
366 error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
367 ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
368 zfsvfs->z_shares_dir = sharezp->z_id;
369
370 zfs_acl_ids_free(&acl_ids);
3558fd73 371 // ZTOV(sharezp)->v_count = 0;
428870ff 372 sa_handle_destroy(sharezp->z_sa_hdl);
9babb374 373 kmem_cache_free(znode_cache, sharezp);
34dc7c2f 374
9babb374 375 return (error);
9ee7fac5
BB
376#else
377 return (0);
3c9609b3 378#endif /* HAVE_SMB_SHARE */
34dc7c2f
BB
379}
380
34dc7c2f 381static void
0037b49e 382zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
428870ff 383 dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
34dc7c2f 384{
0037b49e 385 ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
34dc7c2f
BB
386
387 mutex_enter(&zp->z_lock);
388
428870ff
BB
389 ASSERT(zp->z_sa_hdl == NULL);
390 ASSERT(zp->z_acl_cached == NULL);
391 if (sa_hdl == NULL) {
0037b49e 392 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
428870ff
BB
393 SA_HDL_SHARED, &zp->z_sa_hdl));
394 } else {
395 zp->z_sa_hdl = sa_hdl;
396 sa_set_userp(sa_hdl, zp);
397 }
34dc7c2f 398
428870ff 399 zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
34dc7c2f 400
34dc7c2f 401 mutex_exit(&zp->z_lock);
34dc7c2f
BB
402}
403
404void
405zfs_znode_dmu_fini(znode_t *zp)
406{
c96c36fa 407 ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked ||
3558fd73 408 RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));
428870ff
BB
409
410 sa_handle_destroy(zp->z_sa_hdl);
411 zp->z_sa_hdl = NULL;
34dc7c2f
BB
412}
413
414/*
3558fd73
BB
415 * Called by new_inode() to allocate a new inode.
416 */
417int
418zfs_inode_alloc(struct super_block *sb, struct inode **ip)
419{
420 znode_t *zp;
421
79c76d5b 422 zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
3558fd73
BB
423 *ip = ZTOI(zp);
424
425 return (0);
426}
427
428/*
429 * Called in multiple places when an inode should be destroyed.
430 */
431void
432zfs_inode_destroy(struct inode *ip)
433{
434 znode_t *zp = ITOZ(ip);
0037b49e 435 zfsvfs_t *zfsvfs = ZTOZSB(zp);
3558fd73 436
0037b49e 437 mutex_enter(&zfsvfs->z_znodes_lock);
7b3e34ba 438 if (list_link_active(&zp->z_link_node)) {
0037b49e
BB
439 list_remove(&zfsvfs->z_all_znodes, zp);
440 zfsvfs->z_nr_znodes--;
7b3e34ba 441 }
0037b49e 442 mutex_exit(&zfsvfs->z_znodes_lock);
3558fd73
BB
443
444 if (zp->z_acl_cached) {
445 zfs_acl_free(zp->z_acl_cached);
446 zp->z_acl_cached = NULL;
447 }
448
82a37189
BB
449 if (zp->z_xattr_cached) {
450 nvlist_free(zp->z_xattr_cached);
451 zp->z_xattr_cached = NULL;
452 }
453
3558fd73
BB
454 kmem_cache_free(znode_cache, zp);
455}
456
457static void
0037b49e 458zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
3558fd73 459{
aa6d8c10 460 uint64_t rdev = 0;
3558fd73
BB
461
462 switch (ip->i_mode & S_IFMT) {
463 case S_IFREG:
464 ip->i_op = &zpl_inode_operations;
465 ip->i_fop = &zpl_file_operations;
466 ip->i_mapping->a_ops = &zpl_address_space_operations;
467 break;
468
469 case S_IFDIR:
470 ip->i_op = &zpl_dir_inode_operations;
471 ip->i_fop = &zpl_dir_file_operations;
472 ITOZ(ip)->z_zn_prefetch = B_TRUE;
473 break;
474
475 case S_IFLNK:
476 ip->i_op = &zpl_symlink_inode_operations;
477 break;
478
aa6d8c10
NB
479 /*
480 * rdev is only stored in a SA only for device files.
481 */
3558fd73
BB
482 case S_IFCHR:
483 case S_IFBLK:
0037b49e 484 (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
53b1d979 485 sizeof (rdev));
aa6d8c10
NB
486 /*FALLTHROUGH*/
487 case S_IFIFO:
488 case S_IFSOCK:
3558fd73
BB
489 init_special_inode(ip, ip->i_mode, rdev);
490 ip->i_op = &zpl_special_inode_operations;
491 break;
492
493 default:
53b1d979
BB
494 zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
495 (u_longlong_t)ip->i_ino, ip->i_mode);
496
497 /* Assume the inode is a file and attempt to continue */
498 ip->i_mode = S_IFREG | 0644;
499 ip->i_op = &zpl_inode_operations;
500 ip->i_fop = &zpl_file_operations;
501 ip->i_mapping->a_ops = &zpl_address_space_operations;
502 break;
3558fd73
BB
503 }
504}
505
7bb1325f
CC
506void
507zfs_set_inode_flags(znode_t *zp, struct inode *ip)
508{
509 /*
510 * Linux and Solaris have different sets of file attributes, so we
511 * restrict this conversion to the intersection of the two.
512 */
a5248129
CC
513#ifdef HAVE_INODE_SET_FLAGS
514 unsigned int flags = 0;
515 if (zp->z_pflags & ZFS_IMMUTABLE)
516 flags |= S_IMMUTABLE;
517 if (zp->z_pflags & ZFS_APPENDONLY)
518 flags |= S_APPEND;
7bb1325f 519
a5248129
CC
520 inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
521#else
7bb1325f
CC
522 if (zp->z_pflags & ZFS_IMMUTABLE)
523 ip->i_flags |= S_IMMUTABLE;
524 else
525 ip->i_flags &= ~S_IMMUTABLE;
526
527 if (zp->z_pflags & ZFS_APPENDONLY)
528 ip->i_flags |= S_APPEND;
529 else
530 ip->i_flags &= ~S_APPEND;
a5248129 531#endif
7bb1325f
CC
532}
533
704cd075
CC
534/*
535 * Update the embedded inode given the znode. We should work toward
536 * eliminating this function as soon as possible by removing values
537 * which are duplicated between the znode and inode. If the generic
538 * inode has the correct field it should be used, and the ZFS code
539 * updated to access the inode. This can be done incrementally.
540 */
9f5f0019
NB
541void
542zfs_inode_update(znode_t *zp)
704cd075 543{
0037b49e 544 zfsvfs_t *zfsvfs;
704cd075
CC
545 struct inode *ip;
546 uint32_t blksize;
547 u_longlong_t i_blocks;
704cd075
CC
548
549 ASSERT(zp != NULL);
0037b49e 550 zfsvfs = ZTOZSB(zp);
704cd075
CC
551 ip = ZTOI(zp);
552
553 /* Skip .zfs control nodes which do not exist on disk. */
554 if (zfsctl_is_node(ip))
555 return;
556
704cd075
CC
557 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
558
559 spin_lock(&ip->i_lock);
704cd075 560 ip->i_blocks = i_blocks;
704cd075
CC
561 i_size_write(ip, zp->z_size);
562 spin_unlock(&ip->i_lock);
563}
564
704cd075 565
3558fd73
BB
566/*
567 * Construct a znode+inode and initialize.
34dc7c2f
BB
568 *
569 * This does not do a call to dmu_set_user() that is
570 * up to the caller to do, in case you don't want to
571 * return the znode
572 */
573static znode_t *
0037b49e 574zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
31b6111f 575 dmu_object_type_t obj_type, uint64_t obj, sa_handle_t *hdl)
34dc7c2f
BB
576{
577 znode_t *zp;
3558fd73 578 struct inode *ip;
7f89ae6b 579 uint64_t mode;
428870ff 580 uint64_t parent;
278f2236 581 uint64_t tmp_gen;
dfbc8630 582 uint64_t links;
2c6abf15 583 uint64_t z_uid, z_gid;
9f5f0019 584 uint64_t atime[2], mtime[2], ctime[2];
9c5167d1 585 uint64_t projid = ZFS_DEFAULT_PROJID;
9f5f0019 586 sa_bulk_attr_t bulk[11];
428870ff 587 int count = 0;
34dc7c2f 588
0037b49e 589 ASSERT(zfsvfs != NULL);
34dc7c2f 590
0037b49e 591 ip = new_inode(zfsvfs->z_sb);
3558fd73
BB
592 if (ip == NULL)
593 return (NULL);
7304b6e5 594
3558fd73 595 zp = ITOZ(ip);
34dc7c2f 596 ASSERT(zp->z_dirlocks == NULL);
ebe7e575
BB
597 ASSERT3P(zp->z_acl_cached, ==, NULL);
598 ASSERT3P(zp->z_xattr_cached, ==, NULL);
572e2857 599 zp->z_moved = 0;
428870ff 600 zp->z_sa_hdl = NULL;
34dc7c2f
BB
601 zp->z_unlinked = 0;
602 zp->z_atime_dirty = 0;
603 zp->z_mapcnt = 0;
34dc7c2f
BB
604 zp->z_id = db->db_object;
605 zp->z_blksz = blksz;
606 zp->z_seq = 0x7A4653;
607 zp->z_sync_cnt = 0;
ebe7e575
BB
608 zp->z_is_mapped = B_FALSE;
609 zp->z_is_ctldir = B_FALSE;
7b3e34ba 610 zp->z_is_stale = B_FALSE;
34dc7c2f 611
0037b49e 612 zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
3558fd73 613
0037b49e
BB
614 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
615 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
616 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
617 &zp->z_size, 8);
618 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
619 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
428870ff 620 &zp->z_pflags, 8);
0037b49e 621 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
7304b6e5 622 &parent, 8);
0037b49e
BB
623 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
624 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
625 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
626 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
627 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
428870ff 628
9c5167d1
NF
629 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
630 (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
631 (zp->z_pflags & ZFS_PROJID) &&
632 sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
428870ff
BB
633 if (hdl == NULL)
634 sa_handle_destroy(zp->z_sa_hdl);
07d63f0c 635 zp->z_sa_hdl = NULL;
3558fd73 636 goto error;
34dc7c2f 637 }
7304b6e5 638
9c5167d1 639 zp->z_projid = projid;
12fa7f34 640 zp->z_mode = ip->i_mode = mode;
278f2236 641 ip->i_generation = (uint32_t)tmp_gen;
ba2fe6af 642 ip->i_blkbits = SPA_MINBLOCKSHIFT;
dfbc8630 643 set_nlink(ip, (uint32_t)links);
2c6abf15
NB
644 zfs_uid_write(ip, z_uid);
645 zfs_gid_write(ip, z_gid);
7bb1325f 646 zfs_set_inode_flags(zp, ip);
7f89ae6b 647
98701490
CC
648 /* Cache the xattr parent id */
649 if (zp->z_pflags & ZFS_XATTR)
650 zp->z_xattr_parent = parent;
651
9f5f0019
NB
652 ZFS_TIME_DECODE(&ip->i_atime, atime);
653 ZFS_TIME_DECODE(&ip->i_mtime, mtime);
654 ZFS_TIME_DECODE(&ip->i_ctime, ctime);
655
3558fd73 656 ip->i_ino = obj;
9f5f0019 657 zfs_inode_update(zp);
0037b49e 658 zfs_inode_set_ops(zfsvfs, ip);
3558fd73 659
7b3e34ba
BB
660 /*
661 * The only way insert_inode_locked() can fail is if the ip->i_ino
662 * number is already hashed for this super block. This can never
663 * happen because the inode numbers map 1:1 with the object numbers.
664 *
665 * The one exception is rolling back a mounted file system, but in
666 * this case all the active inode are unhashed during the rollback.
667 */
668 VERIFY3S(insert_inode_locked(ip), ==, 0);
c85b224f 669
0037b49e
BB
670 mutex_enter(&zfsvfs->z_znodes_lock);
671 list_insert_tail(&zfsvfs->z_all_znodes, zp);
672 zfsvfs->z_nr_znodes++;
b128c09f 673 membar_producer();
0037b49e 674 mutex_exit(&zfsvfs->z_znodes_lock);
b128c09f 675
3558fd73 676 unlock_new_inode(ip);
34dc7c2f 677 return (zp);
3558fd73
BB
678
679error:
3558fd73 680 iput(ip);
d1d7e268 681 return (NULL);
34dc7c2f
BB
682}
683
1e8db771
BB
684/*
685 * Safely mark an inode dirty. Inodes which are part of a read-only
686 * file system or snapshot may not be dirtied.
687 */
688void
689zfs_mark_inode_dirty(struct inode *ip)
690{
0037b49e 691 zfsvfs_t *zfsvfs = ITOZSB(ip);
1e8db771 692
0037b49e 693 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
1e8db771
BB
694 return;
695
696 mark_inode_dirty(ip);
697}
698
428870ff
BB
699static uint64_t empty_xattr;
700static uint64_t pad[4];
701static zfs_acl_phys_t acl_phys;
34dc7c2f
BB
702/*
703 * Create a new DMU object to hold a zfs znode.
704 *
705 * IN: dzp - parent directory for new znode
706 * vap - file attributes for new znode
707 * tx - dmu transaction id for zap operations
708 * cr - credentials of caller
709 * flag - flags:
710 * IS_ROOT_NODE - new object will be root
711 * IS_XATTR - new object is an attribute
34dc7c2f
BB
712 * bonuslen - length of bonus buffer
713 * setaclp - File/Dir initial ACL
714 * fuidp - Tracks fuid allocation.
715 *
716 * OUT: zpp - allocated znode
717 *
718 */
719void
720zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
428870ff 721 uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
34dc7c2f 722{
428870ff
BB
723 uint64_t crtime[2], atime[2], mtime[2], ctime[2];
724 uint64_t mode, size, links, parent, pflags;
9c5167d1 725 uint64_t projid = ZFS_DEFAULT_PROJID;
428870ff 726 uint64_t rdev = 0;
0037b49e 727 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
428870ff 728 dmu_buf_t *db;
6413c95f 729 inode_timespec_t now;
34dc7c2f 730 uint64_t gen, obj;
428870ff 731 int bonuslen;
50c957f7 732 int dnodesize;
428870ff
BB
733 sa_handle_t *sa_hdl;
734 dmu_object_type_t obj_type;
f30484af 735 sa_bulk_attr_t *sa_attrs;
428870ff
BB
736 int cnt = 0;
737 zfs_acl_locator_cb_t locate = { 0 };
c96c36fa 738 znode_hold_t *zh;
34dc7c2f 739
0037b49e 740 if (zfsvfs->z_replay) {
34dc7c2f 741 obj = vap->va_nodeid;
34dc7c2f
BB
742 now = vap->va_ctime; /* see zfs_replay_create() */
743 gen = vap->va_nblocks; /* ditto */
50c957f7 744 dnodesize = vap->va_fsid; /* ditto */
34dc7c2f
BB
745 } else {
746 obj = 0;
747 gethrestime(&now);
748 gen = dmu_tx_get_txg(tx);
0037b49e 749 dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
34dc7c2f
BB
750 }
751
50c957f7
NB
752 if (dnodesize == 0)
753 dnodesize = DNODE_MIN_SIZE;
754
0037b49e 755 obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
50c957f7 756
428870ff 757 bonuslen = (obj_type == DMU_OT_SA) ?
50c957f7 758 DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
428870ff 759
34dc7c2f
BB
760 /*
761 * Create a new DMU object.
762 */
763 /*
764 * There's currently no mechanism for pre-reading the blocks that will
572e2857 765 * be needed to allocate a new object, so we accept the small chance
34dc7c2f
BB
766 * that there will be an i/o error and we will fail one of the
767 * assertions below.
768 */
3558fd73 769 if (S_ISDIR(vap->va_mode)) {
0037b49e
BB
770 if (zfsvfs->z_replay) {
771 VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
772 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
50c957f7 773 obj_type, bonuslen, dnodesize, tx));
34dc7c2f 774 } else {
0037b49e
BB
775 obj = zap_create_norm_dnsize(zfsvfs->z_os,
776 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
50c957f7 777 obj_type, bonuslen, dnodesize, tx);
34dc7c2f
BB
778 }
779 } else {
0037b49e
BB
780 if (zfsvfs->z_replay) {
781 VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
34dc7c2f 782 DMU_OT_PLAIN_FILE_CONTENTS, 0,
50c957f7 783 obj_type, bonuslen, dnodesize, tx));
34dc7c2f 784 } else {
0037b49e 785 obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
34dc7c2f 786 DMU_OT_PLAIN_FILE_CONTENTS, 0,
50c957f7 787 obj_type, bonuslen, dnodesize, tx);
34dc7c2f
BB
788 }
789 }
34dc7c2f 790
0037b49e 791 zh = zfs_znode_hold_enter(zfsvfs, obj);
9631681b 792 VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
34dc7c2f
BB
793
794 /*
795 * If this is the root, fix up the half-initialized parent pointer
796 * to reference the just-allocated physical data area.
797 */
798 if (flag & IS_ROOT_NODE) {
34dc7c2f
BB
799 dzp->z_id = obj;
800 }
801
802 /*
803 * If parent is an xattr, so am I.
804 */
9c5167d1 805 if (dzp->z_pflags & ZFS_XATTR) {
34dc7c2f 806 flag |= IS_XATTR;
34dc7c2f
BB
807 }
808
0037b49e 809 if (zfsvfs->z_use_fuids)
428870ff
BB
810 pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
811 else
812 pflags = 0;
34dc7c2f 813
3558fd73 814 if (S_ISDIR(vap->va_mode)) {
428870ff 815 size = 2; /* contents ("." and "..") */
dfbc8630 816 links = 2;
428870ff 817 } else {
dfbc8630 818 size = 0;
ace1eae8 819 links = (flag & IS_TMPFILE) ? 0 : 1;
34dc7c2f
BB
820 }
821
aa6d8c10 822 if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
dc1d7665 823 rdev = vap->va_rdev;
428870ff
BB
824
825 parent = dzp->z_id;
826 mode = acl_ids->z_mode;
34dc7c2f 827 if (flag & IS_XATTR)
428870ff 828 pflags |= ZFS_XATTR;
34dc7c2f 829
9c5167d1
NF
830 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
831 /*
832 * With ZFS_PROJID flag, we can easily know whether there is
833 * project ID stored on disk or not. See zfs_space_delta_cb().
834 */
835 if (obj_type != DMU_OT_ZNODE &&
836 dmu_objset_projectquota_enabled(zfsvfs->z_os))
837 pflags |= ZFS_PROJID;
838
839 /*
840 * Inherit project ID from parent if required.
841 */
842 projid = zfs_inherit_projid(dzp);
843 if (dzp->z_pflags & ZFS_PROJINHERIT)
844 pflags |= ZFS_PROJINHERIT;
845 }
846
428870ff
BB
847 /*
848 * No execs denied will be deterimed when zfs_mode_compute() is called.
849 */
850 pflags |= acl_ids->z_aclp->z_hints &
851 (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
852 ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
34dc7c2f 853
428870ff
BB
854 ZFS_TIME_ENCODE(&now, crtime);
855 ZFS_TIME_ENCODE(&now, ctime);
34dc7c2f 856
3558fd73 857 if (vap->va_mask & ATTR_ATIME) {
428870ff 858 ZFS_TIME_ENCODE(&vap->va_atime, atime);
34dc7c2f 859 } else {
428870ff 860 ZFS_TIME_ENCODE(&now, atime);
34dc7c2f
BB
861 }
862
3558fd73 863 if (vap->va_mask & ATTR_MTIME) {
428870ff
BB
864 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
865 } else {
866 ZFS_TIME_ENCODE(&now, mtime);
867 }
868
869 /* Now add in all of the "SA" attributes */
0037b49e 870 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
428870ff
BB
871 &sa_hdl));
872
873 /*
874 * Setup the array of attributes to be replaced/set on the new file
875 *
876 * order for DMU_OT_ZNODE is critical since it needs to be constructed
877 * in the old znode_phys_t format. Don't change this ordering
878 */
79c76d5b 879 sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
428870ff
BB
880
881 if (obj_type == DMU_OT_ZNODE) {
0037b49e 882 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
428870ff 883 NULL, &atime, 16);
0037b49e 884 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
428870ff 885 NULL, &mtime, 16);
0037b49e 886 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
428870ff 887 NULL, &ctime, 16);
0037b49e 888 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
428870ff 889 NULL, &crtime, 16);
0037b49e 890 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
428870ff 891 NULL, &gen, 8);
0037b49e 892 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
428870ff 893 NULL, &mode, 8);
0037b49e 894 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
428870ff 895 NULL, &size, 8);
0037b49e 896 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
428870ff 897 NULL, &parent, 8);
34dc7c2f 898 } else {
0037b49e 899 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
428870ff 900 NULL, &mode, 8);
0037b49e 901 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
428870ff 902 NULL, &size, 8);
0037b49e 903 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
428870ff 904 NULL, &gen, 8);
0037b49e 905 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
3558fd73 906 NULL, &acl_ids->z_fuid, 8);
0037b49e 907 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
3558fd73 908 NULL, &acl_ids->z_fgid, 8);
0037b49e 909 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
428870ff 910 NULL, &parent, 8);
0037b49e 911 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
428870ff 912 NULL, &pflags, 8);
0037b49e 913 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
428870ff 914 NULL, &atime, 16);
0037b49e 915 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
428870ff 916 NULL, &mtime, 16);
0037b49e 917 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
428870ff 918 NULL, &ctime, 16);
0037b49e 919 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
428870ff
BB
920 NULL, &crtime, 16);
921 }
922
0037b49e 923 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
428870ff
BB
924
925 if (obj_type == DMU_OT_ZNODE) {
0037b49e 926 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
428870ff 927 &empty_xattr, 8);
9c5167d1
NF
928 } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
929 pflags & ZFS_PROJID) {
930 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
931 NULL, &projid, 8);
34dc7c2f 932 }
428870ff 933 if (obj_type == DMU_OT_ZNODE ||
aa6d8c10 934 (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
0037b49e 935 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
428870ff 936 NULL, &rdev, 8);
428870ff
BB
937 }
938 if (obj_type == DMU_OT_ZNODE) {
0037b49e 939 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
428870ff 940 NULL, &pflags, 8);
0037b49e 941 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
428870ff 942 &acl_ids->z_fuid, 8);
0037b49e 943 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
428870ff 944 &acl_ids->z_fgid, 8);
0037b49e 945 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
428870ff 946 sizeof (uint64_t) * 4);
0037b49e 947 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
428870ff
BB
948 &acl_phys, sizeof (zfs_acl_phys_t));
949 } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
0037b49e 950 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
428870ff
BB
951 &acl_ids->z_aclp->z_acl_count, 8);
952 locate.cb_aclp = acl_ids->z_aclp;
0037b49e 953 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
428870ff
BB
954 zfs_acl_data_locator, &locate,
955 acl_ids->z_aclp->z_acl_bytes);
956 mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
957 acl_ids->z_fuid, acl_ids->z_fgid);
958 }
959
960 VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
34dc7c2f 961
34dc7c2f 962 if (!(flag & IS_ROOT_NODE)) {
8d703987
BB
963 /*
964 * The call to zfs_znode_alloc() may fail if memory is low
965 * via the call path: alloc_inode() -> inode_init_always() ->
966 * security_inode_alloc() -> inode_alloc_security(). Since
967 * the existing code is written such that zfs_mknode() can
968 * not fail retry until sufficient memory has been reclaimed.
969 */
970 do {
971 *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, obj,
972 sa_hdl);
973 } while (*zpp == NULL);
974
7b3e34ba
BB
975 VERIFY(*zpp != NULL);
976 VERIFY(dzp != NULL);
34dc7c2f
BB
977 } else {
978 /*
979 * If we are creating the root node, the "parent" we
980 * passed in is the znode for the root.
981 */
982 *zpp = dzp;
428870ff
BB
983
984 (*zpp)->z_sa_hdl = sa_hdl;
34dc7c2f 985 }
428870ff
BB
986
987 (*zpp)->z_pflags = pflags;
12fa7f34 988 (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
50c957f7 989 (*zpp)->z_dnodesize = dnodesize;
9c5167d1 990 (*zpp)->z_projid = projid;
428870ff 991
428870ff
BB
992 if (obj_type == DMU_OT_ZNODE ||
993 acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
b0bc7a84 994 VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
428870ff 995 }
d1d7e268 996 kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
0037b49e 997 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
998}
999
5484965a 1000/*
d3cc8b15
WA
1001 * Update in-core attributes. It is assumed the caller will be doing an
1002 * sa_bulk_update to push the changes out.
5484965a
BB
1003 */
1004void
1005zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
1006{
1007 xoptattr_t *xoap;
7bb1325f 1008 boolean_t update_inode = B_FALSE;
5484965a
BB
1009
1010 xoap = xva_getxoptattr(xvap);
1011 ASSERT(xoap);
1012
1013 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
1014 uint64_t times[2];
1015 ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
1016 (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
1017 &times, sizeof (times), tx);
1018 XVA_SET_RTN(xvap, XAT_CREATETIME);
1019 }
1020 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
1021 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
1022 zp->z_pflags, tx);
1023 XVA_SET_RTN(xvap, XAT_READONLY);
1024 }
1025 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
1026 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
1027 zp->z_pflags, tx);
1028 XVA_SET_RTN(xvap, XAT_HIDDEN);
1029 }
1030 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1031 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
1032 zp->z_pflags, tx);
1033 XVA_SET_RTN(xvap, XAT_SYSTEM);
1034 }
1035 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1036 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
1037 zp->z_pflags, tx);
1038 XVA_SET_RTN(xvap, XAT_ARCHIVE);
1039 }
1040 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1041 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
1042 zp->z_pflags, tx);
1043 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
64c688d7 1044
7bb1325f 1045 update_inode = B_TRUE;
5484965a
BB
1046 }
1047 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1048 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
1049 zp->z_pflags, tx);
1050 XVA_SET_RTN(xvap, XAT_NOUNLINK);
1051 }
1052 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1053 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
1054 zp->z_pflags, tx);
1055 XVA_SET_RTN(xvap, XAT_APPENDONLY);
64c688d7 1056
7bb1325f 1057 update_inode = B_TRUE;
5484965a
BB
1058 }
1059 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1060 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
1061 zp->z_pflags, tx);
1062 XVA_SET_RTN(xvap, XAT_NODUMP);
1063 }
1064 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1065 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
1066 zp->z_pflags, tx);
1067 XVA_SET_RTN(xvap, XAT_OPAQUE);
1068 }
1069 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1070 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
1071 xoap->xoa_av_quarantined, zp->z_pflags, tx);
1072 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1073 }
1074 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1075 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
1076 zp->z_pflags, tx);
1077 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1078 }
1079 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
1080 zfs_sa_set_scanstamp(zp, xvap, tx);
1081 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
1082 }
1083 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1084 ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
1085 zp->z_pflags, tx);
1086 XVA_SET_RTN(xvap, XAT_REPARSE);
1087 }
1088 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1089 ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
1090 zp->z_pflags, tx);
1091 XVA_SET_RTN(xvap, XAT_OFFLINE);
1092 }
1093 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1094 ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
1095 zp->z_pflags, tx);
1096 XVA_SET_RTN(xvap, XAT_SPARSE);
1097 }
9c5167d1
NF
1098 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
1099 ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
1100 zp->z_pflags, tx);
1101 XVA_SET_RTN(xvap, XAT_PROJINHERIT);
1102 }
7bb1325f
CC
1103
1104 if (update_inode)
1105 zfs_set_inode_flags(zp, ZTOI(zp));
5484965a
BB
1106}
1107
34dc7c2f 1108int
0037b49e 1109zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
34dc7c2f
BB
1110{
1111 dmu_object_info_t doi;
1112 dmu_buf_t *db;
1113 znode_t *zp;
c96c36fa 1114 znode_hold_t *zh;
34dc7c2f 1115 int err;
428870ff 1116 sa_handle_t *hdl;
34dc7c2f
BB
1117
1118 *zpp = NULL;
1119
6f9548c4 1120again:
0037b49e 1121 zh = zfs_znode_hold_enter(zfsvfs, obj_num);
34dc7c2f 1122
0037b49e 1123 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
34dc7c2f 1124 if (err) {
0037b49e 1125 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1126 return (err);
1127 }
1128
1129 dmu_object_info_from_db(db, &doi);
428870ff
BB
1130 if (doi.doi_bonus_type != DMU_OT_SA &&
1131 (doi.doi_bonus_type != DMU_OT_ZNODE ||
1132 (doi.doi_bonus_type == DMU_OT_ZNODE &&
1133 doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1134 sa_buf_rele(db, NULL);
0037b49e 1135 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1136 return (SET_ERROR(EINVAL));
34dc7c2f
BB
1137 }
1138
428870ff
BB
1139 hdl = dmu_buf_get_user(db);
1140 if (hdl != NULL) {
36df2843 1141 zp = sa_get_userdata(hdl);
34dc7c2f 1142
8ac67298 1143
34dc7c2f 1144 /*
428870ff
BB
1145 * Since "SA" does immediate eviction we
1146 * should never find a sa handle that doesn't
1147 * know about the znode.
34dc7c2f 1148 */
428870ff
BB
1149
1150 ASSERT3P(zp, !=, NULL);
1151
1152 mutex_enter(&zp->z_lock);
34dc7c2f 1153 ASSERT3U(zp->z_id, ==, obj_num);
98701490
CC
1154 /*
1155 * If igrab() returns NULL the VFS has independently
1156 * determined the inode should be evicted and has
1157 * called iput_final() to start the eviction process.
1158 * The SA handle is still valid but because the VFS
1159 * requires that the eviction succeed we must drop
1160 * our locks and references to allow the eviction to
1161 * complete. The zfs_zget() may then be retried.
1162 *
1163 * This unlikely case could be optimized by registering
1164 * a sops->drop_inode() callback. The callback would
1165 * need to detect the active SA hold thereby informing
1166 * the VFS that this inode should not be evicted.
1167 */
1168 if (igrab(ZTOI(zp)) == NULL) {
1169 mutex_exit(&zp->z_lock);
1170 sa_buf_rele(db, NULL);
0037b49e 1171 zfs_znode_hold_exit(zfsvfs, zh);
98701490
CC
1172 /* inode might need this to finish evict */
1173 cond_resched();
1174 goto again;
34dc7c2f 1175 }
98701490
CC
1176 *zpp = zp;
1177 err = 0;
34dc7c2f 1178 mutex_exit(&zp->z_lock);
f3ad9cd6 1179 sa_buf_rele(db, NULL);
0037b49e 1180 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1181 return (err);
1182 }
1183
1184 /*
3558fd73 1185 * Not found create new znode/vnode but only if file exists.
428870ff
BB
1186 *
1187 * There is a small window where zfs_vget() could
1188 * find this object while a file create is still in
1189 * progress. This is checked for in zfs_znode_alloc()
1190 *
1191 * if zfs_znode_alloc() fails it will drop the hold on the
1192 * bonus buffer.
34dc7c2f 1193 */
0037b49e 1194 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
31b6111f 1195 doi.doi_bonus_type, obj_num, NULL);
428870ff 1196 if (zp == NULL) {
2e528b49 1197 err = SET_ERROR(ENOENT);
428870ff
BB
1198 } else {
1199 *zpp = zp;
1200 }
0037b49e 1201 zfs_znode_hold_exit(zfsvfs, zh);
428870ff 1202 return (err);
34dc7c2f
BB
1203}
1204
1205int
1206zfs_rezget(znode_t *zp)
1207{
0037b49e 1208 zfsvfs_t *zfsvfs = ZTOZSB(zp);
34dc7c2f
BB
1209 dmu_object_info_t doi;
1210 dmu_buf_t *db;
1211 uint64_t obj_num = zp->z_id;
428870ff 1212 uint64_t mode;
dfbc8630 1213 uint64_t links;
9f5f0019 1214 sa_bulk_attr_t bulk[10];
34dc7c2f 1215 int err;
428870ff
BB
1216 int count = 0;
1217 uint64_t gen;
2c6abf15 1218 uint64_t z_uid, z_gid;
9f5f0019 1219 uint64_t atime[2], mtime[2], ctime[2];
9c5167d1 1220 uint64_t projid = ZFS_DEFAULT_PROJID;
c96c36fa 1221 znode_hold_t *zh;
34dc7c2f 1222
cbecb4fb
CC
1223 /*
1224 * skip ctldir, otherwise they will always get invalidated. This will
1225 * cause funny behaviour for the mounted snapdirs. Especially for
1226 * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent
1227 * anyone automount it again as long as someone is still using the
1228 * detached mount.
1229 */
1230 if (zp->z_is_ctldir)
1231 return (0);
1232
0037b49e 1233 zh = zfs_znode_hold_enter(zfsvfs, obj_num);
34dc7c2f 1234
428870ff
BB
1235 mutex_enter(&zp->z_acl_lock);
1236 if (zp->z_acl_cached) {
1237 zfs_acl_free(zp->z_acl_cached);
1238 zp->z_acl_cached = NULL;
1239 }
428870ff 1240 mutex_exit(&zp->z_acl_lock);
7b3e34ba 1241
228b461b 1242 rw_enter(&zp->z_xattr_lock, RW_WRITER);
7b3e34ba
BB
1243 if (zp->z_xattr_cached) {
1244 nvlist_free(zp->z_xattr_cached);
1245 zp->z_xattr_cached = NULL;
1246 }
7b3e34ba
BB
1247 rw_exit(&zp->z_xattr_lock);
1248
428870ff 1249 ASSERT(zp->z_sa_hdl == NULL);
0037b49e 1250 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
34dc7c2f 1251 if (err) {
0037b49e 1252 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1253 return (err);
1254 }
1255
1256 dmu_object_info_from_db(db, &doi);
428870ff
BB
1257 if (doi.doi_bonus_type != DMU_OT_SA &&
1258 (doi.doi_bonus_type != DMU_OT_ZNODE ||
1259 (doi.doi_bonus_type == DMU_OT_ZNODE &&
1260 doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1261 sa_buf_rele(db, NULL);
0037b49e 1262 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1263 return (SET_ERROR(EINVAL));
34dc7c2f
BB
1264 }
1265
0037b49e 1266 zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
428870ff
BB
1267
1268 /* reload cached values */
0037b49e 1269 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
428870ff 1270 &gen, sizeof (gen));
0037b49e 1271 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
428870ff 1272 &zp->z_size, sizeof (zp->z_size));
0037b49e 1273 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
dfbc8630 1274 &links, sizeof (links));
0037b49e 1275 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
428870ff 1276 &zp->z_pflags, sizeof (zp->z_pflags));
0037b49e 1277 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2c6abf15 1278 &z_uid, sizeof (z_uid));
0037b49e 1279 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
2c6abf15 1280 &z_gid, sizeof (z_gid));
0037b49e 1281 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
428870ff 1282 &mode, sizeof (mode));
0037b49e 1283 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
9f5f0019 1284 &atime, 16);
0037b49e 1285 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
9f5f0019 1286 &mtime, 16);
0037b49e 1287 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
9f5f0019 1288 &ctime, 16);
428870ff 1289
428870ff
BB
1290 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
1291 zfs_znode_dmu_fini(zp);
0037b49e 1292 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1293 return (SET_ERROR(EIO));
428870ff
BB
1294 }
1295
9c5167d1
NF
1296 if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
1297 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
1298 &projid, 8);
1299 if (err != 0 && err != ENOENT) {
1300 zfs_znode_dmu_fini(zp);
1301 zfs_znode_hold_exit(zfsvfs, zh);
1302 return (SET_ERROR(err));
1303 }
1304 }
1305
1306 zp->z_projid = projid;
12fa7f34 1307 zp->z_mode = ZTOI(zp)->i_mode = mode;
2c6abf15
NB
1308 zfs_uid_write(ZTOI(zp), z_uid);
1309 zfs_gid_write(ZTOI(zp), z_gid);
572e2857 1310
9f5f0019
NB
1311 ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
1312 ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
1313 ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);
1314
278f2236 1315 if (gen != ZTOI(zp)->i_generation) {
428870ff 1316 zfs_znode_dmu_fini(zp);
0037b49e 1317 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1318 return (SET_ERROR(EIO));
34dc7c2f
BB
1319 }
1320
dfbc8630 1321 set_nlink(ZTOI(zp), (uint32_t)links);
7bb1325f 1322 zfs_set_inode_flags(zp, ZTOI(zp));
dfbc8630 1323
34dc7c2f 1324 zp->z_blksz = doi.doi_data_block_size;
704cd075 1325 zp->z_atime_dirty = 0;
9f5f0019 1326 zfs_inode_update(zp);
34dc7c2f 1327
6a218566
AG
1328 /*
1329 * If the file has zero links, then it has been unlinked on the send
1330 * side and it must be in the received unlinked set.
1331 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
1332 * stale data and to prevent automatical removal of the file in
1333 * zfs_zinactive(). The file will be removed either when it is removed
1334 * on the send side and the next incremental stream is received or
1335 * when the unlinked set gets processed.
1336 */
1337 zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
1338 if (zp->z_unlinked)
1339 zfs_znode_dmu_fini(zp);
1340
0037b49e 1341 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1342
1343 return (0);
1344}
1345
1346void
1347zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1348{
0037b49e
BB
1349 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1350 objset_t *os = zfsvfs->z_os;
34dc7c2f 1351 uint64_t obj = zp->z_id;
572e2857 1352 uint64_t acl_obj = zfs_external_acl(zp);
c96c36fa 1353 znode_hold_t *zh;
34dc7c2f 1354
0037b49e 1355 zh = zfs_znode_hold_enter(zfsvfs, obj);
572e2857
BB
1356 if (acl_obj) {
1357 VERIFY(!zp->z_is_sa);
b128c09f 1358 VERIFY(0 == dmu_object_free(os, acl_obj, tx));
572e2857 1359 }
b128c09f 1360 VERIFY(0 == dmu_object_free(os, obj, tx));
34dc7c2f 1361 zfs_znode_dmu_fini(zp);
0037b49e 1362 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1363}
1364
1365void
1366zfs_zinactive(znode_t *zp)
1367{
0037b49e 1368 zfsvfs_t *zfsvfs = ZTOZSB(zp);
34dc7c2f 1369 uint64_t z_id = zp->z_id;
c96c36fa 1370 znode_hold_t *zh;
34dc7c2f 1371
428870ff 1372 ASSERT(zp->z_sa_hdl);
34dc7c2f
BB
1373
1374 /*
d6bd8eaa 1375 * Don't allow a zfs_zget() while were trying to release this znode.
34dc7c2f 1376 */
0037b49e 1377 zh = zfs_znode_hold_enter(zfsvfs, z_id);
d6bd8eaa 1378
34dc7c2f 1379 mutex_enter(&zp->z_lock);
34dc7c2f
BB
1380
1381 /*
6a218566
AG
1382 * If this was the last reference to a file with no links, remove
1383 * the file from the file system unless the file system is mounted
1384 * read-only. That can happen, for example, if the file system was
1385 * originally read-write, the file was opened, then unlinked and
1386 * the file system was made read-only before the file was finally
1387 * closed. The file will remain in the unlinked set.
34dc7c2f
BB
1388 */
1389 if (zp->z_unlinked) {
6a218566
AG
1390 ASSERT(!zfsvfs->z_issnap);
1391 if (!zfs_is_readonly(zfsvfs)) {
1392 mutex_exit(&zp->z_lock);
1393 zfs_znode_hold_exit(zfsvfs, zh);
1394 zfs_rmnode(zp);
1395 return;
1396 }
34dc7c2f 1397 }
428870ff 1398
34dc7c2f
BB
1399 mutex_exit(&zp->z_lock);
1400 zfs_znode_dmu_fini(zp);
d6bd8eaa 1401
0037b49e 1402 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1403}
1404
6d111134
TC
/*
 * Three-way comparison of two timespecs.
 *
 * Seconds are compared first and nanoseconds only break a tie.
 *
 * RETURN:	-1 if t1 is earlier than t2, 1 if t1 is later than t2,
 *		0 if both are equal.
 *
 * The nanosecond fields are compared rather than subtracted: tv_nsec is a
 * long, so returning the difference as an int could overflow or be
 * truncated and report the wrong ordering.
 */
static inline int
zfs_compare_timespec(struct timespec *t1, struct timespec *t2)
{
	if (t1->tv_sec < t2->tv_sec)
		return (-1);

	if (t1->tv_sec > t2->tv_sec)
		return (1);

	if (t1->tv_nsec < t2->tv_nsec)
		return (-1);

	if (t1->tv_nsec > t2->tv_nsec)
		return (1);

	return (0);
}
1416
6d111134
TC
1417/*
1418 * Prepare to update znode time stamps.
1419 *
1420 * IN: zp - znode requiring timestamp update
0df9673f 1421 * flag - ATTR_MTIME, ATTR_CTIME flags
6d111134 1422 *
0df9673f 1423 * OUT: zp - z_seq
6d111134
TC
1424 * mtime - new mtime
1425 * ctime - new ctime
1426 *
0df9673f
CC
1427 * Note: We don't update atime here, because we rely on Linux VFS to do
1428 * atime updating.
6d111134 1429 */
34dc7c2f 1430void
428870ff 1431zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
0df9673f 1432 uint64_t ctime[2])
34dc7c2f 1433{
6413c95f 1434 inode_timespec_t now;
34dc7c2f 1435
34dc7c2f
BB
1436 gethrestime(&now);
1437
0df9673f 1438 zp->z_seq++;
34dc7c2f 1439
3558fd73 1440 if (flag & ATTR_MTIME) {
428870ff 1441 ZFS_TIME_ENCODE(&now, mtime);
9f5f0019 1442 ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
3558fd73 1443 if (ZTOZSB(zp)->z_use_fuids) {
428870ff
BB
1444 zp->z_pflags |= (ZFS_ARCHIVE |
1445 ZFS_AV_MODIFIED);
1446 }
34dc7c2f
BB
1447 }
1448
3558fd73 1449 if (flag & ATTR_CTIME) {
428870ff 1450 ZFS_TIME_ENCODE(&now, ctime);
9f5f0019 1451 ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime);
3558fd73 1452 if (ZTOZSB(zp)->z_use_fuids)
428870ff 1453 zp->z_pflags |= ZFS_ARCHIVE;
34dc7c2f
BB
1454 }
1455}
1456
34dc7c2f
BB
1457/*
1458 * Grow the block size for a file.
1459 *
1460 * IN: zp - znode of file to free data in.
1461 * size - requested block size
1462 * tx - open transaction.
1463 *
1464 * NOTE: this function assumes that the znode is write locked.
1465 */
1466void
1467zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1468{
1469 int error;
1470 u_longlong_t dummy;
1471
1472 if (size <= zp->z_blksz)
1473 return;
1474 /*
1475 * If the file size is already greater than the current blocksize,
1476 * we will not grow. If there is more than one block in a file,
1477 * the blocksize cannot change.
1478 */
428870ff 1479 if (zp->z_blksz && zp->z_size > zp->z_blksz)
34dc7c2f
BB
1480 return;
1481
3558fd73 1482 error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
34dc7c2f 1483 size, 0, tx);
428870ff 1484
34dc7c2f
BB
1485 if (error == ENOTSUP)
1486 return;
c99c9001 1487 ASSERT0(error);
34dc7c2f
BB
1488
1489 /* What blocksize did we actually get? */
428870ff 1490 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
34dc7c2f
BB
1491}
1492
34dc7c2f 1493/*
b128c09f 1494 * Increase the file length
34dc7c2f
BB
1495 *
1496 * IN: zp - znode of file to free data in.
b128c09f 1497 * end - new end-of-file
34dc7c2f 1498 *
19d55079 1499 * RETURN: 0 on success, error code on failure
34dc7c2f 1500 */
b128c09f
BB
1501static int
1502zfs_extend(znode_t *zp, uint64_t end)
34dc7c2f 1503{
0037b49e 1504 zfsvfs_t *zfsvfs = ZTOZSB(zp);
b128c09f 1505 dmu_tx_t *tx;
5d43cc9a 1506 locked_range_t *lr;
b128c09f 1507 uint64_t newblksz;
34dc7c2f
BB
1508 int error;
1509
34dc7c2f 1510 /*
b128c09f 1511 * We will change zp_size, lock the whole file.
34dc7c2f 1512 */
5d43cc9a 1513 lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
34dc7c2f
BB
1514
1515 /*
1516 * Nothing to do if file already at desired length.
1517 */
428870ff 1518 if (end <= zp->z_size) {
5d43cc9a 1519 rangelock_exit(lr);
34dc7c2f
BB
1520 return (0);
1521 }
0037b49e 1522 tx = dmu_tx_create(zfsvfs->z_os);
428870ff
BB
1523 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1524 zfs_sa_upgrade_txholds(tx, zp);
b128c09f 1525 if (end > zp->z_blksz &&
0037b49e 1526 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
34dc7c2f
BB
1527 /*
1528 * We are growing the file past the current block size.
1529 */
3558fd73 1530 if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
f1512ee6
MA
1531 /*
1532 * File's blocksize is already larger than the
1533 * "recordsize" property. Only let it grow to
1534 * the next power of 2.
1535 */
34dc7c2f 1536 ASSERT(!ISP2(zp->z_blksz));
f1512ee6 1537 newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
34dc7c2f 1538 } else {
3558fd73 1539 newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
34dc7c2f 1540 }
b128c09f
BB
1541 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1542 } else {
1543 newblksz = 0;
34dc7c2f
BB
1544 }
1545
384f8a09 1546 error = dmu_tx_assign(tx, TXG_WAIT);
34dc7c2f 1547 if (error) {
34dc7c2f 1548 dmu_tx_abort(tx);
5d43cc9a 1549 rangelock_exit(lr);
34dc7c2f
BB
1550 return (error);
1551 }
1552
b128c09f
BB
1553 if (newblksz)
1554 zfs_grow_blocksize(zp, newblksz, tx);
34dc7c2f 1555
428870ff
BB
1556 zp->z_size = end;
1557
3558fd73 1558 VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
428870ff 1559 &zp->z_size, sizeof (zp->z_size), tx));
34dc7c2f 1560
5d43cc9a 1561 rangelock_exit(lr);
34dc7c2f 1562
b128c09f 1563 dmu_tx_commit(tx);
34dc7c2f 1564
b128c09f
BB
1565 return (0);
1566}
1567
223df016
TC
1568/*
1569 * zfs_zero_partial_page - Modeled after update_pages() but
1570 * with different arguments and semantics for use by zfs_freesp().
1571 *
1572 * Zeroes a piece of a single page cache entry for zp at offset
1573 * start and length len.
1574 *
1575 * Caller must acquire a range lock on the file for the region
1576 * being zeroed in order that the ARC and page cache stay in sync.
1577 */
1578static void
1579zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
1580{
1581 struct address_space *mp = ZTOI(zp)->i_mapping;
1582 struct page *pp;
1583 int64_t off;
1584 void *pb;
1585
8b1899d3 1586 ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));
223df016 1587
8b1899d3
BB
1588 off = start & (PAGE_SIZE - 1);
1589 start &= PAGE_MASK;
223df016 1590
8b1899d3 1591 pp = find_lock_page(mp, start >> PAGE_SHIFT);
223df016
TC
1592 if (pp) {
1593 if (mapping_writably_mapped(mp))
1594 flush_dcache_page(pp);
1595
1596 pb = kmap(pp);
1597 bzero(pb + off, len);
1598 kunmap(pp);
1599
1600 if (mapping_writably_mapped(mp))
1601 flush_dcache_page(pp);
1602
1603 mark_page_accessed(pp);
1604 SetPageUptodate(pp);
1605 ClearPageError(pp);
1606 unlock_page(pp);
8b1899d3 1607 put_page(pp);
223df016
TC
1608 }
1609}
1610
b128c09f
BB
1611/*
1612 * Free space in a file.
1613 *
1614 * IN: zp - znode of file to free data in.
1615 * off - start of section to free.
1616 * len - length of section to free.
1617 *
19d55079 1618 * RETURN: 0 on success, error code on failure
b128c09f
BB
1619 */
1620static int
1621zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1622{
0037b49e 1623 zfsvfs_t *zfsvfs = ZTOZSB(zp);
5d43cc9a 1624 locked_range_t *lr;
b128c09f
BB
1625 int error;
1626
1627 /*
1628 * Lock the range being freed.
1629 */
5d43cc9a 1630 lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
b128c09f
BB
1631
1632 /*
1633 * Nothing to do if file already at desired length.
1634 */
428870ff 1635 if (off >= zp->z_size) {
5d43cc9a 1636 rangelock_exit(lr);
b128c09f 1637 return (0);
34dc7c2f
BB
1638 }
1639
428870ff
BB
1640 if (off + len > zp->z_size)
1641 len = zp->z_size - off;
b128c09f 1642
0037b49e 1643 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
b128c09f 1644
223df016
TC
1645 /*
1646 * Zero partial page cache entries. This must be done under a
1647 * range lock in order to keep the ARC and page cache in sync.
1648 */
1649 if (zp->z_is_mapped) {
1650 loff_t first_page, last_page, page_len;
1651 loff_t first_page_offset, last_page_offset;
1652
1653 /* first possible full page in hole */
8b1899d3 1654 first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
223df016 1655 /* last page of hole */
8b1899d3 1656 last_page = (off + len) >> PAGE_SHIFT;
223df016
TC
1657
1658 /* offset of first_page */
8b1899d3 1659 first_page_offset = first_page << PAGE_SHIFT;
223df016 1660 /* offset of last_page */
8b1899d3 1661 last_page_offset = last_page << PAGE_SHIFT;
223df016 1662
cb08f063
TC
1663 /* truncate whole pages */
1664 if (last_page_offset > first_page_offset) {
1665 truncate_inode_pages_range(ZTOI(zp)->i_mapping,
1666 first_page_offset, last_page_offset - 1);
1667 }
1668
1669 /* truncate sub-page ranges */
223df016
TC
1670 if (first_page > last_page) {
1671 /* entire punched area within a single page */
1672 zfs_zero_partial_page(zp, off, len);
1673 } else {
1674 /* beginning of punched area at the end of a page */
1675 page_len = first_page_offset - off;
1676 if (page_len > 0)
1677 zfs_zero_partial_page(zp, off, page_len);
1678
1679 /* end of punched area at the beginning of a page */
1680 page_len = off + len - last_page_offset;
1681 if (page_len > 0)
1682 zfs_zero_partial_page(zp, last_page_offset,
1683 page_len);
1684 }
1685 }
5d43cc9a 1686 rangelock_exit(lr);
34dc7c2f 1687
b128c09f
BB
1688 return (error);
1689}
1690
1691/*
1692 * Truncate a file
1693 *
1694 * IN: zp - znode of file to free data in.
1695 * end - new end-of-file.
1696 *
19d55079 1697 * RETURN: 0 on success, error code on failure
b128c09f
BB
1698 */
1699static int
1700zfs_trunc(znode_t *zp, uint64_t end)
1701{
0037b49e 1702 zfsvfs_t *zfsvfs = ZTOZSB(zp);
b128c09f 1703 dmu_tx_t *tx;
5d43cc9a 1704 locked_range_t *lr;
b128c09f 1705 int error;
572e2857
BB
1706 sa_bulk_attr_t bulk[2];
1707 int count = 0;
b128c09f
BB
1708
1709 /*
1710 * We will change zp_size, lock the whole file.
1711 */
5d43cc9a 1712 lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
b128c09f
BB
1713
1714 /*
1715 * Nothing to do if file already at desired length.
1716 */
428870ff 1717 if (end >= zp->z_size) {
5d43cc9a 1718 rangelock_exit(lr);
b128c09f
BB
1719 return (0);
1720 }
1721
18a2485f
FS
1722 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
1723 DMU_OBJECT_END);
b128c09f 1724 if (error) {
5d43cc9a 1725 rangelock_exit(lr);
b128c09f
BB
1726 return (error);
1727 }
0037b49e 1728 tx = dmu_tx_create(zfsvfs->z_os);
428870ff
BB
1729 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1730 zfs_sa_upgrade_txholds(tx, zp);
19d55079 1731 dmu_tx_mark_netfree(tx);
7a8f0e80 1732 error = dmu_tx_assign(tx, TXG_WAIT);
b128c09f 1733 if (error) {
b128c09f 1734 dmu_tx_abort(tx);
5d43cc9a 1735 rangelock_exit(lr);
b128c09f
BB
1736 return (error);
1737 }
b128c09f 1738
428870ff 1739 zp->z_size = end;
0037b49e 1740 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
572e2857 1741 NULL, &zp->z_size, sizeof (zp->z_size));
428870ff 1742
572e2857
BB
1743 if (end == 0) {
1744 zp->z_pflags &= ~ZFS_SPARSE;
0037b49e 1745 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
572e2857
BB
1746 NULL, &zp->z_pflags, 8);
1747 }
1748 VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
b128c09f 1749
34dc7c2f 1750 dmu_tx_commit(tx);
5d43cc9a 1751 rangelock_exit(lr);
34dc7c2f
BB
1752
1753 return (0);
1754}
1755
b128c09f
BB
1756/*
1757 * Free space in a file
1758 *
1759 * IN: zp - znode of file to free data in.
1760 * off - start of range
1761 * len - end of range (0 => EOF)
1762 * flag - current file open mode flags.
1763 * log - TRUE if this action should be logged
1764 *
19d55079 1765 * RETURN: 0 on success, error code on failure
b128c09f
BB
1766 */
1767int
1768zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1769{
b128c09f 1770 dmu_tx_t *tx;
0037b49e
BB
1771 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1772 zilog_t *zilog = zfsvfs->z_log;
428870ff
BB
1773 uint64_t mode;
1774 uint64_t mtime[2], ctime[2];
1775 sa_bulk_attr_t bulk[3];
1776 int count = 0;
b128c09f
BB
1777 int error;
1778
0037b49e 1779 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
428870ff
BB
1780 sizeof (mode))) != 0)
1781 return (error);
1782
1783 if (off > zp->z_size) {
b128c09f
BB
1784 error = zfs_extend(zp, off+len);
1785 if (error == 0 && log)
1786 goto log;
223df016 1787 goto out;
b128c09f
BB
1788 }
1789
b128c09f
BB
1790 if (len == 0) {
1791 error = zfs_trunc(zp, off);
1792 } else {
1793 if ((error = zfs_free_range(zp, off, len)) == 0 &&
428870ff 1794 off + len > zp->z_size)
b128c09f
BB
1795 error = zfs_extend(zp, off+len);
1796 }
1797 if (error || !log)
223df016 1798 goto out;
b128c09f 1799log:
0037b49e 1800 tx = dmu_tx_create(zfsvfs->z_os);
428870ff
BB
1801 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1802 zfs_sa_upgrade_txholds(tx, zp);
384f8a09 1803 error = dmu_tx_assign(tx, TXG_WAIT);
b128c09f 1804 if (error) {
b128c09f 1805 dmu_tx_abort(tx);
223df016 1806 goto out;
b128c09f
BB
1807 }
1808
0037b49e
BB
1809 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
1810 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
1811 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
428870ff 1812 NULL, &zp->z_pflags, 8);
0df9673f 1813 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
428870ff
BB
1814 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1815 ASSERT(error == 0);
1816
b128c09f
BB
1817 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1818
1819 dmu_tx_commit(tx);
223df016 1820
960e08fe 1821 zfs_inode_update(zp);
223df016
TC
1822 error = 0;
1823
1824out:
1825 /*
1826 * Truncate the page cache - for file truncate operations, use
1827 * the purpose-built API for truncations. For punching operations,
cb08f063 1828 * the truncation is handled under a range lock in zfs_free_range.
223df016
TC
1829 */
1830 if (len == 0)
1831 truncate_setsize(ZTOI(zp), off);
223df016 1832 return (error);
b128c09f
BB
1833}
1834
34dc7c2f
BB
1835void
1836zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1837{
22872ff5 1838 struct super_block *sb;
0037b49e 1839 zfsvfs_t *zfsvfs;
428870ff 1840 uint64_t moid, obj, sa_obj, version;
22872ff5 1841 uint64_t sense = ZFS_CASE_SENSITIVE;
34dc7c2f
BB
1842 uint64_t norm = 0;
1843 nvpair_t *elem;
c96c36fa 1844 int size;
34dc7c2f 1845 int error;
22872ff5
BB
1846 int i;
1847 znode_t *rootzp = NULL;
1848 vattr_t vattr;
1849 znode_t *zp;
1850 zfs_acl_ids_t acl_ids;
34dc7c2f
BB
1851
1852 /*
1853 * First attempt to create master node.
1854 */
1855 /*
1856 * In an empty objset, there are no blocks to read and thus
1857 * there can be no i/o errors (which we assert below).
1858 */
1859 moid = MASTER_NODE_OBJ;
1860 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1861 DMU_OT_NONE, 0, tx);
1862 ASSERT(error == 0);
1863
1864 /*
1865 * Set starting attributes.
1866 */
428870ff 1867 version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
34dc7c2f
BB
1868 elem = NULL;
1869 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1870 /* For the moment we expect all zpl props to be uint64_ts */
1871 uint64_t val;
1872 char *name;
1873
1874 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1875 VERIFY(nvpair_value_uint64(elem, &val) == 0);
1876 name = nvpair_name(elem);
1877 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
9babb374
BB
1878 if (val < version)
1879 version = val;
34dc7c2f
BB
1880 } else {
1881 error = zap_update(os, moid, name, 8, 1, &val, tx);
1882 }
1883 ASSERT(error == 0);
1884 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1885 norm = val;
22872ff5
BB
1886 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1887 sense = val;
34dc7c2f
BB
1888 }
1889 ASSERT(version != 0);
9babb374 1890 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
34dc7c2f 1891
428870ff
BB
1892 /*
1893 * Create zap object used for SA attribute registration
1894 */
1895
1896 if (version >= ZPL_VERSION_SA) {
1897 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
1898 DMU_OT_NONE, 0, tx);
1899 error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
1900 ASSERT(error == 0);
1901 } else {
1902 sa_obj = 0;
1903 }
34dc7c2f
BB
1904 /*
1905 * Create a delete queue.
1906 */
9babb374 1907 obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
34dc7c2f 1908
9babb374 1909 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
34dc7c2f
BB
1910 ASSERT(error == 0);
1911
9babb374 1912 /*
0037b49e 1913 * Create root znode. Create minimal znode/inode/zfsvfs/sb
22872ff5 1914 * to allow zfs_mknode to work.
9babb374 1915 */
22872ff5
BB
1916 vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
1917 vattr.va_mode = S_IFDIR|0755;
1918 vattr.va_uid = crgetuid(cr);
1919 vattr.va_gid = crgetgid(cr);
1920
79c76d5b 1921 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
22872ff5
BB
1922 rootzp->z_moved = 0;
1923 rootzp->z_unlinked = 0;
1924 rootzp->z_atime_dirty = 0;
1925 rootzp->z_is_sa = USE_SA(version, os);
9c5167d1 1926 rootzp->z_pflags = 0;
22872ff5 1927
0037b49e
BB
1928 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1929 zfsvfs->z_os = os;
1930 zfsvfs->z_parent = zfsvfs;
1931 zfsvfs->z_version = version;
1932 zfsvfs->z_use_fuids = USE_FUIDS(version, os);
1933 zfsvfs->z_use_sa = USE_SA(version, os);
1934 zfsvfs->z_norm = norm;
22872ff5 1935
79c76d5b 1936 sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
0037b49e 1937 sb->s_fs_info = zfsvfs;
22872ff5
BB
1938
1939 ZTOI(rootzp)->i_sb = sb;
1940
1941 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
0037b49e 1942 &zfsvfs->z_attr_table);
9babb374 1943
22872ff5 1944 ASSERT(error == 0);
9babb374 1945
60101509 1946 /*
22872ff5
BB
1947 * Fold case on file systems that are always or sometimes case
1948 * insensitive.
60101509 1949 */
22872ff5 1950 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
0037b49e 1951 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
60101509 1952
0037b49e
BB
1953 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1954 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
22872ff5 1955 offsetof(znode_t, z_link_node));
60101509 1956
c96c36fa 1957 size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
0037b49e
BB
1958 zfsvfs->z_hold_size = size;
1959 zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
1960 KM_SLEEP);
1961 zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
c96c36fa 1962 for (i = 0; i != size; i++) {
0037b49e 1963 avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
c96c36fa 1964 sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
0037b49e 1965 mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
c96c36fa 1966 }
60101509 1967
22872ff5
BB
1968 VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
1969 cr, NULL, &acl_ids));
1970 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
1971 ASSERT3P(zp, ==, rootzp);
1972 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1973 ASSERT(error == 0);
1974 zfs_acl_ids_free(&acl_ids);
60101509 1975
22872ff5
BB
1976 atomic_set(&ZTOI(rootzp)->i_count, 0);
1977 sa_handle_destroy(rootzp->z_sa_hdl);
22872ff5
BB
1978 kmem_cache_free(znode_cache, rootzp);
1979
1980 /*
1981 * Create shares directory
1982 */
0037b49e 1983 error = zfs_create_share_dir(zfsvfs, tx);
9babb374 1984 ASSERT(error == 0);
428870ff 1985
c96c36fa 1986 for (i = 0; i != size; i++) {
0037b49e
BB
1987 avl_destroy(&zfsvfs->z_hold_trees[i]);
1988 mutex_destroy(&zfsvfs->z_hold_locks[i]);
c96c36fa 1989 }
2708f716 1990
c17486b2
GN
1991 mutex_destroy(&zfsvfs->z_znodes_lock);
1992
0037b49e
BB
1993 vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
1994 vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
2708f716 1995 kmem_free(sb, sizeof (struct super_block));
0037b49e 1996 kmem_free(zfsvfs, sizeof (zfsvfs_t));
34dc7c2f 1997}
34dc7c2f 1998#endif /* _KERNEL */
428870ff 1999
34dc7c2f 2000static int
572e2857
BB
2001zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
2002{
2003 uint64_t sa_obj = 0;
2004 int error;
2005
2006 error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
2007 if (error != 0 && error != ENOENT)
2008 return (error);
2009
2010 error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
2011 return (error);
2012}
2013
2014static int
2015zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
7b8518cb 2016 dmu_buf_t **db, void *tag)
34dc7c2f 2017{
34dc7c2f 2018 dmu_object_info_t doi;
34dc7c2f 2019 int error;
428870ff 2020
7b8518cb 2021 if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
34dc7c2f
BB
2022 return (error);
2023
572e2857 2024 dmu_object_info_from_db(*db, &doi);
428870ff
BB
2025 if ((doi.doi_bonus_type != DMU_OT_SA &&
2026 doi.doi_bonus_type != DMU_OT_ZNODE) ||
d6320ddb
BB
2027 (doi.doi_bonus_type == DMU_OT_ZNODE &&
2028 doi.doi_bonus_size < sizeof (znode_phys_t))) {
7b8518cb 2029 sa_buf_rele(*db, tag);
2e528b49 2030 return (SET_ERROR(ENOTSUP));
34dc7c2f
BB
2031 }
2032
572e2857
BB
2033 error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
2034 if (error != 0) {
7b8518cb 2035 sa_buf_rele(*db, tag);
428870ff
BB
2036 return (error);
2037 }
2038
572e2857
BB
2039 return (0);
2040}
2041
2042void
7b8518cb 2043zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
572e2857
BB
2044{
2045 sa_handle_destroy(hdl);
7b8518cb 2046 sa_buf_rele(db, tag);
572e2857
BB
2047}
2048
2049/*
2050 * Given an object number, return its parent object number and whether
2051 * or not the object is an extended attribute directory.
2052 */
2053static int
b23ad7f3
JJ
2054zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
2055 uint64_t *pobjp, int *is_xattrdir)
572e2857
BB
2056{
2057 uint64_t parent;
2058 uint64_t pflags;
2059 uint64_t mode;
b23ad7f3 2060 uint64_t parent_mode;
572e2857 2061 sa_bulk_attr_t bulk[3];
b23ad7f3
JJ
2062 sa_handle_t *sa_hdl;
2063 dmu_buf_t *sa_db;
572e2857
BB
2064 int count = 0;
2065 int error;
2066
2067 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
2068 &parent, sizeof (parent));
428870ff 2069 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
572e2857 2070 &pflags, sizeof (pflags));
428870ff 2071 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
572e2857 2072 &mode, sizeof (mode));
428870ff 2073
572e2857 2074 if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
428870ff 2075 return (error);
572e2857 2076
b23ad7f3
JJ
2077 /*
2078 * When a link is removed its parent pointer is not changed and will
2079 * be invalid. There are two cases where a link is removed but the
2080 * file stays around, when it goes to the delete queue and when there
2081 * are additional links.
2082 */
2083 error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
2084 if (error != 0)
2085 return (error);
2086
2087 error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
2088 zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2089 if (error != 0)
2090 return (error);
2091
428870ff 2092 *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
34dc7c2f 2093
b23ad7f3
JJ
2094 /*
2095 * Extended attributes can be applied to files, directories, etc.
2096 * Otherwise the parent must be a directory.
2097 */
2098 if (!*is_xattrdir && !S_ISDIR(parent_mode))
ecb2b7dc 2099 return (SET_ERROR(EINVAL));
b23ad7f3
JJ
2100
2101 *pobjp = parent;
2102
34dc7c2f
BB
2103 return (0);
2104}
2105
572e2857
BB
2106/*
2107 * Given an object number, return some zpl level statistics
2108 */
2109static int
2110zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
2111 zfs_stat_t *sb)
34dc7c2f 2112{
572e2857
BB
2113 sa_bulk_attr_t bulk[4];
2114 int count = 0;
2115
2116 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2117 &sb->zs_mode, sizeof (sb->zs_mode));
2118 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
2119 &sb->zs_gen, sizeof (sb->zs_gen));
2120 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
2121 &sb->zs_links, sizeof (sb->zs_links));
2122 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
2123 &sb->zs_ctime, sizeof (sb->zs_ctime));
2124
2125 return (sa_bulk_lookup(hdl, bulk, count));
2126}
2127
2128static int
2129zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
2130 sa_attr_type_t *sa_table, char *buf, int len)
2131{
2132 sa_handle_t *sa_hdl;
2133 sa_handle_t *prevhdl = NULL;
2134 dmu_buf_t *prevdb = NULL;
2135 dmu_buf_t *sa_db = NULL;
34dc7c2f
BB
2136 char *path = buf + len - 1;
2137 int error;
2138
2139 *path = '\0';
572e2857 2140 sa_hdl = hdl;
428870ff 2141
64c1dcef
PD
2142 uint64_t deleteq_obj;
2143 VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
2144 ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
2145 error = zap_lookup_int(osp, deleteq_obj, obj);
2146 if (error == 0) {
2147 return (ESTALE);
2148 } else if (error != ENOENT) {
2149 return (error);
2150 }
2151 error = 0;
2152
34dc7c2f 2153 for (;;) {
17897ce2 2154 uint64_t pobj = 0;
34dc7c2f
BB
2155 char component[MAXNAMELEN + 2];
2156 size_t complen;
17897ce2 2157 int is_xattrdir = 0;
34dc7c2f 2158
572e2857 2159 if (prevdb)
7b8518cb 2160 zfs_release_sa_handle(prevhdl, prevdb, FTAG);
572e2857 2161
b23ad7f3 2162 if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
572e2857 2163 &is_xattrdir)) != 0)
34dc7c2f
BB
2164 break;
2165
2166 if (pobj == obj) {
2167 if (path[0] != '/')
2168 *--path = '/';
2169 break;
2170 }
2171
2172 component[0] = '/';
2173 if (is_xattrdir) {
2174 (void) sprintf(component + 1, "<xattrdir>");
2175 } else {
2176 error = zap_value_search(osp, pobj, obj,
2177 ZFS_DIRENT_OBJ(-1ULL), component + 1);
2178 if (error != 0)
2179 break;
2180 }
2181
2182 complen = strlen(component);
2183 path -= complen;
2184 ASSERT(path >= buf);
2185 bcopy(component, path, complen);
2186 obj = pobj;
572e2857
BB
2187
2188 if (sa_hdl != hdl) {
2189 prevhdl = sa_hdl;
2190 prevdb = sa_db;
2191 }
7b8518cb 2192 error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
572e2857
BB
2193 if (error != 0) {
2194 sa_hdl = prevhdl;
2195 sa_db = prevdb;
2196 break;
2197 }
2198 }
2199
2200 if (sa_hdl != NULL && sa_hdl != hdl) {
2201 ASSERT(sa_db != NULL);
7b8518cb 2202 zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
34dc7c2f
BB
2203 }
2204
2205 if (error == 0)
2206 (void) memmove(buf, path, buf + len - path);
428870ff 2207
34dc7c2f
BB
2208 return (error);
2209}
572e2857
BB
2210
2211int
2212zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
2213{
2214 sa_attr_type_t *sa_table;
2215 sa_handle_t *hdl;
2216 dmu_buf_t *db;
2217 int error;
2218
2219 error = zfs_sa_setup(osp, &sa_table);
2220 if (error != 0)
2221 return (error);
2222
7b8518cb 2223 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
572e2857
BB
2224 if (error != 0)
2225 return (error);
2226
2227 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2228
7b8518cb 2229 zfs_release_sa_handle(hdl, db, FTAG);
572e2857
BB
2230 return (error);
2231}
2232
2233int
2234zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
2235 char *buf, int len)
2236{
2237 char *path = buf + len - 1;
2238 sa_attr_type_t *sa_table;
2239 sa_handle_t *hdl;
2240 dmu_buf_t *db;
2241 int error;
2242
2243 *path = '\0';
2244
2245 error = zfs_sa_setup(osp, &sa_table);
2246 if (error != 0)
2247 return (error);
2248
7b8518cb 2249 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
572e2857
BB
2250 if (error != 0)
2251 return (error);
2252
2253 error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
2254 if (error != 0) {
7b8518cb 2255 zfs_release_sa_handle(hdl, db, FTAG);
572e2857
BB
2256 return (error);
2257 }
2258
2259 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2260
7b8518cb 2261 zfs_release_sa_handle(hdl, db, FTAG);
572e2857
BB
2262 return (error);
2263}
c28b2279 2264
93ce2b4c 2265#if defined(_KERNEL)
c28b2279
BB
2266EXPORT_SYMBOL(zfs_create_fs);
2267EXPORT_SYMBOL(zfs_obj_to_path);
0720116d 2268
02730c33 2269/* CSTYLED */
0720116d
BB
2270module_param(zfs_object_mutex_size, uint, 0644);
2271MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
c28b2279 2272#endif