]> git.proxmox.com Git - mirror_zfs.git/blame - module/os/linux/zfs/zfs_znode.c
Rename fallthrough to zfs_fallthrough
[mirror_zfs.git] / module / os / linux / zfs / zfs_znode.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
428870ff 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
5d43cc9a 23 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
34dc7c2f
BB
24 */
25
26/* Portions Copyright 2007 Jeremy Teo */
27
34dc7c2f
BB
28#ifdef _KERNEL
29#include <sys/types.h>
30#include <sys/param.h>
31#include <sys/time.h>
34dc7c2f 32#include <sys/sysmacros.h>
34dc7c2f 33#include <sys/mntent.h>
34dc7c2f
BB
34#include <sys/u8_textprep.h>
35#include <sys/dsl_dataset.h>
36#include <sys/vfs.h>
34dc7c2f
BB
37#include <sys/vnode.h>
38#include <sys/file.h>
39#include <sys/kmem.h>
40#include <sys/errno.h>
34dc7c2f 41#include <sys/atomic.h>
34dc7c2f
BB
42#include <sys/zfs_dir.h>
43#include <sys/zfs_acl.h>
44#include <sys/zfs_ioctl.h>
45#include <sys/zfs_rlock.h>
46#include <sys/zfs_fuid.h>
3558fd73 47#include <sys/zfs_vnops.h>
ebe7e575 48#include <sys/zfs_ctldir.h>
428870ff 49#include <sys/dnode.h>
34dc7c2f 50#include <sys/fs/zfs.h>
3558fd73 51#include <sys/zpl.h>
34dc7c2f
BB
52#endif /* _KERNEL */
53
54#include <sys/dmu.h>
f1512ee6 55#include <sys/dmu_objset.h>
50c957f7 56#include <sys/dmu_tx.h>
27d96d22 57#include <sys/zfs_refcount.h>
34dc7c2f
BB
58#include <sys/stat.h>
59#include <sys/zap.h>
60#include <sys/zfs_znode.h>
428870ff
BB
61#include <sys/sa.h>
62#include <sys/zfs_sa.h>
572e2857 63#include <sys/zfs_stat.h>
34dc7c2f
BB
64
65#include "zfs_prop.h"
428870ff 66#include "zfs_comutil.h"
34dc7c2f
BB
67
68/*
69 * Functions needed for userland (ie: libzpool) are not put under
70 * #ifdef _KERNEL; the rest of the functions have dependencies
71 * (such as VFS logic) that will not compile easily in userland.
72 */
73#ifdef _KERNEL
9babb374 74
/* Cache from which every znode_t is allocated (see zfs_inode_alloc()). */
static kmem_cache_t *znode_cache = NULL;
/* Cache of znode_hold_t records used by zfs_znode_hold_enter()/_exit(). */
static kmem_cache_t *znode_hold_cache = NULL;
/*
 * Initialised to ZFS_OBJ_MTX_SZ; not referenced elsewhere in this part of
 * the file.  NOTE(review): presumably sizes the z_hold_locks/z_hold_trees
 * arrays indexed through ZFS_OBJ_HASH() -- confirm against its users.
 */
unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
34dc7c2f 78
dcec0a12
AP
79/*
80 * This is used by the test suite so that it can delay znodes from being
81 * freed in order to inspect the unlinked set.
82 */
18168da7 83static int zfs_unlink_suspend_progress = 0;
dcec0a12 84
5d43cc9a
MA
85/*
86 * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
87 * z_rangelock. It will modify the offset and length of the lock to reflect
88 * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
89 * called with the rangelock_t's rl_lock held, which avoids races.
90 */
91static void
bd4dde8e 92zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
5d43cc9a
MA
93{
94 znode_t *zp = arg;
95
96 /*
97 * If in append mode, convert to writer and lock starting at the
98 * current end of file.
99 */
100 if (new->lr_type == RL_APPEND) {
101 new->lr_offset = zp->z_size;
102 new->lr_type = RL_WRITER;
103 }
104
105 /*
106 * If we need to grow the block size then lock the whole file range.
107 */
108 uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
109 if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
110 zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
111 new->lr_offset = 0;
112 new->lr_length = UINT64_MAX;
113 }
114}
115
34dc7c2f
BB
116/*ARGSUSED*/
117static int
b128c09f 118zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
34dc7c2f
BB
119{
120 znode_t *zp = buf;
121
3558fd73 122 inode_init_once(ZTOI(zp));
b128c09f
BB
123 list_link_init(&zp->z_link_node);
124
34dc7c2f 125 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f 126 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
448d7aaa 127 rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
34dc7c2f 128 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
82a37189 129 rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
34dc7c2f 130
2cc479d0 131 zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
34dc7c2f 132
b128c09f 133 zp->z_dirlocks = NULL;
45d1cae3 134 zp->z_acl_cached = NULL;
82a37189 135 zp->z_xattr_cached = NULL;
98701490 136 zp->z_xattr_parent = 0;
34dc7c2f
BB
137 return (0);
138}
139
140/*ARGSUSED*/
141static void
b128c09f 142zfs_znode_cache_destructor(void *buf, void *arg)
34dc7c2f
BB
143{
144 znode_t *zp = buf;
145
b128c09f 146 ASSERT(!list_link_active(&zp->z_link_node));
34dc7c2f 147 mutex_destroy(&zp->z_lock);
34dc7c2f
BB
148 rw_destroy(&zp->z_parent_lock);
149 rw_destroy(&zp->z_name_lock);
150 mutex_destroy(&zp->z_acl_lock);
82a37189 151 rw_destroy(&zp->z_xattr_lock);
2cc479d0 152 zfs_rangelock_fini(&zp->z_rangelock);
34dc7c2f 153
c903a756
RM
154 ASSERT3P(zp->z_dirlocks, ==, NULL);
155 ASSERT3P(zp->z_acl_cached, ==, NULL);
156 ASSERT3P(zp->z_xattr_cached, ==, NULL);
b128c09f
BB
157}
158
c96c36fa
BB
159static int
160zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
161{
162 znode_hold_t *zh = buf;
163
164 mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
424fd7c3 165 zfs_refcount_create(&zh->zh_refcount);
c96c36fa
BB
166 zh->zh_obj = ZFS_NO_OBJECT;
167
168 return (0);
169}
170
171static void
172zfs_znode_hold_cache_destructor(void *buf, void *arg)
173{
174 znode_hold_t *zh = buf;
175
176 mutex_destroy(&zh->zh_lock);
424fd7c3 177 zfs_refcount_destroy(&zh->zh_refcount);
c96c36fa
BB
178}
179
34dc7c2f
BB
180void
181zfs_znode_init(void)
182{
183 /*
5074bfe8
TC
184 * Initialize zcache. The KMC_SLAB hint is used in order that it be
185 * backed by kmalloc() when on the Linux slab in order that any
186 * wait_on_bit() operations on the related inode operate properly.
34dc7c2f
BB
187 */
188 ASSERT(znode_cache == NULL);
189 znode_cache = kmem_cache_create("zfs_znode_cache",
190 sizeof (znode_t), 0, zfs_znode_cache_constructor,
5074bfe8 191 zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
c96c36fa
BB
192
193 ASSERT(znode_hold_cache == NULL);
194 znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
195 sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
196 zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
34dc7c2f
BB
197}
198
199void
200zfs_znode_fini(void)
201{
34dc7c2f
BB
202 /*
203 * Cleanup zcache
204 */
205 if (znode_cache)
206 kmem_cache_destroy(znode_cache);
207 znode_cache = NULL;
c96c36fa
BB
208
209 if (znode_hold_cache)
210 kmem_cache_destroy(znode_hold_cache);
211 znode_hold_cache = NULL;
212}
213
214/*
215 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
216 * serialize access to a znode and its SA buffer while the object is being
217 * created or destroyed. This kind of locking would normally reside in the
218 * znode itself but in this case that's impossible because the znode and SA
219 * buffer may not yet exist. Therefore the locking is handled externally
bf169e9f 220 * with an array of mutexes and AVL trees which contain per-object locks.
c96c36fa
BB
221 *
222 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
223 * in to the correct AVL tree and finally the per-object lock is held. In
224 * zfs_znode_hold_exit() the process is reversed. The per-object lock is
225 * released, removed from the AVL tree and destroyed if there are no waiters.
226 *
227 * This scheme has two important properties:
228 *
229 * 1) No memory allocations are performed while holding one of the z_hold_locks.
230 * This ensures evict(), which can be called from direct memory reclaim, will
231 * never block waiting on a z_hold_locks which just happens to have hashed
232 * to the same index.
233 *
234 * 2) All locks used to serialize access to an object are per-object and never
235 * shared. This minimizes lock contention without creating a large number
236 * of dedicated locks.
237 *
238 * On the downside it does require znode_hold_t structures to be frequently
239 * allocated and freed. However, because these are backed by a kmem cache
240 * and very short lived this cost is minimal.
241 */
242int
243zfs_znode_hold_compare(const void *a, const void *b)
244{
ee36c709
GN
245 const znode_hold_t *zh_a = (const znode_hold_t *)a;
246 const znode_hold_t *zh_b = (const znode_hold_t *)b;
247
ca577779 248 return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
c96c36fa
BB
249}
250
65c7cc49 251static boolean_t __maybe_unused
0037b49e 252zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
c96c36fa
BB
253{
254 znode_hold_t *zh, search;
0037b49e 255 int i = ZFS_OBJ_HASH(zfsvfs, obj);
37c56346 256 boolean_t held;
c96c36fa
BB
257
258 search.zh_obj = obj;
259
0037b49e
BB
260 mutex_enter(&zfsvfs->z_hold_locks[i]);
261 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
37c56346 262 held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
0037b49e 263 mutex_exit(&zfsvfs->z_hold_locks[i]);
c96c36fa 264
37c56346 265 return (held);
c96c36fa
BB
266}
267
268static znode_hold_t *
0037b49e 269zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
c96c36fa
BB
270{
271 znode_hold_t *zh, *zh_new, search;
0037b49e 272 int i = ZFS_OBJ_HASH(zfsvfs, obj);
c96c36fa
BB
273 boolean_t found = B_FALSE;
274
275 zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
276 zh_new->zh_obj = obj;
277 search.zh_obj = obj;
278
0037b49e
BB
279 mutex_enter(&zfsvfs->z_hold_locks[i]);
280 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
c96c36fa
BB
281 if (likely(zh == NULL)) {
282 zh = zh_new;
0037b49e 283 avl_add(&zfsvfs->z_hold_trees[i], zh);
c96c36fa
BB
284 } else {
285 ASSERT3U(zh->zh_obj, ==, obj);
286 found = B_TRUE;
287 }
c13060e4 288 zfs_refcount_add(&zh->zh_refcount, NULL);
0037b49e 289 mutex_exit(&zfsvfs->z_hold_locks[i]);
c96c36fa
BB
290
291 if (found == B_TRUE)
292 kmem_cache_free(znode_hold_cache, zh_new);
293
294 ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
424fd7c3 295 ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
c96c36fa
BB
296 mutex_enter(&zh->zh_lock);
297
298 return (zh);
299}
300
301static void
0037b49e 302zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
c96c36fa 303{
0037b49e 304 int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
c96c36fa
BB
305 boolean_t remove = B_FALSE;
306
0037b49e 307 ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
424fd7c3 308 ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
c96c36fa
BB
309 mutex_exit(&zh->zh_lock);
310
0037b49e 311 mutex_enter(&zfsvfs->z_hold_locks[i]);
424fd7c3 312 if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) {
0037b49e 313 avl_remove(&zfsvfs->z_hold_trees[i], zh);
c96c36fa
BB
314 remove = B_TRUE;
315 }
0037b49e 316 mutex_exit(&zfsvfs->z_hold_locks[i]);
c96c36fa
BB
317
318 if (remove == B_TRUE)
319 kmem_cache_free(znode_hold_cache, zh);
34dc7c2f
BB
320}
321
13a9a6f5
MM
/*
 * Convert a 64-bit device number to dev_t.  The value is passed through
 * unchanged apart from the implicit conversion to dev_t.
 */
dev_t
zfs_cmpldev(uint64_t dev)
{
	dev_t native = dev;

	return (native);
}
327
34dc7c2f 328static void
0037b49e 329zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
428870ff 330 dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
34dc7c2f 331{
0037b49e 332 ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
34dc7c2f
BB
333
334 mutex_enter(&zp->z_lock);
335
428870ff
BB
336 ASSERT(zp->z_sa_hdl == NULL);
337 ASSERT(zp->z_acl_cached == NULL);
338 if (sa_hdl == NULL) {
0037b49e 339 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
428870ff
BB
340 SA_HDL_SHARED, &zp->z_sa_hdl));
341 } else {
342 zp->z_sa_hdl = sa_hdl;
343 sa_set_userp(sa_hdl, zp);
344 }
34dc7c2f 345
428870ff 346 zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
34dc7c2f 347
34dc7c2f 348 mutex_exit(&zp->z_lock);
34dc7c2f
BB
349}
350
/*
 * Detach a znode from the DMU: destroy its SA handle and clear z_sa_hdl.
 *
 * The assertion documents the accepted calling contexts: the znode hold
 * for zp->z_id is held, or the znode is marked unlinked, or
 * z_teardown_inactive_lock is held as writer.
 *
 * NOTE(review): zp->z_sa_hdl is passed to sa_handle_destroy() without a
 * NULL check -- callers are presumably expected to skip znodes whose
 * handle has already been cleared; confirm.
 */
void
zfs_znode_dmu_fini(znode_t *zp)
{
	ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked ||
	    RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));

	sa_handle_destroy(zp->z_sa_hdl);
	zp->z_sa_hdl = NULL;
}
360
361/*
3558fd73
BB
362 * Called by new_inode() to allocate a new inode.
363 */
364int
365zfs_inode_alloc(struct super_block *sb, struct inode **ip)
366{
367 znode_t *zp;
368
79c76d5b 369 zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
3558fd73
BB
370 *ip = ZTOI(zp);
371
372 return (0);
373}
374
375/*
376 * Called in multiple places when an inode should be destroyed.
377 */
378void
379zfs_inode_destroy(struct inode *ip)
380{
381 znode_t *zp = ITOZ(ip);
0037b49e 382 zfsvfs_t *zfsvfs = ZTOZSB(zp);
3558fd73 383
0037b49e 384 mutex_enter(&zfsvfs->z_znodes_lock);
7b3e34ba 385 if (list_link_active(&zp->z_link_node)) {
0037b49e
BB
386 list_remove(&zfsvfs->z_all_znodes, zp);
387 zfsvfs->z_nr_znodes--;
7b3e34ba 388 }
0037b49e 389 mutex_exit(&zfsvfs->z_znodes_lock);
3558fd73
BB
390
391 if (zp->z_acl_cached) {
392 zfs_acl_free(zp->z_acl_cached);
393 zp->z_acl_cached = NULL;
394 }
395
82a37189
BB
396 if (zp->z_xattr_cached) {
397 nvlist_free(zp->z_xattr_cached);
398 zp->z_xattr_cached = NULL;
399 }
400
3558fd73
BB
401 kmem_cache_free(znode_cache, zp);
402}
403
404static void
0037b49e 405zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
3558fd73 406{
aa6d8c10 407 uint64_t rdev = 0;
3558fd73
BB
408
409 switch (ip->i_mode & S_IFMT) {
410 case S_IFREG:
411 ip->i_op = &zpl_inode_operations;
412 ip->i_fop = &zpl_file_operations;
413 ip->i_mapping->a_ops = &zpl_address_space_operations;
414 break;
415
416 case S_IFDIR:
417 ip->i_op = &zpl_dir_inode_operations;
418 ip->i_fop = &zpl_dir_file_operations;
419 ITOZ(ip)->z_zn_prefetch = B_TRUE;
420 break;
421
422 case S_IFLNK:
423 ip->i_op = &zpl_symlink_inode_operations;
424 break;
425
aa6d8c10
NB
426 /*
427 * rdev is only stored in a SA only for device files.
428 */
3558fd73
BB
429 case S_IFCHR:
430 case S_IFBLK:
0037b49e 431 (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
53b1d979 432 sizeof (rdev));
9a70e97f 433 zfs_fallthrough;
aa6d8c10
NB
434 case S_IFIFO:
435 case S_IFSOCK:
3558fd73
BB
436 init_special_inode(ip, ip->i_mode, rdev);
437 ip->i_op = &zpl_special_inode_operations;
438 break;
439
440 default:
53b1d979
BB
441 zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
442 (u_longlong_t)ip->i_ino, ip->i_mode);
443
444 /* Assume the inode is a file and attempt to continue */
445 ip->i_mode = S_IFREG | 0644;
446 ip->i_op = &zpl_inode_operations;
447 ip->i_fop = &zpl_file_operations;
448 ip->i_mapping->a_ops = &zpl_address_space_operations;
449 break;
3558fd73
BB
450 }
451}
452
65c7cc49 453static void
7bb1325f
CC
454zfs_set_inode_flags(znode_t *zp, struct inode *ip)
455{
456 /*
457 * Linux and Solaris have different sets of file attributes, so we
458 * restrict this conversion to the intersection of the two.
459 */
a5248129
CC
460#ifdef HAVE_INODE_SET_FLAGS
461 unsigned int flags = 0;
462 if (zp->z_pflags & ZFS_IMMUTABLE)
463 flags |= S_IMMUTABLE;
464 if (zp->z_pflags & ZFS_APPENDONLY)
465 flags |= S_APPEND;
7bb1325f 466
a5248129
CC
467 inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
468#else
7bb1325f
CC
469 if (zp->z_pflags & ZFS_IMMUTABLE)
470 ip->i_flags |= S_IMMUTABLE;
471 else
472 ip->i_flags &= ~S_IMMUTABLE;
473
474 if (zp->z_pflags & ZFS_APPENDONLY)
475 ip->i_flags |= S_APPEND;
476 else
477 ip->i_flags &= ~S_APPEND;
a5248129 478#endif
7bb1325f
CC
479}
480
704cd075 481/*
fc273894 482 * Update the embedded inode given the znode.
704cd075 483 */
9f5f0019 484void
fc273894 485zfs_znode_update_vfs(znode_t *zp)
704cd075 486{
0037b49e 487 zfsvfs_t *zfsvfs;
704cd075
CC
488 struct inode *ip;
489 uint32_t blksize;
490 u_longlong_t i_blocks;
704cd075
CC
491
492 ASSERT(zp != NULL);
0037b49e 493 zfsvfs = ZTOZSB(zp);
704cd075
CC
494 ip = ZTOI(zp);
495
496 /* Skip .zfs control nodes which do not exist on disk. */
497 if (zfsctl_is_node(ip))
498 return;
499
704cd075
CC
500 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
501
502 spin_lock(&ip->i_lock);
e53d678d 503 ip->i_mode = zp->z_mode;
704cd075 504 ip->i_blocks = i_blocks;
704cd075
CC
505 i_size_write(ip, zp->z_size);
506 spin_unlock(&ip->i_lock);
507}
508
704cd075 509
3558fd73
BB
/*
 * Construct a znode+inode and initialize.
 *
 * This does not do a call to dmu_set_user() that is
 * up to the caller to do, in case you don't want to
 * return the znode
 *
 *	zfsvfs   - file system; the inode comes from new_inode(zfsvfs->z_sb)
 *	db       - buffer of the object; db->db_object becomes z_id and i_ino
 *	blksz    - stored in z_blksz
 *	obj_type - passed to zfs_znode_sa_init(), which derives z_is_sa
 *	hdl      - existing SA handle to adopt, or NULL to obtain one from db
 *
 * Returns the new znode, or NULL if new_inode() fails, the attribute
 * lookup fails, the stored generation is 0, or the project ID is expected
 * but cannot be read.
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
    dmu_object_type_t obj_type, sa_handle_t *hdl)
{
	znode_t	*zp;
	struct inode *ip;
	uint64_t mode;
	uint64_t parent;
	uint64_t tmp_gen;
	uint64_t links;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2], btime[2];
	uint64_t projid = ZFS_DEFAULT_PROJID;
	sa_bulk_attr_t bulk[12];	/* exactly 12 attributes are added */
	int count = 0;

	ASSERT(zfsvfs != NULL);

	ip = new_inode(zfsvfs->z_sb);
	if (ip == NULL)
		return (NULL);

	/* Reset the per-use fields; the cache constructor does not. */
	zp = ITOZ(ip);
	ASSERT(zp->z_dirlocks == NULL);
	ASSERT3P(zp->z_acl_cached, ==, NULL);
	ASSERT3P(zp->z_xattr_cached, ==, NULL);
	zp->z_unlinked = B_FALSE;
	zp->z_atime_dirty = B_FALSE;
	zp->z_is_mapped = B_FALSE;
	zp->z_is_ctldir = B_FALSE;
	zp->z_is_stale = B_FALSE;
	zp->z_suspended = B_FALSE;
	zp->z_sa_hdl = NULL;
	zp->z_mapcnt = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;

	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

	/* Read every attribute needed to populate the inode in one lookup. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
	    &parent, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);

	/*
	 * The project ID is looked up only when project quotas are enabled
	 * and the object's flags say one is stored; otherwise projid keeps
	 * ZFS_DEFAULT_PROJID.
	 *
	 * On failure the SA handle is destroyed only if it was obtained
	 * here (hdl == NULL); a handle supplied by the caller is left alone.
	 * NOTE(review): for a caller-supplied handle, zfs_znode_sa_init()
	 * has already pointed its user data at zp, which is released by the
	 * iput() below -- confirm callers cope with that.
	 */
	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    (zp->z_pflags & ZFS_PROJID) &&
	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
		if (hdl == NULL)
			sa_handle_destroy(zp->z_sa_hdl);
		zp->z_sa_hdl = NULL;
		goto error;
	}

	zp->z_projid = projid;
	zp->z_mode = ip->i_mode = mode;
	ip->i_generation = (uint32_t)tmp_gen;
	ip->i_blkbits = SPA_MINBLOCKSHIFT;
	set_nlink(ip, (uint32_t)links);
	zfs_uid_write(ip, z_uid);
	zfs_gid_write(ip, z_gid);
	zfs_set_inode_flags(zp, ip);

	/* Cache the xattr parent id */
	if (zp->z_pflags & ZFS_XATTR)
		zp->z_xattr_parent = parent;

	ZFS_TIME_DECODE(&ip->i_atime, atime);
	ZFS_TIME_DECODE(&ip->i_mtime, mtime);
	ZFS_TIME_DECODE(&ip->i_ctime, ctime);
	ZFS_TIME_DECODE(&zp->z_btime, btime);

	ip->i_ino = zp->z_id;
	zfs_znode_update_vfs(zp);
	zfs_inode_set_ops(zfsvfs, ip);

	/*
	 * The only way insert_inode_locked() can fail is if the ip->i_ino
	 * number is already hashed for this super block.  This can never
	 * happen because the inode numbers map 1:1 with the object numbers.
	 *
	 * Exceptions include rolling back a mounted file system, either
	 * from the zfs rollback or zfs recv command.
	 *
	 * Active inodes are unhashed during the rollback, but since zrele
	 * can happen asynchronously, we can't guarantee they've been
	 * unhashed.  This can cause hash collisions in unlinked drain
	 * processing so do not hash unlinked znodes.
	 */
	if (links > 0)
		VERIFY3S(insert_inode_locked(ip), ==, 0);

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	zfsvfs->z_nr_znodes++;
	mutex_exit(&zfsvfs->z_znodes_lock);

	/* Pairs with insert_inode_locked() above; same links > 0 test. */
	if (links > 0)
		unlock_new_inode(ip);
	return (zp);

error:
	iput(ip);
	return (NULL);
}
635
1e8db771
BB
636/*
637 * Safely mark an inode dirty. Inodes which are part of a read-only
638 * file system or snapshot may not be dirtied.
639 */
640void
641zfs_mark_inode_dirty(struct inode *ip)
642{
0037b49e 643 zfsvfs_t *zfsvfs = ITOZSB(ip);
1e8db771 644
0037b49e 645 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
1e8db771
BB
646 return;
647
648 mark_inode_dirty(ip);
649}
650
428870ff
BB
651static uint64_t empty_xattr;
652static uint64_t pad[4];
653static zfs_acl_phys_t acl_phys;
34dc7c2f
BB
/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_TMPFILE	- new object is of O_TMPFILE
 *			  IS_XATTR	- new object is an attribute
 *		acl_ids	- ACL related attributes
 *
 *	OUT:	zpp	- allocated znode (set to dzp if IS_ROOT_NODE)
 *
 * During replay (zfsvfs->z_replay) the object number, timestamp,
 * generation and dnode size are taken from vap (va_nodeid, va_ctime,
 * va_nblocks and va_fsid) and the object is claimed rather than allocated.
 *
 * This function has no way to report failure: object creation, the SA
 * setup and the ACL write are wrapped in VERIFY, and znode allocation is
 * retried until it succeeds.
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
{
	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
	uint64_t	mode, size, links, parent, pflags;
	uint64_t	projid = ZFS_DEFAULT_PROJID;
	uint64_t	rdev = 0;
	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
	dmu_buf_t	*db;
	inode_timespec_t now;
	uint64_t	gen, obj;
	int		bonuslen;
	int		dnodesize;
	sa_handle_t	*sa_hdl;
	dmu_object_type_t obj_type;
	sa_bulk_attr_t	*sa_attrs;
	int		cnt = 0;
	zfs_acl_locator_cb_t locate = { 0 };
	znode_hold_t	*zh;

	if (zfsvfs->z_replay) {
		obj = vap->va_nodeid;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
		dnodesize = vap->va_fsid;	/* ditto */
	} else {
		obj = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
	}

	/* A dnode size of 0 means "use the minimum". */
	if (dnodesize == 0)
		dnodesize = DNODE_MIN_SIZE;

	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;

	bonuslen = (obj_type == DMU_OT_SA) ?
	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (S_ISDIR(vap->va_mode)) {
		if (zfsvfs->z_replay) {
			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = zap_create_norm_dnsize(zfsvfs->z_os,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx);
		}
	} else {
		if (zfsvfs->z_replay) {
			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx);
		}
	}

	/* Held until the end of the function; see zfs_znode_hold_exit(). */
	zh = zfs_znode_hold_enter(zfsvfs, obj);
	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_id = obj;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp->z_pflags & ZFS_XATTR) {
		flag |= IS_XATTR;
	}

	if (zfsvfs->z_use_fuids)
		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
	else
		pflags = 0;

	if (S_ISDIR(vap->va_mode)) {
		size = 2;		/* contents ("." and "..") */
		links = 2;
	} else {
		size = 0;
		/* An O_TMPFILE object starts with no links. */
		links = (flag & IS_TMPFILE) ? 0 : 1;
	}

	if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
		rdev = vap->va_rdev;

	/* For the root node dzp->z_id was just set to obj above. */
	parent = dzp->z_id;
	mode = acl_ids->z_mode;
	if (flag & IS_XATTR)
		pflags |= ZFS_XATTR;

	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
		/*
		 * With ZFS_PROJID flag, we can easily know whether there is
		 * project ID stored on disk or not. See zfs_space_delta_cb().
		 */
		if (obj_type != DMU_OT_ZNODE &&
		    dmu_objset_projectquota_enabled(zfsvfs->z_os))
			pflags |= ZFS_PROJID;

		/*
		 * Inherit project ID from parent if required.
		 */
		projid = zfs_inherit_projid(dzp);
		if (dzp->z_pflags & ZFS_PROJINHERIT)
			pflags |= ZFS_PROJINHERIT;
	}

	/*
	 * No execs denied will be determined when zfs_mode_compute() is called.
	 */
	pflags |= acl_ids->z_aclp->z_hints &
	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);

	ZFS_TIME_ENCODE(&now, crtime);
	ZFS_TIME_ENCODE(&now, ctime);

	/* Caller-supplied atime/mtime win over "now" when flagged in va_mask. */
	if (vap->va_mask & ATTR_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, atime);
	} else {
		ZFS_TIME_ENCODE(&now, atime);
	}

	if (vap->va_mask & ATTR_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
	} else {
		ZFS_TIME_ENCODE(&now, mtime);
	}

	/* Now add in all of the "SA" attributes */
	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
	    &sa_hdl));

	/*
	 * Setup the array of attributes to be replaced/set on the new file
	 *
	 * order for DMU_OT_ZNODE is critical since it needs to be constructed
	 * in the old znode_phys_t format.  Don't change this ordering
	 */
	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
	} else {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
		    NULL, &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
		    NULL, &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
	}

	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
		    &empty_xattr, 8);
	} else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    pflags & ZFS_PROJID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
		    NULL, &projid, 8);
	}
	/* rdev: always for the legacy layout, else only for device files. */
	if (obj_type == DMU_OT_ZNODE ||
	    (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
		    NULL, &rdev, 8);
	}
	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
		    &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
		    &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
		    sizeof (uint64_t) * 4);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
		    &acl_phys, sizeof (zfs_acl_phys_t));
	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
		    &acl_ids->z_aclp->z_acl_count, 8);
		locate.cb_aclp = acl_ids->z_aclp;
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
		    zfs_acl_data_locator, &locate,
		    acl_ids->z_aclp->z_acl_bytes);
		/*
		 * mode and pflags were registered above by address, so the
		 * values computed here are presumably the ones written by
		 * sa_replace_all_by_template() -- TODO confirm.
		 */
		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
		    acl_ids->z_fuid, acl_ids->z_fgid);
	}

	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);

	if (!(flag & IS_ROOT_NODE)) {
		/*
		 * The call to zfs_znode_alloc() may fail if memory is low
		 * via the call path: alloc_inode() -> inode_init_always() ->
		 * security_inode_alloc() -> inode_alloc_security().  Since
		 * the existing code is written such that zfs_mknode() can
		 * not fail retry until sufficient memory has been reclaimed.
		 *
		 * NOTE(review): zfs_znode_alloc() also returns NULL when its
		 * attribute lookup fails; this loop retries in that case too
		 * -- confirm that cannot persist for a just-written object.
		 */
		do {
			*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
		} while (*zpp == NULL);

		VERIFY(*zpp != NULL);
		VERIFY(dzp != NULL);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;

		(*zpp)->z_sa_hdl = sa_hdl;
	}

	/* Bring the in-core znode in line with what was just written. */
	(*zpp)->z_pflags = pflags;
	(*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
	(*zpp)->z_dnodesize = dnodesize;
	(*zpp)->z_projid = projid;

	/* Cases not covered by the DACL attributes added above. */
	if (obj_type == DMU_OT_ZNODE ||
	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
	}
	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
	zfs_znode_hold_exit(zfsvfs, zh);
}
949
5484965a 950/*
d3cc8b15
WA
951 * Update in-core attributes. It is assumed the caller will be doing an
952 * sa_bulk_update to push the changes out.
5484965a
BB
953 */
954void
955zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
956{
957 xoptattr_t *xoap;
7bb1325f 958 boolean_t update_inode = B_FALSE;
5484965a
BB
959
960 xoap = xva_getxoptattr(xvap);
961 ASSERT(xoap);
962
963 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
964 uint64_t times[2];
965 ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
966 (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
967 &times, sizeof (times), tx);
968 XVA_SET_RTN(xvap, XAT_CREATETIME);
969 }
970 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
971 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
972 zp->z_pflags, tx);
973 XVA_SET_RTN(xvap, XAT_READONLY);
974 }
975 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
976 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
977 zp->z_pflags, tx);
978 XVA_SET_RTN(xvap, XAT_HIDDEN);
979 }
980 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
981 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
982 zp->z_pflags, tx);
983 XVA_SET_RTN(xvap, XAT_SYSTEM);
984 }
985 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
986 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
987 zp->z_pflags, tx);
988 XVA_SET_RTN(xvap, XAT_ARCHIVE);
989 }
990 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
991 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
992 zp->z_pflags, tx);
993 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
64c688d7 994
7bb1325f 995 update_inode = B_TRUE;
5484965a
BB
996 }
997 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
998 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
999 zp->z_pflags, tx);
1000 XVA_SET_RTN(xvap, XAT_NOUNLINK);
1001 }
1002 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1003 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
1004 zp->z_pflags, tx);
1005 XVA_SET_RTN(xvap, XAT_APPENDONLY);
64c688d7 1006
7bb1325f 1007 update_inode = B_TRUE;
5484965a
BB
1008 }
1009 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1010 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
1011 zp->z_pflags, tx);
1012 XVA_SET_RTN(xvap, XAT_NODUMP);
1013 }
1014 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1015 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
1016 zp->z_pflags, tx);
1017 XVA_SET_RTN(xvap, XAT_OPAQUE);
1018 }
1019 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1020 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
1021 xoap->xoa_av_quarantined, zp->z_pflags, tx);
1022 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1023 }
1024 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1025 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
1026 zp->z_pflags, tx);
1027 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1028 }
1029 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
1030 zfs_sa_set_scanstamp(zp, xvap, tx);
1031 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
1032 }
1033 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1034 ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
1035 zp->z_pflags, tx);
1036 XVA_SET_RTN(xvap, XAT_REPARSE);
1037 }
1038 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1039 ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
1040 zp->z_pflags, tx);
1041 XVA_SET_RTN(xvap, XAT_OFFLINE);
1042 }
1043 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1044 ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
1045 zp->z_pflags, tx);
1046 XVA_SET_RTN(xvap, XAT_SPARSE);
1047 }
9c5167d1
NF
1048 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
1049 ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
1050 zp->z_pflags, tx);
1051 XVA_SET_RTN(xvap, XAT_PROJINHERIT);
1052 }
7bb1325f
CC
1053
1054 if (update_inode)
1055 zfs_set_inode_flags(zp, ZTOI(zp));
5484965a
BB
1056}
1057
34dc7c2f 1058int
0037b49e 1059zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
34dc7c2f
BB
1060{
1061 dmu_object_info_t doi;
1062 dmu_buf_t *db;
1063 znode_t *zp;
c96c36fa 1064 znode_hold_t *zh;
34dc7c2f 1065 int err;
428870ff 1066 sa_handle_t *hdl;
34dc7c2f
BB
1067
1068 *zpp = NULL;
1069
6f9548c4 1070again:
0037b49e 1071 zh = zfs_znode_hold_enter(zfsvfs, obj_num);
34dc7c2f 1072
0037b49e 1073 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
34dc7c2f 1074 if (err) {
0037b49e 1075 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1076 return (err);
1077 }
1078
1079 dmu_object_info_from_db(db, &doi);
428870ff
BB
1080 if (doi.doi_bonus_type != DMU_OT_SA &&
1081 (doi.doi_bonus_type != DMU_OT_ZNODE ||
1082 (doi.doi_bonus_type == DMU_OT_ZNODE &&
1083 doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1084 sa_buf_rele(db, NULL);
0037b49e 1085 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1086 return (SET_ERROR(EINVAL));
34dc7c2f
BB
1087 }
1088
428870ff
BB
1089 hdl = dmu_buf_get_user(db);
1090 if (hdl != NULL) {
36df2843 1091 zp = sa_get_userdata(hdl);
34dc7c2f 1092
8ac67298 1093
34dc7c2f 1094 /*
428870ff
BB
1095 * Since "SA" does immediate eviction we
1096 * should never find a sa handle that doesn't
1097 * know about the znode.
34dc7c2f 1098 */
428870ff
BB
1099
1100 ASSERT3P(zp, !=, NULL);
1101
1102 mutex_enter(&zp->z_lock);
34dc7c2f 1103 ASSERT3U(zp->z_id, ==, obj_num);
98701490 1104 /*
41e1aa2a 1105 * If zp->z_unlinked is set, the znode is already marked
0c468138
MFO
1106 * for deletion and should not be discovered. Check this
1107 * after checking igrab() due to fsetxattr() & O_TMPFILE.
41e1aa2a 1108 *
98701490
CC
1109 * If igrab() returns NULL the VFS has independently
1110 * determined the inode should be evicted and has
1111 * called iput_final() to start the eviction process.
1112 * The SA handle is still valid but because the VFS
1113 * requires that the eviction succeed we must drop
1114 * our locks and references to allow the eviction to
1115 * complete. The zfs_zget() may then be retried.
1116 *
1117 * This unlikely case could be optimized by registering
1118 * a sops->drop_inode() callback. The callback would
1119 * need to detect the active SA hold thereby informing
1120 * the VFS that this inode should not be evicted.
1121 */
0c468138
MFO
1122 if (igrab(ZTOI(zp)) == NULL) {
1123 if (zp->z_unlinked)
1124 err = SET_ERROR(ENOENT);
1125 else
1126 err = SET_ERROR(EAGAIN);
41e1aa2a
HAS
1127 } else {
1128 *zpp = zp;
1129 err = 0;
34dc7c2f 1130 }
41e1aa2a 1131
34dc7c2f 1132 mutex_exit(&zp->z_lock);
f3ad9cd6 1133 sa_buf_rele(db, NULL);
0037b49e 1134 zfs_znode_hold_exit(zfsvfs, zh);
41e1aa2a
HAS
1135
1136 if (err == EAGAIN) {
1137 /* inode might need this to finish evict */
1138 cond_resched();
1139 goto again;
1140 }
34dc7c2f
BB
1141 return (err);
1142 }
1143
1144 /*
3558fd73 1145 * Not found create new znode/vnode but only if file exists.
428870ff
BB
1146 *
1147 * There is a small window where zfs_vget() could
1148 * find this object while a file create is still in
1149 * progress. This is checked for in zfs_znode_alloc()
1150 *
1151 * if zfs_znode_alloc() fails it will drop the hold on the
1152 * bonus buffer.
34dc7c2f 1153 */
0037b49e 1154 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
1a688994 1155 doi.doi_bonus_type, NULL);
428870ff 1156 if (zp == NULL) {
2e528b49 1157 err = SET_ERROR(ENOENT);
428870ff
BB
1158 } else {
1159 *zpp = zp;
1160 }
0037b49e 1161 zfs_znode_hold_exit(zfsvfs, zh);
428870ff 1162 return (err);
34dc7c2f
BB
1163}
1164
1165int
1166zfs_rezget(znode_t *zp)
1167{
0037b49e 1168 zfsvfs_t *zfsvfs = ZTOZSB(zp);
34dc7c2f
BB
1169 dmu_object_info_t doi;
1170 dmu_buf_t *db;
1171 uint64_t obj_num = zp->z_id;
428870ff 1172 uint64_t mode;
dfbc8630 1173 uint64_t links;
abbf0bd4 1174 sa_bulk_attr_t bulk[11];
34dc7c2f 1175 int err;
428870ff
BB
1176 int count = 0;
1177 uint64_t gen;
2c6abf15 1178 uint64_t z_uid, z_gid;
abbf0bd4 1179 uint64_t atime[2], mtime[2], ctime[2], btime[2];
9c5167d1 1180 uint64_t projid = ZFS_DEFAULT_PROJID;
c96c36fa 1181 znode_hold_t *zh;
34dc7c2f 1182
cbecb4fb
CC
1183 /*
1184 * skip ctldir, otherwise they will always get invalidated. This will
1185 * cause funny behaviour for the mounted snapdirs. Especially for
1186 * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent
1187 * anyone automount it again as long as someone is still using the
1188 * detached mount.
1189 */
1190 if (zp->z_is_ctldir)
1191 return (0);
1192
0037b49e 1193 zh = zfs_znode_hold_enter(zfsvfs, obj_num);
34dc7c2f 1194
428870ff
BB
1195 mutex_enter(&zp->z_acl_lock);
1196 if (zp->z_acl_cached) {
1197 zfs_acl_free(zp->z_acl_cached);
1198 zp->z_acl_cached = NULL;
1199 }
428870ff 1200 mutex_exit(&zp->z_acl_lock);
7b3e34ba 1201
228b461b 1202 rw_enter(&zp->z_xattr_lock, RW_WRITER);
7b3e34ba
BB
1203 if (zp->z_xattr_cached) {
1204 nvlist_free(zp->z_xattr_cached);
1205 zp->z_xattr_cached = NULL;
1206 }
7b3e34ba
BB
1207 rw_exit(&zp->z_xattr_lock);
1208
428870ff 1209 ASSERT(zp->z_sa_hdl == NULL);
0037b49e 1210 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
34dc7c2f 1211 if (err) {
0037b49e 1212 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1213 return (err);
1214 }
1215
1216 dmu_object_info_from_db(db, &doi);
428870ff
BB
1217 if (doi.doi_bonus_type != DMU_OT_SA &&
1218 (doi.doi_bonus_type != DMU_OT_ZNODE ||
1219 (doi.doi_bonus_type == DMU_OT_ZNODE &&
1220 doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1221 sa_buf_rele(db, NULL);
0037b49e 1222 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1223 return (SET_ERROR(EINVAL));
34dc7c2f
BB
1224 }
1225
0037b49e 1226 zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
428870ff
BB
1227
1228 /* reload cached values */
0037b49e 1229 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
428870ff 1230 &gen, sizeof (gen));
0037b49e 1231 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
428870ff 1232 &zp->z_size, sizeof (zp->z_size));
0037b49e 1233 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
dfbc8630 1234 &links, sizeof (links));
0037b49e 1235 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
428870ff 1236 &zp->z_pflags, sizeof (zp->z_pflags));
0037b49e 1237 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2c6abf15 1238 &z_uid, sizeof (z_uid));
0037b49e 1239 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
2c6abf15 1240 &z_gid, sizeof (z_gid));
0037b49e 1241 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
428870ff 1242 &mode, sizeof (mode));
0037b49e 1243 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
9f5f0019 1244 &atime, 16);
0037b49e 1245 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
9f5f0019 1246 &mtime, 16);
0037b49e 1247 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
9f5f0019 1248 &ctime, 16);
abbf0bd4 1249 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);
428870ff 1250
428870ff
BB
1251 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
1252 zfs_znode_dmu_fini(zp);
0037b49e 1253 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1254 return (SET_ERROR(EIO));
428870ff
BB
1255 }
1256
9c5167d1
NF
1257 if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
1258 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
1259 &projid, 8);
1260 if (err != 0 && err != ENOENT) {
1261 zfs_znode_dmu_fini(zp);
1262 zfs_znode_hold_exit(zfsvfs, zh);
1263 return (SET_ERROR(err));
1264 }
1265 }
1266
1267 zp->z_projid = projid;
12fa7f34 1268 zp->z_mode = ZTOI(zp)->i_mode = mode;
2c6abf15
NB
1269 zfs_uid_write(ZTOI(zp), z_uid);
1270 zfs_gid_write(ZTOI(zp), z_gid);
572e2857 1271
9f5f0019
NB
1272 ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
1273 ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
1274 ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);
abbf0bd4 1275 ZFS_TIME_DECODE(&zp->z_btime, btime);
9f5f0019 1276
3ce85b5e 1277 if ((uint32_t)gen != ZTOI(zp)->i_generation) {
428870ff 1278 zfs_znode_dmu_fini(zp);
0037b49e 1279 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1280 return (SET_ERROR(EIO));
34dc7c2f
BB
1281 }
1282
dfbc8630 1283 set_nlink(ZTOI(zp), (uint32_t)links);
7bb1325f 1284 zfs_set_inode_flags(zp, ZTOI(zp));
dfbc8630 1285
34dc7c2f 1286 zp->z_blksz = doi.doi_data_block_size;
a43570c5 1287 zp->z_atime_dirty = B_FALSE;
fc273894 1288 zfs_znode_update_vfs(zp);
34dc7c2f 1289
6a218566
AG
1290 /*
1291 * If the file has zero links, then it has been unlinked on the send
1292 * side and it must be in the received unlinked set.
1293 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
e1cfd73f 1294 * stale data and to prevent automatic removal of the file in
6a218566
AG
1295 * zfs_zinactive(). The file will be removed either when it is removed
1296 * on the send side and the next incremental stream is received or
1297 * when the unlinked set gets processed.
1298 */
1299 zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
1300 if (zp->z_unlinked)
1301 zfs_znode_dmu_fini(zp);
1302
0037b49e 1303 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1304
1305 return (0);
1306}
1307
1308void
1309zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1310{
0037b49e
BB
1311 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1312 objset_t *os = zfsvfs->z_os;
34dc7c2f 1313 uint64_t obj = zp->z_id;
572e2857 1314 uint64_t acl_obj = zfs_external_acl(zp);
c96c36fa 1315 znode_hold_t *zh;
34dc7c2f 1316
0037b49e 1317 zh = zfs_znode_hold_enter(zfsvfs, obj);
572e2857
BB
1318 if (acl_obj) {
1319 VERIFY(!zp->z_is_sa);
b128c09f 1320 VERIFY(0 == dmu_object_free(os, acl_obj, tx));
572e2857 1321 }
b128c09f 1322 VERIFY(0 == dmu_object_free(os, obj, tx));
34dc7c2f 1323 zfs_znode_dmu_fini(zp);
0037b49e 1324 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1325}
1326
1327void
1328zfs_zinactive(znode_t *zp)
1329{
0037b49e 1330 zfsvfs_t *zfsvfs = ZTOZSB(zp);
34dc7c2f 1331 uint64_t z_id = zp->z_id;
c96c36fa 1332 znode_hold_t *zh;
34dc7c2f 1333
428870ff 1334 ASSERT(zp->z_sa_hdl);
34dc7c2f
BB
1335
1336 /*
d6bd8eaa 1337 * Don't allow a zfs_zget() while were trying to release this znode.
34dc7c2f 1338 */
0037b49e 1339 zh = zfs_znode_hold_enter(zfsvfs, z_id);
d6bd8eaa 1340
34dc7c2f 1341 mutex_enter(&zp->z_lock);
34dc7c2f
BB
1342
1343 /*
6a218566
AG
1344 * If this was the last reference to a file with no links, remove
1345 * the file from the file system unless the file system is mounted
1346 * read-only. That can happen, for example, if the file system was
1347 * originally read-write, the file was opened, then unlinked and
1348 * the file system was made read-only before the file was finally
1349 * closed. The file will remain in the unlinked set.
34dc7c2f
BB
1350 */
1351 if (zp->z_unlinked) {
6a218566 1352 ASSERT(!zfsvfs->z_issnap);
dcec0a12 1353 if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
6a218566
AG
1354 mutex_exit(&zp->z_lock);
1355 zfs_znode_hold_exit(zfsvfs, zh);
1356 zfs_rmnode(zp);
1357 return;
1358 }
34dc7c2f 1359 }
428870ff 1360
34dc7c2f
BB
1361 mutex_exit(&zp->z_lock);
1362 zfs_znode_dmu_fini(zp);
d6bd8eaa 1363
0037b49e 1364 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1365}
1366
9c53e516
TK
1367#if defined(HAVE_INODE_TIMESPEC64_TIMES)
1368#define zfs_compare_timespec timespec64_compare
1369#else
1370#define zfs_compare_timespec timespec_compare
1371#endif
1372
1373/*
1374 * Determine whether the znode's atime must be updated. The logic mostly
1375 * duplicates the Linux kernel's relatime_need_update() functionality.
1376 * This function is only called if the underlying filesystem actually has
1377 * atime updates enabled.
1378 */
1379boolean_t
1380zfs_relatime_need_update(const struct inode *ip)
6d111134 1381{
9c53e516
TK
1382 inode_timespec_t now;
1383
1384 gethrestime(&now);
1385 /*
1386 * In relatime mode, only update the atime if the previous atime
1387 * is earlier than either the ctime or mtime or if at least a day
1388 * has passed since the last update of atime.
1389 */
1390 if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0)
1391 return (B_TRUE);
1392
1393 if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0)
1394 return (B_TRUE);
6d111134 1395
9c53e516
TK
1396 if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60)
1397 return (B_TRUE);
6d111134 1398
9c53e516 1399 return (B_FALSE);
6d111134
TC
1400}
1401
6d111134
TC
1402/*
1403 * Prepare to update znode time stamps.
1404 *
1405 * IN: zp - znode requiring timestamp update
0df9673f 1406 * flag - ATTR_MTIME, ATTR_CTIME flags
6d111134 1407 *
0df9673f 1408 * OUT: zp - z_seq
6d111134
TC
1409 * mtime - new mtime
1410 * ctime - new ctime
1411 *
0df9673f
CC
1412 * Note: We don't update atime here, because we rely on Linux VFS to do
1413 * atime updating.
6d111134 1414 */
34dc7c2f 1415void
428870ff 1416zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
0df9673f 1417 uint64_t ctime[2])
34dc7c2f 1418{
6413c95f 1419 inode_timespec_t now;
34dc7c2f 1420
34dc7c2f
BB
1421 gethrestime(&now);
1422
0df9673f 1423 zp->z_seq++;
34dc7c2f 1424
3558fd73 1425 if (flag & ATTR_MTIME) {
428870ff 1426 ZFS_TIME_ENCODE(&now, mtime);
9f5f0019 1427 ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
3558fd73 1428 if (ZTOZSB(zp)->z_use_fuids) {
428870ff
BB
1429 zp->z_pflags |= (ZFS_ARCHIVE |
1430 ZFS_AV_MODIFIED);
1431 }
34dc7c2f
BB
1432 }
1433
3558fd73 1434 if (flag & ATTR_CTIME) {
428870ff 1435 ZFS_TIME_ENCODE(&now, ctime);
9f5f0019 1436 ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime);
3558fd73 1437 if (ZTOZSB(zp)->z_use_fuids)
428870ff 1438 zp->z_pflags |= ZFS_ARCHIVE;
34dc7c2f
BB
1439 }
1440}
1441
34dc7c2f
BB
1442/*
1443 * Grow the block size for a file.
1444 *
1445 * IN: zp - znode of file to free data in.
1446 * size - requested block size
1447 * tx - open transaction.
1448 *
1449 * NOTE: this function assumes that the znode is write locked.
1450 */
1451void
1452zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1453{
1454 int error;
1455 u_longlong_t dummy;
1456
1457 if (size <= zp->z_blksz)
1458 return;
1459 /*
1460 * If the file size is already greater than the current blocksize,
1461 * we will not grow. If there is more than one block in a file,
1462 * the blocksize cannot change.
1463 */
428870ff 1464 if (zp->z_blksz && zp->z_size > zp->z_blksz)
34dc7c2f
BB
1465 return;
1466
3558fd73 1467 error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
34dc7c2f 1468 size, 0, tx);
428870ff 1469
34dc7c2f
BB
1470 if (error == ENOTSUP)
1471 return;
c99c9001 1472 ASSERT0(error);
34dc7c2f
BB
1473
1474 /* What blocksize did we actually get? */
428870ff 1475 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
34dc7c2f
BB
1476}
1477
34dc7c2f 1478/*
b128c09f 1479 * Increase the file length
34dc7c2f
BB
1480 *
1481 * IN: zp - znode of file to free data in.
b128c09f 1482 * end - new end-of-file
34dc7c2f 1483 *
19d55079 1484 * RETURN: 0 on success, error code on failure
34dc7c2f 1485 */
b128c09f
BB
1486static int
1487zfs_extend(znode_t *zp, uint64_t end)
34dc7c2f 1488{
0037b49e 1489 zfsvfs_t *zfsvfs = ZTOZSB(zp);
b128c09f 1490 dmu_tx_t *tx;
bd4dde8e 1491 zfs_locked_range_t *lr;
b128c09f 1492 uint64_t newblksz;
34dc7c2f
BB
1493 int error;
1494
34dc7c2f 1495 /*
b128c09f 1496 * We will change zp_size, lock the whole file.
34dc7c2f 1497 */
2cc479d0 1498 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
34dc7c2f
BB
1499
1500 /*
1501 * Nothing to do if file already at desired length.
1502 */
428870ff 1503 if (end <= zp->z_size) {
2cc479d0 1504 zfs_rangelock_exit(lr);
34dc7c2f
BB
1505 return (0);
1506 }
0037b49e 1507 tx = dmu_tx_create(zfsvfs->z_os);
428870ff
BB
1508 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1509 zfs_sa_upgrade_txholds(tx, zp);
b128c09f 1510 if (end > zp->z_blksz &&
0037b49e 1511 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
34dc7c2f
BB
1512 /*
1513 * We are growing the file past the current block size.
1514 */
3558fd73 1515 if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
f1512ee6
MA
1516 /*
1517 * File's blocksize is already larger than the
1518 * "recordsize" property. Only let it grow to
1519 * the next power of 2.
1520 */
34dc7c2f 1521 ASSERT(!ISP2(zp->z_blksz));
f1512ee6 1522 newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
34dc7c2f 1523 } else {
3558fd73 1524 newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
34dc7c2f 1525 }
b128c09f
BB
1526 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1527 } else {
1528 newblksz = 0;
34dc7c2f
BB
1529 }
1530
384f8a09 1531 error = dmu_tx_assign(tx, TXG_WAIT);
34dc7c2f 1532 if (error) {
34dc7c2f 1533 dmu_tx_abort(tx);
2cc479d0 1534 zfs_rangelock_exit(lr);
34dc7c2f
BB
1535 return (error);
1536 }
1537
b128c09f
BB
1538 if (newblksz)
1539 zfs_grow_blocksize(zp, newblksz, tx);
34dc7c2f 1540
428870ff
BB
1541 zp->z_size = end;
1542
3558fd73 1543 VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
428870ff 1544 &zp->z_size, sizeof (zp->z_size), tx));
34dc7c2f 1545
2cc479d0 1546 zfs_rangelock_exit(lr);
34dc7c2f 1547
b128c09f 1548 dmu_tx_commit(tx);
34dc7c2f 1549
b128c09f
BB
1550 return (0);
1551}
1552
223df016
TC
1553/*
1554 * zfs_zero_partial_page - Modeled after update_pages() but
1555 * with different arguments and semantics for use by zfs_freesp().
1556 *
1557 * Zeroes a piece of a single page cache entry for zp at offset
1558 * start and length len.
1559 *
1560 * Caller must acquire a range lock on the file for the region
1561 * being zeroed in order that the ARC and page cache stay in sync.
1562 */
1563static void
1564zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
1565{
1566 struct address_space *mp = ZTOI(zp)->i_mapping;
1567 struct page *pp;
1568 int64_t off;
1569 void *pb;
1570
8b1899d3 1571 ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));
223df016 1572
8b1899d3
BB
1573 off = start & (PAGE_SIZE - 1);
1574 start &= PAGE_MASK;
223df016 1575
8b1899d3 1576 pp = find_lock_page(mp, start >> PAGE_SHIFT);
223df016
TC
1577 if (pp) {
1578 if (mapping_writably_mapped(mp))
1579 flush_dcache_page(pp);
1580
1581 pb = kmap(pp);
1582 bzero(pb + off, len);
1583 kunmap(pp);
1584
1585 if (mapping_writably_mapped(mp))
1586 flush_dcache_page(pp);
1587
1588 mark_page_accessed(pp);
1589 SetPageUptodate(pp);
1590 ClearPageError(pp);
1591 unlock_page(pp);
8b1899d3 1592 put_page(pp);
223df016
TC
1593 }
1594}
1595
b128c09f
BB
1596/*
1597 * Free space in a file.
1598 *
1599 * IN: zp - znode of file to free data in.
1600 * off - start of section to free.
1601 * len - length of section to free.
1602 *
19d55079 1603 * RETURN: 0 on success, error code on failure
b128c09f
BB
1604 */
1605static int
1606zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1607{
0037b49e 1608 zfsvfs_t *zfsvfs = ZTOZSB(zp);
bd4dde8e 1609 zfs_locked_range_t *lr;
b128c09f
BB
1610 int error;
1611
1612 /*
1613 * Lock the range being freed.
1614 */
2cc479d0 1615 lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
b128c09f
BB
1616
1617 /*
1618 * Nothing to do if file already at desired length.
1619 */
428870ff 1620 if (off >= zp->z_size) {
2cc479d0 1621 zfs_rangelock_exit(lr);
b128c09f 1622 return (0);
34dc7c2f
BB
1623 }
1624
428870ff
BB
1625 if (off + len > zp->z_size)
1626 len = zp->z_size - off;
b128c09f 1627
0037b49e 1628 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
b128c09f 1629
223df016
TC
1630 /*
1631 * Zero partial page cache entries. This must be done under a
1632 * range lock in order to keep the ARC and page cache in sync.
1633 */
1634 if (zp->z_is_mapped) {
1635 loff_t first_page, last_page, page_len;
1636 loff_t first_page_offset, last_page_offset;
1637
1638 /* first possible full page in hole */
8b1899d3 1639 first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
223df016 1640 /* last page of hole */
8b1899d3 1641 last_page = (off + len) >> PAGE_SHIFT;
223df016
TC
1642
1643 /* offset of first_page */
8b1899d3 1644 first_page_offset = first_page << PAGE_SHIFT;
223df016 1645 /* offset of last_page */
8b1899d3 1646 last_page_offset = last_page << PAGE_SHIFT;
223df016 1647
cb08f063
TC
1648 /* truncate whole pages */
1649 if (last_page_offset > first_page_offset) {
1650 truncate_inode_pages_range(ZTOI(zp)->i_mapping,
1651 first_page_offset, last_page_offset - 1);
1652 }
1653
1654 /* truncate sub-page ranges */
223df016
TC
1655 if (first_page > last_page) {
1656 /* entire punched area within a single page */
1657 zfs_zero_partial_page(zp, off, len);
1658 } else {
1659 /* beginning of punched area at the end of a page */
1660 page_len = first_page_offset - off;
1661 if (page_len > 0)
1662 zfs_zero_partial_page(zp, off, page_len);
1663
1664 /* end of punched area at the beginning of a page */
1665 page_len = off + len - last_page_offset;
1666 if (page_len > 0)
1667 zfs_zero_partial_page(zp, last_page_offset,
1668 page_len);
1669 }
1670 }
2cc479d0 1671 zfs_rangelock_exit(lr);
34dc7c2f 1672
b128c09f
BB
1673 return (error);
1674}
1675
1676/*
1677 * Truncate a file
1678 *
1679 * IN: zp - znode of file to free data in.
1680 * end - new end-of-file.
1681 *
19d55079 1682 * RETURN: 0 on success, error code on failure
b128c09f
BB
1683 */
1684static int
1685zfs_trunc(znode_t *zp, uint64_t end)
1686{
0037b49e 1687 zfsvfs_t *zfsvfs = ZTOZSB(zp);
b128c09f 1688 dmu_tx_t *tx;
bd4dde8e 1689 zfs_locked_range_t *lr;
b128c09f 1690 int error;
572e2857
BB
1691 sa_bulk_attr_t bulk[2];
1692 int count = 0;
b128c09f
BB
1693
1694 /*
1695 * We will change zp_size, lock the whole file.
1696 */
2cc479d0 1697 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
b128c09f
BB
1698
1699 /*
1700 * Nothing to do if file already at desired length.
1701 */
428870ff 1702 if (end >= zp->z_size) {
2cc479d0 1703 zfs_rangelock_exit(lr);
b128c09f
BB
1704 return (0);
1705 }
1706
18a2485f
FS
1707 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
1708 DMU_OBJECT_END);
b128c09f 1709 if (error) {
2cc479d0 1710 zfs_rangelock_exit(lr);
b128c09f
BB
1711 return (error);
1712 }
0037b49e 1713 tx = dmu_tx_create(zfsvfs->z_os);
428870ff
BB
1714 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1715 zfs_sa_upgrade_txholds(tx, zp);
19d55079 1716 dmu_tx_mark_netfree(tx);
7a8f0e80 1717 error = dmu_tx_assign(tx, TXG_WAIT);
b128c09f 1718 if (error) {
b128c09f 1719 dmu_tx_abort(tx);
2cc479d0 1720 zfs_rangelock_exit(lr);
b128c09f
BB
1721 return (error);
1722 }
b128c09f 1723
428870ff 1724 zp->z_size = end;
0037b49e 1725 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
572e2857 1726 NULL, &zp->z_size, sizeof (zp->z_size));
428870ff 1727
572e2857
BB
1728 if (end == 0) {
1729 zp->z_pflags &= ~ZFS_SPARSE;
0037b49e 1730 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
572e2857
BB
1731 NULL, &zp->z_pflags, 8);
1732 }
1733 VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
b128c09f 1734
34dc7c2f 1735 dmu_tx_commit(tx);
2cc479d0 1736 zfs_rangelock_exit(lr);
34dc7c2f
BB
1737
1738 return (0);
1739}
1740
b128c09f
BB
1741/*
1742 * Free space in a file
1743 *
1744 * IN: zp - znode of file to free data in.
1745 * off - start of range
1746 * len - end of range (0 => EOF)
1747 * flag - current file open mode flags.
1748 * log - TRUE if this action should be logged
1749 *
19d55079 1750 * RETURN: 0 on success, error code on failure
b128c09f
BB
1751 */
1752int
1753zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1754{
b128c09f 1755 dmu_tx_t *tx;
0037b49e
BB
1756 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1757 zilog_t *zilog = zfsvfs->z_log;
428870ff
BB
1758 uint64_t mode;
1759 uint64_t mtime[2], ctime[2];
1760 sa_bulk_attr_t bulk[3];
1761 int count = 0;
b128c09f
BB
1762 int error;
1763
0037b49e 1764 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
428870ff
BB
1765 sizeof (mode))) != 0)
1766 return (error);
1767
1768 if (off > zp->z_size) {
b128c09f
BB
1769 error = zfs_extend(zp, off+len);
1770 if (error == 0 && log)
1771 goto log;
223df016 1772 goto out;
b128c09f
BB
1773 }
1774
b128c09f
BB
1775 if (len == 0) {
1776 error = zfs_trunc(zp, off);
1777 } else {
1778 if ((error = zfs_free_range(zp, off, len)) == 0 &&
428870ff 1779 off + len > zp->z_size)
b128c09f
BB
1780 error = zfs_extend(zp, off+len);
1781 }
1782 if (error || !log)
223df016 1783 goto out;
b128c09f 1784log:
0037b49e 1785 tx = dmu_tx_create(zfsvfs->z_os);
428870ff
BB
1786 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1787 zfs_sa_upgrade_txholds(tx, zp);
384f8a09 1788 error = dmu_tx_assign(tx, TXG_WAIT);
b128c09f 1789 if (error) {
b128c09f 1790 dmu_tx_abort(tx);
223df016 1791 goto out;
b128c09f
BB
1792 }
1793
0037b49e
BB
1794 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
1795 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
1796 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
428870ff 1797 NULL, &zp->z_pflags, 8);
0df9673f 1798 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
428870ff
BB
1799 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1800 ASSERT(error == 0);
1801
b128c09f
BB
1802 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1803
1804 dmu_tx_commit(tx);
223df016 1805
fc273894 1806 zfs_znode_update_vfs(zp);
223df016
TC
1807 error = 0;
1808
1809out:
1810 /*
1811 * Truncate the page cache - for file truncate operations, use
1812 * the purpose-built API for truncations. For punching operations,
cb08f063 1813 * the truncation is handled under a range lock in zfs_free_range.
223df016
TC
1814 */
1815 if (len == 0)
1816 truncate_setsize(ZTOI(zp), off);
223df016 1817 return (error);
b128c09f
BB
1818}
1819
34dc7c2f
BB
1820void
1821zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1822{
22872ff5 1823 struct super_block *sb;
0037b49e 1824 zfsvfs_t *zfsvfs;
428870ff 1825 uint64_t moid, obj, sa_obj, version;
22872ff5 1826 uint64_t sense = ZFS_CASE_SENSITIVE;
34dc7c2f
BB
1827 uint64_t norm = 0;
1828 nvpair_t *elem;
c96c36fa 1829 int size;
34dc7c2f 1830 int error;
22872ff5
BB
1831 int i;
1832 znode_t *rootzp = NULL;
1833 vattr_t vattr;
1834 znode_t *zp;
1835 zfs_acl_ids_t acl_ids;
34dc7c2f
BB
1836
1837 /*
1838 * First attempt to create master node.
1839 */
1840 /*
1841 * In an empty objset, there are no blocks to read and thus
1842 * there can be no i/o errors (which we assert below).
1843 */
1844 moid = MASTER_NODE_OBJ;
1845 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1846 DMU_OT_NONE, 0, tx);
1847 ASSERT(error == 0);
1848
1849 /*
1850 * Set starting attributes.
1851 */
428870ff 1852 version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
34dc7c2f
BB
1853 elem = NULL;
1854 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1855 /* For the moment we expect all zpl props to be uint64_ts */
1856 uint64_t val;
1857 char *name;
1858
1859 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1860 VERIFY(nvpair_value_uint64(elem, &val) == 0);
1861 name = nvpair_name(elem);
1862 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
9babb374
BB
1863 if (val < version)
1864 version = val;
34dc7c2f
BB
1865 } else {
1866 error = zap_update(os, moid, name, 8, 1, &val, tx);
1867 }
1868 ASSERT(error == 0);
1869 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1870 norm = val;
22872ff5
BB
1871 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1872 sense = val;
34dc7c2f
BB
1873 }
1874 ASSERT(version != 0);
9babb374 1875 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
34dc7c2f 1876
428870ff
BB
1877 /*
1878 * Create zap object used for SA attribute registration
1879 */
1880
1881 if (version >= ZPL_VERSION_SA) {
1882 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
1883 DMU_OT_NONE, 0, tx);
1884 error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
1885 ASSERT(error == 0);
1886 } else {
1887 sa_obj = 0;
1888 }
34dc7c2f
BB
1889 /*
1890 * Create a delete queue.
1891 */
9babb374 1892 obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
34dc7c2f 1893
9babb374 1894 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
34dc7c2f
BB
1895 ASSERT(error == 0);
1896
9babb374 1897 /*
0037b49e 1898 * Create root znode. Create minimal znode/inode/zfsvfs/sb
22872ff5 1899 * to allow zfs_mknode to work.
9babb374 1900 */
22872ff5
BB
1901 vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
1902 vattr.va_mode = S_IFDIR|0755;
1903 vattr.va_uid = crgetuid(cr);
1904 vattr.va_gid = crgetgid(cr);
1905
79c76d5b 1906 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
a43570c5
TK
1907 rootzp->z_unlinked = B_FALSE;
1908 rootzp->z_atime_dirty = B_FALSE;
22872ff5 1909 rootzp->z_is_sa = USE_SA(version, os);
9c5167d1 1910 rootzp->z_pflags = 0;
22872ff5 1911
0037b49e
BB
1912 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1913 zfsvfs->z_os = os;
1914 zfsvfs->z_parent = zfsvfs;
1915 zfsvfs->z_version = version;
1916 zfsvfs->z_use_fuids = USE_FUIDS(version, os);
1917 zfsvfs->z_use_sa = USE_SA(version, os);
1918 zfsvfs->z_norm = norm;
22872ff5 1919
79c76d5b 1920 sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
0037b49e 1921 sb->s_fs_info = zfsvfs;
22872ff5
BB
1922
1923 ZTOI(rootzp)->i_sb = sb;
1924
1925 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
0037b49e 1926 &zfsvfs->z_attr_table);
9babb374 1927
22872ff5 1928 ASSERT(error == 0);
9babb374 1929
60101509 1930 /*
22872ff5
BB
1931 * Fold case on file systems that are always or sometimes case
1932 * insensitive.
60101509 1933 */
22872ff5 1934 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
0037b49e 1935 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
60101509 1936
0037b49e
BB
1937 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1938 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
22872ff5 1939 offsetof(znode_t, z_link_node));
60101509 1940
c96c36fa 1941 size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
0037b49e
BB
1942 zfsvfs->z_hold_size = size;
1943 zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
1944 KM_SLEEP);
1945 zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
c96c36fa 1946 for (i = 0; i != size; i++) {
0037b49e 1947 avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
c96c36fa 1948 sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
0037b49e 1949 mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
c96c36fa 1950 }
60101509 1951
22872ff5
BB
1952 VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
1953 cr, NULL, &acl_ids));
1954 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
1955 ASSERT3P(zp, ==, rootzp);
1956 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1957 ASSERT(error == 0);
1958 zfs_acl_ids_free(&acl_ids);
60101509 1959
22872ff5
BB
1960 atomic_set(&ZTOI(rootzp)->i_count, 0);
1961 sa_handle_destroy(rootzp->z_sa_hdl);
22872ff5
BB
1962 kmem_cache_free(znode_cache, rootzp);
1963
c96c36fa 1964 for (i = 0; i != size; i++) {
0037b49e
BB
1965 avl_destroy(&zfsvfs->z_hold_trees[i]);
1966 mutex_destroy(&zfsvfs->z_hold_locks[i]);
c96c36fa 1967 }
2708f716 1968
c17486b2
GN
1969 mutex_destroy(&zfsvfs->z_znodes_lock);
1970
0037b49e
BB
1971 vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
1972 vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
2708f716 1973 kmem_free(sb, sizeof (struct super_block));
0037b49e 1974 kmem_free(zfsvfs, sizeof (zfsvfs_t));
34dc7c2f 1975}
34dc7c2f 1976#endif /* _KERNEL */
428870ff 1977
34dc7c2f 1978static int
572e2857
BB
1979zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
1980{
1981 uint64_t sa_obj = 0;
1982 int error;
1983
1984 error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
1985 if (error != 0 && error != ENOENT)
1986 return (error);
1987
1988 error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
1989 return (error);
1990}
1991
1992static int
1993zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
7b8518cb 1994 dmu_buf_t **db, void *tag)
34dc7c2f 1995{
34dc7c2f 1996 dmu_object_info_t doi;
34dc7c2f 1997 int error;
428870ff 1998
7b8518cb 1999 if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
34dc7c2f
BB
2000 return (error);
2001
572e2857 2002 dmu_object_info_from_db(*db, &doi);
428870ff
BB
2003 if ((doi.doi_bonus_type != DMU_OT_SA &&
2004 doi.doi_bonus_type != DMU_OT_ZNODE) ||
d6320ddb
BB
2005 (doi.doi_bonus_type == DMU_OT_ZNODE &&
2006 doi.doi_bonus_size < sizeof (znode_phys_t))) {
7b8518cb 2007 sa_buf_rele(*db, tag);
2e528b49 2008 return (SET_ERROR(ENOTSUP));
34dc7c2f
BB
2009 }
2010
572e2857
BB
2011 error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
2012 if (error != 0) {
7b8518cb 2013 sa_buf_rele(*db, tag);
428870ff
BB
2014 return (error);
2015 }
2016
572e2857
BB
2017 return (0);
2018}
2019
65c7cc49 2020static void
7b8518cb 2021zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
572e2857
BB
2022{
2023 sa_handle_destroy(hdl);
7b8518cb 2024 sa_buf_rele(db, tag);
572e2857
BB
2025}
2026
2027/*
2028 * Given an object number, return its parent object number and whether
2029 * or not the object is an extended attribute directory.
2030 */
2031static int
b23ad7f3
JJ
2032zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
2033 uint64_t *pobjp, int *is_xattrdir)
572e2857
BB
2034{
2035 uint64_t parent;
2036 uint64_t pflags;
2037 uint64_t mode;
b23ad7f3 2038 uint64_t parent_mode;
572e2857 2039 sa_bulk_attr_t bulk[3];
b23ad7f3
JJ
2040 sa_handle_t *sa_hdl;
2041 dmu_buf_t *sa_db;
572e2857
BB
2042 int count = 0;
2043 int error;
2044
2045 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
2046 &parent, sizeof (parent));
428870ff 2047 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
572e2857 2048 &pflags, sizeof (pflags));
428870ff 2049 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
572e2857 2050 &mode, sizeof (mode));
428870ff 2051
572e2857 2052 if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
428870ff 2053 return (error);
572e2857 2054
b23ad7f3
JJ
2055 /*
2056 * When a link is removed its parent pointer is not changed and will
2057 * be invalid. There are two cases where a link is removed but the
2058 * file stays around, when it goes to the delete queue and when there
2059 * are additional links.
2060 */
2061 error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
2062 if (error != 0)
2063 return (error);
2064
2065 error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
2066 zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2067 if (error != 0)
2068 return (error);
2069
428870ff 2070 *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
34dc7c2f 2071
b23ad7f3
JJ
2072 /*
2073 * Extended attributes can be applied to files, directories, etc.
2074 * Otherwise the parent must be a directory.
2075 */
2076 if (!*is_xattrdir && !S_ISDIR(parent_mode))
ecb2b7dc 2077 return (SET_ERROR(EINVAL));
b23ad7f3
JJ
2078
2079 *pobjp = parent;
2080
34dc7c2f
BB
2081 return (0);
2082}
2083
572e2857
BB
2084/*
2085 * Given an object number, return some zpl level statistics
2086 */
2087static int
2088zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
2089 zfs_stat_t *sb)
34dc7c2f 2090{
572e2857
BB
2091 sa_bulk_attr_t bulk[4];
2092 int count = 0;
2093
2094 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2095 &sb->zs_mode, sizeof (sb->zs_mode));
2096 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
2097 &sb->zs_gen, sizeof (sb->zs_gen));
2098 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
2099 &sb->zs_links, sizeof (sb->zs_links));
2100 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
2101 &sb->zs_ctime, sizeof (sb->zs_ctime));
2102
2103 return (sa_bulk_lookup(hdl, bulk, count));
2104}
2105
2106static int
2107zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
2108 sa_attr_type_t *sa_table, char *buf, int len)
2109{
2110 sa_handle_t *sa_hdl;
2111 sa_handle_t *prevhdl = NULL;
2112 dmu_buf_t *prevdb = NULL;
2113 dmu_buf_t *sa_db = NULL;
34dc7c2f
BB
2114 char *path = buf + len - 1;
2115 int error;
2116
2117 *path = '\0';
572e2857 2118 sa_hdl = hdl;
428870ff 2119
64c1dcef
PD
2120 uint64_t deleteq_obj;
2121 VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
2122 ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
2123 error = zap_lookup_int(osp, deleteq_obj, obj);
2124 if (error == 0) {
2125 return (ESTALE);
2126 } else if (error != ENOENT) {
2127 return (error);
2128 }
2129 error = 0;
2130
34dc7c2f 2131 for (;;) {
17897ce2 2132 uint64_t pobj = 0;
34dc7c2f
BB
2133 char component[MAXNAMELEN + 2];
2134 size_t complen;
17897ce2 2135 int is_xattrdir = 0;
34dc7c2f 2136
4f22619a
KT
2137 if (prevdb) {
2138 ASSERT(prevhdl != NULL);
7b8518cb 2139 zfs_release_sa_handle(prevhdl, prevdb, FTAG);
4f22619a 2140 }
572e2857 2141
b23ad7f3 2142 if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
572e2857 2143 &is_xattrdir)) != 0)
34dc7c2f
BB
2144 break;
2145
2146 if (pobj == obj) {
2147 if (path[0] != '/')
2148 *--path = '/';
2149 break;
2150 }
2151
2152 component[0] = '/';
2153 if (is_xattrdir) {
2154 (void) sprintf(component + 1, "<xattrdir>");
2155 } else {
2156 error = zap_value_search(osp, pobj, obj,
2157 ZFS_DIRENT_OBJ(-1ULL), component + 1);
2158 if (error != 0)
2159 break;
2160 }
2161
2162 complen = strlen(component);
2163 path -= complen;
2164 ASSERT(path >= buf);
2165 bcopy(component, path, complen);
2166 obj = pobj;
572e2857
BB
2167
2168 if (sa_hdl != hdl) {
2169 prevhdl = sa_hdl;
2170 prevdb = sa_db;
2171 }
7b8518cb 2172 error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
572e2857
BB
2173 if (error != 0) {
2174 sa_hdl = prevhdl;
2175 sa_db = prevdb;
2176 break;
2177 }
2178 }
2179
2180 if (sa_hdl != NULL && sa_hdl != hdl) {
2181 ASSERT(sa_db != NULL);
7b8518cb 2182 zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
34dc7c2f
BB
2183 }
2184
2185 if (error == 0)
2186 (void) memmove(buf, path, buf + len - path);
428870ff 2187
34dc7c2f
BB
2188 return (error);
2189}
572e2857
BB
2190
2191int
2192zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
2193{
2194 sa_attr_type_t *sa_table;
2195 sa_handle_t *hdl;
2196 dmu_buf_t *db;
2197 int error;
2198
2199 error = zfs_sa_setup(osp, &sa_table);
2200 if (error != 0)
2201 return (error);
2202
7b8518cb 2203 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
572e2857
BB
2204 if (error != 0)
2205 return (error);
2206
2207 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2208
7b8518cb 2209 zfs_release_sa_handle(hdl, db, FTAG);
572e2857
BB
2210 return (error);
2211}
2212
2213int
2214zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
2215 char *buf, int len)
2216{
2217 char *path = buf + len - 1;
2218 sa_attr_type_t *sa_table;
2219 sa_handle_t *hdl;
2220 dmu_buf_t *db;
2221 int error;
2222
2223 *path = '\0';
2224
2225 error = zfs_sa_setup(osp, &sa_table);
2226 if (error != 0)
2227 return (error);
2228
7b8518cb 2229 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
572e2857
BB
2230 if (error != 0)
2231 return (error);
2232
2233 error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
2234 if (error != 0) {
7b8518cb 2235 zfs_release_sa_handle(hdl, db, FTAG);
572e2857
BB
2236 return (error);
2237 }
2238
2239 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2240
7b8518cb 2241 zfs_release_sa_handle(hdl, db, FTAG);
572e2857
BB
2242 return (error);
2243}
c28b2279 2244
#if defined(_KERNEL)
/* Symbols made available to other kernel modules. */
EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);

/* Module parameters; both are registered with mode 0644. */
/* CSTYLED */
module_param(zfs_object_mutex_size, uint, 0644);
MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");

module_param(zfs_unlink_suspend_progress, int, 0644);
MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
"(debug - leaks space into the unlinked set)");
#endif