]> git.proxmox.com Git - mirror_zfs.git/blame - module/os/linux/zfs/zfs_znode.c
FreeBSD: Add zfs_link_create() error handling
[mirror_zfs.git] / module / os / linux / zfs / zfs_znode.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
1d3ba0bf 9 * or https://opensource.org/licenses/CDDL-1.0.
34dc7c2f
BB
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
428870ff 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
5d43cc9a 23 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
34dc7c2f
BB
24 */
25
26/* Portions Copyright 2007 Jeremy Teo */
27
34dc7c2f
BB
28#ifdef _KERNEL
29#include <sys/types.h>
30#include <sys/param.h>
31#include <sys/time.h>
34dc7c2f 32#include <sys/sysmacros.h>
34dc7c2f 33#include <sys/mntent.h>
34dc7c2f
BB
34#include <sys/u8_textprep.h>
35#include <sys/dsl_dataset.h>
36#include <sys/vfs.h>
34dc7c2f
BB
37#include <sys/vnode.h>
38#include <sys/file.h>
39#include <sys/kmem.h>
40#include <sys/errno.h>
34dc7c2f 41#include <sys/atomic.h>
34dc7c2f
BB
42#include <sys/zfs_dir.h>
43#include <sys/zfs_acl.h>
44#include <sys/zfs_ioctl.h>
45#include <sys/zfs_rlock.h>
46#include <sys/zfs_fuid.h>
3558fd73 47#include <sys/zfs_vnops.h>
ebe7e575 48#include <sys/zfs_ctldir.h>
428870ff 49#include <sys/dnode.h>
34dc7c2f 50#include <sys/fs/zfs.h>
3558fd73 51#include <sys/zpl.h>
34dc7c2f
BB
52#endif /* _KERNEL */
53
54#include <sys/dmu.h>
f1512ee6 55#include <sys/dmu_objset.h>
50c957f7 56#include <sys/dmu_tx.h>
27d96d22 57#include <sys/zfs_refcount.h>
34dc7c2f
BB
58#include <sys/stat.h>
59#include <sys/zap.h>
60#include <sys/zfs_znode.h>
428870ff
BB
61#include <sys/sa.h>
62#include <sys/zfs_sa.h>
572e2857 63#include <sys/zfs_stat.h>
34dc7c2f
BB
64
65#include "zfs_prop.h"
428870ff 66#include "zfs_comutil.h"
34dc7c2f
BB
67
68/*
69 * Functions needed for userland (ie: libzpool) are not put under
70 * #ifdef _KERNEL; the rest of the functions have dependencies
71 * (such as VFS logic) that will not compile easily in userland.
72 */
73#ifdef _KERNEL
9babb374 74
/* kmem cache backing znode_t objects; created in zfs_znode_init(). */
b128c09f 75static kmem_cache_t *znode_cache = NULL;
/* kmem cache backing the per-object znode_hold_t lock records. */
c96c36fa 76static kmem_cache_t *znode_hold_cache = NULL;
/* NOTE(review): presumably sizes the z_hold_locks/z_hold_trees arrays; confirm. */
0720116d 77unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
34dc7c2f 78
dcec0a12
AP
79/*
80 * This is used by the test suite so that it can delay znodes from being
81 * freed in order to inspect the unlinked set.
82 */
/* Defaults to 0. */
18168da7 83static int zfs_unlink_suspend_progress = 0;
dcec0a12 84
5d43cc9a
MA
85/*
86 * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
87 * z_rangelock. It will modify the offset and length of the lock to reflect
88 * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
89 * called with the rangelock_t's rl_lock held, which avoids races.
90 */
91static void
bd4dde8e 92zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
5d43cc9a
MA
93{
94 znode_t *zp = arg;
95
96 /*
97 * If in append mode, convert to writer and lock starting at the
98 * current end of file.
99 */
100 if (new->lr_type == RL_APPEND) {
101 new->lr_offset = zp->z_size;
102 new->lr_type = RL_WRITER;
103 }
104
105 /*
106 * If we need to grow the block size then lock the whole file range.
107 */
108 uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
109 if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
110 zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
111 new->lr_offset = 0;
112 new->lr_length = UINT64_MAX;
113 }
114}
115
34dc7c2f 116static int
b128c09f 117zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
34dc7c2f 118{
ef70eff1 119 (void) arg, (void) kmflags;
34dc7c2f
BB
120 znode_t *zp = buf;
121
3558fd73 122 inode_init_once(ZTOI(zp));
b128c09f
BB
123 list_link_init(&zp->z_link_node);
124
34dc7c2f 125 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f 126 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
448d7aaa 127 rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
34dc7c2f 128 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
82a37189 129 rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
34dc7c2f 130
2cc479d0 131 zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
34dc7c2f 132
b128c09f 133 zp->z_dirlocks = NULL;
45d1cae3 134 zp->z_acl_cached = NULL;
82a37189 135 zp->z_xattr_cached = NULL;
98701490 136 zp->z_xattr_parent = 0;
411f4a01
SN
137 zp->z_sync_writes_cnt = 0;
138 zp->z_async_writes_cnt = 0;
139
34dc7c2f
BB
140 return (0);
141}
142
34dc7c2f 143static void
b128c09f 144zfs_znode_cache_destructor(void *buf, void *arg)
34dc7c2f 145{
ef70eff1 146 (void) arg;
34dc7c2f
BB
147 znode_t *zp = buf;
148
b128c09f 149 ASSERT(!list_link_active(&zp->z_link_node));
34dc7c2f 150 mutex_destroy(&zp->z_lock);
34dc7c2f
BB
151 rw_destroy(&zp->z_parent_lock);
152 rw_destroy(&zp->z_name_lock);
153 mutex_destroy(&zp->z_acl_lock);
82a37189 154 rw_destroy(&zp->z_xattr_lock);
2cc479d0 155 zfs_rangelock_fini(&zp->z_rangelock);
34dc7c2f 156
c903a756
RM
157 ASSERT3P(zp->z_dirlocks, ==, NULL);
158 ASSERT3P(zp->z_acl_cached, ==, NULL);
159 ASSERT3P(zp->z_xattr_cached, ==, NULL);
411f4a01
SN
160
161 ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
162 ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
b128c09f
BB
163}
164
c96c36fa
BB
165static int
166zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
167{
ef70eff1 168 (void) arg, (void) kmflags;
c96c36fa
BB
169 znode_hold_t *zh = buf;
170
171 mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
5f45e3f6 172 zh->zh_refcount = 0;
c96c36fa
BB
173
174 return (0);
175}
176
177static void
178zfs_znode_hold_cache_destructor(void *buf, void *arg)
179{
ef70eff1 180 (void) arg;
c96c36fa
BB
181 znode_hold_t *zh = buf;
182
183 mutex_destroy(&zh->zh_lock);
c96c36fa
BB
184}
185
34dc7c2f
BB
186void
187zfs_znode_init(void)
188{
189 /*
5074bfe8
TC
190 * Initialize zcache. The KMC_SLAB hint is used in order that it be
191 * backed by kmalloc() when on the Linux slab in order that any
192 * wait_on_bit() operations on the related inode operate properly.
34dc7c2f
BB
193 */
194 ASSERT(znode_cache == NULL);
195 znode_cache = kmem_cache_create("zfs_znode_cache",
196 sizeof (znode_t), 0, zfs_znode_cache_constructor,
5074bfe8 197 zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
c96c36fa
BB
198
199 ASSERT(znode_hold_cache == NULL);
200 znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
201 sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
202 zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
34dc7c2f
BB
203}
204
205void
206zfs_znode_fini(void)
207{
34dc7c2f
BB
208 /*
209 * Cleanup zcache
210 */
211 if (znode_cache)
212 kmem_cache_destroy(znode_cache);
213 znode_cache = NULL;
c96c36fa
BB
214
215 if (znode_hold_cache)
216 kmem_cache_destroy(znode_hold_cache);
217 znode_hold_cache = NULL;
218}
219
220/*
221 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
222 * serialize access to a znode and its SA buffer while the object is being
223 * created or destroyed. This kind of locking would normally reside in the
224 * znode itself but in this case that's impossible because the znode and SA
225 * buffer may not yet exist. Therefore the locking is handled externally
bf169e9f 226 * with an array of mutexes and AVL trees which contain per-object locks.
c96c36fa
BB
227 *
228 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
229 * in to the correct AVL tree and finally the per-object lock is held. In
230 * zfs_znode_hold_exit() the process is reversed. The per-object lock is
231 * released, removed from the AVL tree and destroyed if there are no waiters.
232 *
233 * This scheme has two important properties:
234 *
235 * 1) No memory allocations are performed while holding one of the z_hold_locks.
236 * This ensures evict(), which can be called from direct memory reclaim, will
237 * never block waiting on a z_hold_locks which just happens to have hashed
238 * to the same index.
239 *
240 * 2) All locks used to serialize access to an object are per-object and never
241 * shared. This minimizes lock contention without creating a large number
242 * of dedicated locks.
243 *
244 * On the downside it does require znode_hold_t structures to be frequently
245 * allocated and freed. However, because these are backed by a kmem cache
246 * and very short lived this cost is minimal.
247 */
248int
249zfs_znode_hold_compare(const void *a, const void *b)
250{
ee36c709
GN
251 const znode_hold_t *zh_a = (const znode_hold_t *)a;
252 const znode_hold_t *zh_b = (const znode_hold_t *)b;
253
ca577779 254 return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
c96c36fa
BB
255}
256
65c7cc49 257static boolean_t __maybe_unused
0037b49e 258zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
c96c36fa
BB
259{
260 znode_hold_t *zh, search;
0037b49e 261 int i = ZFS_OBJ_HASH(zfsvfs, obj);
37c56346 262 boolean_t held;
c96c36fa
BB
263
264 search.zh_obj = obj;
265
0037b49e
BB
266 mutex_enter(&zfsvfs->z_hold_locks[i]);
267 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
37c56346 268 held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
0037b49e 269 mutex_exit(&zfsvfs->z_hold_locks[i]);
c96c36fa 270
37c56346 271 return (held);
c96c36fa
BB
272}
273
c6dab6dd 274znode_hold_t *
0037b49e 275zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
c96c36fa
BB
276{
277 znode_hold_t *zh, *zh_new, search;
0037b49e 278 int i = ZFS_OBJ_HASH(zfsvfs, obj);
c96c36fa
BB
279 boolean_t found = B_FALSE;
280
281 zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
c96c36fa
BB
282 search.zh_obj = obj;
283
0037b49e
BB
284 mutex_enter(&zfsvfs->z_hold_locks[i]);
285 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
c96c36fa
BB
286 if (likely(zh == NULL)) {
287 zh = zh_new;
5f45e3f6 288 zh->zh_obj = obj;
0037b49e 289 avl_add(&zfsvfs->z_hold_trees[i], zh);
c96c36fa
BB
290 } else {
291 ASSERT3U(zh->zh_obj, ==, obj);
292 found = B_TRUE;
293 }
5f45e3f6
AM
294 zh->zh_refcount++;
295 ASSERT3S(zh->zh_refcount, >, 0);
0037b49e 296 mutex_exit(&zfsvfs->z_hold_locks[i]);
c96c36fa
BB
297
298 if (found == B_TRUE)
299 kmem_cache_free(znode_hold_cache, zh_new);
300
301 ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
c96c36fa
BB
302 mutex_enter(&zh->zh_lock);
303
304 return (zh);
305}
306
c6dab6dd 307void
0037b49e 308zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
c96c36fa 309{
0037b49e 310 int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
c96c36fa
BB
311 boolean_t remove = B_FALSE;
312
0037b49e 313 ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
c96c36fa
BB
314 mutex_exit(&zh->zh_lock);
315
0037b49e 316 mutex_enter(&zfsvfs->z_hold_locks[i]);
5f45e3f6
AM
317 ASSERT3S(zh->zh_refcount, >, 0);
318 if (--zh->zh_refcount == 0) {
0037b49e 319 avl_remove(&zfsvfs->z_hold_trees[i], zh);
c96c36fa
BB
320 remove = B_TRUE;
321 }
0037b49e 322 mutex_exit(&zfsvfs->z_hold_locks[i]);
c96c36fa
BB
323
324 if (remove == B_TRUE)
325 kmem_cache_free(znode_hold_cache, zh);
34dc7c2f
BB
326}
327
13a9a6f5
MM
/*
 * Return dev unchanged, converted to dev_t.
 */
dev_t
zfs_cmpldev(uint64_t dev)
{
	dev_t cmpl = dev;

	return (cmpl);
}
333
34dc7c2f 334static void
0037b49e 335zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
428870ff 336 dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
34dc7c2f 337{
0037b49e 338 ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
34dc7c2f
BB
339
340 mutex_enter(&zp->z_lock);
341
428870ff
BB
342 ASSERT(zp->z_sa_hdl == NULL);
343 ASSERT(zp->z_acl_cached == NULL);
344 if (sa_hdl == NULL) {
0037b49e 345 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
428870ff
BB
346 SA_HDL_SHARED, &zp->z_sa_hdl));
347 } else {
348 zp->z_sa_hdl = sa_hdl;
349 sa_set_userp(sa_hdl, zp);
350 }
34dc7c2f 351
428870ff 352 zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
34dc7c2f 353
34dc7c2f 354 mutex_exit(&zp->z_lock);
34dc7c2f
BB
355}
356
357void
358zfs_znode_dmu_fini(znode_t *zp)
359{
c6dab6dd 360 ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) ||
3558fd73 361 RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));
428870ff
BB
362
363 sa_handle_destroy(zp->z_sa_hdl);
364 zp->z_sa_hdl = NULL;
34dc7c2f
BB
365}
366
367/*
3558fd73
BB
368 * Called by new_inode() to allocate a new inode.
369 */
370int
371zfs_inode_alloc(struct super_block *sb, struct inode **ip)
372{
373 znode_t *zp;
374
79c76d5b 375 zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
3558fd73
BB
376 *ip = ZTOI(zp);
377
378 return (0);
379}
380
381/*
382 * Called in multiple places when an inode should be destroyed.
383 */
384void
385zfs_inode_destroy(struct inode *ip)
386{
387 znode_t *zp = ITOZ(ip);
0037b49e 388 zfsvfs_t *zfsvfs = ZTOZSB(zp);
3558fd73 389
0037b49e 390 mutex_enter(&zfsvfs->z_znodes_lock);
7b3e34ba 391 if (list_link_active(&zp->z_link_node)) {
0037b49e 392 list_remove(&zfsvfs->z_all_znodes, zp);
7b3e34ba 393 }
0037b49e 394 mutex_exit(&zfsvfs->z_znodes_lock);
3558fd73
BB
395
396 if (zp->z_acl_cached) {
397 zfs_acl_free(zp->z_acl_cached);
398 zp->z_acl_cached = NULL;
399 }
400
82a37189
BB
401 if (zp->z_xattr_cached) {
402 nvlist_free(zp->z_xattr_cached);
403 zp->z_xattr_cached = NULL;
404 }
405
3558fd73
BB
406 kmem_cache_free(znode_cache, zp);
407}
408
409static void
0037b49e 410zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
3558fd73 411{
aa6d8c10 412 uint64_t rdev = 0;
3558fd73
BB
413
414 switch (ip->i_mode & S_IFMT) {
415 case S_IFREG:
416 ip->i_op = &zpl_inode_operations;
6b0a4be5
RN
417#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
418 ip->i_fop = &zpl_file_operations.kabi_fops;
419#else
3558fd73 420 ip->i_fop = &zpl_file_operations;
6b0a4be5 421#endif
3558fd73
BB
422 ip->i_mapping->a_ops = &zpl_address_space_operations;
423 break;
424
425 case S_IFDIR:
dbf6108b
AS
426#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
427 ip->i_flags |= S_IOPS_WRAPPER;
428 ip->i_op = &zpl_dir_inode_operations.ops;
429#else
3558fd73 430 ip->i_op = &zpl_dir_inode_operations;
dbf6108b 431#endif
3558fd73
BB
432 ip->i_fop = &zpl_dir_file_operations;
433 ITOZ(ip)->z_zn_prefetch = B_TRUE;
434 break;
435
436 case S_IFLNK:
437 ip->i_op = &zpl_symlink_inode_operations;
438 break;
439
aa6d8c10
NB
440 /*
441 * rdev is only stored in a SA only for device files.
442 */
3558fd73
BB
443 case S_IFCHR:
444 case S_IFBLK:
0037b49e 445 (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
53b1d979 446 sizeof (rdev));
9a70e97f 447 zfs_fallthrough;
aa6d8c10
NB
448 case S_IFIFO:
449 case S_IFSOCK:
3558fd73
BB
450 init_special_inode(ip, ip->i_mode, rdev);
451 ip->i_op = &zpl_special_inode_operations;
452 break;
453
454 default:
53b1d979
BB
455 zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
456 (u_longlong_t)ip->i_ino, ip->i_mode);
457
458 /* Assume the inode is a file and attempt to continue */
459 ip->i_mode = S_IFREG | 0644;
460 ip->i_op = &zpl_inode_operations;
6b0a4be5
RN
461#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
462 ip->i_fop = &zpl_file_operations.kabi_fops;
463#else
53b1d979 464 ip->i_fop = &zpl_file_operations;
6b0a4be5 465#endif
53b1d979
BB
466 ip->i_mapping->a_ops = &zpl_address_space_operations;
467 break;
3558fd73
BB
468 }
469}
470
65c7cc49 471static void
7bb1325f
CC
472zfs_set_inode_flags(znode_t *zp, struct inode *ip)
473{
474 /*
475 * Linux and Solaris have different sets of file attributes, so we
476 * restrict this conversion to the intersection of the two.
477 */
a5248129
CC
478#ifdef HAVE_INODE_SET_FLAGS
479 unsigned int flags = 0;
480 if (zp->z_pflags & ZFS_IMMUTABLE)
481 flags |= S_IMMUTABLE;
482 if (zp->z_pflags & ZFS_APPENDONLY)
483 flags |= S_APPEND;
7bb1325f 484
a5248129
CC
485 inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
486#else
7bb1325f
CC
487 if (zp->z_pflags & ZFS_IMMUTABLE)
488 ip->i_flags |= S_IMMUTABLE;
489 else
490 ip->i_flags &= ~S_IMMUTABLE;
491
492 if (zp->z_pflags & ZFS_APPENDONLY)
493 ip->i_flags |= S_APPEND;
494 else
495 ip->i_flags &= ~S_APPEND;
a5248129 496#endif
7bb1325f
CC
497}
498
704cd075 499/*
fc273894 500 * Update the embedded inode given the znode.
704cd075 501 */
9f5f0019 502void
fc273894 503zfs_znode_update_vfs(znode_t *zp)
704cd075 504{
704cd075
CC
505 struct inode *ip;
506 uint32_t blksize;
507 u_longlong_t i_blocks;
704cd075
CC
508
509 ASSERT(zp != NULL);
704cd075
CC
510 ip = ZTOI(zp);
511
512 /* Skip .zfs control nodes which do not exist on disk. */
513 if (zfsctl_is_node(ip))
514 return;
515
704cd075
CC
516 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
517
518 spin_lock(&ip->i_lock);
e53d678d 519 ip->i_mode = zp->z_mode;
704cd075 520 ip->i_blocks = i_blocks;
704cd075
CC
521 i_size_write(ip, zp->z_size);
522 spin_unlock(&ip->i_lock);
523}
524
704cd075 525
3558fd73
BB
526/*
527 * Construct a znode+inode and initialize.
34dc7c2f
BB
528 *
529 * This does not call dmu_set_user(); that is left to the
530 * caller, in case you don't want to return the
531 * znode.
532 */
533static znode_t *
0037b49e 534zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
1a688994 535 dmu_object_type_t obj_type, sa_handle_t *hdl)
34dc7c2f
BB
536{
537 znode_t *zp;
3558fd73 538 struct inode *ip;
7f89ae6b 539 uint64_t mode;
428870ff 540 uint64_t parent;
278f2236 541 uint64_t tmp_gen;
dfbc8630 542 uint64_t links;
2c6abf15 543 uint64_t z_uid, z_gid;
abbf0bd4 544 uint64_t atime[2], mtime[2], ctime[2], btime[2];
db4fc559 545 inode_timespec_t tmp_ts;
9c5167d1 546 uint64_t projid = ZFS_DEFAULT_PROJID;
abbf0bd4 547 sa_bulk_attr_t bulk[12];
428870ff 548 int count = 0;
34dc7c2f 549
0037b49e 550 ASSERT(zfsvfs != NULL);
34dc7c2f 551
0037b49e 552 ip = new_inode(zfsvfs->z_sb);
3558fd73
BB
553 if (ip == NULL)
554 return (NULL);
7304b6e5 555
3558fd73 556 zp = ITOZ(ip);
34dc7c2f 557 ASSERT(zp->z_dirlocks == NULL);
ebe7e575
BB
558 ASSERT3P(zp->z_acl_cached, ==, NULL);
559 ASSERT3P(zp->z_xattr_cached, ==, NULL);
a43570c5
TK
560 zp->z_unlinked = B_FALSE;
561 zp->z_atime_dirty = B_FALSE;
3fc92adc 562#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE)
a43570c5 563 zp->z_is_mapped = B_FALSE;
3fc92adc 564#endif
a43570c5 565 zp->z_is_ctldir = B_FALSE;
e7a2fa70 566 zp->z_suspended = B_FALSE;
428870ff 567 zp->z_sa_hdl = NULL;
34dc7c2f 568 zp->z_mapcnt = 0;
34dc7c2f
BB
569 zp->z_id = db->db_object;
570 zp->z_blksz = blksz;
571 zp->z_seq = 0x7A4653;
572 zp->z_sync_cnt = 0;
411f4a01
SN
573 zp->z_sync_writes_cnt = 0;
574 zp->z_async_writes_cnt = 0;
34dc7c2f 575
0037b49e 576 zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
3558fd73 577
0037b49e
BB
578 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
579 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
580 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
581 &zp->z_size, 8);
582 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
583 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
428870ff 584 &zp->z_pflags, 8);
0037b49e 585 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
7304b6e5 586 &parent, 8);
0037b49e
BB
587 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
588 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
589 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
590 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
591 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
abbf0bd4 592 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);
428870ff 593
9c5167d1
NF
594 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
595 (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
596 (zp->z_pflags & ZFS_PROJID) &&
597 sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
428870ff
BB
598 if (hdl == NULL)
599 sa_handle_destroy(zp->z_sa_hdl);
07d63f0c 600 zp->z_sa_hdl = NULL;
3558fd73 601 goto error;
34dc7c2f 602 }
7304b6e5 603
9c5167d1 604 zp->z_projid = projid;
12fa7f34 605 zp->z_mode = ip->i_mode = mode;
278f2236 606 ip->i_generation = (uint32_t)tmp_gen;
ba2fe6af 607 ip->i_blkbits = SPA_MINBLOCKSHIFT;
dfbc8630 608 set_nlink(ip, (uint32_t)links);
2c6abf15
NB
609 zfs_uid_write(ip, z_uid);
610 zfs_gid_write(ip, z_gid);
7bb1325f 611 zfs_set_inode_flags(zp, ip);
7f89ae6b 612
98701490
CC
613 /* Cache the xattr parent id */
614 if (zp->z_pflags & ZFS_XATTR)
615 zp->z_xattr_parent = parent;
616
db4fc559
RN
617 ZFS_TIME_DECODE(&tmp_ts, atime);
618 zpl_inode_set_atime_to_ts(ip, tmp_ts);
619 ZFS_TIME_DECODE(&tmp_ts, mtime);
620 zpl_inode_set_mtime_to_ts(ip, tmp_ts);
621 ZFS_TIME_DECODE(&tmp_ts, ctime);
622 zpl_inode_set_ctime_to_ts(ip, tmp_ts);
abbf0bd4 623 ZFS_TIME_DECODE(&zp->z_btime, btime);
9f5f0019 624
1a688994 625 ip->i_ino = zp->z_id;
fc273894 626 zfs_znode_update_vfs(zp);
0037b49e 627 zfs_inode_set_ops(zfsvfs, ip);
3558fd73 628
7b3e34ba
BB
629 /*
630 * The only way insert_inode_locked() can fail is if the ip->i_ino
631 * number is already hashed for this super block. This can never
632 * happen because the inode numbers map 1:1 with the object numbers.
633 *
afa7b348
PZ
634 * Exceptions include rolling back a mounted file system, either
635 * from the zfs rollback or zfs recv command.
636 *
637 * Active inodes are unhashed during the rollback, but since zrele
638 * can happen asynchronously, we can't guarantee they've been
639 * unhashed. This can cause hash collisions in unlinked drain
640 * processing so do not hash unlinked znodes.
7b3e34ba 641 */
afa7b348
PZ
642 if (links > 0)
643 VERIFY3S(insert_inode_locked(ip), ==, 0);
c85b224f 644
0037b49e
BB
645 mutex_enter(&zfsvfs->z_znodes_lock);
646 list_insert_tail(&zfsvfs->z_all_znodes, zp);
0037b49e 647 mutex_exit(&zfsvfs->z_znodes_lock);
b128c09f 648
afa7b348
PZ
649 if (links > 0)
650 unlock_new_inode(ip);
34dc7c2f 651 return (zp);
3558fd73
BB
652
653error:
3558fd73 654 iput(ip);
d1d7e268 655 return (NULL);
34dc7c2f
BB
656}
657
1e8db771
BB
658/*
659 * Safely mark an inode dirty. Inodes which are part of a read-only
660 * file system or snapshot may not be dirtied.
661 */
662void
663zfs_mark_inode_dirty(struct inode *ip)
664{
0037b49e 665 zfsvfs_t *zfsvfs = ITOZSB(ip);
1e8db771 666
0037b49e 667 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
1e8db771
BB
668 return;
669
670 mark_inode_dirty(ip);
671}
672
428870ff
BB
/*
 * Zero-initialized placeholders that zfs_mknode() supplies for the
 * SA_ZPL_XATTR, SA_ZPL_PAD and SA_ZPL_ZNODE_ACL attributes when it lays
 * out a DMU_OT_ZNODE object.
 */
673static uint64_t empty_xattr;
674static uint64_t pad[4];
675static zfs_acl_phys_t acl_phys;
34dc7c2f
BB
676/*
677 * Create a new DMU object to hold a zfs znode.
678 *
679 * IN: dzp - parent directory for new znode
680 * vap - file attributes for new znode
681 * tx - dmu transaction id for zap operations
682 * cr - credentials of caller
683 * flag - flags:
684 * IS_ROOT_NODE - new object will be root
841a7a98 685 * IS_TMPFILE - new object is of O_TMPFILE
34dc7c2f 686 * IS_XATTR - new object is an attribute
841a7a98 687 * acl_ids - ACL related attributes
34dc7c2f 688 *
841a7a98 689 * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE)
34dc7c2f
BB
690 *
691 */
692void
693zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
428870ff 694 uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
34dc7c2f 695{
428870ff
BB
696 uint64_t crtime[2], atime[2], mtime[2], ctime[2];
697 uint64_t mode, size, links, parent, pflags;
9c5167d1 698 uint64_t projid = ZFS_DEFAULT_PROJID;
428870ff 699 uint64_t rdev = 0;
0037b49e 700 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
428870ff 701 dmu_buf_t *db;
6413c95f 702 inode_timespec_t now;
34dc7c2f 703 uint64_t gen, obj;
428870ff 704 int bonuslen;
50c957f7 705 int dnodesize;
428870ff
BB
706 sa_handle_t *sa_hdl;
707 dmu_object_type_t obj_type;
f30484af 708 sa_bulk_attr_t *sa_attrs;
428870ff
BB
709 int cnt = 0;
710 zfs_acl_locator_cb_t locate = { 0 };
c96c36fa 711 znode_hold_t *zh;
34dc7c2f 712
/*
 * During replay the object number, timestamp, generation and dnode
 * size are taken from vap instead of being generated here.
 */
0037b49e 713 if (zfsvfs->z_replay) {
34dc7c2f 714 obj = vap->va_nodeid;
34dc7c2f
BB
715 now = vap->va_ctime; /* see zfs_replay_create() */
716 gen = vap->va_nblocks; /* ditto */
50c957f7 717 dnodesize = vap->va_fsid; /* ditto */
34dc7c2f
BB
718 } else {
719 obj = 0;
720 gethrestime(&now);
721 gen = dmu_tx_get_txg(tx);
0037b49e 722 dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
34dc7c2f
BB
723 }
724
50c957f7
NB
725 if (dnodesize == 0)
726 dnodesize = DNODE_MIN_SIZE;
727
0037b49e 728 obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
50c957f7 729
428870ff 730 bonuslen = (obj_type == DMU_OT_SA) ?
50c957f7 731 DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
428870ff 732
34dc7c2f
BB
733 /*
734 * Create a new DMU object.
735 */
736 /*
737 * There's currently no mechanism for pre-reading the blocks that will
572e2857 738 * be needed to allocate a new object, so we accept the small chance
34dc7c2f
BB
739 * that there will be an i/o error and we will fail one of the
740 * assertions below.
741 */
3558fd73 742 if (S_ISDIR(vap->va_mode)) {
0037b49e
BB
743 if (zfsvfs->z_replay) {
744 VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
745 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
50c957f7 746 obj_type, bonuslen, dnodesize, tx));
34dc7c2f 747 } else {
0037b49e
BB
748 obj = zap_create_norm_dnsize(zfsvfs->z_os,
749 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
50c957f7 750 obj_type, bonuslen, dnodesize, tx);
34dc7c2f
BB
751 }
752 } else {
0037b49e
BB
753 if (zfsvfs->z_replay) {
754 VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
34dc7c2f 755 DMU_OT_PLAIN_FILE_CONTENTS, 0,
50c957f7 756 obj_type, bonuslen, dnodesize, tx));
34dc7c2f 757 } else {
0037b49e 758 obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
34dc7c2f 759 DMU_OT_PLAIN_FILE_CONTENTS, 0,
50c957f7 760 obj_type, bonuslen, dnodesize, tx);
34dc7c2f
BB
761 }
762 }
34dc7c2f 763
/*
 * Take the per-object hold for the new object; it is released by the
 * zfs_znode_hold_exit() call at the end of this function.
 */
0037b49e 764 zh = zfs_znode_hold_enter(zfsvfs, obj);
9631681b 765 VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
34dc7c2f
BB
766
767 /*
768 * If this is the root, fix up the half-initialized parent pointer
769 * to reference the just-allocated physical data area.
770 */
771 if (flag & IS_ROOT_NODE) {
34dc7c2f
BB
772 dzp->z_id = obj;
773 }
774
775 /*
776 * If parent is an xattr, so am I.
777 */
9c5167d1 778 if (dzp->z_pflags & ZFS_XATTR) {
34dc7c2f 779 flag |= IS_XATTR;
34dc7c2f
BB
780 }
781
0037b49e 782 if (zfsvfs->z_use_fuids)
428870ff
BB
783 pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
784 else
785 pflags = 0;
34dc7c2f 786
3558fd73 787 if (S_ISDIR(vap->va_mode)) {
428870ff 788 size = 2; /* contents ("." and "..") */
dfbc8630 789 links = 2;
428870ff 790 } else {
dfbc8630 791 size = 0;
ace1eae8 792 links = (flag & IS_TMPFILE) ? 0 : 1;
34dc7c2f
BB
793 }
794
aa6d8c10 795 if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
dc1d7665 796 rdev = vap->va_rdev;
428870ff
BB
797
798 parent = dzp->z_id;
799 mode = acl_ids->z_mode;
34dc7c2f 800 if (flag & IS_XATTR)
428870ff 801 pflags |= ZFS_XATTR;
34dc7c2f 802
9c5167d1
NF
803 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
804 /*
805 * With ZFS_PROJID flag, we can easily know whether there is
806 * project ID stored on disk or not. See zfs_space_delta_cb().
807 */
808 if (obj_type != DMU_OT_ZNODE &&
809 dmu_objset_projectquota_enabled(zfsvfs->z_os))
810 pflags |= ZFS_PROJID;
811
812 /*
813 * Inherit project ID from parent if required.
814 */
815 projid = zfs_inherit_projid(dzp);
816 if (dzp->z_pflags & ZFS_PROJINHERIT)
817 pflags |= ZFS_PROJINHERIT;
818 }
819
428870ff 820 /*
e1cfd73f 821 * No execs denied will be determined when zfs_mode_compute() is called.
428870ff
BB
822 */
823 pflags |= acl_ids->z_aclp->z_hints &
824 (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
825 ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
34dc7c2f 826
428870ff
BB
827 ZFS_TIME_ENCODE(&now, crtime);
828 ZFS_TIME_ENCODE(&now, ctime);
34dc7c2f 829
3558fd73 830 if (vap->va_mask & ATTR_ATIME) {
428870ff 831 ZFS_TIME_ENCODE(&vap->va_atime, atime);
34dc7c2f 832 } else {
428870ff 833 ZFS_TIME_ENCODE(&now, atime);
34dc7c2f
BB
834 }
835
3558fd73 836 if (vap->va_mask & ATTR_MTIME) {
428870ff
BB
837 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
838 } else {
839 ZFS_TIME_ENCODE(&now, mtime);
840 }
841
842 /* Now add in all of the "SA" attributes */
0037b49e 843 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
428870ff
BB
844 &sa_hdl));
845
846 /*
847 * Setup the array of attributes to be replaced/set on the new file
848 *
849 * order for DMU_OT_ZNODE is critical since it needs to be constructed
850 * in the old znode_phys_t format. Don't change this ordering
851 */
/* Scratch array with room for ZPL_END entries; freed before returning. */
79c76d5b 852 sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
428870ff
BB
853
854 if (obj_type == DMU_OT_ZNODE) {
0037b49e 855 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
428870ff 856 NULL, &atime, 16);
0037b49e 857 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
428870ff 858 NULL, &mtime, 16);
0037b49e 859 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
428870ff 860 NULL, &ctime, 16);
0037b49e 861 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
428870ff 862 NULL, &crtime, 16);
0037b49e 863 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
428870ff 864 NULL, &gen, 8);
0037b49e 865 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
428870ff 866 NULL, &mode, 8);
0037b49e 867 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
428870ff 868 NULL, &size, 8);
0037b49e 869 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
428870ff 870 NULL, &parent, 8);
34dc7c2f 871 } else {
0037b49e 872 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
428870ff 873 NULL, &mode, 8);
0037b49e 874 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
428870ff 875 NULL, &size, 8);
0037b49e 876 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
428870ff 877 NULL, &gen, 8);
0037b49e 878 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
3558fd73 879 NULL, &acl_ids->z_fuid, 8);
0037b49e 880 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
3558fd73 881 NULL, &acl_ids->z_fgid, 8);
0037b49e 882 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
428870ff 883 NULL, &parent, 8);
0037b49e 884 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
428870ff 885 NULL, &pflags, 8);
0037b49e 886 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
428870ff 887 NULL, &atime, 16);
0037b49e 888 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
428870ff 889 NULL, &mtime, 16);
0037b49e 890 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
428870ff 891 NULL, &ctime, 16);
0037b49e 892 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
428870ff
BB
893 NULL, &crtime, 16);
894 }
895
0037b49e 896 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
428870ff
BB
897
898 if (obj_type == DMU_OT_ZNODE) {
0037b49e 899 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
428870ff 900 &empty_xattr, 8);
9c5167d1
NF
901 } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
902 pflags & ZFS_PROJID) {
903 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
904 NULL, &projid, 8);
34dc7c2f 905 }
428870ff 906 if (obj_type == DMU_OT_ZNODE ||
aa6d8c10 907 (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
0037b49e 908 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
428870ff 909 NULL, &rdev, 8);
428870ff
BB
910 }
911 if (obj_type == DMU_OT_ZNODE) {
0037b49e 912 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
428870ff 913 NULL, &pflags, 8);
0037b49e 914 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
428870ff 915 &acl_ids->z_fuid, 8);
0037b49e 916 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
428870ff 917 &acl_ids->z_fgid, 8);
0037b49e 918 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
428870ff 919 sizeof (uint64_t) * 4);
0037b49e 920 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
428870ff
BB
921 &acl_phys, sizeof (zfs_acl_phys_t));
922 } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
0037b49e 923 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
428870ff
BB
924 &acl_ids->z_aclp->z_acl_count, 8);
925 locate.cb_aclp = acl_ids->z_aclp;
0037b49e 926 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
428870ff
BB
927 zfs_acl_data_locator, &locate,
928 acl_ids->z_aclp->z_acl_bytes);
/*
 * NOTE(review): &mode and &pflags were already handed to
 * SA_ADD_BULK_ATTR above, so the values computed here are presumably
 * what sa_replace_all_by_template() stores - confirm.
 */
929 mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
930 acl_ids->z_fuid, acl_ids->z_fgid);
931 }
932
933 VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
34dc7c2f 934
34dc7c2f 935 if (!(flag & IS_ROOT_NODE)) {
8d703987
BB
936 /*
937 * The call to zfs_znode_alloc() may fail if memory is low
938 * via the call path: alloc_inode() -> inode_init_always() ->
939 * security_inode_alloc() -> inode_alloc_security(). Since
940 * the existing code is written such that zfs_mknode() can
941 * not fail retry until sufficient memory has been reclaimed.
942 */
943 do {
1a688994 944 *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
8d703987
BB
945 } while (*zpp == NULL);
946
7b3e34ba
BB
947 VERIFY(*zpp != NULL);
948 VERIFY(dzp != NULL);
34dc7c2f
BB
949 } else {
950 /*
951 * If we are creating the root node, the "parent" we
952 * passed in is the znode for the root.
953 */
954 *zpp = dzp;
428870ff
BB
955
/* zfs_znode_alloc() is not called on this path, so attach the SA handle here. */
956 (*zpp)->z_sa_hdl = sa_hdl;
34dc7c2f 957 }
428870ff
BB
958
/* Copy the final flags, mode, dnode size and project ID into the in-core znode. */
959 (*zpp)->z_pflags = pflags;
12fa7f34 960 (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
50c957f7 961 (*zpp)->z_dnodesize = dnodesize;
9c5167d1 962 (*zpp)->z_projid = projid;
428870ff 963
428870ff
BB
964 if (obj_type == DMU_OT_ZNODE ||
965 acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
b0bc7a84 966 VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
428870ff 967 }
d1d7e268 968 kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
0037b49e 969 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
970}
971
5484965a 972/*
d3cc8b15
WA
973 * Update in-core attributes. It is assumed the caller will be doing an
974 * sa_bulk_update to push the changes out.
5484965a
BB
975 */
976void
977zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
978{
979 xoptattr_t *xoap;
7bb1325f 980 boolean_t update_inode = B_FALSE;
5484965a
BB
981
982 xoap = xva_getxoptattr(xvap);
983 ASSERT(xoap);
984
985 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
986 uint64_t times[2];
987 ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
988 (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
989 &times, sizeof (times), tx);
990 XVA_SET_RTN(xvap, XAT_CREATETIME);
991 }
992 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
993 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
994 zp->z_pflags, tx);
995 XVA_SET_RTN(xvap, XAT_READONLY);
996 }
997 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
998 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
999 zp->z_pflags, tx);
1000 XVA_SET_RTN(xvap, XAT_HIDDEN);
1001 }
1002 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1003 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
1004 zp->z_pflags, tx);
1005 XVA_SET_RTN(xvap, XAT_SYSTEM);
1006 }
1007 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1008 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
1009 zp->z_pflags, tx);
1010 XVA_SET_RTN(xvap, XAT_ARCHIVE);
1011 }
1012 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1013 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
1014 zp->z_pflags, tx);
1015 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
64c688d7 1016
7bb1325f 1017 update_inode = B_TRUE;
5484965a
BB
1018 }
1019 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1020 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
1021 zp->z_pflags, tx);
1022 XVA_SET_RTN(xvap, XAT_NOUNLINK);
1023 }
1024 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1025 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
1026 zp->z_pflags, tx);
1027 XVA_SET_RTN(xvap, XAT_APPENDONLY);
64c688d7 1028
7bb1325f 1029 update_inode = B_TRUE;
5484965a
BB
1030 }
1031 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1032 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
1033 zp->z_pflags, tx);
1034 XVA_SET_RTN(xvap, XAT_NODUMP);
1035 }
1036 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1037 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
1038 zp->z_pflags, tx);
1039 XVA_SET_RTN(xvap, XAT_OPAQUE);
1040 }
1041 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1042 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
1043 xoap->xoa_av_quarantined, zp->z_pflags, tx);
1044 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1045 }
1046 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1047 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
1048 zp->z_pflags, tx);
1049 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1050 }
1051 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
1052 zfs_sa_set_scanstamp(zp, xvap, tx);
1053 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
1054 }
1055 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1056 ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
1057 zp->z_pflags, tx);
1058 XVA_SET_RTN(xvap, XAT_REPARSE);
1059 }
1060 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1061 ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
1062 zp->z_pflags, tx);
1063 XVA_SET_RTN(xvap, XAT_OFFLINE);
1064 }
1065 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1066 ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
1067 zp->z_pflags, tx);
1068 XVA_SET_RTN(xvap, XAT_SPARSE);
1069 }
9c5167d1
NF
1070 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
1071 ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
1072 zp->z_pflags, tx);
1073 XVA_SET_RTN(xvap, XAT_PROJINHERIT);
1074 }
7bb1325f
CC
1075
1076 if (update_inode)
1077 zfs_set_inode_flags(zp, ZTOI(zp));
5484965a
BB
1078}
1079
34dc7c2f 1080int
0037b49e 1081zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
34dc7c2f
BB
1082{
1083 dmu_object_info_t doi;
1084 dmu_buf_t *db;
1085 znode_t *zp;
c96c36fa 1086 znode_hold_t *zh;
34dc7c2f 1087 int err;
428870ff 1088 sa_handle_t *hdl;
34dc7c2f
BB
1089
1090 *zpp = NULL;
1091
6f9548c4 1092again:
0037b49e 1093 zh = zfs_znode_hold_enter(zfsvfs, obj_num);
34dc7c2f 1094
0037b49e 1095 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
34dc7c2f 1096 if (err) {
0037b49e 1097 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1098 return (err);
1099 }
1100
1101 dmu_object_info_from_db(db, &doi);
428870ff
BB
1102 if (doi.doi_bonus_type != DMU_OT_SA &&
1103 (doi.doi_bonus_type != DMU_OT_ZNODE ||
1104 (doi.doi_bonus_type == DMU_OT_ZNODE &&
1105 doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1106 sa_buf_rele(db, NULL);
0037b49e 1107 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1108 return (SET_ERROR(EINVAL));
34dc7c2f
BB
1109 }
1110
428870ff
BB
1111 hdl = dmu_buf_get_user(db);
1112 if (hdl != NULL) {
36df2843 1113 zp = sa_get_userdata(hdl);
34dc7c2f 1114
8ac67298 1115
34dc7c2f 1116 /*
428870ff
BB
1117 * Since "SA" does immediate eviction we
1118 * should never find a sa handle that doesn't
1119 * know about the znode.
34dc7c2f 1120 */
428870ff
BB
1121
1122 ASSERT3P(zp, !=, NULL);
1123
1124 mutex_enter(&zp->z_lock);
34dc7c2f 1125 ASSERT3U(zp->z_id, ==, obj_num);
98701490 1126 /*
41e1aa2a 1127 * If zp->z_unlinked is set, the znode is already marked
0c468138
MFO
1128 * for deletion and should not be discovered. Check this
1129 * after checking igrab() due to fsetxattr() & O_TMPFILE.
41e1aa2a 1130 *
98701490
CC
1131 * If igrab() returns NULL the VFS has independently
1132 * determined the inode should be evicted and has
1133 * called iput_final() to start the eviction process.
1134 * The SA handle is still valid but because the VFS
1135 * requires that the eviction succeed we must drop
1136 * our locks and references to allow the eviction to
1137 * complete. The zfs_zget() may then be retried.
1138 *
1139 * This unlikely case could be optimized by registering
1140 * a sops->drop_inode() callback. The callback would
1141 * need to detect the active SA hold thereby informing
1142 * the VFS that this inode should not be evicted.
1143 */
0c468138
MFO
1144 if (igrab(ZTOI(zp)) == NULL) {
1145 if (zp->z_unlinked)
1146 err = SET_ERROR(ENOENT);
1147 else
1148 err = SET_ERROR(EAGAIN);
41e1aa2a
HAS
1149 } else {
1150 *zpp = zp;
1151 err = 0;
34dc7c2f 1152 }
41e1aa2a 1153
34dc7c2f 1154 mutex_exit(&zp->z_lock);
f3ad9cd6 1155 sa_buf_rele(db, NULL);
0037b49e 1156 zfs_znode_hold_exit(zfsvfs, zh);
41e1aa2a
HAS
1157
1158 if (err == EAGAIN) {
1159 /* inode might need this to finish evict */
1160 cond_resched();
1161 goto again;
1162 }
34dc7c2f
BB
1163 return (err);
1164 }
1165
1166 /*
3558fd73 1167 * Not found create new znode/vnode but only if file exists.
428870ff
BB
1168 *
1169 * There is a small window where zfs_vget() could
1170 * find this object while a file create is still in
1171 * progress. This is checked for in zfs_znode_alloc()
1172 *
1173 * if zfs_znode_alloc() fails it will drop the hold on the
1174 * bonus buffer.
34dc7c2f 1175 */
0037b49e 1176 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
1a688994 1177 doi.doi_bonus_type, NULL);
428870ff 1178 if (zp == NULL) {
2e528b49 1179 err = SET_ERROR(ENOENT);
428870ff
BB
1180 } else {
1181 *zpp = zp;
1182 }
0037b49e 1183 zfs_znode_hold_exit(zfsvfs, zh);
428870ff 1184 return (err);
34dc7c2f
BB
1185}
1186
1187int
1188zfs_rezget(znode_t *zp)
1189{
0037b49e 1190 zfsvfs_t *zfsvfs = ZTOZSB(zp);
34dc7c2f
BB
1191 dmu_object_info_t doi;
1192 dmu_buf_t *db;
1193 uint64_t obj_num = zp->z_id;
428870ff 1194 uint64_t mode;
dfbc8630 1195 uint64_t links;
abbf0bd4 1196 sa_bulk_attr_t bulk[11];
34dc7c2f 1197 int err;
428870ff
BB
1198 int count = 0;
1199 uint64_t gen;
2c6abf15 1200 uint64_t z_uid, z_gid;
abbf0bd4 1201 uint64_t atime[2], mtime[2], ctime[2], btime[2];
db4fc559 1202 inode_timespec_t tmp_ts;
9c5167d1 1203 uint64_t projid = ZFS_DEFAULT_PROJID;
c96c36fa 1204 znode_hold_t *zh;
34dc7c2f 1205
cbecb4fb
CC
1206 /*
1207 * skip ctldir, otherwise they will always get invalidated. This will
1208 * cause funny behaviour for the mounted snapdirs. Especially for
1209 * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent
1210 * anyone automount it again as long as someone is still using the
1211 * detached mount.
1212 */
1213 if (zp->z_is_ctldir)
1214 return (0);
1215
0037b49e 1216 zh = zfs_znode_hold_enter(zfsvfs, obj_num);
34dc7c2f 1217
428870ff
BB
1218 mutex_enter(&zp->z_acl_lock);
1219 if (zp->z_acl_cached) {
1220 zfs_acl_free(zp->z_acl_cached);
1221 zp->z_acl_cached = NULL;
1222 }
428870ff 1223 mutex_exit(&zp->z_acl_lock);
7b3e34ba 1224
228b461b 1225 rw_enter(&zp->z_xattr_lock, RW_WRITER);
7b3e34ba
BB
1226 if (zp->z_xattr_cached) {
1227 nvlist_free(zp->z_xattr_cached);
1228 zp->z_xattr_cached = NULL;
1229 }
7b3e34ba
BB
1230 rw_exit(&zp->z_xattr_lock);
1231
428870ff 1232 ASSERT(zp->z_sa_hdl == NULL);
0037b49e 1233 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
34dc7c2f 1234 if (err) {
0037b49e 1235 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1236 return (err);
1237 }
1238
1239 dmu_object_info_from_db(db, &doi);
428870ff
BB
1240 if (doi.doi_bonus_type != DMU_OT_SA &&
1241 (doi.doi_bonus_type != DMU_OT_ZNODE ||
1242 (doi.doi_bonus_type == DMU_OT_ZNODE &&
1243 doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1244 sa_buf_rele(db, NULL);
0037b49e 1245 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1246 return (SET_ERROR(EINVAL));
34dc7c2f
BB
1247 }
1248
0037b49e 1249 zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
428870ff
BB
1250
1251 /* reload cached values */
0037b49e 1252 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
428870ff 1253 &gen, sizeof (gen));
0037b49e 1254 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
428870ff 1255 &zp->z_size, sizeof (zp->z_size));
0037b49e 1256 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
dfbc8630 1257 &links, sizeof (links));
0037b49e 1258 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
428870ff 1259 &zp->z_pflags, sizeof (zp->z_pflags));
0037b49e 1260 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2c6abf15 1261 &z_uid, sizeof (z_uid));
0037b49e 1262 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
2c6abf15 1263 &z_gid, sizeof (z_gid));
0037b49e 1264 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
428870ff 1265 &mode, sizeof (mode));
0037b49e 1266 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
9f5f0019 1267 &atime, 16);
0037b49e 1268 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
9f5f0019 1269 &mtime, 16);
0037b49e 1270 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
9f5f0019 1271 &ctime, 16);
abbf0bd4 1272 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);
428870ff 1273
428870ff
BB
1274 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
1275 zfs_znode_dmu_fini(zp);
0037b49e 1276 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1277 return (SET_ERROR(EIO));
428870ff
BB
1278 }
1279
9c5167d1
NF
1280 if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
1281 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
1282 &projid, 8);
1283 if (err != 0 && err != ENOENT) {
1284 zfs_znode_dmu_fini(zp);
1285 zfs_znode_hold_exit(zfsvfs, zh);
1286 return (SET_ERROR(err));
1287 }
1288 }
1289
1290 zp->z_projid = projid;
12fa7f34 1291 zp->z_mode = ZTOI(zp)->i_mode = mode;
2c6abf15
NB
1292 zfs_uid_write(ZTOI(zp), z_uid);
1293 zfs_gid_write(ZTOI(zp), z_gid);
572e2857 1294
db4fc559
RN
1295 ZFS_TIME_DECODE(&tmp_ts, atime);
1296 zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts);
1297 ZFS_TIME_DECODE(&tmp_ts, mtime);
1298 zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
1299 ZFS_TIME_DECODE(&tmp_ts, ctime);
1300 zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
abbf0bd4 1301 ZFS_TIME_DECODE(&zp->z_btime, btime);
9f5f0019 1302
3ce85b5e 1303 if ((uint32_t)gen != ZTOI(zp)->i_generation) {
428870ff 1304 zfs_znode_dmu_fini(zp);
0037b49e 1305 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1306 return (SET_ERROR(EIO));
34dc7c2f
BB
1307 }
1308
dfbc8630 1309 set_nlink(ZTOI(zp), (uint32_t)links);
7bb1325f 1310 zfs_set_inode_flags(zp, ZTOI(zp));
dfbc8630 1311
34dc7c2f 1312 zp->z_blksz = doi.doi_data_block_size;
a43570c5 1313 zp->z_atime_dirty = B_FALSE;
fc273894 1314 zfs_znode_update_vfs(zp);
34dc7c2f 1315
6a218566
AG
1316 /*
1317 * If the file has zero links, then it has been unlinked on the send
1318 * side and it must be in the received unlinked set.
1319 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
e1cfd73f 1320 * stale data and to prevent automatic removal of the file in
6a218566
AG
1321 * zfs_zinactive(). The file will be removed either when it is removed
1322 * on the send side and the next incremental stream is received or
1323 * when the unlinked set gets processed.
1324 */
1325 zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
1326 if (zp->z_unlinked)
1327 zfs_znode_dmu_fini(zp);
1328
0037b49e 1329 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1330
1331 return (0);
1332}
1333
1334void
1335zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1336{
0037b49e
BB
1337 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1338 objset_t *os = zfsvfs->z_os;
34dc7c2f 1339 uint64_t obj = zp->z_id;
572e2857 1340 uint64_t acl_obj = zfs_external_acl(zp);
c96c36fa 1341 znode_hold_t *zh;
34dc7c2f 1342
0037b49e 1343 zh = zfs_znode_hold_enter(zfsvfs, obj);
572e2857
BB
1344 if (acl_obj) {
1345 VERIFY(!zp->z_is_sa);
b128c09f 1346 VERIFY(0 == dmu_object_free(os, acl_obj, tx));
572e2857 1347 }
b128c09f 1348 VERIFY(0 == dmu_object_free(os, obj, tx));
34dc7c2f 1349 zfs_znode_dmu_fini(zp);
0037b49e 1350 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1351}
1352
1353void
1354zfs_zinactive(znode_t *zp)
1355{
0037b49e 1356 zfsvfs_t *zfsvfs = ZTOZSB(zp);
34dc7c2f 1357 uint64_t z_id = zp->z_id;
c96c36fa 1358 znode_hold_t *zh;
34dc7c2f 1359
428870ff 1360 ASSERT(zp->z_sa_hdl);
34dc7c2f
BB
1361
1362 /*
d6bd8eaa 1363 * Don't allow a zfs_zget() while were trying to release this znode.
34dc7c2f 1364 */
0037b49e 1365 zh = zfs_znode_hold_enter(zfsvfs, z_id);
d6bd8eaa 1366
34dc7c2f 1367 mutex_enter(&zp->z_lock);
34dc7c2f
BB
1368
1369 /*
6a218566
AG
1370 * If this was the last reference to a file with no links, remove
1371 * the file from the file system unless the file system is mounted
1372 * read-only. That can happen, for example, if the file system was
1373 * originally read-write, the file was opened, then unlinked and
1374 * the file system was made read-only before the file was finally
1375 * closed. The file will remain in the unlinked set.
34dc7c2f
BB
1376 */
1377 if (zp->z_unlinked) {
6a218566 1378 ASSERT(!zfsvfs->z_issnap);
dcec0a12 1379 if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
6a218566
AG
1380 mutex_exit(&zp->z_lock);
1381 zfs_znode_hold_exit(zfsvfs, zh);
1382 zfs_rmnode(zp);
1383 return;
1384 }
34dc7c2f 1385 }
428870ff 1386
34dc7c2f
BB
1387 mutex_exit(&zp->z_lock);
1388 zfs_znode_dmu_fini(zp);
d6bd8eaa 1389
0037b49e 1390 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1391}
1392
9c53e516
TK
1393#if defined(HAVE_INODE_TIMESPEC64_TIMES)
1394#define zfs_compare_timespec timespec64_compare
1395#else
1396#define zfs_compare_timespec timespec_compare
1397#endif
1398
1399/*
1400 * Determine whether the znode's atime must be updated. The logic mostly
1401 * duplicates the Linux kernel's relatime_need_update() functionality.
1402 * This function is only called if the underlying filesystem actually has
1403 * atime updates enabled.
1404 */
1405boolean_t
1406zfs_relatime_need_update(const struct inode *ip)
6d111134 1407{
db4fc559 1408 inode_timespec_t now, tmp_atime, tmp_ts;
9c53e516
TK
1409
1410 gethrestime(&now);
db4fc559 1411 tmp_atime = zpl_inode_get_atime(ip);
9c53e516
TK
1412 /*
1413 * In relatime mode, only update the atime if the previous atime
1414 * is earlier than either the ctime or mtime or if at least a day
1415 * has passed since the last update of atime.
1416 */
db4fc559
RN
1417 tmp_ts = zpl_inode_get_mtime(ip);
1418 if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0)
9c53e516
TK
1419 return (B_TRUE);
1420
db4fc559
RN
1421 tmp_ts = zpl_inode_get_ctime(ip);
1422 if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0)
9c53e516 1423 return (B_TRUE);
6d111134 1424
db4fc559 1425 if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60)
9c53e516 1426 return (B_TRUE);
6d111134 1427
9c53e516 1428 return (B_FALSE);
6d111134
TC
1429}
1430
6d111134
TC
1431/*
1432 * Prepare to update znode time stamps.
1433 *
1434 * IN: zp - znode requiring timestamp update
0df9673f 1435 * flag - ATTR_MTIME, ATTR_CTIME flags
6d111134 1436 *
0df9673f 1437 * OUT: zp - z_seq
6d111134
TC
1438 * mtime - new mtime
1439 * ctime - new ctime
1440 *
0df9673f
CC
1441 * Note: We don't update atime here, because we rely on Linux VFS to do
1442 * atime updating.
6d111134 1443 */
34dc7c2f 1444void
428870ff 1445zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
0df9673f 1446 uint64_t ctime[2])
34dc7c2f 1447{
db4fc559 1448 inode_timespec_t now, tmp_ts;
34dc7c2f 1449
34dc7c2f
BB
1450 gethrestime(&now);
1451
0df9673f 1452 zp->z_seq++;
34dc7c2f 1453
3558fd73 1454 if (flag & ATTR_MTIME) {
428870ff 1455 ZFS_TIME_ENCODE(&now, mtime);
db4fc559
RN
1456 ZFS_TIME_DECODE(&tmp_ts, mtime);
1457 zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts);
3558fd73 1458 if (ZTOZSB(zp)->z_use_fuids) {
428870ff
BB
1459 zp->z_pflags |= (ZFS_ARCHIVE |
1460 ZFS_AV_MODIFIED);
1461 }
34dc7c2f
BB
1462 }
1463
3558fd73 1464 if (flag & ATTR_CTIME) {
428870ff 1465 ZFS_TIME_ENCODE(&now, ctime);
db4fc559
RN
1466 ZFS_TIME_DECODE(&tmp_ts, ctime);
1467 zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts);
3558fd73 1468 if (ZTOZSB(zp)->z_use_fuids)
428870ff 1469 zp->z_pflags |= ZFS_ARCHIVE;
34dc7c2f
BB
1470 }
1471}
1472
34dc7c2f
BB
1473/*
1474 * Grow the block size for a file.
1475 *
1476 * IN: zp - znode of file to free data in.
1477 * size - requested block size
1478 * tx - open transaction.
1479 *
1480 * NOTE: this function assumes that the znode is write locked.
1481 */
1482void
1483zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1484{
1485 int error;
1486 u_longlong_t dummy;
1487
1488 if (size <= zp->z_blksz)
1489 return;
1490 /*
1491 * If the file size is already greater than the current blocksize,
1492 * we will not grow. If there is more than one block in a file,
1493 * the blocksize cannot change.
1494 */
428870ff 1495 if (zp->z_blksz && zp->z_size > zp->z_blksz)
34dc7c2f
BB
1496 return;
1497
3558fd73 1498 error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
34dc7c2f 1499 size, 0, tx);
428870ff 1500
34dc7c2f
BB
1501 if (error == ENOTSUP)
1502 return;
c99c9001 1503 ASSERT0(error);
34dc7c2f
BB
1504
1505 /* What blocksize did we actually get? */
428870ff 1506 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
34dc7c2f
BB
1507}
1508
34dc7c2f 1509/*
b128c09f 1510 * Increase the file length
34dc7c2f
BB
1511 *
1512 * IN: zp - znode of file to free data in.
b128c09f 1513 * end - new end-of-file
34dc7c2f 1514 *
19d55079 1515 * RETURN: 0 on success, error code on failure
34dc7c2f 1516 */
b128c09f
BB
1517static int
1518zfs_extend(znode_t *zp, uint64_t end)
34dc7c2f 1519{
0037b49e 1520 zfsvfs_t *zfsvfs = ZTOZSB(zp);
b128c09f 1521 dmu_tx_t *tx;
bd4dde8e 1522 zfs_locked_range_t *lr;
b128c09f 1523 uint64_t newblksz;
34dc7c2f
BB
1524 int error;
1525
34dc7c2f 1526 /*
b128c09f 1527 * We will change zp_size, lock the whole file.
34dc7c2f 1528 */
2cc479d0 1529 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
34dc7c2f
BB
1530
1531 /*
1532 * Nothing to do if file already at desired length.
1533 */
428870ff 1534 if (end <= zp->z_size) {
2cc479d0 1535 zfs_rangelock_exit(lr);
34dc7c2f
BB
1536 return (0);
1537 }
0037b49e 1538 tx = dmu_tx_create(zfsvfs->z_os);
428870ff
BB
1539 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1540 zfs_sa_upgrade_txholds(tx, zp);
b128c09f 1541 if (end > zp->z_blksz &&
0037b49e 1542 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
34dc7c2f
BB
1543 /*
1544 * We are growing the file past the current block size.
1545 */
3558fd73 1546 if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
f1512ee6
MA
1547 /*
1548 * File's blocksize is already larger than the
1549 * "recordsize" property. Only let it grow to
1550 * the next power of 2.
1551 */
34dc7c2f 1552 ASSERT(!ISP2(zp->z_blksz));
f1512ee6 1553 newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
34dc7c2f 1554 } else {
3558fd73 1555 newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
34dc7c2f 1556 }
b128c09f
BB
1557 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1558 } else {
1559 newblksz = 0;
34dc7c2f
BB
1560 }
1561
384f8a09 1562 error = dmu_tx_assign(tx, TXG_WAIT);
34dc7c2f 1563 if (error) {
34dc7c2f 1564 dmu_tx_abort(tx);
2cc479d0 1565 zfs_rangelock_exit(lr);
34dc7c2f
BB
1566 return (error);
1567 }
1568
b128c09f
BB
1569 if (newblksz)
1570 zfs_grow_blocksize(zp, newblksz, tx);
34dc7c2f 1571
428870ff
BB
1572 zp->z_size = end;
1573
3558fd73 1574 VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
428870ff 1575 &zp->z_size, sizeof (zp->z_size), tx));
34dc7c2f 1576
2cc479d0 1577 zfs_rangelock_exit(lr);
34dc7c2f 1578
b128c09f 1579 dmu_tx_commit(tx);
34dc7c2f 1580
b128c09f
BB
1581 return (0);
1582}
1583
223df016
TC
1584/*
1585 * zfs_zero_partial_page - Modeled after update_pages() but
1586 * with different arguments and semantics for use by zfs_freesp().
1587 *
1588 * Zeroes a piece of a single page cache entry for zp at offset
1589 * start and length len.
1590 *
1591 * Caller must acquire a range lock on the file for the region
1592 * being zeroed in order that the ARC and page cache stay in sync.
1593 */
1594static void
1595zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
1596{
1597 struct address_space *mp = ZTOI(zp)->i_mapping;
1598 struct page *pp;
1599 int64_t off;
1600 void *pb;
1601
8b1899d3 1602 ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));
223df016 1603
8b1899d3
BB
1604 off = start & (PAGE_SIZE - 1);
1605 start &= PAGE_MASK;
223df016 1606
8b1899d3 1607 pp = find_lock_page(mp, start >> PAGE_SHIFT);
223df016
TC
1608 if (pp) {
1609 if (mapping_writably_mapped(mp))
1610 flush_dcache_page(pp);
1611
1612 pb = kmap(pp);
861166b0 1613 memset(pb + off, 0, len);
223df016
TC
1614 kunmap(pp);
1615
1616 if (mapping_writably_mapped(mp))
1617 flush_dcache_page(pp);
1618
1619 mark_page_accessed(pp);
1620 SetPageUptodate(pp);
1621 ClearPageError(pp);
1622 unlock_page(pp);
8b1899d3 1623 put_page(pp);
223df016
TC
1624 }
1625}
1626
b128c09f
BB
1627/*
1628 * Free space in a file.
1629 *
1630 * IN: zp - znode of file to free data in.
1631 * off - start of section to free.
1632 * len - length of section to free.
1633 *
19d55079 1634 * RETURN: 0 on success, error code on failure
b128c09f
BB
1635 */
1636static int
1637zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1638{
0037b49e 1639 zfsvfs_t *zfsvfs = ZTOZSB(zp);
bd4dde8e 1640 zfs_locked_range_t *lr;
b128c09f
BB
1641 int error;
1642
1643 /*
1644 * Lock the range being freed.
1645 */
2cc479d0 1646 lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
b128c09f
BB
1647
1648 /*
1649 * Nothing to do if file already at desired length.
1650 */
428870ff 1651 if (off >= zp->z_size) {
2cc479d0 1652 zfs_rangelock_exit(lr);
b128c09f 1653 return (0);
34dc7c2f
BB
1654 }
1655
428870ff
BB
1656 if (off + len > zp->z_size)
1657 len = zp->z_size - off;
b128c09f 1658
0037b49e 1659 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
b128c09f 1660
223df016
TC
1661 /*
1662 * Zero partial page cache entries. This must be done under a
1663 * range lock in order to keep the ARC and page cache in sync.
1664 */
3fc92adc 1665 if (zn_has_cached_data(zp, off, off + len - 1)) {
223df016
TC
1666 loff_t first_page, last_page, page_len;
1667 loff_t first_page_offset, last_page_offset;
1668
1669 /* first possible full page in hole */
8b1899d3 1670 first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
223df016 1671 /* last page of hole */
8b1899d3 1672 last_page = (off + len) >> PAGE_SHIFT;
223df016
TC
1673
1674 /* offset of first_page */
8b1899d3 1675 first_page_offset = first_page << PAGE_SHIFT;
223df016 1676 /* offset of last_page */
8b1899d3 1677 last_page_offset = last_page << PAGE_SHIFT;
223df016 1678
cb08f063
TC
1679 /* truncate whole pages */
1680 if (last_page_offset > first_page_offset) {
1681 truncate_inode_pages_range(ZTOI(zp)->i_mapping,
1682 first_page_offset, last_page_offset - 1);
1683 }
1684
1685 /* truncate sub-page ranges */
223df016
TC
1686 if (first_page > last_page) {
1687 /* entire punched area within a single page */
1688 zfs_zero_partial_page(zp, off, len);
1689 } else {
1690 /* beginning of punched area at the end of a page */
1691 page_len = first_page_offset - off;
1692 if (page_len > 0)
1693 zfs_zero_partial_page(zp, off, page_len);
1694
1695 /* end of punched area at the beginning of a page */
1696 page_len = off + len - last_page_offset;
1697 if (page_len > 0)
1698 zfs_zero_partial_page(zp, last_page_offset,
1699 page_len);
1700 }
1701 }
2cc479d0 1702 zfs_rangelock_exit(lr);
34dc7c2f 1703
b128c09f
BB
1704 return (error);
1705}
1706
1707/*
1708 * Truncate a file
1709 *
1710 * IN: zp - znode of file to free data in.
1711 * end - new end-of-file.
1712 *
19d55079 1713 * RETURN: 0 on success, error code on failure
b128c09f
BB
1714 */
1715static int
1716zfs_trunc(znode_t *zp, uint64_t end)
1717{
0037b49e 1718 zfsvfs_t *zfsvfs = ZTOZSB(zp);
b128c09f 1719 dmu_tx_t *tx;
bd4dde8e 1720 zfs_locked_range_t *lr;
b128c09f 1721 int error;
572e2857
BB
1722 sa_bulk_attr_t bulk[2];
1723 int count = 0;
b128c09f
BB
1724
1725 /*
1726 * We will change zp_size, lock the whole file.
1727 */
2cc479d0 1728 lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
b128c09f
BB
1729
1730 /*
1731 * Nothing to do if file already at desired length.
1732 */
428870ff 1733 if (end >= zp->z_size) {
2cc479d0 1734 zfs_rangelock_exit(lr);
b128c09f
BB
1735 return (0);
1736 }
1737
18a2485f
FS
1738 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
1739 DMU_OBJECT_END);
b128c09f 1740 if (error) {
2cc479d0 1741 zfs_rangelock_exit(lr);
b128c09f
BB
1742 return (error);
1743 }
0037b49e 1744 tx = dmu_tx_create(zfsvfs->z_os);
428870ff
BB
1745 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1746 zfs_sa_upgrade_txholds(tx, zp);
19d55079 1747 dmu_tx_mark_netfree(tx);
7a8f0e80 1748 error = dmu_tx_assign(tx, TXG_WAIT);
b128c09f 1749 if (error) {
b128c09f 1750 dmu_tx_abort(tx);
2cc479d0 1751 zfs_rangelock_exit(lr);
b128c09f
BB
1752 return (error);
1753 }
b128c09f 1754
428870ff 1755 zp->z_size = end;
0037b49e 1756 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
572e2857 1757 NULL, &zp->z_size, sizeof (zp->z_size));
428870ff 1758
572e2857
BB
1759 if (end == 0) {
1760 zp->z_pflags &= ~ZFS_SPARSE;
0037b49e 1761 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
572e2857
BB
1762 NULL, &zp->z_pflags, 8);
1763 }
1764 VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
b128c09f 1765
34dc7c2f 1766 dmu_tx_commit(tx);
2cc479d0 1767 zfs_rangelock_exit(lr);
34dc7c2f
BB
1768
1769 return (0);
1770}
1771
b128c09f
BB
1772/*
1773 * Free space in a file
1774 *
1775 * IN: zp - znode of file to free data in.
1776 * off - start of range
1777 * len - end of range (0 => EOF)
1778 * flag - current file open mode flags.
1779 * log - TRUE if this action should be logged
1780 *
19d55079 1781 * RETURN: 0 on success, error code on failure
b128c09f
BB
1782 */
1783int
1784zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1785{
b128c09f 1786 dmu_tx_t *tx;
0037b49e
BB
1787 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1788 zilog_t *zilog = zfsvfs->z_log;
428870ff
BB
1789 uint64_t mode;
1790 uint64_t mtime[2], ctime[2];
1791 sa_bulk_attr_t bulk[3];
1792 int count = 0;
b128c09f
BB
1793 int error;
1794
0037b49e 1795 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
428870ff
BB
1796 sizeof (mode))) != 0)
1797 return (error);
1798
1799 if (off > zp->z_size) {
b128c09f
BB
1800 error = zfs_extend(zp, off+len);
1801 if (error == 0 && log)
1802 goto log;
223df016 1803 goto out;
b128c09f
BB
1804 }
1805
b128c09f
BB
1806 if (len == 0) {
1807 error = zfs_trunc(zp, off);
1808 } else {
1809 if ((error = zfs_free_range(zp, off, len)) == 0 &&
428870ff 1810 off + len > zp->z_size)
b128c09f
BB
1811 error = zfs_extend(zp, off+len);
1812 }
1813 if (error || !log)
223df016 1814 goto out;
b128c09f 1815log:
0037b49e 1816 tx = dmu_tx_create(zfsvfs->z_os);
428870ff
BB
1817 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1818 zfs_sa_upgrade_txholds(tx, zp);
384f8a09 1819 error = dmu_tx_assign(tx, TXG_WAIT);
b128c09f 1820 if (error) {
b128c09f 1821 dmu_tx_abort(tx);
223df016 1822 goto out;
b128c09f
BB
1823 }
1824
0037b49e
BB
1825 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
1826 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
1827 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
428870ff 1828 NULL, &zp->z_pflags, 8);
0df9673f 1829 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
428870ff
BB
1830 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1831 ASSERT(error == 0);
1832
b128c09f
BB
1833 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1834
1835 dmu_tx_commit(tx);
223df016 1836
fc273894 1837 zfs_znode_update_vfs(zp);
223df016
TC
1838 error = 0;
1839
1840out:
1841 /*
1842 * Truncate the page cache - for file truncate operations, use
1843 * the purpose-built API for truncations. For punching operations,
cb08f063 1844 * the truncation is handled under a range lock in zfs_free_range.
223df016
TC
1845 */
1846 if (len == 0)
1847 truncate_setsize(ZTOI(zp), off);
223df016 1848 return (error);
b128c09f
BB
1849}
1850
34dc7c2f
BB
1851void
1852zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1853{
22872ff5 1854 struct super_block *sb;
0037b49e 1855 zfsvfs_t *zfsvfs;
428870ff 1856 uint64_t moid, obj, sa_obj, version;
22872ff5 1857 uint64_t sense = ZFS_CASE_SENSITIVE;
34dc7c2f
BB
1858 uint64_t norm = 0;
1859 nvpair_t *elem;
c96c36fa 1860 int size;
34dc7c2f 1861 int error;
22872ff5
BB
1862 int i;
1863 znode_t *rootzp = NULL;
1864 vattr_t vattr;
1865 znode_t *zp;
1866 zfs_acl_ids_t acl_ids;
34dc7c2f
BB
1867
1868 /*
1869 * First attempt to create master node.
1870 */
1871 /*
1872 * In an empty objset, there are no blocks to read and thus
1873 * there can be no i/o errors (which we assert below).
1874 */
1875 moid = MASTER_NODE_OBJ;
1876 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1877 DMU_OT_NONE, 0, tx);
1878 ASSERT(error == 0);
1879
1880 /*
1881 * Set starting attributes.
1882 */
428870ff 1883 version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
34dc7c2f
BB
1884 elem = NULL;
1885 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1886 /* For the moment we expect all zpl props to be uint64_ts */
1887 uint64_t val;
d1807f16 1888 const char *name;
34dc7c2f
BB
1889
1890 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1891 VERIFY(nvpair_value_uint64(elem, &val) == 0);
1892 name = nvpair_name(elem);
1893 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
9babb374
BB
1894 if (val < version)
1895 version = val;
34dc7c2f
BB
1896 } else {
1897 error = zap_update(os, moid, name, 8, 1, &val, tx);
1898 }
1899 ASSERT(error == 0);
1900 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1901 norm = val;
22872ff5
BB
1902 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1903 sense = val;
34dc7c2f
BB
1904 }
1905 ASSERT(version != 0);
9babb374 1906 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
2e7f664f 1907 ASSERT(error == 0);
34dc7c2f 1908
428870ff
BB
1909 /*
1910 * Create zap object used for SA attribute registration
1911 */
1912
1913 if (version >= ZPL_VERSION_SA) {
1914 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
1915 DMU_OT_NONE, 0, tx);
1916 error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
1917 ASSERT(error == 0);
1918 } else {
1919 sa_obj = 0;
1920 }
34dc7c2f
BB
1921 /*
1922 * Create a delete queue.
1923 */
9babb374 1924 obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
34dc7c2f 1925
9babb374 1926 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
34dc7c2f
BB
1927 ASSERT(error == 0);
1928
9babb374 1929 /*
0037b49e 1930 * Create root znode. Create minimal znode/inode/zfsvfs/sb
22872ff5 1931 * to allow zfs_mknode to work.
9babb374 1932 */
22872ff5
BB
1933 vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
1934 vattr.va_mode = S_IFDIR|0755;
1935 vattr.va_uid = crgetuid(cr);
1936 vattr.va_gid = crgetgid(cr);
1937
79c76d5b 1938 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
a43570c5
TK
1939 rootzp->z_unlinked = B_FALSE;
1940 rootzp->z_atime_dirty = B_FALSE;
22872ff5 1941 rootzp->z_is_sa = USE_SA(version, os);
9c5167d1 1942 rootzp->z_pflags = 0;
22872ff5 1943
0037b49e
BB
1944 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1945 zfsvfs->z_os = os;
1946 zfsvfs->z_parent = zfsvfs;
1947 zfsvfs->z_version = version;
1948 zfsvfs->z_use_fuids = USE_FUIDS(version, os);
1949 zfsvfs->z_use_sa = USE_SA(version, os);
1950 zfsvfs->z_norm = norm;
22872ff5 1951
79c76d5b 1952 sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
0037b49e 1953 sb->s_fs_info = zfsvfs;
22872ff5
BB
1954
1955 ZTOI(rootzp)->i_sb = sb;
1956
1957 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
0037b49e 1958 &zfsvfs->z_attr_table);
9babb374 1959
22872ff5 1960 ASSERT(error == 0);
9babb374 1961
60101509 1962 /*
22872ff5
BB
1963 * Fold case on file systems that are always or sometimes case
1964 * insensitive.
60101509 1965 */
22872ff5 1966 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
0037b49e 1967 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
60101509 1968
0037b49e
BB
1969 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1970 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
22872ff5 1971 offsetof(znode_t, z_link_node));
60101509 1972
c96c36fa 1973 size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
0037b49e
BB
1974 zfsvfs->z_hold_size = size;
1975 zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
1976 KM_SLEEP);
1977 zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
c96c36fa 1978 for (i = 0; i != size; i++) {
0037b49e 1979 avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
c96c36fa 1980 sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
0037b49e 1981 mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
c96c36fa 1982 }
60101509 1983
22872ff5 1984 VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
d4dc53da 1985 cr, NULL, &acl_ids, zfs_init_idmap));
22872ff5
BB
1986 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
1987 ASSERT3P(zp, ==, rootzp);
1988 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1989 ASSERT(error == 0);
1990 zfs_acl_ids_free(&acl_ids);
60101509 1991
22872ff5
BB
1992 atomic_set(&ZTOI(rootzp)->i_count, 0);
1993 sa_handle_destroy(rootzp->z_sa_hdl);
22872ff5
BB
1994 kmem_cache_free(znode_cache, rootzp);
1995
c96c36fa 1996 for (i = 0; i != size; i++) {
0037b49e
BB
1997 avl_destroy(&zfsvfs->z_hold_trees[i]);
1998 mutex_destroy(&zfsvfs->z_hold_locks[i]);
c96c36fa 1999 }
2708f716 2000
c17486b2
GN
2001 mutex_destroy(&zfsvfs->z_znodes_lock);
2002
0037b49e
BB
2003 vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
2004 vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
2708f716 2005 kmem_free(sb, sizeof (struct super_block));
0037b49e 2006 kmem_free(zfsvfs, sizeof (zfsvfs_t));
34dc7c2f 2007}
34dc7c2f 2008#endif /* _KERNEL */
428870ff 2009
34dc7c2f 2010static int
572e2857
BB
2011zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
2012{
2013 uint64_t sa_obj = 0;
2014 int error;
2015
2016 error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
2017 if (error != 0 && error != ENOENT)
2018 return (error);
2019
2020 error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
2021 return (error);
2022}
2023
2024static int
2025zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
dd66857d 2026 dmu_buf_t **db, const void *tag)
34dc7c2f 2027{
34dc7c2f 2028 dmu_object_info_t doi;
34dc7c2f 2029 int error;
428870ff 2030
7b8518cb 2031 if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
34dc7c2f
BB
2032 return (error);
2033
572e2857 2034 dmu_object_info_from_db(*db, &doi);
428870ff
BB
2035 if ((doi.doi_bonus_type != DMU_OT_SA &&
2036 doi.doi_bonus_type != DMU_OT_ZNODE) ||
d6320ddb
BB
2037 (doi.doi_bonus_type == DMU_OT_ZNODE &&
2038 doi.doi_bonus_size < sizeof (znode_phys_t))) {
7b8518cb 2039 sa_buf_rele(*db, tag);
2e528b49 2040 return (SET_ERROR(ENOTSUP));
34dc7c2f
BB
2041 }
2042
572e2857
BB
2043 error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
2044 if (error != 0) {
7b8518cb 2045 sa_buf_rele(*db, tag);
428870ff
BB
2046 return (error);
2047 }
2048
572e2857
BB
2049 return (0);
2050}
2051
65c7cc49 2052static void
dd66857d 2053zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, const void *tag)
572e2857
BB
2054{
2055 sa_handle_destroy(hdl);
7b8518cb 2056 sa_buf_rele(db, tag);
572e2857
BB
2057}
2058
2059/*
2060 * Given an object number, return its parent object number and whether
2061 * or not the object is an extended attribute directory.
2062 */
2063static int
b23ad7f3
JJ
2064zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
2065 uint64_t *pobjp, int *is_xattrdir)
572e2857
BB
2066{
2067 uint64_t parent;
2068 uint64_t pflags;
2069 uint64_t mode;
b23ad7f3 2070 uint64_t parent_mode;
572e2857 2071 sa_bulk_attr_t bulk[3];
b23ad7f3
JJ
2072 sa_handle_t *sa_hdl;
2073 dmu_buf_t *sa_db;
572e2857
BB
2074 int count = 0;
2075 int error;
2076
2077 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
2078 &parent, sizeof (parent));
428870ff 2079 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
572e2857 2080 &pflags, sizeof (pflags));
428870ff 2081 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
572e2857 2082 &mode, sizeof (mode));
428870ff 2083
572e2857 2084 if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
428870ff 2085 return (error);
572e2857 2086
b23ad7f3
JJ
2087 /*
2088 * When a link is removed its parent pointer is not changed and will
2089 * be invalid. There are two cases where a link is removed but the
2090 * file stays around, when it goes to the delete queue and when there
2091 * are additional links.
2092 */
2093 error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
2094 if (error != 0)
2095 return (error);
2096
2097 error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
2098 zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2099 if (error != 0)
2100 return (error);
2101
428870ff 2102 *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
34dc7c2f 2103
b23ad7f3
JJ
2104 /*
2105 * Extended attributes can be applied to files, directories, etc.
2106 * Otherwise the parent must be a directory.
2107 */
2108 if (!*is_xattrdir && !S_ISDIR(parent_mode))
ecb2b7dc 2109 return (SET_ERROR(EINVAL));
b23ad7f3
JJ
2110
2111 *pobjp = parent;
2112
34dc7c2f
BB
2113 return (0);
2114}
2115
572e2857
BB
2116/*
2117 * Given an object number, return some zpl level statistics
2118 */
2119static int
2120zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
2121 zfs_stat_t *sb)
34dc7c2f 2122{
572e2857
BB
2123 sa_bulk_attr_t bulk[4];
2124 int count = 0;
2125
2126 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2127 &sb->zs_mode, sizeof (sb->zs_mode));
2128 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
2129 &sb->zs_gen, sizeof (sb->zs_gen));
2130 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
2131 &sb->zs_links, sizeof (sb->zs_links));
2132 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
2133 &sb->zs_ctime, sizeof (sb->zs_ctime));
2134
2135 return (sa_bulk_lookup(hdl, bulk, count));
2136}
2137
2138static int
2139zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
2140 sa_attr_type_t *sa_table, char *buf, int len)
2141{
2142 sa_handle_t *sa_hdl;
2143 sa_handle_t *prevhdl = NULL;
2144 dmu_buf_t *prevdb = NULL;
2145 dmu_buf_t *sa_db = NULL;
34dc7c2f
BB
2146 char *path = buf + len - 1;
2147 int error;
2148
2149 *path = '\0';
572e2857 2150 sa_hdl = hdl;
428870ff 2151
64c1dcef
PD
2152 uint64_t deleteq_obj;
2153 VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
2154 ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
2155 error = zap_lookup_int(osp, deleteq_obj, obj);
2156 if (error == 0) {
2157 return (ESTALE);
2158 } else if (error != ENOENT) {
2159 return (error);
2160 }
64c1dcef 2161
34dc7c2f 2162 for (;;) {
17897ce2 2163 uint64_t pobj = 0;
34dc7c2f
BB
2164 char component[MAXNAMELEN + 2];
2165 size_t complen;
17897ce2 2166 int is_xattrdir = 0;
34dc7c2f 2167
4f22619a
KT
2168 if (prevdb) {
2169 ASSERT(prevhdl != NULL);
7b8518cb 2170 zfs_release_sa_handle(prevhdl, prevdb, FTAG);
4f22619a 2171 }
572e2857 2172
b23ad7f3 2173 if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
572e2857 2174 &is_xattrdir)) != 0)
34dc7c2f
BB
2175 break;
2176
2177 if (pobj == obj) {
2178 if (path[0] != '/')
2179 *--path = '/';
2180 break;
2181 }
2182
2183 component[0] = '/';
2184 if (is_xattrdir) {
861166b0 2185 strcpy(component + 1, "<xattrdir>");
34dc7c2f
BB
2186 } else {
2187 error = zap_value_search(osp, pobj, obj,
2188 ZFS_DIRENT_OBJ(-1ULL), component + 1);
2189 if (error != 0)
2190 break;
2191 }
2192
2193 complen = strlen(component);
2194 path -= complen;
2195 ASSERT(path >= buf);
861166b0 2196 memcpy(path, component, complen);
34dc7c2f 2197 obj = pobj;
572e2857
BB
2198
2199 if (sa_hdl != hdl) {
2200 prevhdl = sa_hdl;
2201 prevdb = sa_db;
2202 }
7b8518cb 2203 error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
572e2857
BB
2204 if (error != 0) {
2205 sa_hdl = prevhdl;
2206 sa_db = prevdb;
2207 break;
2208 }
2209 }
2210
2211 if (sa_hdl != NULL && sa_hdl != hdl) {
2212 ASSERT(sa_db != NULL);
7b8518cb 2213 zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
34dc7c2f
BB
2214 }
2215
2216 if (error == 0)
2217 (void) memmove(buf, path, buf + len - path);
428870ff 2218
34dc7c2f
BB
2219 return (error);
2220}
572e2857
BB
2221
2222int
2223zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
2224{
2225 sa_attr_type_t *sa_table;
2226 sa_handle_t *hdl;
2227 dmu_buf_t *db;
2228 int error;
2229
2230 error = zfs_sa_setup(osp, &sa_table);
2231 if (error != 0)
2232 return (error);
2233
7b8518cb 2234 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
572e2857
BB
2235 if (error != 0)
2236 return (error);
2237
2238 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2239
7b8518cb 2240 zfs_release_sa_handle(hdl, db, FTAG);
572e2857
BB
2241 return (error);
2242}
2243
2244int
2245zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
2246 char *buf, int len)
2247{
2248 char *path = buf + len - 1;
2249 sa_attr_type_t *sa_table;
2250 sa_handle_t *hdl;
2251 dmu_buf_t *db;
2252 int error;
2253
2254 *path = '\0';
2255
2256 error = zfs_sa_setup(osp, &sa_table);
2257 if (error != 0)
2258 return (error);
2259
7b8518cb 2260 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
572e2857
BB
2261 if (error != 0)
2262 return (error);
2263
2264 error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
2265 if (error != 0) {
7b8518cb 2266 zfs_release_sa_handle(hdl, db, FTAG);
572e2857
BB
2267 return (error);
2268 }
2269
2270 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2271
7b8518cb 2272 zfs_release_sa_handle(hdl, db, FTAG);
572e2857
BB
2273 return (error);
2274}
c28b2279 2275
2b9f8ba6
RN
2276/*
2277 * Read a property stored within the master node.
2278 */
2279int
2280zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
2281{
2282 uint64_t *cached_copy = NULL;
2283
2284 /*
2285 * Figure out where in the objset_t the cached copy would live, if it
2286 * is available for the requested property.
2287 */
2288 if (os != NULL) {
2289 switch (prop) {
2290 case ZFS_PROP_VERSION:
2291 cached_copy = &os->os_version;
2292 break;
2293 case ZFS_PROP_NORMALIZE:
2294 cached_copy = &os->os_normalization;
2295 break;
2296 case ZFS_PROP_UTF8ONLY:
2297 cached_copy = &os->os_utf8only;
2298 break;
2299 case ZFS_PROP_CASE:
2300 cached_copy = &os->os_casesensitivity;
2301 break;
2302 default:
2303 break;
2304 }
2305 }
2306 if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
2307 *value = *cached_copy;
2308 return (0);
2309 }
2310
2311 /*
2312 * If the property wasn't cached, look up the file system's value for
2313 * the property. For the version property, we look up a slightly
2314 * different string.
2315 */
2316 const char *pname;
2317 int error = ENOENT;
2318 if (prop == ZFS_PROP_VERSION)
2319 pname = ZPL_VERSION_STR;
2320 else
2321 pname = zfs_prop_to_name(prop);
2322
2323 if (os != NULL) {
2324 ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
2325 error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
2326 }
2327
2328 if (error == ENOENT) {
2329 /* No value set, use the default value */
2330 switch (prop) {
2331 case ZFS_PROP_VERSION:
2332 *value = ZPL_VERSION;
2333 break;
2334 case ZFS_PROP_NORMALIZE:
2335 case ZFS_PROP_UTF8ONLY:
2336 *value = 0;
2337 break;
2338 case ZFS_PROP_CASE:
2339 *value = ZFS_CASE_SENSITIVE;
2340 break;
2341 case ZFS_PROP_ACLTYPE:
2342 *value = ZFS_ACLTYPE_OFF;
2343 break;
2344 default:
2345 return (error);
2346 }
2347 error = 0;
2348 }
2349
2350 /*
2351 * If one of the methods for getting the property value above worked,
2352 * copy it into the objset_t's cache.
2353 */
2354 if (error == 0 && cached_copy != NULL) {
2355 *cached_copy = *value;
2356 }
2357
2358 return (error);
2359}
2360
#if defined(_KERNEL)
/* Entry points made available to other kernel modules. */
EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);

/* Tunables exposed as module parameters (mode 0644). */
/* CSTYLED */
module_param(zfs_object_mutex_size, uint, 0644);
MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");

module_param(zfs_unlink_suspend_progress, int, 0644);
MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
"(debug - leaks space into the unlinked set)");
#endif /* _KERNEL */