]> git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/zfs_znode.c
Merge branch 'zfsonlinux/merge-spl'
[mirror_zfs.git] / module / zfs / zfs_znode.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
428870ff 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
19d55079 23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
34dc7c2f
BB
24 */
25
26/* Portions Copyright 2007 Jeremy Teo */
27
34dc7c2f
BB
28#ifdef _KERNEL
29#include <sys/types.h>
30#include <sys/param.h>
31#include <sys/time.h>
32#include <sys/systm.h>
33#include <sys/sysmacros.h>
34#include <sys/resource.h>
35#include <sys/mntent.h>
36#include <sys/mkdev.h>
37#include <sys/u8_textprep.h>
38#include <sys/dsl_dataset.h>
39#include <sys/vfs.h>
40#include <sys/vfs_opreg.h>
41#include <sys/vnode.h>
42#include <sys/file.h>
43#include <sys/kmem.h>
44#include <sys/errno.h>
45#include <sys/unistd.h>
46#include <sys/mode.h>
47#include <sys/atomic.h>
48#include <vm/pvn.h>
49#include "fs/fs_subr.h"
50#include <sys/zfs_dir.h>
51#include <sys/zfs_acl.h>
52#include <sys/zfs_ioctl.h>
53#include <sys/zfs_rlock.h>
54#include <sys/zfs_fuid.h>
3558fd73 55#include <sys/zfs_vnops.h>
ebe7e575 56#include <sys/zfs_ctldir.h>
428870ff 57#include <sys/dnode.h>
34dc7c2f
BB
58#include <sys/fs/zfs.h>
59#include <sys/kidmap.h>
3558fd73 60#include <sys/zpl.h>
34dc7c2f
BB
61#endif /* _KERNEL */
62
63#include <sys/dmu.h>
f1512ee6 64#include <sys/dmu_objset.h>
50c957f7 65#include <sys/dmu_tx.h>
34dc7c2f
BB
66#include <sys/refcount.h>
67#include <sys/stat.h>
68#include <sys/zap.h>
69#include <sys/zfs_znode.h>
428870ff
BB
70#include <sys/sa.h>
71#include <sys/zfs_sa.h>
572e2857 72#include <sys/zfs_stat.h>
34dc7c2f
BB
73
74#include "zfs_prop.h"
428870ff 75#include "zfs_comutil.h"
34dc7c2f 76
b128c09f
BB
/*
 * Statistic gathering is enabled by defining ZNODE_STATS.  It is switched
 * on automatically whenever DEBUG is defined.
 */
#if defined(DEBUG)
#define	ZNODE_STATS
#endif /* DEBUG */

/*
 * ZNODE_STAT_ADD() bumps a counter when statistics are enabled and
 * expands to nothing otherwise.
 */
#if defined(ZNODE_STATS)
#define	ZNODE_STAT_ADD(stat)			((stat)++)
#else
#define	ZNODE_STAT_ADD(stat)			/* nothing */
#endif /* ZNODE_STATS */
90
34dc7c2f
BB
/*
 * Functions needed for userland (ie: libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
96#ifdef _KERNEL
9babb374 97
b128c09f 98static kmem_cache_t *znode_cache = NULL;
c96c36fa 99static kmem_cache_t *znode_hold_cache = NULL;
0720116d 100unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
34dc7c2f 101
34dc7c2f
BB
102/*ARGSUSED*/
103static int
b128c09f 104zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
34dc7c2f
BB
105{
106 znode_t *zp = buf;
107
3558fd73 108 inode_init_once(ZTOI(zp));
b128c09f
BB
109 list_link_init(&zp->z_link_node);
110
34dc7c2f 111 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
34dc7c2f 112 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
448d7aaa 113 rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
34dc7c2f 114 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
82a37189 115 rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
34dc7c2f 116
d88895a0 117 zfs_rlock_init(&zp->z_range_lock);
34dc7c2f 118
b128c09f 119 zp->z_dirlocks = NULL;
45d1cae3 120 zp->z_acl_cached = NULL;
82a37189 121 zp->z_xattr_cached = NULL;
98701490 122 zp->z_xattr_parent = 0;
572e2857 123 zp->z_moved = 0;
34dc7c2f
BB
124 return (0);
125}
126
127/*ARGSUSED*/
128static void
b128c09f 129zfs_znode_cache_destructor(void *buf, void *arg)
34dc7c2f
BB
130{
131 znode_t *zp = buf;
132
b128c09f 133 ASSERT(!list_link_active(&zp->z_link_node));
34dc7c2f 134 mutex_destroy(&zp->z_lock);
34dc7c2f
BB
135 rw_destroy(&zp->z_parent_lock);
136 rw_destroy(&zp->z_name_lock);
137 mutex_destroy(&zp->z_acl_lock);
82a37189 138 rw_destroy(&zp->z_xattr_lock);
d88895a0 139 zfs_rlock_destroy(&zp->z_range_lock);
34dc7c2f 140
b128c09f 141 ASSERT(zp->z_dirlocks == NULL);
45d1cae3 142 ASSERT(zp->z_acl_cached == NULL);
82a37189 143 ASSERT(zp->z_xattr_cached == NULL);
b128c09f
BB
144}
145
c96c36fa
BB
146static int
147zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
148{
149 znode_hold_t *zh = buf;
150
151 mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
152 refcount_create(&zh->zh_refcount);
153 zh->zh_obj = ZFS_NO_OBJECT;
154
155 return (0);
156}
157
158static void
159zfs_znode_hold_cache_destructor(void *buf, void *arg)
160{
161 znode_hold_t *zh = buf;
162
163 mutex_destroy(&zh->zh_lock);
164 refcount_destroy(&zh->zh_refcount);
165}
166
34dc7c2f
BB
167void
168zfs_znode_init(void)
169{
170 /*
5074bfe8
TC
171 * Initialize zcache. The KMC_SLAB hint is used in order that it be
172 * backed by kmalloc() when on the Linux slab in order that any
173 * wait_on_bit() operations on the related inode operate properly.
34dc7c2f
BB
174 */
175 ASSERT(znode_cache == NULL);
176 znode_cache = kmem_cache_create("zfs_znode_cache",
177 sizeof (znode_t), 0, zfs_znode_cache_constructor,
5074bfe8 178 zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
c96c36fa
BB
179
180 ASSERT(znode_hold_cache == NULL);
181 znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
182 sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
183 zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
34dc7c2f
BB
184}
185
186void
187zfs_znode_fini(void)
188{
34dc7c2f
BB
189 /*
190 * Cleanup zcache
191 */
192 if (znode_cache)
193 kmem_cache_destroy(znode_cache);
194 znode_cache = NULL;
c96c36fa
BB
195
196 if (znode_hold_cache)
197 kmem_cache_destroy(znode_hold_cache);
198 znode_hold_cache = NULL;
199}
200
/*
 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
 * serialize access to a znode and its SA buffer while the object is being
 * created or destroyed.  This kind of locking would normally reside in the
 * znode itself but in this case that's impossible because the znode and SA
 * buffer may not yet exist.  Therefore the locking is handled externally
 * with an array of mutexes and AVL trees which contain per-object locks.
 *
 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
 * into the correct AVL tree and finally the per-object lock is held.  In
 * zfs_znode_hold_exit() the process is reversed.  The per-object lock is
 * released, removed from the AVL tree and destroyed if there are no waiters.
 *
 * This scheme has two important properties:
 *
 * 1) No memory allocations are performed while holding one of the
 *    z_hold_locks.  This ensures evict(), which can be called from direct
 *    memory reclaim, will never block waiting on a z_hold_locks which just
 *    happens to have hashed to the same index.
 *
 * 2) All locks used to serialize access to an object are per-object and never
 *    shared.  This minimizes lock contention without creating a large number
 *    of dedicated locks.
 *
 * On the downside it does require znode_hold_t structures to be frequently
 * allocated and freed.  However, because these are backed by a kmem cache
 * and very short lived this cost is minimal.
 */
229int
230zfs_znode_hold_compare(const void *a, const void *b)
231{
ee36c709
GN
232 const znode_hold_t *zh_a = (const znode_hold_t *)a;
233 const znode_hold_t *zh_b = (const znode_hold_t *)b;
234
235 return (AVL_CMP(zh_a->zh_obj, zh_b->zh_obj));
c96c36fa
BB
236}
237
238boolean_t
0037b49e 239zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
c96c36fa
BB
240{
241 znode_hold_t *zh, search;
0037b49e 242 int i = ZFS_OBJ_HASH(zfsvfs, obj);
37c56346 243 boolean_t held;
c96c36fa
BB
244
245 search.zh_obj = obj;
246
0037b49e
BB
247 mutex_enter(&zfsvfs->z_hold_locks[i]);
248 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
37c56346 249 held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
0037b49e 250 mutex_exit(&zfsvfs->z_hold_locks[i]);
c96c36fa 251
37c56346 252 return (held);
c96c36fa
BB
253}
254
255static znode_hold_t *
0037b49e 256zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
c96c36fa
BB
257{
258 znode_hold_t *zh, *zh_new, search;
0037b49e 259 int i = ZFS_OBJ_HASH(zfsvfs, obj);
c96c36fa
BB
260 boolean_t found = B_FALSE;
261
262 zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
263 zh_new->zh_obj = obj;
264 search.zh_obj = obj;
265
0037b49e
BB
266 mutex_enter(&zfsvfs->z_hold_locks[i]);
267 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
c96c36fa
BB
268 if (likely(zh == NULL)) {
269 zh = zh_new;
0037b49e 270 avl_add(&zfsvfs->z_hold_trees[i], zh);
c96c36fa
BB
271 } else {
272 ASSERT3U(zh->zh_obj, ==, obj);
273 found = B_TRUE;
274 }
275 refcount_add(&zh->zh_refcount, NULL);
0037b49e 276 mutex_exit(&zfsvfs->z_hold_locks[i]);
c96c36fa
BB
277
278 if (found == B_TRUE)
279 kmem_cache_free(znode_hold_cache, zh_new);
280
281 ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
282 ASSERT3S(refcount_count(&zh->zh_refcount), >, 0);
283 mutex_enter(&zh->zh_lock);
284
285 return (zh);
286}
287
288static void
0037b49e 289zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
c96c36fa 290{
0037b49e 291 int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
c96c36fa
BB
292 boolean_t remove = B_FALSE;
293
0037b49e 294 ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
c96c36fa
BB
295 ASSERT3S(refcount_count(&zh->zh_refcount), >, 0);
296 mutex_exit(&zh->zh_lock);
297
0037b49e 298 mutex_enter(&zfsvfs->z_hold_locks[i]);
c96c36fa 299 if (refcount_remove(&zh->zh_refcount, NULL) == 0) {
0037b49e 300 avl_remove(&zfsvfs->z_hold_trees[i], zh);
c96c36fa
BB
301 remove = B_TRUE;
302 }
0037b49e 303 mutex_exit(&zfsvfs->z_hold_locks[i]);
c96c36fa
BB
304
305 if (remove == B_TRUE)
306 kmem_cache_free(znode_hold_cache, zh);
34dc7c2f
BB
307}
308
34dc7c2f 309int
0037b49e 310zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
34dc7c2f 311{
3c9609b3 312#ifdef HAVE_SMB_SHARE
9babb374
BB
313 zfs_acl_ids_t acl_ids;
314 vattr_t vattr;
315 znode_t *sharezp;
316 vnode_t *vp;
317 znode_t *zp;
318 int error;
34dc7c2f 319
9babb374 320 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
3558fd73 321 vattr.va_mode = S_IFDIR | 0555;
9babb374
BB
322 vattr.va_uid = crgetuid(kcred);
323 vattr.va_gid = crgetgid(kcred);
34dc7c2f 324
79c76d5b 325 sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
572e2857 326 sharezp->z_moved = 0;
9babb374
BB
327 sharezp->z_unlinked = 0;
328 sharezp->z_atime_dirty = 0;
329 sharezp->z_zfsvfs = zfsvfs;
428870ff 330 sharezp->z_is_sa = zfsvfs->z_use_sa;
9c5167d1 331 sharezp->z_pflags = 0;
34dc7c2f 332
9babb374
BB
333 vp = ZTOV(sharezp);
334 vn_reinit(vp);
335 vp->v_type = VDIR;
34dc7c2f 336
9babb374
BB
337 VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
338 kcred, NULL, &acl_ids));
428870ff 339 zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
9babb374
BB
340 ASSERT3P(zp, ==, sharezp);
341 ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
342 POINTER_INVALIDATE(&sharezp->z_zfsvfs);
343 error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
344 ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
345 zfsvfs->z_shares_dir = sharezp->z_id;
346
347 zfs_acl_ids_free(&acl_ids);
3558fd73 348 // ZTOV(sharezp)->v_count = 0;
428870ff 349 sa_handle_destroy(sharezp->z_sa_hdl);
9babb374 350 kmem_cache_free(znode_cache, sharezp);
34dc7c2f 351
9babb374 352 return (error);
9ee7fac5
BB
353#else
354 return (0);
3c9609b3 355#endif /* HAVE_SMB_SHARE */
34dc7c2f
BB
356}
357
34dc7c2f 358static void
0037b49e 359zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
428870ff 360 dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
34dc7c2f 361{
0037b49e 362 ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
34dc7c2f
BB
363
364 mutex_enter(&zp->z_lock);
365
428870ff
BB
366 ASSERT(zp->z_sa_hdl == NULL);
367 ASSERT(zp->z_acl_cached == NULL);
368 if (sa_hdl == NULL) {
0037b49e 369 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
428870ff
BB
370 SA_HDL_SHARED, &zp->z_sa_hdl));
371 } else {
372 zp->z_sa_hdl = sa_hdl;
373 sa_set_userp(sa_hdl, zp);
374 }
34dc7c2f 375
428870ff 376 zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
34dc7c2f 377
34dc7c2f 378 mutex_exit(&zp->z_lock);
34dc7c2f
BB
379}
380
381void
382zfs_znode_dmu_fini(znode_t *zp)
383{
c96c36fa 384 ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked ||
3558fd73 385 RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));
428870ff
BB
386
387 sa_handle_destroy(zp->z_sa_hdl);
388 zp->z_sa_hdl = NULL;
34dc7c2f
BB
389}
390
391/*
3558fd73
BB
392 * Called by new_inode() to allocate a new inode.
393 */
394int
395zfs_inode_alloc(struct super_block *sb, struct inode **ip)
396{
397 znode_t *zp;
398
79c76d5b 399 zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
3558fd73
BB
400 *ip = ZTOI(zp);
401
402 return (0);
403}
404
405/*
406 * Called in multiple places when an inode should be destroyed.
407 */
408void
409zfs_inode_destroy(struct inode *ip)
410{
411 znode_t *zp = ITOZ(ip);
0037b49e 412 zfsvfs_t *zfsvfs = ZTOZSB(zp);
3558fd73 413
0037b49e 414 mutex_enter(&zfsvfs->z_znodes_lock);
7b3e34ba 415 if (list_link_active(&zp->z_link_node)) {
0037b49e
BB
416 list_remove(&zfsvfs->z_all_znodes, zp);
417 zfsvfs->z_nr_znodes--;
7b3e34ba 418 }
0037b49e 419 mutex_exit(&zfsvfs->z_znodes_lock);
3558fd73
BB
420
421 if (zp->z_acl_cached) {
422 zfs_acl_free(zp->z_acl_cached);
423 zp->z_acl_cached = NULL;
424 }
425
82a37189
BB
426 if (zp->z_xattr_cached) {
427 nvlist_free(zp->z_xattr_cached);
428 zp->z_xattr_cached = NULL;
429 }
430
3558fd73
BB
431 kmem_cache_free(znode_cache, zp);
432}
433
434static void
0037b49e 435zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
3558fd73 436{
aa6d8c10 437 uint64_t rdev = 0;
3558fd73
BB
438
439 switch (ip->i_mode & S_IFMT) {
440 case S_IFREG:
441 ip->i_op = &zpl_inode_operations;
442 ip->i_fop = &zpl_file_operations;
443 ip->i_mapping->a_ops = &zpl_address_space_operations;
444 break;
445
446 case S_IFDIR:
447 ip->i_op = &zpl_dir_inode_operations;
448 ip->i_fop = &zpl_dir_file_operations;
449 ITOZ(ip)->z_zn_prefetch = B_TRUE;
450 break;
451
452 case S_IFLNK:
453 ip->i_op = &zpl_symlink_inode_operations;
454 break;
455
aa6d8c10
NB
456 /*
457 * rdev is only stored in a SA only for device files.
458 */
3558fd73
BB
459 case S_IFCHR:
460 case S_IFBLK:
0037b49e 461 (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
53b1d979 462 sizeof (rdev));
aa6d8c10
NB
463 /*FALLTHROUGH*/
464 case S_IFIFO:
465 case S_IFSOCK:
3558fd73
BB
466 init_special_inode(ip, ip->i_mode, rdev);
467 ip->i_op = &zpl_special_inode_operations;
468 break;
469
470 default:
53b1d979
BB
471 zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
472 (u_longlong_t)ip->i_ino, ip->i_mode);
473
474 /* Assume the inode is a file and attempt to continue */
475 ip->i_mode = S_IFREG | 0644;
476 ip->i_op = &zpl_inode_operations;
477 ip->i_fop = &zpl_file_operations;
478 ip->i_mapping->a_ops = &zpl_address_space_operations;
479 break;
3558fd73
BB
480 }
481}
482
7bb1325f
CC
483void
484zfs_set_inode_flags(znode_t *zp, struct inode *ip)
485{
486 /*
487 * Linux and Solaris have different sets of file attributes, so we
488 * restrict this conversion to the intersection of the two.
489 */
a5248129
CC
490#ifdef HAVE_INODE_SET_FLAGS
491 unsigned int flags = 0;
492 if (zp->z_pflags & ZFS_IMMUTABLE)
493 flags |= S_IMMUTABLE;
494 if (zp->z_pflags & ZFS_APPENDONLY)
495 flags |= S_APPEND;
7bb1325f 496
a5248129
CC
497 inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
498#else
7bb1325f
CC
499 if (zp->z_pflags & ZFS_IMMUTABLE)
500 ip->i_flags |= S_IMMUTABLE;
501 else
502 ip->i_flags &= ~S_IMMUTABLE;
503
504 if (zp->z_pflags & ZFS_APPENDONLY)
505 ip->i_flags |= S_APPEND;
506 else
507 ip->i_flags &= ~S_APPEND;
a5248129 508#endif
7bb1325f
CC
509}
510
704cd075
CC
511/*
512 * Update the embedded inode given the znode. We should work toward
513 * eliminating this function as soon as possible by removing values
514 * which are duplicated between the znode and inode. If the generic
515 * inode has the correct field it should be used, and the ZFS code
516 * updated to access the inode. This can be done incrementally.
517 */
9f5f0019
NB
518void
519zfs_inode_update(znode_t *zp)
704cd075 520{
0037b49e 521 zfsvfs_t *zfsvfs;
704cd075
CC
522 struct inode *ip;
523 uint32_t blksize;
524 u_longlong_t i_blocks;
704cd075
CC
525
526 ASSERT(zp != NULL);
0037b49e 527 zfsvfs = ZTOZSB(zp);
704cd075
CC
528 ip = ZTOI(zp);
529
530 /* Skip .zfs control nodes which do not exist on disk. */
531 if (zfsctl_is_node(ip))
532 return;
533
704cd075
CC
534 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
535
536 spin_lock(&ip->i_lock);
704cd075 537 ip->i_blocks = i_blocks;
704cd075
CC
538 i_size_write(ip, zp->z_size);
539 spin_unlock(&ip->i_lock);
540}
541
704cd075 542
3558fd73
BB
543/*
544 * Construct a znode+inode and initialize.
34dc7c2f
BB
545 *
546 * This does not do a call to dmu_set_user() that is
547 * up to the caller to do, in case you don't want to
548 * return the znode
549 */
550static znode_t *
0037b49e 551zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
31b6111f 552 dmu_object_type_t obj_type, uint64_t obj, sa_handle_t *hdl)
34dc7c2f
BB
553{
554 znode_t *zp;
3558fd73 555 struct inode *ip;
7f89ae6b 556 uint64_t mode;
428870ff 557 uint64_t parent;
278f2236 558 uint64_t tmp_gen;
dfbc8630 559 uint64_t links;
2c6abf15 560 uint64_t z_uid, z_gid;
9f5f0019 561 uint64_t atime[2], mtime[2], ctime[2];
9c5167d1 562 uint64_t projid = ZFS_DEFAULT_PROJID;
9f5f0019 563 sa_bulk_attr_t bulk[11];
428870ff 564 int count = 0;
34dc7c2f 565
0037b49e 566 ASSERT(zfsvfs != NULL);
34dc7c2f 567
0037b49e 568 ip = new_inode(zfsvfs->z_sb);
3558fd73
BB
569 if (ip == NULL)
570 return (NULL);
7304b6e5 571
3558fd73 572 zp = ITOZ(ip);
34dc7c2f 573 ASSERT(zp->z_dirlocks == NULL);
ebe7e575
BB
574 ASSERT3P(zp->z_acl_cached, ==, NULL);
575 ASSERT3P(zp->z_xattr_cached, ==, NULL);
572e2857 576 zp->z_moved = 0;
428870ff 577 zp->z_sa_hdl = NULL;
34dc7c2f
BB
578 zp->z_unlinked = 0;
579 zp->z_atime_dirty = 0;
580 zp->z_mapcnt = 0;
34dc7c2f
BB
581 zp->z_id = db->db_object;
582 zp->z_blksz = blksz;
583 zp->z_seq = 0x7A4653;
584 zp->z_sync_cnt = 0;
ebe7e575
BB
585 zp->z_is_mapped = B_FALSE;
586 zp->z_is_ctldir = B_FALSE;
7b3e34ba 587 zp->z_is_stale = B_FALSE;
d88895a0
CC
588 zp->z_range_lock.zr_size = &zp->z_size;
589 zp->z_range_lock.zr_blksz = &zp->z_blksz;
590 zp->z_range_lock.zr_max_blksz = &ZTOZSB(zp)->z_max_blksz;
34dc7c2f 591
0037b49e 592 zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
3558fd73 593
0037b49e
BB
594 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
595 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
596 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
597 &zp->z_size, 8);
598 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
599 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
428870ff 600 &zp->z_pflags, 8);
0037b49e 601 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
7304b6e5 602 &parent, 8);
0037b49e
BB
603 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
604 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
605 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
606 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
607 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
428870ff 608
9c5167d1
NF
609 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
610 (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
611 (zp->z_pflags & ZFS_PROJID) &&
612 sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
428870ff
BB
613 if (hdl == NULL)
614 sa_handle_destroy(zp->z_sa_hdl);
07d63f0c 615 zp->z_sa_hdl = NULL;
3558fd73 616 goto error;
34dc7c2f 617 }
7304b6e5 618
9c5167d1 619 zp->z_projid = projid;
12fa7f34 620 zp->z_mode = ip->i_mode = mode;
278f2236 621 ip->i_generation = (uint32_t)tmp_gen;
ba2fe6af 622 ip->i_blkbits = SPA_MINBLOCKSHIFT;
dfbc8630 623 set_nlink(ip, (uint32_t)links);
2c6abf15
NB
624 zfs_uid_write(ip, z_uid);
625 zfs_gid_write(ip, z_gid);
7bb1325f 626 zfs_set_inode_flags(zp, ip);
7f89ae6b 627
98701490
CC
628 /* Cache the xattr parent id */
629 if (zp->z_pflags & ZFS_XATTR)
630 zp->z_xattr_parent = parent;
631
9f5f0019
NB
632 ZFS_TIME_DECODE(&ip->i_atime, atime);
633 ZFS_TIME_DECODE(&ip->i_mtime, mtime);
634 ZFS_TIME_DECODE(&ip->i_ctime, ctime);
635
3558fd73 636 ip->i_ino = obj;
9f5f0019 637 zfs_inode_update(zp);
0037b49e 638 zfs_inode_set_ops(zfsvfs, ip);
3558fd73 639
7b3e34ba
BB
640 /*
641 * The only way insert_inode_locked() can fail is if the ip->i_ino
642 * number is already hashed for this super block. This can never
643 * happen because the inode numbers map 1:1 with the object numbers.
644 *
645 * The one exception is rolling back a mounted file system, but in
646 * this case all the active inode are unhashed during the rollback.
647 */
648 VERIFY3S(insert_inode_locked(ip), ==, 0);
c85b224f 649
0037b49e
BB
650 mutex_enter(&zfsvfs->z_znodes_lock);
651 list_insert_tail(&zfsvfs->z_all_znodes, zp);
652 zfsvfs->z_nr_znodes++;
b128c09f 653 membar_producer();
0037b49e 654 mutex_exit(&zfsvfs->z_znodes_lock);
b128c09f 655
3558fd73 656 unlock_new_inode(ip);
34dc7c2f 657 return (zp);
3558fd73
BB
658
659error:
3558fd73 660 iput(ip);
d1d7e268 661 return (NULL);
34dc7c2f
BB
662}
663
1e8db771
BB
664/*
665 * Safely mark an inode dirty. Inodes which are part of a read-only
666 * file system or snapshot may not be dirtied.
667 */
668void
669zfs_mark_inode_dirty(struct inode *ip)
670{
0037b49e 671 zfsvfs_t *zfsvfs = ITOZSB(ip);
1e8db771 672
0037b49e 673 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
1e8db771
BB
674 return;
675
676 mark_inode_dirty(ip);
677}
678
428870ff
BB
679static uint64_t empty_xattr;
680static uint64_t pad[4];
681static zfs_acl_phys_t acl_phys;
34dc7c2f
BB
682/*
683 * Create a new DMU object to hold a zfs znode.
684 *
685 * IN: dzp - parent directory for new znode
686 * vap - file attributes for new znode
687 * tx - dmu transaction id for zap operations
688 * cr - credentials of caller
689 * flag - flags:
690 * IS_ROOT_NODE - new object will be root
691 * IS_XATTR - new object is an attribute
34dc7c2f
BB
692 * bonuslen - length of bonus buffer
693 * setaclp - File/Dir initial ACL
694 * fuidp - Tracks fuid allocation.
695 *
696 * OUT: zpp - allocated znode
697 *
698 */
699void
700zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
428870ff 701 uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
34dc7c2f 702{
428870ff
BB
703 uint64_t crtime[2], atime[2], mtime[2], ctime[2];
704 uint64_t mode, size, links, parent, pflags;
9c5167d1 705 uint64_t projid = ZFS_DEFAULT_PROJID;
428870ff 706 uint64_t rdev = 0;
0037b49e 707 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
428870ff 708 dmu_buf_t *db;
34dc7c2f
BB
709 timestruc_t now;
710 uint64_t gen, obj;
428870ff 711 int bonuslen;
50c957f7 712 int dnodesize;
428870ff
BB
713 sa_handle_t *sa_hdl;
714 dmu_object_type_t obj_type;
f30484af 715 sa_bulk_attr_t *sa_attrs;
428870ff
BB
716 int cnt = 0;
717 zfs_acl_locator_cb_t locate = { 0 };
c96c36fa 718 znode_hold_t *zh;
34dc7c2f 719
0037b49e 720 if (zfsvfs->z_replay) {
34dc7c2f 721 obj = vap->va_nodeid;
34dc7c2f
BB
722 now = vap->va_ctime; /* see zfs_replay_create() */
723 gen = vap->va_nblocks; /* ditto */
50c957f7 724 dnodesize = vap->va_fsid; /* ditto */
34dc7c2f
BB
725 } else {
726 obj = 0;
727 gethrestime(&now);
728 gen = dmu_tx_get_txg(tx);
0037b49e 729 dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
34dc7c2f
BB
730 }
731
50c957f7
NB
732 if (dnodesize == 0)
733 dnodesize = DNODE_MIN_SIZE;
734
0037b49e 735 obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
50c957f7 736
428870ff 737 bonuslen = (obj_type == DMU_OT_SA) ?
50c957f7 738 DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
428870ff 739
34dc7c2f
BB
740 /*
741 * Create a new DMU object.
742 */
743 /*
744 * There's currently no mechanism for pre-reading the blocks that will
572e2857 745 * be needed to allocate a new object, so we accept the small chance
34dc7c2f
BB
746 * that there will be an i/o error and we will fail one of the
747 * assertions below.
748 */
3558fd73 749 if (S_ISDIR(vap->va_mode)) {
0037b49e
BB
750 if (zfsvfs->z_replay) {
751 VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
752 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
50c957f7 753 obj_type, bonuslen, dnodesize, tx));
34dc7c2f 754 } else {
0037b49e
BB
755 obj = zap_create_norm_dnsize(zfsvfs->z_os,
756 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
50c957f7 757 obj_type, bonuslen, dnodesize, tx);
34dc7c2f
BB
758 }
759 } else {
0037b49e
BB
760 if (zfsvfs->z_replay) {
761 VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
34dc7c2f 762 DMU_OT_PLAIN_FILE_CONTENTS, 0,
50c957f7 763 obj_type, bonuslen, dnodesize, tx));
34dc7c2f 764 } else {
0037b49e 765 obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
34dc7c2f 766 DMU_OT_PLAIN_FILE_CONTENTS, 0,
50c957f7 767 obj_type, bonuslen, dnodesize, tx);
34dc7c2f
BB
768 }
769 }
34dc7c2f 770
0037b49e 771 zh = zfs_znode_hold_enter(zfsvfs, obj);
9631681b 772 VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
34dc7c2f
BB
773
774 /*
775 * If this is the root, fix up the half-initialized parent pointer
776 * to reference the just-allocated physical data area.
777 */
778 if (flag & IS_ROOT_NODE) {
34dc7c2f
BB
779 dzp->z_id = obj;
780 }
781
782 /*
783 * If parent is an xattr, so am I.
784 */
9c5167d1 785 if (dzp->z_pflags & ZFS_XATTR) {
34dc7c2f 786 flag |= IS_XATTR;
34dc7c2f
BB
787 }
788
0037b49e 789 if (zfsvfs->z_use_fuids)
428870ff
BB
790 pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
791 else
792 pflags = 0;
34dc7c2f 793
3558fd73 794 if (S_ISDIR(vap->va_mode)) {
428870ff 795 size = 2; /* contents ("." and "..") */
dfbc8630 796 links = 2;
428870ff 797 } else {
dfbc8630 798 size = 0;
ace1eae8 799 links = (flag & IS_TMPFILE) ? 0 : 1;
34dc7c2f
BB
800 }
801
aa6d8c10 802 if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
dc1d7665 803 rdev = vap->va_rdev;
428870ff
BB
804
805 parent = dzp->z_id;
806 mode = acl_ids->z_mode;
34dc7c2f 807 if (flag & IS_XATTR)
428870ff 808 pflags |= ZFS_XATTR;
34dc7c2f 809
9c5167d1
NF
810 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
811 /*
812 * With ZFS_PROJID flag, we can easily know whether there is
813 * project ID stored on disk or not. See zfs_space_delta_cb().
814 */
815 if (obj_type != DMU_OT_ZNODE &&
816 dmu_objset_projectquota_enabled(zfsvfs->z_os))
817 pflags |= ZFS_PROJID;
818
819 /*
820 * Inherit project ID from parent if required.
821 */
822 projid = zfs_inherit_projid(dzp);
823 if (dzp->z_pflags & ZFS_PROJINHERIT)
824 pflags |= ZFS_PROJINHERIT;
825 }
826
428870ff
BB
827 /*
828 * No execs denied will be deterimed when zfs_mode_compute() is called.
829 */
830 pflags |= acl_ids->z_aclp->z_hints &
831 (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
832 ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
34dc7c2f 833
428870ff
BB
834 ZFS_TIME_ENCODE(&now, crtime);
835 ZFS_TIME_ENCODE(&now, ctime);
34dc7c2f 836
3558fd73 837 if (vap->va_mask & ATTR_ATIME) {
428870ff 838 ZFS_TIME_ENCODE(&vap->va_atime, atime);
34dc7c2f 839 } else {
428870ff 840 ZFS_TIME_ENCODE(&now, atime);
34dc7c2f
BB
841 }
842
3558fd73 843 if (vap->va_mask & ATTR_MTIME) {
428870ff
BB
844 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
845 } else {
846 ZFS_TIME_ENCODE(&now, mtime);
847 }
848
849 /* Now add in all of the "SA" attributes */
0037b49e 850 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
428870ff
BB
851 &sa_hdl));
852
853 /*
854 * Setup the array of attributes to be replaced/set on the new file
855 *
856 * order for DMU_OT_ZNODE is critical since it needs to be constructed
857 * in the old znode_phys_t format. Don't change this ordering
858 */
79c76d5b 859 sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
428870ff
BB
860
861 if (obj_type == DMU_OT_ZNODE) {
0037b49e 862 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
428870ff 863 NULL, &atime, 16);
0037b49e 864 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
428870ff 865 NULL, &mtime, 16);
0037b49e 866 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
428870ff 867 NULL, &ctime, 16);
0037b49e 868 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
428870ff 869 NULL, &crtime, 16);
0037b49e 870 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
428870ff 871 NULL, &gen, 8);
0037b49e 872 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
428870ff 873 NULL, &mode, 8);
0037b49e 874 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
428870ff 875 NULL, &size, 8);
0037b49e 876 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
428870ff 877 NULL, &parent, 8);
34dc7c2f 878 } else {
0037b49e 879 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
428870ff 880 NULL, &mode, 8);
0037b49e 881 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
428870ff 882 NULL, &size, 8);
0037b49e 883 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
428870ff 884 NULL, &gen, 8);
0037b49e 885 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
3558fd73 886 NULL, &acl_ids->z_fuid, 8);
0037b49e 887 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
3558fd73 888 NULL, &acl_ids->z_fgid, 8);
0037b49e 889 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
428870ff 890 NULL, &parent, 8);
0037b49e 891 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
428870ff 892 NULL, &pflags, 8);
0037b49e 893 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
428870ff 894 NULL, &atime, 16);
0037b49e 895 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
428870ff 896 NULL, &mtime, 16);
0037b49e 897 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
428870ff 898 NULL, &ctime, 16);
0037b49e 899 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
428870ff
BB
900 NULL, &crtime, 16);
901 }
902
0037b49e 903 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
428870ff
BB
904
905 if (obj_type == DMU_OT_ZNODE) {
0037b49e 906 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
428870ff 907 &empty_xattr, 8);
9c5167d1
NF
908 } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
909 pflags & ZFS_PROJID) {
910 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
911 NULL, &projid, 8);
34dc7c2f 912 }
428870ff 913 if (obj_type == DMU_OT_ZNODE ||
aa6d8c10 914 (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
0037b49e 915 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
428870ff 916 NULL, &rdev, 8);
428870ff
BB
917 }
918 if (obj_type == DMU_OT_ZNODE) {
0037b49e 919 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
428870ff 920 NULL, &pflags, 8);
0037b49e 921 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
428870ff 922 &acl_ids->z_fuid, 8);
0037b49e 923 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
428870ff 924 &acl_ids->z_fgid, 8);
0037b49e 925 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
428870ff 926 sizeof (uint64_t) * 4);
0037b49e 927 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
428870ff
BB
928 &acl_phys, sizeof (zfs_acl_phys_t));
929 } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
0037b49e 930 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
428870ff
BB
931 &acl_ids->z_aclp->z_acl_count, 8);
932 locate.cb_aclp = acl_ids->z_aclp;
0037b49e 933 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
428870ff
BB
934 zfs_acl_data_locator, &locate,
935 acl_ids->z_aclp->z_acl_bytes);
936 mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
937 acl_ids->z_fuid, acl_ids->z_fgid);
938 }
939
940 VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
34dc7c2f 941
34dc7c2f 942 if (!(flag & IS_ROOT_NODE)) {
8d703987
BB
943 /*
944 * The call to zfs_znode_alloc() may fail if memory is low
945 * via the call path: alloc_inode() -> inode_init_always() ->
946 * security_inode_alloc() -> inode_alloc_security(). Since
947 * the existing code is written such that zfs_mknode() can
948 * not fail retry until sufficient memory has been reclaimed.
949 */
950 do {
951 *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, obj,
952 sa_hdl);
953 } while (*zpp == NULL);
954
7b3e34ba
BB
955 VERIFY(*zpp != NULL);
956 VERIFY(dzp != NULL);
34dc7c2f
BB
957 } else {
958 /*
959 * If we are creating the root node, the "parent" we
960 * passed in is the znode for the root.
961 */
962 *zpp = dzp;
428870ff
BB
963
964 (*zpp)->z_sa_hdl = sa_hdl;
34dc7c2f 965 }
428870ff
BB
966
967 (*zpp)->z_pflags = pflags;
12fa7f34 968 (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
50c957f7 969 (*zpp)->z_dnodesize = dnodesize;
9c5167d1 970 (*zpp)->z_projid = projid;
428870ff 971
428870ff
BB
972 if (obj_type == DMU_OT_ZNODE ||
973 acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
b0bc7a84 974 VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
428870ff 975 }
d1d7e268 976 kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
0037b49e 977 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
978}
979
5484965a 980/*
d3cc8b15
WA
981 * Update in-core attributes. It is assumed the caller will be doing an
982 * sa_bulk_update to push the changes out.
5484965a
BB
983 */
984void
985zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
986{
987 xoptattr_t *xoap;
7bb1325f 988 boolean_t update_inode = B_FALSE;
5484965a
BB
989
990 xoap = xva_getxoptattr(xvap);
991 ASSERT(xoap);
992
993 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
994 uint64_t times[2];
995 ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
996 (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
997 &times, sizeof (times), tx);
998 XVA_SET_RTN(xvap, XAT_CREATETIME);
999 }
1000 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
1001 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
1002 zp->z_pflags, tx);
1003 XVA_SET_RTN(xvap, XAT_READONLY);
1004 }
1005 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
1006 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
1007 zp->z_pflags, tx);
1008 XVA_SET_RTN(xvap, XAT_HIDDEN);
1009 }
1010 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1011 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
1012 zp->z_pflags, tx);
1013 XVA_SET_RTN(xvap, XAT_SYSTEM);
1014 }
1015 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1016 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
1017 zp->z_pflags, tx);
1018 XVA_SET_RTN(xvap, XAT_ARCHIVE);
1019 }
1020 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1021 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
1022 zp->z_pflags, tx);
1023 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
64c688d7 1024
7bb1325f 1025 update_inode = B_TRUE;
5484965a
BB
1026 }
1027 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1028 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
1029 zp->z_pflags, tx);
1030 XVA_SET_RTN(xvap, XAT_NOUNLINK);
1031 }
1032 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1033 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
1034 zp->z_pflags, tx);
1035 XVA_SET_RTN(xvap, XAT_APPENDONLY);
64c688d7 1036
7bb1325f 1037 update_inode = B_TRUE;
5484965a
BB
1038 }
1039 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1040 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
1041 zp->z_pflags, tx);
1042 XVA_SET_RTN(xvap, XAT_NODUMP);
1043 }
1044 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1045 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
1046 zp->z_pflags, tx);
1047 XVA_SET_RTN(xvap, XAT_OPAQUE);
1048 }
1049 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1050 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
1051 xoap->xoa_av_quarantined, zp->z_pflags, tx);
1052 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1053 }
1054 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1055 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
1056 zp->z_pflags, tx);
1057 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1058 }
1059 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
1060 zfs_sa_set_scanstamp(zp, xvap, tx);
1061 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
1062 }
1063 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1064 ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
1065 zp->z_pflags, tx);
1066 XVA_SET_RTN(xvap, XAT_REPARSE);
1067 }
1068 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1069 ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
1070 zp->z_pflags, tx);
1071 XVA_SET_RTN(xvap, XAT_OFFLINE);
1072 }
1073 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1074 ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
1075 zp->z_pflags, tx);
1076 XVA_SET_RTN(xvap, XAT_SPARSE);
1077 }
9c5167d1
NF
1078 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
1079 ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
1080 zp->z_pflags, tx);
1081 XVA_SET_RTN(xvap, XAT_PROJINHERIT);
1082 }
7bb1325f
CC
1083
1084 if (update_inode)
1085 zfs_set_inode_flags(zp, ZTOI(zp));
5484965a
BB
1086}
1087
34dc7c2f 1088int
0037b49e 1089zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
34dc7c2f
BB
1090{
1091 dmu_object_info_t doi;
1092 dmu_buf_t *db;
1093 znode_t *zp;
c96c36fa 1094 znode_hold_t *zh;
34dc7c2f 1095 int err;
428870ff 1096 sa_handle_t *hdl;
34dc7c2f
BB
1097
1098 *zpp = NULL;
1099
6f9548c4 1100again:
0037b49e 1101 zh = zfs_znode_hold_enter(zfsvfs, obj_num);
34dc7c2f 1102
0037b49e 1103 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
34dc7c2f 1104 if (err) {
0037b49e 1105 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1106 return (err);
1107 }
1108
1109 dmu_object_info_from_db(db, &doi);
428870ff
BB
1110 if (doi.doi_bonus_type != DMU_OT_SA &&
1111 (doi.doi_bonus_type != DMU_OT_ZNODE ||
1112 (doi.doi_bonus_type == DMU_OT_ZNODE &&
1113 doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1114 sa_buf_rele(db, NULL);
0037b49e 1115 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1116 return (SET_ERROR(EINVAL));
34dc7c2f
BB
1117 }
1118
428870ff
BB
1119 hdl = dmu_buf_get_user(db);
1120 if (hdl != NULL) {
36df2843 1121 zp = sa_get_userdata(hdl);
34dc7c2f 1122
8ac67298 1123
34dc7c2f 1124 /*
428870ff
BB
1125 * Since "SA" does immediate eviction we
1126 * should never find a sa handle that doesn't
1127 * know about the znode.
34dc7c2f 1128 */
428870ff
BB
1129
1130 ASSERT3P(zp, !=, NULL);
1131
1132 mutex_enter(&zp->z_lock);
34dc7c2f 1133 ASSERT3U(zp->z_id, ==, obj_num);
98701490
CC
1134 /*
1135 * If igrab() returns NULL the VFS has independently
1136 * determined the inode should be evicted and has
1137 * called iput_final() to start the eviction process.
1138 * The SA handle is still valid but because the VFS
1139 * requires that the eviction succeed we must drop
1140 * our locks and references to allow the eviction to
1141 * complete. The zfs_zget() may then be retried.
1142 *
1143 * This unlikely case could be optimized by registering
1144 * a sops->drop_inode() callback. The callback would
1145 * need to detect the active SA hold thereby informing
1146 * the VFS that this inode should not be evicted.
1147 */
1148 if (igrab(ZTOI(zp)) == NULL) {
1149 mutex_exit(&zp->z_lock);
1150 sa_buf_rele(db, NULL);
0037b49e 1151 zfs_znode_hold_exit(zfsvfs, zh);
98701490
CC
1152 /* inode might need this to finish evict */
1153 cond_resched();
1154 goto again;
34dc7c2f 1155 }
98701490
CC
1156 *zpp = zp;
1157 err = 0;
34dc7c2f 1158 mutex_exit(&zp->z_lock);
f3ad9cd6 1159 sa_buf_rele(db, NULL);
0037b49e 1160 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1161 return (err);
1162 }
1163
1164 /*
3558fd73 1165 * Not found create new znode/vnode but only if file exists.
428870ff
BB
1166 *
1167 * There is a small window where zfs_vget() could
1168 * find this object while a file create is still in
1169 * progress. This is checked for in zfs_znode_alloc()
1170 *
1171 * if zfs_znode_alloc() fails it will drop the hold on the
1172 * bonus buffer.
34dc7c2f 1173 */
0037b49e 1174 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
31b6111f 1175 doi.doi_bonus_type, obj_num, NULL);
428870ff 1176 if (zp == NULL) {
2e528b49 1177 err = SET_ERROR(ENOENT);
428870ff
BB
1178 } else {
1179 *zpp = zp;
1180 }
0037b49e 1181 zfs_znode_hold_exit(zfsvfs, zh);
428870ff 1182 return (err);
34dc7c2f
BB
1183}
1184
1185int
1186zfs_rezget(znode_t *zp)
1187{
0037b49e 1188 zfsvfs_t *zfsvfs = ZTOZSB(zp);
34dc7c2f
BB
1189 dmu_object_info_t doi;
1190 dmu_buf_t *db;
1191 uint64_t obj_num = zp->z_id;
428870ff 1192 uint64_t mode;
dfbc8630 1193 uint64_t links;
9f5f0019 1194 sa_bulk_attr_t bulk[10];
34dc7c2f 1195 int err;
428870ff
BB
1196 int count = 0;
1197 uint64_t gen;
2c6abf15 1198 uint64_t z_uid, z_gid;
9f5f0019 1199 uint64_t atime[2], mtime[2], ctime[2];
9c5167d1 1200 uint64_t projid = ZFS_DEFAULT_PROJID;
c96c36fa 1201 znode_hold_t *zh;
34dc7c2f 1202
cbecb4fb
CC
1203 /*
1204 * skip ctldir, otherwise they will always get invalidated. This will
1205 * cause funny behaviour for the mounted snapdirs. Especially for
1206 * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent
1207 * anyone automount it again as long as someone is still using the
1208 * detached mount.
1209 */
1210 if (zp->z_is_ctldir)
1211 return (0);
1212
0037b49e 1213 zh = zfs_znode_hold_enter(zfsvfs, obj_num);
34dc7c2f 1214
428870ff
BB
1215 mutex_enter(&zp->z_acl_lock);
1216 if (zp->z_acl_cached) {
1217 zfs_acl_free(zp->z_acl_cached);
1218 zp->z_acl_cached = NULL;
1219 }
428870ff 1220 mutex_exit(&zp->z_acl_lock);
7b3e34ba 1221
228b461b 1222 rw_enter(&zp->z_xattr_lock, RW_WRITER);
7b3e34ba
BB
1223 if (zp->z_xattr_cached) {
1224 nvlist_free(zp->z_xattr_cached);
1225 zp->z_xattr_cached = NULL;
1226 }
7b3e34ba
BB
1227 rw_exit(&zp->z_xattr_lock);
1228
428870ff 1229 ASSERT(zp->z_sa_hdl == NULL);
0037b49e 1230 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
34dc7c2f 1231 if (err) {
0037b49e 1232 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1233 return (err);
1234 }
1235
1236 dmu_object_info_from_db(db, &doi);
428870ff
BB
1237 if (doi.doi_bonus_type != DMU_OT_SA &&
1238 (doi.doi_bonus_type != DMU_OT_ZNODE ||
1239 (doi.doi_bonus_type == DMU_OT_ZNODE &&
1240 doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1241 sa_buf_rele(db, NULL);
0037b49e 1242 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1243 return (SET_ERROR(EINVAL));
34dc7c2f
BB
1244 }
1245
0037b49e 1246 zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
428870ff
BB
1247
1248 /* reload cached values */
0037b49e 1249 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
428870ff 1250 &gen, sizeof (gen));
0037b49e 1251 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
428870ff 1252 &zp->z_size, sizeof (zp->z_size));
0037b49e 1253 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
dfbc8630 1254 &links, sizeof (links));
0037b49e 1255 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
428870ff 1256 &zp->z_pflags, sizeof (zp->z_pflags));
0037b49e 1257 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
2c6abf15 1258 &z_uid, sizeof (z_uid));
0037b49e 1259 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
2c6abf15 1260 &z_gid, sizeof (z_gid));
0037b49e 1261 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
428870ff 1262 &mode, sizeof (mode));
0037b49e 1263 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
9f5f0019 1264 &atime, 16);
0037b49e 1265 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
9f5f0019 1266 &mtime, 16);
0037b49e 1267 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
9f5f0019 1268 &ctime, 16);
428870ff 1269
428870ff
BB
1270 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
1271 zfs_znode_dmu_fini(zp);
0037b49e 1272 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1273 return (SET_ERROR(EIO));
428870ff
BB
1274 }
1275
9c5167d1
NF
1276 if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
1277 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
1278 &projid, 8);
1279 if (err != 0 && err != ENOENT) {
1280 zfs_znode_dmu_fini(zp);
1281 zfs_znode_hold_exit(zfsvfs, zh);
1282 return (SET_ERROR(err));
1283 }
1284 }
1285
1286 zp->z_projid = projid;
12fa7f34 1287 zp->z_mode = ZTOI(zp)->i_mode = mode;
2c6abf15
NB
1288 zfs_uid_write(ZTOI(zp), z_uid);
1289 zfs_gid_write(ZTOI(zp), z_gid);
572e2857 1290
9f5f0019
NB
1291 ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
1292 ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
1293 ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);
1294
278f2236 1295 if (gen != ZTOI(zp)->i_generation) {
428870ff 1296 zfs_znode_dmu_fini(zp);
0037b49e 1297 zfs_znode_hold_exit(zfsvfs, zh);
2e528b49 1298 return (SET_ERROR(EIO));
34dc7c2f
BB
1299 }
1300
dfbc8630 1301 set_nlink(ZTOI(zp), (uint32_t)links);
7bb1325f 1302 zfs_set_inode_flags(zp, ZTOI(zp));
dfbc8630 1303
34dc7c2f 1304 zp->z_blksz = doi.doi_data_block_size;
704cd075 1305 zp->z_atime_dirty = 0;
9f5f0019 1306 zfs_inode_update(zp);
34dc7c2f 1307
6a218566
AG
1308 /*
1309 * If the file has zero links, then it has been unlinked on the send
1310 * side and it must be in the received unlinked set.
1311 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
1312 * stale data and to prevent automatical removal of the file in
1313 * zfs_zinactive(). The file will be removed either when it is removed
1314 * on the send side and the next incremental stream is received or
1315 * when the unlinked set gets processed.
1316 */
1317 zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
1318 if (zp->z_unlinked)
1319 zfs_znode_dmu_fini(zp);
1320
0037b49e 1321 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1322
1323 return (0);
1324}
1325
1326void
1327zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1328{
0037b49e
BB
1329 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1330 objset_t *os = zfsvfs->z_os;
34dc7c2f 1331 uint64_t obj = zp->z_id;
572e2857 1332 uint64_t acl_obj = zfs_external_acl(zp);
c96c36fa 1333 znode_hold_t *zh;
34dc7c2f 1334
0037b49e 1335 zh = zfs_znode_hold_enter(zfsvfs, obj);
572e2857
BB
1336 if (acl_obj) {
1337 VERIFY(!zp->z_is_sa);
b128c09f 1338 VERIFY(0 == dmu_object_free(os, acl_obj, tx));
572e2857 1339 }
b128c09f 1340 VERIFY(0 == dmu_object_free(os, obj, tx));
34dc7c2f 1341 zfs_znode_dmu_fini(zp);
0037b49e 1342 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1343}
1344
1345void
1346zfs_zinactive(znode_t *zp)
1347{
0037b49e 1348 zfsvfs_t *zfsvfs = ZTOZSB(zp);
34dc7c2f 1349 uint64_t z_id = zp->z_id;
c96c36fa 1350 znode_hold_t *zh;
34dc7c2f 1351
428870ff 1352 ASSERT(zp->z_sa_hdl);
34dc7c2f
BB
1353
1354 /*
d6bd8eaa 1355 * Don't allow a zfs_zget() while were trying to release this znode.
34dc7c2f 1356 */
0037b49e 1357 zh = zfs_znode_hold_enter(zfsvfs, z_id);
d6bd8eaa 1358
34dc7c2f 1359 mutex_enter(&zp->z_lock);
34dc7c2f
BB
1360
1361 /*
6a218566
AG
1362 * If this was the last reference to a file with no links, remove
1363 * the file from the file system unless the file system is mounted
1364 * read-only. That can happen, for example, if the file system was
1365 * originally read-write, the file was opened, then unlinked and
1366 * the file system was made read-only before the file was finally
1367 * closed. The file will remain in the unlinked set.
34dc7c2f
BB
1368 */
1369 if (zp->z_unlinked) {
6a218566
AG
1370 ASSERT(!zfsvfs->z_issnap);
1371 if (!zfs_is_readonly(zfsvfs)) {
1372 mutex_exit(&zp->z_lock);
1373 zfs_znode_hold_exit(zfsvfs, zh);
1374 zfs_rmnode(zp);
1375 return;
1376 }
34dc7c2f 1377 }
428870ff 1378
34dc7c2f
BB
1379 mutex_exit(&zp->z_lock);
1380 zfs_znode_dmu_fini(zp);
d6bd8eaa 1381
0037b49e 1382 zfs_znode_hold_exit(zfsvfs, zh);
34dc7c2f
BB
1383}
1384
/*
 * Three-way comparison of two timespecs.
 *
 *	IN:	t1, t2	- timestamps to compare
 *
 *	RETURN:	-1 if t1 is earlier than t2, 1 if it is later, 0 if equal.
 *
 * The result is produced by explicit comparison rather than by returning
 * the tv_nsec difference: that difference has type long, and narrowing it
 * to the int return type could drop or flip the sign for values that are
 * not normalized.
 */
static inline int
zfs_compare_timespec(struct timespec *t1, struct timespec *t2)
{
	if (t1->tv_sec != t2->tv_sec)
		return (t1->tv_sec < t2->tv_sec ? -1 : 1);

	if (t1->tv_nsec != t2->tv_nsec)
		return (t1->tv_nsec < t2->tv_nsec ? -1 : 1);

	return (0);
}
1396
6d111134
TC
1397/*
1398 * Prepare to update znode time stamps.
1399 *
1400 * IN: zp - znode requiring timestamp update
0df9673f 1401 * flag - ATTR_MTIME, ATTR_CTIME flags
6d111134 1402 *
0df9673f 1403 * OUT: zp - z_seq
6d111134
TC
1404 * mtime - new mtime
1405 * ctime - new ctime
1406 *
0df9673f
CC
1407 * Note: We don't update atime here, because we rely on Linux VFS to do
1408 * atime updating.
6d111134 1409 */
34dc7c2f 1410void
428870ff 1411zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
0df9673f 1412 uint64_t ctime[2])
34dc7c2f
BB
1413{
1414 timestruc_t now;
1415
34dc7c2f
BB
1416 gethrestime(&now);
1417
0df9673f 1418 zp->z_seq++;
34dc7c2f 1419
3558fd73 1420 if (flag & ATTR_MTIME) {
428870ff 1421 ZFS_TIME_ENCODE(&now, mtime);
9f5f0019 1422 ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
3558fd73 1423 if (ZTOZSB(zp)->z_use_fuids) {
428870ff
BB
1424 zp->z_pflags |= (ZFS_ARCHIVE |
1425 ZFS_AV_MODIFIED);
1426 }
34dc7c2f
BB
1427 }
1428
3558fd73 1429 if (flag & ATTR_CTIME) {
428870ff 1430 ZFS_TIME_ENCODE(&now, ctime);
9f5f0019 1431 ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime);
3558fd73 1432 if (ZTOZSB(zp)->z_use_fuids)
428870ff 1433 zp->z_pflags |= ZFS_ARCHIVE;
34dc7c2f
BB
1434 }
1435}
1436
34dc7c2f
BB
1437/*
1438 * Grow the block size for a file.
1439 *
1440 * IN: zp - znode of file to free data in.
1441 * size - requested block size
1442 * tx - open transaction.
1443 *
1444 * NOTE: this function assumes that the znode is write locked.
1445 */
1446void
1447zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1448{
1449 int error;
1450 u_longlong_t dummy;
1451
1452 if (size <= zp->z_blksz)
1453 return;
1454 /*
1455 * If the file size is already greater than the current blocksize,
1456 * we will not grow. If there is more than one block in a file,
1457 * the blocksize cannot change.
1458 */
428870ff 1459 if (zp->z_blksz && zp->z_size > zp->z_blksz)
34dc7c2f
BB
1460 return;
1461
3558fd73 1462 error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
34dc7c2f 1463 size, 0, tx);
428870ff 1464
34dc7c2f
BB
1465 if (error == ENOTSUP)
1466 return;
c99c9001 1467 ASSERT0(error);
34dc7c2f
BB
1468
1469 /* What blocksize did we actually get? */
428870ff 1470 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
34dc7c2f
BB
1471}
1472
34dc7c2f 1473/*
b128c09f 1474 * Increase the file length
34dc7c2f
BB
1475 *
1476 * IN: zp - znode of file to free data in.
b128c09f 1477 * end - new end-of-file
34dc7c2f 1478 *
19d55079 1479 * RETURN: 0 on success, error code on failure
34dc7c2f 1480 */
b128c09f
BB
1481static int
1482zfs_extend(znode_t *zp, uint64_t end)
34dc7c2f 1483{
0037b49e 1484 zfsvfs_t *zfsvfs = ZTOZSB(zp);
b128c09f 1485 dmu_tx_t *tx;
34dc7c2f 1486 rl_t *rl;
b128c09f 1487 uint64_t newblksz;
34dc7c2f
BB
1488 int error;
1489
34dc7c2f 1490 /*
b128c09f 1491 * We will change zp_size, lock the whole file.
34dc7c2f 1492 */
d88895a0 1493 rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
34dc7c2f
BB
1494
1495 /*
1496 * Nothing to do if file already at desired length.
1497 */
428870ff 1498 if (end <= zp->z_size) {
34dc7c2f
BB
1499 zfs_range_unlock(rl);
1500 return (0);
1501 }
0037b49e 1502 tx = dmu_tx_create(zfsvfs->z_os);
428870ff
BB
1503 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1504 zfs_sa_upgrade_txholds(tx, zp);
b128c09f 1505 if (end > zp->z_blksz &&
0037b49e 1506 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
34dc7c2f
BB
1507 /*
1508 * We are growing the file past the current block size.
1509 */
3558fd73 1510 if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
f1512ee6
MA
1511 /*
1512 * File's blocksize is already larger than the
1513 * "recordsize" property. Only let it grow to
1514 * the next power of 2.
1515 */
34dc7c2f 1516 ASSERT(!ISP2(zp->z_blksz));
f1512ee6 1517 newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
34dc7c2f 1518 } else {
3558fd73 1519 newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
34dc7c2f 1520 }
b128c09f
BB
1521 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1522 } else {
1523 newblksz = 0;
34dc7c2f
BB
1524 }
1525
384f8a09 1526 error = dmu_tx_assign(tx, TXG_WAIT);
34dc7c2f 1527 if (error) {
34dc7c2f
BB
1528 dmu_tx_abort(tx);
1529 zfs_range_unlock(rl);
1530 return (error);
1531 }
1532
b128c09f
BB
1533 if (newblksz)
1534 zfs_grow_blocksize(zp, newblksz, tx);
34dc7c2f 1535
428870ff
BB
1536 zp->z_size = end;
1537
3558fd73 1538 VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
428870ff 1539 &zp->z_size, sizeof (zp->z_size), tx));
34dc7c2f 1540
b128c09f 1541 zfs_range_unlock(rl);
34dc7c2f 1542
b128c09f 1543 dmu_tx_commit(tx);
34dc7c2f 1544
b128c09f
BB
1545 return (0);
1546}
1547
223df016
TC
1548/*
1549 * zfs_zero_partial_page - Modeled after update_pages() but
1550 * with different arguments and semantics for use by zfs_freesp().
1551 *
1552 * Zeroes a piece of a single page cache entry for zp at offset
1553 * start and length len.
1554 *
1555 * Caller must acquire a range lock on the file for the region
1556 * being zeroed in order that the ARC and page cache stay in sync.
1557 */
1558static void
1559zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
1560{
1561 struct address_space *mp = ZTOI(zp)->i_mapping;
1562 struct page *pp;
1563 int64_t off;
1564 void *pb;
1565
8b1899d3 1566 ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));
223df016 1567
8b1899d3
BB
1568 off = start & (PAGE_SIZE - 1);
1569 start &= PAGE_MASK;
223df016 1570
8b1899d3 1571 pp = find_lock_page(mp, start >> PAGE_SHIFT);
223df016
TC
1572 if (pp) {
1573 if (mapping_writably_mapped(mp))
1574 flush_dcache_page(pp);
1575
1576 pb = kmap(pp);
1577 bzero(pb + off, len);
1578 kunmap(pp);
1579
1580 if (mapping_writably_mapped(mp))
1581 flush_dcache_page(pp);
1582
1583 mark_page_accessed(pp);
1584 SetPageUptodate(pp);
1585 ClearPageError(pp);
1586 unlock_page(pp);
8b1899d3 1587 put_page(pp);
223df016
TC
1588 }
1589}
1590
b128c09f
BB
1591/*
1592 * Free space in a file.
1593 *
1594 * IN: zp - znode of file to free data in.
1595 * off - start of section to free.
1596 * len - length of section to free.
1597 *
19d55079 1598 * RETURN: 0 on success, error code on failure
b128c09f
BB
1599 */
1600static int
1601zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1602{
0037b49e 1603 zfsvfs_t *zfsvfs = ZTOZSB(zp);
b128c09f
BB
1604 rl_t *rl;
1605 int error;
1606
1607 /*
1608 * Lock the range being freed.
1609 */
d88895a0 1610 rl = zfs_range_lock(&zp->z_range_lock, off, len, RL_WRITER);
b128c09f
BB
1611
1612 /*
1613 * Nothing to do if file already at desired length.
1614 */
428870ff 1615 if (off >= zp->z_size) {
b128c09f
BB
1616 zfs_range_unlock(rl);
1617 return (0);
34dc7c2f
BB
1618 }
1619
428870ff
BB
1620 if (off + len > zp->z_size)
1621 len = zp->z_size - off;
b128c09f 1622
0037b49e 1623 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
b128c09f 1624
223df016
TC
1625 /*
1626 * Zero partial page cache entries. This must be done under a
1627 * range lock in order to keep the ARC and page cache in sync.
1628 */
1629 if (zp->z_is_mapped) {
1630 loff_t first_page, last_page, page_len;
1631 loff_t first_page_offset, last_page_offset;
1632
1633 /* first possible full page in hole */
8b1899d3 1634 first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
223df016 1635 /* last page of hole */
8b1899d3 1636 last_page = (off + len) >> PAGE_SHIFT;
223df016
TC
1637
1638 /* offset of first_page */
8b1899d3 1639 first_page_offset = first_page << PAGE_SHIFT;
223df016 1640 /* offset of last_page */
8b1899d3 1641 last_page_offset = last_page << PAGE_SHIFT;
223df016 1642
cb08f063
TC
1643 /* truncate whole pages */
1644 if (last_page_offset > first_page_offset) {
1645 truncate_inode_pages_range(ZTOI(zp)->i_mapping,
1646 first_page_offset, last_page_offset - 1);
1647 }
1648
1649 /* truncate sub-page ranges */
223df016
TC
1650 if (first_page > last_page) {
1651 /* entire punched area within a single page */
1652 zfs_zero_partial_page(zp, off, len);
1653 } else {
1654 /* beginning of punched area at the end of a page */
1655 page_len = first_page_offset - off;
1656 if (page_len > 0)
1657 zfs_zero_partial_page(zp, off, page_len);
1658
1659 /* end of punched area at the beginning of a page */
1660 page_len = off + len - last_page_offset;
1661 if (page_len > 0)
1662 zfs_zero_partial_page(zp, last_page_offset,
1663 page_len);
1664 }
1665 }
34dc7c2f
BB
1666 zfs_range_unlock(rl);
1667
b128c09f
BB
1668 return (error);
1669}
1670
1671/*
1672 * Truncate a file
1673 *
1674 * IN: zp - znode of file to free data in.
1675 * end - new end-of-file.
1676 *
19d55079 1677 * RETURN: 0 on success, error code on failure
b128c09f
BB
1678 */
1679static int
1680zfs_trunc(znode_t *zp, uint64_t end)
1681{
0037b49e 1682 zfsvfs_t *zfsvfs = ZTOZSB(zp);
b128c09f
BB
1683 dmu_tx_t *tx;
1684 rl_t *rl;
1685 int error;
572e2857
BB
1686 sa_bulk_attr_t bulk[2];
1687 int count = 0;
b128c09f
BB
1688
1689 /*
1690 * We will change zp_size, lock the whole file.
1691 */
d88895a0 1692 rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
b128c09f
BB
1693
1694 /*
1695 * Nothing to do if file already at desired length.
1696 */
428870ff 1697 if (end >= zp->z_size) {
b128c09f
BB
1698 zfs_range_unlock(rl);
1699 return (0);
1700 }
1701
18a2485f
FS
1702 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
1703 DMU_OBJECT_END);
b128c09f
BB
1704 if (error) {
1705 zfs_range_unlock(rl);
1706 return (error);
1707 }
0037b49e 1708 tx = dmu_tx_create(zfsvfs->z_os);
428870ff
BB
1709 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1710 zfs_sa_upgrade_txholds(tx, zp);
19d55079 1711 dmu_tx_mark_netfree(tx);
7a8f0e80 1712 error = dmu_tx_assign(tx, TXG_WAIT);
b128c09f 1713 if (error) {
b128c09f
BB
1714 dmu_tx_abort(tx);
1715 zfs_range_unlock(rl);
1716 return (error);
1717 }
b128c09f 1718
428870ff 1719 zp->z_size = end;
0037b49e 1720 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
572e2857 1721 NULL, &zp->z_size, sizeof (zp->z_size));
428870ff 1722
572e2857
BB
1723 if (end == 0) {
1724 zp->z_pflags &= ~ZFS_SPARSE;
0037b49e 1725 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
572e2857
BB
1726 NULL, &zp->z_pflags, 8);
1727 }
1728 VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
b128c09f 1729
34dc7c2f
BB
1730 dmu_tx_commit(tx);
1731
d164b209 1732 zfs_range_unlock(rl);
34dc7c2f
BB
1733
1734 return (0);
1735}
1736
b128c09f
BB
1737/*
1738 * Free space in a file
1739 *
1740 * IN: zp - znode of file to free data in.
1741 * off - start of range
1742 * len - end of range (0 => EOF)
1743 * flag - current file open mode flags.
1744 * log - TRUE if this action should be logged
1745 *
19d55079 1746 * RETURN: 0 on success, error code on failure
b128c09f
BB
1747 */
1748int
1749zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1750{
b128c09f 1751 dmu_tx_t *tx;
0037b49e
BB
1752 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1753 zilog_t *zilog = zfsvfs->z_log;
428870ff
BB
1754 uint64_t mode;
1755 uint64_t mtime[2], ctime[2];
1756 sa_bulk_attr_t bulk[3];
1757 int count = 0;
b128c09f
BB
1758 int error;
1759
0037b49e 1760 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
428870ff
BB
1761 sizeof (mode))) != 0)
1762 return (error);
1763
1764 if (off > zp->z_size) {
b128c09f
BB
1765 error = zfs_extend(zp, off+len);
1766 if (error == 0 && log)
1767 goto log;
223df016 1768 goto out;
b128c09f
BB
1769 }
1770
b128c09f
BB
1771 if (len == 0) {
1772 error = zfs_trunc(zp, off);
1773 } else {
1774 if ((error = zfs_free_range(zp, off, len)) == 0 &&
428870ff 1775 off + len > zp->z_size)
b128c09f
BB
1776 error = zfs_extend(zp, off+len);
1777 }
1778 if (error || !log)
223df016 1779 goto out;
b128c09f 1780log:
0037b49e 1781 tx = dmu_tx_create(zfsvfs->z_os);
428870ff
BB
1782 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1783 zfs_sa_upgrade_txholds(tx, zp);
384f8a09 1784 error = dmu_tx_assign(tx, TXG_WAIT);
b128c09f 1785 if (error) {
b128c09f 1786 dmu_tx_abort(tx);
223df016 1787 goto out;
b128c09f
BB
1788 }
1789
0037b49e
BB
1790 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
1791 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
1792 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
428870ff 1793 NULL, &zp->z_pflags, 8);
0df9673f 1794 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
428870ff
BB
1795 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1796 ASSERT(error == 0);
1797
b128c09f
BB
1798 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1799
1800 dmu_tx_commit(tx);
223df016 1801
960e08fe 1802 zfs_inode_update(zp);
223df016
TC
1803 error = 0;
1804
1805out:
1806 /*
1807 * Truncate the page cache - for file truncate operations, use
1808 * the purpose-built API for truncations. For punching operations,
cb08f063 1809 * the truncation is handled under a range lock in zfs_free_range.
223df016
TC
1810 */
1811 if (len == 0)
1812 truncate_setsize(ZTOI(zp), off);
223df016 1813 return (error);
b128c09f
BB
1814}
1815
34dc7c2f
BB
1816void
1817zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1818{
22872ff5 1819 struct super_block *sb;
0037b49e 1820 zfsvfs_t *zfsvfs;
428870ff 1821 uint64_t moid, obj, sa_obj, version;
22872ff5 1822 uint64_t sense = ZFS_CASE_SENSITIVE;
34dc7c2f
BB
1823 uint64_t norm = 0;
1824 nvpair_t *elem;
c96c36fa 1825 int size;
34dc7c2f 1826 int error;
22872ff5
BB
1827 int i;
1828 znode_t *rootzp = NULL;
1829 vattr_t vattr;
1830 znode_t *zp;
1831 zfs_acl_ids_t acl_ids;
34dc7c2f
BB
1832
1833 /*
1834 * First attempt to create master node.
1835 */
1836 /*
1837 * In an empty objset, there are no blocks to read and thus
1838 * there can be no i/o errors (which we assert below).
1839 */
1840 moid = MASTER_NODE_OBJ;
1841 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1842 DMU_OT_NONE, 0, tx);
1843 ASSERT(error == 0);
1844
1845 /*
1846 * Set starting attributes.
1847 */
428870ff 1848 version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
34dc7c2f
BB
1849 elem = NULL;
1850 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1851 /* For the moment we expect all zpl props to be uint64_ts */
1852 uint64_t val;
1853 char *name;
1854
1855 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1856 VERIFY(nvpair_value_uint64(elem, &val) == 0);
1857 name = nvpair_name(elem);
1858 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
9babb374
BB
1859 if (val < version)
1860 version = val;
34dc7c2f
BB
1861 } else {
1862 error = zap_update(os, moid, name, 8, 1, &val, tx);
1863 }
1864 ASSERT(error == 0);
1865 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1866 norm = val;
22872ff5
BB
1867 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1868 sense = val;
34dc7c2f
BB
1869 }
1870 ASSERT(version != 0);
9babb374 1871 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
34dc7c2f 1872
428870ff
BB
1873 /*
1874 * Create zap object used for SA attribute registration
1875 */
1876
1877 if (version >= ZPL_VERSION_SA) {
1878 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
1879 DMU_OT_NONE, 0, tx);
1880 error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
1881 ASSERT(error == 0);
1882 } else {
1883 sa_obj = 0;
1884 }
34dc7c2f
BB
1885 /*
1886 * Create a delete queue.
1887 */
9babb374 1888 obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
34dc7c2f 1889
9babb374 1890 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
34dc7c2f
BB
1891 ASSERT(error == 0);
1892
9babb374 1893 /*
0037b49e 1894 * Create root znode. Create minimal znode/inode/zfsvfs/sb
22872ff5 1895 * to allow zfs_mknode to work.
9babb374 1896 */
22872ff5
BB
1897 vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
1898 vattr.va_mode = S_IFDIR|0755;
1899 vattr.va_uid = crgetuid(cr);
1900 vattr.va_gid = crgetgid(cr);
1901
79c76d5b 1902 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
22872ff5
BB
1903 rootzp->z_moved = 0;
1904 rootzp->z_unlinked = 0;
1905 rootzp->z_atime_dirty = 0;
1906 rootzp->z_is_sa = USE_SA(version, os);
9c5167d1 1907 rootzp->z_pflags = 0;
22872ff5 1908
0037b49e
BB
1909 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1910 zfsvfs->z_os = os;
1911 zfsvfs->z_parent = zfsvfs;
1912 zfsvfs->z_version = version;
1913 zfsvfs->z_use_fuids = USE_FUIDS(version, os);
1914 zfsvfs->z_use_sa = USE_SA(version, os);
1915 zfsvfs->z_norm = norm;
22872ff5 1916
79c76d5b 1917 sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
0037b49e 1918 sb->s_fs_info = zfsvfs;
22872ff5
BB
1919
1920 ZTOI(rootzp)->i_sb = sb;
1921
1922 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
0037b49e 1923 &zfsvfs->z_attr_table);
9babb374 1924
22872ff5 1925 ASSERT(error == 0);
9babb374 1926
60101509 1927 /*
22872ff5
BB
1928 * Fold case on file systems that are always or sometimes case
1929 * insensitive.
60101509 1930 */
22872ff5 1931 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
0037b49e 1932 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
60101509 1933
0037b49e
BB
1934 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1935 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
22872ff5 1936 offsetof(znode_t, z_link_node));
60101509 1937
c96c36fa 1938 size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
0037b49e
BB
1939 zfsvfs->z_hold_size = size;
1940 zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
1941 KM_SLEEP);
1942 zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
c96c36fa 1943 for (i = 0; i != size; i++) {
0037b49e 1944 avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
c96c36fa 1945 sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
0037b49e 1946 mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
c96c36fa 1947 }
60101509 1948
22872ff5
BB
1949 VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
1950 cr, NULL, &acl_ids));
1951 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
1952 ASSERT3P(zp, ==, rootzp);
1953 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1954 ASSERT(error == 0);
1955 zfs_acl_ids_free(&acl_ids);
60101509 1956
22872ff5
BB
1957 atomic_set(&ZTOI(rootzp)->i_count, 0);
1958 sa_handle_destroy(rootzp->z_sa_hdl);
22872ff5
BB
1959 kmem_cache_free(znode_cache, rootzp);
1960
1961 /*
1962 * Create shares directory
1963 */
0037b49e 1964 error = zfs_create_share_dir(zfsvfs, tx);
9babb374 1965 ASSERT(error == 0);
428870ff 1966
c96c36fa 1967 for (i = 0; i != size; i++) {
0037b49e
BB
1968 avl_destroy(&zfsvfs->z_hold_trees[i]);
1969 mutex_destroy(&zfsvfs->z_hold_locks[i]);
c96c36fa 1970 }
2708f716 1971
c17486b2
GN
1972 mutex_destroy(&zfsvfs->z_znodes_lock);
1973
0037b49e
BB
1974 vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
1975 vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
2708f716 1976 kmem_free(sb, sizeof (struct super_block));
0037b49e 1977 kmem_free(zfsvfs, sizeof (zfsvfs_t));
34dc7c2f 1978}
34dc7c2f 1979#endif /* _KERNEL */
428870ff 1980
34dc7c2f 1981static int
572e2857
BB
1982zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
1983{
1984 uint64_t sa_obj = 0;
1985 int error;
1986
1987 error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
1988 if (error != 0 && error != ENOENT)
1989 return (error);
1990
1991 error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
1992 return (error);
1993}
1994
1995static int
1996zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
7b8518cb 1997 dmu_buf_t **db, void *tag)
34dc7c2f 1998{
34dc7c2f 1999 dmu_object_info_t doi;
34dc7c2f 2000 int error;
428870ff 2001
7b8518cb 2002 if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
34dc7c2f
BB
2003 return (error);
2004
572e2857 2005 dmu_object_info_from_db(*db, &doi);
428870ff
BB
2006 if ((doi.doi_bonus_type != DMU_OT_SA &&
2007 doi.doi_bonus_type != DMU_OT_ZNODE) ||
d6320ddb
BB
2008 (doi.doi_bonus_type == DMU_OT_ZNODE &&
2009 doi.doi_bonus_size < sizeof (znode_phys_t))) {
7b8518cb 2010 sa_buf_rele(*db, tag);
2e528b49 2011 return (SET_ERROR(ENOTSUP));
34dc7c2f
BB
2012 }
2013
572e2857
BB
2014 error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
2015 if (error != 0) {
7b8518cb 2016 sa_buf_rele(*db, tag);
428870ff
BB
2017 return (error);
2018 }
2019
572e2857
BB
2020 return (0);
2021}
2022
2023void
7b8518cb 2024zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
572e2857
BB
2025{
2026 sa_handle_destroy(hdl);
7b8518cb 2027 sa_buf_rele(db, tag);
572e2857
BB
2028}
2029
2030/*
2031 * Given an object number, return its parent object number and whether
2032 * or not the object is an extended attribute directory.
2033 */
2034static int
b23ad7f3
JJ
2035zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
2036 uint64_t *pobjp, int *is_xattrdir)
572e2857
BB
2037{
2038 uint64_t parent;
2039 uint64_t pflags;
2040 uint64_t mode;
b23ad7f3 2041 uint64_t parent_mode;
572e2857 2042 sa_bulk_attr_t bulk[3];
b23ad7f3
JJ
2043 sa_handle_t *sa_hdl;
2044 dmu_buf_t *sa_db;
572e2857
BB
2045 int count = 0;
2046 int error;
2047
2048 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
2049 &parent, sizeof (parent));
428870ff 2050 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
572e2857 2051 &pflags, sizeof (pflags));
428870ff 2052 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
572e2857 2053 &mode, sizeof (mode));
428870ff 2054
572e2857 2055 if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
428870ff 2056 return (error);
572e2857 2057
b23ad7f3
JJ
2058 /*
2059 * When a link is removed its parent pointer is not changed and will
2060 * be invalid. There are two cases where a link is removed but the
2061 * file stays around, when it goes to the delete queue and when there
2062 * are additional links.
2063 */
2064 error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
2065 if (error != 0)
2066 return (error);
2067
2068 error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
2069 zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2070 if (error != 0)
2071 return (error);
2072
428870ff 2073 *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
34dc7c2f 2074
b23ad7f3
JJ
2075 /*
2076 * Extended attributes can be applied to files, directories, etc.
2077 * Otherwise the parent must be a directory.
2078 */
2079 if (!*is_xattrdir && !S_ISDIR(parent_mode))
ecb2b7dc 2080 return (SET_ERROR(EINVAL));
b23ad7f3
JJ
2081
2082 *pobjp = parent;
2083
34dc7c2f
BB
2084 return (0);
2085}
2086
572e2857
BB
2087/*
2088 * Given an object number, return some zpl level statistics
2089 */
2090static int
2091zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
2092 zfs_stat_t *sb)
34dc7c2f 2093{
572e2857
BB
2094 sa_bulk_attr_t bulk[4];
2095 int count = 0;
2096
2097 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2098 &sb->zs_mode, sizeof (sb->zs_mode));
2099 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
2100 &sb->zs_gen, sizeof (sb->zs_gen));
2101 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
2102 &sb->zs_links, sizeof (sb->zs_links));
2103 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
2104 &sb->zs_ctime, sizeof (sb->zs_ctime));
2105
2106 return (sa_bulk_lookup(hdl, bulk, count));
2107}
2108
2109static int
2110zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
2111 sa_attr_type_t *sa_table, char *buf, int len)
2112{
2113 sa_handle_t *sa_hdl;
2114 sa_handle_t *prevhdl = NULL;
2115 dmu_buf_t *prevdb = NULL;
2116 dmu_buf_t *sa_db = NULL;
34dc7c2f
BB
2117 char *path = buf + len - 1;
2118 int error;
2119
2120 *path = '\0';
572e2857 2121 sa_hdl = hdl;
428870ff 2122
64c1dcef
PD
2123 uint64_t deleteq_obj;
2124 VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
2125 ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
2126 error = zap_lookup_int(osp, deleteq_obj, obj);
2127 if (error == 0) {
2128 return (ESTALE);
2129 } else if (error != ENOENT) {
2130 return (error);
2131 }
2132 error = 0;
2133
34dc7c2f 2134 for (;;) {
17897ce2 2135 uint64_t pobj = 0;
34dc7c2f
BB
2136 char component[MAXNAMELEN + 2];
2137 size_t complen;
17897ce2 2138 int is_xattrdir = 0;
34dc7c2f 2139
572e2857 2140 if (prevdb)
7b8518cb 2141 zfs_release_sa_handle(prevhdl, prevdb, FTAG);
572e2857 2142
b23ad7f3 2143 if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
572e2857 2144 &is_xattrdir)) != 0)
34dc7c2f
BB
2145 break;
2146
2147 if (pobj == obj) {
2148 if (path[0] != '/')
2149 *--path = '/';
2150 break;
2151 }
2152
2153 component[0] = '/';
2154 if (is_xattrdir) {
2155 (void) sprintf(component + 1, "<xattrdir>");
2156 } else {
2157 error = zap_value_search(osp, pobj, obj,
2158 ZFS_DIRENT_OBJ(-1ULL), component + 1);
2159 if (error != 0)
2160 break;
2161 }
2162
2163 complen = strlen(component);
2164 path -= complen;
2165 ASSERT(path >= buf);
2166 bcopy(component, path, complen);
2167 obj = pobj;
572e2857
BB
2168
2169 if (sa_hdl != hdl) {
2170 prevhdl = sa_hdl;
2171 prevdb = sa_db;
2172 }
7b8518cb 2173 error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
572e2857
BB
2174 if (error != 0) {
2175 sa_hdl = prevhdl;
2176 sa_db = prevdb;
2177 break;
2178 }
2179 }
2180
2181 if (sa_hdl != NULL && sa_hdl != hdl) {
2182 ASSERT(sa_db != NULL);
7b8518cb 2183 zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
34dc7c2f
BB
2184 }
2185
2186 if (error == 0)
2187 (void) memmove(buf, path, buf + len - path);
428870ff 2188
34dc7c2f
BB
2189 return (error);
2190}
572e2857
BB
2191
2192int
2193zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
2194{
2195 sa_attr_type_t *sa_table;
2196 sa_handle_t *hdl;
2197 dmu_buf_t *db;
2198 int error;
2199
2200 error = zfs_sa_setup(osp, &sa_table);
2201 if (error != 0)
2202 return (error);
2203
7b8518cb 2204 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
572e2857
BB
2205 if (error != 0)
2206 return (error);
2207
2208 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2209
7b8518cb 2210 zfs_release_sa_handle(hdl, db, FTAG);
572e2857
BB
2211 return (error);
2212}
2213
2214int
2215zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
2216 char *buf, int len)
2217{
2218 char *path = buf + len - 1;
2219 sa_attr_type_t *sa_table;
2220 sa_handle_t *hdl;
2221 dmu_buf_t *db;
2222 int error;
2223
2224 *path = '\0';
2225
2226 error = zfs_sa_setup(osp, &sa_table);
2227 if (error != 0)
2228 return (error);
2229
7b8518cb 2230 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
572e2857
BB
2231 if (error != 0)
2232 return (error);
2233
2234 error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
2235 if (error != 0) {
7b8518cb 2236 zfs_release_sa_handle(hdl, db, FTAG);
572e2857
BB
2237 return (error);
2238 }
2239
2240 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2241
7b8518cb 2242 zfs_release_sa_handle(hdl, db, FTAG);
572e2857
BB
2243 return (error);
2244}
c28b2279
BB
2245
/*
 * Kernel-module glue, compiled only when both _KERNEL and HAVE_SPL are
 * defined: export zfs_create_fs and zfs_obj_to_path, and expose
 * zfs_object_mutex_size as a uint module parameter with mode 0644.
 */
2246#if defined(_KERNEL) && defined(HAVE_SPL)
2247EXPORT_SYMBOL(zfs_create_fs);
2248EXPORT_SYMBOL(zfs_obj_to_path);
0720116d 2249
02730c33 2250/* CSTYLED */
0720116d
BB
2251module_param(zfs_object_mutex_size, uint, 0644);
2252MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
c28b2279 2253#endif