/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

/* Portions Copyright 2007 Jeremy Teo */

#ifdef _KERNEL
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/mntent.h>
#include <sys/u8_textprep.h>
#include <sys/dsl_dataset.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/atomic.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/dnode.h>
#include <sys/fs/zfs.h>
#include <sys/zpl.h>
#endif /* _KERNEL */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/zfs_refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/zfs_znode.h>
#include <sys/sa.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_stat.h>

#include "zfs_prop.h"
#include "zfs_comutil.h"

/*
 * Functions needed for userland (i.e. libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
#ifdef _KERNEL

static kmem_cache_t *znode_cache = NULL;
static kmem_cache_t *znode_hold_cache = NULL;
unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;

/*
 * This is used by the test suite so that it can delay znodes from being
 * freed in order to inspect the unlinked set.
 */
static int zfs_unlink_suspend_progress = 0;

/*
 * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
 * z_rangelock. It will modify the offset and length of the lock to reflect
 * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
 * called with the rangelock_t's rl_lock held, which avoids races.
 */
static void
zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
{
	znode_t *zp = arg;

	/*
	 * If in append mode, convert to writer and lock starting at the
	 * current end of file.
	 */
	if (new->lr_type == RL_APPEND) {
		new->lr_offset = zp->z_size;
		new->lr_type = RL_WRITER;
	}

	/*
	 * If we need to grow the block size then lock the whole file range.
	 */
	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
	if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
	    zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
		new->lr_offset = 0;
		new->lr_length = UINT64_MAX;
	}
}
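
/*
 * Illustrative sketch (an assumption about typical usage, not code from
 * this file): a caller appending to the file takes an RL_APPEND lock and
 * the callback above converts it to an RL_WRITER lock anchored at the
 * current end of file:
 *
 *	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
 *	    0, nbytes, RL_APPEND);
 *	// lr->lr_type is now RL_WRITER and lr->lr_offset was set to the
 *	// file size at the time the lock was granted.
 *	zfs_rangelock_exit(lr);
 */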

static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	(void) arg, (void) kmflags;
	znode_t *zp = buf;

	inode_init_once(ZTOI(zp));
	list_link_init(&zp->z_link_node);

	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);

	zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);

	zp->z_dirlocks = NULL;
	zp->z_acl_cached = NULL;
	zp->z_xattr_cached = NULL;
	zp->z_xattr_parent = 0;
	zp->z_sync_writes_cnt = 0;
	zp->z_async_writes_cnt = 0;

	return (0);
}

static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
	(void) arg;
	znode_t *zp = buf;

	ASSERT(!list_link_active(&zp->z_link_node));
	mutex_destroy(&zp->z_lock);
	rw_destroy(&zp->z_parent_lock);
	rw_destroy(&zp->z_name_lock);
	mutex_destroy(&zp->z_acl_lock);
	rw_destroy(&zp->z_xattr_lock);
	zfs_rangelock_fini(&zp->z_rangelock);

	ASSERT3P(zp->z_dirlocks, ==, NULL);
	ASSERT3P(zp->z_acl_cached, ==, NULL);
	ASSERT3P(zp->z_xattr_cached, ==, NULL);

	ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt));
	ASSERT0(atomic_load_32(&zp->z_async_writes_cnt));
}

static int
zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
{
	(void) arg, (void) kmflags;
	znode_hold_t *zh = buf;

	mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
	zh->zh_refcount = 0;

	return (0);
}

static void
zfs_znode_hold_cache_destructor(void *buf, void *arg)
{
	(void) arg;
	znode_hold_t *zh = buf;

	mutex_destroy(&zh->zh_lock);
}

void
zfs_znode_init(void)
{
	/*
	 * Initialize the znode cache. The KMC_SLAB hint is used so that
	 * the cache is backed by kmalloc() on the Linux slab, which is
	 * required for wait_on_bit() operations on the embedded inode to
	 * work properly.
	 */
	ASSERT(znode_cache == NULL);
	znode_cache = kmem_cache_create("zfs_znode_cache",
	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
	    zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);

	ASSERT(znode_hold_cache == NULL);
	znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
	    sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
	    zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
}

void
zfs_znode_fini(void)
{
	/*
	 * Cleanup the znode caches.
	 */
	if (znode_cache)
		kmem_cache_destroy(znode_cache);
	znode_cache = NULL;

	if (znode_hold_cache)
		kmem_cache_destroy(znode_hold_cache);
	znode_hold_cache = NULL;
}

/*
 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
 * serialize access to a znode and its SA buffer while the object is being
 * created or destroyed. This kind of locking would normally reside in the
 * znode itself but in this case that's impossible because the znode and SA
 * buffer may not yet exist. Therefore the locking is handled externally
 * with an array of mutexes and AVL trees which contain per-object locks.
 *
 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
 * into the correct AVL tree and finally the per-object lock is held. In
 * zfs_znode_hold_exit() the process is reversed. The per-object lock is
 * released, removed from the AVL tree and destroyed if there are no waiters.
 *
 * This scheme has two important properties:
 *
 * 1) No memory allocations are performed while holding one of the
 *    z_hold_locks. This ensures evict(), which can be called from direct
 *    memory reclaim, will never block waiting on a z_hold_locks entry
 *    which just happens to have hashed to the same index.
 *
 * 2) All locks used to serialize access to an object are per-object and
 *    never shared. This minimizes lock contention without creating a large
 *    number of dedicated locks.
 *
 * On the downside it does require znode_hold_t structures to be frequently
 * allocated and freed. However, because these are backed by a kmem cache
 * and very short lived this cost is minimal.
 */
int
zfs_znode_hold_compare(const void *a, const void *b)
{
	const znode_hold_t *zh_a = (const znode_hold_t *)a;
	const znode_hold_t *zh_b = (const znode_hold_t *)b;

	return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
}

static boolean_t __maybe_unused
zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_hold_t *zh, search;
	int i = ZFS_OBJ_HASH(zfsvfs, obj);
	boolean_t held;

	search.zh_obj = obj;

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
	held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	return (held);
}

znode_hold_t *
zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_hold_t *zh, *zh_new, search;
	int i = ZFS_OBJ_HASH(zfsvfs, obj);
	boolean_t found = B_FALSE;

	zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
	search.zh_obj = obj;

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
	if (likely(zh == NULL)) {
		zh = zh_new;
		zh->zh_obj = obj;
		avl_add(&zfsvfs->z_hold_trees[i], zh);
	} else {
		ASSERT3U(zh->zh_obj, ==, obj);
		found = B_TRUE;
	}
	zh->zh_refcount++;
	ASSERT3S(zh->zh_refcount, >, 0);
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	if (found == B_TRUE)
		kmem_cache_free(znode_hold_cache, zh_new);

	ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
	mutex_enter(&zh->zh_lock);

	return (zh);
}

void
zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
{
	int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
	boolean_t remove = B_FALSE;

	ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
	mutex_exit(&zh->zh_lock);

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	ASSERT3S(zh->zh_refcount, >, 0);
	if (--zh->zh_refcount == 0) {
		avl_remove(&zfsvfs->z_hold_trees[i], zh);
		remove = B_TRUE;
	}
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	if (remove == B_TRUE)
		kmem_cache_free(znode_hold_cache, zh);
}
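
/*
 * Illustrative usage sketch (mirrors the pattern used by zfs_zget() and
 * zfs_znode_delete() later in this file):
 *
 *	znode_hold_t *zh = zfs_znode_hold_enter(zfsvfs, obj);
 *	// ... create, destroy, or re-attach the znode and its SA handle ...
 *	zfs_znode_hold_exit(zfsvfs, zh);
 */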

dev_t
zfs_cmpldev(uint64_t dev)
{
	return (dev);
}

static void
zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
    dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
{
	ASSERT(zfs_znode_held(zfsvfs, zp->z_id));

	mutex_enter(&zp->z_lock);

	ASSERT(zp->z_sa_hdl == NULL);
	ASSERT(zp->z_acl_cached == NULL);
	if (sa_hdl == NULL) {
		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
		    SA_HDL_SHARED, &zp->z_sa_hdl));
	} else {
		zp->z_sa_hdl = sa_hdl;
		sa_set_userp(sa_hdl, zp);
	}

	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;

	mutex_exit(&zp->z_lock);
}

void
zfs_znode_dmu_fini(znode_t *zp)
{
	ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) ||
	    RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));

	sa_handle_destroy(zp->z_sa_hdl);
	zp->z_sa_hdl = NULL;
}

/*
 * Called by new_inode() to allocate a new inode.
 */
int
zfs_inode_alloc(struct super_block *sb, struct inode **ip)
{
	znode_t *zp;

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	*ip = ZTOI(zp);

	return (0);
}

/*
 * Called in multiple places when an inode should be destroyed.
 */
void
zfs_inode_destroy(struct inode *ip)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	mutex_enter(&zfsvfs->z_znodes_lock);
	if (list_link_active(&zp->z_link_node)) {
		list_remove(&zfsvfs->z_all_znodes, zp);
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}

	if (zp->z_xattr_cached) {
		nvlist_free(zp->z_xattr_cached);
		zp->z_xattr_cached = NULL;
	}

	kmem_cache_free(znode_cache, zp);
}

static void
zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
{
	uint64_t rdev = 0;

	switch (ip->i_mode & S_IFMT) {
	case S_IFREG:
		ip->i_op = &zpl_inode_operations;
#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
		ip->i_fop = &zpl_file_operations.kabi_fops;
#else
		ip->i_fop = &zpl_file_operations;
#endif
		ip->i_mapping->a_ops = &zpl_address_space_operations;
		break;

	case S_IFDIR:
#ifdef HAVE_RENAME2_OPERATIONS_WRAPPER
		ip->i_flags |= S_IOPS_WRAPPER;
		ip->i_op = &zpl_dir_inode_operations.ops;
#else
		ip->i_op = &zpl_dir_inode_operations;
#endif
		ip->i_fop = &zpl_dir_file_operations;
		ITOZ(ip)->z_zn_prefetch = B_TRUE;
		break;

	case S_IFLNK:
		ip->i_op = &zpl_symlink_inode_operations;
		break;

	/*
	 * rdev is stored in a SA only for device files.
	 */
	case S_IFCHR:
	case S_IFBLK:
		(void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
		    sizeof (rdev));
		zfs_fallthrough;
	case S_IFIFO:
	case S_IFSOCK:
		init_special_inode(ip, ip->i_mode, rdev);
		ip->i_op = &zpl_special_inode_operations;
		break;

	default:
		zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
		    (u_longlong_t)ip->i_ino, ip->i_mode);

		/* Assume the inode is a file and attempt to continue */
		ip->i_mode = S_IFREG | 0644;
		ip->i_op = &zpl_inode_operations;
#ifdef HAVE_VFS_FILE_OPERATIONS_EXTEND
		ip->i_fop = &zpl_file_operations.kabi_fops;
#else
		ip->i_fop = &zpl_file_operations;
#endif
		ip->i_mapping->a_ops = &zpl_address_space_operations;
		break;
	}
}

static void
zfs_set_inode_flags(znode_t *zp, struct inode *ip)
{
	/*
	 * Linux and Solaris have different sets of file attributes, so we
	 * restrict this conversion to the intersection of the two.
	 */
#ifdef HAVE_INODE_SET_FLAGS
	unsigned int flags = 0;
	if (zp->z_pflags & ZFS_IMMUTABLE)
		flags |= S_IMMUTABLE;
	if (zp->z_pflags & ZFS_APPENDONLY)
		flags |= S_APPEND;

	inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
#else
	if (zp->z_pflags & ZFS_IMMUTABLE)
		ip->i_flags |= S_IMMUTABLE;
	else
		ip->i_flags &= ~S_IMMUTABLE;

	if (zp->z_pflags & ZFS_APPENDONLY)
		ip->i_flags |= S_APPEND;
	else
		ip->i_flags &= ~S_APPEND;
#endif
}

/*
 * Update the embedded inode given the znode.
 */
void
zfs_znode_update_vfs(znode_t *zp)
{
	struct inode *ip;
	uint32_t blksize;
	u_longlong_t i_blocks;

	ASSERT(zp != NULL);
	ip = ZTOI(zp);

	/* Skip .zfs control nodes which do not exist on disk. */
	if (zfsctl_is_node(ip))
		return;

	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);

	spin_lock(&ip->i_lock);
	ip->i_mode = zp->z_mode;
	ip->i_blocks = i_blocks;
	i_size_write(ip, zp->z_size);
	spin_unlock(&ip->i_lock);
}

/*
 * Construct a znode+inode and initialize.
 *
 * This does not call dmu_set_user(); that is left to the caller,
 * in case the znode is not going to be returned.
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
    dmu_object_type_t obj_type, sa_handle_t *hdl)
{
	znode_t *zp;
	struct inode *ip;
	uint64_t mode;
	uint64_t parent;
	uint64_t tmp_gen;
	uint64_t links;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2], btime[2];
	inode_timespec_t tmp_ctime;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	sa_bulk_attr_t bulk[12];
	int count = 0;

	ASSERT(zfsvfs != NULL);

	ip = new_inode(zfsvfs->z_sb);
	if (ip == NULL)
		return (NULL);

	zp = ITOZ(ip);
	ASSERT(zp->z_dirlocks == NULL);
	ASSERT3P(zp->z_acl_cached, ==, NULL);
	ASSERT3P(zp->z_xattr_cached, ==, NULL);
	zp->z_unlinked = B_FALSE;
	zp->z_atime_dirty = B_FALSE;
#if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE)
	zp->z_is_mapped = B_FALSE;
#endif
	zp->z_is_ctldir = B_FALSE;
	zp->z_suspended = B_FALSE;
	zp->z_sa_hdl = NULL;
	zp->z_mapcnt = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;
	zp->z_sync_writes_cnt = 0;
	zp->z_async_writes_cnt = 0;

	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
	    &parent, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
	    (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    (zp->z_pflags & ZFS_PROJID) &&
	    sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
		if (hdl == NULL)
			sa_handle_destroy(zp->z_sa_hdl);
		zp->z_sa_hdl = NULL;
		goto error;
	}

	zp->z_projid = projid;
	zp->z_mode = ip->i_mode = mode;
	ip->i_generation = (uint32_t)tmp_gen;
	ip->i_blkbits = SPA_MINBLOCKSHIFT;
	set_nlink(ip, (uint32_t)links);
	zfs_uid_write(ip, z_uid);
	zfs_gid_write(ip, z_gid);
	zfs_set_inode_flags(zp, ip);

	/* Cache the xattr parent id */
	if (zp->z_pflags & ZFS_XATTR)
		zp->z_xattr_parent = parent;

	ZFS_TIME_DECODE(&ip->i_atime, atime);
	ZFS_TIME_DECODE(&ip->i_mtime, mtime);
	ZFS_TIME_DECODE(&tmp_ctime, ctime);
	zpl_inode_set_ctime_to_ts(ip, tmp_ctime);
	ZFS_TIME_DECODE(&zp->z_btime, btime);

	ip->i_ino = zp->z_id;
	zfs_znode_update_vfs(zp);
	zfs_inode_set_ops(zfsvfs, ip);

	/*
	 * The only way insert_inode_locked() can fail is if the ip->i_ino
	 * number is already hashed for this super block. This can never
	 * happen because the inode numbers map 1:1 with the object numbers.
	 *
	 * Exceptions include rolling back a mounted file system, either
	 * from the zfs rollback or zfs recv command.
	 *
	 * Active inodes are unhashed during the rollback, but since zrele
	 * can happen asynchronously, we can't guarantee they've been
	 * unhashed. This can cause hash collisions in unlinked drain
	 * processing so do not hash unlinked znodes.
	 */
	if (links > 0)
		VERIFY3S(insert_inode_locked(ip), ==, 0);

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (links > 0)
		unlock_new_inode(ip);
	return (zp);

error:
	iput(ip);
	return (NULL);
}

/*
 * Safely mark an inode dirty. Inodes which are part of a read-only
 * file system or snapshot may not be dirtied.
 */
void
zfs_mark_inode_dirty(struct inode *ip)
{
	zfsvfs_t *zfsvfs = ITOZSB(ip);

	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
		return;

	mark_inode_dirty(ip);
}

static uint64_t empty_xattr;
static uint64_t pad[4];
static zfs_acl_phys_t acl_phys;
/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_TMPFILE	- new object is of O_TMPFILE
 *			  IS_XATTR	- new object is an attribute
 *		acl_ids	- ACL related attributes
 *
 *	OUT:	zpp	- allocated znode (set to dzp if IS_ROOT_NODE)
 *
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
{
	uint64_t crtime[2], atime[2], mtime[2], ctime[2];
	uint64_t mode, size, links, parent, pflags;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	uint64_t rdev = 0;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	dmu_buf_t *db;
	inode_timespec_t now;
	uint64_t gen, obj;
	int bonuslen;
	int dnodesize;
	sa_handle_t *sa_hdl;
	dmu_object_type_t obj_type;
	sa_bulk_attr_t *sa_attrs;
	int cnt = 0;
	zfs_acl_locator_cb_t locate = { 0 };
	znode_hold_t *zh;

	if (zfsvfs->z_replay) {
		obj = vap->va_nodeid;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
		dnodesize = vap->va_fsid;	/* ditto */
	} else {
		obj = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
	}

	if (dnodesize == 0)
		dnodesize = DNODE_MIN_SIZE;

	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;

	bonuslen = (obj_type == DMU_OT_SA) ?
	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (S_ISDIR(vap->va_mode)) {
		if (zfsvfs->z_replay) {
			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = zap_create_norm_dnsize(zfsvfs->z_os,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx);
		}
	} else {
		if (zfsvfs->z_replay) {
			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx);
		}
	}

	zh = zfs_znode_hold_enter(zfsvfs, obj);
	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_id = obj;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp->z_pflags & ZFS_XATTR) {
		flag |= IS_XATTR;
	}

	if (zfsvfs->z_use_fuids)
		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
	else
		pflags = 0;

	if (S_ISDIR(vap->va_mode)) {
		size = 2;		/* contents ("." and "..") */
		links = 2;
	} else {
		size = 0;
		links = (flag & IS_TMPFILE) ? 0 : 1;
	}

	if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
		rdev = vap->va_rdev;

	parent = dzp->z_id;
	mode = acl_ids->z_mode;
	if (flag & IS_XATTR)
		pflags |= ZFS_XATTR;

	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
		/*
		 * With ZFS_PROJID flag, we can easily know whether there is
		 * project ID stored on disk or not. See zfs_space_delta_cb().
		 */
		if (obj_type != DMU_OT_ZNODE &&
		    dmu_objset_projectquota_enabled(zfsvfs->z_os))
			pflags |= ZFS_PROJID;

		/*
		 * Inherit project ID from parent if required.
		 */
		projid = zfs_inherit_projid(dzp);
		if (dzp->z_pflags & ZFS_PROJINHERIT)
			pflags |= ZFS_PROJINHERIT;
	}

	/*
	 * No execs denied will be determined when zfs_mode_compute() is called.
	 */
	pflags |= acl_ids->z_aclp->z_hints &
	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);

	ZFS_TIME_ENCODE(&now, crtime);
	ZFS_TIME_ENCODE(&now, ctime);

	if (vap->va_mask & ATTR_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, atime);
	} else {
		ZFS_TIME_ENCODE(&now, atime);
	}

	if (vap->va_mask & ATTR_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
	} else {
		ZFS_TIME_ENCODE(&now, mtime);
	}

	/* Now add in all of the "SA" attributes */
	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
	    &sa_hdl));

	/*
	 * Setup the array of attributes to be replaced/set on the new file
	 *
	 * order for DMU_OT_ZNODE is critical since it needs to be constructed
	 * in the old znode_phys_t format. Don't change this ordering
	 */
	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
	} else {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
		    NULL, &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
		    NULL, &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
	}

	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
		    &empty_xattr, 8);
	} else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
	    pflags & ZFS_PROJID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
		    NULL, &projid, 8);
	}
	if (obj_type == DMU_OT_ZNODE ||
	    (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
		    NULL, &rdev, 8);
	}
	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
		    &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
		    &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
		    sizeof (uint64_t) * 4);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
		    &acl_phys, sizeof (zfs_acl_phys_t));
	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
		    &acl_ids->z_aclp->z_acl_count, 8);
		locate.cb_aclp = acl_ids->z_aclp;
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
		    zfs_acl_data_locator, &locate,
		    acl_ids->z_aclp->z_acl_bytes);
		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
		    acl_ids->z_fuid, acl_ids->z_fgid);
	}

	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);

	if (!(flag & IS_ROOT_NODE)) {
		/*
		 * The call to zfs_znode_alloc() may fail if memory is low
		 * via the call path: alloc_inode() -> inode_init_always() ->
		 * security_inode_alloc() -> inode_alloc_security(). Since
		 * the existing code is written such that zfs_mknode() cannot
		 * fail, retry until sufficient memory has been reclaimed.
		 */
		do {
			*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
		} while (*zpp == NULL);

		VERIFY(*zpp != NULL);
		VERIFY(dzp != NULL);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;

		(*zpp)->z_sa_hdl = sa_hdl;
	}

	(*zpp)->z_pflags = pflags;
	(*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
	(*zpp)->z_dnodesize = dnodesize;
	(*zpp)->z_projid = projid;

	if (obj_type == DMU_OT_ZNODE ||
	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
	}
	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
	zfs_znode_hold_exit(zfsvfs, zh);
}

/*
 * Update in-core attributes. It is assumed the caller will be doing an
 * sa_bulk_update to push the changes out.
 */
void
zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
{
	xoptattr_t *xoap;
	boolean_t update_inode = B_FALSE;

	xoap = xva_getxoptattr(xvap);
	ASSERT(xoap);

	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
		uint64_t times[2];
		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
		    &times, sizeof (times), tx);
		XVA_SET_RTN(xvap, XAT_CREATETIME);
	}
	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_READONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_HIDDEN);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_SYSTEM);
	}
	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_ARCHIVE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_IMMUTABLE);

		update_inode = B_TRUE;
	}
	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NOUNLINK);
	}
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_APPENDONLY);

		update_inode = B_TRUE;
	}
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NODUMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OPAQUE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
		zfs_sa_set_scanstamp(zp, xvap, tx);
		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_REPARSE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OFFLINE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_SPARSE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
		ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_PROJINHERIT);
	}

	if (update_inode)
		zfs_set_inode_flags(zp, ZTOI(zp));
}

int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	dmu_object_info_t doi;
	dmu_buf_t *db;
	znode_t *zp;
	znode_hold_t *zh;
	int err;
	sa_handle_t *hdl;

	*zpp = NULL;

again:
	zh = zfs_znode_hold_enter(zfsvfs, obj_num);

	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		zfs_znode_hold_exit(zfsvfs, zh);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EINVAL));
	}

	hdl = dmu_buf_get_user(db);
	if (hdl != NULL) {
		zp = sa_get_userdata(hdl);

		/*
		 * Since "SA" does immediate eviction we
		 * should never find a sa handle that doesn't
		 * know about the znode.
		 */
		ASSERT3P(zp, !=, NULL);

		mutex_enter(&zp->z_lock);
		ASSERT3U(zp->z_id, ==, obj_num);
		/*
		 * If zp->z_unlinked is set, the znode is already marked
		 * for deletion and should not be discovered. Check this
		 * after checking igrab() due to fsetxattr() & O_TMPFILE.
		 *
		 * If igrab() returns NULL the VFS has independently
		 * determined the inode should be evicted and has
		 * called iput_final() to start the eviction process.
		 * The SA handle is still valid but because the VFS
		 * requires that the eviction succeed we must drop
		 * our locks and references to allow the eviction to
		 * complete. The zfs_zget() may then be retried.
		 *
		 * This unlikely case could be optimized by registering
		 * a sops->drop_inode() callback. The callback would
		 * need to detect the active SA hold thereby informing
		 * the VFS that this inode should not be evicted.
		 */
		if (igrab(ZTOI(zp)) == NULL) {
			if (zp->z_unlinked)
				err = SET_ERROR(ENOENT);
			else
				err = SET_ERROR(EAGAIN);
		} else {
			*zpp = zp;
			err = 0;
		}

		mutex_exit(&zp->z_lock);
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);

		if (err == EAGAIN) {
			/* inode might need this to finish evict */
			cond_resched();
			goto again;
		}
		return (err);
	}

	/*
	 * Not found; create a new znode/vnode, but only if the file exists.
	 *
	 * There is a small window where zfs_vget() could
	 * find this object while a file create is still in
	 * progress. This is checked for in zfs_znode_alloc().
	 *
	 * If zfs_znode_alloc() fails it will drop the hold on the
	 * bonus buffer.
	 */
	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
	    doi.doi_bonus_type, NULL);
	if (zp == NULL) {
		err = SET_ERROR(ENOENT);
	} else {
		*zpp = zp;
	}
	zfs_znode_hold_exit(zfsvfs, zh);
	return (err);
}
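
/*
 * Illustrative usage sketch (an assumption about a typical caller, not
 * code from this file): look up a znode by object number, then release
 * the reference taken by igrab() with zrele() when done.
 *
 *	znode_t *zp;
 *	int err = zfs_zget(zfsvfs, obj_num, &zp);
 *	if (err == 0) {
 *		// ... use zp ...
 *		zrele(zp);
 *	}
 */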

int
zfs_rezget(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_object_info_t doi;
	dmu_buf_t *db;
	uint64_t obj_num = zp->z_id;
	uint64_t mode;
	uint64_t links;
	sa_bulk_attr_t bulk[11];
	int err;
	int count = 0;
	uint64_t gen;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2], btime[2];
	inode_timespec_t tmp_ctime;
	uint64_t projid = ZFS_DEFAULT_PROJID;
	znode_hold_t *zh;

	/*
	 * Skip ctldir nodes, otherwise they will always get invalidated.
	 * That would cause odd behaviour for mounted snapdirs; in
	 * particular, on Linux >= 3.18, d_invalidate will detach the
	 * mountpoint and prevent anyone from automounting it again as
	 * long as someone is still using the detached mount.
	 */
	if (zp->z_is_ctldir)
		return (0);

	zh = zfs_znode_hold_enter(zfsvfs, obj_num);

	mutex_enter(&zp->z_acl_lock);
	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}
	mutex_exit(&zp->z_acl_lock);

	rw_enter(&zp->z_xattr_lock, RW_WRITER);
	if (zp->z_xattr_cached) {
		nvlist_free(zp->z_xattr_cached);
		zp->z_xattr_cached = NULL;
	}
	rw_exit(&zp->z_xattr_lock);

	ASSERT(zp->z_sa_hdl == NULL);
	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		zfs_znode_hold_exit(zfsvfs, zh);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EINVAL));
	}

	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);

	/* reload cached values */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
	    &gen, sizeof (gen));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, sizeof (zp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &links, sizeof (links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
	    &z_uid, sizeof (z_uid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
	    &z_gid, sizeof (z_gid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
	    &mode, sizeof (mode));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
	    &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
	    &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	    &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16);

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EIO));
	}

	if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
		    &projid, 8);
		if (err != 0 && err != ENOENT) {
			zfs_znode_dmu_fini(zp);
			zfs_znode_hold_exit(zfsvfs, zh);
			return (SET_ERROR(err));
		}
	}

	zp->z_projid = projid;
	zp->z_mode = ZTOI(zp)->i_mode = mode;
	zfs_uid_write(ZTOI(zp), z_uid);
	zfs_gid_write(ZTOI(zp), z_gid);

	ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
	ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
	ZFS_TIME_DECODE(&tmp_ctime, ctime);
	zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime);
	ZFS_TIME_DECODE(&zp->z_btime, btime);

	if ((uint32_t)gen != ZTOI(zp)->i_generation) {
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EIO));
	}

	set_nlink(ZTOI(zp), (uint32_t)links);
	zfs_set_inode_flags(zp, ZTOI(zp));

	zp->z_blksz = doi.doi_data_block_size;
	zp->z_atime_dirty = B_FALSE;
	zfs_znode_update_vfs(zp);

	/*
	 * If the file has zero links, then it has been unlinked on the send
	 * side and it must be in the received unlinked set.
	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
	 * stale data and to prevent automatic removal of the file in
	 * zfs_zinactive(). The file will be removed either when it is removed
	 * on the send side and the next incremental stream is received or
	 * when the unlinked set gets processed.
	 */
	zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
	if (zp->z_unlinked)
		zfs_znode_dmu_fini(zp);

	zfs_znode_hold_exit(zfsvfs, zh);

	return (0);
}

void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	objset_t *os = zfsvfs->z_os;
	uint64_t obj = zp->z_id;
	uint64_t acl_obj = zfs_external_acl(zp);
	znode_hold_t *zh;

	zh = zfs_znode_hold_enter(zfsvfs, obj);
	if (acl_obj) {
		VERIFY(!zp->z_is_sa);
		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
	}
	VERIFY(0 == dmu_object_free(os, obj, tx));
	zfs_znode_dmu_fini(zp);
	zfs_znode_hold_exit(zfsvfs, zh);
}

void
zfs_zinactive(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	uint64_t z_id = zp->z_id;
	znode_hold_t *zh;

	ASSERT(zp->z_sa_hdl);

	/*
	 * Don't allow a zfs_zget() while we're trying to release this znode.
	 */
	zh = zfs_znode_hold_enter(zfsvfs, z_id);

	mutex_enter(&zp->z_lock);

	/*
	 * If this was the last reference to a file with no links, remove
	 * the file from the file system unless the file system is mounted
	 * read-only. That can happen, for example, if the file system was
	 * originally read-write, the file was opened, then unlinked and
	 * the file system was made read-only before the file was finally
	 * closed. The file will remain in the unlinked set.
	 */
	if (zp->z_unlinked) {
		ASSERT(!zfsvfs->z_issnap);
		if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
			mutex_exit(&zp->z_lock);
			zfs_znode_hold_exit(zfsvfs, zh);
			zfs_rmnode(zp);
			return;
		}
	}

	mutex_exit(&zp->z_lock);
	zfs_znode_dmu_fini(zp);

	zfs_znode_hold_exit(zfsvfs, zh);
}

#if defined(HAVE_INODE_TIMESPEC64_TIMES)
#define	zfs_compare_timespec	timespec64_compare
#else
#define	zfs_compare_timespec	timespec_compare
#endif

/*
 * Determine whether the znode's atime must be updated. The logic mostly
 * duplicates the Linux kernel's relatime_need_update() functionality.
 * This function is only called if the underlying filesystem actually has
 * atime updates enabled.
 */
boolean_t
zfs_relatime_need_update(const struct inode *ip)
{
	inode_timespec_t now, tmp_ctime;

	gethrestime(&now);
	/*
	 * In relatime mode, only update the atime if the previous atime
	 * is earlier than either the ctime or mtime or if at least a day
	 * has passed since the last update of atime.
	 */
	if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0)
		return (B_TRUE);

	tmp_ctime = zpl_inode_get_ctime(ip);
	if (zfs_compare_timespec(&tmp_ctime, &ip->i_atime) >= 0)
		return (B_TRUE);

	if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60)
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * Prepare to update znode time stamps.
 *
 *	IN:	zp	- znode requiring timestamp update
 *		flag	- ATTR_MTIME, ATTR_CTIME flags
 *
 *	OUT:	zp	- z_seq
 *		mtime	- new mtime
 *		ctime	- new ctime
 *
 *	Note: We don't update atime here, because we rely on Linux VFS to do
 *	atime updating.
 */
void
zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
    uint64_t ctime[2])
{
	inode_timespec_t now, tmp_ctime;

	gethrestime(&now);

	zp->z_seq++;

	if (flag & ATTR_MTIME) {
		ZFS_TIME_ENCODE(&now, mtime);
		ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
		if (ZTOZSB(zp)->z_use_fuids) {
			zp->z_pflags |= (ZFS_ARCHIVE |
			    ZFS_AV_MODIFIED);
		}
	}

	if (flag & ATTR_CTIME) {
		ZFS_TIME_ENCODE(&now, ctime);
		ZFS_TIME_DECODE(&tmp_ctime, ctime);
		zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime);
		if (ZTOZSB(zp)->z_use_fuids)
			zp->z_pflags |= ZFS_ARCHIVE;
	}
}
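
/*
 * Illustrative usage sketch (the pattern zfs_freesp() below follows):
 * encode the new timestamps, then push them out with an SA bulk update
 * in the same transaction.
 *
 *	uint64_t mtime[2], ctime[2];
 *	sa_bulk_attr_t bulk[2];
 *	int count = 0;
 *
 *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
 *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
 *	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
 *	(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 */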

/*
 * Grow the block size for a file.
 *
 *	IN:	zp	- znode of file whose block size is to be grown.
 *		size	- requested block size
 *		tx	- open transaction.
 *
 * NOTE: this function assumes that the znode is write locked.
 */
void
zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
{
	int error;
	u_longlong_t dummy;

	if (size <= zp->z_blksz)
		return;
	/*
	 * If the file size is already greater than the current blocksize,
	 * we will not grow. If there is more than one block in a file,
	 * the blocksize cannot change.
	 */
	if (zp->z_blksz && zp->z_size > zp->z_blksz)
		return;

	error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
	    size, 0, tx);

	if (error == ENOTSUP)
		return;
	ASSERT0(error);

	/* What blocksize did we actually get? */
	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
}

/*
 * Increase the file length.
 *
 *	IN:	zp	- znode of file to extend.
 *		end	- new end-of-file
 *
 * RETURN: 0 on success, error code on failure
 */
static int
zfs_extend(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_tx_t *tx;
	zfs_locked_range_t *lr;
	uint64_t newblksz;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end <= zp->z_size) {
		zfs_rangelock_exit(lr);
		return (0);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	if (end > zp->z_blksz &&
	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
		/*
		 * We are growing the file past the current block size.
		 */
		if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
			/*
			 * File's blocksize is already larger than the
			 * "recordsize" property. Only let it grow to
			 * the next power of 2.
			 */
			ASSERT(!ISP2(zp->z_blksz));
			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
		} else {
			newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
		}
		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
	} else {
		newblksz = 0;
	}

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_rangelock_exit(lr);
		return (error);
	}

	if (newblksz)
		zfs_grow_blocksize(zp, newblksz, tx);

	zp->z_size = end;

	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
	    &zp->z_size, sizeof (zp->z_size), tx));

	zfs_rangelock_exit(lr);

	dmu_tx_commit(tx);

	return (0);
}

/*
 * zfs_zero_partial_page - Modeled after update_pages() but
 * with different arguments and semantics for use by zfs_freesp().
 *
 * Zeroes a piece of a single page cache entry for zp at offset
 * start and length len.
 *
 * Caller must acquire a range lock on the file for the region
 * being zeroed in order that the ARC and page cache stay in sync.
 */
static void
zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
{
	struct address_space *mp = ZTOI(zp)->i_mapping;
	struct page *pp;
	int64_t	off;
	void *pb;

	ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));

	off = start & (PAGE_SIZE - 1);
	start &= PAGE_MASK;

	pp = find_lock_page(mp, start >> PAGE_SHIFT);
	if (pp) {
		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		pb = kmap(pp);
		memset(pb + off, 0, len);
		kunmap(pp);

		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		mark_page_accessed(pp);
		SetPageUptodate(pp);
		ClearPageError(pp);
		unlock_page(pp);
		put_page(pp);
	}
}

/*
 * Free space in a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of section to free.
 *		len	- length of section to free.
 *
 * RETURN: 0 on success, error code on failure
 */
static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	zfs_locked_range_t *lr;
	int error;

	/*
	 * Lock the range being freed.
	 */
	lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (off >= zp->z_size) {
		zfs_rangelock_exit(lr);
		return (0);
	}

	if (off + len > zp->z_size)
		len = zp->z_size - off;

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);

	/*
	 * Zero partial page cache entries. This must be done under a
	 * range lock in order to keep the ARC and page cache in sync.
	 */
	if (zn_has_cached_data(zp, off, off + len - 1)) {
		loff_t first_page, last_page, page_len;
		loff_t first_page_offset, last_page_offset;

		/* first possible full page in hole */
		first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
		/* last page of hole */
		last_page = (off + len) >> PAGE_SHIFT;

		/* offset of first_page */
		first_page_offset = first_page << PAGE_SHIFT;
		/* offset of last_page */
		last_page_offset = last_page << PAGE_SHIFT;

		/* truncate whole pages */
		if (last_page_offset > first_page_offset) {
			truncate_inode_pages_range(ZTOI(zp)->i_mapping,
			    first_page_offset, last_page_offset - 1);
		}

		/* truncate sub-page ranges */
		if (first_page > last_page) {
			/* entire punched area within a single page */
			zfs_zero_partial_page(zp, off, len);
		} else {
			/* beginning of punched area at the end of a page */
			page_len = first_page_offset - off;
			if (page_len > 0)
				zfs_zero_partial_page(zp, off, page_len);

			/* end of punched area at the beginning of a page */
			page_len = off + len - last_page_offset;
			if (page_len > 0)
				zfs_zero_partial_page(zp, last_page_offset,
				    page_len);
		}
	}
	zfs_rangelock_exit(lr);

	return (error);
}

/*
 * Truncate a file.
 *
 *	IN:	zp	- znode of file to truncate.
 *		end	- new end-of-file.
 *
 * RETURN: 0 on success, error code on failure
 */
static int
zfs_trunc(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_tx_t *tx;
	zfs_locked_range_t *lr;
	int error;
	sa_bulk_attr_t bulk[2];
	int count = 0;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end >= zp->z_size) {
		zfs_rangelock_exit(lr);
		return (0);
	}

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
	    DMU_OBJECT_END);
	if (error) {
		zfs_rangelock_exit(lr);
		return (error);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_rangelock_exit(lr);
		return (error);
	}

	zp->z_size = end;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
	    NULL, &zp->z_size, sizeof (zp->z_size));

	if (end == 0) {
		zp->z_pflags &= ~ZFS_SPARSE;
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &zp->z_pflags, 8);
	}
	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);

	dmu_tx_commit(tx);
	zfs_rangelock_exit(lr);

	return (0);
}

/*
 * Free space in a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of range
 *		len	- end of range (0 => EOF)
 *		flag	- current file open mode flags.
 *		log	- TRUE if this action should be logged
 *
 * RETURN: 0 on success, error code on failure
 */
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
{
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	zilog_t *zilog = zfsvfs->z_log;
	uint64_t mode;
	uint64_t mtime[2], ctime[2];
	sa_bulk_attr_t bulk[3];
	int count = 0;
	int error;

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
	    sizeof (mode))) != 0)
		return (error);

	if (off > zp->z_size) {
		error = zfs_extend(zp, off+len);
		if (error == 0 && log)
			goto log;
		goto out;
	}

	if (len == 0) {
		error = zfs_trunc(zp, off);
	} else {
		if ((error = zfs_free_range(zp, off, len)) == 0 &&
		    off + len > zp->z_size)
			error = zfs_extend(zp, off+len);
	}
	if (error || !log)
		goto out;
log:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto out;
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
	    NULL, &zp->z_pflags, 8);
	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT(error == 0);

	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);

	dmu_tx_commit(tx);

	zfs_znode_update_vfs(zp);
	error = 0;

out:
	/*
	 * Truncate the page cache - for file truncate operations, use
	 * the purpose-built API for truncations. For punching operations,
	 * the truncation is handled under a range lock in zfs_free_range.
	 */
	if (len == 0)
		truncate_setsize(ZTOI(zp), off);
	return (error);
}
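
/*
 * Illustrative sketch of the zfs_freesp() semantics documented above
 * (hypothetical calls, not code from this file):
 *
 *	// Truncate the file to 'off' bytes.
 *	error = zfs_freesp(zp, off, 0, flag, B_TRUE);
 *
 *	// Punch a hole covering [off, off + len), extending EOF if the
 *	// hole ends past the current end of file.
 *	error = zfs_freesp(zp, off, len, flag, B_TRUE);
 */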
1843
1844 void
1845 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1846 {
1847 struct super_block *sb;
1848 zfsvfs_t *zfsvfs;
1849 uint64_t moid, obj, sa_obj, version;
1850 uint64_t sense = ZFS_CASE_SENSITIVE;
1851 uint64_t norm = 0;
1852 nvpair_t *elem;
1853 int size;
1854 int error;
1855 int i;
1856 znode_t *rootzp = NULL;
1857 vattr_t vattr;
1858 znode_t *zp;
1859 zfs_acl_ids_t acl_ids;
1860
1861 /*
1862 * First attempt to create master node.
1863 */
1864 /*
1865 * In an empty objset, there are no blocks to read and thus
1866 * there can be no i/o errors (which we assert below).
1867 */
1868 moid = MASTER_NODE_OBJ;
1869 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1870 DMU_OT_NONE, 0, tx);
1871 ASSERT(error == 0);
1872
1873 /*
1874 * Set starting attributes.
1875 */
1876 version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
1877 elem = NULL;
1878 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1879 /* For the moment we expect all zpl props to be uint64_ts */
1880 uint64_t val;
1881 const char *name;
1882
1883 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1884 VERIFY(nvpair_value_uint64(elem, &val) == 0);
1885 name = nvpair_name(elem);
1886 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
1887 if (val < version)
1888 version = val;
1889 } else {
1890 error = zap_update(os, moid, name, 8, 1, &val, tx);
1891 }
1892 ASSERT(error == 0);
1893 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1894 norm = val;
1895 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1896 sense = val;
1897 }
1898 ASSERT(version != 0);
1899 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
1900 ASSERT(error == 0);

	/*
	 * Create zap object used for SA attribute registration
	 */

	if (version >= ZPL_VERSION_SA) {
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);
		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		ASSERT(error == 0);
	} else {
		sa_obj = 0;
	}
	/*
	 * Create a delete queue.
	 */
	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode.  Create minimal znode/inode/zfsvfs/sb
	 * to allow zfs_mknode to work.
	 */
	vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	rootzp->z_unlinked = B_FALSE;
	rootzp->z_atime_dirty = B_FALSE;
	rootzp->z_is_sa = USE_SA(version, os);
	rootzp->z_pflags = 0;

	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
	zfsvfs->z_os = os;
	zfsvfs->z_parent = zfsvfs;
	zfsvfs->z_version = version;
	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
	zfsvfs->z_use_sa = USE_SA(version, os);
	zfsvfs->z_norm = norm;

	sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
	sb->s_fs_info = zfsvfs;

	ZTOI(rootzp)->i_sb = sb;

	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zfsvfs->z_attr_table);

	ASSERT(error == 0);

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

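	/*
	 * Worked example (illustrative): the tunable is rounded down to a
	 * power of two and capped at ZFS_OBJ_MTX_MAX.  A setting of 100
	 * gives highbit64(100) == 7, so size = 1 << 6 == 64.
	 */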
	size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
	zfsvfs->z_hold_size = size;
	zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
	    KM_SLEEP);
	zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
	for (i = 0; i != size; i++) {
		avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
		    sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
		mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
	}

	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids, zfs_init_idmap));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	zfs_acl_ids_free(&acl_ids);

	atomic_set(&ZTOI(rootzp)->i_count, 0);
	sa_handle_destroy(rootzp->z_sa_hdl);
	kmem_cache_free(znode_cache, rootzp);

	for (i = 0; i != size; i++) {
		avl_destroy(&zfsvfs->z_hold_trees[i]);
		mutex_destroy(&zfsvfs->z_hold_locks[i]);
	}

	mutex_destroy(&zfsvfs->z_znodes_lock);

	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
	kmem_free(sb, sizeof (struct super_block));
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
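
/*
 * Illustrative sketch (hypothetical caller, not part of this file; os, cr
 * and tx are assumed to be in scope): the zplprops nvlist consumed by
 * zfs_create_fs() is a flat list of uint64 properties keyed by property
 * name, e.g. as a dataset-creation path might build it.
 */
#if 0
	nvlist_t *zplprops = fnvlist_alloc();

	fnvlist_add_uint64(zplprops,
	    zfs_prop_to_name(ZFS_PROP_VERSION), ZPL_VERSION);
	fnvlist_add_uint64(zplprops,
	    zfs_prop_to_name(ZFS_PROP_CASE), ZFS_CASE_SENSITIVE);
	zfs_create_fs(os, cr, zplprops, tx);
	fnvlist_free(zplprops);
#endif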
#endif /* _KERNEL */

static int
zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
{
	uint64_t sa_obj = 0;
	int error;

	error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
	if (error != 0 && error != ENOENT)
		return (error);

	error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
	return (error);
}

static int
zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
    dmu_buf_t **db, const void *tag)
{
	dmu_object_info_t doi;
	int error;

	if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
		return (error);

	dmu_object_info_from_db(*db, &doi);
	if ((doi.doi_bonus_type != DMU_OT_SA &&
	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t))) {
		sa_buf_rele(*db, tag);
		return (SET_ERROR(ENOTSUP));
	}

	error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
	if (error != 0) {
		sa_buf_rele(*db, tag);
		return (error);
	}

	return (0);
}

static void
zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, const void *tag)
{
	sa_handle_destroy(hdl);
	sa_buf_rele(db, tag);
}
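
/*
 * Illustrative sketch (hypothetical helper, not part of this file): the
 * usual pairing of zfs_sa_setup(), zfs_grab_sa_handle() and
 * zfs_release_sa_handle(), here reading an object's generation number.
 */
#if 0
static int
zfs_example_obj_to_gen(objset_t *osp, uint64_t obj, uint64_t *genp)
{
	sa_attr_type_t *sa_table;
	sa_handle_t *hdl;
	dmu_buf_t *db;
	int error;

	/* Map the ZPL_* indexes to this objset's SA attribute numbers. */
	error = zfs_sa_setup(osp, &sa_table);
	if (error != 0)
		return (error);

	/* Hold the object's bonus buffer and get a private SA handle. */
	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
	if (error != 0)
		return (error);

	error = sa_lookup(hdl, sa_table[ZPL_GEN], genp, sizeof (*genp));

	/* Destroys the handle and drops the buffer hold, in that order. */
	zfs_release_sa_handle(hdl, db, FTAG);
	return (error);
}
#endif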

/*
 * Given an object number, return its parent object number and whether
 * or not the object is an extended attribute directory.
 */
static int
zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
    uint64_t *pobjp, int *is_xattrdir)
{
	uint64_t parent;
	uint64_t pflags;
	uint64_t mode;
	uint64_t parent_mode;
	sa_bulk_attr_t bulk[3];
	sa_handle_t *sa_hdl;
	dmu_buf_t *sa_db;
	int count = 0;
	int error;

	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
	    &parent, sizeof (parent));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
	    &pflags, sizeof (pflags));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
	    &mode, sizeof (mode));

	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
		return (error);

	/*
	 * When a link is removed, its parent pointer is not changed and
	 * will be invalid.  There are two cases where a link is removed
	 * but the file stays around: when it goes to the delete queue,
	 * and when there are additional links.
	 */
	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
	if (error != 0)
		return (error);

	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
	if (error != 0)
		return (error);

	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);

	/*
	 * Extended attributes can be applied to files, directories, etc.,
	 * so an xattr directory's parent may be any object type; otherwise
	 * the parent must be a directory.
	 */
	if (!*is_xattrdir && !S_ISDIR(parent_mode))
		return (SET_ERROR(EINVAL));

	*pobjp = parent;

	return (0);
}

/*
 * Given an object number, return some zpl level statistics
 */
static int
zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
    zfs_stat_t *sb)
{
	sa_bulk_attr_t bulk[4];
	int count = 0;

	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
	    &sb->zs_mode, sizeof (sb->zs_mode));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
	    &sb->zs_gen, sizeof (sb->zs_gen));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
	    &sb->zs_links, sizeof (sb->zs_links));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
	    &sb->zs_ctime, sizeof (sb->zs_ctime));

	return (sa_bulk_lookup(hdl, bulk, count));
}

static int
zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
    sa_attr_type_t *sa_table, char *buf, int len)
{
	sa_handle_t *sa_hdl;
	sa_handle_t *prevhdl = NULL;
	dmu_buf_t *prevdb = NULL;
	dmu_buf_t *sa_db = NULL;
	char *path = buf + len - 1;
	int error;

	*path = '\0';
	sa_hdl = hdl;

	/* An object on the delete queue has no valid path; report it stale. */
	uint64_t deleteq_obj;
	VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
	    ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
	error = zap_lookup_int(osp, deleteq_obj, obj);
	if (error == 0) {
		return (ESTALE);
	} else if (error != ENOENT) {
		return (error);
	}

	/* Walk up the parent chain, prepending one component per step. */
	for (;;) {
		uint64_t pobj = 0;
		char component[MAXNAMELEN + 2];
		size_t complen;
		int is_xattrdir = 0;

		if (prevdb) {
			ASSERT(prevhdl != NULL);
			zfs_release_sa_handle(prevhdl, prevdb, FTAG);
		}

		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
		    &is_xattrdir)) != 0)
			break;

		/* The root directory is its own parent; we are done. */
		if (pobj == obj) {
			if (path[0] != '/')
				*--path = '/';
			break;
		}

		component[0] = '/';
		if (is_xattrdir) {
			strcpy(component + 1, "<xattrdir>");
		} else {
			error = zap_value_search(osp, pobj, obj,
			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
			if (error != 0)
				break;
		}

		complen = strlen(component);
		path -= complen;
		ASSERT(path >= buf);
		memcpy(path, component, complen);
		obj = pobj;

		if (sa_hdl != hdl) {
			prevhdl = sa_hdl;
			prevdb = sa_db;
		}
		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
		if (error != 0) {
			sa_hdl = prevhdl;
			sa_db = prevdb;
			break;
		}
	}

	if (sa_hdl != NULL && sa_hdl != hdl) {
		ASSERT(sa_db != NULL);
		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
	}

	/* Shift the completed path from the end of buf to its start. */
	if (error == 0)
		(void) memmove(buf, path, buf + len - path);

	return (error);
}
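
/*
 * Worked example (illustrative): resolving an object at /a/b with len = 8
 * fills the buffer right to left -- "???????\0", then "?????/b\0", then
 * "???/a/b\0" -- and the final memmove() slides "/a/b\0" to the front.
 */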

int
zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
{
	sa_attr_type_t *sa_table;
	sa_handle_t *hdl;
	dmu_buf_t *db;
	int error;

	error = zfs_sa_setup(osp, &sa_table);
	if (error != 0)
		return (error);

	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
	if (error != 0)
		return (error);

	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);

	zfs_release_sa_handle(hdl, db, FTAG);
	return (error);
}
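
/*
 * Usage sketch (hypothetical caller; osp and obj are assumed to be in
 * scope): the path is assembled in the caller's buffer, so the
 * conventional choice is a MAXPATHLEN-sized buffer.
 */
#if 0
	char path[MAXPATHLEN];

	if (zfs_obj_to_path(osp, obj, path, sizeof (path)) == 0)
		zfs_dbgmsg("object %llu is at %s", (u_longlong_t)obj, path);
#endif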

int
zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
    char *buf, int len)
{
	char *path = buf + len - 1;
	sa_attr_type_t *sa_table;
	sa_handle_t *hdl;
	dmu_buf_t *db;
	int error;

	*path = '\0';

	error = zfs_sa_setup(osp, &sa_table);
	if (error != 0)
		return (error);

	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
	if (error != 0)
		return (error);

	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
	if (error != 0) {
		zfs_release_sa_handle(hdl, db, FTAG);
		return (error);
	}

	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);

	zfs_release_sa_handle(hdl, db, FTAG);
	return (error);
}

/*
 * Read a property stored within the master node.
 */
int
zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
{
	uint64_t *cached_copy = NULL;

	/*
	 * Figure out where in the objset_t the cached copy would live, if it
	 * is available for the requested property.
	 */
	if (os != NULL) {
		switch (prop) {
		case ZFS_PROP_VERSION:
			cached_copy = &os->os_version;
			break;
		case ZFS_PROP_NORMALIZE:
			cached_copy = &os->os_normalization;
			break;
		case ZFS_PROP_UTF8ONLY:
			cached_copy = &os->os_utf8only;
			break;
		case ZFS_PROP_CASE:
			cached_copy = &os->os_casesensitivity;
			break;
		default:
			break;
		}
	}
	if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
		*value = *cached_copy;
		return (0);
	}

	/*
	 * If the property wasn't cached, look up the file system's value for
	 * the property.  For the version property, we look up a slightly
	 * different string.
	 */
	const char *pname;
	int error = ENOENT;
	if (prop == ZFS_PROP_VERSION)
		pname = ZPL_VERSION_STR;
	else
		pname = zfs_prop_to_name(prop);

	if (os != NULL) {
		ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
	}

	if (error == ENOENT) {
		/* No value set, use the default value */
		switch (prop) {
		case ZFS_PROP_VERSION:
			*value = ZPL_VERSION;
			break;
		case ZFS_PROP_NORMALIZE:
		case ZFS_PROP_UTF8ONLY:
			*value = 0;
			break;
		case ZFS_PROP_CASE:
			*value = ZFS_CASE_SENSITIVE;
			break;
		case ZFS_PROP_ACLTYPE:
			*value = ZFS_ACLTYPE_OFF;
			break;
		default:
			return (error);
		}
		error = 0;
	}

	/*
	 * If one of the methods for getting the property value above worked,
	 * copy it into the objset_t's cache.
	 */
	if (error == 0 && cached_copy != NULL) {
		*cached_copy = *value;
	}

	return (error);
}
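
/*
 * Usage sketch (illustrative; os is assumed to be in scope): querying the
 * on-disk ZPL version, which falls back to the compiled-in default when
 * the property was never written to the master node.
 */
#if 0
	uint64_t zplver;

	if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplver) == 0)
		zfs_dbgmsg("ZPL version %llu", (u_longlong_t)zplver);
#endif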

#if defined(_KERNEL)
EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);

/* CSTYLED */
module_param(zfs_object_mutex_size, uint, 0644);
MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
module_param(zfs_unlink_suspend_progress, int, 0644);
MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
	"(debug - leaks space into the unlinked set)");
#endif