module/zfs/zfs_znode.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24 */
25
26 /* Portions Copyright 2007 Jeremy Teo */
27
28 #ifdef _KERNEL
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/time.h>
32 #include <sys/sysmacros.h>
33 #include <sys/mntent.h>
34 #include <sys/u8_textprep.h>
35 #include <sys/dsl_dataset.h>
36 #include <sys/vfs.h>
37 #include <sys/vnode.h>
38 #include <sys/file.h>
39 #include <sys/kmem.h>
40 #include <sys/errno.h>
41 #include <sys/mode.h>
42 #include <sys/atomic.h>
43 #include <sys/zfs_dir.h>
44 #include <sys/zfs_acl.h>
45 #include <sys/zfs_ioctl.h>
46 #include <sys/zfs_rlock.h>
47 #include <sys/zfs_fuid.h>
48 #include <sys/zfs_vnops.h>
49 #include <sys/zfs_ctldir.h>
50 #include <sys/dnode.h>
51 #include <sys/fs/zfs.h>
52 #include <sys/zpl.h>
53 #endif /* _KERNEL */
54
55 #include <sys/dmu.h>
56 #include <sys/dmu_objset.h>
57 #include <sys/dmu_tx.h>
58 #include <sys/refcount.h>
59 #include <sys/stat.h>
60 #include <sys/zap.h>
61 #include <sys/zfs_znode.h>
62 #include <sys/sa.h>
63 #include <sys/zfs_sa.h>
64 #include <sys/zfs_stat.h>
65
66 #include "zfs_prop.h"
67 #include "zfs_comutil.h"
68
69 /*
70 * Define ZNODE_STATS to turn on statistics gathering. By default, it is only
71 * turned on when DEBUG is also defined.
72 */
73 #ifdef DEBUG
74 #define ZNODE_STATS
75 #endif /* DEBUG */
76
77 #ifdef ZNODE_STATS
78 #define ZNODE_STAT_ADD(stat) ((stat)++)
79 #else
80 #define ZNODE_STAT_ADD(stat) /* nothing */
81 #endif /* ZNODE_STATS */
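/*
 * Editorial sketch (not upstream code): ZNODE_STAT_ADD() is intended to
 * wrap simple event counters. The names below are hypothetical; the
 * increment compiles away entirely when ZNODE_STATS is not defined.
 */
#if 0	/* example only */
static uint64_t znode_example_hits;

static void
znode_example_record_hit(void)
{
	ZNODE_STAT_ADD(znode_example_hits);	/* (stat)++ under ZNODE_STATS */
}
#endif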
82
83 /*
84 * Functions needed for userland (i.e., libzpool) are not put under
85 * #ifdef _KERNEL; the rest of the functions have dependencies
86 * (such as VFS logic) that will not compile easily in userland.
87 */
88 #ifdef _KERNEL
89
90 static kmem_cache_t *znode_cache = NULL;
91 static kmem_cache_t *znode_hold_cache = NULL;
92 unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
93
94 /*ARGSUSED*/
95 static int
96 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
97 {
98 znode_t *zp = buf;
99
100 inode_init_once(ZTOI(zp));
101 list_link_init(&zp->z_link_node);
102
103 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
104 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
105 rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
106 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
107 rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
108
109 zfs_rlock_init(&zp->z_range_lock);
110
111 zp->z_dirlocks = NULL;
112 zp->z_acl_cached = NULL;
113 zp->z_xattr_cached = NULL;
114 zp->z_xattr_parent = 0;
115 zp->z_moved = 0;
116 return (0);
117 }
118
119 /*ARGSUSED*/
120 static void
121 zfs_znode_cache_destructor(void *buf, void *arg)
122 {
123 znode_t *zp = buf;
124
125 ASSERT(!list_link_active(&zp->z_link_node));
126 mutex_destroy(&zp->z_lock);
127 rw_destroy(&zp->z_parent_lock);
128 rw_destroy(&zp->z_name_lock);
129 mutex_destroy(&zp->z_acl_lock);
130 rw_destroy(&zp->z_xattr_lock);
131 zfs_rlock_destroy(&zp->z_range_lock);
132
133 ASSERT(zp->z_dirlocks == NULL);
134 ASSERT(zp->z_acl_cached == NULL);
135 ASSERT(zp->z_xattr_cached == NULL);
136 }
137
138 static int
139 zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
140 {
141 znode_hold_t *zh = buf;
142
143 mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
144 refcount_create(&zh->zh_refcount);
145 zh->zh_obj = ZFS_NO_OBJECT;
146
147 return (0);
148 }
149
150 static void
151 zfs_znode_hold_cache_destructor(void *buf, void *arg)
152 {
153 znode_hold_t *zh = buf;
154
155 mutex_destroy(&zh->zh_lock);
156 refcount_destroy(&zh->zh_refcount);
157 }
158
159 void
160 zfs_znode_init(void)
161 {
162 /*
163 * Initialize the znode cache. The KMC_SLAB hint is used so that the
164 * cache is backed by the Linux slab (kmalloc()), which is required for
165 * wait_on_bit() operations on the embedded inode to work properly.
166 */
167 ASSERT(znode_cache == NULL);
168 znode_cache = kmem_cache_create("zfs_znode_cache",
169 sizeof (znode_t), 0, zfs_znode_cache_constructor,
170 zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
171
172 ASSERT(znode_hold_cache == NULL);
173 znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
174 sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
175 zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
176 }
177
178 void
179 zfs_znode_fini(void)
180 {
181 /*
182 * Clean up the znode caches.
183 */
184 if (znode_cache)
185 kmem_cache_destroy(znode_cache);
186 znode_cache = NULL;
187
188 if (znode_hold_cache)
189 kmem_cache_destroy(znode_hold_cache);
190 znode_hold_cache = NULL;
191 }
192
193 /*
194 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
195 * serialize access to a znode and its SA buffer while the object is being
196 * created or destroyed. This kind of locking would normally reside in the
197 * znode itself but in this case that's impossible because the znode and SA
198 * buffer may not yet exist. Therefore the locking is handled externally
199 * with an array of mutexes and AVL trees which contain per-object locks.
200 *
201 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
202 * into the correct AVL tree, and finally the per-object lock is held. In
203 * zfs_znode_hold_exit() the process is reversed. The per-object lock is
204 * released, removed from the AVL tree and destroyed if there are no waiters.
205 *
206 * This scheme has two important properties:
207 *
208 * 1) No memory allocations are performed while holding one of the z_hold_locks.
209 * This ensures evict(), which can be called from direct memory reclaim, will
210 * never block waiting on a z_hold_lock which just happens to have hashed
211 * to the same index.
212 *
213 * 2) All locks used to serialize access to an object are per-object and never
214 * shared. This minimizes lock contention without creating a large number
215 * of dedicated locks.
216 *
217 * On the downside it does require znode_hold_t structures to be frequently
218 * allocated and freed. However, because these are backed by a kmem cache
219 * and are very short-lived, this cost is minimal.
220 */
221 int
222 zfs_znode_hold_compare(const void *a, const void *b)
223 {
224 const znode_hold_t *zh_a = (const znode_hold_t *)a;
225 const znode_hold_t *zh_b = (const znode_hold_t *)b;
226
227 return (AVL_CMP(zh_a->zh_obj, zh_b->zh_obj));
228 }
229
230 boolean_t
231 zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
232 {
233 znode_hold_t *zh, search;
234 int i = ZFS_OBJ_HASH(zfsvfs, obj);
235 boolean_t held;
236
237 search.zh_obj = obj;
238
239 mutex_enter(&zfsvfs->z_hold_locks[i]);
240 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
241 held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
242 mutex_exit(&zfsvfs->z_hold_locks[i]);
243
244 return (held);
245 }
246
247 static znode_hold_t *
248 zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
249 {
250 znode_hold_t *zh, *zh_new, search;
251 int i = ZFS_OBJ_HASH(zfsvfs, obj);
252 boolean_t found = B_FALSE;
253
254 zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
255 zh_new->zh_obj = obj;
256 search.zh_obj = obj;
257
258 mutex_enter(&zfsvfs->z_hold_locks[i]);
259 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
260 if (likely(zh == NULL)) {
261 zh = zh_new;
262 avl_add(&zfsvfs->z_hold_trees[i], zh);
263 } else {
264 ASSERT3U(zh->zh_obj, ==, obj);
265 found = B_TRUE;
266 }
267 refcount_add(&zh->zh_refcount, NULL);
268 mutex_exit(&zfsvfs->z_hold_locks[i]);
269
270 if (found == B_TRUE)
271 kmem_cache_free(znode_hold_cache, zh_new);
272
273 ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
274 ASSERT3S(refcount_count(&zh->zh_refcount), >, 0);
275 mutex_enter(&zh->zh_lock);
276
277 return (zh);
278 }
279
280 static void
281 zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
282 {
283 int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
284 boolean_t remove = B_FALSE;
285
286 ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
287 ASSERT3S(refcount_count(&zh->zh_refcount), >, 0);
288 mutex_exit(&zh->zh_lock);
289
290 mutex_enter(&zfsvfs->z_hold_locks[i]);
291 if (refcount_remove(&zh->zh_refcount, NULL) == 0) {
292 avl_remove(&zfsvfs->z_hold_trees[i], zh);
293 remove = B_TRUE;
294 }
295 mutex_exit(&zfsvfs->z_hold_locks[i]);
296
297 if (remove == B_TRUE)
298 kmem_cache_free(znode_hold_cache, zh);
299 }
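/*
 * Editorial sketch (not upstream code): the caller pattern for the two
 * functions above, assuming a zfsvfs_t *zfsvfs and object number obj are
 * in scope. Real callers in this file (zfs_mknode(), zfs_zget(), etc.)
 * follow the same shape.
 */
#if 0	/* example only */
	znode_hold_t *zh;

	zh = zfs_znode_hold_enter(zfsvfs, obj);	/* per-object lock held */
	/* ... create or destroy the object and its SA buffer ... */
	zfs_znode_hold_exit(zfsvfs, zh);	/* dropped; freed if unused */
#endif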
300
301 int
302 zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
303 {
304 #ifdef HAVE_SMB_SHARE
305 zfs_acl_ids_t acl_ids;
306 vattr_t vattr;
307 znode_t *sharezp;
308 vnode_t *vp;
309 znode_t *zp;
310 int error;
311
312 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
313 vattr.va_mode = S_IFDIR | 0555;
314 vattr.va_uid = crgetuid(kcred);
315 vattr.va_gid = crgetgid(kcred);
316
317 sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
318 sharezp->z_moved = 0;
319 sharezp->z_unlinked = 0;
320 sharezp->z_atime_dirty = 0;
321 sharezp->z_zfsvfs = zfsvfs;
322 sharezp->z_is_sa = zfsvfs->z_use_sa;
323 sharezp->z_pflags = 0;
324
325 vp = ZTOV(sharezp);
326 vn_reinit(vp);
327 vp->v_type = VDIR;
328
329 VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
330 kcred, NULL, &acl_ids));
331 zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
332 ASSERT3P(zp, ==, sharezp);
333 ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
334 POINTER_INVALIDATE(&sharezp->z_zfsvfs);
335 error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
336 ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
337 zfsvfs->z_shares_dir = sharezp->z_id;
338
339 zfs_acl_ids_free(&acl_ids);
340 // ZTOV(sharezp)->v_count = 0;
341 sa_handle_destroy(sharezp->z_sa_hdl);
342 kmem_cache_free(znode_cache, sharezp);
343
344 return (error);
345 #else
346 return (0);
347 #endif /* HAVE_SMB_SHARE */
348 }
349
350 static void
351 zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
352 dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
353 {
354 ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
355
356 mutex_enter(&zp->z_lock);
357
358 ASSERT(zp->z_sa_hdl == NULL);
359 ASSERT(zp->z_acl_cached == NULL);
360 if (sa_hdl == NULL) {
361 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
362 SA_HDL_SHARED, &zp->z_sa_hdl));
363 } else {
364 zp->z_sa_hdl = sa_hdl;
365 sa_set_userp(sa_hdl, zp);
366 }
367
368 zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
369
370 mutex_exit(&zp->z_lock);
371 }
372
373 void
374 zfs_znode_dmu_fini(znode_t *zp)
375 {
376 ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked ||
377 RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));
378
379 sa_handle_destroy(zp->z_sa_hdl);
380 zp->z_sa_hdl = NULL;
381 }
382
383 /*
384 * Called by new_inode() to allocate a new inode.
385 */
386 int
387 zfs_inode_alloc(struct super_block *sb, struct inode **ip)
388 {
389 znode_t *zp;
390
391 zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
392 *ip = ZTOI(zp);
393
394 return (0);
395 }
396
397 /*
398 * Called in multiple places when an inode should be destroyed.
399 */
400 void
401 zfs_inode_destroy(struct inode *ip)
402 {
403 znode_t *zp = ITOZ(ip);
404 zfsvfs_t *zfsvfs = ZTOZSB(zp);
405
406 mutex_enter(&zfsvfs->z_znodes_lock);
407 if (list_link_active(&zp->z_link_node)) {
408 list_remove(&zfsvfs->z_all_znodes, zp);
409 zfsvfs->z_nr_znodes--;
410 }
411 mutex_exit(&zfsvfs->z_znodes_lock);
412
413 if (zp->z_acl_cached) {
414 zfs_acl_free(zp->z_acl_cached);
415 zp->z_acl_cached = NULL;
416 }
417
418 if (zp->z_xattr_cached) {
419 nvlist_free(zp->z_xattr_cached);
420 zp->z_xattr_cached = NULL;
421 }
422
423 kmem_cache_free(znode_cache, zp);
424 }
425
426 static void
427 zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
428 {
429 uint64_t rdev = 0;
430
431 switch (ip->i_mode & S_IFMT) {
432 case S_IFREG:
433 ip->i_op = &zpl_inode_operations;
434 ip->i_fop = &zpl_file_operations;
435 ip->i_mapping->a_ops = &zpl_address_space_operations;
436 break;
437
438 case S_IFDIR:
439 ip->i_op = &zpl_dir_inode_operations;
440 ip->i_fop = &zpl_dir_file_operations;
441 ITOZ(ip)->z_zn_prefetch = B_TRUE;
442 break;
443
444 case S_IFLNK:
445 ip->i_op = &zpl_symlink_inode_operations;
446 break;
447
448 /*
449 * rdev is only stored in the SA for device files.
450 */
451 case S_IFCHR:
452 case S_IFBLK:
453 (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
454 sizeof (rdev));
455 /*FALLTHROUGH*/
456 case S_IFIFO:
457 case S_IFSOCK:
458 init_special_inode(ip, ip->i_mode, rdev);
459 ip->i_op = &zpl_special_inode_operations;
460 break;
461
462 default:
463 zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
464 (u_longlong_t)ip->i_ino, ip->i_mode);
465
466 /* Assume the inode is a file and attempt to continue */
467 ip->i_mode = S_IFREG | 0644;
468 ip->i_op = &zpl_inode_operations;
469 ip->i_fop = &zpl_file_operations;
470 ip->i_mapping->a_ops = &zpl_address_space_operations;
471 break;
472 }
473 }
474
475 void
476 zfs_set_inode_flags(znode_t *zp, struct inode *ip)
477 {
478 /*
479 * Linux and Solaris have different sets of file attributes, so we
480 * restrict this conversion to the intersection of the two.
481 */
482 #ifdef HAVE_INODE_SET_FLAGS
483 unsigned int flags = 0;
484 if (zp->z_pflags & ZFS_IMMUTABLE)
485 flags |= S_IMMUTABLE;
486 if (zp->z_pflags & ZFS_APPENDONLY)
487 flags |= S_APPEND;
488
489 inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
490 #else
491 if (zp->z_pflags & ZFS_IMMUTABLE)
492 ip->i_flags |= S_IMMUTABLE;
493 else
494 ip->i_flags &= ~S_IMMUTABLE;
495
496 if (zp->z_pflags & ZFS_APPENDONLY)
497 ip->i_flags |= S_APPEND;
498 else
499 ip->i_flags &= ~S_APPEND;
500 #endif
501 }
502
503 /*
504 * Update the embedded inode given the znode. We should work toward
505 * eliminating this function as soon as possible by removing values
506 * which are duplicated between the znode and inode. If the generic
507 * inode has the correct field it should be used, and the ZFS code
508 * updated to access the inode. This can be done incrementally.
509 */
510 void
511 zfs_inode_update(znode_t *zp)
512 {
513 zfsvfs_t *zfsvfs;
514 struct inode *ip;
515 uint32_t blksize;
516 u_longlong_t i_blocks;
517
518 ASSERT(zp != NULL);
519 zfsvfs = ZTOZSB(zp);
520 ip = ZTOI(zp);
521
522 /* Skip .zfs control nodes which do not exist on disk. */
523 if (zfsctl_is_node(ip))
524 return;
525
526 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
527
528 spin_lock(&ip->i_lock);
529 ip->i_blocks = i_blocks;
530 i_size_write(ip, zp->z_size);
531 spin_unlock(&ip->i_lock);
532 }
533
534
535 /*
536 * Construct a znode+inode and initialize.
537 *
538 * This does not call dmu_set_user(); that is up to the
539 * caller, in case the znode is not going to be returned.
541 */
542 static znode_t *
543 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
544 dmu_object_type_t obj_type, uint64_t obj, sa_handle_t *hdl)
545 {
546 znode_t *zp;
547 struct inode *ip;
548 uint64_t mode;
549 uint64_t parent;
550 uint64_t tmp_gen;
551 uint64_t links;
552 uint64_t z_uid, z_gid;
553 uint64_t atime[2], mtime[2], ctime[2];
554 uint64_t projid = ZFS_DEFAULT_PROJID;
555 sa_bulk_attr_t bulk[11];
556 int count = 0;
557
558 ASSERT(zfsvfs != NULL);
559
560 ip = new_inode(zfsvfs->z_sb);
561 if (ip == NULL)
562 return (NULL);
563
564 zp = ITOZ(ip);
565 ASSERT(zp->z_dirlocks == NULL);
566 ASSERT3P(zp->z_acl_cached, ==, NULL);
567 ASSERT3P(zp->z_xattr_cached, ==, NULL);
568 zp->z_moved = 0;
569 zp->z_sa_hdl = NULL;
570 zp->z_unlinked = 0;
571 zp->z_atime_dirty = 0;
572 zp->z_mapcnt = 0;
573 zp->z_id = db->db_object;
574 zp->z_blksz = blksz;
575 zp->z_seq = 0x7A4653;
576 zp->z_sync_cnt = 0;
577 zp->z_is_mapped = B_FALSE;
578 zp->z_is_ctldir = B_FALSE;
579 zp->z_is_stale = B_FALSE;
580 zp->z_range_lock.zr_size = &zp->z_size;
581 zp->z_range_lock.zr_blksz = &zp->z_blksz;
582 zp->z_range_lock.zr_max_blksz = &ZTOZSB(zp)->z_max_blksz;
583
584 zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
585
586 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
587 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
588 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
589 &zp->z_size, 8);
590 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
591 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
592 &zp->z_pflags, 8);
593 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
594 &parent, 8);
595 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
596 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
597 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
598 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
599 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
600
601 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
602 (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
603 (zp->z_pflags & ZFS_PROJID) &&
604 sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
605 if (hdl == NULL)
606 sa_handle_destroy(zp->z_sa_hdl);
607 zp->z_sa_hdl = NULL;
608 goto error;
609 }
610
611 zp->z_projid = projid;
612 zp->z_mode = ip->i_mode = mode;
613 ip->i_generation = (uint32_t)tmp_gen;
614 ip->i_blkbits = SPA_MINBLOCKSHIFT;
615 set_nlink(ip, (uint32_t)links);
616 zfs_uid_write(ip, z_uid);
617 zfs_gid_write(ip, z_gid);
618 zfs_set_inode_flags(zp, ip);
619
620 /* Cache the xattr parent id */
621 if (zp->z_pflags & ZFS_XATTR)
622 zp->z_xattr_parent = parent;
623
624 ZFS_TIME_DECODE(&ip->i_atime, atime);
625 ZFS_TIME_DECODE(&ip->i_mtime, mtime);
626 ZFS_TIME_DECODE(&ip->i_ctime, ctime);
627
628 ip->i_ino = obj;
629 zfs_inode_update(zp);
630 zfs_inode_set_ops(zfsvfs, ip);
631
632 /*
633 * The only way insert_inode_locked() can fail is if the ip->i_ino
634 * number is already hashed for this super block. This can never
635 * happen because the inode numbers map 1:1 with the object numbers.
636 *
637 * The one exception is rolling back a mounted file system, but in
638 * this case all the active inodes are unhashed during the rollback.
639 */
640 VERIFY3S(insert_inode_locked(ip), ==, 0);
641
642 mutex_enter(&zfsvfs->z_znodes_lock);
643 list_insert_tail(&zfsvfs->z_all_znodes, zp);
644 zfsvfs->z_nr_znodes++;
645 membar_producer();
646 mutex_exit(&zfsvfs->z_znodes_lock);
647
648 unlock_new_inode(ip);
649 return (zp);
650
651 error:
652 iput(ip);
653 return (NULL);
654 }
655
656 /*
657 * Safely mark an inode dirty. Inodes which are part of a read-only
658 * file system or snapshot may not be dirtied.
659 */
660 void
661 zfs_mark_inode_dirty(struct inode *ip)
662 {
663 zfsvfs_t *zfsvfs = ITOZSB(ip);
664
665 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
666 return;
667
668 mark_inode_dirty(ip);
669 }
670
671 static uint64_t empty_xattr;
672 static uint64_t pad[4];
673 static zfs_acl_phys_t acl_phys;
674 /*
675 * Create a new DMU object to hold a zfs znode.
676 *
677 * IN: dzp - parent directory for new znode
678 * vap - file attributes for new znode
679 * tx - dmu transaction id for zap operations
680 * cr - credentials of caller
681 * flag - flags:
682 * IS_ROOT_NODE - new object will be root
683 * IS_XATTR - new object is an attribute
684 * bonuslen - length of bonus buffer
685 * setaclp - File/Dir initial ACL
686 * fuidp - Tracks fuid allocation.
687 *
688 * OUT: zpp - allocated znode
689 *
690 */
691 void
692 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
693 uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
694 {
695 uint64_t crtime[2], atime[2], mtime[2], ctime[2];
696 uint64_t mode, size, links, parent, pflags;
697 uint64_t projid = ZFS_DEFAULT_PROJID;
698 uint64_t rdev = 0;
699 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
700 dmu_buf_t *db;
701 inode_timespec_t now;
702 uint64_t gen, obj;
703 int bonuslen;
704 int dnodesize;
705 sa_handle_t *sa_hdl;
706 dmu_object_type_t obj_type;
707 sa_bulk_attr_t *sa_attrs;
708 int cnt = 0;
709 zfs_acl_locator_cb_t locate = { 0 };
710 znode_hold_t *zh;
711
712 if (zfsvfs->z_replay) {
713 obj = vap->va_nodeid;
714 now = vap->va_ctime; /* see zfs_replay_create() */
715 gen = vap->va_nblocks; /* ditto */
716 dnodesize = vap->va_fsid; /* ditto */
717 } else {
718 obj = 0;
719 gethrestime(&now);
720 gen = dmu_tx_get_txg(tx);
721 dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
722 }
723
724 if (dnodesize == 0)
725 dnodesize = DNODE_MIN_SIZE;
726
727 obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
728
729 bonuslen = (obj_type == DMU_OT_SA) ?
730 DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
731
732 /*
733 * Create a new DMU object.
734 */
735 /*
736 * There's currently no mechanism for pre-reading the blocks that will
737 * be needed to allocate a new object, so we accept the small chance
738 * that there will be an i/o error and we will fail one of the
739 * assertions below.
740 */
741 if (S_ISDIR(vap->va_mode)) {
742 if (zfsvfs->z_replay) {
743 VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
744 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
745 obj_type, bonuslen, dnodesize, tx));
746 } else {
747 obj = zap_create_norm_dnsize(zfsvfs->z_os,
748 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
749 obj_type, bonuslen, dnodesize, tx);
750 }
751 } else {
752 if (zfsvfs->z_replay) {
753 VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
754 DMU_OT_PLAIN_FILE_CONTENTS, 0,
755 obj_type, bonuslen, dnodesize, tx));
756 } else {
757 obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
758 DMU_OT_PLAIN_FILE_CONTENTS, 0,
759 obj_type, bonuslen, dnodesize, tx);
760 }
761 }
762
763 zh = zfs_znode_hold_enter(zfsvfs, obj);
764 VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
765
766 /*
767 * If this is the root, fix up the half-initialized parent pointer
768 * to reference the just-allocated physical data area.
769 */
770 if (flag & IS_ROOT_NODE) {
771 dzp->z_id = obj;
772 }
773
774 /*
775 * If parent is an xattr, so am I.
776 */
777 if (dzp->z_pflags & ZFS_XATTR) {
778 flag |= IS_XATTR;
779 }
780
781 if (zfsvfs->z_use_fuids)
782 pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
783 else
784 pflags = 0;
785
786 if (S_ISDIR(vap->va_mode)) {
787 size = 2; /* contents ("." and "..") */
788 links = 2;
789 } else {
790 size = 0;
791 links = (flag & IS_TMPFILE) ? 0 : 1;
792 }
793
794 if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
795 rdev = vap->va_rdev;
796
797 parent = dzp->z_id;
798 mode = acl_ids->z_mode;
799 if (flag & IS_XATTR)
800 pflags |= ZFS_XATTR;
801
802 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
803 /*
804 * With ZFS_PROJID flag, we can easily know whether there is
805 * project ID stored on disk or not. See zfs_space_delta_cb().
806 */
807 if (obj_type != DMU_OT_ZNODE &&
808 dmu_objset_projectquota_enabled(zfsvfs->z_os))
809 pflags |= ZFS_PROJID;
810
811 /*
812 * Inherit project ID from parent if required.
813 */
814 projid = zfs_inherit_projid(dzp);
815 if (dzp->z_pflags & ZFS_PROJINHERIT)
816 pflags |= ZFS_PROJINHERIT;
817 }
818
819 /*
820 * "No execs denied" will be determined when zfs_mode_compute() is called.
821 */
822 pflags |= acl_ids->z_aclp->z_hints &
823 (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
824 ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
825
826 ZFS_TIME_ENCODE(&now, crtime);
827 ZFS_TIME_ENCODE(&now, ctime);
828
829 if (vap->va_mask & ATTR_ATIME) {
830 ZFS_TIME_ENCODE(&vap->va_atime, atime);
831 } else {
832 ZFS_TIME_ENCODE(&now, atime);
833 }
834
835 if (vap->va_mask & ATTR_MTIME) {
836 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
837 } else {
838 ZFS_TIME_ENCODE(&now, mtime);
839 }
840
841 /* Now add in all of the "SA" attributes */
842 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
843 &sa_hdl));
844
845 /*
846 * Set up the array of attributes to be replaced/set on the new file.
847 *
848 * The order for DMU_OT_ZNODE is critical since it needs to be constructed
849 * in the old znode_phys_t format. Don't change this ordering.
850 */
851 sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
852
853 if (obj_type == DMU_OT_ZNODE) {
854 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
855 NULL, &atime, 16);
856 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
857 NULL, &mtime, 16);
858 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
859 NULL, &ctime, 16);
860 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
861 NULL, &crtime, 16);
862 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
863 NULL, &gen, 8);
864 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
865 NULL, &mode, 8);
866 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
867 NULL, &size, 8);
868 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
869 NULL, &parent, 8);
870 } else {
871 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
872 NULL, &mode, 8);
873 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
874 NULL, &size, 8);
875 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
876 NULL, &gen, 8);
877 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
878 NULL, &acl_ids->z_fuid, 8);
879 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
880 NULL, &acl_ids->z_fgid, 8);
881 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
882 NULL, &parent, 8);
883 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
884 NULL, &pflags, 8);
885 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
886 NULL, &atime, 16);
887 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
888 NULL, &mtime, 16);
889 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
890 NULL, &ctime, 16);
891 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
892 NULL, &crtime, 16);
893 }
894
895 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
896
897 if (obj_type == DMU_OT_ZNODE) {
898 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
899 &empty_xattr, 8);
900 } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
901 pflags & ZFS_PROJID) {
902 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
903 NULL, &projid, 8);
904 }
905 if (obj_type == DMU_OT_ZNODE ||
906 (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
907 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
908 NULL, &rdev, 8);
909 }
910 if (obj_type == DMU_OT_ZNODE) {
911 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
912 NULL, &pflags, 8);
913 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
914 &acl_ids->z_fuid, 8);
915 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
916 &acl_ids->z_fgid, 8);
917 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
918 sizeof (uint64_t) * 4);
919 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
920 &acl_phys, sizeof (zfs_acl_phys_t));
921 } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
922 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
923 &acl_ids->z_aclp->z_acl_count, 8);
924 locate.cb_aclp = acl_ids->z_aclp;
925 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
926 zfs_acl_data_locator, &locate,
927 acl_ids->z_aclp->z_acl_bytes);
928 mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
929 acl_ids->z_fuid, acl_ids->z_fgid);
930 }
931
932 VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
933
934 if (!(flag & IS_ROOT_NODE)) {
935 /*
936 * The call to zfs_znode_alloc() may fail if memory is low
937 * via the call path: alloc_inode() -> inode_init_always() ->
938 * security_inode_alloc() -> inode_alloc_security(). Since
939 * the existing code is written such that zfs_mknode() cannot
940 * fail, retry until sufficient memory has been reclaimed.
941 */
942 do {
943 *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, obj,
944 sa_hdl);
945 } while (*zpp == NULL);
946
947 VERIFY(*zpp != NULL);
948 VERIFY(dzp != NULL);
949 } else {
950 /*
951 * If we are creating the root node, the "parent" we
952 * passed in is the znode for the root.
953 */
954 *zpp = dzp;
955
956 (*zpp)->z_sa_hdl = sa_hdl;
957 }
958
959 (*zpp)->z_pflags = pflags;
960 (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
961 (*zpp)->z_dnodesize = dnodesize;
962 (*zpp)->z_projid = projid;
963
964 if (obj_type == DMU_OT_ZNODE ||
965 acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
966 VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
967 }
968 kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
969 zfs_znode_hold_exit(zfsvfs, zh);
970 }
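/*
 * Editorial sketch (not upstream code): the typical caller shape for
 * zfs_mknode(), loosely modeled on the create path. The directory znode
 * dzp, attributes vap, assigned transaction tx and credentials cr are
 * assumed to be in scope; transaction setup and ZIL logging are omitted.
 */
#if 0	/* example only */
	zfs_acl_ids_t acl_ids;
	znode_t *zp;

	VERIFY0(zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids));
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);	/* cannot fail */
	zfs_acl_ids_free(&acl_ids);
#endif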
971
972 /*
973 * Update in-core attributes. It is assumed the caller will be doing an
974 * sa_bulk_update to push the changes out.
975 */
976 void
977 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
978 {
979 xoptattr_t *xoap;
980 boolean_t update_inode = B_FALSE;
981
982 xoap = xva_getxoptattr(xvap);
983 ASSERT(xoap);
984
985 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
986 uint64_t times[2];
987 ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
988 (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
989 &times, sizeof (times), tx);
990 XVA_SET_RTN(xvap, XAT_CREATETIME);
991 }
992 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
993 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
994 zp->z_pflags, tx);
995 XVA_SET_RTN(xvap, XAT_READONLY);
996 }
997 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
998 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
999 zp->z_pflags, tx);
1000 XVA_SET_RTN(xvap, XAT_HIDDEN);
1001 }
1002 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1003 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
1004 zp->z_pflags, tx);
1005 XVA_SET_RTN(xvap, XAT_SYSTEM);
1006 }
1007 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1008 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
1009 zp->z_pflags, tx);
1010 XVA_SET_RTN(xvap, XAT_ARCHIVE);
1011 }
1012 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1013 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
1014 zp->z_pflags, tx);
1015 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
1016
1017 update_inode = B_TRUE;
1018 }
1019 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1020 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
1021 zp->z_pflags, tx);
1022 XVA_SET_RTN(xvap, XAT_NOUNLINK);
1023 }
1024 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1025 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
1026 zp->z_pflags, tx);
1027 XVA_SET_RTN(xvap, XAT_APPENDONLY);
1028
1029 update_inode = B_TRUE;
1030 }
1031 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1032 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
1033 zp->z_pflags, tx);
1034 XVA_SET_RTN(xvap, XAT_NODUMP);
1035 }
1036 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1037 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
1038 zp->z_pflags, tx);
1039 XVA_SET_RTN(xvap, XAT_OPAQUE);
1040 }
1041 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1042 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
1043 xoap->xoa_av_quarantined, zp->z_pflags, tx);
1044 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1045 }
1046 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1047 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
1048 zp->z_pflags, tx);
1049 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1050 }
1051 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
1052 zfs_sa_set_scanstamp(zp, xvap, tx);
1053 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
1054 }
1055 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1056 ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
1057 zp->z_pflags, tx);
1058 XVA_SET_RTN(xvap, XAT_REPARSE);
1059 }
1060 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1061 ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
1062 zp->z_pflags, tx);
1063 XVA_SET_RTN(xvap, XAT_OFFLINE);
1064 }
1065 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1066 ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
1067 zp->z_pflags, tx);
1068 XVA_SET_RTN(xvap, XAT_SPARSE);
1069 }
1070 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
1071 ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
1072 zp->z_pflags, tx);
1073 XVA_SET_RTN(xvap, XAT_PROJINHERIT);
1074 }
1075
1076 if (update_inode)
1077 zfs_set_inode_flags(zp, ZTOI(zp));
1078 }
1079
1080 int
1081 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
1082 {
1083 dmu_object_info_t doi;
1084 dmu_buf_t *db;
1085 znode_t *zp;
1086 znode_hold_t *zh;
1087 int err;
1088 sa_handle_t *hdl;
1089
1090 *zpp = NULL;
1091
1092 again:
1093 zh = zfs_znode_hold_enter(zfsvfs, obj_num);
1094
1095 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1096 if (err) {
1097 zfs_znode_hold_exit(zfsvfs, zh);
1098 return (err);
1099 }
1100
1101 dmu_object_info_from_db(db, &doi);
1102 if (doi.doi_bonus_type != DMU_OT_SA &&
1103 (doi.doi_bonus_type != DMU_OT_ZNODE ||
1104 (doi.doi_bonus_type == DMU_OT_ZNODE &&
1105 doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1106 sa_buf_rele(db, NULL);
1107 zfs_znode_hold_exit(zfsvfs, zh);
1108 return (SET_ERROR(EINVAL));
1109 }
1110
1111 hdl = dmu_buf_get_user(db);
1112 if (hdl != NULL) {
1113 zp = sa_get_userdata(hdl);
1114
1116 /*
1117 * Since "SA" does immediate eviction we
1118 * should never find a sa handle that doesn't
1119 * know about the znode.
1120 */
1121
1122 ASSERT3P(zp, !=, NULL);
1123
1124 mutex_enter(&zp->z_lock);
1125 ASSERT3U(zp->z_id, ==, obj_num);
1126 /*
1127 * If igrab() returns NULL the VFS has independently
1128 * determined the inode should be evicted and has
1129 * called iput_final() to start the eviction process.
1130 * The SA handle is still valid but because the VFS
1131 * requires that the eviction succeed we must drop
1132 * our locks and references to allow the eviction to
1133 * complete. The zfs_zget() may then be retried.
1134 *
1135 * This unlikely case could be optimized by registering
1136 * a sops->drop_inode() callback. The callback would
1137 * need to detect the active SA hold thereby informing
1138 * the VFS that this inode should not be evicted.
1139 */
1140 if (igrab(ZTOI(zp)) == NULL) {
1141 mutex_exit(&zp->z_lock);
1142 sa_buf_rele(db, NULL);
1143 zfs_znode_hold_exit(zfsvfs, zh);
1144 /* inode might need this to finish evict */
1145 cond_resched();
1146 goto again;
1147 }
1148 *zpp = zp;
1149 err = 0;
1150 mutex_exit(&zp->z_lock);
1151 sa_buf_rele(db, NULL);
1152 zfs_znode_hold_exit(zfsvfs, zh);
1153 return (err);
1154 }
1155
1156 /*
1157 * Not found; create a new znode/vnode, but only if the file exists.
1158 *
1159 * There is a small window where zfs_vget() could
1160 * find this object while a file create is still in
1161 * progress. This is checked for in zfs_znode_alloc().
1162 *
1163 * If zfs_znode_alloc() fails it will drop the hold on the
1164 * bonus buffer.
1165 */
1166 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
1167 doi.doi_bonus_type, obj_num, NULL);
1168 if (zp == NULL) {
1169 err = SET_ERROR(ENOENT);
1170 } else {
1171 *zpp = zp;
1172 }
1173 zfs_znode_hold_exit(zfsvfs, zh);
1174 return (err);
1175 }
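/*
 * Editorial sketch (not upstream code): zfs_zget() returns a znode whose
 * embedded inode holds a reference; the caller drops it with iput() when
 * done.
 */
#if 0	/* example only */
	znode_t *zp;

	if (zfs_zget(zfsvfs, obj_num, &zp) == 0) {
		/* ... use zp ... */
		iput(ZTOI(zp));
	}
#endif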
1176
1177 int
1178 zfs_rezget(znode_t *zp)
1179 {
1180 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1181 dmu_object_info_t doi;
1182 dmu_buf_t *db;
1183 uint64_t obj_num = zp->z_id;
1184 uint64_t mode;
1185 uint64_t links;
1186 sa_bulk_attr_t bulk[10];
1187 int err;
1188 int count = 0;
1189 uint64_t gen;
1190 uint64_t z_uid, z_gid;
1191 uint64_t atime[2], mtime[2], ctime[2];
1192 uint64_t projid = ZFS_DEFAULT_PROJID;
1193 znode_hold_t *zh;
1194
1195 /*
1196 * Skip ctldir znodes; otherwise they will always get invalidated. This
1197 * would cause odd behaviour for mounted snapdirs. In particular, on
1198 * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent
1199 * it from being automounted again as long as someone is still using
1200 * the detached mount.
1201 */
1202 if (zp->z_is_ctldir)
1203 return (0);
1204
1205 zh = zfs_znode_hold_enter(zfsvfs, obj_num);
1206
1207 mutex_enter(&zp->z_acl_lock);
1208 if (zp->z_acl_cached) {
1209 zfs_acl_free(zp->z_acl_cached);
1210 zp->z_acl_cached = NULL;
1211 }
1212 mutex_exit(&zp->z_acl_lock);
1213
1214 rw_enter(&zp->z_xattr_lock, RW_WRITER);
1215 if (zp->z_xattr_cached) {
1216 nvlist_free(zp->z_xattr_cached);
1217 zp->z_xattr_cached = NULL;
1218 }
1219 rw_exit(&zp->z_xattr_lock);
1220
1221 ASSERT(zp->z_sa_hdl == NULL);
1222 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1223 if (err) {
1224 zfs_znode_hold_exit(zfsvfs, zh);
1225 return (err);
1226 }
1227
1228 dmu_object_info_from_db(db, &doi);
1229 if (doi.doi_bonus_type != DMU_OT_SA &&
1230 (doi.doi_bonus_type != DMU_OT_ZNODE ||
1231 (doi.doi_bonus_type == DMU_OT_ZNODE &&
1232 doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1233 sa_buf_rele(db, NULL);
1234 zfs_znode_hold_exit(zfsvfs, zh);
1235 return (SET_ERROR(EINVAL));
1236 }
1237
1238 zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
1239
1240 /* reload cached values */
1241 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
1242 &gen, sizeof (gen));
1243 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1244 &zp->z_size, sizeof (zp->z_size));
1245 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
1246 &links, sizeof (links));
1247 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1248 &zp->z_pflags, sizeof (zp->z_pflags));
1249 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1250 &z_uid, sizeof (z_uid));
1251 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1252 &z_gid, sizeof (z_gid));
1253 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
1254 &mode, sizeof (mode));
1255 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
1256 &atime, 16);
1257 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
1258 &mtime, 16);
1259 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
1260 &ctime, 16);
1261
1262 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
1263 zfs_znode_dmu_fini(zp);
1264 zfs_znode_hold_exit(zfsvfs, zh);
1265 return (SET_ERROR(EIO));
1266 }
1267
1268 if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
1269 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
1270 &projid, 8);
1271 if (err != 0 && err != ENOENT) {
1272 zfs_znode_dmu_fini(zp);
1273 zfs_znode_hold_exit(zfsvfs, zh);
1274 return (SET_ERROR(err));
1275 }
1276 }
1277
1278 zp->z_projid = projid;
1279 zp->z_mode = ZTOI(zp)->i_mode = mode;
1280 zfs_uid_write(ZTOI(zp), z_uid);
1281 zfs_gid_write(ZTOI(zp), z_gid);
1282
1283 ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
1284 ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
1285 ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);
1286
1287 if (gen != ZTOI(zp)->i_generation) {
1288 zfs_znode_dmu_fini(zp);
1289 zfs_znode_hold_exit(zfsvfs, zh);
1290 return (SET_ERROR(EIO));
1291 }
1292
1293 set_nlink(ZTOI(zp), (uint32_t)links);
1294 zfs_set_inode_flags(zp, ZTOI(zp));
1295
1296 zp->z_blksz = doi.doi_data_block_size;
1297 zp->z_atime_dirty = 0;
1298 zfs_inode_update(zp);
1299
1300 /*
1301 * If the file has zero links, then it has been unlinked on the send
1302 * side and it must be in the received unlinked set.
1303 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
1304 * stale data and to prevent automatic removal of the file in
1305 * zfs_zinactive(). The file will be removed either when it is removed
1306 * on the send side and the next incremental stream is received or
1307 * when the unlinked set gets processed.
1308 */
1309 zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
1310 if (zp->z_unlinked)
1311 zfs_znode_dmu_fini(zp);
1312
1313 zfs_znode_hold_exit(zfsvfs, zh);
1314
1315 return (0);
1316 }
1317
1318 void
1319 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1320 {
1321 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1322 objset_t *os = zfsvfs->z_os;
1323 uint64_t obj = zp->z_id;
1324 uint64_t acl_obj = zfs_external_acl(zp);
1325 znode_hold_t *zh;
1326
1327 zh = zfs_znode_hold_enter(zfsvfs, obj);
1328 if (acl_obj) {
1329 VERIFY(!zp->z_is_sa);
1330 VERIFY(0 == dmu_object_free(os, acl_obj, tx));
1331 }
1332 VERIFY(0 == dmu_object_free(os, obj, tx));
1333 zfs_znode_dmu_fini(zp);
1334 zfs_znode_hold_exit(zfsvfs, zh);
1335 }
1336
1337 void
1338 zfs_zinactive(znode_t *zp)
1339 {
1340 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1341 uint64_t z_id = zp->z_id;
1342 znode_hold_t *zh;
1343
1344 ASSERT(zp->z_sa_hdl);
1345
1346 /*
1347 * Don't allow a zfs_zget() while we're trying to release this znode.
1348 */
1349 zh = zfs_znode_hold_enter(zfsvfs, z_id);
1350
1351 mutex_enter(&zp->z_lock);
1352
1353 /*
1354 * If this was the last reference to a file with no links, remove
1355 * the file from the file system unless the file system is mounted
1356 * read-only. That can happen, for example, if the file system was
1357 * originally read-write, the file was opened, then unlinked and
1358 * the file system was made read-only before the file was finally
1359 * closed. The file will remain in the unlinked set.
1360 */
1361 if (zp->z_unlinked) {
1362 ASSERT(!zfsvfs->z_issnap);
1363 if (!zfs_is_readonly(zfsvfs)) {
1364 mutex_exit(&zp->z_lock);
1365 zfs_znode_hold_exit(zfsvfs, zh);
1366 zfs_rmnode(zp);
1367 return;
1368 }
1369 }
1370
1371 mutex_exit(&zp->z_lock);
1372 zfs_znode_dmu_fini(zp);
1373
1374 zfs_znode_hold_exit(zfsvfs, zh);
1375 }
1376
1377 static inline int
1378 zfs_compare_timespec(struct timespec *t1, struct timespec *t2)
1379 {
1380 if (t1->tv_sec < t2->tv_sec)
1381 return (-1);
1382
1383 if (t1->tv_sec > t2->tv_sec)
1384 return (1);
1385
1386 return (t1->tv_nsec - t2->tv_nsec);
1387 }
1388
1389 /*
1390 * Prepare to update znode time stamps.
1391 *
1392 * IN: zp - znode requiring timestamp update
1393 * flag - ATTR_MTIME, ATTR_CTIME flags
1394 *
1395 * OUT: zp - z_seq
1396 * mtime - new mtime
1397 * ctime - new ctime
1398 *
1399 * Note: We don't update atime here, because we rely on the Linux VFS
1400 * to handle atime updates.
1401 */
1402 void
1403 zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
1404 uint64_t ctime[2])
1405 {
1406 inode_timespec_t now;
1407
1408 gethrestime(&now);
1409
1410 zp->z_seq++;
1411
1412 if (flag & ATTR_MTIME) {
1413 ZFS_TIME_ENCODE(&now, mtime);
1414 ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
1415 if (ZTOZSB(zp)->z_use_fuids) {
1416 zp->z_pflags |= (ZFS_ARCHIVE |
1417 ZFS_AV_MODIFIED);
1418 }
1419 }
1420
1421 if (flag & ATTR_CTIME) {
1422 ZFS_TIME_ENCODE(&now, ctime);
1423 ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime);
1424 if (ZTOZSB(zp)->z_use_fuids)
1425 zp->z_pflags |= ZFS_ARCHIVE;
1426 }
1427 }
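/*
 * Editorial sketch (not upstream code): callers pair this function with
 * an SA bulk update inside an assigned transaction, as zfs_freesp() does
 * later in this file; zp, zfsvfs and tx are assumed to be in scope.
 */
#if 0	/* example only */
	uint64_t mtime[2], ctime[2];
	sa_bulk_attr_t bulk[2];
	int count = 0;

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
	(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
#endif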
1428
1429 /*
1430 * Grow the block size for a file.
1431 *
1432 * IN: zp - znode of file whose block size is to grow.
1433 * size - requested block size
1434 * tx - open transaction.
1435 *
1436 * NOTE: this function assumes that the znode is write locked.
1437 */
1438 void
1439 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1440 {
1441 int error;
1442 u_longlong_t dummy;
1443
1444 if (size <= zp->z_blksz)
1445 return;
1446 /*
1447 * If the file size is already greater than the current blocksize,
1448 * we will not grow. If there is more than one block in a file,
1449 * the blocksize cannot change.
1450 */
1451 if (zp->z_blksz && zp->z_size > zp->z_blksz)
1452 return;
1453
1454 error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
1455 size, 0, tx);
1456
1457 if (error == ENOTSUP)
1458 return;
1459 ASSERT0(error);
1460
1461 /* What blocksize did we actually get? */
1462 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
1463 }
1464
1465 /*
1466 * Increase the file length
1467 *
1468 * IN: zp - znode of file to extend.
1469 * end - new end-of-file
1470 *
1471 * RETURN: 0 on success, error code on failure
1472 */
1473 static int
1474 zfs_extend(znode_t *zp, uint64_t end)
1475 {
1476 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1477 dmu_tx_t *tx;
1478 rl_t *rl;
1479 uint64_t newblksz;
1480 int error;
1481
1482 /*
1483 * We will change zp_size, lock the whole file.
1484 */
1485 rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
1486
1487 /*
1488 * Nothing to do if file already at desired length.
1489 */
1490 if (end <= zp->z_size) {
1491 zfs_range_unlock(rl);
1492 return (0);
1493 }
1494 tx = dmu_tx_create(zfsvfs->z_os);
1495 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1496 zfs_sa_upgrade_txholds(tx, zp);
1497 if (end > zp->z_blksz &&
1498 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1499 /*
1500 * We are growing the file past the current block size.
1501 */
1502 if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
1503 /*
1504 * File's blocksize is already larger than the
1505 * "recordsize" property. Only let it grow to
1506 * the next power of 2.
1507 */
1508 ASSERT(!ISP2(zp->z_blksz));
1509 newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
1510 } else {
1511 newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
1512 }
1513 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1514 } else {
1515 newblksz = 0;
1516 }
1517
1518 error = dmu_tx_assign(tx, TXG_WAIT);
1519 if (error) {
1520 dmu_tx_abort(tx);
1521 zfs_range_unlock(rl);
1522 return (error);
1523 }
1524
1525 if (newblksz)
1526 zfs_grow_blocksize(zp, newblksz, tx);
1527
1528 zp->z_size = end;
1529
1530 VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
1531 &zp->z_size, sizeof (zp->z_size), tx));
1532
1533 zfs_range_unlock(rl);
1534
1535 dmu_tx_commit(tx);
1536
1537 return (0);
1538 }
1539
1540 /*
1541 * zfs_zero_partial_page - Modeled after update_pages() but
1542 * with different arguments and semantics for use by zfs_freesp().
1543 *
1544 * Zeroes a piece of a single page cache entry for zp at offset
1545 * start and length len.
1546 *
1547 * Caller must acquire a range lock on the file for the region
1548 * being zeroed in order that the ARC and page cache stay in sync.
1549 */
1550 static void
1551 zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
1552 {
1553 struct address_space *mp = ZTOI(zp)->i_mapping;
1554 struct page *pp;
1555 int64_t off;
1556 void *pb;
1557
1558 ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));
1559
1560 off = start & (PAGE_SIZE - 1);
1561 start &= PAGE_MASK;
1562
1563 pp = find_lock_page(mp, start >> PAGE_SHIFT);
1564 if (pp) {
1565 if (mapping_writably_mapped(mp))
1566 flush_dcache_page(pp);
1567
1568 pb = kmap(pp);
1569 bzero(pb + off, len);
1570 kunmap(pp);
1571
1572 if (mapping_writably_mapped(mp))
1573 flush_dcache_page(pp);
1574
1575 mark_page_accessed(pp);
1576 SetPageUptodate(pp);
1577 ClearPageError(pp);
1578 unlock_page(pp);
1579 put_page(pp);
1580 }
1581 }
1582
1583 /*
1584 * Free space in a file.
1585 *
1586 * IN: zp - znode of file to free data in.
1587 * off - start of section to free.
1588 * len - length of section to free.
1589 *
1590 * RETURN: 0 on success, error code on failure
1591 */
1592 static int
1593 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1594 {
1595 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1596 rl_t *rl;
1597 int error;
1598
1599 /*
1600 * Lock the range being freed.
1601 */
1602 rl = zfs_range_lock(&zp->z_range_lock, off, len, RL_WRITER);
1603
1604 /*
1605 * Nothing to do if file already at desired length.
1606 */
1607 if (off >= zp->z_size) {
1608 zfs_range_unlock(rl);
1609 return (0);
1610 }
1611
1612 if (off + len > zp->z_size)
1613 len = zp->z_size - off;
1614
1615 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1616
1617 /*
1618 * Zero partial page cache entries. This must be done under a
1619 * range lock in order to keep the ARC and page cache in sync.
1620 */
1621 if (zp->z_is_mapped) {
1622 loff_t first_page, last_page, page_len;
1623 loff_t first_page_offset, last_page_offset;
1624
1625 /* first possible full page in hole */
1626 first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
1627 /* last page of hole */
1628 last_page = (off + len) >> PAGE_SHIFT;
1629
1630 /* offset of first_page */
1631 first_page_offset = first_page << PAGE_SHIFT;
1632 /* offset of last_page */
1633 last_page_offset = last_page << PAGE_SHIFT;
1634
1635 /* truncate whole pages */
1636 if (last_page_offset > first_page_offset) {
1637 truncate_inode_pages_range(ZTOI(zp)->i_mapping,
1638 first_page_offset, last_page_offset - 1);
1639 }
1640
1641 /* truncate sub-page ranges */
1642 if (first_page > last_page) {
1643 /* entire punched area within a single page */
1644 zfs_zero_partial_page(zp, off, len);
1645 } else {
1646 /* beginning of punched area at the end of a page */
1647 page_len = first_page_offset - off;
1648 if (page_len > 0)
1649 zfs_zero_partial_page(zp, off, page_len);
1650
1651 /* end of punched area at the beginning of a page */
1652 page_len = off + len - last_page_offset;
1653 if (page_len > 0)
1654 zfs_zero_partial_page(zp, last_page_offset,
1655 page_len);
1656 }
1657 }
1658 zfs_range_unlock(rl);
1659
1660 return (error);
1661 }
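/*
 * Editorial note (not upstream code): a worked example of the partial
 * page arithmetic above, assuming 4 KiB pages (PAGE_SHIFT == 12). For
 * off = 1000 and len = 10000:
 *
 *	first_page        = (1000 + 4095) >> 12 = 1
 *	last_page         = (1000 + 10000) >> 12 = 2
 *	first_page_offset = 1 << 12 = 4096
 *	last_page_offset  = 2 << 12 = 8192
 *
 * The whole pages in [4096, 8191] are truncated, then the head
 * [1000, 4095] and the tail [8192, 10999] are zeroed via
 * zfs_zero_partial_page().
 */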
1662
1663 /*
1664 * Truncate a file
1665 *
1666 * IN: zp - znode of file to free data in.
1667 * end - new end-of-file.
1668 *
1669 * RETURN: 0 on success, error code on failure
1670 */
1671 static int
1672 zfs_trunc(znode_t *zp, uint64_t end)
1673 {
1674 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1675 dmu_tx_t *tx;
1676 rl_t *rl;
1677 int error;
1678 sa_bulk_attr_t bulk[2];
1679 int count = 0;
1680
1681 /*
1682 * We will change zp_size, lock the whole file.
1683 */
1684 rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
1685
1686 /*
1687 * Nothing to do if file already at desired length.
1688 */
1689 if (end >= zp->z_size) {
1690 zfs_range_unlock(rl);
1691 return (0);
1692 }
1693
1694 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
1695 DMU_OBJECT_END);
1696 if (error) {
1697 zfs_range_unlock(rl);
1698 return (error);
1699 }
1700 tx = dmu_tx_create(zfsvfs->z_os);
1701 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1702 zfs_sa_upgrade_txholds(tx, zp);
1703 dmu_tx_mark_netfree(tx);
1704 error = dmu_tx_assign(tx, TXG_WAIT);
1705 if (error) {
1706 dmu_tx_abort(tx);
1707 zfs_range_unlock(rl);
1708 return (error);
1709 }
1710
1711 zp->z_size = end;
1712 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
1713 NULL, &zp->z_size, sizeof (zp->z_size));
1714
1715 if (end == 0) {
1716 zp->z_pflags &= ~ZFS_SPARSE;
1717 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1718 NULL, &zp->z_pflags, 8);
1719 }
1720 VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
1721
1722 dmu_tx_commit(tx);
1723
1724 zfs_range_unlock(rl);
1725
1726 return (0);
1727 }
1728
1729 /*
1730 * Free space in a file
1731 *
1732 * IN: zp - znode of file to free data in.
1733 * off - start of range
1734 * len - length of range (0 => free from off to EOF)
1735 * flag - current file open mode flags.
1736 * log - TRUE if this action should be logged
1737 *
1738 * RETURN: 0 on success, error code on failure
1739 */
1740 int
1741 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1742 {
1743 dmu_tx_t *tx;
1744 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1745 zilog_t *zilog = zfsvfs->z_log;
1746 uint64_t mode;
1747 uint64_t mtime[2], ctime[2];
1748 sa_bulk_attr_t bulk[3];
1749 int count = 0;
1750 int error;
1751
1752 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
1753 sizeof (mode))) != 0)
1754 return (error);
1755
1756 if (off > zp->z_size) {
1757 error = zfs_extend(zp, off+len);
1758 if (error == 0 && log)
1759 goto log;
1760 goto out;
1761 }
1762
1763 if (len == 0) {
1764 error = zfs_trunc(zp, off);
1765 } else {
1766 if ((error = zfs_free_range(zp, off, len)) == 0 &&
1767 off + len > zp->z_size)
1768 error = zfs_extend(zp, off+len);
1769 }
1770 if (error || !log)
1771 goto out;
1772 log:
1773 tx = dmu_tx_create(zfsvfs->z_os);
1774 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1775 zfs_sa_upgrade_txholds(tx, zp);
1776 error = dmu_tx_assign(tx, TXG_WAIT);
1777 if (error) {
1778 dmu_tx_abort(tx);
1779 goto out;
1780 }
1781
1782 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
1783 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
1784 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1785 NULL, &zp->z_pflags, 8);
1786 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
1787 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1788 ASSERT(error == 0);
1789
1790 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1791
1792 dmu_tx_commit(tx);
1793
1794 zfs_inode_update(zp);
1795 error = 0;
1796
1797 out:
1798 /*
1799 * Truncate the page cache - for file truncate operations, use
1800 * the purpose-built API for truncations. For punching operations,
1801 * the truncation is handled under a range lock in zfs_free_range.
1802 */
1803 if (len == 0)
1804 truncate_setsize(ZTOI(zp), off);
1805 return (error);
1806 }
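/*
 * Editorial sketch (not upstream code): how the cases above map onto
 * callers. A truncate (or extend) to 'size' passes len == 0; a hole
 * punch passes the exact range to free. The open-mode flag is passed as
 * 0 here purely for illustration.
 */
#if 0	/* example only */
	error = zfs_freesp(zp, size, 0, 0, B_TRUE);	/* truncate/extend */
	error = zfs_freesp(zp, off, len, 0, B_TRUE);	/* punch a hole */
#endif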
1807
1808 void
1809 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1810 {
1811 struct super_block *sb;
1812 zfsvfs_t *zfsvfs;
1813 uint64_t moid, obj, sa_obj, version;
1814 uint64_t sense = ZFS_CASE_SENSITIVE;
1815 uint64_t norm = 0;
1816 nvpair_t *elem;
1817 int size;
1818 int error;
1819 int i;
1820 znode_t *rootzp = NULL;
1821 vattr_t vattr;
1822 znode_t *zp;
1823 zfs_acl_ids_t acl_ids;
1824
1825 /*
1826 * First, attempt to create the master node.
1827 */
1828 /*
1829 * In an empty objset, there are no blocks to read and thus
1830 * there can be no i/o errors (which we assert below).
1831 */
1832 moid = MASTER_NODE_OBJ;
1833 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1834 DMU_OT_NONE, 0, tx);
1835 ASSERT(error == 0);
1836
1837 /*
1838 * Set starting attributes.
1839 */
1840 version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
1841 elem = NULL;
1842 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1843 /* For the moment we expect all zpl props to be uint64_ts */
1844 uint64_t val;
1845 char *name;
1846
1847 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1848 VERIFY(nvpair_value_uint64(elem, &val) == 0);
1849 name = nvpair_name(elem);
1850 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
1851 if (val < version)
1852 version = val;
1853 } else {
1854 error = zap_update(os, moid, name, 8, 1, &val, tx);
1855 }
1856 ASSERT(error == 0);
1857 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1858 norm = val;
1859 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1860 sense = val;
1861 }
1862 ASSERT(version != 0);
1863 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
1864
1865 /*
1866 * Create zap object used for SA attribute registration
1867 */
1868
1869 if (version >= ZPL_VERSION_SA) {
1870 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
1871 DMU_OT_NONE, 0, tx);
1872 error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
1873 ASSERT(error == 0);
1874 } else {
1875 sa_obj = 0;
1876 }
1877 /*
1878 * Create a delete queue.
1879 */
1880 obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
1881
1882 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
1883 ASSERT(error == 0);
1884
1885 /*
1886 * Create root znode. Create minimal znode/inode/zfsvfs/sb
1887 * to allow zfs_mknode to work.
1888 */
1889 vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
1890 vattr.va_mode = S_IFDIR|0755;
1891 vattr.va_uid = crgetuid(cr);
1892 vattr.va_gid = crgetgid(cr);
1893
1894 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1895 rootzp->z_moved = 0;
1896 rootzp->z_unlinked = 0;
1897 rootzp->z_atime_dirty = 0;
1898 rootzp->z_is_sa = USE_SA(version, os);
1899 rootzp->z_pflags = 0;
1900
1901 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1902 zfsvfs->z_os = os;
1903 zfsvfs->z_parent = zfsvfs;
1904 zfsvfs->z_version = version;
1905 zfsvfs->z_use_fuids = USE_FUIDS(version, os);
1906 zfsvfs->z_use_sa = USE_SA(version, os);
1907 zfsvfs->z_norm = norm;
1908
1909 sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
1910 sb->s_fs_info = zfsvfs;
1911
1912 ZTOI(rootzp)->i_sb = sb;
1913
1914 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
1915 &zfsvfs->z_attr_table);
1916
1917 ASSERT(error == 0);
1918
1919 /*
1920 * Fold case on file systems that are always or sometimes case
1921 * insensitive.
1922 */
1923 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
1924 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
1925
1926 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1927 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1928 offsetof(znode_t, z_link_node));
1929
1930 size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
1931 zfsvfs->z_hold_size = size;
1932 zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
1933 KM_SLEEP);
1934 zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
1935 for (i = 0; i != size; i++) {
1936 avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
1937 sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
1938 mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
1939 }
1940
1941 VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
1942 cr, NULL, &acl_ids));
1943 zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
1944 ASSERT3P(zp, ==, rootzp);
1945 error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1946 ASSERT(error == 0);
1947 zfs_acl_ids_free(&acl_ids);
1948
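	/*
	 * The temporary root znode was only needed so zfs_mknode() could
	 * run; drop its inode reference and free it.  A fully wired root
	 * znode is instantiated when the file system is mounted.
	 */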
1949 atomic_set(&ZTOI(rootzp)->i_count, 0);
1950 sa_handle_destroy(rootzp->z_sa_hdl);
1951 kmem_cache_free(znode_cache, rootzp);
1952
1953 /*
1954 * Create the shares directory.
1955 */
1956 error = zfs_create_share_dir(zfsvfs, tx);
1957 ASSERT(error == 0);
1958
1959 for (i = 0; i != size; i++) {
1960 avl_destroy(&zfsvfs->z_hold_trees[i]);
1961 mutex_destroy(&zfsvfs->z_hold_locks[i]);
1962 }
1963
1964 mutex_destroy(&zfsvfs->z_znodes_lock);
1965
1966 vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
1967 vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
1968 kmem_free(sb, sizeof (struct super_block));
1969 kmem_free(zfsvfs, sizeof (zfsvfs_t));
1970 }
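/*
 * Sketch of how zfs_create_fs() is reached (illustrative; the real
 * caller lives in zfs_ioctl.c): it runs as the creation callback that
 * the ioctl path hands to dmu_objset_create():
 *
 *	static void
 *	create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 *	{
 *		zfs_create_fs(os, cr, arg, tx);
 *	}
 */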
1971 #endif /* _KERNEL */
1972
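/*
 * Look up the SA master node of an unmounted objset and build its
 * attribute table.  ENOENT is expected on pre-SA (version < 5) file
 * systems, where attributes still live in the znode_phys_t bonus
 * buffer; sa_obj then remains 0 and sa_setup() uses the legacy layout.
 */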
1973 static int
1974 zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
1975 {
1976 uint64_t sa_obj = 0;
1977 int error;
1978
1979 error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
1980 if (error != 0 && error != ENOENT)
1981 return (error);
1982
1983 error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
1984 return (error);
1985 }
1986
1987 static int
1988 zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
1989 dmu_buf_t **db, void *tag)
1990 {
1991 dmu_object_info_t doi;
1992 int error;
1993
1994 if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
1995 return (error);
1996
1997 dmu_object_info_from_db(*db, &doi);
1998 if ((doi.doi_bonus_type != DMU_OT_SA &&
1999 doi.doi_bonus_type != DMU_OT_ZNODE) ||
2000 (doi.doi_bonus_type == DMU_OT_ZNODE &&
2001 doi.doi_bonus_size < sizeof (znode_phys_t))) {
2002 sa_buf_rele(*db, tag);
2003 return (SET_ERROR(ENOTSUP));
2004 }
2005
2006 error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
2007 if (error != 0) {
2008 sa_buf_rele(*db, tag);
2009 return (error);
2010 }
2011
2012 return (0);
2013 }
2014
2015 void
2016 zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
2017 {
2018 sa_handle_destroy(hdl);
2019 sa_buf_rele(db, tag);
2020 }
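/*
 * Typical pairing of the two helpers above (illustrative; attr and val
 * are hypothetical):
 *
 *	sa_handle_t *hdl;
 *	dmu_buf_t *db;
 *
 *	if ((error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG)) != 0)
 *		return (error);
 *	error = sa_lookup(hdl, attr, &val, sizeof (val));
 *	zfs_release_sa_handle(hdl, db, FTAG);
 */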
2021
2022 /*
2023 * Given an object number, return its parent object number and whether
2024 * or not the object is an extended attribute directory.
2025 */
2026 static int
2027 zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
2028 uint64_t *pobjp, int *is_xattrdir)
2029 {
2030 uint64_t parent;
2031 uint64_t pflags;
2032 uint64_t mode;
2033 uint64_t parent_mode;
2034 sa_bulk_attr_t bulk[3];
2035 sa_handle_t *sa_hdl;
2036 dmu_buf_t *sa_db;
2037 int count = 0;
2038 int error;
2039
2040 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
2041 &parent, sizeof (parent));
2042 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
2043 &pflags, sizeof (pflags));
2044 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2045 &mode, sizeof (mode));
2046
2047 if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
2048 return (error);
2049
2050 /*
2051 * When a link is removed, its parent pointer is not changed and may
2052 * therefore be invalid. There are two cases where a link is removed
2053 * but the file stays around: when it goes to the delete queue and
2054 * when there are additional links.
2055 */
2056 error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
2057 if (error != 0)
2058 return (error);
2059
2060 error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
2061 zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2062 if (error != 0)
2063 return (error);
2064
2065 *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
2066
2067 /*
2068 * An extended attribute directory can hang off a file, directory, or
2069 * any other object type; otherwise the parent must be a directory.
2070 */
2071 if (!*is_xattrdir && !S_ISDIR(parent_mode))
2072 return (SET_ERROR(EINVAL));
2073
2074 *pobjp = parent;
2075
2076 return (0);
2077 }
2078
2079 /*
2080 * Given an object number, return some ZPL-level statistics.
2081 */
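/*
 * All four attributes are fetched with a single sa_bulk_lookup() call
 * rather than four separate sa_lookup() calls, so the object's SA
 * layout only has to be decoded once.
 */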
2082 static int
2083 zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
2084 zfs_stat_t *sb)
2085 {
2086 sa_bulk_attr_t bulk[4];
2087 int count = 0;
2088
2089 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2090 &sb->zs_mode, sizeof (sb->zs_mode));
2091 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
2092 &sb->zs_gen, sizeof (sb->zs_gen));
2093 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
2094 &sb->zs_links, sizeof (sb->zs_links));
2095 SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
2096 &sb->zs_ctime, sizeof (sb->zs_ctime));
2097
2098 return (sa_bulk_lookup(hdl, bulk, count));
2099 }
2100
2101 static int
2102 zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
2103 sa_attr_type_t *sa_table, char *buf, int len)
2104 {
2105 sa_handle_t *sa_hdl;
2106 sa_handle_t *prevhdl = NULL;
2107 dmu_buf_t *prevdb = NULL;
2108 dmu_buf_t *sa_db = NULL;
2109 char *path = buf + len - 1;
2110 int error;
2111
2112 *path = '\0';
2113 sa_hdl = hdl;
2114
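	/*
	 * An object still on the delete queue (unlinked set) has no valid
	 * path, so report it as stale instead of walking a dead parent
	 * chain.
	 */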
2115 uint64_t deleteq_obj;
2116 VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
2117 ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
2118 error = zap_lookup_int(osp, deleteq_obj, obj);
2119 if (error == 0) {
2120 return (SET_ERROR(ESTALE));
2121 } else if (error != ENOENT) {
2122 return (error);
2123 }
2124 error = 0;
2125
2126 for (;;) {
2127 uint64_t pobj = 0;
2128 char component[MAXNAMELEN + 2];
2129 size_t complen;
2130 int is_xattrdir = 0;
2131
2132 if (prevdb)
2133 zfs_release_sa_handle(prevhdl, prevdb, FTAG);
2134
2135 if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
2136 &is_xattrdir)) != 0)
2137 break;
2138
2139 if (pobj == obj) {
2140 if (path[0] != '/')
2141 *--path = '/';
2142 break;
2143 }
2144
2145 component[0] = '/';
2146 if (is_xattrdir) {
2147 (void) sprintf(component + 1, "<xattrdir>");
2148 } else {
2149 error = zap_value_search(osp, pobj, obj,
2150 ZFS_DIRENT_OBJ(-1ULL), component + 1);
2151 if (error != 0)
2152 break;
2153 }
2154
2155 complen = strlen(component);
2156 path -= complen;
2157 ASSERT(path >= buf);
2158 bcopy(component, path, complen);
2159 obj = pobj;
2160
2161 if (sa_hdl != hdl) {
2162 prevhdl = sa_hdl;
2163 prevdb = sa_db;
2164 }
2165 error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
2166 if (error != 0) {
2167 sa_hdl = prevhdl;
2168 sa_db = prevdb;
2169 break;
2170 }
2171 }
2172
2173 if (sa_hdl != NULL && sa_hdl != hdl) {
2174 ASSERT(sa_db != NULL);
2175 zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2176 }
2177
2178 if (error == 0)
2179 (void) memmove(buf, path, buf + len - path);
2180
2181 return (error);
2182 }
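/*
 * The buffer is filled from the tail (names hypothetical): for an
 * object "c" living under /a/b, the loop above prepends "/c", then
 * "/b", then "/a", and the final memmove() slides the finished
 * "/a/b/c" to the front of buf.
 */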
2183
2184 int
2185 zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
2186 {
2187 sa_attr_type_t *sa_table;
2188 sa_handle_t *hdl;
2189 dmu_buf_t *db;
2190 int error;
2191
2192 error = zfs_sa_setup(osp, &sa_table);
2193 if (error != 0)
2194 return (error);
2195
2196 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2197 if (error != 0)
2198 return (error);
2199
2200 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2201
2202 zfs_release_sa_handle(hdl, db, FTAG);
2203 return (error);
2204 }
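/*
 * Illustrative call: resolve an object number to a name relative to
 * the dataset root:
 *
 *	char path[MAXPATHLEN];
 *
 *	error = zfs_obj_to_path(os, obj, path, sizeof (path));
 *
 * On success, path holds a string such as "/dir/file".
 */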
2205
2206 int
2207 zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
2208 char *buf, int len)
2209 {
2210 char *path = buf + len - 1;
2211 sa_attr_type_t *sa_table;
2212 sa_handle_t *hdl;
2213 dmu_buf_t *db;
2214 int error;
2215
2216 *path = '\0';
2217
2218 error = zfs_sa_setup(osp, &sa_table);
2219 if (error != 0)
2220 return (error);
2221
2222 error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2223 if (error != 0)
2224 return (error);
2225
2226 error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
2227 if (error != 0) {
2228 zfs_release_sa_handle(hdl, db, FTAG);
2229 return (error);
2230 }
2231
2232 error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2233
2234 zfs_release_sa_handle(hdl, db, FTAG);
2235 return (error);
2236 }
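/*
 * Both entry points above are consumed by the object-to-name ioctls
 * (e.g. those backing "zfs diff"), which pass in a caller-sized buffer
 * and propagate any error returned here.
 */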
2237
2238 #if defined(_KERNEL)
2239 EXPORT_SYMBOL(zfs_create_fs);
2240 EXPORT_SYMBOL(zfs_obj_to_path);
2241
2242 /* CSTYLED */
2243 module_param(zfs_object_mutex_size, uint, 0644);
2244 MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
2245 #endif