/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 */

/* Portions Copyright 2007 Jeremy Teo */

#ifdef _KERNEL
#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/mntent.h>
#include <sys/mkdev.h>
#include <sys/u8_textprep.h>
#include <sys/dsl_dataset.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/mode.h>
#include <sys/atomic.h>
#include <vm/pvn.h>
#include "fs/fs_subr.h"
#include <sys/zfs_dir.h>
#include <sys/zfs_acl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_vnops.h>
#include <sys/zfs_ctldir.h>
#include <sys/dnode.h>
#include <sys/fs/zfs.h>
#include <sys/kidmap.h>
#include <sys/zpl.h>
#endif /* _KERNEL */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/zfs_znode.h>
#include <sys/sa.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_stat.h>

#include "zfs_prop.h"
#include "zfs_comutil.h"

/*
 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
 * turned on when DEBUG is also defined.
 */
#ifdef DEBUG
#define	ZNODE_STATS
#endif /* DEBUG */

#ifdef ZNODE_STATS
#define	ZNODE_STAT_ADD(stat)	((stat)++)
#else
#define	ZNODE_STAT_ADD(stat)	/* nothing */
#endif /* ZNODE_STATS */
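
/*
 * Illustrative use of ZNODE_STAT_ADD (a sketch; "znode_relink_count" is a
 * hypothetical counter, not one defined in this file). The call compiles
 * away entirely when ZNODE_STATS is not defined:
 *
 *	static uint64_t znode_relink_count;
 *	...
 *	ZNODE_STAT_ADD(znode_relink_count);
 */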

/*
 * Functions needed for userland (i.e. libzpool) are not put under
 * #ifdef _KERNEL; the rest of the functions have dependencies
 * (such as VFS logic) that will not compile easily in userland.
 */
#ifdef _KERNEL

static kmem_cache_t *znode_cache = NULL;
static kmem_cache_t *znode_hold_cache = NULL;
unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;

/*ARGSUSED*/
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
	znode_t *zp = buf;

	inode_init_once(ZTOI(zp));
	list_link_init(&zp->z_link_node);

	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
	rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);

	zfs_rlock_init(&zp->z_range_lock);

	zp->z_dirlocks = NULL;
	zp->z_acl_cached = NULL;
	zp->z_xattr_cached = NULL;
	zp->z_xattr_parent = 0;
	zp->z_moved = 0;
	return (0);
}

/*ARGSUSED*/
static void
zfs_znode_cache_destructor(void *buf, void *arg)
{
	znode_t *zp = buf;

	ASSERT(!list_link_active(&zp->z_link_node));
	mutex_destroy(&zp->z_lock);
	rw_destroy(&zp->z_parent_lock);
	rw_destroy(&zp->z_name_lock);
	mutex_destroy(&zp->z_acl_lock);
	rw_destroy(&zp->z_xattr_lock);
	zfs_rlock_destroy(&zp->z_range_lock);

	ASSERT(zp->z_dirlocks == NULL);
	ASSERT(zp->z_acl_cached == NULL);
	ASSERT(zp->z_xattr_cached == NULL);
}

static int
zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
{
	znode_hold_t *zh = buf;

	mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
	refcount_create(&zh->zh_refcount);
	zh->zh_obj = ZFS_NO_OBJECT;

	return (0);
}

static void
zfs_znode_hold_cache_destructor(void *buf, void *arg)
{
	znode_hold_t *zh = buf;

	mutex_destroy(&zh->zh_lock);
	refcount_destroy(&zh->zh_refcount);
}

void
zfs_znode_init(void)
{
	/*
	 * Initialize zcache. The KMC_SLAB hint is used so that the cache
	 * is backed by kmalloc() when on the Linux slab, which in turn
	 * lets any wait_on_bit() operations on the related inode operate
	 * properly.
	 */
	ASSERT(znode_cache == NULL);
	znode_cache = kmem_cache_create("zfs_znode_cache",
	    sizeof (znode_t), 0, zfs_znode_cache_constructor,
	    zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);

	ASSERT(znode_hold_cache == NULL);
	znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
	    sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
	    zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
}

void
zfs_znode_fini(void)
{
	/*
	 * Cleanup zcache
	 */
	if (znode_cache)
		kmem_cache_destroy(znode_cache);
	znode_cache = NULL;

	if (znode_hold_cache)
		kmem_cache_destroy(znode_hold_cache);
	znode_hold_cache = NULL;
}

/*
 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
 * serialize access to a znode and its SA buffer while the object is being
 * created or destroyed. This kind of locking would normally reside in the
 * znode itself but in this case that's impossible because the znode and SA
 * buffer may not yet exist. Therefore the locking is handled externally
 * with an array of mutexes and AVL trees which contain per-object locks.
 *
 * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
 * into the correct AVL tree and finally the per-object lock is held. In
 * zfs_znode_hold_exit() the process is reversed. The per-object lock is
 * released, removed from the AVL tree and destroyed if there are no waiters.
 *
 * This scheme has two important properties:
 *
 * 1) No memory allocations are performed while holding one of the
 *    z_hold_locks. This ensures evict(), which can be called from direct
 *    memory reclaim, will never block waiting on a z_hold_locks which just
 *    happens to have hashed to the same index.
 *
 * 2) All locks used to serialize access to an object are per-object and
 *    never shared. This minimizes lock contention without creating a large
 *    number of dedicated locks.
 *
 * On the downside it does require znode_hold_t structures to be frequently
 * allocated and freed. However, because these are backed by a kmem cache
 * and very short-lived this cost is minimal.
 */
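
/*
 * Typical usage of the hold functions (a sketch; see zfs_znode_delete()
 * below for a real caller). The hold serializes object creation and
 * destruction without requiring the znode itself to exist:
 *
 *	znode_hold_t *zh;
 *
 *	zh = zfs_znode_hold_enter(zfsvfs, obj);
 *	... create or destroy the object's znode and SA buffer ...
 *	zfs_znode_hold_exit(zfsvfs, zh);
 */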
int
zfs_znode_hold_compare(const void *a, const void *b)
{
	const znode_hold_t *zh_a = (const znode_hold_t *)a;
	const znode_hold_t *zh_b = (const znode_hold_t *)b;

	return (AVL_CMP(zh_a->zh_obj, zh_b->zh_obj));
}

boolean_t
zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_hold_t *zh, search;
	int i = ZFS_OBJ_HASH(zfsvfs, obj);
	boolean_t held;

	search.zh_obj = obj;

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
	held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	return (held);
}

static znode_hold_t *
zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
{
	znode_hold_t *zh, *zh_new, search;
	int i = ZFS_OBJ_HASH(zfsvfs, obj);
	boolean_t found = B_FALSE;

	zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
	zh_new->zh_obj = obj;
	search.zh_obj = obj;

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
	if (likely(zh == NULL)) {
		zh = zh_new;
		avl_add(&zfsvfs->z_hold_trees[i], zh);
	} else {
		ASSERT3U(zh->zh_obj, ==, obj);
		found = B_TRUE;
	}
	refcount_add(&zh->zh_refcount, NULL);
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	if (found == B_TRUE)
		kmem_cache_free(znode_hold_cache, zh_new);

	ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
	ASSERT3S(refcount_count(&zh->zh_refcount), >, 0);
	mutex_enter(&zh->zh_lock);

	return (zh);
}

static void
zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
{
	int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
	boolean_t remove = B_FALSE;

	ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
	ASSERT3S(refcount_count(&zh->zh_refcount), >, 0);
	mutex_exit(&zh->zh_lock);

	mutex_enter(&zfsvfs->z_hold_locks[i]);
	if (refcount_remove(&zh->zh_refcount, NULL) == 0) {
		avl_remove(&zfsvfs->z_hold_trees[i], zh);
		remove = B_TRUE;
	}
	mutex_exit(&zfsvfs->z_hold_locks[i]);

	if (remove == B_TRUE)
		kmem_cache_free(znode_hold_cache, zh);
}

int
zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
#ifdef HAVE_SMB_SHARE
	zfs_acl_ids_t acl_ids;
	vattr_t vattr;
	znode_t *sharezp;
	vnode_t *vp;
	znode_t *zp;
	int error;

	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
	vattr.va_mode = S_IFDIR | 0555;
	vattr.va_uid = crgetuid(kcred);
	vattr.va_gid = crgetgid(kcred);

	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	sharezp->z_moved = 0;
	sharezp->z_unlinked = 0;
	sharezp->z_atime_dirty = 0;
	sharezp->z_zfsvfs = zfsvfs;
	sharezp->z_is_sa = zfsvfs->z_use_sa;

	vp = ZTOV(sharezp);
	vn_reinit(vp);
	vp->v_type = VDIR;

	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
	    kcred, NULL, &acl_ids));
	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
	ASSERT3P(zp, ==, sharezp);
	ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
	zfsvfs->z_shares_dir = sharezp->z_id;

	zfs_acl_ids_free(&acl_ids);
	// ZTOV(sharezp)->v_count = 0;
	sa_handle_destroy(sharezp->z_sa_hdl);
	kmem_cache_free(znode_cache, sharezp);

	return (error);
#else
	return (0);
#endif /* HAVE_SMB_SHARE */
}

static void
zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
    dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
{
	ASSERT(zfs_znode_held(zfsvfs, zp->z_id));

	mutex_enter(&zp->z_lock);

	ASSERT(zp->z_sa_hdl == NULL);
	ASSERT(zp->z_acl_cached == NULL);
	if (sa_hdl == NULL) {
		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
		    SA_HDL_SHARED, &zp->z_sa_hdl));
	} else {
		zp->z_sa_hdl = sa_hdl;
		sa_set_userp(sa_hdl, zp);
	}

	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;

	mutex_exit(&zp->z_lock);
}

void
zfs_znode_dmu_fini(znode_t *zp)
{
	ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked ||
	    RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));

	sa_handle_destroy(zp->z_sa_hdl);
	zp->z_sa_hdl = NULL;
}

/*
 * Called by new_inode() to allocate a new inode.
 */
int
zfs_inode_alloc(struct super_block *sb, struct inode **ip)
{
	znode_t *zp;

	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	*ip = ZTOI(zp);

	return (0);
}

/*
 * Called in multiple places when an inode should be destroyed.
 */
void
zfs_inode_destroy(struct inode *ip)
{
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ZTOZSB(zp);

	mutex_enter(&zfsvfs->z_znodes_lock);
	if (list_link_active(&zp->z_link_node)) {
		list_remove(&zfsvfs->z_all_znodes, zp);
		zfsvfs->z_nr_znodes--;
	}
	mutex_exit(&zfsvfs->z_znodes_lock);

	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}

	if (zp->z_xattr_cached) {
		nvlist_free(zp->z_xattr_cached);
		zp->z_xattr_cached = NULL;
	}

	kmem_cache_free(znode_cache, zp);
}

static void
zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
{
	uint64_t rdev = 0;

	switch (ip->i_mode & S_IFMT) {
	case S_IFREG:
		ip->i_op = &zpl_inode_operations;
		ip->i_fop = &zpl_file_operations;
		ip->i_mapping->a_ops = &zpl_address_space_operations;
		break;

	case S_IFDIR:
		ip->i_op = &zpl_dir_inode_operations;
		ip->i_fop = &zpl_dir_file_operations;
		ITOZ(ip)->z_zn_prefetch = B_TRUE;
		break;

	case S_IFLNK:
		ip->i_op = &zpl_symlink_inode_operations;
		break;

	/*
	 * rdev is only stored in a SA for device files.
	 */
	case S_IFCHR:
	case S_IFBLK:
		(void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
		    sizeof (rdev));
		/*FALLTHROUGH*/
	case S_IFIFO:
	case S_IFSOCK:
		init_special_inode(ip, ip->i_mode, rdev);
		ip->i_op = &zpl_special_inode_operations;
		break;

	default:
		zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
		    (u_longlong_t)ip->i_ino, ip->i_mode);

		/* Assume the inode is a file and attempt to continue */
		ip->i_mode = S_IFREG | 0644;
		ip->i_op = &zpl_inode_operations;
		ip->i_fop = &zpl_file_operations;
		ip->i_mapping->a_ops = &zpl_address_space_operations;
		break;
	}
}

void
zfs_set_inode_flags(znode_t *zp, struct inode *ip)
{
	/*
	 * Linux and Solaris have different sets of file attributes, so we
	 * restrict this conversion to the intersection of the two.
	 */
#ifdef HAVE_INODE_SET_FLAGS
	unsigned int flags = 0;
	if (zp->z_pflags & ZFS_IMMUTABLE)
		flags |= S_IMMUTABLE;
	if (zp->z_pflags & ZFS_APPENDONLY)
		flags |= S_APPEND;

	inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
#else
	if (zp->z_pflags & ZFS_IMMUTABLE)
		ip->i_flags |= S_IMMUTABLE;
	else
		ip->i_flags &= ~S_IMMUTABLE;

	if (zp->z_pflags & ZFS_APPENDONLY)
		ip->i_flags |= S_APPEND;
	else
		ip->i_flags &= ~S_APPEND;
#endif
}

/*
 * Update the embedded inode given the znode. We should work toward
 * eliminating this function as soon as possible by removing values
 * which are duplicated between the znode and inode. If the generic
 * inode has the correct field it should be used, and the ZFS code
 * updated to access the inode. This can be done incrementally.
 */
void
zfs_inode_update(znode_t *zp)
{
	zfsvfs_t *zfsvfs;
	struct inode *ip;
	uint32_t blksize;
	u_longlong_t i_blocks;

	ASSERT(zp != NULL);
	zfsvfs = ZTOZSB(zp);
	ip = ZTOI(zp);

	/* Skip .zfs control nodes which do not exist on disk. */
	if (zfsctl_is_node(ip))
		return;

	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);

	spin_lock(&ip->i_lock);
	ip->i_blocks = i_blocks;
	i_size_write(ip, zp->z_size);
	spin_unlock(&ip->i_lock);
}


/*
 * Construct a znode+inode and initialize.
 *
 * This does not call dmu_set_user(); that is left to the caller,
 * in case the znode is not to be returned.
 */
static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
    dmu_object_type_t obj_type, uint64_t obj, sa_handle_t *hdl)
{
	znode_t *zp;
	struct inode *ip;
	uint64_t mode;
	uint64_t parent;
	uint64_t tmp_gen;
	uint64_t links;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2];
	sa_bulk_attr_t bulk[11];
	int count = 0;

	ASSERT(zfsvfs != NULL);

	ip = new_inode(zfsvfs->z_sb);
	if (ip == NULL)
		return (NULL);

	zp = ITOZ(ip);
	ASSERT(zp->z_dirlocks == NULL);
	ASSERT3P(zp->z_acl_cached, ==, NULL);
	ASSERT3P(zp->z_xattr_cached, ==, NULL);
	zp->z_moved = 0;
	zp->z_sa_hdl = NULL;
	zp->z_unlinked = 0;
	zp->z_atime_dirty = 0;
	zp->z_mapcnt = 0;
	zp->z_id = db->db_object;
	zp->z_blksz = blksz;
	zp->z_seq = 0x7A4653;
	zp->z_sync_cnt = 0;
	zp->z_is_mapped = B_FALSE;
	zp->z_is_ctldir = B_FALSE;
	zp->z_is_stale = B_FALSE;
	zp->z_range_lock.zr_size = &zp->z_size;
	zp->z_range_lock.zr_blksz = &zp->z_blksz;
	zp->z_range_lock.zr_max_blksz = &ZTOZSB(zp)->z_max_blksz;

	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
	    &parent, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0) {
		if (hdl == NULL)
			sa_handle_destroy(zp->z_sa_hdl);
		zp->z_sa_hdl = NULL;
		goto error;
	}

	zp->z_mode = ip->i_mode = mode;
	ip->i_generation = (uint32_t)tmp_gen;
	ip->i_blkbits = SPA_MINBLOCKSHIFT;
	set_nlink(ip, (uint32_t)links);
	zfs_uid_write(ip, z_uid);
	zfs_gid_write(ip, z_gid);
	zfs_set_inode_flags(zp, ip);

	/* Cache the xattr parent id */
	if (zp->z_pflags & ZFS_XATTR)
		zp->z_xattr_parent = parent;

	ZFS_TIME_DECODE(&ip->i_atime, atime);
	ZFS_TIME_DECODE(&ip->i_mtime, mtime);
	ZFS_TIME_DECODE(&ip->i_ctime, ctime);

	ip->i_ino = obj;
	zfs_inode_update(zp);
	zfs_inode_set_ops(zfsvfs, ip);

	/*
	 * The only way insert_inode_locked() can fail is if the ip->i_ino
	 * number is already hashed for this super block. This can never
	 * happen because the inode numbers map 1:1 with the object numbers.
	 *
	 * The one exception is rolling back a mounted file system, but in
	 * this case all the active inodes are unhashed during the rollback.
	 */
	VERIFY3S(insert_inode_locked(ip), ==, 0);

	mutex_enter(&zfsvfs->z_znodes_lock);
	list_insert_tail(&zfsvfs->z_all_znodes, zp);
	zfsvfs->z_nr_znodes++;
	membar_producer();
	mutex_exit(&zfsvfs->z_znodes_lock);

	unlock_new_inode(ip);
	return (zp);

error:
	iput(ip);
	return (NULL);
}

/*
 * Safely mark an inode dirty. Inodes which are part of a read-only
 * file system or snapshot may not be dirtied.
 */
void
zfs_mark_inode_dirty(struct inode *ip)
{
	zfsvfs_t *zfsvfs = ITOZSB(ip);

	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
		return;

	mark_inode_dirty(ip);
}

static uint64_t empty_xattr;
static uint64_t pad[4];
static zfs_acl_phys_t acl_phys;
/*
 * Create a new DMU object to hold a zfs znode.
 *
 *	IN:	dzp	- parent directory for new znode
 *		vap	- file attributes for new znode
 *		tx	- dmu transaction id for zap operations
 *		cr	- credentials of caller
 *		flag	- flags:
 *			  IS_ROOT_NODE	- new object will be root
 *			  IS_XATTR	- new object is an attribute
 *		bonuslen - length of bonus buffer
 *		setaclp  - File/Dir initial ACL
 *		fuidp	 - Tracks fuid allocation.
 *
 *	OUT:	zpp	- allocated znode
 *
 */
void
zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
{
	uint64_t crtime[2], atime[2], mtime[2], ctime[2];
	uint64_t mode, size, links, parent, pflags;
	uint64_t dzp_pflags = 0;
	uint64_t rdev = 0;
	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
	dmu_buf_t *db;
	inode_timespec_t now;
	uint64_t gen, obj;
	int bonuslen;
	int dnodesize;
	sa_handle_t *sa_hdl;
	dmu_object_type_t obj_type;
	sa_bulk_attr_t *sa_attrs;
	int cnt = 0;
	zfs_acl_locator_cb_t locate = { 0 };
	znode_hold_t *zh;

	if (zfsvfs->z_replay) {
		obj = vap->va_nodeid;
		now = vap->va_ctime;		/* see zfs_replay_create() */
		gen = vap->va_nblocks;		/* ditto */
		dnodesize = vap->va_fsid;	/* ditto */
	} else {
		obj = 0;
		gethrestime(&now);
		gen = dmu_tx_get_txg(tx);
		dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
	}

	if (dnodesize == 0)
		dnodesize = DNODE_MIN_SIZE;

	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;

	bonuslen = (obj_type == DMU_OT_SA) ?
	    DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;

	/*
	 * Create a new DMU object.
	 */
	/*
	 * There's currently no mechanism for pre-reading the blocks that will
	 * be needed to allocate a new object, so we accept the small chance
	 * that there will be an i/o error and we will fail one of the
	 * assertions below.
	 */
	if (S_ISDIR(vap->va_mode)) {
		if (zfsvfs->z_replay) {
			VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = zap_create_norm_dnsize(zfsvfs->z_os,
			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
			    obj_type, bonuslen, dnodesize, tx);
		}
	} else {
		if (zfsvfs->z_replay) {
			VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx));
		} else {
			obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
			    obj_type, bonuslen, dnodesize, tx);
		}
	}

	zh = zfs_znode_hold_enter(zfsvfs, obj);
	VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));

	/*
	 * If this is the root, fix up the half-initialized parent pointer
	 * to reference the just-allocated physical data area.
	 */
	if (flag & IS_ROOT_NODE) {
		dzp->z_id = obj;
	} else {
		dzp_pflags = dzp->z_pflags;
	}

	/*
	 * If parent is an xattr, so am I.
	 */
	if (dzp_pflags & ZFS_XATTR) {
		flag |= IS_XATTR;
	}

	if (zfsvfs->z_use_fuids)
		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
	else
		pflags = 0;

	if (S_ISDIR(vap->va_mode)) {
		size = 2;		/* contents ("." and "..") */
		links = 2;
	} else {
		size = 0;
		links = (flag & IS_TMPFILE) ? 0 : 1;
	}

	if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
		rdev = vap->va_rdev;

	parent = dzp->z_id;
	mode = acl_ids->z_mode;
	if (flag & IS_XATTR)
		pflags |= ZFS_XATTR;

	/*
	 * No execs denied will be determined when zfs_mode_compute() is called.
	 */
	pflags |= acl_ids->z_aclp->z_hints &
	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);

	ZFS_TIME_ENCODE(&now, crtime);
	ZFS_TIME_ENCODE(&now, ctime);

	if (vap->va_mask & ATTR_ATIME) {
		ZFS_TIME_ENCODE(&vap->va_atime, atime);
	} else {
		ZFS_TIME_ENCODE(&now, atime);
	}

	if (vap->va_mask & ATTR_MTIME) {
		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
	} else {
		ZFS_TIME_ENCODE(&now, mtime);
	}

	/* Now add in all of the "SA" attributes */
	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
	    &sa_hdl));

	/*
	 * Set up the array of attributes to be replaced/set on the new file.
	 *
	 * The order for DMU_OT_ZNODE is critical since it needs to be
	 * constructed in the old znode_phys_t format. Don't change this
	 * ordering.
	 */
	sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
	} else {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
		    NULL, &mode, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
		    NULL, &size, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
		    NULL, &gen, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
		    NULL, &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
		    NULL, &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
		    NULL, &parent, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
		    NULL, &atime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
		    NULL, &mtime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
		    NULL, &ctime, 16);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
		    NULL, &crtime, 16);
	}

	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);

	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
		    &empty_xattr, 8);
	}
	if (obj_type == DMU_OT_ZNODE ||
	    (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
		    NULL, &rdev, 8);
	}
	if (obj_type == DMU_OT_ZNODE) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &pflags, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
		    &acl_ids->z_fuid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
		    &acl_ids->z_fgid, 8);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
		    sizeof (uint64_t) * 4);
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
		    &acl_phys, sizeof (zfs_acl_phys_t));
	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
		    &acl_ids->z_aclp->z_acl_count, 8);
		locate.cb_aclp = acl_ids->z_aclp;
		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
		    zfs_acl_data_locator, &locate,
		    acl_ids->z_aclp->z_acl_bytes);
		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
		    acl_ids->z_fuid, acl_ids->z_fgid);
	}

	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);

	if (!(flag & IS_ROOT_NODE)) {
		/*
		 * The call to zfs_znode_alloc() may fail if memory is low
		 * via the call path: alloc_inode() -> inode_init_always() ->
		 * security_inode_alloc() -> inode_alloc_security(). Since
		 * the existing code is written such that zfs_mknode() cannot
		 * fail, retry until sufficient memory has been reclaimed.
		 */
		do {
			*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, obj,
			    sa_hdl);
		} while (*zpp == NULL);

		VERIFY(*zpp != NULL);
		VERIFY(dzp != NULL);
	} else {
		/*
		 * If we are creating the root node, the "parent" we
		 * passed in is the znode for the root.
		 */
		*zpp = dzp;

		(*zpp)->z_sa_hdl = sa_hdl;
	}

	(*zpp)->z_pflags = pflags;
	(*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
	(*zpp)->z_dnodesize = dnodesize;

	if (obj_type == DMU_OT_ZNODE ||
	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
	}
	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
	zfs_znode_hold_exit(zfsvfs, zh);
}
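
/*
 * Illustrative caller of zfs_mknode() (a sketch with error handling
 * omitted; zfs_create() in zfs_vnops.c is a real caller). The vattr and
 * ACL ids are prepared and the transaction assigned before the call,
 * which by design cannot fail:
 *
 *	zfs_acl_ids_t acl_ids;
 *	dmu_tx_t *tx;
 *	znode_t *zp;
 *
 *	VERIFY0(zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids));
 *	tx = dmu_tx_create(zfsvfs->z_os);
 *	... dmu_tx_hold_*() calls, then dmu_tx_assign(tx, TXG_WAIT) ...
 *	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 *	zfs_acl_ids_free(&acl_ids);
 *	dmu_tx_commit(tx);
 */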

/*
 * Update in-core attributes. It is assumed the caller will be doing an
 * sa_bulk_update to push the changes out.
 */
void
zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
{
	xoptattr_t *xoap;
	boolean_t update_inode = B_FALSE;

	xoap = xva_getxoptattr(xvap);
	ASSERT(xoap);

	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
		uint64_t times[2];
		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
		    &times, sizeof (times), tx);
		XVA_SET_RTN(xvap, XAT_CREATETIME);
	}
	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_READONLY);
	}
	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_HIDDEN);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_SYSTEM);
	}
	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_ARCHIVE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_IMMUTABLE);

		update_inode = B_TRUE;
	}
	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NOUNLINK);
	}
	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_APPENDONLY);

		update_inode = B_TRUE;
	}
	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_NODUMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OPAQUE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
	}
	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
		zfs_sa_set_scanstamp(zp, xvap, tx);
		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
	}
	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_REPARSE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_OFFLINE);
	}
	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
		    zp->z_pflags, tx);
		XVA_SET_RTN(xvap, XAT_SPARSE);
	}

	if (update_inode)
		zfs_set_inode_flags(zp, ZTOI(zp));
}
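
/*
 * Typical use (a sketch; zfs_setattr() is a real caller): the in-core
 * flags are updated here and then pushed out by the caller's bulk update
 * in the same transaction:
 *
 *	zfs_xvattr_set(zp, xvap, tx);
 *	... SA_ADD_BULK_ATTR() calls for the remaining attributes ...
 *	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 */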

int
zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
{
	dmu_object_info_t doi;
	dmu_buf_t *db;
	znode_t *zp;
	znode_hold_t *zh;
	int err;
	sa_handle_t *hdl;

	*zpp = NULL;

again:
	zh = zfs_znode_hold_enter(zfsvfs, obj_num);

	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		zfs_znode_hold_exit(zfsvfs, zh);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EINVAL));
	}

	hdl = dmu_buf_get_user(db);
	if (hdl != NULL) {
		zp = sa_get_userdata(hdl);

		/*
		 * Since "SA" does immediate eviction we
		 * should never find a sa handle that doesn't
		 * know about the znode.
		 */
		ASSERT3P(zp, !=, NULL);

		mutex_enter(&zp->z_lock);
		ASSERT3U(zp->z_id, ==, obj_num);
		/*
		 * If igrab() returns NULL the VFS has independently
		 * determined the inode should be evicted and has
		 * called iput_final() to start the eviction process.
		 * The SA handle is still valid but because the VFS
		 * requires that the eviction succeed we must drop
		 * our locks and references to allow the eviction to
		 * complete. The zfs_zget() may then be retried.
		 *
		 * This unlikely case could be optimized by registering
		 * a sops->drop_inode() callback. The callback would
		 * need to detect the active SA hold thereby informing
		 * the VFS that this inode should not be evicted.
		 */
		if (igrab(ZTOI(zp)) == NULL) {
			mutex_exit(&zp->z_lock);
			sa_buf_rele(db, NULL);
			zfs_znode_hold_exit(zfsvfs, zh);
			/* inode might need this to finish evict */
			cond_resched();
			goto again;
		}
		*zpp = zp;
		err = 0;
		mutex_exit(&zp->z_lock);
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (err);
	}

	/*
	 * Not found; create a new znode/vnode, but only if the file exists.
	 *
	 * There is a small window where zfs_vget() could
	 * find this object while a file create is still in
	 * progress. This is checked for in zfs_znode_alloc().
	 *
	 * If zfs_znode_alloc() fails it will drop the hold on the
	 * bonus buffer.
	 */
	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
	    doi.doi_bonus_type, obj_num, NULL);
	if (zp == NULL) {
		err = SET_ERROR(ENOENT);
	} else {
		*zpp = zp;
	}
	zfs_znode_hold_exit(zfsvfs, zh);
	return (err);
}

int
zfs_rezget(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_object_info_t doi;
	dmu_buf_t *db;
	uint64_t obj_num = zp->z_id;
	uint64_t mode;
	uint64_t links;
	sa_bulk_attr_t bulk[10];
	int err;
	int count = 0;
	uint64_t gen;
	uint64_t z_uid, z_gid;
	uint64_t atime[2], mtime[2], ctime[2];
	znode_hold_t *zh;

	/*
	 * Skip ctldir znodes, otherwise they will always get invalidated.
	 * This would cause odd behaviour for the mounted snapdirs.
	 * Especially for Linux >= 3.18, d_invalidate will detach the
	 * mountpoint and prevent anyone from automounting it again as long
	 * as someone is still using the detached mount.
	 */
	if (zp->z_is_ctldir)
		return (0);

	zh = zfs_znode_hold_enter(zfsvfs, obj_num);

	mutex_enter(&zp->z_acl_lock);
	if (zp->z_acl_cached) {
		zfs_acl_free(zp->z_acl_cached);
		zp->z_acl_cached = NULL;
	}
	mutex_exit(&zp->z_acl_lock);

	rw_enter(&zp->z_xattr_lock, RW_WRITER);
	if (zp->z_xattr_cached) {
		nvlist_free(zp->z_xattr_cached);
		zp->z_xattr_cached = NULL;
	}
	rw_exit(&zp->z_xattr_lock);

	ASSERT(zp->z_sa_hdl == NULL);
	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
	if (err) {
		zfs_znode_hold_exit(zfsvfs, zh);
		return (err);
	}

	dmu_object_info_from_db(db, &doi);
	if (doi.doi_bonus_type != DMU_OT_SA &&
	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
		sa_buf_rele(db, NULL);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EINVAL));
	}

	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);

	/* reload cached values */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
	    &gen, sizeof (gen));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, sizeof (zp->z_size));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
	    &links, sizeof (links));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, sizeof (zp->z_pflags));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
	    &z_uid, sizeof (z_uid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
	    &z_gid, sizeof (z_gid));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
	    &mode, sizeof (mode));
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
	    &atime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
	    &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
	    &ctime, 16);

	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EIO));
	}

	zp->z_mode = ZTOI(zp)->i_mode = mode;
	zfs_uid_write(ZTOI(zp), z_uid);
	zfs_gid_write(ZTOI(zp), z_gid);

	ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
	ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
	ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);

	if (gen != ZTOI(zp)->i_generation) {
		zfs_znode_dmu_fini(zp);
		zfs_znode_hold_exit(zfsvfs, zh);
		return (SET_ERROR(EIO));
	}

	zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
	set_nlink(ZTOI(zp), (uint32_t)links);
	zfs_set_inode_flags(zp, ZTOI(zp));

	zp->z_blksz = doi.doi_data_block_size;
	zp->z_atime_dirty = 0;
	zfs_inode_update(zp);

	zfs_znode_hold_exit(zfsvfs, zh);

	return (0);
}

void
zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	objset_t *os = zfsvfs->z_os;
	uint64_t obj = zp->z_id;
	uint64_t acl_obj = zfs_external_acl(zp);
	znode_hold_t *zh;

	zh = zfs_znode_hold_enter(zfsvfs, obj);
	if (acl_obj) {
		VERIFY(!zp->z_is_sa);
		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
	}
	VERIFY(0 == dmu_object_free(os, obj, tx));
	zfs_znode_dmu_fini(zp);
	zfs_znode_hold_exit(zfsvfs, zh);
}

void
zfs_zinactive(znode_t *zp)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	uint64_t z_id = zp->z_id;
	znode_hold_t *zh;

	ASSERT(zp->z_sa_hdl);

	/*
	 * Don't allow a zfs_zget() while we're trying to release this znode.
	 */
	zh = zfs_znode_hold_enter(zfsvfs, z_id);

	mutex_enter(&zp->z_lock);

	/*
	 * If this was the last reference to a file with no links,
	 * remove the file from the file system.
	 */
	if (zp->z_unlinked) {
		mutex_exit(&zp->z_lock);
		zfs_znode_hold_exit(zfsvfs, zh);
		zfs_rmnode(zp);
		return;
	}

	mutex_exit(&zp->z_lock);
	zfs_znode_dmu_fini(zp);

	zfs_znode_hold_exit(zfsvfs, zh);
}

static inline int
zfs_compare_timespec(struct timespec *t1, struct timespec *t2)
{
	if (t1->tv_sec < t2->tv_sec)
		return (-1);

	if (t1->tv_sec > t2->tv_sec)
		return (1);

	return (t1->tv_nsec - t2->tv_nsec);
}

/*
 * Prepare to update znode time stamps.
 *
 *	IN:	zp	- znode requiring timestamp update
 *		flag	- ATTR_MTIME, ATTR_CTIME flags
 *
 *	OUT:	zp	- z_seq
 *		mtime	- new mtime
 *		ctime	- new ctime
 *
 *	Note: We don't update atime here, because we rely on Linux VFS to do
 *	atime updating.
 */
void
zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
    uint64_t ctime[2])
{
	inode_timespec_t now;

	gethrestime(&now);

	zp->z_seq++;

	if (flag & ATTR_MTIME) {
		ZFS_TIME_ENCODE(&now, mtime);
		ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
		if (ZTOZSB(zp)->z_use_fuids) {
			zp->z_pflags |= (ZFS_ARCHIVE |
			    ZFS_AV_MODIFIED);
		}
	}

	if (flag & ATTR_CTIME) {
		ZFS_TIME_ENCODE(&now, ctime);
		ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime);
		if (ZTOZSB(zp)->z_use_fuids)
			zp->z_pflags |= ZFS_ARCHIVE;
	}
}
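
/*
 * Typical pairing (a sketch; zfs_freesp() below does exactly this): the
 * stamp arrays are registered for a bulk SA update, encoded here, and
 * then pushed out in the same transaction:
 *
 *	uint64_t mtime[2], ctime[2];
 *
 *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
 *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
 *	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
 *	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 */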

/*
 * Grow the block size for a file.
 *
 *	IN:	zp	- znode of file whose block size is to be grown.
 *		size	- requested block size
 *		tx	- open transaction.
 *
 * NOTE: this function assumes that the znode is write locked.
 */
void
zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
{
	int error;
	u_longlong_t dummy;

	if (size <= zp->z_blksz)
		return;
	/*
	 * If the file size is already greater than the current blocksize,
	 * we will not grow. If there is more than one block in a file,
	 * the blocksize cannot change.
	 */
	if (zp->z_blksz && zp->z_size > zp->z_blksz)
		return;

	error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
	    size, 0, tx);

	if (error == ENOTSUP)
		return;
	ASSERT0(error);

	/* What blocksize did we actually get? */
	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
}

/*
 * Increase the file length
 *
 *	IN:	zp	- znode of file to extend.
 *		end	- new end-of-file
 *
 *	RETURN:	0 on success, error code on failure
 */
static int
zfs_extend(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_tx_t *tx;
	rl_t *rl;
	uint64_t newblksz;
	int error;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end <= zp->z_size) {
		zfs_range_unlock(rl);
		return (0);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	if (end > zp->z_blksz &&
	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
		/*
		 * We are growing the file past the current block size.
		 */
		if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
			/*
			 * File's blocksize is already larger than the
			 * "recordsize" property. Only let it grow to
			 * the next power of 2.
			 */
			ASSERT(!ISP2(zp->z_blksz));
			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
		} else {
			newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
		}
		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
	} else {
		newblksz = 0;
	}

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		return (error);
	}

	if (newblksz)
		zfs_grow_blocksize(zp, newblksz, tx);

	zp->z_size = end;

	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
	    &zp->z_size, sizeof (zp->z_size), tx));

	zfs_range_unlock(rl);

	dmu_tx_commit(tx);

	return (0);
}

/*
 * zfs_zero_partial_page - Modeled after update_pages() but
 * with different arguments and semantics for use by zfs_freesp().
 *
 * Zeroes a piece of a single page cache entry for zp at offset
 * start and length len.
 *
 * Caller must acquire a range lock on the file for the region
 * being zeroed in order that the ARC and page cache stay in sync.
 */
static void
zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
{
	struct address_space *mp = ZTOI(zp)->i_mapping;
	struct page *pp;
	int64_t off;
	void *pb;

	ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));

	off = start & (PAGE_SIZE - 1);
	start &= PAGE_MASK;

	pp = find_lock_page(mp, start >> PAGE_SHIFT);
	if (pp) {
		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		pb = kmap(pp);
		bzero(pb + off, len);
		kunmap(pp);

		if (mapping_writably_mapped(mp))
			flush_dcache_page(pp);

		mark_page_accessed(pp);
		SetPageUptodate(pp);
		ClearPageError(pp);
		unlock_page(pp);
		put_page(pp);
	}
}

/*
 * Free space in a file.
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of section to free.
 *		len	- length of section to free.
 *
 *	RETURN:	0 on success, error code on failure
 */
static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	rl_t *rl;
	int error;

	/*
	 * Lock the range being freed.
	 */
	rl = zfs_range_lock(&zp->z_range_lock, off, len, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (off >= zp->z_size) {
		zfs_range_unlock(rl);
		return (0);
	}

	if (off + len > zp->z_size)
		len = zp->z_size - off;

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);

	/*
	 * Zero partial page cache entries. This must be done under a
	 * range lock in order to keep the ARC and page cache in sync.
	 */
	if (zp->z_is_mapped) {
		loff_t first_page, last_page, page_len;
		loff_t first_page_offset, last_page_offset;

		/* first possible full page in hole */
		first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
		/* last page of hole */
		last_page = (off + len) >> PAGE_SHIFT;

		/* offset of first_page */
		first_page_offset = first_page << PAGE_SHIFT;
		/* offset of last_page */
		last_page_offset = last_page << PAGE_SHIFT;

		/* truncate whole pages */
		if (last_page_offset > first_page_offset) {
			truncate_inode_pages_range(ZTOI(zp)->i_mapping,
			    first_page_offset, last_page_offset - 1);
		}

		/* truncate sub-page ranges */
		if (first_page > last_page) {
			/* entire punched area within a single page */
			zfs_zero_partial_page(zp, off, len);
		} else {
			/* beginning of punched area at the end of a page */
			page_len = first_page_offset - off;
			if (page_len > 0)
				zfs_zero_partial_page(zp, off, page_len);

			/* end of punched area at the beginning of a page */
			page_len = off + len - last_page_offset;
			if (page_len > 0)
				zfs_zero_partial_page(zp, last_page_offset,
				    page_len);
		}
	}
	zfs_range_unlock(rl);

	return (error);
}

/*
 * Truncate a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		end	- new end-of-file.
 *
 *	RETURN:	0 on success, error code on failure
 */
static int
zfs_trunc(znode_t *zp, uint64_t end)
{
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	dmu_tx_t *tx;
	rl_t *rl;
	int error;
	sa_bulk_attr_t bulk[2];
	int count = 0;

	/*
	 * We will change zp_size, lock the whole file.
	 */
	rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);

	/*
	 * Nothing to do if file already at desired length.
	 */
	if (end >= zp->z_size) {
		zfs_range_unlock(rl);
		return (0);
	}

	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, -1);
	if (error) {
		zfs_range_unlock(rl);
		return (error);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		zfs_range_unlock(rl);
		return (error);
	}

	zp->z_size = end;
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
	    NULL, &zp->z_size, sizeof (zp->z_size));

	if (end == 0) {
		zp->z_pflags &= ~ZFS_SPARSE;
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
		    NULL, &zp->z_pflags, 8);
	}
	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);

	dmu_tx_commit(tx);

	zfs_range_unlock(rl);

	return (0);
}

/*
 * Free space in a file
 *
 *	IN:	zp	- znode of file to free data in.
 *		off	- start of range
 *		len	- end of range (0 => EOF)
 *		flag	- current file open mode flags.
 *		log	- TRUE if this action should be logged
 *
 *	RETURN:	0 on success, error code on failure
 */
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
{
	dmu_tx_t *tx;
	zfsvfs_t *zfsvfs = ZTOZSB(zp);
	zilog_t *zilog = zfsvfs->z_log;
	uint64_t mode;
	uint64_t mtime[2], ctime[2];
	sa_bulk_attr_t bulk[3];
	int count = 0;
	int error;

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
	    sizeof (mode))) != 0)
		return (error);

	if (off > zp->z_size) {
		error = zfs_extend(zp, off+len);
		if (error == 0 && log)
			goto log;
		goto out;
	}

	if (len == 0) {
		error = zfs_trunc(zp, off);
	} else {
		if ((error = zfs_free_range(zp, off, len)) == 0 &&
		    off + len > zp->z_size)
			error = zfs_extend(zp, off+len);
	}
	if (error || !log)
		goto out;
log:
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto out;
	}

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
	    NULL, &zp->z_pflags, 8);
	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
	ASSERT(error == 0);

	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);

	dmu_tx_commit(tx);

	zfs_inode_update(zp);
	error = 0;

out:
	/*
	 * Truncate the page cache - for file truncate operations, use
	 * the purpose-built API for truncations. For punching operations,
	 * the truncation is handled under a range lock in zfs_free_range.
	 */
	if (len == 0)
		truncate_setsize(ZTOI(zp), off);
	return (error);
}
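
/*
 * Illustrative calls (a sketch; offsets and lengths are arbitrary).
 * A zero len truncates the file to off; a non-zero len punches a hole:
 *
 *	error = zfs_freesp(zp, 4096, 0, 0, B_TRUE);
 *	error = zfs_freesp(zp, 1 << 20, 64 << 10, 0, B_TRUE);
 *
 * The first call truncates the file to 4 KiB; the second punches a
 * 64 KiB hole at a 1 MiB offset. Both log the operation to the ZIL.
 */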

void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
	struct super_block *sb;
	zfsvfs_t *zfsvfs;
	uint64_t moid, obj, sa_obj, version;
	uint64_t sense = ZFS_CASE_SENSITIVE;
	uint64_t norm = 0;
	nvpair_t *elem;
	int size;
	int error;
	int i;
	znode_t *rootzp = NULL;
	vattr_t vattr;
	znode_t *zp;
	zfs_acl_ids_t acl_ids;

	/*
	 * First attempt to create master node.
	 */
	/*
	 * In an empty objset, there are no blocks to read and thus
	 * there can be no i/o errors (which we assert below).
	 */
	moid = MASTER_NODE_OBJ;
	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/*
	 * Set starting attributes.
	 */
	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
	elem = NULL;
	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
		/* For the moment we expect all zpl props to be uint64_ts */
		uint64_t val;
		char *name;

		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
		VERIFY(nvpair_value_uint64(elem, &val) == 0);
		name = nvpair_name(elem);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
			if (val < version)
				version = val;
		} else {
			error = zap_update(os, moid, name, 8, 1, &val, tx);
		}
		ASSERT(error == 0);
		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
			norm = val;
		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
			sense = val;
	}
	ASSERT(version != 0);
	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);

	/*
	 * Create zap object used for SA attribute registration
	 */

	if (version >= ZPL_VERSION_SA) {
		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
		    DMU_OT_NONE, 0, tx);
		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
		ASSERT(error == 0);
	} else {
		sa_obj = 0;
	}
	/*
	 * Create a delete queue.
	 */
	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);

	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
	ASSERT(error == 0);

	/*
	 * Create root znode. Create minimal znode/inode/zfsvfs/sb
	 * to allow zfs_mknode to work.
	 */
	vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
	vattr.va_mode = S_IFDIR|0755;
	vattr.va_uid = crgetuid(cr);
	vattr.va_gid = crgetgid(cr);

	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
	rootzp->z_moved = 0;
	rootzp->z_unlinked = 0;
	rootzp->z_atime_dirty = 0;
	rootzp->z_is_sa = USE_SA(version, os);

	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
	zfsvfs->z_os = os;
	zfsvfs->z_parent = zfsvfs;
	zfsvfs->z_version = version;
	zfsvfs->z_use_fuids = USE_FUIDS(version, os);
	zfsvfs->z_use_sa = USE_SA(version, os);
	zfsvfs->z_norm = norm;

	sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
	sb->s_fs_info = zfsvfs;

	ZTOI(rootzp)->i_sb = sb;

	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
	    &zfsvfs->z_attr_table);

	ASSERT(error == 0);

	/*
	 * Fold case on file systems that are always or sometimes case
	 * insensitive.
	 */
	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;

	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
	    offsetof(znode_t, z_link_node));

	size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
	zfsvfs->z_hold_size = size;
	zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
	    KM_SLEEP);
	zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
	for (i = 0; i != size; i++) {
		avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
		    sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
		mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
	}

	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	zfs_acl_ids_free(&acl_ids);

	atomic_set(&ZTOI(rootzp)->i_count, 0);
	sa_handle_destroy(rootzp->z_sa_hdl);
	kmem_cache_free(znode_cache, rootzp);

	/*
	 * Create shares directory
	 */
	error = zfs_create_share_dir(zfsvfs, tx);
	ASSERT(error == 0);

	for (i = 0; i != size; i++) {
		avl_destroy(&zfsvfs->z_hold_trees[i]);
		mutex_destroy(&zfsvfs->z_hold_locks[i]);
	}

	mutex_destroy(&zfsvfs->z_znodes_lock);

	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
	kmem_free(sb, sizeof (struct super_block));
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
#endif /* _KERNEL */

static int
zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
{
	uint64_t sa_obj = 0;
	int error;

	error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
	if (error != 0 && error != ENOENT)
		return (error);

	error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
	return (error);
}

static int
zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
    dmu_buf_t **db, void *tag)
{
	dmu_object_info_t doi;
	int error;

	if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
		return (error);

	dmu_object_info_from_db(*db, &doi);
	if ((doi.doi_bonus_type != DMU_OT_SA &&
	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t))) {
		sa_buf_rele(*db, tag);
		return (SET_ERROR(ENOTSUP));
	}

	error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
	if (error != 0) {
		sa_buf_rele(*db, tag);
		return (error);
	}

	return (0);
}

void
zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
{
	sa_handle_destroy(hdl);
	sa_buf_rele(db, tag);
}

/*
 * Given an object number, return its parent object number and whether
 * or not the object is an extended attribute directory.
 */
static int
zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
    uint64_t *pobjp, int *is_xattrdir)
{
	uint64_t parent;
	uint64_t pflags;
	uint64_t mode;
	uint64_t parent_mode;
	sa_bulk_attr_t bulk[3];
	sa_handle_t *sa_hdl;
	dmu_buf_t *sa_db;
	int count = 0;
	int error;

	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
	    &parent, sizeof (parent));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
	    &pflags, sizeof (pflags));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
	    &mode, sizeof (mode));

	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
		return (error);

	/*
	 * When a link is removed its parent pointer is not changed and will
	 * be invalid. There are two cases where a link is removed but the
	 * file stays around: when it goes to the delete queue and when there
	 * are additional links.
	 */
	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
	if (error != 0)
		return (error);

	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
	if (error != 0)
		return (error);

	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);

	/*
	 * Extended attributes can be applied to files, directories, etc.
	 * Otherwise the parent must be a directory.
	 */
	if (!*is_xattrdir && !S_ISDIR(parent_mode))
		return (EINVAL);

	*pobjp = parent;

	return (0);
}

/*
 * Given an object number, return some zpl level statistics
 */
static int
zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
    zfs_stat_t *sb)
{
	sa_bulk_attr_t bulk[4];
	int count = 0;

	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
	    &sb->zs_mode, sizeof (sb->zs_mode));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
	    &sb->zs_gen, sizeof (sb->zs_gen));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
	    &sb->zs_links, sizeof (sb->zs_links));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
	    &sb->zs_ctime, sizeof (sb->zs_ctime));

	return (sa_bulk_lookup(hdl, bulk, count));
}

static int
zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
    sa_attr_type_t *sa_table, char *buf, int len)
{
	sa_handle_t *sa_hdl;
	sa_handle_t *prevhdl = NULL;
	dmu_buf_t *prevdb = NULL;
	dmu_buf_t *sa_db = NULL;
	char *path = buf + len - 1;
	int error;

	*path = '\0';
	sa_hdl = hdl;

	for (;;) {
		uint64_t pobj = 0;
		char component[MAXNAMELEN + 2];
		size_t complen;
		int is_xattrdir = 0;

		if (prevdb)
			zfs_release_sa_handle(prevhdl, prevdb, FTAG);

		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
		    &is_xattrdir)) != 0)
			break;

		if (pobj == obj) {
			if (path[0] != '/')
				*--path = '/';
			break;
		}

		component[0] = '/';
		if (is_xattrdir) {
			(void) sprintf(component + 1, "<xattrdir>");
		} else {
			error = zap_value_search(osp, pobj, obj,
			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
			if (error != 0)
				break;
		}

		complen = strlen(component);
		path -= complen;
		ASSERT(path >= buf);
		bcopy(component, path, complen);
		obj = pobj;

		if (sa_hdl != hdl) {
			prevhdl = sa_hdl;
			prevdb = sa_db;
		}
		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
		if (error != 0) {
			sa_hdl = prevhdl;
			sa_db = prevdb;
			break;
		}
	}

	if (sa_hdl != NULL && sa_hdl != hdl) {
		ASSERT(sa_db != NULL);
		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
	}

	if (error == 0)
		(void) memmove(buf, path, buf + len - path);

	return (error);
}

int
zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
{
	sa_attr_type_t *sa_table;
	sa_handle_t *hdl;
	dmu_buf_t *db;
	int error;

	error = zfs_sa_setup(osp, &sa_table);
	if (error != 0)
		return (error);

	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
	if (error != 0)
		return (error);

	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);

	zfs_release_sa_handle(hdl, db, FTAG);
	return (error);
}
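
/*
 * Illustrative use (a sketch; the ZFS_IOC_OBJ_TO_PATH ioctl handler is a
 * real consumer). The path is assembled from the end of the buffer
 * backwards, so the buffer should be generously sized:
 *
 *	char path[MAXPATHLEN * 2];
 *	int error;
 *
 *	error = zfs_obj_to_path(osp, obj, path, sizeof (path));
 *	if (error == 0)
 *		... "path" now holds the object's full path ...
 */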

int
zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
    char *buf, int len)
{
	char *path = buf + len - 1;
	sa_attr_type_t *sa_table;
	sa_handle_t *hdl;
	dmu_buf_t *db;
	int error;

	*path = '\0';

	error = zfs_sa_setup(osp, &sa_table);
	if (error != 0)
		return (error);

	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
	if (error != 0)
		return (error);

	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
	if (error != 0) {
		zfs_release_sa_handle(hdl, db, FTAG);
		return (error);
	}

	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);

	zfs_release_sa_handle(hdl, db, FTAG);
	return (error);
}

#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);

/* CSTYLED */
module_param(zfs_object_mutex_size, uint, 0644);
MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
#endif