1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24 */
25
26 /* Portions Copyright 2007 Jeremy Teo */
27
28 #ifdef _KERNEL
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/time.h>
32 #include <sys/systm.h>
33 #include <sys/sysmacros.h>
34 #include <sys/resource.h>
35 #include <sys/mntent.h>
36 #include <sys/mkdev.h>
37 #include <sys/u8_textprep.h>
38 #include <sys/dsl_dataset.h>
39 #include <sys/vfs.h>
40 #include <sys/vfs_opreg.h>
41 #include <sys/vnode.h>
42 #include <sys/file.h>
43 #include <sys/kmem.h>
44 #include <sys/errno.h>
45 #include <sys/unistd.h>
46 #include <sys/mode.h>
47 #include <sys/atomic.h>
48 #include <vm/pvn.h>
49 #include "fs/fs_subr.h"
50 #include <sys/zfs_dir.h>
51 #include <sys/zfs_acl.h>
52 #include <sys/zfs_ioctl.h>
53 #include <sys/zfs_rlock.h>
54 #include <sys/zfs_fuid.h>
55 #include <sys/zfs_vnops.h>
56 #include <sys/zfs_ctldir.h>
57 #include <sys/dnode.h>
58 #include <sys/fs/zfs.h>
59 #include <sys/kidmap.h>
60 #include <sys/zpl.h>
61 #endif /* _KERNEL */
62
63 #include <sys/dmu.h>
64 #include <sys/dmu_objset.h>
65 #include <sys/dmu_tx.h>
66 #include <sys/refcount.h>
67 #include <sys/stat.h>
68 #include <sys/zap.h>
69 #include <sys/zfs_znode.h>
70 #include <sys/sa.h>
71 #include <sys/zfs_sa.h>
72 #include <sys/zfs_stat.h>
73
74 #include "zfs_prop.h"
75 #include "zfs_comutil.h"
76
77 /*
78 * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
79 * turned on when DEBUG is also defined.
80 */
81 #ifdef DEBUG
82 #define ZNODE_STATS
83 #endif /* DEBUG */
84
85 #ifdef ZNODE_STATS
86 #define ZNODE_STAT_ADD(stat) ((stat)++)
87 #else
88 #define ZNODE_STAT_ADD(stat) /* nothing */
89 #endif /* ZNODE_STATS */
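/*
 * Usage sketch (illustrative, with a hypothetical counter name): a
 * statistic bump compiles away entirely when ZNODE_STATS is not defined:
 *
 *	ZNODE_STAT_ADD(znode_stats.rezget_retries);
 */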
90
91 /*
92  * Functions needed for userland (i.e., libzpool) are not put under
93  * #ifdef _KERNEL; the rest of the functions have dependencies
94 * (such as VFS logic) that will not compile easily in userland.
95 */
96 #ifdef _KERNEL
97
98 static kmem_cache_t *znode_cache = NULL;
99 static kmem_cache_t *znode_hold_cache = NULL;
100 unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
101
102 /*ARGSUSED*/
103 static int
104 zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
105 {
106 znode_t *zp = buf;
107
108 inode_init_once(ZTOI(zp));
109 list_link_init(&zp->z_link_node);
110
111 mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
112 rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
113 rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
114 mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
115 rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
116
117 zfs_rlock_init(&zp->z_range_lock);
118
119 zp->z_dirlocks = NULL;
120 zp->z_acl_cached = NULL;
121 zp->z_xattr_cached = NULL;
122 zp->z_xattr_parent = 0;
123 zp->z_moved = 0;
124 return (0);
125 }
126
127 /*ARGSUSED*/
128 static void
129 zfs_znode_cache_destructor(void *buf, void *arg)
130 {
131 znode_t *zp = buf;
132
133 ASSERT(!list_link_active(&zp->z_link_node));
134 mutex_destroy(&zp->z_lock);
135 rw_destroy(&zp->z_parent_lock);
136 rw_destroy(&zp->z_name_lock);
137 mutex_destroy(&zp->z_acl_lock);
138 rw_destroy(&zp->z_xattr_lock);
139 zfs_rlock_destroy(&zp->z_range_lock);
140
141 ASSERT(zp->z_dirlocks == NULL);
142 ASSERT(zp->z_acl_cached == NULL);
143 ASSERT(zp->z_xattr_cached == NULL);
144 }
145
146 static int
147 zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
148 {
149 znode_hold_t *zh = buf;
150
151 mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
152 refcount_create(&zh->zh_refcount);
153 zh->zh_obj = ZFS_NO_OBJECT;
154
155 return (0);
156 }
157
158 static void
159 zfs_znode_hold_cache_destructor(void *buf, void *arg)
160 {
161 znode_hold_t *zh = buf;
162
163 mutex_destroy(&zh->zh_lock);
164 refcount_destroy(&zh->zh_refcount);
165 }
166
167 void
168 zfs_znode_init(void)
169 {
170 	/*
171 	 * Initialize zcache. The KMC_SLAB hint is used so that the cache is
172 	 * backed by kmalloc() when on the Linux slab, ensuring that any
173 	 * wait_on_bit() operations on the embedded inode operate properly.
174 	 */
175 ASSERT(znode_cache == NULL);
176 znode_cache = kmem_cache_create("zfs_znode_cache",
177 sizeof (znode_t), 0, zfs_znode_cache_constructor,
178 zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
179
180 ASSERT(znode_hold_cache == NULL);
181 znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
182 sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
183 zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
184 }
185
186 void
187 zfs_znode_fini(void)
188 {
189 /*
190 * Cleanup zcache
191 */
192 if (znode_cache)
193 kmem_cache_destroy(znode_cache);
194 znode_cache = NULL;
195
196 if (znode_hold_cache)
197 kmem_cache_destroy(znode_hold_cache);
198 znode_hold_cache = NULL;
199 }
200
201 /*
202 * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
203 * serialize access to a znode and its SA buffer while the object is being
204 * created or destroyed. This kind of locking would normally reside in the
205 * znode itself but in this case that's impossible because the znode and SA
206 * buffer may not yet exist. Therefore the locking is handled externally
207  * with an array of mutexes and AVL trees which contain per-object locks.
208 *
209  * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
210  * into the correct AVL tree, and finally the per-object lock is held. In
211 * zfs_znode_hold_exit() the process is reversed. The per-object lock is
212 * released, removed from the AVL tree and destroyed if there are no waiters.
213 *
214 * This scheme has two important properties:
215 *
216 * 1) No memory allocations are performed while holding one of the z_hold_locks.
217 * This ensures evict(), which can be called from direct memory reclaim, will
218 * never block waiting on a z_hold_locks which just happens to have hashed
219 * to the same index.
220 *
221 * 2) All locks used to serialize access to an object are per-object and never
222 * shared. This minimizes lock contention without creating a large number
223 * of dedicated locks.
224 *
225  * On the downside it does require znode_hold_t structures to be frequently
226 * allocated and freed. However, because these are backed by a kmem cache
227 * and very short lived this cost is minimal.
228 */
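/*
 * A minimal usage sketch (caller shown for illustration only):
 *
 *	znode_hold_t *zh = zfs_znode_hold_enter(zfsvfs, obj);
 *	... create or destroy the znode and its SA buffer ...
 *	zfs_znode_hold_exit(zfsvfs, zh);
 */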
229 int
230 zfs_znode_hold_compare(const void *a, const void *b)
231 {
232 const znode_hold_t *zh_a = (const znode_hold_t *)a;
233 const znode_hold_t *zh_b = (const znode_hold_t *)b;
234
235 return (AVL_CMP(zh_a->zh_obj, zh_b->zh_obj));
236 }
237
238 boolean_t
239 zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
240 {
241 znode_hold_t *zh, search;
242 int i = ZFS_OBJ_HASH(zfsvfs, obj);
243 boolean_t held;
244
245 search.zh_obj = obj;
246
247 mutex_enter(&zfsvfs->z_hold_locks[i]);
248 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
249 held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
250 mutex_exit(&zfsvfs->z_hold_locks[i]);
251
252 return (held);
253 }
254
255 static znode_hold_t *
256 zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
257 {
258 znode_hold_t *zh, *zh_new, search;
259 int i = ZFS_OBJ_HASH(zfsvfs, obj);
260 boolean_t found = B_FALSE;
261
262 zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
263 zh_new->zh_obj = obj;
264 search.zh_obj = obj;
265
266 mutex_enter(&zfsvfs->z_hold_locks[i]);
267 zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
268 if (likely(zh == NULL)) {
269 zh = zh_new;
270 avl_add(&zfsvfs->z_hold_trees[i], zh);
271 } else {
272 ASSERT3U(zh->zh_obj, ==, obj);
273 found = B_TRUE;
274 }
275 refcount_add(&zh->zh_refcount, NULL);
276 mutex_exit(&zfsvfs->z_hold_locks[i]);
277
278 if (found == B_TRUE)
279 kmem_cache_free(znode_hold_cache, zh_new);
280
281 ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
282 ASSERT3S(refcount_count(&zh->zh_refcount), >, 0);
283 mutex_enter(&zh->zh_lock);
284
285 return (zh);
286 }
287
288 static void
289 zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
290 {
291 int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
292 boolean_t remove = B_FALSE;
293
294 ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
295 ASSERT3S(refcount_count(&zh->zh_refcount), >, 0);
296 mutex_exit(&zh->zh_lock);
297
298 mutex_enter(&zfsvfs->z_hold_locks[i]);
299 if (refcount_remove(&zh->zh_refcount, NULL) == 0) {
300 avl_remove(&zfsvfs->z_hold_trees[i], zh);
301 remove = B_TRUE;
302 }
303 mutex_exit(&zfsvfs->z_hold_locks[i]);
304
305 if (remove == B_TRUE)
306 kmem_cache_free(znode_hold_cache, zh);
307 }
308
309 int
310 zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
311 {
312 #ifdef HAVE_SMB_SHARE
313 zfs_acl_ids_t acl_ids;
314 vattr_t vattr;
315 znode_t *sharezp;
316 vnode_t *vp;
317 znode_t *zp;
318 int error;
319
320 vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
321 vattr.va_mode = S_IFDIR | 0555;
322 vattr.va_uid = crgetuid(kcred);
323 vattr.va_gid = crgetgid(kcred);
324
325 sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
326 sharezp->z_moved = 0;
327 sharezp->z_unlinked = 0;
328 sharezp->z_atime_dirty = 0;
329 sharezp->z_zfsvfs = zfsvfs;
330 sharezp->z_is_sa = zfsvfs->z_use_sa;
331 sharezp->z_pflags = 0;
332
333 vp = ZTOV(sharezp);
334 vn_reinit(vp);
335 vp->v_type = VDIR;
336
337 VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
338 kcred, NULL, &acl_ids));
339 zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
340 ASSERT3P(zp, ==, sharezp);
341 ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */
342 POINTER_INVALIDATE(&sharezp->z_zfsvfs);
343 error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
344 ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
345 zfsvfs->z_shares_dir = sharezp->z_id;
346
347 zfs_acl_ids_free(&acl_ids);
348 // ZTOV(sharezp)->v_count = 0;
349 sa_handle_destroy(sharezp->z_sa_hdl);
350 kmem_cache_free(znode_cache, sharezp);
351
352 return (error);
353 #else
354 return (0);
355 #endif /* HAVE_SMB_SHARE */
356 }
357
358 static void
359 zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
360 dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
361 {
362 ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
363
364 mutex_enter(&zp->z_lock);
365
366 ASSERT(zp->z_sa_hdl == NULL);
367 ASSERT(zp->z_acl_cached == NULL);
368 if (sa_hdl == NULL) {
369 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
370 SA_HDL_SHARED, &zp->z_sa_hdl));
371 } else {
372 zp->z_sa_hdl = sa_hdl;
373 sa_set_userp(sa_hdl, zp);
374 }
375
376 zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
377
378 mutex_exit(&zp->z_lock);
379 }
380
381 void
382 zfs_znode_dmu_fini(znode_t *zp)
383 {
384 ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked ||
385 RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));
386
387 sa_handle_destroy(zp->z_sa_hdl);
388 zp->z_sa_hdl = NULL;
389 }
390
391 /*
392 * Called by new_inode() to allocate a new inode.
393 */
394 int
395 zfs_inode_alloc(struct super_block *sb, struct inode **ip)
396 {
397 znode_t *zp;
398
399 zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
400 *ip = ZTOI(zp);
401
402 return (0);
403 }
404
405 /*
406 * Called in multiple places when an inode should be destroyed.
407 */
408 void
409 zfs_inode_destroy(struct inode *ip)
410 {
411 znode_t *zp = ITOZ(ip);
412 zfsvfs_t *zfsvfs = ZTOZSB(zp);
413
414 mutex_enter(&zfsvfs->z_znodes_lock);
415 if (list_link_active(&zp->z_link_node)) {
416 list_remove(&zfsvfs->z_all_znodes, zp);
417 zfsvfs->z_nr_znodes--;
418 }
419 mutex_exit(&zfsvfs->z_znodes_lock);
420
421 if (zp->z_acl_cached) {
422 zfs_acl_free(zp->z_acl_cached);
423 zp->z_acl_cached = NULL;
424 }
425
426 if (zp->z_xattr_cached) {
427 nvlist_free(zp->z_xattr_cached);
428 zp->z_xattr_cached = NULL;
429 }
430
431 kmem_cache_free(znode_cache, zp);
432 }
433
434 static void
435 zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
436 {
437 uint64_t rdev = 0;
438
439 switch (ip->i_mode & S_IFMT) {
440 case S_IFREG:
441 ip->i_op = &zpl_inode_operations;
442 ip->i_fop = &zpl_file_operations;
443 ip->i_mapping->a_ops = &zpl_address_space_operations;
444 break;
445
446 case S_IFDIR:
447 ip->i_op = &zpl_dir_inode_operations;
448 ip->i_fop = &zpl_dir_file_operations;
449 ITOZ(ip)->z_zn_prefetch = B_TRUE;
450 break;
451
452 case S_IFLNK:
453 ip->i_op = &zpl_symlink_inode_operations;
454 break;
455
456 /*
457 	 * rdev is only stored in an SA for device files.
458 */
459 case S_IFCHR:
460 case S_IFBLK:
461 (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
462 sizeof (rdev));
463 /*FALLTHROUGH*/
464 case S_IFIFO:
465 case S_IFSOCK:
466 init_special_inode(ip, ip->i_mode, rdev);
467 ip->i_op = &zpl_special_inode_operations;
468 break;
469
470 default:
471 zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
472 (u_longlong_t)ip->i_ino, ip->i_mode);
473
474 /* Assume the inode is a file and attempt to continue */
475 ip->i_mode = S_IFREG | 0644;
476 ip->i_op = &zpl_inode_operations;
477 ip->i_fop = &zpl_file_operations;
478 ip->i_mapping->a_ops = &zpl_address_space_operations;
479 break;
480 }
481 }
482
483 void
484 zfs_set_inode_flags(znode_t *zp, struct inode *ip)
485 {
486 /*
487 * Linux and Solaris have different sets of file attributes, so we
488 * restrict this conversion to the intersection of the two.
489 */
490 #ifdef HAVE_INODE_SET_FLAGS
491 unsigned int flags = 0;
492 if (zp->z_pflags & ZFS_IMMUTABLE)
493 flags |= S_IMMUTABLE;
494 if (zp->z_pflags & ZFS_APPENDONLY)
495 flags |= S_APPEND;
496
497 inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
498 #else
499 if (zp->z_pflags & ZFS_IMMUTABLE)
500 ip->i_flags |= S_IMMUTABLE;
501 else
502 ip->i_flags &= ~S_IMMUTABLE;
503
504 if (zp->z_pflags & ZFS_APPENDONLY)
505 ip->i_flags |= S_APPEND;
506 else
507 ip->i_flags &= ~S_APPEND;
508 #endif
509 }
510
511 /*
512 * Update the embedded inode given the znode. We should work toward
513 * eliminating this function as soon as possible by removing values
514 * which are duplicated between the znode and inode. If the generic
515 * inode has the correct field it should be used, and the ZFS code
516 * updated to access the inode. This can be done incrementally.
517 */
518 void
519 zfs_inode_update(znode_t *zp)
520 {
521 zfsvfs_t *zfsvfs;
522 struct inode *ip;
523 uint32_t blksize;
524 u_longlong_t i_blocks;
525
526 ASSERT(zp != NULL);
527 zfsvfs = ZTOZSB(zp);
528 ip = ZTOI(zp);
529
530 /* Skip .zfs control nodes which do not exist on disk. */
531 if (zfsctl_is_node(ip))
532 return;
533
534 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
535
536 spin_lock(&ip->i_lock);
537 ip->i_blocks = i_blocks;
538 i_size_write(ip, zp->z_size);
539 spin_unlock(&ip->i_lock);
540 }
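/*
 * Illustrative note (sketch, not upstream code): where the generic inode
 * already carries a field, new code should prefer it over the znode copy:
 *
 *	loff_t size = i_size_read(ZTOI(zp));	(rather than zp->z_size)
 */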
541
542
543 /*
544 * Construct a znode+inode and initialize.
545 *
546  * This does not do a call to dmu_set_user(); that is
547  * up to the caller to do, in case you don't want to
548  * return the znode.
549 */
550 static znode_t *
551 zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
552 dmu_object_type_t obj_type, uint64_t obj, sa_handle_t *hdl)
553 {
554 znode_t *zp;
555 struct inode *ip;
556 uint64_t mode;
557 uint64_t parent;
558 uint64_t tmp_gen;
559 uint64_t links;
560 uint64_t z_uid, z_gid;
561 uint64_t atime[2], mtime[2], ctime[2];
562 uint64_t projid = ZFS_DEFAULT_PROJID;
563 sa_bulk_attr_t bulk[11];
564 int count = 0;
565
566 ASSERT(zfsvfs != NULL);
567
568 ip = new_inode(zfsvfs->z_sb);
569 if (ip == NULL)
570 return (NULL);
571
572 zp = ITOZ(ip);
573 ASSERT(zp->z_dirlocks == NULL);
574 ASSERT3P(zp->z_acl_cached, ==, NULL);
575 ASSERT3P(zp->z_xattr_cached, ==, NULL);
576 zp->z_moved = 0;
577 zp->z_sa_hdl = NULL;
578 zp->z_unlinked = 0;
579 zp->z_atime_dirty = 0;
580 zp->z_mapcnt = 0;
581 zp->z_id = db->db_object;
582 zp->z_blksz = blksz;
583 zp->z_seq = 0x7A4653;
584 zp->z_sync_cnt = 0;
585 zp->z_is_mapped = B_FALSE;
586 zp->z_is_ctldir = B_FALSE;
587 zp->z_is_stale = B_FALSE;
588 zp->z_range_lock.zr_size = &zp->z_size;
589 zp->z_range_lock.zr_blksz = &zp->z_blksz;
590 zp->z_range_lock.zr_max_blksz = &ZTOZSB(zp)->z_max_blksz;
591
592 zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
593
594 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
595 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
596 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
597 &zp->z_size, 8);
598 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
599 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
600 &zp->z_pflags, 8);
601 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
602 &parent, 8);
603 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
604 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
605 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
606 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
607 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
608
609 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
610 (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
611 (zp->z_pflags & ZFS_PROJID) &&
612 sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
613 if (hdl == NULL)
614 sa_handle_destroy(zp->z_sa_hdl);
615 zp->z_sa_hdl = NULL;
616 goto error;
617 }
618
619 zp->z_projid = projid;
620 zp->z_mode = ip->i_mode = mode;
621 ip->i_generation = (uint32_t)tmp_gen;
622 ip->i_blkbits = SPA_MINBLOCKSHIFT;
623 set_nlink(ip, (uint32_t)links);
624 zfs_uid_write(ip, z_uid);
625 zfs_gid_write(ip, z_gid);
626 zfs_set_inode_flags(zp, ip);
627
628 /* Cache the xattr parent id */
629 if (zp->z_pflags & ZFS_XATTR)
630 zp->z_xattr_parent = parent;
631
632 ZFS_TIME_DECODE(&ip->i_atime, atime);
633 ZFS_TIME_DECODE(&ip->i_mtime, mtime);
634 ZFS_TIME_DECODE(&ip->i_ctime, ctime);
635
636 ip->i_ino = obj;
637 zfs_inode_update(zp);
638 zfs_inode_set_ops(zfsvfs, ip);
639
640 /*
641 * The only way insert_inode_locked() can fail is if the ip->i_ino
642 * number is already hashed for this super block. This can never
643 * happen because the inode numbers map 1:1 with the object numbers.
644 *
645 * The one exception is rolling back a mounted file system, but in
646 	 * this case all the active inodes are unhashed during the rollback.
647 */
648 VERIFY3S(insert_inode_locked(ip), ==, 0);
649
650 mutex_enter(&zfsvfs->z_znodes_lock);
651 list_insert_tail(&zfsvfs->z_all_znodes, zp);
652 zfsvfs->z_nr_znodes++;
653 membar_producer();
654 mutex_exit(&zfsvfs->z_znodes_lock);
655
656 unlock_new_inode(ip);
657 return (zp);
658
659 error:
660 iput(ip);
661 return (NULL);
662 }
663
664 /*
665 * Safely mark an inode dirty. Inodes which are part of a read-only
666 * file system or snapshot may not be dirtied.
667 */
668 void
669 zfs_mark_inode_dirty(struct inode *ip)
670 {
671 zfsvfs_t *zfsvfs = ITOZSB(ip);
672
673 if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
674 return;
675
676 mark_inode_dirty(ip);
677 }
678
679 static uint64_t empty_xattr;
680 static uint64_t pad[4];
681 static zfs_acl_phys_t acl_phys;
682 /*
683 * Create a new DMU object to hold a zfs znode.
684 *
685 * IN: dzp - parent directory for new znode
686 * vap - file attributes for new znode
687 * tx - dmu transaction id for zap operations
688 * cr - credentials of caller
689 * flag - flags:
690 * IS_ROOT_NODE - new object will be root
691 * IS_XATTR - new object is an attribute
692 * bonuslen - length of bonus buffer
693 * setaclp - File/Dir initial ACL
694 * fuidp - Tracks fuid allocation.
695 *
696 * OUT: zpp - allocated znode
697 *
698 */
699 void
700 zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
701 uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
702 {
703 uint64_t crtime[2], atime[2], mtime[2], ctime[2];
704 uint64_t mode, size, links, parent, pflags;
705 uint64_t projid = ZFS_DEFAULT_PROJID;
706 uint64_t rdev = 0;
707 zfsvfs_t *zfsvfs = ZTOZSB(dzp);
708 dmu_buf_t *db;
709 timestruc_t now;
710 uint64_t gen, obj;
711 int bonuslen;
712 int dnodesize;
713 sa_handle_t *sa_hdl;
714 dmu_object_type_t obj_type;
715 sa_bulk_attr_t *sa_attrs;
716 int cnt = 0;
717 zfs_acl_locator_cb_t locate = { 0 };
718 znode_hold_t *zh;
719
720 if (zfsvfs->z_replay) {
721 obj = vap->va_nodeid;
722 now = vap->va_ctime; /* see zfs_replay_create() */
723 gen = vap->va_nblocks; /* ditto */
724 dnodesize = vap->va_fsid; /* ditto */
725 } else {
726 obj = 0;
727 gethrestime(&now);
728 gen = dmu_tx_get_txg(tx);
729 dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
730 }
731
732 if (dnodesize == 0)
733 dnodesize = DNODE_MIN_SIZE;
734
735 obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
736
737 bonuslen = (obj_type == DMU_OT_SA) ?
738 DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
739
740 /*
741 * Create a new DMU object.
742 */
743 /*
744 * There's currently no mechanism for pre-reading the blocks that will
745 * be needed to allocate a new object, so we accept the small chance
746 * that there will be an i/o error and we will fail one of the
747 * assertions below.
748 */
749 if (S_ISDIR(vap->va_mode)) {
750 if (zfsvfs->z_replay) {
751 VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
752 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
753 obj_type, bonuslen, dnodesize, tx));
754 } else {
755 obj = zap_create_norm_dnsize(zfsvfs->z_os,
756 zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
757 obj_type, bonuslen, dnodesize, tx);
758 }
759 } else {
760 if (zfsvfs->z_replay) {
761 VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
762 DMU_OT_PLAIN_FILE_CONTENTS, 0,
763 obj_type, bonuslen, dnodesize, tx));
764 } else {
765 obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
766 DMU_OT_PLAIN_FILE_CONTENTS, 0,
767 obj_type, bonuslen, dnodesize, tx);
768 }
769 }
770
771 zh = zfs_znode_hold_enter(zfsvfs, obj);
772 VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
773
774 /*
775 * If this is the root, fix up the half-initialized parent pointer
776 * to reference the just-allocated physical data area.
777 */
778 if (flag & IS_ROOT_NODE) {
779 dzp->z_id = obj;
780 }
781
782 /*
783 * If parent is an xattr, so am I.
784 */
785 if (dzp->z_pflags & ZFS_XATTR) {
786 flag |= IS_XATTR;
787 }
788
789 if (zfsvfs->z_use_fuids)
790 pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
791 else
792 pflags = 0;
793
794 if (S_ISDIR(vap->va_mode)) {
795 size = 2; /* contents ("." and "..") */
796 links = 2;
797 } else {
798 size = 0;
799 links = (flag & IS_TMPFILE) ? 0 : 1;
800 }
801
802 if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
803 rdev = vap->va_rdev;
804
805 parent = dzp->z_id;
806 mode = acl_ids->z_mode;
807 if (flag & IS_XATTR)
808 pflags |= ZFS_XATTR;
809
810 if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
811 /*
812 * With ZFS_PROJID flag, we can easily know whether there is
813 * project ID stored on disk or not. See zfs_space_delta_cb().
814 */
815 if (obj_type != DMU_OT_ZNODE &&
816 dmu_objset_projectquota_enabled(zfsvfs->z_os))
817 pflags |= ZFS_PROJID;
818
819 /*
820 * Inherit project ID from parent if required.
821 */
822 projid = zfs_inherit_projid(dzp);
823 if (dzp->z_pflags & ZFS_PROJINHERIT)
824 pflags |= ZFS_PROJINHERIT;
825 }
826
827 /*
828 	 * No execs denied will be determined when zfs_mode_compute() is called.
829 */
830 pflags |= acl_ids->z_aclp->z_hints &
831 (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
832 ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
833
834 ZFS_TIME_ENCODE(&now, crtime);
835 ZFS_TIME_ENCODE(&now, ctime);
836
837 if (vap->va_mask & ATTR_ATIME) {
838 ZFS_TIME_ENCODE(&vap->va_atime, atime);
839 } else {
840 ZFS_TIME_ENCODE(&now, atime);
841 }
842
843 if (vap->va_mask & ATTR_MTIME) {
844 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
845 } else {
846 ZFS_TIME_ENCODE(&now, mtime);
847 }
848
849 /* Now add in all of the "SA" attributes */
850 VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
851 &sa_hdl));
852
853 /*
854 	 * Set up the array of attributes to be replaced/set on the new file.
855 	 *
856 	 * The order for DMU_OT_ZNODE is critical since it needs to be
857 	 * constructed in the old znode_phys_t format. Don't change this ordering.
858 */
859 sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
860
861 if (obj_type == DMU_OT_ZNODE) {
862 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
863 NULL, &atime, 16);
864 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
865 NULL, &mtime, 16);
866 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
867 NULL, &ctime, 16);
868 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
869 NULL, &crtime, 16);
870 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
871 NULL, &gen, 8);
872 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
873 NULL, &mode, 8);
874 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
875 NULL, &size, 8);
876 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
877 NULL, &parent, 8);
878 } else {
879 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
880 NULL, &mode, 8);
881 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
882 NULL, &size, 8);
883 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
884 NULL, &gen, 8);
885 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
886 NULL, &acl_ids->z_fuid, 8);
887 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
888 NULL, &acl_ids->z_fgid, 8);
889 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
890 NULL, &parent, 8);
891 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
892 NULL, &pflags, 8);
893 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
894 NULL, &atime, 16);
895 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
896 NULL, &mtime, 16);
897 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
898 NULL, &ctime, 16);
899 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
900 NULL, &crtime, 16);
901 }
902
903 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
904
905 if (obj_type == DMU_OT_ZNODE) {
906 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
907 &empty_xattr, 8);
908 } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
909 pflags & ZFS_PROJID) {
910 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
911 NULL, &projid, 8);
912 }
913 if (obj_type == DMU_OT_ZNODE ||
914 (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
915 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
916 NULL, &rdev, 8);
917 }
918 if (obj_type == DMU_OT_ZNODE) {
919 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
920 NULL, &pflags, 8);
921 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
922 &acl_ids->z_fuid, 8);
923 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
924 &acl_ids->z_fgid, 8);
925 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
926 sizeof (uint64_t) * 4);
927 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
928 &acl_phys, sizeof (zfs_acl_phys_t));
929 } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
930 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
931 &acl_ids->z_aclp->z_acl_count, 8);
932 locate.cb_aclp = acl_ids->z_aclp;
933 SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
934 zfs_acl_data_locator, &locate,
935 acl_ids->z_aclp->z_acl_bytes);
936 mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
937 acl_ids->z_fuid, acl_ids->z_fgid);
938 }
939
940 VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
941
942 if (!(flag & IS_ROOT_NODE)) {
943 /*
944 * The call to zfs_znode_alloc() may fail if memory is low
945 * via the call path: alloc_inode() -> inode_init_always() ->
946 * security_inode_alloc() -> inode_alloc_security(). Since
947 		 * the existing code is written such that zfs_mknode() cannot
948 		 * fail, retry until sufficient memory has been reclaimed.
949 */
950 do {
951 *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, obj,
952 sa_hdl);
953 } while (*zpp == NULL);
954
955 VERIFY(*zpp != NULL);
956 VERIFY(dzp != NULL);
957 } else {
958 /*
959 * If we are creating the root node, the "parent" we
960 * passed in is the znode for the root.
961 */
962 *zpp = dzp;
963
964 (*zpp)->z_sa_hdl = sa_hdl;
965 }
966
967 (*zpp)->z_pflags = pflags;
968 (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
969 (*zpp)->z_dnodesize = dnodesize;
970 (*zpp)->z_projid = projid;
971
972 if (obj_type == DMU_OT_ZNODE ||
973 acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
974 VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
975 }
976 kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
977 zfs_znode_hold_exit(zfsvfs, zh);
978 }
979
980 /*
981 * Update in-core attributes. It is assumed the caller will be doing an
982 * sa_bulk_update to push the changes out.
983 */
984 void
985 zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
986 {
987 xoptattr_t *xoap;
988 boolean_t update_inode = B_FALSE;
989
990 xoap = xva_getxoptattr(xvap);
991 ASSERT(xoap);
992
993 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
994 uint64_t times[2];
995 ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
996 (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
997 &times, sizeof (times), tx);
998 XVA_SET_RTN(xvap, XAT_CREATETIME);
999 }
1000 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
1001 ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
1002 zp->z_pflags, tx);
1003 XVA_SET_RTN(xvap, XAT_READONLY);
1004 }
1005 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
1006 ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
1007 zp->z_pflags, tx);
1008 XVA_SET_RTN(xvap, XAT_HIDDEN);
1009 }
1010 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1011 ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
1012 zp->z_pflags, tx);
1013 XVA_SET_RTN(xvap, XAT_SYSTEM);
1014 }
1015 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1016 ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
1017 zp->z_pflags, tx);
1018 XVA_SET_RTN(xvap, XAT_ARCHIVE);
1019 }
1020 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1021 ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
1022 zp->z_pflags, tx);
1023 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
1024
1025 update_inode = B_TRUE;
1026 }
1027 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1028 ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
1029 zp->z_pflags, tx);
1030 XVA_SET_RTN(xvap, XAT_NOUNLINK);
1031 }
1032 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1033 ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
1034 zp->z_pflags, tx);
1035 XVA_SET_RTN(xvap, XAT_APPENDONLY);
1036
1037 update_inode = B_TRUE;
1038 }
1039 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1040 ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
1041 zp->z_pflags, tx);
1042 XVA_SET_RTN(xvap, XAT_NODUMP);
1043 }
1044 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1045 ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
1046 zp->z_pflags, tx);
1047 XVA_SET_RTN(xvap, XAT_OPAQUE);
1048 }
1049 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1050 ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
1051 xoap->xoa_av_quarantined, zp->z_pflags, tx);
1052 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1053 }
1054 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1055 ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
1056 zp->z_pflags, tx);
1057 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1058 }
1059 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
1060 zfs_sa_set_scanstamp(zp, xvap, tx);
1061 XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
1062 }
1063 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1064 ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
1065 zp->z_pflags, tx);
1066 XVA_SET_RTN(xvap, XAT_REPARSE);
1067 }
1068 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1069 ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
1070 zp->z_pflags, tx);
1071 XVA_SET_RTN(xvap, XAT_OFFLINE);
1072 }
1073 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1074 ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
1075 zp->z_pflags, tx);
1076 XVA_SET_RTN(xvap, XAT_SPARSE);
1077 }
1078 if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
1079 ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
1080 zp->z_pflags, tx);
1081 XVA_SET_RTN(xvap, XAT_PROJINHERIT);
1082 }
1083
1084 if (update_inode)
1085 zfs_set_inode_flags(zp, ZTOI(zp));
1086 }
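/*
 * Usage note (sketch): a caller typically sets the in-core values here
 * within an open transaction and then pushes them out, e.g. with
 * sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) before dmu_tx_commit(tx).
 */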
1087
1088 int
1089 zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
1090 {
1091 dmu_object_info_t doi;
1092 dmu_buf_t *db;
1093 znode_t *zp;
1094 znode_hold_t *zh;
1095 int err;
1096 sa_handle_t *hdl;
1097
1098 *zpp = NULL;
1099
1100 again:
1101 zh = zfs_znode_hold_enter(zfsvfs, obj_num);
1102
1103 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1104 if (err) {
1105 zfs_znode_hold_exit(zfsvfs, zh);
1106 return (err);
1107 }
1108
1109 dmu_object_info_from_db(db, &doi);
1110 if (doi.doi_bonus_type != DMU_OT_SA &&
1111 (doi.doi_bonus_type != DMU_OT_ZNODE ||
1112 (doi.doi_bonus_type == DMU_OT_ZNODE &&
1113 doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1114 sa_buf_rele(db, NULL);
1115 zfs_znode_hold_exit(zfsvfs, zh);
1116 return (SET_ERROR(EINVAL));
1117 }
1118
1119 hdl = dmu_buf_get_user(db);
1120 if (hdl != NULL) {
1121 zp = sa_get_userdata(hdl);
1122
1123
1124 /*
1125 * Since "SA" does immediate eviction we
1126 * should never find a sa handle that doesn't
1127 * know about the znode.
1128 */
1129
1130 ASSERT3P(zp, !=, NULL);
1131
1132 mutex_enter(&zp->z_lock);
1133 ASSERT3U(zp->z_id, ==, obj_num);
1134 /*
1135 * If igrab() returns NULL the VFS has independently
1136 * determined the inode should be evicted and has
1137 * called iput_final() to start the eviction process.
1138 * The SA handle is still valid but because the VFS
1139 * requires that the eviction succeed we must drop
1140 * our locks and references to allow the eviction to
1141 * complete. The zfs_zget() may then be retried.
1142 *
1143 * This unlikely case could be optimized by registering
1144 * a sops->drop_inode() callback. The callback would
1145 * need to detect the active SA hold thereby informing
1146 * the VFS that this inode should not be evicted.
1147 */
1148 if (igrab(ZTOI(zp)) == NULL) {
1149 mutex_exit(&zp->z_lock);
1150 sa_buf_rele(db, NULL);
1151 zfs_znode_hold_exit(zfsvfs, zh);
1152 /* inode might need this to finish evict */
1153 cond_resched();
1154 goto again;
1155 }
1156 *zpp = zp;
1157 err = 0;
1158 mutex_exit(&zp->z_lock);
1159 sa_buf_rele(db, NULL);
1160 zfs_znode_hold_exit(zfsvfs, zh);
1161 return (err);
1162 }
1163
1164 /*
1165 	 * Not found; create a new znode/vnode, but only if the file exists.
1166 	 *
1167 	 * There is a small window where zfs_vget() could
1168 	 * find this object while a file create is still in
1169 	 * progress. This is checked for in zfs_znode_alloc().
1170 	 *
1171 	 * If zfs_znode_alloc() fails it will drop the hold on the
1172 	 * bonus buffer.
1173 */
1174 zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
1175 doi.doi_bonus_type, obj_num, NULL);
1176 if (zp == NULL) {
1177 err = SET_ERROR(ENOENT);
1178 } else {
1179 *zpp = zp;
1180 }
1181 zfs_znode_hold_exit(zfsvfs, zh);
1182 return (err);
1183 }
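/*
 * Usage sketch (illustrative): a successful zfs_zget() returns a znode
 * holding an inode reference, which the caller must drop when done:
 *
 *	znode_t *zp;
 *	if (zfs_zget(zfsvfs, obj_num, &zp) == 0) {
 *		... use zp ...
 *		iput(ZTOI(zp));
 *	}
 */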
1184
1185 int
1186 zfs_rezget(znode_t *zp)
1187 {
1188 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1189 dmu_object_info_t doi;
1190 dmu_buf_t *db;
1191 uint64_t obj_num = zp->z_id;
1192 uint64_t mode;
1193 uint64_t links;
1194 sa_bulk_attr_t bulk[10];
1195 int err;
1196 int count = 0;
1197 uint64_t gen;
1198 uint64_t z_uid, z_gid;
1199 uint64_t atime[2], mtime[2], ctime[2];
1200 uint64_t projid = ZFS_DEFAULT_PROJID;
1201 znode_hold_t *zh;
1202
1203 /*
1204 	 * Skip ctldir znodes, otherwise they will always get invalidated. This
1205 	 * would cause odd behaviour for the mounted snapdirs. In particular, on
1206 	 * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent
1207 	 * anyone from automounting it again as long as someone is still using
1208 	 * the detached mount.
1209 */
1210 if (zp->z_is_ctldir)
1211 return (0);
1212
1213 zh = zfs_znode_hold_enter(zfsvfs, obj_num);
1214
1215 mutex_enter(&zp->z_acl_lock);
1216 if (zp->z_acl_cached) {
1217 zfs_acl_free(zp->z_acl_cached);
1218 zp->z_acl_cached = NULL;
1219 }
1220 mutex_exit(&zp->z_acl_lock);
1221
1222 rw_enter(&zp->z_xattr_lock, RW_WRITER);
1223 if (zp->z_xattr_cached) {
1224 nvlist_free(zp->z_xattr_cached);
1225 zp->z_xattr_cached = NULL;
1226 }
1227 rw_exit(&zp->z_xattr_lock);
1228
1229 ASSERT(zp->z_sa_hdl == NULL);
1230 err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1231 if (err) {
1232 zfs_znode_hold_exit(zfsvfs, zh);
1233 return (err);
1234 }
1235
1236 dmu_object_info_from_db(db, &doi);
1237 if (doi.doi_bonus_type != DMU_OT_SA &&
1238 (doi.doi_bonus_type != DMU_OT_ZNODE ||
1239 (doi.doi_bonus_type == DMU_OT_ZNODE &&
1240 doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1241 sa_buf_rele(db, NULL);
1242 zfs_znode_hold_exit(zfsvfs, zh);
1243 return (SET_ERROR(EINVAL));
1244 }
1245
1246 zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
1247
1248 /* reload cached values */
1249 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
1250 &gen, sizeof (gen));
1251 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1252 &zp->z_size, sizeof (zp->z_size));
1253 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
1254 &links, sizeof (links));
1255 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1256 &zp->z_pflags, sizeof (zp->z_pflags));
1257 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1258 &z_uid, sizeof (z_uid));
1259 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1260 &z_gid, sizeof (z_gid));
1261 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
1262 &mode, sizeof (mode));
1263 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
1264 &atime, 16);
1265 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
1266 &mtime, 16);
1267 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
1268 &ctime, 16);
1269
1270 if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
1271 zfs_znode_dmu_fini(zp);
1272 zfs_znode_hold_exit(zfsvfs, zh);
1273 return (SET_ERROR(EIO));
1274 }
1275
1276 if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
1277 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
1278 &projid, 8);
1279 if (err != 0 && err != ENOENT) {
1280 zfs_znode_dmu_fini(zp);
1281 zfs_znode_hold_exit(zfsvfs, zh);
1282 return (SET_ERROR(err));
1283 }
1284 }
1285
1286 zp->z_projid = projid;
1287 zp->z_mode = ZTOI(zp)->i_mode = mode;
1288 zfs_uid_write(ZTOI(zp), z_uid);
1289 zfs_gid_write(ZTOI(zp), z_gid);
1290
1291 ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
1292 ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
1293 ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);
1294
1295 if (gen != ZTOI(zp)->i_generation) {
1296 zfs_znode_dmu_fini(zp);
1297 zfs_znode_hold_exit(zfsvfs, zh);
1298 return (SET_ERROR(EIO));
1299 }
1300
1301 set_nlink(ZTOI(zp), (uint32_t)links);
1302 zfs_set_inode_flags(zp, ZTOI(zp));
1303
1304 zp->z_blksz = doi.doi_data_block_size;
1305 zp->z_atime_dirty = 0;
1306 zfs_inode_update(zp);
1307
1308 /*
1309 * If the file has zero links, then it has been unlinked on the send
1310 * side and it must be in the received unlinked set.
1311 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
1312 	 * stale data and to prevent automatic removal of the file in
1313 * zfs_zinactive(). The file will be removed either when it is removed
1314 * on the send side and the next incremental stream is received or
1315 * when the unlinked set gets processed.
1316 */
1317 zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
1318 if (zp->z_unlinked)
1319 zfs_znode_dmu_fini(zp);
1320
1321 zfs_znode_hold_exit(zfsvfs, zh);
1322
1323 return (0);
1324 }
1325
1326 void
1327 zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1328 {
1329 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1330 objset_t *os = zfsvfs->z_os;
1331 uint64_t obj = zp->z_id;
1332 uint64_t acl_obj = zfs_external_acl(zp);
1333 znode_hold_t *zh;
1334
1335 zh = zfs_znode_hold_enter(zfsvfs, obj);
1336 if (acl_obj) {
1337 VERIFY(!zp->z_is_sa);
1338 VERIFY(0 == dmu_object_free(os, acl_obj, tx));
1339 }
1340 VERIFY(0 == dmu_object_free(os, obj, tx));
1341 zfs_znode_dmu_fini(zp);
1342 zfs_znode_hold_exit(zfsvfs, zh);
1343 }
1344
1345 void
1346 zfs_zinactive(znode_t *zp)
1347 {
1348 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1349 uint64_t z_id = zp->z_id;
1350 znode_hold_t *zh;
1351
1352 ASSERT(zp->z_sa_hdl);
1353
1354 /*
1355 	 * Don't allow a zfs_zget() while we're trying to release this znode.
1356 */
1357 zh = zfs_znode_hold_enter(zfsvfs, z_id);
1358
1359 mutex_enter(&zp->z_lock);
1360
1361 /*
1362 * If this was the last reference to a file with no links, remove
1363 * the file from the file system unless the file system is mounted
1364 * read-only. That can happen, for example, if the file system was
1365 * originally read-write, the file was opened, then unlinked and
1366 * the file system was made read-only before the file was finally
1367 * closed. The file will remain in the unlinked set.
1368 */
1369 if (zp->z_unlinked) {
1370 ASSERT(!zfsvfs->z_issnap);
1371 if (!zfs_is_readonly(zfsvfs)) {
1372 mutex_exit(&zp->z_lock);
1373 zfs_znode_hold_exit(zfsvfs, zh);
1374 zfs_rmnode(zp);
1375 return;
1376 }
1377 }
1378
1379 mutex_exit(&zp->z_lock);
1380 zfs_znode_dmu_fini(zp);
1381
1382 zfs_znode_hold_exit(zfsvfs, zh);
1383 }
1384
1385 static inline int
1386 zfs_compare_timespec(struct timespec *t1, struct timespec *t2)
1387 {
1388 if (t1->tv_sec < t2->tv_sec)
1389 return (-1);
1390
1391 if (t1->tv_sec > t2->tv_sec)
1392 return (1);
1393
1394 return (t1->tv_nsec - t2->tv_nsec);
1395 }
1396
1397 /*
1398 * Prepare to update znode time stamps.
1399 *
1400 * IN: zp - znode requiring timestamp update
1401 * flag - ATTR_MTIME, ATTR_CTIME flags
1402 *
1403 * OUT: zp - z_seq
1404 * mtime - new mtime
1405 * ctime - new ctime
1406 *
1407  * Note: We don't update atime here, because we rely on the Linux VFS
1408  * to do atime updating.
1409 */
1410 void
1411 zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
1412 uint64_t ctime[2])
1413 {
1414 timestruc_t now;
1415
1416 gethrestime(&now);
1417
1418 zp->z_seq++;
1419
1420 if (flag & ATTR_MTIME) {
1421 ZFS_TIME_ENCODE(&now, mtime);
1422 ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
1423 if (ZTOZSB(zp)->z_use_fuids) {
1424 zp->z_pflags |= (ZFS_ARCHIVE |
1425 ZFS_AV_MODIFIED);
1426 }
1427 }
1428
1429 if (flag & ATTR_CTIME) {
1430 ZFS_TIME_ENCODE(&now, ctime);
1431 ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime);
1432 if (ZTOZSB(zp)->z_use_fuids)
1433 zp->z_pflags |= ZFS_ARCHIVE;
1434 }
1435 }
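/*
 * Typical usage (sketch, mirroring callers such as zfs_freesp() below):
 * encode the new stamps, then push them out with an SA bulk update inside
 * an assigned transaction (bulk/count assembled as in zfs_freesp()):
 *
 *	uint64_t mtime[2], ctime[2];
 *	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
 *	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 */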
1436
1437 /*
1438 * Grow the block size for a file.
1439 *
1440  * IN:	zp	- znode of file whose block size is to be grown.
1441 * size - requested block size
1442 * tx - open transaction.
1443 *
1444 * NOTE: this function assumes that the znode is write locked.
1445 */
1446 void
1447 zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1448 {
1449 int error;
1450 u_longlong_t dummy;
1451
1452 if (size <= zp->z_blksz)
1453 return;
1454 /*
1455 * If the file size is already greater than the current blocksize,
1456 * we will not grow. If there is more than one block in a file,
1457 * the blocksize cannot change.
1458 */
1459 if (zp->z_blksz && zp->z_size > zp->z_blksz)
1460 return;
1461
1462 error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
1463 size, 0, tx);
1464
1465 if (error == ENOTSUP)
1466 return;
1467 ASSERT0(error);
1468
1469 /* What blocksize did we actually get? */
1470 dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
1471 }
1472
1473 /*
1474 * Increase the file length
1475 *
1476  * IN:	zp	- znode of file to extend.
1477 * end - new end-of-file
1478 *
1479 * RETURN: 0 on success, error code on failure
1480 */
1481 static int
1482 zfs_extend(znode_t *zp, uint64_t end)
1483 {
1484 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1485 dmu_tx_t *tx;
1486 rl_t *rl;
1487 uint64_t newblksz;
1488 int error;
1489
1490 /*
1491 	 * We will change z_size, so lock the whole file.
1492 */
1493 rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
1494
1495 /*
1496 * Nothing to do if file already at desired length.
1497 */
1498 if (end <= zp->z_size) {
1499 zfs_range_unlock(rl);
1500 return (0);
1501 }
1502 tx = dmu_tx_create(zfsvfs->z_os);
1503 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1504 zfs_sa_upgrade_txholds(tx, zp);
1505 if (end > zp->z_blksz &&
1506 (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1507 /*
1508 * We are growing the file past the current block size.
1509 */
1510 if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
1511 /*
1512 * File's blocksize is already larger than the
1513 * "recordsize" property. Only let it grow to
1514 * the next power of 2.
1515 */
1516 ASSERT(!ISP2(zp->z_blksz));
1517 newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
1518 } else {
1519 newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
1520 }
1521 dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1522 } else {
1523 newblksz = 0;
1524 }
1525
1526 error = dmu_tx_assign(tx, TXG_WAIT);
1527 if (error) {
1528 dmu_tx_abort(tx);
1529 zfs_range_unlock(rl);
1530 return (error);
1531 }
1532
1533 if (newblksz)
1534 zfs_grow_blocksize(zp, newblksz, tx);
1535
1536 zp->z_size = end;
1537
1538 VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
1539 &zp->z_size, sizeof (zp->z_size), tx));
1540
1541 zfs_range_unlock(rl);
1542
1543 dmu_tx_commit(tx);
1544
1545 return (0);
1546 }
1547
1548 /*
1549 * zfs_zero_partial_page - Modeled after update_pages() but
1550 * with different arguments and semantics for use by zfs_freesp().
1551 *
1552 * Zeroes a piece of a single page cache entry for zp at offset
1553 * start and length len.
1554 *
1555 * Caller must acquire a range lock on the file for the region
1556 * being zeroed in order that the ARC and page cache stay in sync.
1557 */
1558 static void
1559 zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
1560 {
1561 struct address_space *mp = ZTOI(zp)->i_mapping;
1562 struct page *pp;
1563 int64_t off;
1564 void *pb;
1565
1566 ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));
1567
1568 off = start & (PAGE_SIZE - 1);
1569 start &= PAGE_MASK;
1570
1571 pp = find_lock_page(mp, start >> PAGE_SHIFT);
1572 if (pp) {
1573 if (mapping_writably_mapped(mp))
1574 flush_dcache_page(pp);
1575
1576 pb = kmap(pp);
1577 bzero(pb + off, len);
1578 kunmap(pp);
1579
1580 if (mapping_writably_mapped(mp))
1581 flush_dcache_page(pp);
1582
1583 mark_page_accessed(pp);
1584 SetPageUptodate(pp);
1585 ClearPageError(pp);
1586 unlock_page(pp);
1587 put_page(pp);
1588 }
1589 }
1590
1591 /*
1592 * Free space in a file.
1593 *
1594 * IN: zp - znode of file to free data in.
1595 * off - start of section to free.
1596 * len - length of section to free.
1597 *
1598 * RETURN: 0 on success, error code on failure
1599 */
1600 static int
1601 zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1602 {
1603 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1604 rl_t *rl;
1605 int error;
1606
1607 /*
1608 * Lock the range being freed.
1609 */
1610 rl = zfs_range_lock(&zp->z_range_lock, off, len, RL_WRITER);
1611
1612 /*
1613 * Nothing to do if file already at desired length.
1614 */
1615 if (off >= zp->z_size) {
1616 zfs_range_unlock(rl);
1617 return (0);
1618 }
1619
1620 if (off + len > zp->z_size)
1621 len = zp->z_size - off;
1622
1623 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1624
1625 /*
1626 * Zero partial page cache entries. This must be done under a
1627 * range lock in order to keep the ARC and page cache in sync.
1628 */
1629 if (zp->z_is_mapped) {
1630 loff_t first_page, last_page, page_len;
1631 loff_t first_page_offset, last_page_offset;
1632
1633 /* first possible full page in hole */
1634 first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
1635 /* last page of hole */
1636 last_page = (off + len) >> PAGE_SHIFT;
1637
1638 /* offset of first_page */
1639 first_page_offset = first_page << PAGE_SHIFT;
1640 /* offset of last_page */
1641 last_page_offset = last_page << PAGE_SHIFT;
1642
1643 /* truncate whole pages */
1644 if (last_page_offset > first_page_offset) {
1645 truncate_inode_pages_range(ZTOI(zp)->i_mapping,
1646 first_page_offset, last_page_offset - 1);
1647 }
1648
1649 /* truncate sub-page ranges */
1650 if (first_page > last_page) {
1651 /* entire punched area within a single page */
1652 zfs_zero_partial_page(zp, off, len);
1653 } else {
1654 /* beginning of punched area at the end of a page */
1655 page_len = first_page_offset - off;
1656 if (page_len > 0)
1657 zfs_zero_partial_page(zp, off, page_len);
1658
1659 /* end of punched area at the beginning of a page */
1660 page_len = off + len - last_page_offset;
1661 if (page_len > 0)
1662 zfs_zero_partial_page(zp, last_page_offset,
1663 page_len);
1664 }
1665 }
1666 zfs_range_unlock(rl);
1667
1668 return (error);
1669 }
1670
1671 /*
1672 * Truncate a file
1673 *
1674  * IN:	zp	- znode of file to truncate.
1675 * end - new end-of-file.
1676 *
1677 * RETURN: 0 on success, error code on failure
1678 */
1679 static int
1680 zfs_trunc(znode_t *zp, uint64_t end)
1681 {
1682 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1683 dmu_tx_t *tx;
1684 rl_t *rl;
1685 int error;
1686 sa_bulk_attr_t bulk[2];
1687 int count = 0;
1688
1689 /*
1690 	 * We will change z_size, so lock the whole file.
1691 */
1692 rl = zfs_range_lock(&zp->z_range_lock, 0, UINT64_MAX, RL_WRITER);
1693
1694 /*
1695 * Nothing to do if file already at desired length.
1696 */
1697 if (end >= zp->z_size) {
1698 zfs_range_unlock(rl);
1699 return (0);
1700 }
1701
1702 error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
1703 DMU_OBJECT_END);
1704 if (error) {
1705 zfs_range_unlock(rl);
1706 return (error);
1707 }
1708 tx = dmu_tx_create(zfsvfs->z_os);
1709 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1710 zfs_sa_upgrade_txholds(tx, zp);
1711 dmu_tx_mark_netfree(tx);
1712 error = dmu_tx_assign(tx, TXG_WAIT);
1713 if (error) {
1714 dmu_tx_abort(tx);
1715 zfs_range_unlock(rl);
1716 return (error);
1717 }
1718
1719 zp->z_size = end;
1720 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
1721 NULL, &zp->z_size, sizeof (zp->z_size));
1722
1723 if (end == 0) {
1724 zp->z_pflags &= ~ZFS_SPARSE;
1725 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1726 NULL, &zp->z_pflags, 8);
1727 }
1728 VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
1729
1730 dmu_tx_commit(tx);
1731
1732 zfs_range_unlock(rl);
1733
1734 return (0);
1735 }
1736
1737 /*
1738 * Free space in a file
1739 *
1740 * IN: zp - znode of file to free data in.
1741 * off - start of range
1742 * len - end of range (0 => EOF)
1743 * flag - current file open mode flags.
1744 * log - TRUE if this action should be logged
1745 *
1746 * RETURN: 0 on success, error code on failure
1747 */
1748 int
1749 zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1750 {
1751 dmu_tx_t *tx;
1752 zfsvfs_t *zfsvfs = ZTOZSB(zp);
1753 zilog_t *zilog = zfsvfs->z_log;
1754 uint64_t mode;
1755 uint64_t mtime[2], ctime[2];
1756 sa_bulk_attr_t bulk[3];
1757 int count = 0;
1758 int error;
1759
1760 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
1761 sizeof (mode))) != 0)
1762 return (error);
1763
1764 if (off > zp->z_size) {
1765 error = zfs_extend(zp, off+len);
1766 if (error == 0 && log)
1767 goto log;
1768 goto out;
1769 }
1770
1771 if (len == 0) {
1772 error = zfs_trunc(zp, off);
1773 } else {
1774 if ((error = zfs_free_range(zp, off, len)) == 0 &&
1775 off + len > zp->z_size)
1776 error = zfs_extend(zp, off+len);
1777 }
1778 if (error || !log)
1779 goto out;
1780 log:
1781 tx = dmu_tx_create(zfsvfs->z_os);
1782 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1783 zfs_sa_upgrade_txholds(tx, zp);
1784 error = dmu_tx_assign(tx, TXG_WAIT);
1785 if (error) {
1786 dmu_tx_abort(tx);
1787 goto out;
1788 }
1789
1790 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
1791 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
1792 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1793 NULL, &zp->z_pflags, 8);
1794 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
1795 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1796 ASSERT(error == 0);
1797
1798 zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1799
1800 dmu_tx_commit(tx);
1801
1802 zfs_inode_update(zp);
1803 error = 0;
1804
1805 out:
1806 /*
1807 * Truncate the page cache - for file truncate operations, use
1808 * the purpose-built API for truncations. For punching operations,
1809 * the truncation is handled under a range lock in zfs_free_range.
1810 */
1811 if (len == 0)
1812 truncate_setsize(ZTOI(zp), off);
1813 return (error);
1814 }
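/*
 * Illustrative examples (not upstream code): both truncation and hole
 * punching funnel through this entry point:
 *
 *	error = zfs_freesp(zp, new_size, 0, flags, B_TRUE);	truncate
 *	error = zfs_freesp(zp, off, len, flags, B_TRUE);	punch hole
 */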
1815
1816 void
1817 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1818 {
1819 struct super_block *sb;
1820 zfsvfs_t *zfsvfs;
1821 uint64_t moid, obj, sa_obj, version;
1822 uint64_t sense = ZFS_CASE_SENSITIVE;
1823 uint64_t norm = 0;
1824 nvpair_t *elem;
1825 int size;
1826 int error;
1827 int i;
1828 znode_t *rootzp = NULL;
1829 vattr_t vattr;
1830 znode_t *zp;
1831 zfs_acl_ids_t acl_ids;
1832
1833 /*
1834 * First attempt to create master node.
1835 */
1836 /*
1837 * In an empty objset, there are no blocks to read and thus
1838 * there can be no i/o errors (which we assert below).
1839 */
1840 moid = MASTER_NODE_OBJ;
1841 error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1842 DMU_OT_NONE, 0, tx);
1843 ASSERT(error == 0);
1844
1845 /*
1846 * Set starting attributes.
1847 */
1848 version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
1849 elem = NULL;
1850 while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1851 /* For the moment we expect all zpl props to be uint64_ts */
1852 uint64_t val;
1853 char *name;
1854
1855 ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1856 VERIFY(nvpair_value_uint64(elem, &val) == 0);
1857 name = nvpair_name(elem);
1858 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
1859 if (val < version)
1860 version = val;
1861 } else {
1862 error = zap_update(os, moid, name, 8, 1, &val, tx);
1863 }
1864 ASSERT(error == 0);
1865 if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1866 norm = val;
1867 else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1868 sense = val;
1869 }
1870 ASSERT(version != 0);
1871 error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
1872
1873 /*
1874 * Create zap object used for SA attribute registration
1875 */
1876
1877 if (version >= ZPL_VERSION_SA) {
1878 sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
1879 DMU_OT_NONE, 0, tx);
1880 error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
1881 ASSERT(error == 0);
1882 } else {
1883 sa_obj = 0;
1884 }
1885 /*
1886 * Create a delete queue.
1887 */
1888 obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
1889
1890 error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
1891 ASSERT(error == 0);
1892
1893 /*
1894 * Create root znode. Create minimal znode/inode/zfsvfs/sb
1895 * to allow zfs_mknode to work.
1896 */
1897 vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
1898 vattr.va_mode = S_IFDIR|0755;
1899 vattr.va_uid = crgetuid(cr);
1900 vattr.va_gid = crgetgid(cr);
1901
1902 rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1903 rootzp->z_moved = 0;
1904 rootzp->z_unlinked = 0;
1905 rootzp->z_atime_dirty = 0;
1906 rootzp->z_is_sa = USE_SA(version, os);
1907 rootzp->z_pflags = 0;
1908
1909 zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
1910 zfsvfs->z_os = os;
1911 zfsvfs->z_parent = zfsvfs;
1912 zfsvfs->z_version = version;
1913 zfsvfs->z_use_fuids = USE_FUIDS(version, os);
1914 zfsvfs->z_use_sa = USE_SA(version, os);
1915 zfsvfs->z_norm = norm;
1916
1917 sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
1918 sb->s_fs_info = zfsvfs;
1919
1920 ZTOI(rootzp)->i_sb = sb;
1921
1922 error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
1923 &zfsvfs->z_attr_table);
1924
1925 ASSERT(error == 0);
1926
1927 /*
1928 * Fold case on file systems that are always or sometimes case
1929 * insensitive.
1930 */
1931 if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
1932 zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
1933
1934 mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1935 list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
1936 offsetof(znode_t, z_link_node));
1937
	size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
	zfsvfs->z_hold_size = size;
	zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
	    KM_SLEEP);
	zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
	for (i = 0; i != size; i++) {
		avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
		    sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
		mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
	}

	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
	    cr, NULL, &acl_ids));
	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
	ASSERT3P(zp, ==, rootzp);
	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
	ASSERT(error == 0);
	zfs_acl_ids_free(&acl_ids);

	atomic_set(&ZTOI(rootzp)->i_count, 0);
	sa_handle_destroy(rootzp->z_sa_hdl);
	kmem_cache_free(znode_cache, rootzp);

	/*
	 * Create shares directory
	 */
	error = zfs_create_share_dir(zfsvfs, tx);
	ASSERT(error == 0);

	for (i = 0; i != size; i++) {
		avl_destroy(&zfsvfs->z_hold_trees[i]);
		mutex_destroy(&zfsvfs->z_hold_locks[i]);
	}

	mutex_destroy(&zfsvfs->z_znodes_lock);

	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
	kmem_free(sb, sizeof (struct super_block));
	kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
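
/*
 * Illustrative sketch (not part of the module): zfs_create_fs() is
 * intended to run from an objset creation callback, inside the caller's
 * already-assigned tx, which is why the zap operations above on the
 * empty objset cannot fail.  The callback name and argument plumbing
 * below are hypothetical.
 *
 *	static void
 *	my_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
 *	{
 *		nvlist_t *zplprops = arg;
 *
 *		zfs_create_fs(os, cr, zplprops, tx);
 *	}
 */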
#endif /* _KERNEL */

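/*
 * Look up the SA attribute registration object in the master node and
 * build the table used to translate ZPL attribute ids.  ENOENT here
 * simply means the objset predates system attributes, in which case
 * sa_setup() is given object 0 and falls back to the legacy layout.
 */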
static int
zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
{
	uint64_t sa_obj = 0;
	int error;

	error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
	if (error != 0 && error != ENOENT)
		return (error);

	error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
	return (error);
}

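/*
 * Hold the bonus buffer for the given object and return a private SA
 * handle for it.  The bonus type must be either DMU_OT_SA or a full
 * legacy DMU_OT_ZNODE; anything else is rejected with ENOTSUP.  On
 * success the caller owns both the handle and the buffer and must drop
 * them with zfs_release_sa_handle().
 */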
static int
zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
    dmu_buf_t **db, void *tag)
{
	dmu_object_info_t doi;
	int error;

	if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
		return (error);

	dmu_object_info_from_db(*db, &doi);
	if ((doi.doi_bonus_type != DMU_OT_SA &&
	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
	    doi.doi_bonus_size < sizeof (znode_phys_t))) {
		sa_buf_rele(*db, tag);
		return (SET_ERROR(ENOTSUP));
	}

	error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
	if (error != 0) {
		sa_buf_rele(*db, tag);
		return (error);
	}

	return (0);
}

void
zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
{
	sa_handle_destroy(hdl);
	sa_buf_rele(db, tag);
}
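
/*
 * Minimal sketch of the intended grab/release pairing (illustrative
 * only; "attr" and "val" are hypothetical):
 *
 *	sa_handle_t *hdl;
 *	dmu_buf_t *db;
 *	uint64_t val;
 *	int error;
 *
 *	if ((error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG)) != 0)
 *		return (error);
 *	error = sa_lookup(hdl, attr, &val, sizeof (val));
 *	zfs_release_sa_handle(hdl, db, FTAG);
 */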

/*
 * Given an object number, return its parent object number and whether
 * or not the object is an extended attribute directory.
 */
static int
zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
    uint64_t *pobjp, int *is_xattrdir)
{
	uint64_t parent;
	uint64_t pflags;
	uint64_t mode;
	uint64_t parent_mode;
	sa_bulk_attr_t bulk[3];
	sa_handle_t *sa_hdl;
	dmu_buf_t *sa_db;
	int count = 0;
	int error;

	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
	    &parent, sizeof (parent));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
	    &pflags, sizeof (pflags));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
	    &mode, sizeof (mode));

	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
		return (error);

	/*
	 * When a link is removed its parent pointer is not changed and
	 * will be invalid.  There are two cases where a link is removed
	 * but the file stays around: when it goes to the delete queue,
	 * and when additional links still exist.
	 */
	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
	if (error != 0)
		return (error);

	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
	if (error != 0)
		return (error);

	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);

	/*
	 * An extended attribute directory can be attached to files as
	 * well as directories, so only require that the parent be a
	 * directory when this is not an xattr directory.
	 */
	if (!*is_xattrdir && !S_ISDIR(parent_mode))
		return (SET_ERROR(EINVAL));

	*pobjp = parent;

	return (0);
}

/*
 * Given an object number, return some zpl level statistics
 */
static int
zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
    zfs_stat_t *sb)
{
	sa_bulk_attr_t bulk[4];
	int count = 0;

	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
	    &sb->zs_mode, sizeof (sb->zs_mode));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
	    &sb->zs_gen, sizeof (sb->zs_gen));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
	    &sb->zs_links, sizeof (sb->zs_links));
	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
	    &sb->zs_ctime, sizeof (sb->zs_ctime));

	return (sa_bulk_lookup(hdl, bulk, count));
}

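/*
 * Construct the path for an object by walking parent pointers from the
 * object up to the root.  The name of each component is looked up in
 * its parent directory, and the path is assembled right-to-left at the
 * end of the caller's buffer.  Objects on the delete queue no longer
 * have a valid path and are reported as ESTALE.
 */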
static int
zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
    sa_attr_type_t *sa_table, char *buf, int len)
{
	sa_handle_t *sa_hdl;
	sa_handle_t *prevhdl = NULL;
	dmu_buf_t *prevdb = NULL;
	dmu_buf_t *sa_db = NULL;
	char *path = buf + len - 1;
	int error;

	*path = '\0';
	sa_hdl = hdl;

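	/*
	 * An object on the delete queue has been unlinked and no longer
	 * has a meaningful path, so report it as stale.
	 */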
	uint64_t deleteq_obj;
	VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
	    ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
	error = zap_lookup_int(osp, deleteq_obj, obj);
	if (error == 0) {
		return (SET_ERROR(ESTALE));
	} else if (error != ENOENT) {
		return (error);
	}
	error = 0;

	for (;;) {
		uint64_t pobj = 0;
		char component[MAXNAMELEN + 2];
		size_t complen;
		int is_xattrdir = 0;

		if (prevdb)
			zfs_release_sa_handle(prevhdl, prevdb, FTAG);

		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
		    &is_xattrdir)) != 0)
			break;

		if (pobj == obj) {
			if (path[0] != '/')
				*--path = '/';
			break;
		}

		component[0] = '/';
		if (is_xattrdir) {
			(void) sprintf(component + 1, "<xattrdir>");
		} else {
			error = zap_value_search(osp, pobj, obj,
			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
			if (error != 0)
				break;
		}

		complen = strlen(component);
		path -= complen;
		ASSERT(path >= buf);
		bcopy(component, path, complen);
		obj = pobj;

		if (sa_hdl != hdl) {
			prevhdl = sa_hdl;
			prevdb = sa_db;
		}
		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
		if (error != 0) {
			sa_hdl = prevhdl;
			sa_db = prevdb;
			break;
		}
	}

	if (sa_hdl != NULL && sa_hdl != hdl) {
		ASSERT(sa_db != NULL);
		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
	}

	if (error == 0)
		(void) memmove(buf, path, buf + len - path);

	return (error);
}

int
zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
{
	sa_attr_type_t *sa_table;
	sa_handle_t *hdl;
	dmu_buf_t *db;
	int error;

	error = zfs_sa_setup(osp, &sa_table);
	if (error != 0)
		return (error);

	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
	if (error != 0)
		return (error);

	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);

	zfs_release_sa_handle(hdl, db, FTAG);
	return (error);
}
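
/*
 * Minimal usage sketch (illustrative only): callers such as the
 * ZFS_IOC_OBJ_TO_PATH ioctl hand in a MAXPATHLEN-sized scratch buffer
 * and, on success, read the path from the front of the buffer.
 *
 *	char path[MAXPATHLEN];
 *	int error;
 *
 *	error = zfs_obj_to_path(osp, obj, path, sizeof (path));
 *	if (error == 0)
 *		zfs_dbgmsg("object %llu is %s", (u_longlong_t)obj, path);
 */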

int
zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
    char *buf, int len)
{
	char *path = buf + len - 1;
	sa_attr_type_t *sa_table;
	sa_handle_t *hdl;
	dmu_buf_t *db;
	int error;

	*path = '\0';

	error = zfs_sa_setup(osp, &sa_table);
	if (error != 0)
		return (error);

	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
	if (error != 0)
		return (error);

	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
	if (error != 0) {
		zfs_release_sa_handle(hdl, db, FTAG);
		return (error);
	}

	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);

	zfs_release_sa_handle(hdl, db, FTAG);
	return (error);
}
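
/*
 * Note that zfs_obj_to_stats() grabs the SA handle once and reuses it
 * for both the stat lookup and the path walk, avoiding a second
 * sa_buf_hold() on the same object.
 */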

#if defined(_KERNEL) && defined(HAVE_SPL)
EXPORT_SYMBOL(zfs_create_fs);
EXPORT_SYMBOL(zfs_obj_to_path);

/* CSTYLED */
module_param(zfs_object_mutex_size, uint, 0644);
MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
#endif