4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
29 #include <sys/zfs_context.h>
31 #include <sys/refcount.h>
32 #include <sys/zap_impl.h>
33 #include <sys/zap_leaf.h>
38 #include <sys/sunddi.h>
41 static int mzap_upgrade(zap_t
**zapp
, dmu_tx_t
*tx
, zap_flags_t flags
);
44 zap_getflags(zap_t
*zap
)
48 return (zap
->zap_u
.zap_fat
.zap_phys
->zap_flags
);
52 zap_hashbits(zap_t
*zap
)
54 if (zap_getflags(zap
) & ZAP_FLAG_HASH64
)
63 if (zap_getflags(zap
) & ZAP_FLAG_HASH64
)
70 zap_hash(zap_name_t
*zn
)
72 zap_t
*zap
= zn
->zn_zap
;
75 if (zap_getflags(zap
) & ZAP_FLAG_PRE_HASHED_KEY
) {
76 ASSERT(zap_getflags(zap
) & ZAP_FLAG_UINT64_KEY
);
77 h
= *(uint64_t *)zn
->zn_key_orig
;
81 ASSERT(zfs_crc64_table
[128] == ZFS_CRC64_POLY
);
83 if (zap_getflags(zap
) & ZAP_FLAG_UINT64_KEY
) {
85 const uint64_t *wp
= zn
->zn_key_norm
;
87 ASSERT(zn
->zn_key_intlen
== 8);
88 for (i
= 0; i
< zn
->zn_key_norm_numints
; wp
++, i
++) {
92 for (j
= 0; j
< zn
->zn_key_intlen
; j
++) {
94 zfs_crc64_table
[(h
^ word
) & 0xFF];
100 const uint8_t *cp
= zn
->zn_key_norm
;
103 * We previously stored the terminating null on
104 * disk, but didn't hash it, so we need to
105 * continue to not hash it. (The
106 * zn_key_*_numints includes the terminating
107 * null for non-binary keys.)
109 len
= zn
->zn_key_norm_numints
- 1;
111 ASSERT(zn
->zn_key_intlen
== 1);
112 for (i
= 0; i
< len
; cp
++, i
++) {
114 zfs_crc64_table
[(h
^ *cp
) & 0xFF];
119 * Don't use all 64 bits, since we need some in the cookie for
120 * the collision differentiator. We MUST use the high bits,
121 * since those are the ones that we first pay attention to when
122 * choosing the bucket.
124 h
&= ~((1ULL << (64 - zap_hashbits(zap
))) - 1);
130 zap_normalize(zap_t
*zap
, const char *name
, char *namenorm
)
132 size_t inlen
, outlen
;
135 ASSERT(!(zap_getflags(zap
) & ZAP_FLAG_UINT64_KEY
));
137 inlen
= strlen(name
) + 1;
138 outlen
= ZAP_MAXNAMELEN
;
141 (void) u8_textprep_str((char *)name
, &inlen
, namenorm
, &outlen
,
142 zap
->zap_normflags
| U8_TEXTPREP_IGNORE_NULL
|
143 U8_TEXTPREP_IGNORE_INVALID
, U8_UNICODE_LATEST
, &err
);
149 zap_match(zap_name_t
*zn
, const char *matchname
)
151 ASSERT(!(zap_getflags(zn
->zn_zap
) & ZAP_FLAG_UINT64_KEY
));
153 if (zn
->zn_matchtype
== MT_FIRST
) {
154 char norm
[ZAP_MAXNAMELEN
];
156 if (zap_normalize(zn
->zn_zap
, matchname
, norm
) != 0)
159 return (strcmp(zn
->zn_key_norm
, norm
) == 0);
161 /* MT_BEST or MT_EXACT */
162 return (strcmp(zn
->zn_key_orig
, matchname
) == 0);
167 zap_name_free(zap_name_t
*zn
)
169 kmem_free(zn
, sizeof (zap_name_t
));
173 zap_name_alloc(zap_t
*zap
, const char *key
, matchtype_t mt
)
175 zap_name_t
*zn
= kmem_alloc(sizeof (zap_name_t
), KM_SLEEP
);
178 zn
->zn_key_intlen
= sizeof (*key
);
179 zn
->zn_key_orig
= key
;
180 zn
->zn_key_orig_numints
= strlen(zn
->zn_key_orig
) + 1;
181 zn
->zn_matchtype
= mt
;
182 if (zap
->zap_normflags
) {
183 if (zap_normalize(zap
, key
, zn
->zn_normbuf
) != 0) {
187 zn
->zn_key_norm
= zn
->zn_normbuf
;
188 zn
->zn_key_norm_numints
= strlen(zn
->zn_key_norm
) + 1;
190 if (mt
!= MT_EXACT
) {
194 zn
->zn_key_norm
= zn
->zn_key_orig
;
195 zn
->zn_key_norm_numints
= zn
->zn_key_orig_numints
;
198 zn
->zn_hash
= zap_hash(zn
);
203 zap_name_alloc_uint64(zap_t
*zap
, const uint64_t *key
, int numints
)
205 zap_name_t
*zn
= kmem_alloc(sizeof (zap_name_t
), KM_SLEEP
);
207 ASSERT(zap
->zap_normflags
== 0);
209 zn
->zn_key_intlen
= sizeof (*key
);
210 zn
->zn_key_orig
= zn
->zn_key_norm
= key
;
211 zn
->zn_key_orig_numints
= zn
->zn_key_norm_numints
= numints
;
212 zn
->zn_matchtype
= MT_EXACT
;
214 zn
->zn_hash
= zap_hash(zn
);
219 mzap_byteswap(mzap_phys_t
*buf
, size_t size
)
222 buf
->mz_block_type
= BSWAP_64(buf
->mz_block_type
);
223 buf
->mz_salt
= BSWAP_64(buf
->mz_salt
);
224 buf
->mz_normflags
= BSWAP_64(buf
->mz_normflags
);
225 max
= (size
/ MZAP_ENT_LEN
) - 1;
226 for (i
= 0; i
< max
; i
++) {
227 buf
->mz_chunk
[i
].mze_value
=
228 BSWAP_64(buf
->mz_chunk
[i
].mze_value
);
229 buf
->mz_chunk
[i
].mze_cd
=
230 BSWAP_32(buf
->mz_chunk
[i
].mze_cd
);
235 zap_byteswap(void *buf
, size_t size
)
239 block_type
= *(uint64_t *)buf
;
241 if (block_type
== ZBT_MICRO
|| block_type
== BSWAP_64(ZBT_MICRO
)) {
242 /* ASSERT(magic == ZAP_LEAF_MAGIC); */
243 mzap_byteswap(buf
, size
);
245 fzap_byteswap(buf
, size
);
250 mze_compare(const void *arg1
, const void *arg2
)
252 const mzap_ent_t
*mze1
= arg1
;
253 const mzap_ent_t
*mze2
= arg2
;
255 if (mze1
->mze_hash
> mze2
->mze_hash
)
257 if (mze1
->mze_hash
< mze2
->mze_hash
)
259 if (mze1
->mze_cd
> mze2
->mze_cd
)
261 if (mze1
->mze_cd
< mze2
->mze_cd
)
267 mze_insert(zap_t
*zap
, int chunkid
, uint64_t hash
)
271 ASSERT(zap
->zap_ismicro
);
272 ASSERT(RW_WRITE_HELD(&zap
->zap_rwlock
));
274 mze
= kmem_alloc(sizeof (mzap_ent_t
), KM_SLEEP
);
275 mze
->mze_chunkid
= chunkid
;
276 mze
->mze_hash
= hash
;
277 mze
->mze_cd
= MZE_PHYS(zap
, mze
)->mze_cd
;
278 ASSERT(MZE_PHYS(zap
, mze
)->mze_name
[0] != 0);
279 avl_add(&zap
->zap_m
.zap_avl
, mze
);
283 mze_find(zap_name_t
*zn
)
285 mzap_ent_t mze_tofind
;
288 avl_tree_t
*avl
= &zn
->zn_zap
->zap_m
.zap_avl
;
290 ASSERT(zn
->zn_zap
->zap_ismicro
);
291 ASSERT(RW_LOCK_HELD(&zn
->zn_zap
->zap_rwlock
));
293 mze_tofind
.mze_hash
= zn
->zn_hash
;
294 mze_tofind
.mze_cd
= 0;
297 mze
= avl_find(avl
, &mze_tofind
, &idx
);
299 mze
= avl_nearest(avl
, idx
, AVL_AFTER
);
300 for (; mze
&& mze
->mze_hash
== zn
->zn_hash
; mze
= AVL_NEXT(avl
, mze
)) {
301 ASSERT3U(mze
->mze_cd
, ==, MZE_PHYS(zn
->zn_zap
, mze
)->mze_cd
);
302 if (zap_match(zn
, MZE_PHYS(zn
->zn_zap
, mze
)->mze_name
))
305 if (zn
->zn_matchtype
== MT_BEST
) {
306 zn
->zn_matchtype
= MT_FIRST
;
313 mze_find_unused_cd(zap_t
*zap
, uint64_t hash
)
315 mzap_ent_t mze_tofind
;
318 avl_tree_t
*avl
= &zap
->zap_m
.zap_avl
;
321 ASSERT(zap
->zap_ismicro
);
322 ASSERT(RW_LOCK_HELD(&zap
->zap_rwlock
));
324 mze_tofind
.mze_hash
= hash
;
325 mze_tofind
.mze_cd
= 0;
328 for (mze
= avl_find(avl
, &mze_tofind
, &idx
);
329 mze
&& mze
->mze_hash
== hash
; mze
= AVL_NEXT(avl
, mze
)) {
330 if (mze
->mze_cd
!= cd
)
339 mze_remove(zap_t
*zap
, mzap_ent_t
*mze
)
341 ASSERT(zap
->zap_ismicro
);
342 ASSERT(RW_WRITE_HELD(&zap
->zap_rwlock
));
344 avl_remove(&zap
->zap_m
.zap_avl
, mze
);
345 kmem_free(mze
, sizeof (mzap_ent_t
));
349 mze_destroy(zap_t
*zap
)
352 void *avlcookie
= NULL
;
354 while ((mze
= avl_destroy_nodes(&zap
->zap_m
.zap_avl
, &avlcookie
)))
355 kmem_free(mze
, sizeof (mzap_ent_t
));
356 avl_destroy(&zap
->zap_m
.zap_avl
);
360 mzap_open(objset_t
*os
, uint64_t obj
, dmu_buf_t
*db
)
366 ASSERT3U(MZAP_ENT_LEN
, ==, sizeof (mzap_ent_phys_t
));
368 zap
= kmem_zalloc(sizeof (zap_t
), KM_SLEEP
);
369 rw_init(&zap
->zap_rwlock
, NULL
, RW_DEFAULT
, NULL
);
370 rw_enter(&zap
->zap_rwlock
, RW_WRITER
);
371 zap
->zap_objset
= os
;
372 zap
->zap_object
= obj
;
375 if (*(uint64_t *)db
->db_data
!= ZBT_MICRO
) {
376 mutex_init(&zap
->zap_f
.zap_num_entries_mtx
, 0, 0, 0);
377 zap
->zap_f
.zap_block_shift
= highbit64(db
->db_size
) - 1;
379 zap
->zap_ismicro
= TRUE
;
383 * Make sure that zap_ismicro is set before we let others see
384 * it, because zap_lockdir() checks zap_ismicro without the lock
387 winner
= dmu_buf_set_user(db
, zap
, &zap
->zap_m
.zap_phys
, zap_evict
);
389 if (winner
!= NULL
) {
390 rw_exit(&zap
->zap_rwlock
);
391 rw_destroy(&zap
->zap_rwlock
);
392 if (!zap
->zap_ismicro
)
393 mutex_destroy(&zap
->zap_f
.zap_num_entries_mtx
);
394 kmem_free(zap
, sizeof (zap_t
));
398 if (zap
->zap_ismicro
) {
399 zap
->zap_salt
= zap
->zap_m
.zap_phys
->mz_salt
;
400 zap
->zap_normflags
= zap
->zap_m
.zap_phys
->mz_normflags
;
401 zap
->zap_m
.zap_num_chunks
= db
->db_size
/ MZAP_ENT_LEN
- 1;
402 avl_create(&zap
->zap_m
.zap_avl
, mze_compare
,
403 sizeof (mzap_ent_t
), offsetof(mzap_ent_t
, mze_node
));
405 for (i
= 0; i
< zap
->zap_m
.zap_num_chunks
; i
++) {
406 mzap_ent_phys_t
*mze
=
407 &zap
->zap_m
.zap_phys
->mz_chunk
[i
];
408 if (mze
->mze_name
[0]) {
411 zap
->zap_m
.zap_num_entries
++;
412 zn
= zap_name_alloc(zap
, mze
->mze_name
,
414 mze_insert(zap
, i
, zn
->zn_hash
);
419 zap
->zap_salt
= zap
->zap_f
.zap_phys
->zap_salt
;
420 zap
->zap_normflags
= zap
->zap_f
.zap_phys
->zap_normflags
;
422 ASSERT3U(sizeof (struct zap_leaf_header
), ==,
423 2*ZAP_LEAF_CHUNKSIZE
);
426 * The embedded pointer table should not overlap the
429 ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap
, 0), >,
430 &zap
->zap_f
.zap_phys
->zap_salt
);
433 * The embedded pointer table should end at the end of
436 ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap
,
437 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap
)) -
438 (uintptr_t)zap
->zap_f
.zap_phys
, ==,
439 zap
->zap_dbuf
->db_size
);
441 rw_exit(&zap
->zap_rwlock
);
446 zap_lockdir(objset_t
*os
, uint64_t obj
, dmu_tx_t
*tx
,
447 krw_t lti
, boolean_t fatreader
, boolean_t adding
, zap_t
**zapp
)
449 dmu_object_info_t doi
;
457 err
= dmu_buf_hold(os
, obj
, 0, NULL
, &db
, DMU_READ_NO_PREFETCH
);
461 dmu_object_info_from_db(db
, &doi
);
462 if (DMU_OT_BYTESWAP(doi
.doi_type
) != DMU_BSWAP_ZAP
)
463 return (SET_ERROR(EINVAL
));
465 zap
= dmu_buf_get_user(db
);
467 zap
= mzap_open(os
, obj
, db
);
470 * We're checking zap_ismicro without the lock held, in order to
471 * tell what type of lock we want. Once we have some sort of
472 * lock, see if it really is the right type. In practice this
473 * can only be different if it was upgraded from micro to fat,
474 * and micro wanted WRITER but fat only needs READER.
476 lt
= (!zap
->zap_ismicro
&& fatreader
) ? RW_READER
: lti
;
477 rw_enter(&zap
->zap_rwlock
, lt
);
478 if (lt
!= ((!zap
->zap_ismicro
&& fatreader
) ? RW_READER
: lti
)) {
479 /* it was upgraded, now we only need reader */
480 ASSERT(lt
== RW_WRITER
);
482 (!zap
->zap_ismicro
&& fatreader
) ? RW_READER
: lti
);
483 rw_downgrade(&zap
->zap_rwlock
);
487 zap
->zap_objset
= os
;
490 dmu_buf_will_dirty(db
, tx
);
492 ASSERT3P(zap
->zap_dbuf
, ==, db
);
494 ASSERT(!zap
->zap_ismicro
||
495 zap
->zap_m
.zap_num_entries
<= zap
->zap_m
.zap_num_chunks
);
496 if (zap
->zap_ismicro
&& tx
&& adding
&&
497 zap
->zap_m
.zap_num_entries
== zap
->zap_m
.zap_num_chunks
) {
498 uint64_t newsz
= db
->db_size
+ SPA_MINBLOCKSIZE
;
499 if (newsz
> MZAP_MAX_BLKSZ
) {
500 dprintf("upgrading obj %llu: num_entries=%u\n",
501 obj
, zap
->zap_m
.zap_num_entries
);
503 return (mzap_upgrade(zapp
, tx
, 0));
505 err
= dmu_object_set_blocksize(os
, obj
, newsz
, 0, tx
);
507 zap
->zap_m
.zap_num_chunks
=
508 db
->db_size
/ MZAP_ENT_LEN
- 1;
516 zap_unlockdir(zap_t
*zap
)
518 rw_exit(&zap
->zap_rwlock
);
519 dmu_buf_rele(zap
->zap_dbuf
, NULL
);
523 mzap_upgrade(zap_t
**zapp
, dmu_tx_t
*tx
, zap_flags_t flags
)
530 ASSERT(RW_WRITE_HELD(&zap
->zap_rwlock
));
532 sz
= zap
->zap_dbuf
->db_size
;
533 mzp
= zio_buf_alloc(sz
);
534 bcopy(zap
->zap_dbuf
->db_data
, mzp
, sz
);
535 nchunks
= zap
->zap_m
.zap_num_chunks
;
538 err
= dmu_object_set_blocksize(zap
->zap_objset
, zap
->zap_object
,
539 1ULL << fzap_default_block_shift
, 0, tx
);
541 zio_buf_free(mzp
, sz
);
546 dprintf("upgrading obj=%llu with %u chunks\n",
547 zap
->zap_object
, nchunks
);
548 /* XXX destroy the avl later, so we can use the stored hash value */
551 fzap_upgrade(zap
, tx
, flags
);
553 for (i
= 0; i
< nchunks
; i
++) {
554 mzap_ent_phys_t
*mze
= &mzp
->mz_chunk
[i
];
556 if (mze
->mze_name
[0] == 0)
558 dprintf("adding %s=%llu\n",
559 mze
->mze_name
, mze
->mze_value
);
560 zn
= zap_name_alloc(zap
, mze
->mze_name
, MT_EXACT
);
561 err
= fzap_add_cd(zn
, 8, 1, &mze
->mze_value
, mze
->mze_cd
, tx
);
562 zap
= zn
->zn_zap
; /* fzap_add_cd() may change zap */
567 zio_buf_free(mzp
, sz
);
573 mzap_create_impl(objset_t
*os
, uint64_t obj
, int normflags
, zap_flags_t flags
,
579 VERIFY(0 == dmu_buf_hold(os
, obj
, 0, FTAG
, &db
, DMU_READ_NO_PREFETCH
));
583 dmu_object_info_t doi
;
584 dmu_object_info_from_db(db
, &doi
);
585 ASSERT3U(DMU_OT_BYTESWAP(doi
.doi_type
), ==, DMU_BSWAP_ZAP
);
589 dmu_buf_will_dirty(db
, tx
);
591 zp
->mz_block_type
= ZBT_MICRO
;
592 zp
->mz_salt
= ((uintptr_t)db
^ (uintptr_t)tx
^ (obj
<< 1)) | 1ULL;
593 zp
->mz_normflags
= normflags
;
594 dmu_buf_rele(db
, FTAG
);
598 /* Only fat zap supports flags; upgrade immediately. */
599 VERIFY(0 == zap_lockdir(os
, obj
, tx
, RW_WRITER
,
600 B_FALSE
, B_FALSE
, &zap
));
601 VERIFY3U(0, ==, mzap_upgrade(&zap
, tx
, flags
));
607 zap_create_claim(objset_t
*os
, uint64_t obj
, dmu_object_type_t ot
,
608 dmu_object_type_t bonustype
, int bonuslen
, dmu_tx_t
*tx
)
610 return (zap_create_claim_norm(os
, obj
,
611 0, ot
, bonustype
, bonuslen
, tx
));
615 zap_create_claim_norm(objset_t
*os
, uint64_t obj
, int normflags
,
616 dmu_object_type_t ot
,
617 dmu_object_type_t bonustype
, int bonuslen
, dmu_tx_t
*tx
)
621 err
= dmu_object_claim(os
, obj
, ot
, 0, bonustype
, bonuslen
, tx
);
624 mzap_create_impl(os
, obj
, normflags
, 0, tx
);
629 zap_create(objset_t
*os
, dmu_object_type_t ot
,
630 dmu_object_type_t bonustype
, int bonuslen
, dmu_tx_t
*tx
)
632 return (zap_create_norm(os
, 0, ot
, bonustype
, bonuslen
, tx
));
636 zap_create_norm(objset_t
*os
, int normflags
, dmu_object_type_t ot
,
637 dmu_object_type_t bonustype
, int bonuslen
, dmu_tx_t
*tx
)
639 uint64_t obj
= dmu_object_alloc(os
, ot
, 0, bonustype
, bonuslen
, tx
);
641 mzap_create_impl(os
, obj
, normflags
, 0, tx
);
646 zap_create_flags(objset_t
*os
, int normflags
, zap_flags_t flags
,
647 dmu_object_type_t ot
, int leaf_blockshift
, int indirect_blockshift
,
648 dmu_object_type_t bonustype
, int bonuslen
, dmu_tx_t
*tx
)
650 uint64_t obj
= dmu_object_alloc(os
, ot
, 0, bonustype
, bonuslen
, tx
);
652 ASSERT(leaf_blockshift
>= SPA_MINBLOCKSHIFT
&&
653 leaf_blockshift
<= SPA_MAXBLOCKSHIFT
&&
654 indirect_blockshift
>= SPA_MINBLOCKSHIFT
&&
655 indirect_blockshift
<= SPA_MAXBLOCKSHIFT
);
657 VERIFY(dmu_object_set_blocksize(os
, obj
,
658 1ULL << leaf_blockshift
, indirect_blockshift
, tx
) == 0);
660 mzap_create_impl(os
, obj
, normflags
, flags
, tx
);
665 zap_destroy(objset_t
*os
, uint64_t zapobj
, dmu_tx_t
*tx
)
668 * dmu_object_free will free the object number and free the
669 * data. Freeing the data will cause our pageout function to be
670 * called, which will destroy our data (zap_leaf_t's and zap_t).
673 return (dmu_object_free(os
, zapobj
, tx
));
678 zap_evict(dmu_buf_t
*db
, void *vzap
)
682 rw_destroy(&zap
->zap_rwlock
);
684 if (zap
->zap_ismicro
)
687 mutex_destroy(&zap
->zap_f
.zap_num_entries_mtx
);
689 kmem_free(zap
, sizeof (zap_t
));
693 zap_count(objset_t
*os
, uint64_t zapobj
, uint64_t *count
)
698 err
= zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, &zap
);
701 if (!zap
->zap_ismicro
) {
702 err
= fzap_count(zap
, count
);
704 *count
= zap
->zap_m
.zap_num_entries
;
711 * zn may be NULL; if not specified, it will be computed if needed.
712 * See also the comment above zap_entry_normalization_conflict().
715 mzap_normalization_conflict(zap_t
*zap
, zap_name_t
*zn
, mzap_ent_t
*mze
)
718 int direction
= AVL_BEFORE
;
719 boolean_t allocdzn
= B_FALSE
;
721 if (zap
->zap_normflags
== 0)
725 for (other
= avl_walk(&zap
->zap_m
.zap_avl
, mze
, direction
);
726 other
&& other
->mze_hash
== mze
->mze_hash
;
727 other
= avl_walk(&zap
->zap_m
.zap_avl
, other
, direction
)) {
730 zn
= zap_name_alloc(zap
, MZE_PHYS(zap
, mze
)->mze_name
,
734 if (zap_match(zn
, MZE_PHYS(zap
, other
)->mze_name
)) {
741 if (direction
== AVL_BEFORE
) {
742 direction
= AVL_AFTER
;
752 * Routines for manipulating attributes.
756 zap_lookup(objset_t
*os
, uint64_t zapobj
, const char *name
,
757 uint64_t integer_size
, uint64_t num_integers
, void *buf
)
759 return (zap_lookup_norm(os
, zapobj
, name
, integer_size
,
760 num_integers
, buf
, MT_EXACT
, NULL
, 0, NULL
));
764 zap_lookup_norm(objset_t
*os
, uint64_t zapobj
, const char *name
,
765 uint64_t integer_size
, uint64_t num_integers
, void *buf
,
766 matchtype_t mt
, char *realname
, int rn_len
,
774 err
= zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, &zap
);
777 zn
= zap_name_alloc(zap
, name
, mt
);
780 return (SET_ERROR(ENOTSUP
));
783 if (!zap
->zap_ismicro
) {
784 err
= fzap_lookup(zn
, integer_size
, num_integers
, buf
,
785 realname
, rn_len
, ncp
);
789 err
= SET_ERROR(ENOENT
);
791 if (num_integers
< 1) {
792 err
= SET_ERROR(EOVERFLOW
);
793 } else if (integer_size
!= 8) {
794 err
= SET_ERROR(EINVAL
);
797 MZE_PHYS(zap
, mze
)->mze_value
;
798 (void) strlcpy(realname
,
799 MZE_PHYS(zap
, mze
)->mze_name
, rn_len
);
801 *ncp
= mzap_normalization_conflict(zap
,
813 zap_prefetch_uint64(objset_t
*os
, uint64_t zapobj
, const uint64_t *key
,
820 err
= zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, &zap
);
823 zn
= zap_name_alloc_uint64(zap
, key
, key_numints
);
826 return (SET_ERROR(ENOTSUP
));
836 zap_lookup_uint64(objset_t
*os
, uint64_t zapobj
, const uint64_t *key
,
837 int key_numints
, uint64_t integer_size
, uint64_t num_integers
, void *buf
)
843 err
= zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, &zap
);
846 zn
= zap_name_alloc_uint64(zap
, key
, key_numints
);
849 return (SET_ERROR(ENOTSUP
));
852 err
= fzap_lookup(zn
, integer_size
, num_integers
, buf
,
860 zap_contains(objset_t
*os
, uint64_t zapobj
, const char *name
)
862 int err
= zap_lookup_norm(os
, zapobj
, name
, 0,
863 0, NULL
, MT_EXACT
, NULL
, 0, NULL
);
864 if (err
== EOVERFLOW
|| err
== EINVAL
)
865 err
= 0; /* found, but skipped reading the value */
870 zap_length(objset_t
*os
, uint64_t zapobj
, const char *name
,
871 uint64_t *integer_size
, uint64_t *num_integers
)
878 err
= zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, &zap
);
881 zn
= zap_name_alloc(zap
, name
, MT_EXACT
);
884 return (SET_ERROR(ENOTSUP
));
886 if (!zap
->zap_ismicro
) {
887 err
= fzap_length(zn
, integer_size
, num_integers
);
891 err
= SET_ERROR(ENOENT
);
905 zap_length_uint64(objset_t
*os
, uint64_t zapobj
, const uint64_t *key
,
906 int key_numints
, uint64_t *integer_size
, uint64_t *num_integers
)
912 err
= zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, &zap
);
915 zn
= zap_name_alloc_uint64(zap
, key
, key_numints
);
918 return (SET_ERROR(ENOTSUP
));
920 err
= fzap_length(zn
, integer_size
, num_integers
);
927 mzap_addent(zap_name_t
*zn
, uint64_t value
)
930 zap_t
*zap
= zn
->zn_zap
;
931 int start
= zap
->zap_m
.zap_alloc_next
;
934 ASSERT(RW_WRITE_HELD(&zap
->zap_rwlock
));
937 for (i
= 0; i
< zap
->zap_m
.zap_num_chunks
; i
++) {
938 ASSERTV(mzap_ent_phys_t
*mze
);
939 ASSERT(mze
= &zap
->zap_m
.zap_phys
->mz_chunk
[i
]);
940 ASSERT(strcmp(zn
->zn_key_orig
, mze
->mze_name
) != 0);
944 cd
= mze_find_unused_cd(zap
, zn
->zn_hash
);
945 /* given the limited size of the microzap, this can't happen */
946 ASSERT(cd
< zap_maxcd(zap
));
949 for (i
= start
; i
< zap
->zap_m
.zap_num_chunks
; i
++) {
950 mzap_ent_phys_t
*mze
= &zap
->zap_m
.zap_phys
->mz_chunk
[i
];
951 if (mze
->mze_name
[0] == 0) {
952 mze
->mze_value
= value
;
954 (void) strcpy(mze
->mze_name
, zn
->zn_key_orig
);
955 zap
->zap_m
.zap_num_entries
++;
956 zap
->zap_m
.zap_alloc_next
= i
+1;
957 if (zap
->zap_m
.zap_alloc_next
==
958 zap
->zap_m
.zap_num_chunks
)
959 zap
->zap_m
.zap_alloc_next
= 0;
960 mze_insert(zap
, i
, zn
->zn_hash
);
968 cmn_err(CE_PANIC
, "out of entries!");
972 zap_add(objset_t
*os
, uint64_t zapobj
, const char *key
,
973 int integer_size
, uint64_t num_integers
,
974 const void *val
, dmu_tx_t
*tx
)
979 const uint64_t *intval
= val
;
982 err
= zap_lockdir(os
, zapobj
, tx
, RW_WRITER
, TRUE
, TRUE
, &zap
);
985 zn
= zap_name_alloc(zap
, key
, MT_EXACT
);
988 return (SET_ERROR(ENOTSUP
));
990 if (!zap
->zap_ismicro
) {
991 err
= fzap_add(zn
, integer_size
, num_integers
, val
, tx
);
992 zap
= zn
->zn_zap
; /* fzap_add() may change zap */
993 } else if (integer_size
!= 8 || num_integers
!= 1 ||
994 strlen(key
) >= MZAP_NAME_LEN
) {
995 err
= mzap_upgrade(&zn
->zn_zap
, tx
, 0);
997 err
= fzap_add(zn
, integer_size
, num_integers
, val
, tx
);
998 zap
= zn
->zn_zap
; /* fzap_add() may change zap */
1002 err
= SET_ERROR(EEXIST
);
1004 mzap_addent(zn
, *intval
);
1007 ASSERT(zap
== zn
->zn_zap
);
1009 if (zap
!= NULL
) /* may be NULL if fzap_add() failed */
1015 zap_add_uint64(objset_t
*os
, uint64_t zapobj
, const uint64_t *key
,
1016 int key_numints
, int integer_size
, uint64_t num_integers
,
1017 const void *val
, dmu_tx_t
*tx
)
1023 err
= zap_lockdir(os
, zapobj
, tx
, RW_WRITER
, TRUE
, TRUE
, &zap
);
1026 zn
= zap_name_alloc_uint64(zap
, key
, key_numints
);
1029 return (SET_ERROR(ENOTSUP
));
1031 err
= fzap_add(zn
, integer_size
, num_integers
, val
, tx
);
1032 zap
= zn
->zn_zap
; /* fzap_add() may change zap */
1034 if (zap
!= NULL
) /* may be NULL if fzap_add() failed */
1040 zap_update(objset_t
*os
, uint64_t zapobj
, const char *name
,
1041 int integer_size
, uint64_t num_integers
, const void *val
, dmu_tx_t
*tx
)
1045 const uint64_t *intval
= val
;
1053 * If there is an old value, it shouldn't change across the
1054 * lockdir (eg, due to bprewrite's xlation).
1056 if (integer_size
== 8 && num_integers
== 1)
1057 (void) zap_lookup(os
, zapobj
, name
, 8, 1, &oldval
);
1060 err
= zap_lockdir(os
, zapobj
, tx
, RW_WRITER
, TRUE
, TRUE
, &zap
);
1063 zn
= zap_name_alloc(zap
, name
, MT_EXACT
);
1066 return (SET_ERROR(ENOTSUP
));
1068 if (!zap
->zap_ismicro
) {
1069 err
= fzap_update(zn
, integer_size
, num_integers
, val
, tx
);
1070 zap
= zn
->zn_zap
; /* fzap_update() may change zap */
1071 } else if (integer_size
!= 8 || num_integers
!= 1 ||
1072 strlen(name
) >= MZAP_NAME_LEN
) {
1073 dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
1074 zapobj
, integer_size
, num_integers
, name
);
1075 err
= mzap_upgrade(&zn
->zn_zap
, tx
, 0);
1077 err
= fzap_update(zn
, integer_size
, num_integers
,
1079 zap
= zn
->zn_zap
; /* fzap_update() may change zap */
1083 ASSERT3U(MZE_PHYS(zap
, mze
)->mze_value
, ==, oldval
);
1084 MZE_PHYS(zap
, mze
)->mze_value
= *intval
;
1086 mzap_addent(zn
, *intval
);
1089 ASSERT(zap
== zn
->zn_zap
);
1091 if (zap
!= NULL
) /* may be NULL if fzap_upgrade() failed */
1097 zap_update_uint64(objset_t
*os
, uint64_t zapobj
, const uint64_t *key
,
1099 int integer_size
, uint64_t num_integers
, const void *val
, dmu_tx_t
*tx
)
1105 err
= zap_lockdir(os
, zapobj
, tx
, RW_WRITER
, TRUE
, TRUE
, &zap
);
1108 zn
= zap_name_alloc_uint64(zap
, key
, key_numints
);
1111 return (SET_ERROR(ENOTSUP
));
1113 err
= fzap_update(zn
, integer_size
, num_integers
, val
, tx
);
1114 zap
= zn
->zn_zap
; /* fzap_update() may change zap */
1116 if (zap
!= NULL
) /* may be NULL if fzap_upgrade() failed */
1122 zap_remove(objset_t
*os
, uint64_t zapobj
, const char *name
, dmu_tx_t
*tx
)
1124 return (zap_remove_norm(os
, zapobj
, name
, MT_EXACT
, tx
));
1128 zap_remove_norm(objset_t
*os
, uint64_t zapobj
, const char *name
,
1129 matchtype_t mt
, dmu_tx_t
*tx
)
1136 err
= zap_lockdir(os
, zapobj
, tx
, RW_WRITER
, TRUE
, FALSE
, &zap
);
1139 zn
= zap_name_alloc(zap
, name
, mt
);
1142 return (SET_ERROR(ENOTSUP
));
1144 if (!zap
->zap_ismicro
) {
1145 err
= fzap_remove(zn
, tx
);
1149 err
= SET_ERROR(ENOENT
);
1151 zap
->zap_m
.zap_num_entries
--;
1152 bzero(&zap
->zap_m
.zap_phys
->mz_chunk
[mze
->mze_chunkid
],
1153 sizeof (mzap_ent_phys_t
));
1154 mze_remove(zap
, mze
);
1163 zap_remove_uint64(objset_t
*os
, uint64_t zapobj
, const uint64_t *key
,
1164 int key_numints
, dmu_tx_t
*tx
)
1170 err
= zap_lockdir(os
, zapobj
, tx
, RW_WRITER
, TRUE
, FALSE
, &zap
);
1173 zn
= zap_name_alloc_uint64(zap
, key
, key_numints
);
1176 return (SET_ERROR(ENOTSUP
));
1178 err
= fzap_remove(zn
, tx
);
1185 * Routines for iterating over the attributes.
1189 zap_cursor_init_serialized(zap_cursor_t
*zc
, objset_t
*os
, uint64_t zapobj
,
1190 uint64_t serialized
)
1195 zc
->zc_zapobj
= zapobj
;
1196 zc
->zc_serialized
= serialized
;
1202 zap_cursor_init(zap_cursor_t
*zc
, objset_t
*os
, uint64_t zapobj
)
1204 zap_cursor_init_serialized(zc
, os
, zapobj
, 0);
1208 zap_cursor_fini(zap_cursor_t
*zc
)
1211 rw_enter(&zc
->zc_zap
->zap_rwlock
, RW_READER
);
1212 zap_unlockdir(zc
->zc_zap
);
1216 rw_enter(&zc
->zc_leaf
->l_rwlock
, RW_READER
);
1217 zap_put_leaf(zc
->zc_leaf
);
1220 zc
->zc_objset
= NULL
;
1224 zap_cursor_serialize(zap_cursor_t
*zc
)
1226 if (zc
->zc_hash
== -1ULL)
1228 if (zc
->zc_zap
== NULL
)
1229 return (zc
->zc_serialized
);
1230 ASSERT((zc
->zc_hash
& zap_maxcd(zc
->zc_zap
)) == 0);
1231 ASSERT(zc
->zc_cd
< zap_maxcd(zc
->zc_zap
));
1234 * We want to keep the high 32 bits of the cursor zero if we can, so
1235 * that 32-bit programs can access this. So usually use a small
1236 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
1239 * [ collision differentiator | zap_hashbits()-bit hash value ]
1241 return ((zc
->zc_hash
>> (64 - zap_hashbits(zc
->zc_zap
))) |
1242 ((uint64_t)zc
->zc_cd
<< zap_hashbits(zc
->zc_zap
)));
1246 zap_cursor_retrieve(zap_cursor_t
*zc
, zap_attribute_t
*za
)
1250 mzap_ent_t mze_tofind
;
1253 if (zc
->zc_hash
== -1ULL)
1254 return (SET_ERROR(ENOENT
));
1256 if (zc
->zc_zap
== NULL
) {
1258 err
= zap_lockdir(zc
->zc_objset
, zc
->zc_zapobj
, NULL
,
1259 RW_READER
, TRUE
, FALSE
, &zc
->zc_zap
);
1264 * To support zap_cursor_init_serialized, advance, retrieve,
1265 * we must add to the existing zc_cd, which may already
1266 * be 1 due to the zap_cursor_advance.
1268 ASSERT(zc
->zc_hash
== 0);
1269 hb
= zap_hashbits(zc
->zc_zap
);
1270 zc
->zc_hash
= zc
->zc_serialized
<< (64 - hb
);
1271 zc
->zc_cd
+= zc
->zc_serialized
>> hb
;
1272 if (zc
->zc_cd
>= zap_maxcd(zc
->zc_zap
)) /* corrupt serialized */
1275 rw_enter(&zc
->zc_zap
->zap_rwlock
, RW_READER
);
1277 if (!zc
->zc_zap
->zap_ismicro
) {
1278 err
= fzap_cursor_retrieve(zc
->zc_zap
, zc
, za
);
1280 mze_tofind
.mze_hash
= zc
->zc_hash
;
1281 mze_tofind
.mze_cd
= zc
->zc_cd
;
1283 mze
= avl_find(&zc
->zc_zap
->zap_m
.zap_avl
, &mze_tofind
, &idx
);
1285 mze
= avl_nearest(&zc
->zc_zap
->zap_m
.zap_avl
,
1289 mzap_ent_phys_t
*mzep
= MZE_PHYS(zc
->zc_zap
, mze
);
1290 ASSERT3U(mze
->mze_cd
, ==, mzep
->mze_cd
);
1291 za
->za_normalization_conflict
=
1292 mzap_normalization_conflict(zc
->zc_zap
, NULL
, mze
);
1293 za
->za_integer_length
= 8;
1294 za
->za_num_integers
= 1;
1295 za
->za_first_integer
= mzep
->mze_value
;
1296 (void) strcpy(za
->za_name
, mzep
->mze_name
);
1297 zc
->zc_hash
= mze
->mze_hash
;
1298 zc
->zc_cd
= mze
->mze_cd
;
1301 zc
->zc_hash
= -1ULL;
1302 err
= SET_ERROR(ENOENT
);
1305 rw_exit(&zc
->zc_zap
->zap_rwlock
);
1310 zap_cursor_advance(zap_cursor_t
*zc
)
1312 if (zc
->zc_hash
== -1ULL)
1318 zap_cursor_move_to_key(zap_cursor_t
*zc
, const char *name
, matchtype_t mt
)
1324 if (zc
->zc_zap
== NULL
) {
1325 err
= zap_lockdir(zc
->zc_objset
, zc
->zc_zapobj
, NULL
,
1326 RW_READER
, TRUE
, FALSE
, &zc
->zc_zap
);
1330 rw_enter(&zc
->zc_zap
->zap_rwlock
, RW_READER
);
1333 zn
= zap_name_alloc(zc
->zc_zap
, name
, mt
);
1335 rw_exit(&zc
->zc_zap
->zap_rwlock
);
1336 return (SET_ERROR(ENOTSUP
));
1339 if (!zc
->zc_zap
->zap_ismicro
) {
1340 err
= fzap_cursor_move_to_key(zc
, zn
);
1344 err
= SET_ERROR(ENOENT
);
1347 zc
->zc_hash
= mze
->mze_hash
;
1348 zc
->zc_cd
= mze
->mze_cd
;
1353 rw_exit(&zc
->zc_zap
->zap_rwlock
);
1358 zap_get_stats(objset_t
*os
, uint64_t zapobj
, zap_stats_t
*zs
)
1363 err
= zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, &zap
);
1367 bzero(zs
, sizeof (zap_stats_t
));
1369 if (zap
->zap_ismicro
) {
1370 zs
->zs_blocksize
= zap
->zap_dbuf
->db_size
;
1371 zs
->zs_num_entries
= zap
->zap_m
.zap_num_entries
;
1372 zs
->zs_num_blocks
= 1;
1374 fzap_get_stats(zap
, zs
);
1381 zap_count_write(objset_t
*os
, uint64_t zapobj
, const char *name
, int add
,
1382 uint64_t *towrite
, uint64_t *tooverwrite
)
1389 * Since we don't have a name, we cannot figure out which blocks will
1390 * be affected in this operation. So, account for the worst case:
1391 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
1392 * - 4 new blocks written if adding:
1393 * - 2 blocks for possibly split leaves,
1394 * - 2 grown ptrtbl blocks
1396 * This also accommodates the case where an add operation to a fairly
1397 * large microzap results in a promotion to fatzap.
1400 *towrite
+= (3 + (add
? 4 : 0)) * SPA_MAXBLOCKSIZE
;
1405 * We lock the zap with adding == FALSE because, if we passed
1406 * the actual value of add, it could trigger an mzap_upgrade().
1407 * At present we are just evaluating the possibility of this operation
1408 * and hence we do not want to trigger an upgrade.
1410 err
= zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, &zap
);
1414 if (!zap
->zap_ismicro
) {
1415 zap_name_t
*zn
= zap_name_alloc(zap
, name
, MT_EXACT
);
1417 err
= fzap_count_write(zn
, add
, towrite
,
1422 * We treat this case as similar to (name == NULL)
1424 *towrite
+= (3 + (add
? 4 : 0)) * SPA_MAXBLOCKSIZE
;
1428 * We are here if (name != NULL) and this is a micro-zap.
1429 * We account for the header block depending on whether it
1432 * In the case of an add operation, it is hard to find out
1433 * if this add will promote this microzap to fatzap.
1434 * Hence, we consider the worst case and account for the
1435 * blocks assuming this microzap would be promoted to a
1438 * 1 block overwritten : header block
1439 * 4 new blocks written : 2 new split leaf, 2 grown
1442 if (dmu_buf_freeable(zap
->zap_dbuf
))
1443 *tooverwrite
+= SPA_MAXBLOCKSIZE
;
1445 *towrite
+= SPA_MAXBLOCKSIZE
;
1448 *towrite
+= 4 * SPA_MAXBLOCKSIZE
;
1456 #if defined(_KERNEL) && defined(HAVE_SPL)
1457 EXPORT_SYMBOL(zap_create
);
1458 EXPORT_SYMBOL(zap_create_norm
);
1459 EXPORT_SYMBOL(zap_create_flags
);
1460 EXPORT_SYMBOL(zap_create_claim
);
1461 EXPORT_SYMBOL(zap_create_claim_norm
);
1462 EXPORT_SYMBOL(zap_destroy
);
1463 EXPORT_SYMBOL(zap_lookup
);
1464 EXPORT_SYMBOL(zap_lookup_norm
);
1465 EXPORT_SYMBOL(zap_lookup_uint64
);
1466 EXPORT_SYMBOL(zap_contains
);
1467 EXPORT_SYMBOL(zap_prefetch_uint64
);
1468 EXPORT_SYMBOL(zap_count_write
);
1469 EXPORT_SYMBOL(zap_add
);
1470 EXPORT_SYMBOL(zap_add_uint64
);
1471 EXPORT_SYMBOL(zap_update
);
1472 EXPORT_SYMBOL(zap_update_uint64
);
1473 EXPORT_SYMBOL(zap_length
);
1474 EXPORT_SYMBOL(zap_length_uint64
);
1475 EXPORT_SYMBOL(zap_remove
);
1476 EXPORT_SYMBOL(zap_remove_norm
);
1477 EXPORT_SYMBOL(zap_remove_uint64
);
1478 EXPORT_SYMBOL(zap_count
);
1479 EXPORT_SYMBOL(zap_value_search
);
1480 EXPORT_SYMBOL(zap_join
);
1481 EXPORT_SYMBOL(zap_join_increment
);
1482 EXPORT_SYMBOL(zap_add_int
);
1483 EXPORT_SYMBOL(zap_remove_int
);
1484 EXPORT_SYMBOL(zap_lookup_int
);
1485 EXPORT_SYMBOL(zap_increment_int
);
1486 EXPORT_SYMBOL(zap_add_int_key
);
1487 EXPORT_SYMBOL(zap_lookup_int_key
);
1488 EXPORT_SYMBOL(zap_increment
);
1489 EXPORT_SYMBOL(zap_cursor_init
);
1490 EXPORT_SYMBOL(zap_cursor_fini
);
1491 EXPORT_SYMBOL(zap_cursor_retrieve
);
1492 EXPORT_SYMBOL(zap_cursor_advance
);
1493 EXPORT_SYMBOL(zap_cursor_serialize
);
1494 EXPORT_SYMBOL(zap_cursor_move_to_key
);
1495 EXPORT_SYMBOL(zap_cursor_init_serialized
);
1496 EXPORT_SYMBOL(zap_get_stats
);