/* module/zfs/zap_micro.c */
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
26 * Copyright 2017 Nexenta Systems, Inc.
27 */
28
29 #include <sys/zio.h>
30 #include <sys/spa.h>
31 #include <sys/dmu.h>
32 #include <sys/zfs_context.h>
33 #include <sys/zap.h>
34 #include <sys/refcount.h>
35 #include <sys/zap_impl.h>
36 #include <sys/zap_leaf.h>
37 #include <sys/avl.h>
38 #include <sys/arc.h>
39 #include <sys/dmu_objset.h>
40
41 #ifdef _KERNEL
42 #include <sys/sunddi.h>
43 #endif
44
45 extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
46
47 static int mzap_upgrade(zap_t **zapp,
48 void *tag, dmu_tx_t *tx, zap_flags_t flags);
49
50 uint64_t
51 zap_getflags(zap_t *zap)
52 {
53 if (zap->zap_ismicro)
54 return (0);
55 return (zap_f_phys(zap)->zap_flags);
56 }
57
58 int
59 zap_hashbits(zap_t *zap)
60 {
61 if (zap_getflags(zap) & ZAP_FLAG_HASH64)
62 return (48);
63 else
64 return (28);
65 }
66
67 uint32_t
68 zap_maxcd(zap_t *zap)
69 {
70 if (zap_getflags(zap) & ZAP_FLAG_HASH64)
71 return ((1<<16)-1);
72 else
73 return (-1U);
74 }
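
/*
 * Note: with ZAP_FLAG_HASH64 the hash occupies 48 of the 64 cursor bits
 * (see zap_hashbits()), leaving only 16 bits for the collision
 * differentiator, hence the (1<<16)-1 limit above.  Otherwise the cd is
 * bounded only by its 32-bit storage, so -1U is returned.
 */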
75
76 static uint64_t
77 zap_hash(zap_name_t *zn)
78 {
79 zap_t *zap = zn->zn_zap;
80 uint64_t h = 0;
81
82 if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
83 ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
84 h = *(uint64_t *)zn->zn_key_orig;
85 } else {
86 h = zap->zap_salt;
87 ASSERT(h != 0);
88 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
89
90 if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
91 const uint64_t *wp = zn->zn_key_norm;
92
93 ASSERT(zn->zn_key_intlen == 8);
94 for (int i = 0; i < zn->zn_key_norm_numints;
95 wp++, i++) {
96 uint64_t word = *wp;
97
98 for (int j = 0; j < zn->zn_key_intlen; j++) {
99 h = (h >> 8) ^
100 zfs_crc64_table[(h ^ word) & 0xFF];
101 word >>= NBBY;
102 }
103 }
104 } else {
105 const uint8_t *cp = zn->zn_key_norm;
106
107 /*
108 * We previously stored the terminating null on
109 * disk, but didn't hash it, so we need to
110 * continue to not hash it. (The
111 * zn_key_*_numints includes the terminating
112 * null for non-binary keys.)
113 */
114 int len = zn->zn_key_norm_numints - 1;
115
116 ASSERT(zn->zn_key_intlen == 1);
117 for (int i = 0; i < len; cp++, i++) {
118 h = (h >> 8) ^
119 zfs_crc64_table[(h ^ *cp) & 0xFF];
120 }
121 }
122 }
123 /*
124 * Don't use all 64 bits, since we need some in the cookie for
125 * the collision differentiator. We MUST use the high bits,
126 * since those are the ones that we first pay attention to when
127 * choosing the bucket.
128 */
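/*
 * Illustrative example (default, non-HASH64 case): zap_hashbits() is 28,
 * so the low 36 bits are cleared.  A CRC of 0x123456789abcdef0 becomes
 * 0x1234567000000000; only the top 28 bits select the bucket, and the
 * cleared low bits leave room for the cd in a cursor.
 */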
129 h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
130
131 return (h);
132 }
133
134 static int
135 zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags)
136 {
137 ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
138
139 size_t inlen = strlen(name) + 1;
140 size_t outlen = ZAP_MAXNAMELEN;
141
142 int err = 0;
143 (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
144 normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
145 U8_UNICODE_LATEST, &err);
146
147 return (err);
148 }
149
150 boolean_t
151 zap_match(zap_name_t *zn, const char *matchname)
152 {
153 ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
154
155 if (zn->zn_matchtype & MT_NORMALIZE) {
156 char norm[ZAP_MAXNAMELEN];
157
158 if (zap_normalize(zn->zn_zap, matchname, norm,
159 zn->zn_normflags) != 0)
160 return (B_FALSE);
161
162 return (strcmp(zn->zn_key_norm, norm) == 0);
163 } else {
164 return (strcmp(zn->zn_key_orig, matchname) == 0);
165 }
166 }
167
168 void
169 zap_name_free(zap_name_t *zn)
170 {
171 kmem_free(zn, sizeof (zap_name_t));
172 }
173
174 zap_name_t *
175 zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
176 {
177 zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
178
179 zn->zn_zap = zap;
180 zn->zn_key_intlen = sizeof (*key);
181 zn->zn_key_orig = key;
182 zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
183 zn->zn_matchtype = mt;
184 zn->zn_normflags = zap->zap_normflags;
185
186 /*
187 * If we're dealing with a case-sensitive lookup on a mixed or
188 * case-insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
189 * will fold case to all caps, overriding the lookup request.
190 */
191 if (mt & MT_MATCH_CASE)
192 zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
193
194 if (zap->zap_normflags) {
195 /*
196 * We *must* use zap_normflags because this normalization is
197 * what the hash is computed from.
198 */
199 if (zap_normalize(zap, key, zn->zn_normbuf,
200 zap->zap_normflags) != 0) {
201 zap_name_free(zn);
202 return (NULL);
203 }
204 zn->zn_key_norm = zn->zn_normbuf;
205 zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
206 } else {
207 if (mt != 0) {
208 zap_name_free(zn);
209 return (NULL);
210 }
211 zn->zn_key_norm = zn->zn_key_orig;
212 zn->zn_key_norm_numints = zn->zn_key_orig_numints;
213 }
214
215 zn->zn_hash = zap_hash(zn);
216
217 if (zap->zap_normflags != zn->zn_normflags) {
218 /*
219 * We *must* use zn_normflags because this normalization is
220 * what the matching is based on. (Not the hash!)
221 */
222 if (zap_normalize(zap, key, zn->zn_normbuf,
223 zn->zn_normflags) != 0) {
224 zap_name_free(zn);
225 return (NULL);
226 }
227 zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
228 }
229
230 return (zn);
231 }
232
233 zap_name_t *
234 zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
235 {
236 zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
237
238 ASSERT(zap->zap_normflags == 0);
239 zn->zn_zap = zap;
240 zn->zn_key_intlen = sizeof (*key);
241 zn->zn_key_orig = zn->zn_key_norm = key;
242 zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
243 zn->zn_matchtype = 0;
244
245 zn->zn_hash = zap_hash(zn);
246 return (zn);
247 }
248
249 static void
250 mzap_byteswap(mzap_phys_t *buf, size_t size)
251 {
252 buf->mz_block_type = BSWAP_64(buf->mz_block_type);
253 buf->mz_salt = BSWAP_64(buf->mz_salt);
254 buf->mz_normflags = BSWAP_64(buf->mz_normflags);
255 int max = (size / MZAP_ENT_LEN) - 1;
256 for (int i = 0; i < max; i++) {
257 buf->mz_chunk[i].mze_value =
258 BSWAP_64(buf->mz_chunk[i].mze_value);
259 buf->mz_chunk[i].mze_cd =
260 BSWAP_32(buf->mz_chunk[i].mze_cd);
261 }
262 }
263
264 void
265 zap_byteswap(void *buf, size_t size)
266 {
267 uint64_t block_type = *(uint64_t *)buf;
268
269 if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
270 /* ASSERT(magic == ZAP_LEAF_MAGIC); */
271 mzap_byteswap(buf, size);
272 } else {
273 fzap_byteswap(buf, size);
274 }
275 }
276
277 static int
278 mze_compare(const void *arg1, const void *arg2)
279 {
280 const mzap_ent_t *mze1 = arg1;
281 const mzap_ent_t *mze2 = arg2;
282
283 int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash);
284 if (likely(cmp))
285 return (cmp);
286
287 return (AVL_CMP(mze1->mze_cd, mze2->mze_cd));
288 }
289
290 static void
291 mze_insert(zap_t *zap, int chunkid, uint64_t hash)
292 {
293 ASSERT(zap->zap_ismicro);
294 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
295
296 mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
297 mze->mze_chunkid = chunkid;
298 mze->mze_hash = hash;
299 mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
300 ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
301 avl_add(&zap->zap_m.zap_avl, mze);
302 }
303
304 static mzap_ent_t *
305 mze_find(zap_name_t *zn)
306 {
307 mzap_ent_t mze_tofind;
308 mzap_ent_t *mze;
309 avl_index_t idx;
310 avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
311
312 ASSERT(zn->zn_zap->zap_ismicro);
313 ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
314
315 mze_tofind.mze_hash = zn->zn_hash;
316 mze_tofind.mze_cd = 0;
317
318 mze = avl_find(avl, &mze_tofind, &idx);
319 if (mze == NULL)
320 mze = avl_nearest(avl, idx, AVL_AFTER);
321 for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
322 ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
323 if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
324 return (mze);
325 }
326
327 return (NULL);
328 }
329
330 static uint32_t
331 mze_find_unused_cd(zap_t *zap, uint64_t hash)
332 {
333 mzap_ent_t mze_tofind;
334 avl_index_t idx;
335 avl_tree_t *avl = &zap->zap_m.zap_avl;
336
337 ASSERT(zap->zap_ismicro);
338 ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
339
340 mze_tofind.mze_hash = hash;
341 mze_tofind.mze_cd = 0;
342
343 uint32_t cd = 0;
344 for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx);
345 mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
346 if (mze->mze_cd != cd)
347 break;
348 cd++;
349 }
350
351 return (cd);
352 }
353
354 /*
355 * Each mzap entry, once migrated into a fatzap leaf, requires at most one
356 * entry chunk plus the array chunks needed for its name and its value.
357 */
358 #define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
359 ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
360
361 /*
362 * Check whether this entry, plus the existing entries that collide on its
363 * hash (all of which must share one fatzap leaf after upgrade), fit in a leaf.
364 */
365 static boolean_t
366 mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
367 {
368 zap_t *zap = zn->zn_zap;
369 mzap_ent_t mze_tofind;
370 mzap_ent_t *mze;
371 avl_index_t idx;
372 avl_tree_t *avl = &zap->zap_m.zap_avl;
373 uint32_t mzap_ents = 0;
374
375 mze_tofind.mze_hash = hash;
376 mze_tofind.mze_cd = 0;
377
378 for (mze = avl_find(avl, &mze_tofind, &idx);
379 mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
380 mzap_ents++;
381 }
382
383 /* Include the new entry being added */
384 mzap_ents++;
385
386 return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
387 }
388
389 static void
390 mze_remove(zap_t *zap, mzap_ent_t *mze)
391 {
392 ASSERT(zap->zap_ismicro);
393 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
394
395 avl_remove(&zap->zap_m.zap_avl, mze);
396 kmem_free(mze, sizeof (mzap_ent_t));
397 }
398
399 static void
400 mze_destroy(zap_t *zap)
401 {
402 mzap_ent_t *mze;
403 void *avlcookie = NULL;
404
405 while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)))
406 kmem_free(mze, sizeof (mzap_ent_t));
407 avl_destroy(&zap->zap_m.zap_avl);
408 }
409
410 static zap_t *
411 mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
412 {
413 zap_t *winner;
414 uint64_t *zap_hdr = (uint64_t *)db->db_data;
415 uint64_t zap_block_type = zap_hdr[0];
416 uint64_t zap_magic = zap_hdr[1];
417
418 ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
419
420 zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
421 rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
422 rw_enter(&zap->zap_rwlock, RW_WRITER);
423 zap->zap_objset = os;
424 zap->zap_object = obj;
425 zap->zap_dbuf = db;
426
427 if (zap_block_type != ZBT_MICRO) {
428 mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
429 0);
430 zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
431 if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
432 winner = NULL; /* No actual winner here... */
433 goto handle_winner;
434 }
435 } else {
436 zap->zap_ismicro = TRUE;
437 }
438
439 /*
440 * Make sure that zap_ismicro is set before we let others see
441 * it, because zap_lockdir() checks zap_ismicro without the lock
442 * held.
443 */
444 dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
445 winner = dmu_buf_set_user(db, &zap->zap_dbu);
446
447 if (winner != NULL)
448 goto handle_winner;
449
450 if (zap->zap_ismicro) {
451 zap->zap_salt = zap_m_phys(zap)->mz_salt;
452 zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
453 zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
454 avl_create(&zap->zap_m.zap_avl, mze_compare,
455 sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
456
457 for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
458 mzap_ent_phys_t *mze =
459 &zap_m_phys(zap)->mz_chunk[i];
460 if (mze->mze_name[0]) {
461 zap_name_t *zn;
462
463 zap->zap_m.zap_num_entries++;
464 zn = zap_name_alloc(zap, mze->mze_name, 0);
465 mze_insert(zap, i, zn->zn_hash);
466 zap_name_free(zn);
467 }
468 }
469 } else {
470 zap->zap_salt = zap_f_phys(zap)->zap_salt;
471 zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
472
473 ASSERT3U(sizeof (struct zap_leaf_header), ==,
474 2*ZAP_LEAF_CHUNKSIZE);
475
476 /*
477 * The embedded pointer table should not overlap the
478 * other members.
479 */
480 ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
481 &zap_f_phys(zap)->zap_salt);
482
483 /*
484 * The embedded pointer table should end at the end of
485 * the block
486 */
487 ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
488 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
489 (uintptr_t)zap_f_phys(zap), ==,
490 zap->zap_dbuf->db_size);
491 }
492 rw_exit(&zap->zap_rwlock);
493 return (zap);
494
495 handle_winner:
496 rw_exit(&zap->zap_rwlock);
497 rw_destroy(&zap->zap_rwlock);
498 if (!zap->zap_ismicro)
499 mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
500 kmem_free(zap, sizeof (zap_t));
501 return (winner);
502 }
503
504 /*
505 * This routine "consumes" the caller's hold on the dbuf, which must
506 * have the specified tag.
507 */
508 static int
509 zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
510 krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
511 {
512 ASSERT0(db->db_offset);
513 objset_t *os = dmu_buf_get_objset(db);
514 uint64_t obj = db->db_object;
515 dmu_object_info_t doi;
516
517 *zapp = NULL;
518
519 dmu_object_info_from_db(db, &doi);
520 if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
521 return (SET_ERROR(EINVAL));
522
523 zap_t *zap = dmu_buf_get_user(db);
524 if (zap == NULL) {
525 zap = mzap_open(os, obj, db);
526 if (zap == NULL) {
527 /*
528 * mzap_open() didn't like what it saw on-disk.
529 * Check for corruption!
530 */
531 return (SET_ERROR(EIO));
532 }
533 }
534
535 /*
536 * We're checking zap_ismicro without the lock held, in order to
537 * tell what type of lock we want. Once we have some sort of
538 * lock, see if it really is the right type. In practice this
539 * can only be different if it was upgraded from micro to fat,
540 * and micro wanted WRITER but fat only needs READER.
541 */
542 krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
543 rw_enter(&zap->zap_rwlock, lt);
544 if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
545 /* it was upgraded, now we only need reader */
546 ASSERT(lt == RW_WRITER);
547 ASSERT(RW_READER ==
548 ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
549 rw_downgrade(&zap->zap_rwlock);
550 lt = RW_READER;
551 }
552
553 zap->zap_objset = os;
554
555 if (lt == RW_WRITER)
556 dmu_buf_will_dirty(db, tx);
557
558 ASSERT3P(zap->zap_dbuf, ==, db);
559
560 ASSERT(!zap->zap_ismicro ||
561 zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
562 if (zap->zap_ismicro && tx && adding &&
563 zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
564 uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
565 if (newsz > MZAP_MAX_BLKSZ) {
566 dprintf("upgrading obj %llu: num_entries=%u\n",
567 obj, zap->zap_m.zap_num_entries);
568 *zapp = zap;
569 int err = mzap_upgrade(zapp, tag, tx, 0);
570 if (err != 0)
571 rw_exit(&zap->zap_rwlock);
572 return (err);
573 }
574 VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
575 zap->zap_m.zap_num_chunks =
576 db->db_size / MZAP_ENT_LEN - 1;
577 }
578
579 *zapp = zap;
580 return (0);
581 }
582
583 static int
584 zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
585 krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
586 {
587 dmu_buf_t *db;
588
589 int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
590 if (err != 0) {
591 return (err);
592 }
593 #ifdef ZFS_DEBUG
594 {
595 dmu_object_info_t doi;
596 dmu_object_info_from_db(db, &doi);
597 ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
598 }
599 #endif
600
601 err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
602 if (err != 0) {
603 dmu_buf_rele(db, tag);
604 }
605 return (err);
606 }
607
608 int
609 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
610 krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
611 {
612 dmu_buf_t *db;
613
614 int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
615 if (err != 0)
616 return (err);
617 #ifdef ZFS_DEBUG
618 {
619 dmu_object_info_t doi;
620 dmu_object_info_from_db(db, &doi);
621 ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
622 }
623 #endif
624 err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
625 if (err != 0)
626 dmu_buf_rele(db, tag);
627 return (err);
628 }
629
630 void
631 zap_unlockdir(zap_t *zap, void *tag)
632 {
633 rw_exit(&zap->zap_rwlock);
634 dmu_buf_rele(zap->zap_dbuf, tag);
635 }
636
637 static int
638 mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
639 {
640 int err = 0;
641 zap_t *zap = *zapp;
642
643 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
644
645 int sz = zap->zap_dbuf->db_size;
646 mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
647 bcopy(zap->zap_dbuf->db_data, mzp, sz);
648 int nchunks = zap->zap_m.zap_num_chunks;
649
650 if (!flags) {
651 err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
652 1ULL << fzap_default_block_shift, 0, tx);
653 if (err != 0) {
654 vmem_free(mzp, sz);
655 return (err);
656 }
657 }
658
659 dprintf("upgrading obj=%llu with %u chunks\n",
660 zap->zap_object, nchunks);
661 /* XXX destroy the avl later, so we can use the stored hash value */
662 mze_destroy(zap);
663
664 fzap_upgrade(zap, tx, flags);
665
666 for (int i = 0; i < nchunks; i++) {
667 mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
668 if (mze->mze_name[0] == 0)
669 continue;
670 dprintf("adding %s=%llu\n",
671 mze->mze_name, mze->mze_value);
672 zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0);
673 /* If we fail here, we would end up losing entries */
674 VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
675 tag, tx));
676 zap = zn->zn_zap; /* fzap_add_cd() may change zap */
677 zap_name_free(zn);
678 }
679 vmem_free(mzp, sz);
680 *zapp = zap;
681 return (0);
682 }
683
684 /*
685 * The "normflags" determine the behavior of the matchtype_t which is
686 * passed to zap_lookup_norm(). Names which have the same normalized
687 * version will be stored with the same hash value, and therefore we can
688 * perform normalization-insensitive lookups. We can be Unicode form-
689 * insensitive and/or case-insensitive. The following flags are valid for
690 * "normflags":
691 *
692 * U8_TEXTPREP_NFC
693 * U8_TEXTPREP_NFD
694 * U8_TEXTPREP_NFKC
695 * U8_TEXTPREP_NFKD
696 * U8_TEXTPREP_TOUPPER
697 *
698 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
699 * of them may be supplied.
700 */
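
/*
 * Illustrative normalization-insensitive lookup (sketch only; "os",
 * "zapobj" and error handling are the caller's): MT_NORMALIZE matches any
 * name with the same normalized form, and the on-disk spelling of the
 * match is returned in "real".
 *
 *	char real[ZAP_MAXNAMELEN];
 *	boolean_t conflict;
 *	uint64_t value;
 *	(void) zap_lookup_norm(os, zapobj, "ReadMe.TXT", sizeof (uint64_t),
 *	    1, &value, MT_NORMALIZE, real, sizeof (real), &conflict);
 */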
701 void
702 mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
703 {
704 dmu_buf_t *db;
705
706 VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
707
708 dmu_buf_will_dirty(db, tx);
709 mzap_phys_t *zp = db->db_data;
710 zp->mz_block_type = ZBT_MICRO;
711 zp->mz_salt =
712 ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
713 zp->mz_normflags = normflags;
714
715 if (flags != 0) {
716 zap_t *zap;
717 /* Only fat zap supports flags; upgrade immediately. */
718 VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER,
719 B_FALSE, B_FALSE, &zap));
720 VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
721 zap_unlockdir(zap, FTAG);
722 } else {
723 dmu_buf_rele(db, FTAG);
724 }
725 }
726
727 static uint64_t
728 zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
729 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
730 dmu_object_type_t bonustype, int bonuslen, int dnodesize,
731 dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
732 {
733 uint64_t obj;
734
735 ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
736
737 if (allocated_dnode == NULL) {
738 dnode_t *dn;
739 obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
740 indirect_blockshift, bonustype, bonuslen, dnodesize,
741 &dn, FTAG, tx);
742 mzap_create_impl(dn, normflags, flags, tx);
743 dnode_rele(dn, FTAG);
744 } else {
745 obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
746 indirect_blockshift, bonustype, bonuslen, dnodesize,
747 allocated_dnode, tag, tx);
748 mzap_create_impl(*allocated_dnode, normflags, flags, tx);
749 }
750
751 return (obj);
752 }
753
754 int
755 zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
756 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
757 {
758 return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
759 0, tx));
760 }
761
762 int
763 zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
764 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
765 {
766 return (zap_create_claim_norm_dnsize(os, obj,
767 0, ot, bonustype, bonuslen, dnodesize, tx));
768 }
769
770 int
771 zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
772 dmu_object_type_t ot,
773 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
774 {
775 return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
776 bonuslen, 0, tx));
777 }
778
779 int
780 zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
781 dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
782 int dnodesize, dmu_tx_t *tx)
783 {
784 dnode_t *dn;
785 int error;
786
787 ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
788 error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
789 dnodesize, tx);
790 if (error != 0)
791 return (error);
792
793 error = dnode_hold(os, obj, FTAG, &dn);
794 if (error != 0)
795 return (error);
796
797 mzap_create_impl(dn, normflags, 0, tx);
798
799 dnode_rele(dn, FTAG);
800
801 return (0);
802 }
803
804 uint64_t
805 zap_create(objset_t *os, dmu_object_type_t ot,
806 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
807 {
808 return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
809 }
810
811 uint64_t
812 zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
813 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
814 {
815 return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
816 dnodesize, tx));
817 }
818
819 uint64_t
820 zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
821 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
822 {
823 return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
824 0, tx));
825 }
826
827 uint64_t
828 zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
829 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
830 {
831 return (zap_create_impl(os, normflags, 0, ot, 0, 0,
832 bonustype, bonuslen, dnodesize, NULL, NULL, tx));
833 }
834
835 uint64_t
836 zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
837 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
838 dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
839 {
840 return (zap_create_flags_dnsize(os, normflags, flags, ot,
841 leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
842 }
843
844 uint64_t
845 zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
846 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
847 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
848 {
849 return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
850 indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
851 tx));
852 }
853
854 /*
855 * Create a zap object and return a pointer to the newly allocated dnode via
856 * the allocated_dnode argument. The returned dnode will be held and the
857 * caller is responsible for releasing the hold by calling dnode_rele().
858 */
859 uint64_t
860 zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
861 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
862 dmu_object_type_t bonustype, int bonuslen, int dnodesize,
863 dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
864 {
865 return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
866 indirect_blockshift, bonustype, bonuslen, dnodesize,
867 allocated_dnode, tag, tx));
868 }
869
870 int
871 zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
872 {
873 /*
874 * dmu_object_free will free the object number and free the
875 * data. Freeing the data will cause our pageout function to be
876 * called, which will destroy our data (zap_leaf_t's and zap_t).
877 */
878
879 return (dmu_object_free(os, zapobj, tx));
880 }
881
882 void
883 zap_evict_sync(void *dbu)
884 {
885 zap_t *zap = dbu;
886
887 rw_destroy(&zap->zap_rwlock);
888
889 if (zap->zap_ismicro)
890 mze_destroy(zap);
891 else
892 mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
893
894 kmem_free(zap, sizeof (zap_t));
895 }
896
897 int
898 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
899 {
900 zap_t *zap;
901
902 int err =
903 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
904 if (err != 0)
905 return (err);
906 if (!zap->zap_ismicro) {
907 err = fzap_count(zap, count);
908 } else {
909 *count = zap->zap_m.zap_num_entries;
910 }
911 zap_unlockdir(zap, FTAG);
912 return (err);
913 }
914
915 /*
916 * zn may be NULL; if not specified, it will be computed if needed.
917 * See also the comment above zap_entry_normalization_conflict().
918 */
919 static boolean_t
920 mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
921 {
922 int direction = AVL_BEFORE;
923 boolean_t allocdzn = B_FALSE;
924
925 if (zap->zap_normflags == 0)
926 return (B_FALSE);
927
928 again:
929 for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
930 other && other->mze_hash == mze->mze_hash;
931 other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
932
933 if (zn == NULL) {
934 zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
935 MT_NORMALIZE);
936 allocdzn = B_TRUE;
937 }
938 if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
939 if (allocdzn)
940 zap_name_free(zn);
941 return (B_TRUE);
942 }
943 }
944
945 if (direction == AVL_BEFORE) {
946 direction = AVL_AFTER;
947 goto again;
948 }
949
950 if (allocdzn)
951 zap_name_free(zn);
952 return (B_FALSE);
953 }
954
955 /*
956 * Routines for manipulating attributes.
957 */
958
959 int
960 zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
961 uint64_t integer_size, uint64_t num_integers, void *buf)
962 {
963 return (zap_lookup_norm(os, zapobj, name, integer_size,
964 num_integers, buf, 0, NULL, 0, NULL));
965 }
966
967 static int
968 zap_lookup_impl(zap_t *zap, const char *name,
969 uint64_t integer_size, uint64_t num_integers, void *buf,
970 matchtype_t mt, char *realname, int rn_len,
971 boolean_t *ncp)
972 {
973 int err = 0;
974
975 zap_name_t *zn = zap_name_alloc(zap, name, mt);
976 if (zn == NULL)
977 return (SET_ERROR(ENOTSUP));
978
979 if (!zap->zap_ismicro) {
980 err = fzap_lookup(zn, integer_size, num_integers, buf,
981 realname, rn_len, ncp);
982 } else {
983 mzap_ent_t *mze = mze_find(zn);
984 if (mze == NULL) {
985 err = SET_ERROR(ENOENT);
986 } else {
987 if (num_integers < 1) {
988 err = SET_ERROR(EOVERFLOW);
989 } else if (integer_size != 8) {
990 err = SET_ERROR(EINVAL);
991 } else {
992 *(uint64_t *)buf =
993 MZE_PHYS(zap, mze)->mze_value;
994 (void) strlcpy(realname,
995 MZE_PHYS(zap, mze)->mze_name, rn_len);
996 if (ncp) {
997 *ncp = mzap_normalization_conflict(zap,
998 zn, mze);
999 }
1000 }
1001 }
1002 }
1003 zap_name_free(zn);
1004 return (err);
1005 }
1006
1007 int
1008 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
1009 uint64_t integer_size, uint64_t num_integers, void *buf,
1010 matchtype_t mt, char *realname, int rn_len,
1011 boolean_t *ncp)
1012 {
1013 zap_t *zap;
1014
1015 int err =
1016 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1017 if (err != 0)
1018 return (err);
1019 err = zap_lookup_impl(zap, name, integer_size,
1020 num_integers, buf, mt, realname, rn_len, ncp);
1021 zap_unlockdir(zap, FTAG);
1022 return (err);
1023 }
1024
1025 int
1026 zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
1027 {
1028 zap_t *zap;
1029 int err;
1030 zap_name_t *zn;
1031
1032 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1033 if (err)
1034 return (err);
1035 zn = zap_name_alloc(zap, name, 0);
1036 if (zn == NULL) {
1037 zap_unlockdir(zap, FTAG);
1038 return (SET_ERROR(ENOTSUP));
1039 }
1040
1041 fzap_prefetch(zn);
1042 zap_name_free(zn);
1043 zap_unlockdir(zap, FTAG);
1044 return (err);
1045 }
1046
1047 int
1048 zap_lookup_by_dnode(dnode_t *dn, const char *name,
1049 uint64_t integer_size, uint64_t num_integers, void *buf)
1050 {
1051 return (zap_lookup_norm_by_dnode(dn, name, integer_size,
1052 num_integers, buf, 0, NULL, 0, NULL));
1053 }
1054
1055 int
1056 zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
1057 uint64_t integer_size, uint64_t num_integers, void *buf,
1058 matchtype_t mt, char *realname, int rn_len,
1059 boolean_t *ncp)
1060 {
1061 zap_t *zap;
1062
1063 int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
1064 FTAG, &zap);
1065 if (err != 0)
1066 return (err);
1067 err = zap_lookup_impl(zap, name, integer_size,
1068 num_integers, buf, mt, realname, rn_len, ncp);
1069 zap_unlockdir(zap, FTAG);
1070 return (err);
1071 }
1072
1073 int
1074 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1075 int key_numints)
1076 {
1077 zap_t *zap;
1078
1079 int err =
1080 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1081 if (err != 0)
1082 return (err);
1083 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1084 if (zn == NULL) {
1085 zap_unlockdir(zap, FTAG);
1086 return (SET_ERROR(ENOTSUP));
1087 }
1088
1089 fzap_prefetch(zn);
1090 zap_name_free(zn);
1091 zap_unlockdir(zap, FTAG);
1092 return (err);
1093 }
1094
1095 int
1096 zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1097 int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1098 {
1099 zap_t *zap;
1100
1101 int err =
1102 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1103 if (err != 0)
1104 return (err);
1105 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1106 if (zn == NULL) {
1107 zap_unlockdir(zap, FTAG);
1108 return (SET_ERROR(ENOTSUP));
1109 }
1110
1111 err = fzap_lookup(zn, integer_size, num_integers, buf,
1112 NULL, 0, NULL);
1113 zap_name_free(zn);
1114 zap_unlockdir(zap, FTAG);
1115 return (err);
1116 }
1117
1118 int
1119 zap_contains(objset_t *os, uint64_t zapobj, const char *name)
1120 {
1121 int err = zap_lookup_norm(os, zapobj, name, 0,
1122 0, NULL, 0, NULL, 0, NULL);
1123 if (err == EOVERFLOW || err == EINVAL)
1124 err = 0; /* found, but skipped reading the value */
1125 return (err);
1126 }
1127
1128 int
1129 zap_length(objset_t *os, uint64_t zapobj, const char *name,
1130 uint64_t *integer_size, uint64_t *num_integers)
1131 {
1132 zap_t *zap;
1133
1134 int err =
1135 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1136 if (err != 0)
1137 return (err);
1138 zap_name_t *zn = zap_name_alloc(zap, name, 0);
1139 if (zn == NULL) {
1140 zap_unlockdir(zap, FTAG);
1141 return (SET_ERROR(ENOTSUP));
1142 }
1143 if (!zap->zap_ismicro) {
1144 err = fzap_length(zn, integer_size, num_integers);
1145 } else {
1146 mzap_ent_t *mze = mze_find(zn);
1147 if (mze == NULL) {
1148 err = SET_ERROR(ENOENT);
1149 } else {
1150 if (integer_size)
1151 *integer_size = 8;
1152 if (num_integers)
1153 *num_integers = 1;
1154 }
1155 }
1156 zap_name_free(zn);
1157 zap_unlockdir(zap, FTAG);
1158 return (err);
1159 }
1160
1161 int
1162 zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1163 int key_numints, uint64_t *integer_size, uint64_t *num_integers)
1164 {
1165 zap_t *zap;
1166
1167 int err =
1168 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1169 if (err != 0)
1170 return (err);
1171 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1172 if (zn == NULL) {
1173 zap_unlockdir(zap, FTAG);
1174 return (SET_ERROR(ENOTSUP));
1175 }
1176 err = fzap_length(zn, integer_size, num_integers);
1177 zap_name_free(zn);
1178 zap_unlockdir(zap, FTAG);
1179 return (err);
1180 }
1181
1182 static void
1183 mzap_addent(zap_name_t *zn, uint64_t value)
1184 {
1185 zap_t *zap = zn->zn_zap;
1186 int start = zap->zap_m.zap_alloc_next;
1187
1188 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
1189
1190 #ifdef ZFS_DEBUG
1191 for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
1192 mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
1193 ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
1194 }
1195 #endif
1196
1197 uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
1198 /* given the limited size of the microzap, this can't happen */
1199 ASSERT(cd < zap_maxcd(zap));
1200
1201 again:
1202 for (int i = start; i < zap->zap_m.zap_num_chunks; i++) {
1203 mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
1204 if (mze->mze_name[0] == 0) {
1205 mze->mze_value = value;
1206 mze->mze_cd = cd;
1207 (void) strlcpy(mze->mze_name, zn->zn_key_orig,
1208 sizeof (mze->mze_name));
1209 zap->zap_m.zap_num_entries++;
1210 zap->zap_m.zap_alloc_next = i+1;
1211 if (zap->zap_m.zap_alloc_next ==
1212 zap->zap_m.zap_num_chunks)
1213 zap->zap_m.zap_alloc_next = 0;
1214 mze_insert(zap, i, zn->zn_hash);
1215 return;
1216 }
1217 }
1218 if (start != 0) {
1219 start = 0;
1220 goto again;
1221 }
1222 cmn_err(CE_PANIC, "out of entries!");
1223 }
1224
1225 static int
1226 zap_add_impl(zap_t *zap, const char *key,
1227 int integer_size, uint64_t num_integers,
1228 const void *val, dmu_tx_t *tx, void *tag)
1229 {
1230 const uint64_t *intval = val;
1231 int err = 0;
1232
1233 zap_name_t *zn = zap_name_alloc(zap, key, 0);
1234 if (zn == NULL) {
1235 zap_unlockdir(zap, tag);
1236 return (SET_ERROR(ENOTSUP));
1237 }
1238 if (!zap->zap_ismicro) {
1239 err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
1240 zap = zn->zn_zap; /* fzap_add() may change zap */
1241 } else if (integer_size != 8 || num_integers != 1 ||
1242 strlen(key) >= MZAP_NAME_LEN ||
1243 !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
1244 err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
1245 if (err == 0) {
1246 err = fzap_add(zn, integer_size, num_integers, val,
1247 tag, tx);
1248 }
1249 zap = zn->zn_zap; /* fzap_add() may change zap */
1250 } else {
1251 if (mze_find(zn) != NULL) {
1252 err = SET_ERROR(EEXIST);
1253 } else {
1254 mzap_addent(zn, *intval);
1255 }
1256 }
1257 ASSERT(zap == zn->zn_zap);
1258 zap_name_free(zn);
1259 if (zap != NULL) /* may be NULL if fzap_add() failed */
1260 zap_unlockdir(zap, tag);
1261 return (err);
1262 }
1263
1264 int
1265 zap_add(objset_t *os, uint64_t zapobj, const char *key,
1266 int integer_size, uint64_t num_integers,
1267 const void *val, dmu_tx_t *tx)
1268 {
1269 zap_t *zap;
1270 int err;
1271
1272 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1273 if (err != 0)
1274 return (err);
1275 err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1276 /* zap_add_impl() calls zap_unlockdir() */
1277 return (err);
1278 }
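
/*
 * Illustrative use (sketch; "os", "zapobj" and "tx" are the caller's, as
 * is handling errors such as EEXIST):
 *
 *	uint64_t value = 123;
 *	error = zap_add(os, zapobj, "my-attr", sizeof (uint64_t), 1,
 *	    &value, tx);
 */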
1279
1280 int
1281 zap_add_by_dnode(dnode_t *dn, const char *key,
1282 int integer_size, uint64_t num_integers,
1283 const void *val, dmu_tx_t *tx)
1284 {
1285 zap_t *zap;
1286 int err;
1287
1288 err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1289 if (err != 0)
1290 return (err);
1291 err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1292 /* zap_add_impl() calls zap_unlockdir() */
1293 return (err);
1294 }
1295
1296 int
1297 zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1298 int key_numints, int integer_size, uint64_t num_integers,
1299 const void *val, dmu_tx_t *tx)
1300 {
1301 zap_t *zap;
1302
1303 int err =
1304 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1305 if (err != 0)
1306 return (err);
1307 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1308 if (zn == NULL) {
1309 zap_unlockdir(zap, FTAG);
1310 return (SET_ERROR(ENOTSUP));
1311 }
1312 err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
1313 zap = zn->zn_zap; /* fzap_add() may change zap */
1314 zap_name_free(zn);
1315 if (zap != NULL) /* may be NULL if fzap_add() failed */
1316 zap_unlockdir(zap, FTAG);
1317 return (err);
1318 }
1319
1320 int
1321 zap_update(objset_t *os, uint64_t zapobj, const char *name,
1322 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1323 {
1324 zap_t *zap;
1325 const uint64_t *intval = val;
1326
1327 int err =
1328 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1329 if (err != 0)
1330 return (err);
1331 zap_name_t *zn = zap_name_alloc(zap, name, 0);
1332 if (zn == NULL) {
1333 zap_unlockdir(zap, FTAG);
1334 return (SET_ERROR(ENOTSUP));
1335 }
1336 if (!zap->zap_ismicro) {
1337 err = fzap_update(zn, integer_size, num_integers, val,
1338 FTAG, tx);
1339 zap = zn->zn_zap; /* fzap_update() may change zap */
1340 } else if (integer_size != 8 || num_integers != 1 ||
1341 strlen(name) >= MZAP_NAME_LEN) {
1342 dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
1343 zapobj, integer_size, num_integers, name);
1344 err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
1345 if (err == 0) {
1346 err = fzap_update(zn, integer_size, num_integers,
1347 val, FTAG, tx);
1348 }
1349 zap = zn->zn_zap; /* fzap_update() may change zap */
1350 } else {
1351 mzap_ent_t *mze = mze_find(zn);
1352 if (mze != NULL) {
1353 MZE_PHYS(zap, mze)->mze_value = *intval;
1354 } else {
1355 mzap_addent(zn, *intval);
1356 }
1357 }
1358 ASSERT(zap == zn->zn_zap);
1359 zap_name_free(zn);
1360 if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
1361 zap_unlockdir(zap, FTAG);
1362 return (err);
1363 }
1364
1365 int
1366 zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1367 int key_numints,
1368 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1369 {
1370 zap_t *zap;
1371
1372 int err =
1373 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1374 if (err != 0)
1375 return (err);
1376 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1377 if (zn == NULL) {
1378 zap_unlockdir(zap, FTAG);
1379 return (SET_ERROR(ENOTSUP));
1380 }
1381 err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
1382 zap = zn->zn_zap; /* fzap_update() may change zap */
1383 zap_name_free(zn);
1384 if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
1385 zap_unlockdir(zap, FTAG);
1386 return (err);
1387 }
1388
1389 int
1390 zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
1391 {
1392 return (zap_remove_norm(os, zapobj, name, 0, tx));
1393 }
1394
1395 static int
1396 zap_remove_impl(zap_t *zap, const char *name,
1397 matchtype_t mt, dmu_tx_t *tx)
1398 {
1399 int err = 0;
1400
1401 zap_name_t *zn = zap_name_alloc(zap, name, mt);
1402 if (zn == NULL)
1403 return (SET_ERROR(ENOTSUP));
1404 if (!zap->zap_ismicro) {
1405 err = fzap_remove(zn, tx);
1406 } else {
1407 mzap_ent_t *mze = mze_find(zn);
1408 if (mze == NULL) {
1409 err = SET_ERROR(ENOENT);
1410 } else {
1411 zap->zap_m.zap_num_entries--;
1412 bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
1413 sizeof (mzap_ent_phys_t));
1414 mze_remove(zap, mze);
1415 }
1416 }
1417 zap_name_free(zn);
1418 return (err);
1419 }
1420
1421 int
1422 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
1423 matchtype_t mt, dmu_tx_t *tx)
1424 {
1425 zap_t *zap;
1426 int err;
1427
1428 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1429 if (err)
1430 return (err);
1431 err = zap_remove_impl(zap, name, mt, tx);
1432 zap_unlockdir(zap, FTAG);
1433 return (err);
1434 }
1435
1436 int
1437 zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
1438 {
1439 zap_t *zap;
1440 int err;
1441
1442 err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1443 if (err)
1444 return (err);
1445 err = zap_remove_impl(zap, name, 0, tx);
1446 zap_unlockdir(zap, FTAG);
1447 return (err);
1448 }
1449
1450 int
1451 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1452 int key_numints, dmu_tx_t *tx)
1453 {
1454 zap_t *zap;
1455
1456 int err =
1457 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1458 if (err != 0)
1459 return (err);
1460 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1461 if (zn == NULL) {
1462 zap_unlockdir(zap, FTAG);
1463 return (SET_ERROR(ENOTSUP));
1464 }
1465 err = fzap_remove(zn, tx);
1466 zap_name_free(zn);
1467 zap_unlockdir(zap, FTAG);
1468 return (err);
1469 }
1470
1471 /*
1472 * Routines for iterating over the attributes.
1473 */
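
/*
 * Typical iteration pattern (sketch; any error other than the terminating
 * ENOENT is left to the caller):
 *
 *	zap_cursor_t zc;
 *	zap_attribute_t za;
 *
 *	for (zap_cursor_init(&zc, os, zapobj);
 *	    zap_cursor_retrieve(&zc, &za) == 0;
 *	    zap_cursor_advance(&zc)) {
 *		... use za.za_name and za.za_first_integer ...
 *	}
 *	zap_cursor_fini(&zc);
 */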
1474
1475 void
1476 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
1477 uint64_t serialized)
1478 {
1479 zc->zc_objset = os;
1480 zc->zc_zap = NULL;
1481 zc->zc_leaf = NULL;
1482 zc->zc_zapobj = zapobj;
1483 zc->zc_serialized = serialized;
1484 zc->zc_hash = 0;
1485 zc->zc_cd = 0;
1486 }
1487
1488 void
1489 zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
1490 {
1491 zap_cursor_init_serialized(zc, os, zapobj, 0);
1492 }
1493
1494 void
1495 zap_cursor_fini(zap_cursor_t *zc)
1496 {
1497 if (zc->zc_zap) {
1498 rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
1499 zap_unlockdir(zc->zc_zap, NULL);
1500 zc->zc_zap = NULL;
1501 }
1502 if (zc->zc_leaf) {
1503 rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
1504 zap_put_leaf(zc->zc_leaf);
1505 zc->zc_leaf = NULL;
1506 }
1507 zc->zc_objset = NULL;
1508 }
1509
1510 uint64_t
1511 zap_cursor_serialize(zap_cursor_t *zc)
1512 {
1513 if (zc->zc_hash == -1ULL)
1514 return (-1ULL);
1515 if (zc->zc_zap == NULL)
1516 return (zc->zc_serialized);
1517 ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
1518 ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
1519
1520 /*
1521 * We want to keep the high 32 bits of the cursor zero if we can, so
1522 * that 32-bit programs can access this. So usually use a small
1523 * (28-bit) hash value so we can fit 4 bits of cd into the low 32 bits
1524 * of the cursor.
1525 *
1526 * [ collision differentiator | zap_hashbits()-bit hash value ]
1527 */
1528 return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
1529 ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
1530 }
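
/*
 * Worked example (default 28-bit hash): if zc_hash == 0xabcdef1000000000
 * and zc_cd == 2, the serialized cursor is
 * (0xabcdef1000000000 >> 36) | (2ULL << 28) == 0x2abcdef1, which keeps
 * the high 32 bits zero.  zap_cursor_retrieve() reverses this split when
 * a cursor is re-established from a serialized value.
 */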
1531
1532 int
1533 zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
1534 {
1535 int err;
1536
1537 if (zc->zc_hash == -1ULL)
1538 return (SET_ERROR(ENOENT));
1539
1540 if (zc->zc_zap == NULL) {
1541 int hb;
1542 err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
1543 RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
1544 if (err != 0)
1545 return (err);
1546
1547 /*
1548 * To support zap_cursor_init_serialized, advance, retrieve,
1549 * we must add to the existing zc_cd, which may already
1550 * be 1 due to the zap_cursor_advance.
1551 */
1552 ASSERT(zc->zc_hash == 0);
1553 hb = zap_hashbits(zc->zc_zap);
1554 zc->zc_hash = zc->zc_serialized << (64 - hb);
1555 zc->zc_cd += zc->zc_serialized >> hb;
1556 if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
1557 zc->zc_cd = 0;
1558 } else {
1559 rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
1560 }
1561 if (!zc->zc_zap->zap_ismicro) {
1562 err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
1563 } else {
1564 avl_index_t idx;
1565 mzap_ent_t mze_tofind;
1566
1567 mze_tofind.mze_hash = zc->zc_hash;
1568 mze_tofind.mze_cd = zc->zc_cd;
1569
1570 mzap_ent_t *mze =
1571 avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
1572 if (mze == NULL) {
1573 mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
1574 idx, AVL_AFTER);
1575 }
1576 if (mze) {
1577 mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
1578 ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
1579 za->za_normalization_conflict =
1580 mzap_normalization_conflict(zc->zc_zap, NULL, mze);
1581 za->za_integer_length = 8;
1582 za->za_num_integers = 1;
1583 za->za_first_integer = mzep->mze_value;
1584 (void) strcpy(za->za_name, mzep->mze_name);
1585 zc->zc_hash = mze->mze_hash;
1586 zc->zc_cd = mze->mze_cd;
1587 err = 0;
1588 } else {
1589 zc->zc_hash = -1ULL;
1590 err = SET_ERROR(ENOENT);
1591 }
1592 }
1593 rw_exit(&zc->zc_zap->zap_rwlock);
1594 return (err);
1595 }
1596
1597 void
1598 zap_cursor_advance(zap_cursor_t *zc)
1599 {
1600 if (zc->zc_hash == -1ULL)
1601 return;
1602 zc->zc_cd++;
1603 }
1604
1605 int
1606 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
1607 {
1608 zap_t *zap;
1609
1610 int err =
1611 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1612 if (err != 0)
1613 return (err);
1614
1615 bzero(zs, sizeof (zap_stats_t));
1616
1617 if (zap->zap_ismicro) {
1618 zs->zs_blocksize = zap->zap_dbuf->db_size;
1619 zs->zs_num_entries = zap->zap_m.zap_num_entries;
1620 zs->zs_num_blocks = 1;
1621 } else {
1622 fzap_get_stats(zap, zs);
1623 }
1624 zap_unlockdir(zap, FTAG);
1625 return (0);
1626 }
1627
1628 #if defined(_KERNEL)
1629 EXPORT_SYMBOL(zap_create);
1630 EXPORT_SYMBOL(zap_create_dnsize);
1631 EXPORT_SYMBOL(zap_create_norm);
1632 EXPORT_SYMBOL(zap_create_norm_dnsize);
1633 EXPORT_SYMBOL(zap_create_flags);
1634 EXPORT_SYMBOL(zap_create_flags_dnsize);
1635 EXPORT_SYMBOL(zap_create_claim);
1636 EXPORT_SYMBOL(zap_create_claim_norm);
1637 EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
1638 EXPORT_SYMBOL(zap_create_hold);
1639 EXPORT_SYMBOL(zap_destroy);
1640 EXPORT_SYMBOL(zap_lookup);
1641 EXPORT_SYMBOL(zap_lookup_by_dnode);
1642 EXPORT_SYMBOL(zap_lookup_norm);
1643 EXPORT_SYMBOL(zap_lookup_uint64);
1644 EXPORT_SYMBOL(zap_contains);
1645 EXPORT_SYMBOL(zap_prefetch);
1646 EXPORT_SYMBOL(zap_prefetch_uint64);
1647 EXPORT_SYMBOL(zap_add);
1648 EXPORT_SYMBOL(zap_add_by_dnode);
1649 EXPORT_SYMBOL(zap_add_uint64);
1650 EXPORT_SYMBOL(zap_update);
1651 EXPORT_SYMBOL(zap_update_uint64);
1652 EXPORT_SYMBOL(zap_length);
1653 EXPORT_SYMBOL(zap_length_uint64);
1654 EXPORT_SYMBOL(zap_remove);
1655 EXPORT_SYMBOL(zap_remove_by_dnode);
1656 EXPORT_SYMBOL(zap_remove_norm);
1657 EXPORT_SYMBOL(zap_remove_uint64);
1658 EXPORT_SYMBOL(zap_count);
1659 EXPORT_SYMBOL(zap_value_search);
1660 EXPORT_SYMBOL(zap_join);
1661 EXPORT_SYMBOL(zap_join_increment);
1662 EXPORT_SYMBOL(zap_add_int);
1663 EXPORT_SYMBOL(zap_remove_int);
1664 EXPORT_SYMBOL(zap_lookup_int);
1665 EXPORT_SYMBOL(zap_increment_int);
1666 EXPORT_SYMBOL(zap_add_int_key);
1667 EXPORT_SYMBOL(zap_lookup_int_key);
1668 EXPORT_SYMBOL(zap_increment);
1669 EXPORT_SYMBOL(zap_cursor_init);
1670 EXPORT_SYMBOL(zap_cursor_fini);
1671 EXPORT_SYMBOL(zap_cursor_retrieve);
1672 EXPORT_SYMBOL(zap_cursor_advance);
1673 EXPORT_SYMBOL(zap_cursor_serialize);
1674 EXPORT_SYMBOL(zap_cursor_init_serialized);
1675 EXPORT_SYMBOL(zap_get_stats);
1676 #endif