]> git.proxmox.com Git - mirror_zfs.git/blob - module/zfs/zap_micro.c
OpenZFS 9235 - rename zpool_rewind_policy_t to zpool_load_policy_t
[mirror_zfs.git] / module / zfs / zap_micro.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
26 * Copyright 2017 Nexenta Systems, Inc.
27 */
28
29 #include <sys/zio.h>
30 #include <sys/spa.h>
31 #include <sys/dmu.h>
32 #include <sys/zfs_context.h>
33 #include <sys/zap.h>
34 #include <sys/refcount.h>
35 #include <sys/zap_impl.h>
36 #include <sys/zap_leaf.h>
37 #include <sys/avl.h>
38 #include <sys/arc.h>
39 #include <sys/dmu_objset.h>
40
41 #ifdef _KERNEL
42 #include <sys/sunddi.h>
43 #endif
44
45 extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
46
47 static int mzap_upgrade(zap_t **zapp,
48 void *tag, dmu_tx_t *tx, zap_flags_t flags);
49
50 uint64_t
51 zap_getflags(zap_t *zap)
52 {
53 if (zap->zap_ismicro)
54 return (0);
55 return (zap_f_phys(zap)->zap_flags);
56 }
57
58 int
59 zap_hashbits(zap_t *zap)
60 {
61 if (zap_getflags(zap) & ZAP_FLAG_HASH64)
62 return (48);
63 else
64 return (28);
65 }
66
67 uint32_t
68 zap_maxcd(zap_t *zap)
69 {
70 if (zap_getflags(zap) & ZAP_FLAG_HASH64)
71 return ((1<<16)-1);
72 else
73 return (-1U);
74 }
75
/*
 * Compute the on-disk hash for the key described by zn.  For
 * PRE_HASHED_KEY zaps the first key word *is* the hash; otherwise the
 * (normalized) key is folded into a CRC64 seeded with the zap's salt.
 * Only the high zap_hashbits() bits of the result are kept.
 */
static uint64_t
zap_hash(zap_name_t *zn)
{
	zap_t *zap = zn->zn_zap;
	uint64_t h = 0;

	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
		/* Caller supplied the hash directly as the key. */
		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
		h = *(uint64_t *)zn->zn_key_orig;
	} else {
		h = zap->zap_salt;
		ASSERT(h != 0);
		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
			/* Binary key: hash every byte of every word. */
			const uint64_t *wp = zn->zn_key_norm;

			ASSERT(zn->zn_key_intlen == 8);
			for (int i = 0; i < zn->zn_key_norm_numints;
			    wp++, i++) {
				uint64_t word = *wp;

				for (int j = 0; j < zn->zn_key_intlen; j++) {
					h = (h >> 8) ^
					    zfs_crc64_table[(h ^ word) & 0xFF];
					word >>= NBBY;
				}
			}
		} else {
			const uint8_t *cp = zn->zn_key_norm;

			/*
			 * We previously stored the terminating null on
			 * disk, but didn't hash it, so we need to
			 * continue to not hash it.  (The
			 * zn_key_*_numints includes the terminating
			 * null for non-binary keys.)
			 */
			int len = zn->zn_key_norm_numints - 1;

			ASSERT(zn->zn_key_intlen == 1);
			for (int i = 0; i < len; cp++, i++) {
				h = (h >> 8) ^
				    zfs_crc64_table[(h ^ *cp) & 0xFF];
			}
		}
	}
	/*
	 * Don't use all 64 bits, since we need some in the cookie for
	 * the collision differentiator.  We MUST use the high bits,
	 * since those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);

	return (h);
}
133
/*
 * Unicode-normalize "name" into "namenorm" (capacity ZAP_MAXNAMELEN)
 * using the given U8_TEXTPREP_* normflags.  Returns 0 on success or
 * the error reported by u8_textprep_str().  Not valid for binary
 * (uint64) keys.
 */
static int
zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags)
{
	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));

	/* Include the terminating NUL in the input length. */
	size_t inlen = strlen(name) + 1;
	size_t outlen = ZAP_MAXNAMELEN;

	int err = 0;
	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
	    U8_UNICODE_LATEST, &err);

	return (err);
}
149
150 boolean_t
151 zap_match(zap_name_t *zn, const char *matchname)
152 {
153 ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
154
155 if (zn->zn_matchtype & MT_NORMALIZE) {
156 char norm[ZAP_MAXNAMELEN];
157
158 if (zap_normalize(zn->zn_zap, matchname, norm,
159 zn->zn_normflags) != 0)
160 return (B_FALSE);
161
162 return (strcmp(zn->zn_key_norm, norm) == 0);
163 } else {
164 return (strcmp(zn->zn_key_orig, matchname) == 0);
165 }
166 }
167
/*
 * Release a zap_name_t obtained from zap_name_alloc() or
 * zap_name_alloc_uint64().
 */
void
zap_name_free(zap_name_t *zn)
{
	kmem_free(zn, sizeof (zap_name_t));
}
173
/*
 * Build a zap_name_t for a string key: record the original key,
 * normalize it if the zap requires it, and compute its hash.  Returns
 * NULL if normalization fails or if a match type was requested on a
 * non-normalizing zap.  Caller frees with zap_name_free().
 */
zap_name_t *
zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
{
	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);

	zn->zn_zap = zap;
	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = key;
	zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
	zn->zn_matchtype = mt;
	zn->zn_normflags = zap->zap_normflags;

	/*
	 * If we're dealing with a case sensitive lookup on a mixed or
	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
	 * will fold case to all caps overriding the lookup request.
	 */
	if (mt & MT_MATCH_CASE)
		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;

	if (zap->zap_normflags) {
		/*
		 * We *must* use zap_normflags because this normalization is
		 * what the hash is computed from.
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zap->zap_normflags) != 0) {
			zap_name_free(zn);
			return (NULL);
		}
		zn->zn_key_norm = zn->zn_normbuf;
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	} else {
		/* A match type makes no sense on a non-normalizing zap. */
		if (mt != 0) {
			zap_name_free(zn);
			return (NULL);
		}
		zn->zn_key_norm = zn->zn_key_orig;
		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
	}

	zn->zn_hash = zap_hash(zn);

	if (zap->zap_normflags != zn->zn_normflags) {
		/*
		 * We *must* use zn_normflags because this normalization is
		 * what the matching is based on.  (Not the hash!)
		 * Re-normalize after the hash has been computed above.
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zn->zn_normflags) != 0) {
			zap_name_free(zn);
			return (NULL);
		}
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	}

	return (zn);
}
232
/*
 * Build a zap_name_t for a binary (uint64 array) key of "numints"
 * words.  Such zaps never normalize, so orig and norm views are the
 * same.  Caller frees with zap_name_free().
 */
zap_name_t *
zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
{
	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);

	ASSERT(zap->zap_normflags == 0);
	zn->zn_zap = zap;
	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = zn->zn_key_norm = key;
	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
	zn->zn_matchtype = 0;

	zn->zn_hash = zap_hash(zn);
	return (zn);
}
248
249 static void
250 mzap_byteswap(mzap_phys_t *buf, size_t size)
251 {
252 buf->mz_block_type = BSWAP_64(buf->mz_block_type);
253 buf->mz_salt = BSWAP_64(buf->mz_salt);
254 buf->mz_normflags = BSWAP_64(buf->mz_normflags);
255 int max = (size / MZAP_ENT_LEN) - 1;
256 for (int i = 0; i < max; i++) {
257 buf->mz_chunk[i].mze_value =
258 BSWAP_64(buf->mz_chunk[i].mze_value);
259 buf->mz_chunk[i].mze_cd =
260 BSWAP_32(buf->mz_chunk[i].mze_cd);
261 }
262 }
263
264 void
265 zap_byteswap(void *buf, size_t size)
266 {
267 uint64_t block_type = *(uint64_t *)buf;
268
269 if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
270 /* ASSERT(magic == ZAP_LEAF_MAGIC); */
271 mzap_byteswap(buf, size);
272 } else {
273 fzap_byteswap(buf, size);
274 }
275 }
276
277 static int
278 mze_compare(const void *arg1, const void *arg2)
279 {
280 const mzap_ent_t *mze1 = arg1;
281 const mzap_ent_t *mze2 = arg2;
282
283 int cmp = AVL_CMP(mze1->mze_hash, mze2->mze_hash);
284 if (likely(cmp))
285 return (cmp);
286
287 return (AVL_CMP(mze1->mze_cd, mze2->mze_cd));
288 }
289
/*
 * Create an in-core AVL entry tracking the on-disk microzap chunk at
 * "chunkid".  The cd is read back from the phys entry; the caller must
 * hold the zap write lock and the chunk must be in use (nonempty name).
 */
static void
mze_insert(zap_t *zap, int chunkid, uint64_t hash)
{
	ASSERT(zap->zap_ismicro);
	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
	mze->mze_chunkid = chunkid;
	mze->mze_hash = hash;
	mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
	ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
	avl_add(&zap->zap_m.zap_avl, mze);
}
303
/*
 * Find the in-core microzap entry matching zn, or NULL.  Seeks to the
 * first entry with zn's hash (cd 0), then walks forward through all
 * hash collisions comparing names via zap_match().
 */
static mzap_ent_t *
mze_find(zap_name_t *zn)
{
	mzap_ent_t mze_tofind;
	mzap_ent_t *mze;
	avl_index_t idx;
	avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;

	ASSERT(zn->zn_zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));

	mze_tofind.mze_hash = zn->zn_hash;
	mze_tofind.mze_cd = 0;

	mze = avl_find(avl, &mze_tofind, &idx);
	if (mze == NULL)
		mze = avl_nearest(avl, idx, AVL_AFTER);
	for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
			return (mze);
	}

	return (NULL);
}
329
/*
 * Return the smallest collision differentiator not yet used by any
 * entry with the given hash.  Relies on the AVL ordering (hash, cd):
 * the first gap in the cd sequence is the answer.
 */
static uint32_t
mze_find_unused_cd(zap_t *zap, uint64_t hash)
{
	mzap_ent_t mze_tofind;
	avl_index_t idx;
	avl_tree_t *avl = &zap->zap_m.zap_avl;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));

	mze_tofind.mze_hash = hash;
	mze_tofind.mze_cd = 0;

	uint32_t cd = 0;
	for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx);
	    mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
		if (mze->mze_cd != cd)
			break;	/* found a gap in the cd sequence */
		cd++;
	}

	return (cd);
}
353
354 /*
355 * Each mzap entry requires at max : 4 chunks
356 * 3 chunks for names + 1 chunk for value.
357 */
358 #define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
359 ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
360
361 /*
362 * Check if the current entry keeps the colliding entries under the fatzap leaf
363 * size.
364 */
365 static boolean_t
366 mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
367 {
368 zap_t *zap = zn->zn_zap;
369 mzap_ent_t mze_tofind;
370 mzap_ent_t *mze;
371 avl_index_t idx;
372 avl_tree_t *avl = &zap->zap_m.zap_avl;
373 uint32_t mzap_ents = 0;
374
375 mze_tofind.mze_hash = hash;
376 mze_tofind.mze_cd = 0;
377
378 for (mze = avl_find(avl, &mze_tofind, &idx);
379 mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
380 mzap_ents++;
381 }
382
383 /* Include the new entry being added */
384 mzap_ents++;
385
386 return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
387 }
388
389 static void
390 mze_remove(zap_t *zap, mzap_ent_t *mze)
391 {
392 ASSERT(zap->zap_ismicro);
393 ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
394
395 avl_remove(&zap->zap_m.zap_avl, mze);
396 kmem_free(mze, sizeof (mzap_ent_t));
397 }
398
399 static void
400 mze_destroy(zap_t *zap)
401 {
402 mzap_ent_t *mze;
403 void *avlcookie = NULL;
404
405 while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)))
406 kmem_free(mze, sizeof (mzap_ent_t));
407 avl_destroy(&zap->zap_m.zap_avl);
408 }
409
/*
 * Construct the in-core zap_t for the zap object backed by "db" and
 * attach it to the dbuf as its user data.  If another thread attaches
 * its zap_t first, ours is discarded and the winner's is returned.  If
 * the on-disk block looks corrupt, NULL is returned.  Returns with no
 * locks held.
 */
static zap_t *
mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
{
	zap_t *winner;
	uint64_t *zap_hdr = (uint64_t *)db->db_data;
	uint64_t zap_block_type = zap_hdr[0];
	uint64_t zap_magic = zap_hdr[1];

	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));

	zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
	rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
	rw_enter(&zap->zap_rwlock, RW_WRITER);
	zap->zap_objset = os;
	zap->zap_object = obj;
	zap->zap_dbuf = db;

	if (zap_block_type != ZBT_MICRO) {
		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
		    0);
		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
		if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
			/* On-disk corruption: bail out via the cleanup path. */
			winner = NULL;	/* No actual winner here... */
			goto handle_winner;
		}
	} else {
		zap->zap_ismicro = TRUE;
	}

	/*
	 * Make sure that zap_ismicro is set before we let others see
	 * it, because zap_lockdir() checks zap_ismicro without the lock
	 * held.
	 */
	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
	winner = dmu_buf_set_user(db, &zap->zap_dbu);

	if (winner != NULL)
		goto handle_winner;

	if (zap->zap_ismicro) {
		/* Populate the in-core AVL tree from the on-disk chunks. */
		zap->zap_salt = zap_m_phys(zap)->mz_salt;
		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
		avl_create(&zap->zap_m.zap_avl, mze_compare,
		    sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));

		for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
			mzap_ent_phys_t *mze =
			    &zap_m_phys(zap)->mz_chunk[i];
			if (mze->mze_name[0]) {
				zap_name_t *zn;

				zap->zap_m.zap_num_entries++;
				zn = zap_name_alloc(zap, mze->mze_name, 0);
				mze_insert(zap, i, zn->zn_hash);
				zap_name_free(zn);
			}
		}
	} else {
		zap->zap_salt = zap_f_phys(zap)->zap_salt;
		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;

		ASSERT3U(sizeof (struct zap_leaf_header), ==,
		    2*ZAP_LEAF_CHUNKSIZE);

		/*
		 * The embedded pointer table should not overlap the
		 * other members.
		 */
		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
		    &zap_f_phys(zap)->zap_salt);

		/*
		 * The embedded pointer table should end at the end of
		 * the block
		 */
		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
		    (uintptr_t)zap_f_phys(zap), ==,
		    zap->zap_dbuf->db_size);
	}
	rw_exit(&zap->zap_rwlock);
	return (zap);

handle_winner:
	/* Lost the race (or found corruption): destroy our zap_t. */
	rw_exit(&zap->zap_rwlock);
	rw_destroy(&zap->zap_rwlock);
	if (!zap->zap_ismicro)
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
	kmem_free(zap, sizeof (zap_t));
	return (winner);
}
503
504 /*
505 * This routine "consumes" the caller's hold on the dbuf, which must
506 * have the specified tag.
507 */
/*
 * Attach (or find) the zap_t for "db" and lock it as requested.  On
 * success (return 0) *zapp is set and the zap rwlock is held; the
 * caller's hold on the dbuf is consumed by the zap (released via
 * zap_unlockdir()).  May upgrade a full microzap to a fatzap when
 * "adding".  On failure the lock is not held and the caller must
 * release its own dbuf hold.
 */
static int
zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
	ASSERT0(db->db_offset);
	objset_t *os = dmu_buf_get_objset(db);
	uint64_t obj = db->db_object;
	dmu_object_info_t doi;

	*zapp = NULL;

	dmu_object_info_from_db(db, &doi);
	if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
		return (SET_ERROR(EINVAL));

	zap_t *zap = dmu_buf_get_user(db);
	if (zap == NULL) {
		zap = mzap_open(os, obj, db);
		if (zap == NULL) {
			/*
			 * mzap_open() didn't like what it saw on-disk.
			 * Check for corruption!
			 */
			return (SET_ERROR(EIO));
		}
	}

	/*
	 * We're checking zap_ismicro without the lock held, in order to
	 * tell what type of lock we want.  Once we have some sort of
	 * lock, see if it really is the right type.  In practice this
	 * can only be different if it was upgraded from micro to fat,
	 * and micro wanted WRITER but fat only needs READER.
	 */
	krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
	rw_enter(&zap->zap_rwlock, lt);
	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
		/* it was upgraded, now we only need reader */
		ASSERT(lt == RW_WRITER);
		ASSERT(RW_READER ==
		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
		rw_downgrade(&zap->zap_rwlock);
		lt = RW_READER;
	}

	zap->zap_objset = os;

	if (lt == RW_WRITER)
		dmu_buf_will_dirty(db, tx);

	ASSERT3P(zap->zap_dbuf, ==, db);

	ASSERT(!zap->zap_ismicro ||
	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
	if (zap->zap_ismicro && tx && adding &&
	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
		/*
		 * The microzap is full; either grow its block or, if
		 * already at the maximum size, upgrade it to a fatzap.
		 */
		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
		if (newsz > MZAP_MAX_BLKSZ) {
			dprintf("upgrading obj %llu: num_entries=%u\n",
			    obj, zap->zap_m.zap_num_entries);
			*zapp = zap;
			int err = mzap_upgrade(zapp, tag, tx, 0);
			if (err != 0)
				rw_exit(&zap->zap_rwlock);
			return (err);
		}
		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
		zap->zap_m.zap_num_chunks =
		    db->db_size / MZAP_ENT_LEN - 1;
	}

	*zapp = zap;
	return (0);
}
582
583 static int
584 zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
585 krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
586 {
587 dmu_buf_t *db;
588
589 int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
590 if (err != 0) {
591 return (err);
592 }
593 #ifdef ZFS_DEBUG
594 {
595 dmu_object_info_t doi;
596 dmu_object_info_from_db(db, &doi);
597 ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
598 }
599 #endif
600
601 err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
602 if (err != 0) {
603 dmu_buf_rele(db, tag);
604 }
605 return (err);
606 }
607
608 int
609 zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
610 krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
611 {
612 dmu_buf_t *db;
613
614 int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
615 if (err != 0)
616 return (err);
617 #ifdef ZFS_DEBUG
618 {
619 dmu_object_info_t doi;
620 dmu_object_info_from_db(db, &doi);
621 ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
622 }
623 #endif
624 err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
625 if (err != 0)
626 dmu_buf_rele(db, tag);
627 return (err);
628 }
629
/*
 * Drop the zap rwlock and the dbuf hold taken by zap_lockdir*().
 * The zap_t itself stays attached to the dbuf until eviction.
 */
void
zap_unlockdir(zap_t *zap, void *tag)
{
	rw_exit(&zap->zap_rwlock);
	dmu_buf_rele(zap->zap_dbuf, tag);
}
636
/*
 * Convert a microzap into a fatzap in place: snapshot the mzap block,
 * reformat the object as a fatzap, then re-add every entry (preserving
 * each entry's cd).  On success *zapp is updated to the (possibly new)
 * zap_t.  Caller must hold the zap write lock.
 */
static int
mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
	int err = 0;
	zap_t *zap = *zapp;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	/* Copy the old microzap block before it is overwritten. */
	int sz = zap->zap_dbuf->db_size;
	mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
	bcopy(zap->zap_dbuf->db_data, mzp, sz);
	int nchunks = zap->zap_m.zap_num_chunks;

	if (!flags) {
		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
		    1ULL << fzap_default_block_shift, 0, tx);
		if (err != 0) {
			vmem_free(mzp, sz);
			return (err);
		}
	}

	dprintf("upgrading obj=%llu with %u chunks\n",
	    zap->zap_object, nchunks);
	/* XXX destroy the avl later, so we can use the stored hash value */
	mze_destroy(zap);

	fzap_upgrade(zap, tx, flags);

	/* Re-insert each live entry from the saved microzap image. */
	for (int i = 0; i < nchunks; i++) {
		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
		if (mze->mze_name[0] == 0)
			continue;
		dprintf("adding %s=%llu\n",
		    mze->mze_name, mze->mze_value);
		zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0);
		/* If we fail here, we would end up losing entries */
		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
		    tag, tx));
		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
		zap_name_free(zn);
	}
	vmem_free(mzp, sz);
	*zapp = zap;
	return (0);
}
683
684 /*
685 * The "normflags" determine the behavior of the matchtype_t which is
686 * passed to zap_lookup_norm(). Names which have the same normalized
687 * version will be stored with the same hash value, and therefore we can
688 * perform normalization-insensitive lookups. We can be Unicode form-
689 * insensitive and/or case-insensitive. The following flags are valid for
690 * "normflags":
691 *
692 * U8_TEXTPREP_NFC
693 * U8_TEXTPREP_NFD
694 * U8_TEXTPREP_NFKC
695 * U8_TEXTPREP_NFKD
696 * U8_TEXTPREP_TOUPPER
697 *
698 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
699 * of them may be supplied.
700 */
/*
 * Initialize the already-allocated object "obj" as a (micro) zap:
 * dirty its first block and write the mzap header.  If any zap_flags_t
 * are requested, immediately upgrade to a fatzap, since only fatzaps
 * carry flags.
 */
void
mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
    dmu_tx_t *tx)
{
	dmu_buf_t *db;

	VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));

	dmu_buf_will_dirty(db, tx);
	mzap_phys_t *zp = db->db_data;
	zp->mz_block_type = ZBT_MICRO;
	/* The salt must be nonzero (see zap_hash()); force the low bit. */
	zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
	zp->mz_normflags = normflags;

	if (flags != 0) {
		zap_t *zap;
		/* Only fat zap supports flags; upgrade immediately. */
		VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER,
		    B_FALSE, B_FALSE, &zap));
		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
		zap_unlockdir(zap, FTAG);
	} else {
		dmu_buf_rele(db, FTAG);
	}
}
726
/*
 * Claim "obj" as a zap object with a legacy (0) dnode size.
 */
int
zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
	    0, tx));
}
734
/*
 * Claim "obj" as a zap object with no normalization flags.
 */
int
zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_claim_norm_dnsize(os, obj,
	    0, ot, bonustype, bonuslen, dnodesize, tx));
}
742
/*
 * Claim "obj" as a normalizing zap with a legacy (0) dnode size.
 */
int
zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
    dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
	    bonuslen, 0, tx));
}
751
/*
 * Claim a specific object number and initialize it as a microzap.
 * This is the workhorse behind the zap_create_claim*() wrappers.
 * Returns 0 or the error from dmu_object_claim_dnsize().
 */
int
zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
    dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
	int err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
	    dnodesize, tx);
	if (err != 0)
		return (err);
	mzap_create_impl(os, obj, normflags, 0, tx);
	return (0);
}
765
/*
 * Allocate a new non-normalizing zap object; returns its object number.
 */
uint64_t
zap_create(objset_t *os, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
}
772
/*
 * Allocate a new non-normalizing zap object with an explicit dnode size.
 */
uint64_t
zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
	    dnodesize, tx));
}
780
/*
 * Allocate a new normalizing zap object with a legacy (0) dnode size.
 */
uint64_t
zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
	    0, tx));
}
788
/*
 * Allocate a new object and initialize it as a microzap; the
 * workhorse behind the zap_create*() wrappers.  Returns the new
 * object number.
 */
uint64_t
zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
	uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
	    dnodesize, tx);

	mzap_create_impl(os, obj, normflags, 0, tx);
	return (obj);
}
800
/*
 * Allocate a new zap with explicit zap_flags_t and block shifts, using
 * the legacy (0) dnode size.
 */
uint64_t
zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_flags_dnsize(os, normflags, flags, ot,
	    leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
}
809
810 uint64_t
811 zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
812 dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
813 dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
814 {
815 ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
816 uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen,
817 dnodesize, tx);
818
819 ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
820 leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
821 indirect_blockshift >= SPA_MINBLOCKSHIFT &&
822 indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
823
824 VERIFY(dmu_object_set_blocksize(os, obj,
825 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
826
827 mzap_create_impl(os, obj, normflags, flags, tx);
828 return (obj);
829 }
830
/*
 * Destroy a zap object.  Returns 0 or an error from dmu_object_free().
 */
int
zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
{
	/*
	 * dmu_object_free will free the object number and free the
	 * data.  Freeing the data will cause our pageout function to be
	 * called, which will destroy our data (zap_leaf_t's and zap_t).
	 */

	return (dmu_object_free(os, zapobj, tx));
}
842
843 void
844 zap_evict_sync(void *dbu)
845 {
846 zap_t *zap = dbu;
847
848 rw_destroy(&zap->zap_rwlock);
849
850 if (zap->zap_ismicro)
851 mze_destroy(zap);
852 else
853 mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
854
855 kmem_free(zap, sizeof (zap_t));
856 }
857
858 int
859 zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
860 {
861 zap_t *zap;
862
863 int err =
864 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
865 if (err != 0)
866 return (err);
867 if (!zap->zap_ismicro) {
868 err = fzap_count(zap, count);
869 } else {
870 *count = zap->zap_m.zap_num_entries;
871 }
872 zap_unlockdir(zap, FTAG);
873 return (err);
874 }
875
876 /*
877 * zn may be NULL; if not specified, it will be computed if needed.
878 * See also the comment above zap_entry_normalization_conflict().
879 */
/*
 * Return B_TRUE if some other entry with the same hash as "mze"
 * normalizes to the same name.  Walks the AVL neighbors of mze in both
 * directions, lazily building a normalized zap_name_t for mze's own
 * name when zn was not supplied.
 */
static boolean_t
mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
{
	int direction = AVL_BEFORE;
	boolean_t allocdzn = B_FALSE;

	/* A non-normalizing zap cannot have normalization conflicts. */
	if (zap->zap_normflags == 0)
		return (B_FALSE);

again:
	for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
	    other && other->mze_hash == mze->mze_hash;
	    other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {

		if (zn == NULL) {
			/* Build the normalized name for mze on demand. */
			zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
			    MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}

	/* Done scanning backwards; repeat the scan forwards. */
	if (direction == AVL_BEFORE) {
		direction = AVL_AFTER;
		goto again;
	}

	if (allocdzn)
		zap_name_free(zn);
	return (B_FALSE);
}
915
916 /*
917 * Routines for manipulating attributes.
918 */
919
/*
 * Look up "name" with default (exact-match) semantics; see
 * zap_lookup_norm() for the full-featured version.
 */
int
zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
	return (zap_lookup_norm(os, zapobj, name, integer_size,
	    num_integers, buf, 0, NULL, 0, NULL));
}
927
/*
 * Common lookup body; caller holds the zap locked.  For micro zaps
 * only 8-byte single-integer values exist, so integer_size/num_integers
 * are validated here; "realname" receives the stored (possibly
 * differently-cased) name, and *ncp reports a normalization conflict.
 */
static int
zap_lookup_impl(zap_t *zap, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));

	if (!zap->zap_ismicro) {
		err = fzap_lookup(zn, integer_size, num_integers, buf,
		    realname, rn_len, ncp);
	} else {
		mzap_ent_t *mze = mze_find(zn);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			if (num_integers < 1) {
				err = SET_ERROR(EOVERFLOW);
			} else if (integer_size != 8) {
				/* microzap values are always one uint64 */
				err = SET_ERROR(EINVAL);
			} else {
				*(uint64_t *)buf =
				    MZE_PHYS(zap, mze)->mze_value;
				(void) strlcpy(realname,
				    MZE_PHYS(zap, mze)->mze_name, rn_len);
				if (ncp) {
					*ncp = mzap_normalization_conflict(zap,
					    zn, mze);
				}
			}
		}
	}
	zap_name_free(zn);
	return (err);
}
967
968 int
969 zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
970 uint64_t integer_size, uint64_t num_integers, void *buf,
971 matchtype_t mt, char *realname, int rn_len,
972 boolean_t *ncp)
973 {
974 zap_t *zap;
975
976 int err =
977 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
978 if (err != 0)
979 return (err);
980 err = zap_lookup_impl(zap, name, integer_size,
981 num_integers, buf, mt, realname, rn_len, ncp);
982 zap_unlockdir(zap, FTAG);
983 return (err);
984 }
985
986 int
987 zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
988 {
989 zap_t *zap;
990 int err;
991 zap_name_t *zn;
992
993 err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
994 if (err)
995 return (err);
996 zn = zap_name_alloc(zap, name, 0);
997 if (zn == NULL) {
998 zap_unlockdir(zap, FTAG);
999 return (SET_ERROR(ENOTSUP));
1000 }
1001
1002 fzap_prefetch(zn);
1003 zap_name_free(zn);
1004 zap_unlockdir(zap, FTAG);
1005 return (err);
1006 }
1007
/*
 * Exact-match lookup starting from a held dnode; see
 * zap_lookup_norm_by_dnode().
 */
int
zap_lookup_by_dnode(dnode_t *dn, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
	    num_integers, buf, 0, NULL, 0, NULL));
}
1015
1016 int
1017 zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
1018 uint64_t integer_size, uint64_t num_integers, void *buf,
1019 matchtype_t mt, char *realname, int rn_len,
1020 boolean_t *ncp)
1021 {
1022 zap_t *zap;
1023
1024 int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
1025 FTAG, &zap);
1026 if (err != 0)
1027 return (err);
1028 err = zap_lookup_impl(zap, name, integer_size,
1029 num_integers, buf, mt, realname, rn_len, ncp);
1030 zap_unlockdir(zap, FTAG);
1031 return (err);
1032 }
1033
1034 int
1035 zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1036 int key_numints)
1037 {
1038 zap_t *zap;
1039
1040 int err =
1041 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1042 if (err != 0)
1043 return (err);
1044 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1045 if (zn == NULL) {
1046 zap_unlockdir(zap, FTAG);
1047 return (SET_ERROR(ENOTSUP));
1048 }
1049
1050 fzap_prefetch(zn);
1051 zap_name_free(zn);
1052 zap_unlockdir(zap, FTAG);
1053 return (err);
1054 }
1055
1056 int
1057 zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1058 int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1059 {
1060 zap_t *zap;
1061
1062 int err =
1063 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1064 if (err != 0)
1065 return (err);
1066 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1067 if (zn == NULL) {
1068 zap_unlockdir(zap, FTAG);
1069 return (SET_ERROR(ENOTSUP));
1070 }
1071
1072 err = fzap_lookup(zn, integer_size, num_integers, buf,
1073 NULL, 0, NULL);
1074 zap_name_free(zn);
1075 zap_unlockdir(zap, FTAG);
1076 return (err);
1077 }
1078
1079 int
1080 zap_contains(objset_t *os, uint64_t zapobj, const char *name)
1081 {
1082 int err = zap_lookup_norm(os, zapobj, name, 0,
1083 0, NULL, 0, NULL, 0, NULL);
1084 if (err == EOVERFLOW || err == EINVAL)
1085 err = 0; /* found, but skipped reading the value */
1086 return (err);
1087 }
1088
/*
 * Report the value geometry of entry "name": its integer size and
 * count.  Micro zap values are always a single uint64.  Either output
 * pointer may be NULL.
 */
int
zap_length(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t *integer_size, uint64_t *num_integers)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_length(zn, integer_size, num_integers);
	} else {
		mzap_ent_t *mze = mze_find(zn);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			/* microzap entries are always one 8-byte integer */
			if (integer_size)
				*integer_size = 8;
			if (num_integers)
				*num_integers = 1;
		}
	}
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}
1121
1122 int
1123 zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1124 int key_numints, uint64_t *integer_size, uint64_t *num_integers)
1125 {
1126 zap_t *zap;
1127
1128 int err =
1129 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1130 if (err != 0)
1131 return (err);
1132 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1133 if (zn == NULL) {
1134 zap_unlockdir(zap, FTAG);
1135 return (SET_ERROR(ENOTSUP));
1136 }
1137 err = fzap_length(zn, integer_size, num_integers);
1138 zap_name_free(zn);
1139 zap_unlockdir(zap, FTAG);
1140 return (err);
1141 }
1142
/*
 * Store a new (name, value) pair into the first free microzap chunk,
 * scanning from the allocation cursor and wrapping once.  The caller
 * guarantees the name is new, fits, and that a free chunk exists
 * (zap_lockdir() grows/upgrades full microzaps); running out here is
 * therefore a panic.  Caller holds the zap write lock.
 */
static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
	zap_t *zap = zn->zn_zap;
	int start = zap->zap_m.zap_alloc_next;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

#ifdef ZFS_DEBUG
	/* Verify the key is not already present. */
	for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
	}
#endif

	uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
	/* given the limited size of the microzap, this can't happen */
	ASSERT(cd < zap_maxcd(zap));

again:
	for (int i = start; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		if (mze->mze_name[0] == 0) {
			mze->mze_value = value;
			mze->mze_cd = cd;
			(void) strlcpy(mze->mze_name, zn->zn_key_orig,
			    sizeof (mze->mze_name));
			zap->zap_m.zap_num_entries++;
			/* Advance the cursor, wrapping at the end. */
			zap->zap_m.zap_alloc_next = i+1;
			if (zap->zap_m.zap_alloc_next ==
			    zap->zap_m.zap_num_chunks)
				zap->zap_m.zap_alloc_next = 0;
			mze_insert(zap, i, zn->zn_hash);
			return;
		}
	}
	/* Nothing free after the cursor; rescan from the beginning. */
	if (start != 0) {
		start = 0;
		goto again;
	}
	cmn_err(CE_PANIC, "out of entries!");
}
1185
/*
 * Common guts of zap_add() and zap_add_by_dnode(): add a new entry to
 * a zap that is already locked for write under 'tag'.  This function
 * consumes the lock -- it always calls zap_unlockdir() on whichever
 * zap_t is current when it finishes (upgrading a microzap or growing
 * a fat zap can replace the zap_t; see the zn->zn_zap reloads below).
 *
 * Returns 0 on success, EEXIST if the name already exists, ENOTSUP if
 * the name is not legal for this zap, or another errno from the
 * fat-zap code.
 */
static int
zap_add_impl(zap_t *zap, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx, void *tag)
{
	const uint64_t *intval = val;
	int err = 0;

	zap_name_t *zn = zap_name_alloc(zap, key, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(key) >= MZAP_NAME_LEN ||
	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
		/*
		 * The entry doesn't fit the microzap format (value is not
		 * one uint64_t, the name is too long, or -- per
		 * mze_canfit_fzap_leaf() -- it couldn't be represented
		 * after conversion), so upgrade to a fat zap first.
		 */
		err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
		if (err == 0) {
			err = fzap_add(zn, integer_size, num_integers, val,
			    tag, tx);
		}
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else {
		/* Microzap fast path: reject duplicates, then insert. */
		if (mze_find(zn) != NULL) {
			err = SET_ERROR(EEXIST);
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap, tag);
	return (err);
}
1224
1225 int
1226 zap_add(objset_t *os, uint64_t zapobj, const char *key,
1227 int integer_size, uint64_t num_integers,
1228 const void *val, dmu_tx_t *tx)
1229 {
1230 zap_t *zap;
1231 int err;
1232
1233 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1234 if (err != 0)
1235 return (err);
1236 err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1237 /* zap_add_impl() calls zap_unlockdir() */
1238 return (err);
1239 }
1240
1241 int
1242 zap_add_by_dnode(dnode_t *dn, const char *key,
1243 int integer_size, uint64_t num_integers,
1244 const void *val, dmu_tx_t *tx)
1245 {
1246 zap_t *zap;
1247 int err;
1248
1249 err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1250 if (err != 0)
1251 return (err);
1252 err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1253 /* zap_add_impl() calls zap_unlockdir() */
1254 return (err);
1255 }
1256
1257 int
1258 zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1259 int key_numints, int integer_size, uint64_t num_integers,
1260 const void *val, dmu_tx_t *tx)
1261 {
1262 zap_t *zap;
1263
1264 int err =
1265 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1266 if (err != 0)
1267 return (err);
1268 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1269 if (zn == NULL) {
1270 zap_unlockdir(zap, FTAG);
1271 return (SET_ERROR(ENOTSUP));
1272 }
1273 err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
1274 zap = zn->zn_zap; /* fzap_add() may change zap */
1275 zap_name_free(zn);
1276 if (zap != NULL) /* may be NULL if fzap_add() failed */
1277 zap_unlockdir(zap, FTAG);
1278 return (err);
1279 }
1280
/*
 * Set the named entry to the given value, creating it if it does not
 * exist.  May upgrade a microzap to a fat zap when the new entry does
 * not fit the microzap format (value other than one uint64_t, or name
 * too long).
 */
int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	/* oldval exists only in debug builds, for the ASSERT3U below. */
	ASSERTV(uint64_t oldval);
	const uint64_t *intval = val;

#ifdef ZFS_DEBUG

	/*
	 * If there is an old value, it shouldn't change across the
	 * lockdir (eg, due to bprewrite's xlation).
	 */
	if (integer_size == 8 && num_integers == 1)
		(void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
#endif

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_update(zn, integer_size, num_integers, val,
		    FTAG, tx);
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(name) >= MZAP_NAME_LEN) {
		/* The entry can't be stored in microzap format; upgrade. */
		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
		    zapobj, integer_size, num_integers, name);
		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
		if (err == 0) {
			err = fzap_update(zn, integer_size, num_integers,
			    val, FTAG, tx);
		}
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else {
		mzap_ent_t *mze = mze_find(zn);
		if (mze != NULL) {
			/* Existing microzap entry: overwrite in place. */
			ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
			MZE_PHYS(zap, mze)->mze_value = *intval;
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap, FTAG);
	return (err);
}
1337
1338 int
1339 zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1340 int key_numints,
1341 int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1342 {
1343 zap_t *zap;
1344
1345 int err =
1346 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1347 if (err != 0)
1348 return (err);
1349 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1350 if (zn == NULL) {
1351 zap_unlockdir(zap, FTAG);
1352 return (SET_ERROR(ENOTSUP));
1353 }
1354 err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
1355 zap = zn->zn_zap; /* fzap_update() may change zap */
1356 zap_name_free(zn);
1357 if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
1358 zap_unlockdir(zap, FTAG);
1359 return (err);
1360 }
1361
/*
 * Remove the named entry using an exact (non-normalized) name match.
 * Matchtype 0 requests the exact-match behavior of zap_remove_norm().
 */
int
zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
{
	return (zap_remove_norm(os, zapobj, name, 0, tx));
}
1367
/*
 * Common guts of zap_remove_norm() and zap_remove_by_dnode(): remove
 * the named entry from a zap the caller has already locked.  Unlike
 * zap_add_impl(), this does NOT release the lock -- the caller must
 * call zap_unlockdir() itself.
 *
 * Returns 0 on success, ENOENT if the name is not present, ENOTSUP if
 * the name is not legal for this zap.
 */
static int
zap_remove_impl(zap_t *zap, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));
	if (!zap->zap_ismicro) {
		err = fzap_remove(zn, tx);
	} else {
		mzap_ent_t *mze = mze_find(zn);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			zap->zap_m.zap_num_entries--;
			/* Clear the on-disk chunk, then the in-core entry. */
			bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
			    sizeof (mzap_ent_phys_t));
			mze_remove(zap, mze);
		}
	}
	zap_name_free(zn);
	return (err);
}
1393
1394 int
1395 zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
1396 matchtype_t mt, dmu_tx_t *tx)
1397 {
1398 zap_t *zap;
1399 int err;
1400
1401 err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1402 if (err)
1403 return (err);
1404 err = zap_remove_impl(zap, name, mt, tx);
1405 zap_unlockdir(zap, FTAG);
1406 return (err);
1407 }
1408
1409 int
1410 zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
1411 {
1412 zap_t *zap;
1413 int err;
1414
1415 err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1416 if (err)
1417 return (err);
1418 err = zap_remove_impl(zap, name, 0, tx);
1419 zap_unlockdir(zap, FTAG);
1420 return (err);
1421 }
1422
1423 int
1424 zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1425 int key_numints, dmu_tx_t *tx)
1426 {
1427 zap_t *zap;
1428
1429 int err =
1430 zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1431 if (err != 0)
1432 return (err);
1433 zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1434 if (zn == NULL) {
1435 zap_unlockdir(zap, FTAG);
1436 return (SET_ERROR(ENOTSUP));
1437 }
1438 err = fzap_remove(zn, tx);
1439 zap_name_free(zn);
1440 zap_unlockdir(zap, FTAG);
1441 return (err);
1442 }
1443
1444 /*
1445 * Routines for iterating over the attributes.
1446 */
1447
1448 void
1449 zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
1450 uint64_t serialized)
1451 {
1452 zc->zc_objset = os;
1453 zc->zc_zap = NULL;
1454 zc->zc_leaf = NULL;
1455 zc->zc_zapobj = zapobj;
1456 zc->zc_serialized = serialized;
1457 zc->zc_hash = 0;
1458 zc->zc_cd = 0;
1459 }
1460
/*
 * Initialize a cursor at the beginning of the zap object
 * (serialized cookie 0).
 */
void
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
	zap_cursor_init_serialized(zc, os, zapobj, 0);
}
1466
/*
 * Release everything the cursor holds.  Safe on a cursor that never
 * retrieved anything (zc_zap and zc_leaf may still be NULL).
 */
void
zap_cursor_fini(zap_cursor_t *zc)
{
	if (zc->zc_zap) {
		/* Retake the lock so it is held across zap_unlockdir(). */
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
		zap_unlockdir(zc->zc_zap, NULL);
		zc->zc_zap = NULL;
	}
	if (zc->zc_leaf) {
		/* Likewise, hold the leaf lock across zap_put_leaf(). */
		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
		zap_put_leaf(zc->zc_leaf);
		zc->zc_leaf = NULL;
	}
	zc->zc_objset = NULL;
}
1482
1483 uint64_t
1484 zap_cursor_serialize(zap_cursor_t *zc)
1485 {
1486 if (zc->zc_hash == -1ULL)
1487 return (-1ULL);
1488 if (zc->zc_zap == NULL)
1489 return (zc->zc_serialized);
1490 ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
1491 ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
1492
1493 /*
1494 * We want to keep the high 32 bits of the cursor zero if we can, so
1495 * that 32-bit programs can access this. So usually use a small
1496 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
1497 * of the cursor.
1498 *
1499 * [ collision differentiator | zap_hashbits()-bit hash value ]
1500 */
1501 return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
1502 ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
1503 }
1504
1505 int
1506 zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
1507 {
1508 int err;
1509
1510 if (zc->zc_hash == -1ULL)
1511 return (SET_ERROR(ENOENT));
1512
1513 if (zc->zc_zap == NULL) {
1514 int hb;
1515 err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
1516 RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
1517 if (err != 0)
1518 return (err);
1519
1520 /*
1521 * To support zap_cursor_init_serialized, advance, retrieve,
1522 * we must add to the existing zc_cd, which may already
1523 * be 1 due to the zap_cursor_advance.
1524 */
1525 ASSERT(zc->zc_hash == 0);
1526 hb = zap_hashbits(zc->zc_zap);
1527 zc->zc_hash = zc->zc_serialized << (64 - hb);
1528 zc->zc_cd += zc->zc_serialized >> hb;
1529 if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
1530 zc->zc_cd = 0;
1531 } else {
1532 rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
1533 }
1534 if (!zc->zc_zap->zap_ismicro) {
1535 err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
1536 } else {
1537 avl_index_t idx;
1538 mzap_ent_t mze_tofind;
1539
1540 mze_tofind.mze_hash = zc->zc_hash;
1541 mze_tofind.mze_cd = zc->zc_cd;
1542
1543 mzap_ent_t *mze =
1544 avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
1545 if (mze == NULL) {
1546 mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
1547 idx, AVL_AFTER);
1548 }
1549 if (mze) {
1550 mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
1551 ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
1552 za->za_normalization_conflict =
1553 mzap_normalization_conflict(zc->zc_zap, NULL, mze);
1554 za->za_integer_length = 8;
1555 za->za_num_integers = 1;
1556 za->za_first_integer = mzep->mze_value;
1557 (void) strcpy(za->za_name, mzep->mze_name);
1558 zc->zc_hash = mze->mze_hash;
1559 zc->zc_cd = mze->mze_cd;
1560 err = 0;
1561 } else {
1562 zc->zc_hash = -1ULL;
1563 err = SET_ERROR(ENOENT);
1564 }
1565 }
1566 rw_exit(&zc->zc_zap->zap_rwlock);
1567 return (err);
1568 }
1569
/*
 * Advance the cursor past the entry it currently points at, by
 * bumping the collision differentiator.  A hash of -1ULL means the
 * cursor is already exhausted, so there is nothing to do.
 */
void
zap_cursor_advance(zap_cursor_t *zc)
{
	if (zc->zc_hash == -1ULL)
		return;
	zc->zc_cd++;
}
1577
1578 int
1579 zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
1580 {
1581 zap_t *zap;
1582
1583 int err =
1584 zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1585 if (err != 0)
1586 return (err);
1587
1588 bzero(zs, sizeof (zap_stats_t));
1589
1590 if (zap->zap_ismicro) {
1591 zs->zs_blocksize = zap->zap_dbuf->db_size;
1592 zs->zs_num_entries = zap->zap_m.zap_num_entries;
1593 zs->zs_num_blocks = 1;
1594 } else {
1595 fzap_get_stats(zap, zs);
1596 }
1597 zap_unlockdir(zap, FTAG);
1598 return (0);
1599 }
1600
#if defined(_KERNEL)
/* Export the public ZAP API to other kernel modules. */
EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_destroy);
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_by_dnode);
EXPORT_SYMBOL(zap_lookup_norm);
EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
EXPORT_SYMBOL(zap_join_increment);
EXPORT_SYMBOL(zap_add_int);
EXPORT_SYMBOL(zap_remove_int);
EXPORT_SYMBOL(zap_lookup_int);
EXPORT_SYMBOL(zap_increment_int);
EXPORT_SYMBOL(zap_add_int_key);
EXPORT_SYMBOL(zap_lookup_int_key);
EXPORT_SYMBOL(zap_increment);
EXPORT_SYMBOL(zap_cursor_init);
EXPORT_SYMBOL(zap_cursor_fini);
EXPORT_SYMBOL(zap_cursor_retrieve);
EXPORT_SYMBOL(zap_cursor_advance);
EXPORT_SYMBOL(zap_cursor_serialize);
EXPORT_SYMBOL(zap_cursor_init_serialized);
EXPORT_SYMBOL(zap_get_stats);
#endif