]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright 2008 Sun Microsystems, Inc. All rights reserved. | |
23 | * Use is subject to license terms. | |
24 | */ | |
25 | ||
b128c09f | 26 | #pragma ident "%Z%%M% %I% %E% SMI" |
34dc7c2f BB |
27 | |
28 | #include <sys/spa.h> | |
29 | #include <sys/dmu.h> | |
30 | #include <sys/zfs_context.h> | |
31 | #include <sys/zap.h> | |
32 | #include <sys/refcount.h> | |
33 | #include <sys/zap_impl.h> | |
34 | #include <sys/zap_leaf.h> | |
35 | #include <sys/avl.h> | |
36 | ||
37 | #ifdef _KERNEL | |
38 | #include <sys/sunddi.h> | |
39 | #endif | |
40 | ||
41 | static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx); | |
42 | ||
43 | ||
44 | static uint64_t | |
45 | zap_hash(zap_t *zap, const char *normname) | |
46 | { | |
47 | const uint8_t *cp; | |
48 | uint8_t c; | |
49 | uint64_t crc = zap->zap_salt; | |
50 | ||
51 | /* NB: name must already be normalized, if necessary */ | |
52 | ||
53 | ASSERT(crc != 0); | |
54 | ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); | |
55 | for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) { | |
56 | crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; | |
57 | } | |
58 | ||
59 | /* | |
60 | * Only use 28 bits, since we need 4 bits in the cookie for the | |
61 | * collision differentiator. We MUST use the high bits, since | |
62 | * those are the ones that we first pay attention to when | |
63 | * chosing the bucket. | |
64 | */ | |
65 | crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); | |
66 | ||
67 | return (crc); | |
68 | } | |
69 | ||
70 | static int | |
71 | zap_normalize(zap_t *zap, const char *name, char *namenorm) | |
72 | { | |
73 | size_t inlen, outlen; | |
74 | int err; | |
75 | ||
76 | inlen = strlen(name) + 1; | |
77 | outlen = ZAP_MAXNAMELEN; | |
78 | ||
79 | err = 0; | |
80 | (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, | |
81 | zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST, | |
82 | &err); | |
83 | ||
84 | return (err); | |
85 | } | |
86 | ||
87 | boolean_t | |
88 | zap_match(zap_name_t *zn, const char *matchname) | |
89 | { | |
90 | if (zn->zn_matchtype == MT_FIRST) { | |
91 | char norm[ZAP_MAXNAMELEN]; | |
92 | ||
93 | if (zap_normalize(zn->zn_zap, matchname, norm) != 0) | |
94 | return (B_FALSE); | |
95 | ||
96 | return (strcmp(zn->zn_name_norm, norm) == 0); | |
97 | } else { | |
98 | /* MT_BEST or MT_EXACT */ | |
99 | return (strcmp(zn->zn_name_orij, matchname) == 0); | |
100 | } | |
101 | } | |
102 | ||
103 | void | |
104 | zap_name_free(zap_name_t *zn) | |
105 | { | |
106 | kmem_free(zn, sizeof (zap_name_t)); | |
107 | } | |
108 | ||
109 | /* XXX combine this with zap_lockdir()? */ | |
110 | zap_name_t * | |
111 | zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt) | |
112 | { | |
113 | zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); | |
114 | ||
115 | zn->zn_zap = zap; | |
116 | zn->zn_name_orij = name; | |
117 | zn->zn_matchtype = mt; | |
118 | if (zap->zap_normflags) { | |
119 | if (zap_normalize(zap, name, zn->zn_normbuf) != 0) { | |
120 | zap_name_free(zn); | |
121 | return (NULL); | |
122 | } | |
123 | zn->zn_name_norm = zn->zn_normbuf; | |
124 | } else { | |
125 | if (mt != MT_EXACT) { | |
126 | zap_name_free(zn); | |
127 | return (NULL); | |
128 | } | |
129 | zn->zn_name_norm = zn->zn_name_orij; | |
130 | } | |
131 | ||
132 | zn->zn_hash = zap_hash(zap, zn->zn_name_norm); | |
133 | return (zn); | |
134 | } | |
135 | ||
136 | static void | |
137 | mzap_byteswap(mzap_phys_t *buf, size_t size) | |
138 | { | |
139 | int i, max; | |
140 | buf->mz_block_type = BSWAP_64(buf->mz_block_type); | |
141 | buf->mz_salt = BSWAP_64(buf->mz_salt); | |
142 | buf->mz_normflags = BSWAP_64(buf->mz_normflags); | |
143 | max = (size / MZAP_ENT_LEN) - 1; | |
144 | for (i = 0; i < max; i++) { | |
145 | buf->mz_chunk[i].mze_value = | |
146 | BSWAP_64(buf->mz_chunk[i].mze_value); | |
147 | buf->mz_chunk[i].mze_cd = | |
148 | BSWAP_32(buf->mz_chunk[i].mze_cd); | |
149 | } | |
150 | } | |
151 | ||
152 | void | |
153 | zap_byteswap(void *buf, size_t size) | |
154 | { | |
155 | uint64_t block_type; | |
156 | ||
157 | block_type = *(uint64_t *)buf; | |
158 | ||
159 | if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { | |
160 | /* ASSERT(magic == ZAP_LEAF_MAGIC); */ | |
161 | mzap_byteswap(buf, size); | |
162 | } else { | |
163 | fzap_byteswap(buf, size); | |
164 | } | |
165 | } | |
166 | ||
167 | static int | |
168 | mze_compare(const void *arg1, const void *arg2) | |
169 | { | |
170 | const mzap_ent_t *mze1 = arg1; | |
171 | const mzap_ent_t *mze2 = arg2; | |
172 | ||
173 | if (mze1->mze_hash > mze2->mze_hash) | |
174 | return (+1); | |
175 | if (mze1->mze_hash < mze2->mze_hash) | |
176 | return (-1); | |
177 | if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd) | |
178 | return (+1); | |
179 | if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd) | |
180 | return (-1); | |
181 | return (0); | |
182 | } | |
183 | ||
184 | static void | |
185 | mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) | |
186 | { | |
187 | mzap_ent_t *mze; | |
188 | ||
189 | ASSERT(zap->zap_ismicro); | |
190 | ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); | |
191 | ASSERT(mzep->mze_cd < ZAP_MAXCD); | |
192 | ||
193 | mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); | |
194 | mze->mze_chunkid = chunkid; | |
195 | mze->mze_hash = hash; | |
196 | mze->mze_phys = *mzep; | |
197 | avl_add(&zap->zap_m.zap_avl, mze); | |
198 | } | |
199 | ||
200 | static mzap_ent_t * | |
201 | mze_find(zap_name_t *zn) | |
202 | { | |
203 | mzap_ent_t mze_tofind; | |
204 | mzap_ent_t *mze; | |
205 | avl_index_t idx; | |
206 | avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; | |
207 | ||
208 | ASSERT(zn->zn_zap->zap_ismicro); | |
209 | ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); | |
210 | ||
211 | if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name)) | |
212 | return (NULL); | |
213 | ||
214 | mze_tofind.mze_hash = zn->zn_hash; | |
215 | mze_tofind.mze_phys.mze_cd = 0; | |
216 | ||
217 | again: | |
218 | mze = avl_find(avl, &mze_tofind, &idx); | |
219 | if (mze == NULL) | |
220 | mze = avl_nearest(avl, idx, AVL_AFTER); | |
221 | for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { | |
222 | if (zap_match(zn, mze->mze_phys.mze_name)) | |
223 | return (mze); | |
224 | } | |
225 | if (zn->zn_matchtype == MT_BEST) { | |
226 | zn->zn_matchtype = MT_FIRST; | |
227 | goto again; | |
228 | } | |
229 | return (NULL); | |
230 | } | |
231 | ||
232 | static uint32_t | |
233 | mze_find_unused_cd(zap_t *zap, uint64_t hash) | |
234 | { | |
235 | mzap_ent_t mze_tofind; | |
236 | mzap_ent_t *mze; | |
237 | avl_index_t idx; | |
238 | avl_tree_t *avl = &zap->zap_m.zap_avl; | |
239 | uint32_t cd; | |
240 | ||
241 | ASSERT(zap->zap_ismicro); | |
242 | ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); | |
243 | ||
244 | mze_tofind.mze_hash = hash; | |
245 | mze_tofind.mze_phys.mze_cd = 0; | |
246 | ||
247 | cd = 0; | |
248 | for (mze = avl_find(avl, &mze_tofind, &idx); | |
249 | mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { | |
250 | if (mze->mze_phys.mze_cd != cd) | |
251 | break; | |
252 | cd++; | |
253 | } | |
254 | ||
255 | return (cd); | |
256 | } | |
257 | ||
258 | static void | |
259 | mze_remove(zap_t *zap, mzap_ent_t *mze) | |
260 | { | |
261 | ASSERT(zap->zap_ismicro); | |
262 | ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); | |
263 | ||
264 | avl_remove(&zap->zap_m.zap_avl, mze); | |
265 | kmem_free(mze, sizeof (mzap_ent_t)); | |
266 | } | |
267 | ||
268 | static void | |
269 | mze_destroy(zap_t *zap) | |
270 | { | |
271 | mzap_ent_t *mze; | |
272 | void *avlcookie = NULL; | |
273 | ||
274 | while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) | |
275 | kmem_free(mze, sizeof (mzap_ent_t)); | |
276 | avl_destroy(&zap->zap_m.zap_avl); | |
277 | } | |
278 | ||
279 | static zap_t * | |
280 | mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) | |
281 | { | |
282 | zap_t *winner; | |
283 | zap_t *zap; | |
284 | int i; | |
285 | ||
286 | ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); | |
287 | ||
288 | zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); | |
289 | rw_init(&zap->zap_rwlock, 0, 0, 0); | |
290 | rw_enter(&zap->zap_rwlock, RW_WRITER); | |
291 | zap->zap_objset = os; | |
292 | zap->zap_object = obj; | |
293 | zap->zap_dbuf = db; | |
294 | ||
295 | if (*(uint64_t *)db->db_data != ZBT_MICRO) { | |
296 | mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); | |
297 | zap->zap_f.zap_block_shift = highbit(db->db_size) - 1; | |
298 | } else { | |
299 | zap->zap_ismicro = TRUE; | |
300 | } | |
301 | ||
302 | /* | |
303 | * Make sure that zap_ismicro is set before we let others see | |
304 | * it, because zap_lockdir() checks zap_ismicro without the lock | |
305 | * held. | |
306 | */ | |
307 | winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict); | |
308 | ||
309 | if (winner != NULL) { | |
310 | rw_exit(&zap->zap_rwlock); | |
311 | rw_destroy(&zap->zap_rwlock); | |
312 | if (!zap->zap_ismicro) | |
313 | mutex_destroy(&zap->zap_f.zap_num_entries_mtx); | |
314 | kmem_free(zap, sizeof (zap_t)); | |
315 | return (winner); | |
316 | } | |
317 | ||
318 | if (zap->zap_ismicro) { | |
319 | zap->zap_salt = zap->zap_m.zap_phys->mz_salt; | |
320 | zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags; | |
321 | zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; | |
322 | avl_create(&zap->zap_m.zap_avl, mze_compare, | |
323 | sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); | |
324 | ||
325 | for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { | |
326 | mzap_ent_phys_t *mze = | |
327 | &zap->zap_m.zap_phys->mz_chunk[i]; | |
328 | if (mze->mze_name[0]) { | |
329 | zap_name_t *zn; | |
330 | ||
331 | zap->zap_m.zap_num_entries++; | |
332 | zn = zap_name_alloc(zap, mze->mze_name, | |
333 | MT_EXACT); | |
334 | mze_insert(zap, i, zn->zn_hash, mze); | |
335 | zap_name_free(zn); | |
336 | } | |
337 | } | |
338 | } else { | |
339 | zap->zap_salt = zap->zap_f.zap_phys->zap_salt; | |
340 | zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags; | |
341 | ||
342 | ASSERT3U(sizeof (struct zap_leaf_header), ==, | |
343 | 2*ZAP_LEAF_CHUNKSIZE); | |
344 | ||
345 | /* | |
346 | * The embedded pointer table should not overlap the | |
347 | * other members. | |
348 | */ | |
349 | ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, | |
350 | &zap->zap_f.zap_phys->zap_salt); | |
351 | ||
352 | /* | |
353 | * The embedded pointer table should end at the end of | |
354 | * the block | |
355 | */ | |
356 | ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, | |
357 | 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) - | |
358 | (uintptr_t)zap->zap_f.zap_phys, ==, | |
359 | zap->zap_dbuf->db_size); | |
360 | } | |
361 | rw_exit(&zap->zap_rwlock); | |
362 | return (zap); | |
363 | } | |
364 | ||
365 | int | |
366 | zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, | |
367 | krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) | |
368 | { | |
369 | zap_t *zap; | |
370 | dmu_buf_t *db; | |
371 | krw_t lt; | |
372 | int err; | |
373 | ||
374 | *zapp = NULL; | |
375 | ||
376 | err = dmu_buf_hold(os, obj, 0, NULL, &db); | |
377 | if (err) | |
378 | return (err); | |
379 | ||
380 | #ifdef ZFS_DEBUG | |
381 | { | |
382 | dmu_object_info_t doi; | |
383 | dmu_object_info_from_db(db, &doi); | |
384 | ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); | |
385 | } | |
386 | #endif | |
387 | ||
388 | zap = dmu_buf_get_user(db); | |
389 | if (zap == NULL) | |
390 | zap = mzap_open(os, obj, db); | |
391 | ||
392 | /* | |
393 | * We're checking zap_ismicro without the lock held, in order to | |
394 | * tell what type of lock we want. Once we have some sort of | |
395 | * lock, see if it really is the right type. In practice this | |
396 | * can only be different if it was upgraded from micro to fat, | |
397 | * and micro wanted WRITER but fat only needs READER. | |
398 | */ | |
399 | lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; | |
400 | rw_enter(&zap->zap_rwlock, lt); | |
401 | if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { | |
402 | /* it was upgraded, now we only need reader */ | |
403 | ASSERT(lt == RW_WRITER); | |
404 | ASSERT(RW_READER == | |
405 | (!zap->zap_ismicro && fatreader) ? RW_READER : lti); | |
406 | rw_downgrade(&zap->zap_rwlock); | |
407 | lt = RW_READER; | |
408 | } | |
409 | ||
410 | zap->zap_objset = os; | |
411 | ||
412 | if (lt == RW_WRITER) | |
413 | dmu_buf_will_dirty(db, tx); | |
414 | ||
415 | ASSERT3P(zap->zap_dbuf, ==, db); | |
416 | ||
417 | ASSERT(!zap->zap_ismicro || | |
418 | zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); | |
419 | if (zap->zap_ismicro && tx && adding && | |
420 | zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { | |
421 | uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; | |
422 | if (newsz > MZAP_MAX_BLKSZ) { | |
423 | dprintf("upgrading obj %llu: num_entries=%u\n", | |
424 | obj, zap->zap_m.zap_num_entries); | |
425 | *zapp = zap; | |
426 | return (mzap_upgrade(zapp, tx)); | |
427 | } | |
428 | err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); | |
429 | ASSERT3U(err, ==, 0); | |
430 | zap->zap_m.zap_num_chunks = | |
431 | db->db_size / MZAP_ENT_LEN - 1; | |
432 | } | |
433 | ||
434 | *zapp = zap; | |
435 | return (0); | |
436 | } | |
437 | ||
438 | void | |
439 | zap_unlockdir(zap_t *zap) | |
440 | { | |
441 | rw_exit(&zap->zap_rwlock); | |
442 | dmu_buf_rele(zap->zap_dbuf, NULL); | |
443 | } | |
444 | ||
445 | static int | |
446 | mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) | |
447 | { | |
448 | mzap_phys_t *mzp; | |
449 | int i, sz, nchunks, err; | |
450 | zap_t *zap = *zapp; | |
451 | ||
452 | ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); | |
453 | ||
454 | sz = zap->zap_dbuf->db_size; | |
455 | mzp = kmem_alloc(sz, KM_SLEEP); | |
456 | bcopy(zap->zap_dbuf->db_data, mzp, sz); | |
457 | nchunks = zap->zap_m.zap_num_chunks; | |
458 | ||
459 | err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, | |
460 | 1ULL << fzap_default_block_shift, 0, tx); | |
461 | if (err) { | |
462 | kmem_free(mzp, sz); | |
463 | return (err); | |
464 | } | |
465 | ||
466 | dprintf("upgrading obj=%llu with %u chunks\n", | |
467 | zap->zap_object, nchunks); | |
468 | /* XXX destroy the avl later, so we can use the stored hash value */ | |
469 | mze_destroy(zap); | |
470 | ||
471 | fzap_upgrade(zap, tx); | |
472 | ||
473 | for (i = 0; i < nchunks; i++) { | |
474 | int err; | |
475 | mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; | |
476 | zap_name_t *zn; | |
477 | if (mze->mze_name[0] == 0) | |
478 | continue; | |
479 | dprintf("adding %s=%llu\n", | |
480 | mze->mze_name, mze->mze_value); | |
481 | zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); | |
482 | err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx); | |
483 | zap = zn->zn_zap; /* fzap_add_cd() may change zap */ | |
484 | zap_name_free(zn); | |
485 | if (err) | |
486 | break; | |
487 | } | |
488 | kmem_free(mzp, sz); | |
489 | *zapp = zap; | |
490 | return (err); | |
491 | } | |
492 | ||
493 | static void | |
494 | mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx) | |
495 | { | |
496 | dmu_buf_t *db; | |
497 | mzap_phys_t *zp; | |
498 | ||
499 | VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db)); | |
500 | ||
501 | #ifdef ZFS_DEBUG | |
502 | { | |
503 | dmu_object_info_t doi; | |
504 | dmu_object_info_from_db(db, &doi); | |
505 | ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap); | |
506 | } | |
507 | #endif | |
508 | ||
509 | dmu_buf_will_dirty(db, tx); | |
510 | zp = db->db_data; | |
511 | zp->mz_block_type = ZBT_MICRO; | |
512 | zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; | |
513 | zp->mz_normflags = normflags; | |
514 | dmu_buf_rele(db, FTAG); | |
515 | } | |
516 | ||
517 | int | |
518 | zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, | |
519 | dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) | |
520 | { | |
521 | return (zap_create_claim_norm(os, obj, | |
522 | 0, ot, bonustype, bonuslen, tx)); | |
523 | } | |
524 | ||
525 | int | |
526 | zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, | |
527 | dmu_object_type_t ot, | |
528 | dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) | |
529 | { | |
530 | int err; | |
531 | ||
532 | err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); | |
533 | if (err != 0) | |
534 | return (err); | |
535 | mzap_create_impl(os, obj, normflags, tx); | |
536 | return (0); | |
537 | } | |
538 | ||
539 | uint64_t | |
540 | zap_create(objset_t *os, dmu_object_type_t ot, | |
541 | dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) | |
542 | { | |
543 | return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); | |
544 | } | |
545 | ||
546 | uint64_t | |
547 | zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, | |
548 | dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) | |
549 | { | |
550 | uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); | |
551 | ||
552 | mzap_create_impl(os, obj, normflags, tx); | |
553 | return (obj); | |
554 | } | |
555 | ||
556 | int | |
557 | zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) | |
558 | { | |
559 | /* | |
560 | * dmu_object_free will free the object number and free the | |
561 | * data. Freeing the data will cause our pageout function to be | |
562 | * called, which will destroy our data (zap_leaf_t's and zap_t). | |
563 | */ | |
564 | ||
565 | return (dmu_object_free(os, zapobj, tx)); | |
566 | } | |
567 | ||
568 | _NOTE(ARGSUSED(0)) | |
569 | void | |
570 | zap_evict(dmu_buf_t *db, void *vzap) | |
571 | { | |
572 | zap_t *zap = vzap; | |
573 | ||
574 | rw_destroy(&zap->zap_rwlock); | |
575 | ||
576 | if (zap->zap_ismicro) | |
577 | mze_destroy(zap); | |
578 | else | |
579 | mutex_destroy(&zap->zap_f.zap_num_entries_mtx); | |
580 | ||
581 | kmem_free(zap, sizeof (zap_t)); | |
582 | } | |
583 | ||
584 | int | |
585 | zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) | |
586 | { | |
587 | zap_t *zap; | |
588 | int err; | |
589 | ||
590 | err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); | |
591 | if (err) | |
592 | return (err); | |
593 | if (!zap->zap_ismicro) { | |
594 | err = fzap_count(zap, count); | |
595 | } else { | |
596 | *count = zap->zap_m.zap_num_entries; | |
597 | } | |
598 | zap_unlockdir(zap); | |
599 | return (err); | |
600 | } | |
601 | ||
602 | /* | |
603 | * zn may be NULL; if not specified, it will be computed if needed. | |
604 | * See also the comment above zap_entry_normalization_conflict(). | |
605 | */ | |
606 | static boolean_t | |
607 | mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) | |
608 | { | |
609 | mzap_ent_t *other; | |
610 | int direction = AVL_BEFORE; | |
611 | boolean_t allocdzn = B_FALSE; | |
612 | ||
613 | if (zap->zap_normflags == 0) | |
614 | return (B_FALSE); | |
615 | ||
616 | again: | |
617 | for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction); | |
618 | other && other->mze_hash == mze->mze_hash; | |
619 | other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { | |
620 | ||
621 | if (zn == NULL) { | |
622 | zn = zap_name_alloc(zap, mze->mze_phys.mze_name, | |
623 | MT_FIRST); | |
624 | allocdzn = B_TRUE; | |
625 | } | |
626 | if (zap_match(zn, other->mze_phys.mze_name)) { | |
627 | if (allocdzn) | |
628 | zap_name_free(zn); | |
629 | return (B_TRUE); | |
630 | } | |
631 | } | |
632 | ||
633 | if (direction == AVL_BEFORE) { | |
634 | direction = AVL_AFTER; | |
635 | goto again; | |
636 | } | |
637 | ||
638 | if (allocdzn) | |
639 | zap_name_free(zn); | |
640 | return (B_FALSE); | |
641 | } | |
642 | ||
643 | /* | |
644 | * Routines for manipulating attributes. | |
645 | */ | |
646 | ||
647 | int | |
648 | zap_lookup(objset_t *os, uint64_t zapobj, const char *name, | |
649 | uint64_t integer_size, uint64_t num_integers, void *buf) | |
650 | { | |
651 | return (zap_lookup_norm(os, zapobj, name, integer_size, | |
652 | num_integers, buf, MT_EXACT, NULL, 0, NULL)); | |
653 | } | |
654 | ||
655 | int | |
656 | zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, | |
657 | uint64_t integer_size, uint64_t num_integers, void *buf, | |
658 | matchtype_t mt, char *realname, int rn_len, | |
659 | boolean_t *ncp) | |
660 | { | |
661 | zap_t *zap; | |
662 | int err; | |
663 | mzap_ent_t *mze; | |
664 | zap_name_t *zn; | |
665 | ||
666 | err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); | |
667 | if (err) | |
668 | return (err); | |
669 | zn = zap_name_alloc(zap, name, mt); | |
670 | if (zn == NULL) { | |
671 | zap_unlockdir(zap); | |
672 | return (ENOTSUP); | |
673 | } | |
674 | ||
675 | if (!zap->zap_ismicro) { | |
676 | err = fzap_lookup(zn, integer_size, num_integers, buf, | |
677 | realname, rn_len, ncp); | |
678 | } else { | |
679 | mze = mze_find(zn); | |
680 | if (mze == NULL) { | |
681 | err = ENOENT; | |
682 | } else { | |
683 | if (num_integers < 1) { | |
684 | err = EOVERFLOW; | |
685 | } else if (integer_size != 8) { | |
686 | err = EINVAL; | |
687 | } else { | |
688 | *(uint64_t *)buf = mze->mze_phys.mze_value; | |
689 | (void) strlcpy(realname, | |
690 | mze->mze_phys.mze_name, rn_len); | |
691 | if (ncp) { | |
692 | *ncp = mzap_normalization_conflict(zap, | |
693 | zn, mze); | |
694 | } | |
695 | } | |
696 | } | |
697 | } | |
698 | zap_name_free(zn); | |
699 | zap_unlockdir(zap); | |
700 | return (err); | |
701 | } | |
702 | ||
703 | int | |
704 | zap_length(objset_t *os, uint64_t zapobj, const char *name, | |
705 | uint64_t *integer_size, uint64_t *num_integers) | |
706 | { | |
707 | zap_t *zap; | |
708 | int err; | |
709 | mzap_ent_t *mze; | |
710 | zap_name_t *zn; | |
711 | ||
712 | err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); | |
713 | if (err) | |
714 | return (err); | |
715 | zn = zap_name_alloc(zap, name, MT_EXACT); | |
716 | if (zn == NULL) { | |
717 | zap_unlockdir(zap); | |
718 | return (ENOTSUP); | |
719 | } | |
720 | if (!zap->zap_ismicro) { | |
721 | err = fzap_length(zn, integer_size, num_integers); | |
722 | } else { | |
723 | mze = mze_find(zn); | |
724 | if (mze == NULL) { | |
725 | err = ENOENT; | |
726 | } else { | |
727 | if (integer_size) | |
728 | *integer_size = 8; | |
729 | if (num_integers) | |
730 | *num_integers = 1; | |
731 | } | |
732 | } | |
733 | zap_name_free(zn); | |
734 | zap_unlockdir(zap); | |
735 | return (err); | |
736 | } | |
737 | ||
738 | static void | |
739 | mzap_addent(zap_name_t *zn, uint64_t value) | |
740 | { | |
741 | int i; | |
742 | zap_t *zap = zn->zn_zap; | |
743 | int start = zap->zap_m.zap_alloc_next; | |
744 | uint32_t cd; | |
745 | ||
746 | dprintf("obj=%llu %s=%llu\n", zap->zap_object, | |
747 | zn->zn_name_orij, value); | |
748 | ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); | |
749 | ||
750 | #ifdef ZFS_DEBUG | |
751 | for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { | |
752 | mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; | |
753 | ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0); | |
754 | } | |
755 | #endif | |
756 | ||
757 | cd = mze_find_unused_cd(zap, zn->zn_hash); | |
758 | /* given the limited size of the microzap, this can't happen */ | |
759 | ASSERT(cd != ZAP_MAXCD); | |
760 | ||
761 | again: | |
762 | for (i = start; i < zap->zap_m.zap_num_chunks; i++) { | |
763 | mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; | |
764 | if (mze->mze_name[0] == 0) { | |
765 | mze->mze_value = value; | |
766 | mze->mze_cd = cd; | |
767 | (void) strcpy(mze->mze_name, zn->zn_name_orij); | |
768 | zap->zap_m.zap_num_entries++; | |
769 | zap->zap_m.zap_alloc_next = i+1; | |
770 | if (zap->zap_m.zap_alloc_next == | |
771 | zap->zap_m.zap_num_chunks) | |
772 | zap->zap_m.zap_alloc_next = 0; | |
773 | mze_insert(zap, i, zn->zn_hash, mze); | |
774 | return; | |
775 | } | |
776 | } | |
777 | if (start != 0) { | |
778 | start = 0; | |
779 | goto again; | |
780 | } | |
781 | ASSERT(!"out of entries!"); | |
782 | } | |
783 | ||
784 | int | |
785 | zap_add(objset_t *os, uint64_t zapobj, const char *name, | |
786 | int integer_size, uint64_t num_integers, | |
787 | const void *val, dmu_tx_t *tx) | |
788 | { | |
789 | zap_t *zap; | |
790 | int err; | |
791 | mzap_ent_t *mze; | |
792 | const uint64_t *intval = val; | |
793 | zap_name_t *zn; | |
794 | ||
795 | err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); | |
796 | if (err) | |
797 | return (err); | |
798 | zn = zap_name_alloc(zap, name, MT_EXACT); | |
799 | if (zn == NULL) { | |
800 | zap_unlockdir(zap); | |
801 | return (ENOTSUP); | |
802 | } | |
803 | if (!zap->zap_ismicro) { | |
804 | err = fzap_add(zn, integer_size, num_integers, val, tx); | |
805 | zap = zn->zn_zap; /* fzap_add() may change zap */ | |
806 | } else if (integer_size != 8 || num_integers != 1 || | |
807 | strlen(name) >= MZAP_NAME_LEN) { | |
808 | dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", | |
809 | zapobj, integer_size, num_integers, name); | |
810 | err = mzap_upgrade(&zn->zn_zap, tx); | |
811 | if (err == 0) | |
812 | err = fzap_add(zn, integer_size, num_integers, val, tx); | |
813 | zap = zn->zn_zap; /* fzap_add() may change zap */ | |
814 | } else { | |
815 | mze = mze_find(zn); | |
816 | if (mze != NULL) { | |
817 | err = EEXIST; | |
818 | } else { | |
819 | mzap_addent(zn, *intval); | |
820 | } | |
821 | } | |
822 | ASSERT(zap == zn->zn_zap); | |
823 | zap_name_free(zn); | |
824 | if (zap != NULL) /* may be NULL if fzap_add() failed */ | |
825 | zap_unlockdir(zap); | |
826 | return (err); | |
827 | } | |
828 | ||
829 | int | |
830 | zap_update(objset_t *os, uint64_t zapobj, const char *name, | |
831 | int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) | |
832 | { | |
833 | zap_t *zap; | |
834 | mzap_ent_t *mze; | |
835 | const uint64_t *intval = val; | |
836 | zap_name_t *zn; | |
837 | int err; | |
838 | ||
839 | err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); | |
840 | if (err) | |
841 | return (err); | |
842 | zn = zap_name_alloc(zap, name, MT_EXACT); | |
843 | if (zn == NULL) { | |
844 | zap_unlockdir(zap); | |
845 | return (ENOTSUP); | |
846 | } | |
847 | if (!zap->zap_ismicro) { | |
848 | err = fzap_update(zn, integer_size, num_integers, val, tx); | |
849 | zap = zn->zn_zap; /* fzap_update() may change zap */ | |
850 | } else if (integer_size != 8 || num_integers != 1 || | |
851 | strlen(name) >= MZAP_NAME_LEN) { | |
852 | dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", | |
853 | zapobj, integer_size, num_integers, name); | |
854 | err = mzap_upgrade(&zn->zn_zap, tx); | |
855 | if (err == 0) | |
856 | err = fzap_update(zn, integer_size, num_integers, | |
857 | val, tx); | |
858 | zap = zn->zn_zap; /* fzap_update() may change zap */ | |
859 | } else { | |
860 | mze = mze_find(zn); | |
861 | if (mze != NULL) { | |
862 | mze->mze_phys.mze_value = *intval; | |
863 | zap->zap_m.zap_phys->mz_chunk | |
864 | [mze->mze_chunkid].mze_value = *intval; | |
865 | } else { | |
866 | mzap_addent(zn, *intval); | |
867 | } | |
868 | } | |
869 | ASSERT(zap == zn->zn_zap); | |
870 | zap_name_free(zn); | |
871 | if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ | |
872 | zap_unlockdir(zap); | |
873 | return (err); | |
874 | } | |
875 | ||
876 | int | |
877 | zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) | |
878 | { | |
879 | return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx)); | |
880 | } | |
881 | ||
882 | int | |
883 | zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, | |
884 | matchtype_t mt, dmu_tx_t *tx) | |
885 | { | |
886 | zap_t *zap; | |
887 | int err; | |
888 | mzap_ent_t *mze; | |
889 | zap_name_t *zn; | |
890 | ||
891 | err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); | |
892 | if (err) | |
893 | return (err); | |
894 | zn = zap_name_alloc(zap, name, mt); | |
895 | if (zn == NULL) { | |
896 | zap_unlockdir(zap); | |
897 | return (ENOTSUP); | |
898 | } | |
899 | if (!zap->zap_ismicro) { | |
900 | err = fzap_remove(zn, tx); | |
901 | } else { | |
902 | mze = mze_find(zn); | |
903 | if (mze == NULL) { | |
904 | err = ENOENT; | |
905 | } else { | |
906 | zap->zap_m.zap_num_entries--; | |
907 | bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid], | |
908 | sizeof (mzap_ent_phys_t)); | |
909 | mze_remove(zap, mze); | |
910 | } | |
911 | } | |
912 | zap_name_free(zn); | |
913 | zap_unlockdir(zap); | |
914 | return (err); | |
915 | } | |
916 | ||
917 | /* | |
918 | * Routines for iterating over the attributes. | |
919 | */ | |
920 | ||
921 | /* | |
922 | * We want to keep the high 32 bits of the cursor zero if we can, so | |
923 | * that 32-bit programs can access this. So use a small hash value so | |
924 | * we can fit 4 bits of cd into the 32-bit cursor. | |
925 | * | |
926 | * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ] | |
927 | */ | |
928 | void | |
929 | zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, | |
930 | uint64_t serialized) | |
931 | { | |
932 | zc->zc_objset = os; | |
933 | zc->zc_zap = NULL; | |
934 | zc->zc_leaf = NULL; | |
935 | zc->zc_zapobj = zapobj; | |
936 | if (serialized == -1ULL) { | |
937 | zc->zc_hash = -1ULL; | |
938 | zc->zc_cd = 0; | |
939 | } else { | |
940 | zc->zc_hash = serialized << (64-ZAP_HASHBITS); | |
941 | zc->zc_cd = serialized >> ZAP_HASHBITS; | |
942 | if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */ | |
943 | zc->zc_cd = 0; | |
944 | } | |
945 | } | |
946 | ||
947 | void | |
948 | zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) | |
949 | { | |
950 | zap_cursor_init_serialized(zc, os, zapobj, 0); | |
951 | } | |
952 | ||
953 | void | |
954 | zap_cursor_fini(zap_cursor_t *zc) | |
955 | { | |
956 | if (zc->zc_zap) { | |
957 | rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); | |
958 | zap_unlockdir(zc->zc_zap); | |
959 | zc->zc_zap = NULL; | |
960 | } | |
961 | if (zc->zc_leaf) { | |
962 | rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); | |
963 | zap_put_leaf(zc->zc_leaf); | |
964 | zc->zc_leaf = NULL; | |
965 | } | |
966 | zc->zc_objset = NULL; | |
967 | } | |
968 | ||
969 | uint64_t | |
970 | zap_cursor_serialize(zap_cursor_t *zc) | |
971 | { | |
972 | if (zc->zc_hash == -1ULL) | |
973 | return (-1ULL); | |
974 | ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0); | |
975 | ASSERT(zc->zc_cd < ZAP_MAXCD); | |
976 | return ((zc->zc_hash >> (64-ZAP_HASHBITS)) | | |
977 | ((uint64_t)zc->zc_cd << ZAP_HASHBITS)); | |
978 | } | |
979 | ||
980 | int | |
981 | zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) | |
982 | { | |
983 | int err; | |
984 | avl_index_t idx; | |
985 | mzap_ent_t mze_tofind; | |
986 | mzap_ent_t *mze; | |
987 | ||
988 | if (zc->zc_hash == -1ULL) | |
989 | return (ENOENT); | |
990 | ||
991 | if (zc->zc_zap == NULL) { | |
992 | err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, | |
993 | RW_READER, TRUE, FALSE, &zc->zc_zap); | |
994 | if (err) | |
995 | return (err); | |
996 | } else { | |
997 | rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); | |
998 | } | |
999 | if (!zc->zc_zap->zap_ismicro) { | |
1000 | err = fzap_cursor_retrieve(zc->zc_zap, zc, za); | |
1001 | } else { | |
1002 | err = ENOENT; | |
1003 | ||
1004 | mze_tofind.mze_hash = zc->zc_hash; | |
1005 | mze_tofind.mze_phys.mze_cd = zc->zc_cd; | |
1006 | ||
1007 | mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); | |
1008 | if (mze == NULL) { | |
1009 | mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, | |
1010 | idx, AVL_AFTER); | |
1011 | } | |
1012 | if (mze) { | |
1013 | ASSERT(0 == bcmp(&mze->mze_phys, | |
1014 | &zc->zc_zap->zap_m.zap_phys->mz_chunk | |
1015 | [mze->mze_chunkid], sizeof (mze->mze_phys))); | |
1016 | ||
1017 | za->za_normalization_conflict = | |
1018 | mzap_normalization_conflict(zc->zc_zap, NULL, mze); | |
1019 | za->za_integer_length = 8; | |
1020 | za->za_num_integers = 1; | |
1021 | za->za_first_integer = mze->mze_phys.mze_value; | |
1022 | (void) strcpy(za->za_name, mze->mze_phys.mze_name); | |
1023 | zc->zc_hash = mze->mze_hash; | |
1024 | zc->zc_cd = mze->mze_phys.mze_cd; | |
1025 | err = 0; | |
1026 | } else { | |
1027 | zc->zc_hash = -1ULL; | |
1028 | } | |
1029 | } | |
1030 | rw_exit(&zc->zc_zap->zap_rwlock); | |
1031 | return (err); | |
1032 | } | |
1033 | ||
1034 | void | |
1035 | zap_cursor_advance(zap_cursor_t *zc) | |
1036 | { | |
1037 | if (zc->zc_hash == -1ULL) | |
1038 | return; | |
1039 | zc->zc_cd++; | |
1040 | if (zc->zc_cd >= ZAP_MAXCD) { | |
1041 | zc->zc_cd = 0; | |
1042 | zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS); | |
1043 | if (zc->zc_hash == 0) /* EOF */ | |
1044 | zc->zc_hash = -1ULL; | |
1045 | } | |
1046 | } | |
1047 | ||
1048 | int | |
1049 | zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) | |
1050 | { | |
1051 | int err; | |
1052 | zap_t *zap; | |
1053 | ||
1054 | err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); | |
1055 | if (err) | |
1056 | return (err); | |
1057 | ||
1058 | bzero(zs, sizeof (zap_stats_t)); | |
1059 | ||
1060 | if (zap->zap_ismicro) { | |
1061 | zs->zs_blocksize = zap->zap_dbuf->db_size; | |
1062 | zs->zs_num_entries = zap->zap_m.zap_num_entries; | |
1063 | zs->zs_num_blocks = 1; | |
1064 | } else { | |
1065 | fzap_get_stats(zap, zs); | |
1066 | } | |
1067 | zap_unlockdir(zap); | |
1068 | return (0); | |
1069 | } |