]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
9babb374 | 22 | * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f BB |
23 | * Use is subject to license terms. |
24 | */ | |
25 | ||
34dc7c2f BB |
26 | #include <sys/dmu.h> |
27 | #include <sys/dmu_impl.h> | |
28 | #include <sys/dmu_tx.h> | |
29 | #include <sys/dbuf.h> | |
30 | #include <sys/dnode.h> | |
31 | #include <sys/zfs_context.h> | |
32 | #include <sys/dmu_objset.h> | |
33 | #include <sys/dmu_traverse.h> | |
34 | #include <sys/dsl_dataset.h> | |
35 | #include <sys/dsl_dir.h> | |
36 | #include <sys/dsl_pool.h> | |
37 | #include <sys/dsl_synctask.h> | |
38 | #include <sys/dsl_prop.h> | |
39 | #include <sys/dmu_zfetch.h> | |
40 | #include <sys/zfs_ioctl.h> | |
41 | #include <sys/zap.h> | |
42 | #include <sys/zio_checksum.h> | |
43 | #ifdef _KERNEL | |
44 | #include <sys/vmsystm.h> | |
b128c09f | 45 | #include <sys/zfs_znode.h> |
34dc7c2f BB |
46 | #endif |
47 | ||
48 | const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { | |
49 | { byteswap_uint8_array, TRUE, "unallocated" }, | |
50 | { zap_byteswap, TRUE, "object directory" }, | |
51 | { byteswap_uint64_array, TRUE, "object array" }, | |
52 | { byteswap_uint8_array, TRUE, "packed nvlist" }, | |
53 | { byteswap_uint64_array, TRUE, "packed nvlist size" }, | |
54 | { byteswap_uint64_array, TRUE, "bplist" }, | |
55 | { byteswap_uint64_array, TRUE, "bplist header" }, | |
56 | { byteswap_uint64_array, TRUE, "SPA space map header" }, | |
57 | { byteswap_uint64_array, TRUE, "SPA space map" }, | |
58 | { byteswap_uint64_array, TRUE, "ZIL intent log" }, | |
59 | { dnode_buf_byteswap, TRUE, "DMU dnode" }, | |
60 | { dmu_objset_byteswap, TRUE, "DMU objset" }, | |
61 | { byteswap_uint64_array, TRUE, "DSL directory" }, | |
62 | { zap_byteswap, TRUE, "DSL directory child map"}, | |
63 | { zap_byteswap, TRUE, "DSL dataset snap map" }, | |
64 | { zap_byteswap, TRUE, "DSL props" }, | |
65 | { byteswap_uint64_array, TRUE, "DSL dataset" }, | |
66 | { zfs_znode_byteswap, TRUE, "ZFS znode" }, | |
67 | { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" }, | |
68 | { byteswap_uint8_array, FALSE, "ZFS plain file" }, | |
69 | { zap_byteswap, TRUE, "ZFS directory" }, | |
70 | { zap_byteswap, TRUE, "ZFS master node" }, | |
71 | { zap_byteswap, TRUE, "ZFS delete queue" }, | |
72 | { byteswap_uint8_array, FALSE, "zvol object" }, | |
73 | { zap_byteswap, TRUE, "zvol prop" }, | |
74 | { byteswap_uint8_array, FALSE, "other uint8[]" }, | |
75 | { byteswap_uint64_array, FALSE, "other uint64[]" }, | |
76 | { zap_byteswap, TRUE, "other ZAP" }, | |
77 | { zap_byteswap, TRUE, "persistent error log" }, | |
78 | { byteswap_uint8_array, TRUE, "SPA history" }, | |
79 | { byteswap_uint64_array, TRUE, "SPA history offsets" }, | |
80 | { zap_byteswap, TRUE, "Pool properties" }, | |
81 | { zap_byteswap, TRUE, "DSL permissions" }, | |
82 | { zfs_acl_byteswap, TRUE, "ZFS ACL" }, | |
83 | { byteswap_uint8_array, TRUE, "ZFS SYSACL" }, | |
84 | { byteswap_uint8_array, TRUE, "FUID table" }, | |
85 | { byteswap_uint64_array, TRUE, "FUID table size" }, | |
b128c09f BB |
86 | { zap_byteswap, TRUE, "DSL dataset next clones"}, |
87 | { zap_byteswap, TRUE, "scrub work queue" }, | |
9babb374 BB |
88 | { zap_byteswap, TRUE, "ZFS user/group used" }, |
89 | { zap_byteswap, TRUE, "ZFS user/group quota" }, | |
34dc7c2f BB |
90 | }; |
91 | ||
92 | int | |
93 | dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, | |
94 | void *tag, dmu_buf_t **dbp) | |
95 | { | |
96 | dnode_t *dn; | |
97 | uint64_t blkid; | |
98 | dmu_buf_impl_t *db; | |
99 | int err; | |
100 | ||
101 | err = dnode_hold(os->os, object, FTAG, &dn); | |
102 | if (err) | |
103 | return (err); | |
104 | blkid = dbuf_whichblock(dn, offset); | |
105 | rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
106 | db = dbuf_hold(dn, blkid, tag); | |
107 | rw_exit(&dn->dn_struct_rwlock); | |
108 | if (db == NULL) { | |
109 | err = EIO; | |
110 | } else { | |
111 | err = dbuf_read(db, NULL, DB_RF_CANFAIL); | |
112 | if (err) { | |
113 | dbuf_rele(db, tag); | |
114 | db = NULL; | |
115 | } | |
116 | } | |
117 | ||
118 | dnode_rele(dn, FTAG); | |
119 | *dbp = &db->db; | |
120 | return (err); | |
121 | } | |
122 | ||
123 | int | |
124 | dmu_bonus_max(void) | |
125 | { | |
126 | return (DN_MAX_BONUSLEN); | |
127 | } | |
128 | ||
129 | int | |
130 | dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx) | |
131 | { | |
132 | dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; | |
133 | ||
134 | if (dn->dn_bonus != (dmu_buf_impl_t *)db) | |
135 | return (EINVAL); | |
136 | if (newsize < 0 || newsize > db->db_size) | |
137 | return (EINVAL); | |
138 | dnode_setbonuslen(dn, newsize, tx); | |
139 | return (0); | |
140 | } | |
141 | ||
142 | /* | |
143 | * returns ENOENT, EIO, or 0. | |
144 | */ | |
145 | int | |
146 | dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) | |
147 | { | |
148 | dnode_t *dn; | |
149 | dmu_buf_impl_t *db; | |
150 | int error; | |
151 | ||
152 | error = dnode_hold(os->os, object, FTAG, &dn); | |
153 | if (error) | |
154 | return (error); | |
155 | ||
156 | rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
157 | if (dn->dn_bonus == NULL) { | |
158 | rw_exit(&dn->dn_struct_rwlock); | |
159 | rw_enter(&dn->dn_struct_rwlock, RW_WRITER); | |
160 | if (dn->dn_bonus == NULL) | |
161 | dbuf_create_bonus(dn); | |
162 | } | |
163 | db = dn->dn_bonus; | |
164 | rw_exit(&dn->dn_struct_rwlock); | |
165 | ||
166 | /* as long as the bonus buf is held, the dnode will be held */ | |
167 | if (refcount_add(&db->db_holds, tag) == 1) | |
168 | VERIFY(dnode_add_ref(dn, db)); | |
169 | ||
170 | dnode_rele(dn, FTAG); | |
171 | ||
172 | VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); | |
173 | ||
174 | *dbp = &db->db; | |
175 | return (0); | |
176 | } | |
177 | ||
178 | /* | |
179 | * Note: longer-term, we should modify all of the dmu_buf_*() interfaces | |
180 | * to take a held dnode rather than <os, object> -- the lookup is wasteful, | |
181 | * and can induce severe lock contention when writing to several files | |
182 | * whose dnodes are in the same block. | |
183 | */ | |
184 | static int | |
9babb374 BB |
185 | dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, |
186 | int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) | |
34dc7c2f | 187 | { |
b128c09f | 188 | dsl_pool_t *dp = NULL; |
34dc7c2f BB |
189 | dmu_buf_t **dbp; |
190 | uint64_t blkid, nblks, i; | |
9babb374 | 191 | uint32_t dbuf_flags; |
34dc7c2f BB |
192 | int err; |
193 | zio_t *zio; | |
b128c09f | 194 | hrtime_t start; |
34dc7c2f BB |
195 | |
196 | ASSERT(length <= DMU_MAX_ACCESS); | |
197 | ||
9babb374 BB |
198 | dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT; |
199 | if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) | |
200 | dbuf_flags |= DB_RF_NOPREFETCH; | |
34dc7c2f BB |
201 | |
202 | rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
203 | if (dn->dn_datablkshift) { | |
204 | int blkshift = dn->dn_datablkshift; | |
205 | nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) - | |
206 | P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift; | |
207 | } else { | |
208 | if (offset + length > dn->dn_datablksz) { | |
209 | zfs_panic_recover("zfs: accessing past end of object " | |
210 | "%llx/%llx (size=%u access=%llu+%llu)", | |
211 | (longlong_t)dn->dn_objset-> | |
212 | os_dsl_dataset->ds_object, | |
213 | (longlong_t)dn->dn_object, dn->dn_datablksz, | |
214 | (longlong_t)offset, (longlong_t)length); | |
215 | return (EIO); | |
216 | } | |
217 | nblks = 1; | |
218 | } | |
219 | dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); | |
220 | ||
b128c09f BB |
221 | if (dn->dn_objset->os_dsl_dataset) |
222 | dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool; | |
223 | if (dp && dsl_pool_sync_context(dp)) | |
224 | start = gethrtime(); | |
225 | zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); | |
34dc7c2f BB |
226 | blkid = dbuf_whichblock(dn, offset); |
227 | for (i = 0; i < nblks; i++) { | |
228 | dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag); | |
229 | if (db == NULL) { | |
230 | rw_exit(&dn->dn_struct_rwlock); | |
231 | dmu_buf_rele_array(dbp, nblks, tag); | |
232 | zio_nowait(zio); | |
233 | return (EIO); | |
234 | } | |
235 | /* initiate async i/o */ | |
236 | if (read) { | |
237 | rw_exit(&dn->dn_struct_rwlock); | |
9babb374 | 238 | (void) dbuf_read(db, zio, dbuf_flags); |
34dc7c2f BB |
239 | rw_enter(&dn->dn_struct_rwlock, RW_READER); |
240 | } | |
241 | dbp[i] = &db->db; | |
242 | } | |
243 | rw_exit(&dn->dn_struct_rwlock); | |
244 | ||
245 | /* wait for async i/o */ | |
246 | err = zio_wait(zio); | |
b128c09f BB |
247 | /* track read overhead when we are in sync context */ |
248 | if (dp && dsl_pool_sync_context(dp)) | |
249 | dp->dp_read_overhead += gethrtime() - start; | |
34dc7c2f BB |
250 | if (err) { |
251 | dmu_buf_rele_array(dbp, nblks, tag); | |
252 | return (err); | |
253 | } | |
254 | ||
255 | /* wait for other io to complete */ | |
256 | if (read) { | |
257 | for (i = 0; i < nblks; i++) { | |
258 | dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; | |
259 | mutex_enter(&db->db_mtx); | |
260 | while (db->db_state == DB_READ || | |
261 | db->db_state == DB_FILL) | |
262 | cv_wait(&db->db_changed, &db->db_mtx); | |
263 | if (db->db_state == DB_UNCACHED) | |
264 | err = EIO; | |
265 | mutex_exit(&db->db_mtx); | |
266 | if (err) { | |
267 | dmu_buf_rele_array(dbp, nblks, tag); | |
268 | return (err); | |
269 | } | |
270 | } | |
271 | } | |
272 | ||
273 | *numbufsp = nblks; | |
274 | *dbpp = dbp; | |
275 | return (0); | |
276 | } | |
277 | ||
278 | static int | |
279 | dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, | |
280 | uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) | |
281 | { | |
282 | dnode_t *dn; | |
283 | int err; | |
284 | ||
285 | err = dnode_hold(os->os, object, FTAG, &dn); | |
286 | if (err) | |
287 | return (err); | |
288 | ||
289 | err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, | |
9babb374 | 290 | numbufsp, dbpp, DMU_READ_PREFETCH); |
34dc7c2f BB |
291 | |
292 | dnode_rele(dn, FTAG); | |
293 | ||
294 | return (err); | |
295 | } | |
296 | ||
297 | int | |
298 | dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, | |
299 | uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) | |
300 | { | |
301 | dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; | |
302 | int err; | |
303 | ||
304 | err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, | |
9babb374 | 305 | numbufsp, dbpp, DMU_READ_PREFETCH); |
34dc7c2f BB |
306 | |
307 | return (err); | |
308 | } | |
309 | ||
310 | void | |
311 | dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) | |
312 | { | |
313 | int i; | |
314 | dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; | |
315 | ||
316 | if (numbufs == 0) | |
317 | return; | |
318 | ||
319 | for (i = 0; i < numbufs; i++) { | |
320 | if (dbp[i]) | |
321 | dbuf_rele(dbp[i], tag); | |
322 | } | |
323 | ||
324 | kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); | |
325 | } | |
326 | ||
327 | void | |
328 | dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) | |
329 | { | |
330 | dnode_t *dn; | |
331 | uint64_t blkid; | |
332 | int nblks, i, err; | |
333 | ||
334 | if (zfs_prefetch_disable) | |
335 | return; | |
336 | ||
337 | if (len == 0) { /* they're interested in the bonus buffer */ | |
338 | dn = os->os->os_meta_dnode; | |
339 | ||
340 | if (object == 0 || object >= DN_MAX_OBJECT) | |
341 | return; | |
342 | ||
343 | rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
344 | blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t)); | |
345 | dbuf_prefetch(dn, blkid); | |
346 | rw_exit(&dn->dn_struct_rwlock); | |
347 | return; | |
348 | } | |
349 | ||
350 | /* | |
351 | * XXX - Note, if the dnode for the requested object is not | |
352 | * already cached, we will do a *synchronous* read in the | |
353 | * dnode_hold() call. The same is true for any indirects. | |
354 | */ | |
355 | err = dnode_hold(os->os, object, FTAG, &dn); | |
356 | if (err != 0) | |
357 | return; | |
358 | ||
359 | rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
360 | if (dn->dn_datablkshift) { | |
361 | int blkshift = dn->dn_datablkshift; | |
362 | nblks = (P2ROUNDUP(offset+len, 1<<blkshift) - | |
363 | P2ALIGN(offset, 1<<blkshift)) >> blkshift; | |
364 | } else { | |
365 | nblks = (offset < dn->dn_datablksz); | |
366 | } | |
367 | ||
368 | if (nblks != 0) { | |
369 | blkid = dbuf_whichblock(dn, offset); | |
370 | for (i = 0; i < nblks; i++) | |
371 | dbuf_prefetch(dn, blkid+i); | |
372 | } | |
373 | ||
374 | rw_exit(&dn->dn_struct_rwlock); | |
375 | ||
376 | dnode_rele(dn, FTAG); | |
377 | } | |
378 | ||
b128c09f BB |
379 | static int |
380 | get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit) | |
381 | { | |
382 | uint64_t len = *offset - limit; | |
383 | uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT; | |
384 | uint64_t subchunk = | |
385 | dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); | |
386 | ||
387 | ASSERT(limit <= *offset); | |
388 | ||
389 | if (len <= chunk_len) { | |
390 | *offset = limit; | |
391 | return (0); | |
392 | } | |
393 | ||
394 | ASSERT(ISP2(subchunk)); | |
395 | ||
396 | while (*offset > limit) { | |
397 | uint64_t initial_offset = P2ROUNDUP(*offset, subchunk); | |
398 | uint64_t delta; | |
399 | int err; | |
400 | ||
401 | /* skip over allocated data */ | |
402 | err = dnode_next_offset(dn, | |
403 | DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0); | |
404 | if (err == ESRCH) | |
405 | *offset = limit; | |
406 | else if (err) | |
407 | return (err); | |
408 | ||
409 | ASSERT3U(*offset, <=, initial_offset); | |
410 | *offset = P2ALIGN(*offset, subchunk); | |
411 | delta = initial_offset - *offset; | |
412 | if (delta >= chunk_len) { | |
413 | *offset += delta - chunk_len; | |
414 | return (0); | |
415 | } | |
416 | chunk_len -= delta; | |
417 | ||
418 | /* skip over unallocated data */ | |
419 | err = dnode_next_offset(dn, | |
420 | DNODE_FIND_BACKWARDS, offset, 1, 1, 0); | |
421 | if (err == ESRCH) | |
422 | *offset = limit; | |
423 | else if (err) | |
424 | return (err); | |
425 | ||
426 | if (*offset < limit) | |
427 | *offset = limit; | |
428 | ASSERT3U(*offset, <, initial_offset); | |
429 | } | |
430 | return (0); | |
431 | } | |
432 | ||
433 | static int | |
434 | dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, | |
435 | uint64_t length, boolean_t free_dnode) | |
436 | { | |
437 | dmu_tx_t *tx; | |
438 | uint64_t object_size, start, end, len; | |
439 | boolean_t trunc = (length == DMU_OBJECT_END); | |
440 | int align, err; | |
441 | ||
442 | align = 1 << dn->dn_datablkshift; | |
443 | ASSERT(align > 0); | |
444 | object_size = align == 1 ? dn->dn_datablksz : | |
445 | (dn->dn_maxblkid + 1) << dn->dn_datablkshift; | |
446 | ||
9babb374 BB |
447 | end = offset + length; |
448 | if (trunc || end > object_size) | |
b128c09f BB |
449 | end = object_size; |
450 | if (end <= offset) | |
451 | return (0); | |
452 | length = end - offset; | |
453 | ||
454 | while (length) { | |
455 | start = end; | |
9babb374 | 456 | /* assert(offset <= start) */ |
b128c09f BB |
457 | err = get_next_chunk(dn, &start, offset); |
458 | if (err) | |
459 | return (err); | |
460 | len = trunc ? DMU_OBJECT_END : end - start; | |
461 | ||
462 | tx = dmu_tx_create(os); | |
463 | dmu_tx_hold_free(tx, dn->dn_object, start, len); | |
464 | err = dmu_tx_assign(tx, TXG_WAIT); | |
465 | if (err) { | |
466 | dmu_tx_abort(tx); | |
467 | return (err); | |
468 | } | |
469 | ||
470 | dnode_free_range(dn, start, trunc ? -1 : len, tx); | |
471 | ||
472 | if (start == 0 && free_dnode) { | |
473 | ASSERT(trunc); | |
474 | dnode_free(dn, tx); | |
475 | } | |
476 | ||
477 | length -= end - start; | |
478 | ||
479 | dmu_tx_commit(tx); | |
480 | end = start; | |
481 | } | |
482 | return (0); | |
483 | } | |
484 | ||
485 | int | |
486 | dmu_free_long_range(objset_t *os, uint64_t object, | |
487 | uint64_t offset, uint64_t length) | |
488 | { | |
489 | dnode_t *dn; | |
490 | int err; | |
491 | ||
492 | err = dnode_hold(os->os, object, FTAG, &dn); | |
493 | if (err != 0) | |
494 | return (err); | |
495 | err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); | |
496 | dnode_rele(dn, FTAG); | |
497 | return (err); | |
498 | } | |
499 | ||
500 | int | |
501 | dmu_free_object(objset_t *os, uint64_t object) | |
502 | { | |
503 | dnode_t *dn; | |
504 | dmu_tx_t *tx; | |
505 | int err; | |
506 | ||
507 | err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, | |
508 | FTAG, &dn); | |
509 | if (err != 0) | |
510 | return (err); | |
511 | if (dn->dn_nlevels == 1) { | |
512 | tx = dmu_tx_create(os); | |
513 | dmu_tx_hold_bonus(tx, object); | |
514 | dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); | |
515 | err = dmu_tx_assign(tx, TXG_WAIT); | |
516 | if (err == 0) { | |
517 | dnode_free_range(dn, 0, DMU_OBJECT_END, tx); | |
518 | dnode_free(dn, tx); | |
519 | dmu_tx_commit(tx); | |
520 | } else { | |
521 | dmu_tx_abort(tx); | |
522 | } | |
523 | } else { | |
524 | err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE); | |
525 | } | |
526 | dnode_rele(dn, FTAG); | |
527 | return (err); | |
528 | } | |
529 | ||
34dc7c2f BB |
530 | int |
531 | dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, | |
532 | uint64_t size, dmu_tx_t *tx) | |
533 | { | |
534 | dnode_t *dn; | |
535 | int err = dnode_hold(os->os, object, FTAG, &dn); | |
536 | if (err) | |
537 | return (err); | |
538 | ASSERT(offset < UINT64_MAX); | |
539 | ASSERT(size == -1ULL || size <= UINT64_MAX - offset); | |
540 | dnode_free_range(dn, offset, size, tx); | |
541 | dnode_rele(dn, FTAG); | |
542 | return (0); | |
543 | } | |
544 | ||
545 | int | |
546 | dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, | |
9babb374 | 547 | void *buf, uint32_t flags) |
34dc7c2f BB |
548 | { |
549 | dnode_t *dn; | |
550 | dmu_buf_t **dbp; | |
551 | int numbufs, i, err; | |
552 | ||
553 | err = dnode_hold(os->os, object, FTAG, &dn); | |
554 | if (err) | |
555 | return (err); | |
556 | ||
557 | /* | |
558 | * Deal with odd block sizes, where there can't be data past the first | |
559 | * block. If we ever do the tail block optimization, we will need to | |
560 | * handle that here as well. | |
561 | */ | |
562 | if (dn->dn_datablkshift == 0) { | |
563 | int newsz = offset > dn->dn_datablksz ? 0 : | |
564 | MIN(size, dn->dn_datablksz - offset); | |
565 | bzero((char *)buf + newsz, size - newsz); | |
566 | size = newsz; | |
567 | } | |
568 | ||
569 | while (size > 0) { | |
570 | uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); | |
571 | ||
572 | /* | |
573 | * NB: we could do this block-at-a-time, but it's nice | |
574 | * to be reading in parallel. | |
575 | */ | |
576 | err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, | |
9babb374 | 577 | TRUE, FTAG, &numbufs, &dbp, flags); |
34dc7c2f BB |
578 | if (err) |
579 | break; | |
580 | ||
581 | for (i = 0; i < numbufs; i++) { | |
582 | int tocpy; | |
583 | int bufoff; | |
584 | dmu_buf_t *db = dbp[i]; | |
585 | ||
586 | ASSERT(size > 0); | |
587 | ||
588 | bufoff = offset - db->db_offset; | |
589 | tocpy = (int)MIN(db->db_size - bufoff, size); | |
590 | ||
591 | bcopy((char *)db->db_data + bufoff, buf, tocpy); | |
592 | ||
593 | offset += tocpy; | |
594 | size -= tocpy; | |
595 | buf = (char *)buf + tocpy; | |
596 | } | |
597 | dmu_buf_rele_array(dbp, numbufs, FTAG); | |
598 | } | |
599 | dnode_rele(dn, FTAG); | |
600 | return (err); | |
601 | } | |
602 | ||
603 | void | |
604 | dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, | |
605 | const void *buf, dmu_tx_t *tx) | |
606 | { | |
607 | dmu_buf_t **dbp; | |
608 | int numbufs, i; | |
609 | ||
610 | if (size == 0) | |
611 | return; | |
612 | ||
613 | VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, | |
614 | FALSE, FTAG, &numbufs, &dbp)); | |
615 | ||
616 | for (i = 0; i < numbufs; i++) { | |
617 | int tocpy; | |
618 | int bufoff; | |
619 | dmu_buf_t *db = dbp[i]; | |
620 | ||
621 | ASSERT(size > 0); | |
622 | ||
623 | bufoff = offset - db->db_offset; | |
624 | tocpy = (int)MIN(db->db_size - bufoff, size); | |
625 | ||
626 | ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); | |
627 | ||
628 | if (tocpy == db->db_size) | |
629 | dmu_buf_will_fill(db, tx); | |
630 | else | |
631 | dmu_buf_will_dirty(db, tx); | |
632 | ||
633 | bcopy(buf, (char *)db->db_data + bufoff, tocpy); | |
634 | ||
635 | if (tocpy == db->db_size) | |
636 | dmu_buf_fill_done(db, tx); | |
637 | ||
638 | offset += tocpy; | |
639 | size -= tocpy; | |
640 | buf = (char *)buf + tocpy; | |
641 | } | |
642 | dmu_buf_rele_array(dbp, numbufs, FTAG); | |
643 | } | |
644 | ||
b128c09f BB |
645 | void |
646 | dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, | |
647 | dmu_tx_t *tx) | |
648 | { | |
649 | dmu_buf_t **dbp; | |
650 | int numbufs, i; | |
651 | ||
652 | if (size == 0) | |
653 | return; | |
654 | ||
655 | VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, | |
656 | FALSE, FTAG, &numbufs, &dbp)); | |
657 | ||
658 | for (i = 0; i < numbufs; i++) { | |
659 | dmu_buf_t *db = dbp[i]; | |
660 | ||
661 | dmu_buf_will_not_fill(db, tx); | |
662 | } | |
663 | dmu_buf_rele_array(dbp, numbufs, FTAG); | |
664 | } | |
665 | ||
34dc7c2f BB |
666 | #ifdef _KERNEL |
667 | int | |
668 | dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) | |
669 | { | |
670 | dmu_buf_t **dbp; | |
671 | int numbufs, i, err; | |
672 | ||
673 | /* | |
674 | * NB: we could do this block-at-a-time, but it's nice | |
675 | * to be reading in parallel. | |
676 | */ | |
677 | err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG, | |
678 | &numbufs, &dbp); | |
679 | if (err) | |
680 | return (err); | |
681 | ||
682 | for (i = 0; i < numbufs; i++) { | |
683 | int tocpy; | |
684 | int bufoff; | |
685 | dmu_buf_t *db = dbp[i]; | |
686 | ||
687 | ASSERT(size > 0); | |
688 | ||
689 | bufoff = uio->uio_loffset - db->db_offset; | |
690 | tocpy = (int)MIN(db->db_size - bufoff, size); | |
691 | ||
692 | err = uiomove((char *)db->db_data + bufoff, tocpy, | |
693 | UIO_READ, uio); | |
694 | if (err) | |
695 | break; | |
696 | ||
697 | size -= tocpy; | |
698 | } | |
699 | dmu_buf_rele_array(dbp, numbufs, FTAG); | |
700 | ||
701 | return (err); | |
702 | } | |
703 | ||
704 | int | |
705 | dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, | |
706 | dmu_tx_t *tx) | |
707 | { | |
708 | dmu_buf_t **dbp; | |
709 | int numbufs, i; | |
710 | int err = 0; | |
711 | ||
712 | if (size == 0) | |
713 | return (0); | |
714 | ||
715 | err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, | |
716 | FALSE, FTAG, &numbufs, &dbp); | |
717 | if (err) | |
718 | return (err); | |
719 | ||
720 | for (i = 0; i < numbufs; i++) { | |
721 | int tocpy; | |
722 | int bufoff; | |
723 | dmu_buf_t *db = dbp[i]; | |
724 | ||
725 | ASSERT(size > 0); | |
726 | ||
727 | bufoff = uio->uio_loffset - db->db_offset; | |
728 | tocpy = (int)MIN(db->db_size - bufoff, size); | |
729 | ||
730 | ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); | |
731 | ||
732 | if (tocpy == db->db_size) | |
733 | dmu_buf_will_fill(db, tx); | |
734 | else | |
735 | dmu_buf_will_dirty(db, tx); | |
736 | ||
737 | /* | |
738 | * XXX uiomove could block forever (eg. nfs-backed | |
739 | * pages). There needs to be a uiolockdown() function | |
740 | * to lock the pages in memory, so that uiomove won't | |
741 | * block. | |
742 | */ | |
743 | err = uiomove((char *)db->db_data + bufoff, tocpy, | |
744 | UIO_WRITE, uio); | |
745 | ||
746 | if (tocpy == db->db_size) | |
747 | dmu_buf_fill_done(db, tx); | |
748 | ||
749 | if (err) | |
750 | break; | |
751 | ||
752 | size -= tocpy; | |
753 | } | |
754 | dmu_buf_rele_array(dbp, numbufs, FTAG); | |
755 | return (err); | |
756 | } | |
757 | ||
758 | int | |
759 | dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, | |
760 | page_t *pp, dmu_tx_t *tx) | |
761 | { | |
762 | dmu_buf_t **dbp; | |
763 | int numbufs, i; | |
764 | int err; | |
765 | ||
766 | if (size == 0) | |
767 | return (0); | |
768 | ||
769 | err = dmu_buf_hold_array(os, object, offset, size, | |
770 | FALSE, FTAG, &numbufs, &dbp); | |
771 | if (err) | |
772 | return (err); | |
773 | ||
774 | for (i = 0; i < numbufs; i++) { | |
775 | int tocpy, copied, thiscpy; | |
776 | int bufoff; | |
777 | dmu_buf_t *db = dbp[i]; | |
778 | caddr_t va; | |
779 | ||
780 | ASSERT(size > 0); | |
781 | ASSERT3U(db->db_size, >=, PAGESIZE); | |
782 | ||
783 | bufoff = offset - db->db_offset; | |
784 | tocpy = (int)MIN(db->db_size - bufoff, size); | |
785 | ||
786 | ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); | |
787 | ||
788 | if (tocpy == db->db_size) | |
789 | dmu_buf_will_fill(db, tx); | |
790 | else | |
791 | dmu_buf_will_dirty(db, tx); | |
792 | ||
793 | for (copied = 0; copied < tocpy; copied += PAGESIZE) { | |
794 | ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); | |
795 | thiscpy = MIN(PAGESIZE, tocpy - copied); | |
b128c09f | 796 | va = zfs_map_page(pp, S_READ); |
34dc7c2f | 797 | bcopy(va, (char *)db->db_data + bufoff, thiscpy); |
b128c09f | 798 | zfs_unmap_page(pp, va); |
34dc7c2f BB |
799 | pp = pp->p_next; |
800 | bufoff += PAGESIZE; | |
801 | } | |
802 | ||
803 | if (tocpy == db->db_size) | |
804 | dmu_buf_fill_done(db, tx); | |
805 | ||
806 | if (err) | |
807 | break; | |
808 | ||
809 | offset += tocpy; | |
810 | size -= tocpy; | |
811 | } | |
812 | dmu_buf_rele_array(dbp, numbufs, FTAG); | |
813 | return (err); | |
814 | } | |
815 | #endif | |
816 | ||
9babb374 BB |
817 | /* |
818 | * Allocate a loaned anonymous arc buffer. | |
819 | */ | |
820 | arc_buf_t * | |
821 | dmu_request_arcbuf(dmu_buf_t *handle, int size) | |
822 | { | |
823 | dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; | |
824 | ||
825 | return (arc_loan_buf(dn->dn_objset->os_spa, size)); | |
826 | } | |
827 | ||
828 | /* | |
829 | * Free a loaned arc buffer. | |
830 | */ | |
831 | void | |
832 | dmu_return_arcbuf(arc_buf_t *buf) | |
833 | { | |
834 | arc_return_buf(buf, FTAG); | |
835 | VERIFY(arc_buf_remove_ref(buf, FTAG) == 1); | |
836 | } | |
837 | ||
838 | /* | |
839 | * When possible directly assign passed loaned arc buffer to a dbuf. | |
840 | * If this is not possible copy the contents of passed arc buf via | |
841 | * dmu_write(). | |
842 | */ | |
843 | void | |
844 | dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, | |
845 | dmu_tx_t *tx) | |
846 | { | |
847 | dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; | |
848 | dmu_buf_impl_t *db; | |
849 | uint32_t blksz = (uint32_t)arc_buf_size(buf); | |
850 | uint64_t blkid; | |
851 | ||
852 | rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
853 | blkid = dbuf_whichblock(dn, offset); | |
854 | VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); | |
855 | rw_exit(&dn->dn_struct_rwlock); | |
856 | ||
857 | if (offset == db->db.db_offset && blksz == db->db.db_size) { | |
858 | dbuf_assign_arcbuf(db, buf, tx); | |
859 | dbuf_rele(db, FTAG); | |
860 | } else { | |
861 | dbuf_rele(db, FTAG); | |
862 | ASSERT(dn->dn_objset->os.os == dn->dn_objset); | |
863 | dmu_write(&dn->dn_objset->os, dn->dn_object, offset, blksz, | |
864 | buf->b_data, tx); | |
865 | dmu_return_arcbuf(buf); | |
866 | } | |
867 | } | |
868 | ||
34dc7c2f BB |
869 | typedef struct { |
870 | dbuf_dirty_record_t *dr; | |
871 | dmu_sync_cb_t *done; | |
872 | void *arg; | |
873 | } dmu_sync_arg_t; | |
874 | ||
b128c09f BB |
875 | /* ARGSUSED */ |
876 | static void | |
877 | dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) | |
878 | { | |
879 | blkptr_t *bp = zio->io_bp; | |
880 | ||
881 | if (!BP_IS_HOLE(bp)) { | |
882 | dmu_sync_arg_t *in = varg; | |
883 | dbuf_dirty_record_t *dr = in->dr; | |
884 | dmu_buf_impl_t *db = dr->dr_dbuf; | |
885 | ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type); | |
886 | ASSERT(BP_GET_LEVEL(bp) == 0); | |
887 | bp->blk_fill = 1; | |
888 | } | |
889 | } | |
890 | ||
34dc7c2f BB |
891 | /* ARGSUSED */ |
892 | static void | |
893 | dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) | |
894 | { | |
895 | dmu_sync_arg_t *in = varg; | |
896 | dbuf_dirty_record_t *dr = in->dr; | |
897 | dmu_buf_impl_t *db = dr->dr_dbuf; | |
898 | dmu_sync_cb_t *done = in->done; | |
899 | ||
34dc7c2f BB |
900 | mutex_enter(&db->db_mtx); |
901 | ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); | |
902 | dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ | |
903 | dr->dt.dl.dr_override_state = DR_OVERRIDDEN; | |
904 | cv_broadcast(&db->db_changed); | |
905 | mutex_exit(&db->db_mtx); | |
906 | ||
907 | if (done) | |
908 | done(&(db->db), in->arg); | |
909 | ||
910 | kmem_free(in, sizeof (dmu_sync_arg_t)); | |
911 | } | |
912 | ||
913 | /* | |
914 | * Intent log support: sync the block associated with db to disk. | |
915 | * N.B. and XXX: the caller is responsible for making sure that the | |
916 | * data isn't changing while dmu_sync() is writing it. | |
917 | * | |
918 | * Return values: | |
919 | * | |
920 | * EEXIST: this txg has already been synced, so there's nothing to do. | |
921 | * The caller should not log the write. | |
922 | * | |
923 | * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. | |
924 | * The caller should not log the write. | |
925 | * | |
926 | * EALREADY: this block is already in the process of being synced. | |
927 | * The caller should track its progress (somehow). | |
928 | * | |
929 | * EINPROGRESS: the IO has been initiated. | |
930 | * The caller should log this blkptr in the callback. | |
931 | * | |
932 | * 0: completed. Sets *bp to the blkptr just written. | |
933 | * The caller should log this blkptr immediately. | |
934 | */ | |
935 | int | |
936 | dmu_sync(zio_t *pio, dmu_buf_t *db_fake, | |
937 | blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg) | |
938 | { | |
939 | dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; | |
940 | objset_impl_t *os = db->db_objset; | |
941 | dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; | |
942 | tx_state_t *tx = &dp->dp_tx; | |
943 | dbuf_dirty_record_t *dr; | |
944 | dmu_sync_arg_t *in; | |
945 | zbookmark_t zb; | |
b128c09f | 946 | writeprops_t wp = { 0 }; |
34dc7c2f | 947 | zio_t *zio; |
34dc7c2f BB |
948 | int err; |
949 | ||
950 | ASSERT(BP_IS_HOLE(bp)); | |
951 | ASSERT(txg != 0); | |
952 | ||
34dc7c2f BB |
953 | dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", |
954 | txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); | |
955 | ||
956 | /* | |
957 | * XXX - would be nice if we could do this without suspending... | |
958 | */ | |
959 | txg_suspend(dp); | |
960 | ||
961 | /* | |
962 | * If this txg already synced, there's nothing to do. | |
963 | */ | |
964 | if (txg <= tx->tx_synced_txg) { | |
965 | txg_resume(dp); | |
966 | /* | |
967 | * If we're running ziltest, we need the blkptr regardless. | |
968 | */ | |
969 | if (txg > spa_freeze_txg(dp->dp_spa)) { | |
970 | /* if db_blkptr == NULL, this was an empty write */ | |
971 | if (db->db_blkptr) | |
972 | *bp = *db->db_blkptr; /* structure assignment */ | |
973 | return (0); | |
974 | } | |
975 | return (EEXIST); | |
976 | } | |
977 | ||
978 | mutex_enter(&db->db_mtx); | |
979 | ||
980 | if (txg == tx->tx_syncing_txg) { | |
981 | while (db->db_data_pending) { | |
982 | /* | |
983 | * IO is in-progress. Wait for it to finish. | |
984 | * XXX - would be nice to be able to somehow "attach" | |
985 | * this zio to the parent zio passed in. | |
986 | */ | |
987 | cv_wait(&db->db_changed, &db->db_mtx); | |
988 | if (!db->db_data_pending && | |
989 | db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) { | |
990 | /* | |
991 | * IO was compressed away | |
992 | */ | |
993 | *bp = *db->db_blkptr; /* structure assignment */ | |
994 | mutex_exit(&db->db_mtx); | |
995 | txg_resume(dp); | |
996 | return (0); | |
997 | } | |
998 | ASSERT(db->db_data_pending || | |
999 | (db->db_blkptr && db->db_blkptr->blk_birth == txg)); | |
1000 | } | |
1001 | ||
1002 | if (db->db_blkptr && db->db_blkptr->blk_birth == txg) { | |
1003 | /* | |
1004 | * IO is already completed. | |
1005 | */ | |
1006 | *bp = *db->db_blkptr; /* structure assignment */ | |
1007 | mutex_exit(&db->db_mtx); | |
1008 | txg_resume(dp); | |
1009 | return (0); | |
1010 | } | |
1011 | } | |
1012 | ||
1013 | dr = db->db_last_dirty; | |
1014 | while (dr && dr->dr_txg > txg) | |
1015 | dr = dr->dr_next; | |
1016 | if (dr == NULL || dr->dr_txg < txg) { | |
1017 | /* | |
1018 | * This dbuf isn't dirty, must have been free_range'd. | |
1019 | * There's no need to log writes to freed blocks, so we're done. | |
1020 | */ | |
1021 | mutex_exit(&db->db_mtx); | |
1022 | txg_resume(dp); | |
1023 | return (ENOENT); | |
1024 | } | |
1025 | ||
1026 | ASSERT(dr->dr_txg == txg); | |
1027 | if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { | |
1028 | /* | |
1029 | * We have already issued a sync write for this buffer. | |
1030 | */ | |
1031 | mutex_exit(&db->db_mtx); | |
1032 | txg_resume(dp); | |
1033 | return (EALREADY); | |
1034 | } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { | |
1035 | /* | |
1036 | * This buffer has already been synced. It could not | |
1037 | * have been dirtied since, or we would have cleared the state. | |
1038 | */ | |
1039 | *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */ | |
1040 | mutex_exit(&db->db_mtx); | |
1041 | txg_resume(dp); | |
1042 | return (0); | |
1043 | } | |
1044 | ||
1045 | dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; | |
1046 | in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); | |
1047 | in->dr = dr; | |
1048 | in->done = done; | |
1049 | in->arg = arg; | |
1050 | mutex_exit(&db->db_mtx); | |
1051 | txg_resume(dp); | |
1052 | ||
1053 | zb.zb_objset = os->os_dsl_dataset->ds_object; | |
1054 | zb.zb_object = db->db.db_object; | |
1055 | zb.zb_level = db->db_level; | |
1056 | zb.zb_blkid = db->db_blkid; | |
34dc7c2f | 1057 | |
b128c09f BB |
1058 | wp.wp_type = db->db_dnode->dn_type; |
1059 | wp.wp_level = db->db_level; | |
1060 | wp.wp_copies = os->os_copies; | |
1061 | wp.wp_dnchecksum = db->db_dnode->dn_checksum; | |
1062 | wp.wp_oschecksum = os->os_checksum; | |
1063 | wp.wp_dncompress = db->db_dnode->dn_compress; | |
1064 | wp.wp_oscompress = os->os_compress; | |
1065 | ||
1066 | ASSERT(BP_IS_HOLE(bp)); | |
1067 | ||
1068 | zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db), | |
1069 | txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in, | |
1070 | ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); | |
34dc7c2f BB |
1071 | if (pio) { |
1072 | zio_nowait(zio); | |
1073 | err = EINPROGRESS; | |
1074 | } else { | |
1075 | err = zio_wait(zio); | |
1076 | ASSERT(err == 0); | |
1077 | } | |
1078 | return (err); | |
1079 | } | |
1080 | ||
1081 | int | |
1082 | dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, | |
1083 | dmu_tx_t *tx) | |
1084 | { | |
1085 | dnode_t *dn; | |
1086 | int err; | |
1087 | ||
1088 | err = dnode_hold(os->os, object, FTAG, &dn); | |
1089 | if (err) | |
1090 | return (err); | |
1091 | err = dnode_set_blksz(dn, size, ibs, tx); | |
1092 | dnode_rele(dn, FTAG); | |
1093 | return (err); | |
1094 | } | |
1095 | ||
1096 | void | |
1097 | dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, | |
1098 | dmu_tx_t *tx) | |
1099 | { | |
1100 | dnode_t *dn; | |
1101 | ||
1102 | /* XXX assumes dnode_hold will not get an i/o error */ | |
1103 | (void) dnode_hold(os->os, object, FTAG, &dn); | |
1104 | ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); | |
1105 | dn->dn_checksum = checksum; | |
1106 | dnode_setdirty(dn, tx); | |
1107 | dnode_rele(dn, FTAG); | |
1108 | } | |
1109 | ||
1110 | void | |
1111 | dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, | |
1112 | dmu_tx_t *tx) | |
1113 | { | |
1114 | dnode_t *dn; | |
1115 | ||
1116 | /* XXX assumes dnode_hold will not get an i/o error */ | |
1117 | (void) dnode_hold(os->os, object, FTAG, &dn); | |
1118 | ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); | |
1119 | dn->dn_compress = compress; | |
1120 | dnode_setdirty(dn, tx); | |
1121 | dnode_rele(dn, FTAG); | |
1122 | } | |
1123 | ||
34dc7c2f BB |
1124 | int |
1125 | dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) | |
1126 | { | |
1127 | dnode_t *dn; | |
1128 | int i, err; | |
1129 | ||
1130 | err = dnode_hold(os->os, object, FTAG, &dn); | |
1131 | if (err) | |
1132 | return (err); | |
1133 | /* | |
1134 | * Sync any current changes before | |
1135 | * we go trundling through the block pointers. | |
1136 | */ | |
1137 | for (i = 0; i < TXG_SIZE; i++) { | |
1138 | if (list_link_active(&dn->dn_dirty_link[i])) | |
1139 | break; | |
1140 | } | |
1141 | if (i != TXG_SIZE) { | |
1142 | dnode_rele(dn, FTAG); | |
1143 | txg_wait_synced(dmu_objset_pool(os), 0); | |
1144 | err = dnode_hold(os->os, object, FTAG, &dn); | |
1145 | if (err) | |
1146 | return (err); | |
1147 | } | |
1148 | ||
b128c09f | 1149 | err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); |
34dc7c2f BB |
1150 | dnode_rele(dn, FTAG); |
1151 | ||
1152 | return (err); | |
1153 | } | |
1154 | ||
1155 | void | |
1156 | dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) | |
1157 | { | |
1158 | rw_enter(&dn->dn_struct_rwlock, RW_READER); | |
1159 | mutex_enter(&dn->dn_mtx); | |
1160 | ||
1161 | doi->doi_data_block_size = dn->dn_datablksz; | |
1162 | doi->doi_metadata_block_size = dn->dn_indblkshift ? | |
1163 | 1ULL << dn->dn_indblkshift : 0; | |
1164 | doi->doi_indirection = dn->dn_nlevels; | |
1165 | doi->doi_checksum = dn->dn_checksum; | |
1166 | doi->doi_compress = dn->dn_compress; | |
1167 | doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) + | |
1168 | SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT; | |
1169 | doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid; | |
1170 | doi->doi_type = dn->dn_type; | |
1171 | doi->doi_bonus_size = dn->dn_bonuslen; | |
1172 | doi->doi_bonus_type = dn->dn_bonustype; | |
1173 | ||
1174 | mutex_exit(&dn->dn_mtx); | |
1175 | rw_exit(&dn->dn_struct_rwlock); | |
1176 | } | |
1177 | ||
1178 | /* | |
1179 | * Get information on a DMU object. | |
1180 | * If doi is NULL, just indicates whether the object exists. | |
1181 | */ | |
1182 | int | |
1183 | dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) | |
1184 | { | |
1185 | dnode_t *dn; | |
1186 | int err = dnode_hold(os->os, object, FTAG, &dn); | |
1187 | ||
1188 | if (err) | |
1189 | return (err); | |
1190 | ||
1191 | if (doi != NULL) | |
1192 | dmu_object_info_from_dnode(dn, doi); | |
1193 | ||
1194 | dnode_rele(dn, FTAG); | |
1195 | return (0); | |
1196 | } | |
1197 | ||
1198 | /* | |
1199 | * As above, but faster; can be used when you have a held dbuf in hand. | |
1200 | */ | |
1201 | void | |
1202 | dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi) | |
1203 | { | |
1204 | dmu_object_info_from_dnode(((dmu_buf_impl_t *)db)->db_dnode, doi); | |
1205 | } | |
1206 | ||
1207 | /* | |
1208 | * Faster still when you only care about the size. | |
1209 | * This is specifically optimized for zfs_getattr(). | |
1210 | */ | |
1211 | void | |
1212 | dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) | |
1213 | { | |
1214 | dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; | |
1215 | ||
1216 | *blksize = dn->dn_datablksz; | |
1217 | /* add 1 for dnode space */ | |
1218 | *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> | |
1219 | SPA_MINBLOCKSHIFT) + 1; | |
1220 | } | |
1221 | ||
/*
 * Byte-swap, in place, every 64-bit word of the `size'-byte buffer at vbuf.
 * `size' must be a multiple of 8 (asserted).
 */
void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;	/* same type as count: no signed/unsigned compare */

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
1234 | ||
/*
 * Byte-swap, in place, every 32-bit word of the `size'-byte buffer at vbuf.
 * `size' must be a multiple of 4 (asserted).
 */
void
byteswap_uint32_array(void *vbuf, size_t size)
{
	uint32_t *buf = vbuf;
	size_t count = size >> 2;
	size_t i;	/* same type as count: no signed/unsigned compare */

	ASSERT((size & 3) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_32(buf[i]);
}
1247 | ||
/*
 * Byte-swap, in place, every 16-bit word of the `size'-byte buffer at vbuf.
 * `size' must be a multiple of 2 (asserted).
 */
void
byteswap_uint16_array(void *vbuf, size_t size)
{
	uint16_t *buf = vbuf;
	size_t count = size >> 1;
	size_t i;	/* same type as count: no signed/unsigned compare */

	ASSERT((size & 1) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_16(buf[i]);
}
1260 | ||
/*
 * 8-bit counterpart of the byteswap_uintN_array() routines.  A single byte
 * has no byte order, so the buffer is left untouched.
 */
/* ARGSUSED */
void
byteswap_uint8_array(void *vbuf, size_t size)
{
	/* intentionally empty */
}
1266 | ||
/*
 * DMU start-up: run the init routines for dbuf, dnode, arc and l2arc,
 * in that order.
 */
void
dmu_init(void)
{
	dbuf_init();
	dnode_init();
	arc_init();
	l2arc_init();
}
1275 | ||
/*
 * DMU tear-down: run the fini routines for arc, dnode, dbuf and l2arc,
 * in that order.
 */
void
dmu_fini(void)
{
	arc_fini();
	dnode_fini();
	dbuf_fini();
	l2arc_fini();
}