[mirror_zfs.git] / module / zfs / dmu.c (blame view at commit "Native Encryption for ZFS on Linux")
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 23 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 26 * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved.
 27 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
28 */
29
30#include <sys/dmu.h>
31#include <sys/dmu_impl.h>
32#include <sys/dmu_tx.h>
33#include <sys/dbuf.h>
34#include <sys/dnode.h>
35#include <sys/zfs_context.h>
36#include <sys/dmu_objset.h>
37#include <sys/dmu_traverse.h>
38#include <sys/dsl_dataset.h>
39#include <sys/dsl_dir.h>
40#include <sys/dsl_pool.h>
41#include <sys/dsl_synctask.h>
42#include <sys/dsl_prop.h>
43#include <sys/dmu_zfetch.h>
44#include <sys/zfs_ioctl.h>
45#include <sys/zap.h>
46#include <sys/zio_checksum.h>
 47#include <sys/zio_compress.h>
 48#include <sys/sa.h>
 49#include <sys/zfeature.h>
 50#include <sys/abd.h>
 51#include <sys/trace_dmu.h>
52#ifdef _KERNEL
53#include <sys/vmsystm.h>
 54#include <sys/zfs_znode.h>
55#endif
56
57/*
58 * Enable/disable nopwrite feature.
59 */
60int zfs_nopwrite_enabled = 1;
61
62/*
63 * Tunable to control percentage of dirtied blocks from frees in one TXG.
64 * After this threshold is crossed, additional dirty blocks from frees
65 * wait until the next TXG.
66 * A value of zero will disable this throttle.
67 */
 68unsigned long zfs_per_txg_dirty_frees_percent = 30;
 69
70/*
71 * Enable/disable forcing txg sync when dirty in dmu_offset_next.
72 */
73int zfs_dmu_offset_next_sync = 0;
74
 75const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
76 { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" },
77 { DMU_BSWAP_ZAP, TRUE, FALSE, "object directory" },
78 { DMU_BSWAP_UINT64, TRUE, FALSE, "object array" },
79 { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" },
80 { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" },
81 { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" },
82 { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" },
83 { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" },
84 { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" },
85 { DMU_BSWAP_UINT64, TRUE, TRUE, "ZIL intent log" },
86 { DMU_BSWAP_DNODE, TRUE, TRUE, "DMU dnode" },
87 { DMU_BSWAP_OBJSET, TRUE, FALSE, "DMU objset" },
88 { DMU_BSWAP_UINT64, TRUE, FALSE, "DSL directory" },
89 { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL directory child map"},
90 { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dataset snap map" },
91 { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL props" },
92 { DMU_BSWAP_UINT64, TRUE, FALSE, "DSL dataset" },
93 { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" },
94 { DMU_BSWAP_OLDACL, TRUE, TRUE, "ZFS V0 ACL" },
95 { DMU_BSWAP_UINT8, FALSE, TRUE, "ZFS plain file" },
96 { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS directory" },
97 { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" },
98 { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS delete queue" },
99 { DMU_BSWAP_UINT8, FALSE, TRUE, "zvol object" },
100 { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" },
101 { DMU_BSWAP_UINT8, FALSE, TRUE, "other uint8[]" },
102 { DMU_BSWAP_UINT64, FALSE, TRUE, "other uint64[]" },
103 { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" },
104 { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" },
105 { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" },
106 { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" },
107 { DMU_BSWAP_ZAP, TRUE, FALSE, "Pool properties" },
108 { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL permissions" },
109 { DMU_BSWAP_ACL, TRUE, TRUE, "ZFS ACL" },
110 { DMU_BSWAP_UINT8, TRUE, TRUE, "ZFS SYSACL" },
111 { DMU_BSWAP_UINT8, TRUE, TRUE, "FUID table" },
112 { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" },
113 { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dataset next clones"},
114 { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" },
115 { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group used" },
116 { DMU_BSWAP_ZAP, TRUE, TRUE, "ZFS user/group quota" },
117 { DMU_BSWAP_ZAP, TRUE, FALSE, "snapshot refcount tags"},
118 { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" },
119 { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" },
120 { DMU_BSWAP_UINT8, TRUE, TRUE, "System attributes" },
121 { DMU_BSWAP_ZAP, TRUE, TRUE, "SA master node" },
122 { DMU_BSWAP_ZAP, TRUE, TRUE, "SA attr registration" },
123 { DMU_BSWAP_ZAP, TRUE, TRUE, "SA attr layouts" },
124 { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" },
125 { DMU_BSWAP_UINT8, FALSE, TRUE, "deduplicated block" },
126 { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL deadlist map" },
127 { DMU_BSWAP_UINT64, TRUE, FALSE, "DSL deadlist map hdr" },
128 { DMU_BSWAP_ZAP, TRUE, FALSE, "DSL dir clones" },
129 { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" }
130};
131
132const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
133 { byteswap_uint8_array, "uint8" },
134 { byteswap_uint16_array, "uint16" },
135 { byteswap_uint32_array, "uint32" },
136 { byteswap_uint64_array, "uint64" },
137 { zap_byteswap, "zap" },
138 { dnode_buf_byteswap, "dnode" },
139 { dmu_objset_byteswap, "objset" },
140 { zfs_znode_byteswap, "znode" },
141 { zfs_oldacl_byteswap, "oldacl" },
142 { zfs_acl_byteswap, "acl" }
143};
144
145int
146dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
147 void *tag, dmu_buf_t **dbp)
148{
149 uint64_t blkid;
150 dmu_buf_impl_t *db;
151
152 blkid = dbuf_whichblock(dn, 0, offset);
153 rw_enter(&dn->dn_struct_rwlock, RW_READER);
154 db = dbuf_hold(dn, blkid, tag);
155 rw_exit(&dn->dn_struct_rwlock);
156
157 if (db == NULL) {
158 *dbp = NULL;
159 return (SET_ERROR(EIO));
160 }
161
162 *dbp = &db->db;
163 return (0);
164}
 165int
166dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
167 void *tag, dmu_buf_t **dbp)
168{
169 dnode_t *dn;
170 uint64_t blkid;
171 dmu_buf_impl_t *db;
172 int err;
173
174 err = dnode_hold(os, object, FTAG, &dn);
175 if (err)
176 return (err);
 177 blkid = dbuf_whichblock(dn, 0, offset);
178 rw_enter(&dn->dn_struct_rwlock, RW_READER);
179 db = dbuf_hold(dn, blkid, tag);
180 rw_exit(&dn->dn_struct_rwlock);
181 dnode_rele(dn, FTAG);
182
 183 if (db == NULL) {
184 *dbp = NULL;
185 return (SET_ERROR(EIO));
186 }
187
188 *dbp = &db->db;
189 return (err);
190}
191
192int
193dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
194 void *tag, dmu_buf_t **dbp, int flags)
195{
196 int err;
197 int db_flags = DB_RF_CANFAIL;
198
199 if (flags & DMU_READ_NO_PREFETCH)
200 db_flags |= DB_RF_NOPREFETCH;
201 if (flags & DMU_READ_NO_DECRYPT)
202 db_flags |= DB_RF_NO_DECRYPT;
203
204 err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
205 if (err == 0) {
206 dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
207 err = dbuf_read(db, NULL, db_flags);
208 if (err != 0) {
209 dbuf_rele(db, tag);
210 *dbp = NULL;
211 }
212 }
213
214 return (err);
215}
216
217int
218dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
219 void *tag, dmu_buf_t **dbp, int flags)
220{
221 int err;
222 int db_flags = DB_RF_CANFAIL;
223
224 if (flags & DMU_READ_NO_PREFETCH)
225 db_flags |= DB_RF_NOPREFETCH;
226 if (flags & DMU_READ_NO_DECRYPT)
227 db_flags |= DB_RF_NO_DECRYPT;
228
229 err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
230 if (err == 0) {
231 dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
 232 err = dbuf_read(db, NULL, db_flags);
 233 if (err != 0) {
 234 dbuf_rele(db, tag);
 235 *dbp = NULL;
236 }
237 }
238
239 return (err);
240}
241
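An illustrative usage sketch (not part of dmu.c): callers typically pair dmu_buf_hold() with dmu_buf_rele() around a short inspection of the block contents. The helper name and object number below are hypothetical.

static int
example_peek_block(objset_t *os, uint64_t object, uint64_t offset)
{
	dmu_buf_t *db;
	int err;

	/* hold and read the block covering `offset' in `object' */
	err = dmu_buf_hold(os, object, offset, FTAG, &db, DMU_READ_PREFETCH);
	if (err != 0)
		return (err);
	/* db->db_data points at db->db_size bytes of block contents */
	dmu_buf_rele(db, FTAG);
	return (0);
}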
242int
243dmu_bonus_max(void)
244{
 245 return (DN_OLD_MAX_BONUSLEN);
246}
247
248int
 249dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
 250{
251 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
252 dnode_t *dn;
253 int error;
 254
255 DB_DNODE_ENTER(db);
256 dn = DB_DNODE(db);
257
258 if (dn->dn_bonus != db) {
 259 error = SET_ERROR(EINVAL);
 260 } else if (newsize < 0 || newsize > db_fake->db_size) {
 261 error = SET_ERROR(EINVAL);
262 } else {
263 dnode_setbonuslen(dn, newsize, tx);
264 error = 0;
265 }
266
267 DB_DNODE_EXIT(db);
268 return (error);
269}
270
 271int
 272dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
 273{
274 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
275 dnode_t *dn;
276 int error;
 277
278 DB_DNODE_ENTER(db);
279 dn = DB_DNODE(db);
 280
 281 if (!DMU_OT_IS_VALID(type)) {
 282 error = SET_ERROR(EINVAL);
 283 } else if (dn->dn_bonus != db) {
 284 error = SET_ERROR(EINVAL);
285 } else {
286 dnode_setbonus_type(dn, type, tx);
287 error = 0;
288 }
 289
290 DB_DNODE_EXIT(db);
291 return (error);
292}
293
294dmu_object_type_t
295dmu_get_bonustype(dmu_buf_t *db_fake)
296{
297 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
298 dnode_t *dn;
299 dmu_object_type_t type;
300
301 DB_DNODE_ENTER(db);
302 dn = DB_DNODE(db);
303 type = dn->dn_bonustype;
304 DB_DNODE_EXIT(db);
305
306 return (type);
307}
308
309int
310dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
311{
312 dnode_t *dn;
313 int error;
314
315 error = dnode_hold(os, object, FTAG, &dn);
316 dbuf_rm_spill(dn, tx);
317 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
318 dnode_rm_spill(dn, tx);
319 rw_exit(&dn->dn_struct_rwlock);
320 dnode_rele(dn, FTAG);
321 return (error);
322}
323
324/*
325 * returns ENOENT, EIO, or 0.
326 */
327int
328dmu_bonus_hold_impl(objset_t *os, uint64_t object, void *tag, uint32_t flags,
329 dmu_buf_t **dbp)
330{
331 dnode_t *dn;
332 dmu_buf_impl_t *db;
333 int error;
334 uint32_t db_flags = DB_RF_MUST_SUCCEED;
335
336 if (flags & DMU_READ_NO_PREFETCH)
337 db_flags |= DB_RF_NOPREFETCH;
338 if (flags & DMU_READ_NO_DECRYPT)
339 db_flags |= DB_RF_NO_DECRYPT;
 340
 341 error = dnode_hold(os, object, FTAG, &dn);
342 if (error)
343 return (error);
344
345 rw_enter(&dn->dn_struct_rwlock, RW_READER);
346 if (dn->dn_bonus == NULL) {
347 rw_exit(&dn->dn_struct_rwlock);
348 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
349 if (dn->dn_bonus == NULL)
350 dbuf_create_bonus(dn);
351 }
352 db = dn->dn_bonus;
353
354 /* as long as the bonus buf is held, the dnode will be held */
 355 if (refcount_add(&db->db_holds, tag) == 1) {
 356 VERIFY(dnode_add_ref(dn, db));
 357 atomic_inc_32(&dn->dn_dbufs_count);
358 }
359
360 /*
361 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
362 * hold and incrementing the dbuf count to ensure that dnode_move() sees
363 * a dnode hold for every dbuf.
364 */
365 rw_exit(&dn->dn_struct_rwlock);
366
367 dnode_rele(dn, FTAG);
368
369 error = dbuf_read(db, NULL, db_flags);
370 if (error) {
371 dnode_evict_bonus(dn);
372 dbuf_rele(db, tag);
373 *dbp = NULL;
374 return (error);
375 }
376
377 *dbp = &db->db;
378 return (0);
379}
380
381int
382dmu_bonus_hold(objset_t *os, uint64_t obj, void *tag, dmu_buf_t **dbp)
383{
384 return (dmu_bonus_hold_impl(os, obj, tag, DMU_READ_NO_PREFETCH, dbp));
385}
386
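An illustrative sketch (not part of dmu.c) of the bonus-buffer interface above; the helper name and object number are hypothetical.

static int
example_read_bonus(objset_t *os, uint64_t object)
{
	dmu_buf_t *db;
	int err;

	err = dmu_bonus_hold(os, object, FTAG, &db);
	if (err != 0)
		return (err);
	/* db->db_data holds the dnode's bonus payload (db->db_size bytes) */
	dmu_buf_rele(db, FTAG);
	return (0);
}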
387/*
388 * returns ENOENT, EIO, or 0.
389 *
390 * This interface will allocate a blank spill dbuf when a spill blk
391 * doesn't already exist on the dnode.
392 *
393 * if you only want to find an already existing spill db, then
394 * dmu_spill_hold_existing() should be used.
395 */
396int
397dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
398{
399 dmu_buf_impl_t *db = NULL;
400 int err;
401
402 if ((flags & DB_RF_HAVESTRUCT) == 0)
403 rw_enter(&dn->dn_struct_rwlock, RW_READER);
404
405 db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
406
407 if ((flags & DB_RF_HAVESTRUCT) == 0)
408 rw_exit(&dn->dn_struct_rwlock);
409
 410 if (db == NULL) {
411 *dbp = NULL;
412 return (SET_ERROR(EIO));
413 }
414 err = dbuf_read(db, NULL, flags);
415 if (err == 0)
416 *dbp = &db->db;
 417 else {
 418 dbuf_rele(db, tag);
 419 *dbp = NULL;
 420 }
421 return (err);
422}
423
424int
425dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
426{
427 dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
428 dnode_t *dn;
429 int err;
430
431 DB_DNODE_ENTER(db);
432 dn = DB_DNODE(db);
433
434 if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
 435 err = SET_ERROR(EINVAL);
436 } else {
437 rw_enter(&dn->dn_struct_rwlock, RW_READER);
438
439 if (!dn->dn_have_spill) {
 440 err = SET_ERROR(ENOENT);
441 } else {
442 err = dmu_spill_hold_by_dnode(dn,
443 DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
444 }
 445
 446 rw_exit(&dn->dn_struct_rwlock);
 447 }
448
449 DB_DNODE_EXIT(db);
450 return (err);
451}
452
453int
454dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
455{
456 dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
457 dnode_t *dn;
458 int err;
459
460 DB_DNODE_ENTER(db);
461 dn = DB_DNODE(db);
462 err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
463 DB_DNODE_EXIT(db);
464
465 return (err);
466}
467
468/*
469 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
470 * to take a held dnode rather than <os, object> -- the lookup is wasteful,
471 * and can induce severe lock contention when writing to several files
472 * whose dnodes are in the same block.
473 */
474static int
 475dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 476 boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
477{
478 dmu_buf_t **dbp;
479 uint64_t blkid, nblks, i;
 480 uint32_t dbuf_flags;
481 int err;
482 zio_t *zio;
483
484 ASSERT(length <= DMU_MAX_ACCESS);
485
486 /*
487 * Note: We directly notify the prefetch code of this read, so that
488 * we can tell it about the multi-block read. dbuf_read() only knows
489 * about the one block it is accessing.
490 */
491 dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
492 DB_RF_NOPREFETCH;
493
494 rw_enter(&dn->dn_struct_rwlock, RW_READER);
495 if (dn->dn_datablkshift) {
496 int blkshift = dn->dn_datablkshift;
497 nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
498 P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
499 } else {
500 if (offset + length > dn->dn_datablksz) {
501 zfs_panic_recover("zfs: accessing past end of object "
502 "%llx/%llx (size=%u access=%llu+%llu)",
503 (longlong_t)dn->dn_objset->
504 os_dsl_dataset->ds_object,
505 (longlong_t)dn->dn_object, dn->dn_datablksz,
506 (longlong_t)offset, (longlong_t)length);
 507 rw_exit(&dn->dn_struct_rwlock);
 508 return (SET_ERROR(EIO));
509 }
510 nblks = 1;
511 }
 512 dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 513
 514 zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 515 blkid = dbuf_whichblock(dn, 0, offset);
 516 for (i = 0; i < nblks; i++) {
 517 dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
518 if (db == NULL) {
519 rw_exit(&dn->dn_struct_rwlock);
520 dmu_buf_rele_array(dbp, nblks, tag);
521 zio_nowait(zio);
 522 return (SET_ERROR(EIO));
 523 }
 524
 525 /* initiate async i/o */
 526 if (read)
 527 (void) dbuf_read(db, zio, dbuf_flags);
528 dbp[i] = &db->db;
529 }
 530
531 if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
532 DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
533 dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
534 read && DNODE_IS_CACHEABLE(dn));
 535 }
536 rw_exit(&dn->dn_struct_rwlock);
537
538 /* wait for async i/o */
539 err = zio_wait(zio);
540 if (err) {
541 dmu_buf_rele_array(dbp, nblks, tag);
542 return (err);
543 }
544
545 /* wait for other io to complete */
546 if (read) {
547 for (i = 0; i < nblks; i++) {
548 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
549 mutex_enter(&db->db_mtx);
550 while (db->db_state == DB_READ ||
551 db->db_state == DB_FILL)
552 cv_wait(&db->db_changed, &db->db_mtx);
553 if (db->db_state == DB_UNCACHED)
 554 err = SET_ERROR(EIO);
555 mutex_exit(&db->db_mtx);
556 if (err) {
557 dmu_buf_rele_array(dbp, nblks, tag);
558 return (err);
559 }
560 }
561 }
562
563 *numbufsp = nblks;
564 *dbpp = dbp;
565 return (0);
566}
567
568static int
569dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
570 uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
571{
572 dnode_t *dn;
573 int err;
574
 575 err = dnode_hold(os, object, FTAG, &dn);
576 if (err)
577 return (err);
578
579 err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 580 numbufsp, dbpp, DMU_READ_PREFETCH);
581
582 dnode_rele(dn, FTAG);
583
584 return (err);
585}
586
587int
 588dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
589 uint64_t length, boolean_t read, void *tag, int *numbufsp,
590 dmu_buf_t ***dbpp)
 591{
592 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
593 dnode_t *dn;
594 int err;
595
596 DB_DNODE_ENTER(db);
597 dn = DB_DNODE(db);
 598 err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 599 numbufsp, dbpp, DMU_READ_PREFETCH);
 600 DB_DNODE_EXIT(db);
601
602 return (err);
603}
604
605void
606dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
607{
608 int i;
609 dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
610
611 if (numbufs == 0)
612 return;
613
614 for (i = 0; i < numbufs; i++) {
615 if (dbp[i])
616 dbuf_rele(dbp[i], tag);
617 }
618
619 kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
620}
621
 622/*
623 * Issue prefetch i/os for the given blocks. If level is greater than 0, the
 624 * indirect blocks prefetched will be those that point to the blocks containing
625 * the data starting at offset, and continuing to offset + len.
 626 *
627 * Note that if the indirect blocks above the blocks being prefetched are not
 628 * in cache, they will be asynchronously read in.
 629 */
 630void
631dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
632 uint64_t len, zio_priority_t pri)
633{
634 dnode_t *dn;
635 uint64_t blkid;
 636 int nblks, err;
 637
 638 if (len == 0) { /* they're interested in the bonus buffer */
 639 dn = DMU_META_DNODE(os);
640
641 if (object == 0 || object >= DN_MAX_OBJECT)
642 return;
643
644 rw_enter(&dn->dn_struct_rwlock, RW_READER);
645 blkid = dbuf_whichblock(dn, level,
646 object * sizeof (dnode_phys_t));
647 dbuf_prefetch(dn, level, blkid, pri, 0);
648 rw_exit(&dn->dn_struct_rwlock);
649 return;
650 }
651
652 /*
653 * XXX - Note, if the dnode for the requested object is not
654 * already cached, we will do a *synchronous* read in the
655 * dnode_hold() call. The same is true for any indirects.
656 */
 657 err = dnode_hold(os, object, FTAG, &dn);
658 if (err != 0)
659 return;
660
661 rw_enter(&dn->dn_struct_rwlock, RW_READER);
662 /*
663 * offset + len - 1 is the last byte we want to prefetch for, and offset
 664 * is the first. Then dbuf_whichblock(dn, level, off + len - 1) is the
665 * last block we want to prefetch, and dbuf_whichblock(dn, level,
666 * offset) is the first. Then the number we need to prefetch is the
667 * last - first + 1.
668 */
669 if (level > 0 || dn->dn_datablkshift != 0) {
670 nblks = dbuf_whichblock(dn, level, offset + len - 1) -
671 dbuf_whichblock(dn, level, offset) + 1;
672 } else {
673 nblks = (offset < dn->dn_datablksz);
674 }
675
676 if (nblks != 0) {
677 int i;
678
 679 blkid = dbuf_whichblock(dn, level, offset);
 680 for (i = 0; i < nblks; i++)
 681 dbuf_prefetch(dn, level, blkid + i, pri, 0);
682 }
683
684 rw_exit(&dn->dn_struct_rwlock);
685
686 dnode_rele(dn, FTAG);
687}
688
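A hedged usage sketch (not part of dmu.c): issuing a prefetch of the first megabyte of an object's data blocks ahead of a sequential read; the helper name and the priority chosen are only examples.

static void
example_prefetch_head(objset_t *os, uint64_t object)
{
	/* level 0 prefetches the data blocks themselves */
	dmu_prefetch(os, object, 0, 0, 1ULL << 20, ZIO_PRIORITY_ASYNC_READ);
}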
689/*
690 * Get the next "chunk" of file data to free. We traverse the file from
 691 * the end so that the file gets shorter over time (if we crash in the
692 * middle, this will leave us in a better state). We find allocated file
693 * data by simply searching the allocated level 1 indirects.
694 *
695 * On input, *start should be the first offset that does not need to be
696 * freed (e.g. "offset + length"). On return, *start will be the first
697 * offset that should be freed.
 698 */
 699static int
 700get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
 701{
702 uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
703 /* bytes of data covered by a level-1 indirect block */
 704 uint64_t iblkrange =
 705 dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
 706 uint64_t blks;
 707
 708 ASSERT3U(minimum, <=, *start);
 709
710 if (*start - minimum <= iblkrange * maxblks) {
711 *start = minimum;
712 return (0);
713 }
 714 ASSERT(ISP2(iblkrange));
 715
 716 for (blks = 0; *start > minimum && blks < maxblks; blks++) {
717 int err;
718
719 /*
720 * dnode_next_offset(BACKWARDS) will find an allocated L1
721 * indirect block at or before the input offset. We must
722 * decrement *start so that it is at the end of the region
723 * to search.
724 */
725 (*start)--;
 726 err = dnode_next_offset(dn,
 727 DNODE_FIND_BACKWARDS, start, 2, 1, 0);
 728
 729 /* if there are no indirect blocks before start, we are done */
 730 if (err == ESRCH) {
731 *start = minimum;
732 break;
733 } else if (err != 0) {
 734 return (err);
 735 }
 736
 737 /* set start to the beginning of this L1 indirect */
 738 *start = P2ALIGN(*start, iblkrange);
 739 }
740 if (*start < minimum)
741 *start = minimum;
742 return (0);
743}
744
745/*
 746 * If this objset is of type DMU_OST_ZFS, return true if the vfs's unmounted flag is set;
 747 * otherwise return false.
 748 * Used below in dmu_free_long_range_impl() to enable abort when unmounting.
749 */
750/*ARGSUSED*/
751static boolean_t
752dmu_objset_zfs_unmounting(objset_t *os)
753{
754#ifdef _KERNEL
755 if (dmu_objset_type(os) == DMU_OST_ZFS)
756 return (zfs_get_vfs_flag_unmounted(os));
757#endif
758 return (B_FALSE);
759}
760
761static int
762dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
 763 uint64_t length)
 764{
 765 uint64_t object_size;
 766 int err;
767 uint64_t dirty_frees_threshold;
768 dsl_pool_t *dp = dmu_objset_pool(os);
769 int t;
 770
771 if (dn == NULL)
772 return (SET_ERROR(EINVAL));
773
774 object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 775 if (offset >= object_size)
 776 return (0);
 777
778 if (zfs_per_txg_dirty_frees_percent <= 100)
779 dirty_frees_threshold =
780 zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
781 else
782 dirty_frees_threshold = zfs_dirty_data_max / 4;
783
784 if (length == DMU_OBJECT_END || offset + length > object_size)
785 length = object_size - offset;
786
787 while (length != 0) {
788 uint64_t chunk_end, chunk_begin, chunk_len;
789 uint64_t long_free_dirty_all_txgs = 0;
790 dmu_tx_t *tx;
791
792 if (dmu_objset_zfs_unmounting(dn->dn_objset))
793 return (SET_ERROR(EINTR));
794
795 chunk_end = chunk_begin = offset + length;
796
797 /* move chunk_begin backwards to the beginning of this chunk */
798 err = get_next_chunk(dn, &chunk_begin, offset);
799 if (err)
800 return (err);
801 ASSERT3U(chunk_begin, >=, offset);
802 ASSERT3U(chunk_begin, <=, chunk_end);
 803
804 chunk_len = chunk_end - chunk_begin;
805
806 mutex_enter(&dp->dp_lock);
807 for (t = 0; t < TXG_SIZE; t++) {
808 long_free_dirty_all_txgs +=
809 dp->dp_long_free_dirty_pertxg[t];
810 }
811 mutex_exit(&dp->dp_lock);
812
813 /*
814 * To avoid filling up a TXG with just frees wait for
815 * the next TXG to open before freeing more chunks if
816 * we have reached the threshold of frees
817 */
818 if (dirty_frees_threshold != 0 &&
819 long_free_dirty_all_txgs >= dirty_frees_threshold) {
820 txg_wait_open(dp, 0);
821 continue;
822 }
823
 824 tx = dmu_tx_create(os);
 825 dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
826
827 /*
828 * Mark this transaction as typically resulting in a net
829 * reduction in space used.
830 */
831 dmu_tx_mark_netfree(tx);
832 err = dmu_tx_assign(tx, TXG_WAIT);
833 if (err) {
834 dmu_tx_abort(tx);
835 return (err);
836 }
837
838 mutex_enter(&dp->dp_lock);
839 dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
840 chunk_len;
841 mutex_exit(&dp->dp_lock);
842 DTRACE_PROBE3(free__long__range,
843 uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
844 uint64_t, dmu_tx_get_txg(tx));
845 dnode_free_range(dn, chunk_begin, chunk_len, tx);
 846 dmu_tx_commit(tx);
 847
 848 length -= chunk_len;
849 }
850 return (0);
851}
852
853int
854dmu_free_long_range(objset_t *os, uint64_t object,
855 uint64_t offset, uint64_t length)
856{
857 dnode_t *dn;
858 int err;
859
 860 err = dnode_hold(os, object, FTAG, &dn);
861 if (err != 0)
862 return (err);
 863 err = dmu_free_long_range_impl(os, dn, offset, length);
864
865 /*
866 * It is important to zero out the maxblkid when freeing the entire
867 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
868 * will take the fast path, and (b) dnode_reallocate() can verify
869 * that the entire file has been freed.
870 */
 871 if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
872 dn->dn_maxblkid = 0;
873
874 dnode_rele(dn, FTAG);
875 return (err);
876}
877
878int
 879dmu_free_long_object(objset_t *os, uint64_t object)
 880{
881 dmu_tx_t *tx;
882 int err;
883
 884 err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
885 if (err != 0)
886 return (err);
887
888 tx = dmu_tx_create(os);
889 dmu_tx_hold_bonus(tx, object);
890 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 891 dmu_tx_mark_netfree(tx);
892 err = dmu_tx_assign(tx, TXG_WAIT);
893 if (err == 0) {
894 err = dmu_object_free(os, object, tx);
895 dmu_tx_commit(tx);
 896 } else {
 897 dmu_tx_abort(tx);
 898 }
 899
900 return (err);
901}
902
903int
904dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
905 uint64_t size, dmu_tx_t *tx)
906{
907 dnode_t *dn;
 908 int err = dnode_hold(os, object, FTAG, &dn);
909 if (err)
910 return (err);
911 ASSERT(offset < UINT64_MAX);
912 ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
913 dnode_free_range(dn, offset, size, tx);
914 dnode_rele(dn, FTAG);
915 return (0);
916}
917
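An illustrative sketch (not part of dmu.c) of freeing a byte range inside a transaction, following the hold/assign/commit pattern used by dmu_free_long_object() above; the helper name, object number and range are hypothetical.

static int
example_punch_hole(objset_t *os, uint64_t object, uint64_t off, uint64_t len)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	dmu_tx_hold_free(tx, object, off, len);
	dmu_tx_mark_netfree(tx);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	err = dmu_free_range(os, object, off, len, tx);
	dmu_tx_commit(tx);
	return (err);
}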
 918static int
 919dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
 920 void *buf, uint32_t flags)
 921{
 922 dmu_buf_t **dbp;
 923 int numbufs, err = 0;
924
925 /*
926 * Deal with odd block sizes, where there can't be data past the first
927 * block. If we ever do the tail block optimization, we will need to
928 * handle that here as well.
929 */
 930 if (dn->dn_maxblkid == 0) {
 931 uint64_t newsz = offset > dn->dn_datablksz ? 0 :
932 MIN(size, dn->dn_datablksz - offset);
933 bzero((char *)buf + newsz, size - newsz);
934 size = newsz;
935 }
936
937 while (size > 0) {
938 uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
 939 int i;
940
941 /*
942 * NB: we could do this block-at-a-time, but it's nice
943 * to be reading in parallel.
944 */
945 err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
 946 TRUE, FTAG, &numbufs, &dbp, flags);
947 if (err)
948 break;
949
950 for (i = 0; i < numbufs; i++) {
 951 uint64_t tocpy;
 952 int64_t bufoff;
953 dmu_buf_t *db = dbp[i];
954
955 ASSERT(size > 0);
956
957 bufoff = offset - db->db_offset;
 958 tocpy = MIN(db->db_size - bufoff, size);
 959
 960 (void) memcpy(buf, (char *)db->db_data + bufoff, tocpy);
961
962 offset += tocpy;
963 size -= tocpy;
964 buf = (char *)buf + tocpy;
965 }
966 dmu_buf_rele_array(dbp, numbufs, FTAG);
967 }
968 return (err);
969}
970
 971int
 972dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 973 void *buf, uint32_t flags)
 974{
 975 dnode_t *dn;
 976 int err;
 977
 978 err = dnode_hold(os, object, FTAG, &dn);
 979 if (err != 0)
 980 return (err);
 981
 982 err = dmu_read_impl(dn, offset, size, buf, flags);
983 dnode_rele(dn, FTAG);
984 return (err);
985}
986
987int
988dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
989 uint32_t flags)
990{
991 return (dmu_read_impl(dn, offset, size, buf, flags));
992}
993
994static void
995dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
996 const void *buf, dmu_tx_t *tx)
997{
998 int i;
999
1000 for (i = 0; i < numbufs; i++) {
 1001 uint64_t tocpy;
 1002 int64_t bufoff;
1003 dmu_buf_t *db = dbp[i];
1004
1005 ASSERT(size > 0);
1006
1007 bufoff = offset - db->db_offset;
 1008 tocpy = MIN(db->db_size - bufoff, size);
1009
1010 ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
1011
1012 if (tocpy == db->db_size)
1013 dmu_buf_will_fill(db, tx);
1014 else
1015 dmu_buf_will_dirty(db, tx);
1016
 1017 (void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
1018
1019 if (tocpy == db->db_size)
1020 dmu_buf_fill_done(db, tx);
1021
1022 offset += tocpy;
1023 size -= tocpy;
1024 buf = (char *)buf + tocpy;
1025 }
 1026}
1027
1028void
1029dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
1030 const void *buf, dmu_tx_t *tx)
1031{
1032 dmu_buf_t **dbp;
1033 int numbufs;
1034
1035 if (size == 0)
1036 return;
1037
1038 VERIFY0(dmu_buf_hold_array(os, object, offset, size,
1039 FALSE, FTAG, &numbufs, &dbp));
1040 dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
1041 dmu_buf_rele_array(dbp, numbufs, FTAG);
1042}
1043
1044void
1045dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
1046 const void *buf, dmu_tx_t *tx)
1047{
1048 dmu_buf_t **dbp;
1049 int numbufs;
1050
1051 if (size == 0)
1052 return;
1053
1054 VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
1055 FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
1056 dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
1057 dmu_buf_rele_array(dbp, numbufs, FTAG);
1058}
1059
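A hedged sketch (not part of dmu.c) of the simple read/write interfaces above. It assumes the caller already created `tx' with dmu_tx_hold_write() for this range and assigned it with dmu_tx_assign(); the helper name and sizes are hypothetical.

static int
example_write_then_read(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
	char out[512] = { 0 };
	char in[512];

	/* copy 512 bytes into the object at offset 0 under tx */
	dmu_write(os, object, 0, sizeof (out), out, tx);

	/* read them back through the dbuf cache */
	return (dmu_read(os, object, 0, sizeof (in), in, DMU_READ_PREFETCH));
}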
1060void
1061dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
1062 dmu_tx_t *tx)
1063{
1064 dmu_buf_t **dbp;
1065 int numbufs, i;
1066
1067 if (size == 0)
1068 return;
1069
1070 VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
1071 FALSE, FTAG, &numbufs, &dbp));
1072
1073 for (i = 0; i < numbufs; i++) {
1074 dmu_buf_t *db = dbp[i];
1075
1076 dmu_buf_will_not_fill(db, tx);
1077 }
1078 dmu_buf_rele_array(dbp, numbufs, FTAG);
1079}
1080
1081void
1082dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
1083 void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
1084 int compressed_size, int byteorder, dmu_tx_t *tx)
1085{
1086 dmu_buf_t *db;
1087
1088 ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
1089 ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
1090 VERIFY0(dmu_buf_hold_noread(os, object, offset,
1091 FTAG, &db));
1092
1093 dmu_buf_write_embedded(db,
1094 data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
1095 uncompressed_size, compressed_size, byteorder, tx);
1096
1097 dmu_buf_rele(db, FTAG);
1098}
1099
1100/*
1101 * DMU support for xuio
1102 */
1103kstat_t *xuio_ksp = NULL;
1104
1105typedef struct xuio_stats {
1106 /* loaned yet not returned arc_buf */
1107 kstat_named_t xuiostat_onloan_rbuf;
1108 kstat_named_t xuiostat_onloan_wbuf;
1109 /* whether a copy is made when loaning out a read buffer */
1110 kstat_named_t xuiostat_rbuf_copied;
1111 kstat_named_t xuiostat_rbuf_nocopy;
1112 /* whether a copy is made when assigning a write buffer */
1113 kstat_named_t xuiostat_wbuf_copied;
1114 kstat_named_t xuiostat_wbuf_nocopy;
1115} xuio_stats_t;
1116
1117static xuio_stats_t xuio_stats = {
1118 { "onloan_read_buf", KSTAT_DATA_UINT64 },
1119 { "onloan_write_buf", KSTAT_DATA_UINT64 },
1120 { "read_buf_copied", KSTAT_DATA_UINT64 },
1121 { "read_buf_nocopy", KSTAT_DATA_UINT64 },
1122 { "write_buf_copied", KSTAT_DATA_UINT64 },
1123 { "write_buf_nocopy", KSTAT_DATA_UINT64 }
1124};
1125
1126#define XUIOSTAT_INCR(stat, val) \
1127 atomic_add_64(&xuio_stats.stat.value.ui64, (val))
1128#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1)
 1129
 1130#ifdef HAVE_UIO_ZEROCOPY
1131int
1132dmu_xuio_init(xuio_t *xuio, int nblk)
1133{
1134 dmu_xuio_t *priv;
1135 uio_t *uio = &xuio->xu_uio;
1136
1137 uio->uio_iovcnt = nblk;
 1138 uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
 1139
 1140 priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
 1141 priv->cnt = nblk;
 1142 priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
 1143 priv->iovp = (iovec_t *)uio->uio_iov;
1144 XUIO_XUZC_PRIV(xuio) = priv;
1145
1146 if (XUIO_XUZC_RW(xuio) == UIO_READ)
1147 XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
1148 else
1149 XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
1150
1151 return (0);
1152}
1153
1154void
1155dmu_xuio_fini(xuio_t *xuio)
1156{
1157 dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1158 int nblk = priv->cnt;
1159
1160 kmem_free(priv->iovp, nblk * sizeof (iovec_t));
1161 kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
1162 kmem_free(priv, sizeof (dmu_xuio_t));
1163
1164 if (XUIO_XUZC_RW(xuio) == UIO_READ)
1165 XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
1166 else
1167 XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
1168}
1169
1170/*
1171 * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
1172 * and increase priv->next by 1.
1173 */
1174int
1175dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
1176{
1177 struct iovec *iov;
1178 uio_t *uio = &xuio->xu_uio;
1179 dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1180 int i = priv->next++;
1181
1182 ASSERT(i < priv->cnt);
 1183 ASSERT(off + n <= arc_buf_lsize(abuf));
 1184 iov = (iovec_t *)uio->uio_iov + i;
1185 iov->iov_base = (char *)abuf->b_data + off;
1186 iov->iov_len = n;
1187 priv->bufs[i] = abuf;
1188 return (0);
1189}
1190
1191int
1192dmu_xuio_cnt(xuio_t *xuio)
1193{
1194 dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1195 return (priv->cnt);
1196}
1197
1198arc_buf_t *
1199dmu_xuio_arcbuf(xuio_t *xuio, int i)
1200{
1201 dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1202
1203 ASSERT(i < priv->cnt);
1204 return (priv->bufs[i]);
1205}
1206
1207void
1208dmu_xuio_clear(xuio_t *xuio, int i)
1209{
1210 dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1211
1212 ASSERT(i < priv->cnt);
1213 priv->bufs[i] = NULL;
1214}
 1215#endif /* HAVE_UIO_ZEROCOPY */
1216
1217static void
1218xuio_stat_init(void)
1219{
1220 xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
1221 KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
1222 KSTAT_FLAG_VIRTUAL);
1223 if (xuio_ksp != NULL) {
1224 xuio_ksp->ks_data = &xuio_stats;
1225 kstat_install(xuio_ksp);
1226 }
1227}
1228
1229static void
1230xuio_stat_fini(void)
1231{
1232 if (xuio_ksp != NULL) {
1233 kstat_delete(xuio_ksp);
1234 xuio_ksp = NULL;
1235 }
1236}
1237
1238void
 1239xuio_stat_wbuf_copied(void)
1240{
1241 XUIOSTAT_BUMP(xuiostat_wbuf_copied);
1242}
1243
1244void
 1245xuio_stat_wbuf_nocopy(void)
1246{
1247 XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
1248}
1249
 1250#ifdef _KERNEL
 1251int
 1252dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
1253{
1254 dmu_buf_t **dbp;
1255 int numbufs, i, err;
 1256#ifdef HAVE_UIO_ZEROCOPY
 1257 xuio_t *xuio = NULL;
 1258#endif
1259
1260 /*
1261 * NB: we could do this block-at-a-time, but it's nice
1262 * to be reading in parallel.
1263 */
 1264 err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
 1265 TRUE, FTAG, &numbufs, &dbp, 0);
1266 if (err)
1267 return (err);
1268
1269 for (i = 0; i < numbufs; i++) {
 1270 uint64_t tocpy;
 1271 int64_t bufoff;
1272 dmu_buf_t *db = dbp[i];
1273
1274 ASSERT(size > 0);
1275
1276 bufoff = uio->uio_loffset - db->db_offset;
 1277 tocpy = MIN(db->db_size - bufoff, size);
 1278
 1279#ifdef HAVE_UIO_ZEROCOPY
1280 if (xuio) {
1281 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
1282 arc_buf_t *dbuf_abuf = dbi->db_buf;
1283 arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
1284 err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
1285 if (!err) {
1286 uio->uio_resid -= tocpy;
1287 uio->uio_loffset += tocpy;
1288 }
1289
1290 if (abuf == dbuf_abuf)
1291 XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
1292 else
1293 XUIOSTAT_BUMP(xuiostat_rbuf_copied);
 1294 } else
 1295#endif
 1296 err = uiomove((char *)db->db_data + bufoff, tocpy,
 1297 UIO_READ, uio);
1298 if (err)
1299 break;
1300
1301 size -= tocpy;
1302 }
1303 dmu_buf_rele_array(dbp, numbufs, FTAG);
1304
1305 return (err);
1306}
1307
1308/*
1309 * Read 'size' bytes into the uio buffer.
1310 * From object zdb->db_object.
1311 * Starting at offset uio->uio_loffset.
1312 *
1313 * If the caller already has a dbuf in the target object
1314 * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
1315 * because we don't have to find the dnode_t for the object.
1316 */
1317int
1318dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
1319{
1320 dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
1321 dnode_t *dn;
1322 int err;
1323
1324 if (size == 0)
1325 return (0);
1326
1327 DB_DNODE_ENTER(db);
1328 dn = DB_DNODE(db);
1329 err = dmu_read_uio_dnode(dn, uio, size);
1330 DB_DNODE_EXIT(db);
1331
1332 return (err);
1333}
1334
1335/*
1336 * Read 'size' bytes into the uio buffer.
1337 * From the specified object
1338 * Starting at offset uio->uio_loffset.
1339 */
1340int
1341dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
1342{
1343 dnode_t *dn;
1344 int err;
1345
1346 if (size == 0)
1347 return (0);
1348
1349 err = dnode_hold(os, object, FTAG, &dn);
1350 if (err)
1351 return (err);
1352
1353 err = dmu_read_uio_dnode(dn, uio, size);
1354
1355 dnode_rele(dn, FTAG);
1356
1357 return (err);
1358}
1359
 1360int
1361dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
1362{
1363 dmu_buf_t **dbp;
1364 int numbufs;
1365 int err = 0;
1366 int i;
1367
1368 err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
1369 FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
1370 if (err)
1371 return (err);
1372
1373 for (i = 0; i < numbufs; i++) {
 1374 uint64_t tocpy;
 1375 int64_t bufoff;
1376 dmu_buf_t *db = dbp[i];
1377
1378 ASSERT(size > 0);
1379
1380 bufoff = uio->uio_loffset - db->db_offset;
 1381 tocpy = MIN(db->db_size - bufoff, size);
1382
1383 ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
1384
1385 if (tocpy == db->db_size)
1386 dmu_buf_will_fill(db, tx);
1387 else
1388 dmu_buf_will_dirty(db, tx);
1389
1390 /*
1391 * XXX uiomove could block forever (eg.nfs-backed
1392 * pages). There needs to be a uiolockdown() function
1393 * to lock the pages in memory, so that uiomove won't
1394 * block.
1395 */
1396 err = uiomove((char *)db->db_data + bufoff, tocpy,
1397 UIO_WRITE, uio);
1398
1399 if (tocpy == db->db_size)
1400 dmu_buf_fill_done(db, tx);
1401
1402 if (err)
1403 break;
1404
1405 size -= tocpy;
1406 }
1407
1408 dmu_buf_rele_array(dbp, numbufs, FTAG);
1409 return (err);
1410}
1411
1412/*
1413 * Write 'size' bytes from the uio buffer.
1414 * To object zdb->db_object.
1415 * Starting at offset uio->uio_loffset.
1416 *
1417 * If the caller already has a dbuf in the target object
1418 * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
1419 * because we don't have to find the dnode_t for the object.
1420 */
1421int
1422dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
1423 dmu_tx_t *tx)
1424{
1425 dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
1426 dnode_t *dn;
1427 int err;
1428
1429 if (size == 0)
1430 return (0);
1431
1432 DB_DNODE_ENTER(db);
1433 dn = DB_DNODE(db);
1434 err = dmu_write_uio_dnode(dn, uio, size, tx);
1435 DB_DNODE_EXIT(db);
1436
1437 return (err);
1438}
1439
1440/*
1441 * Write 'size' bytes from the uio buffer.
1442 * To the specified object.
1443 * Starting at offset uio->uio_loffset.
1444 */
1445int
1446dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
1447 dmu_tx_t *tx)
1448{
1449 dnode_t *dn;
1450 int err;
1451
1452 if (size == 0)
1453 return (0);
1454
1455 err = dnode_hold(os, object, FTAG, &dn);
1456 if (err)
1457 return (err);
1458
1459 err = dmu_write_uio_dnode(dn, uio, size, tx);
1460
1461 dnode_rele(dn, FTAG);
1462
1463 return (err);
1464}
 1465#endif /* _KERNEL */
 1466
1467/*
1468 * Allocate a loaned anonymous arc buffer.
1469 */
1470arc_buf_t *
1471dmu_request_arcbuf(dmu_buf_t *handle, int size)
1472{
 1473 dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
 1474
 1475 return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
1476}
1477
1478/*
1479 * Free a loaned arc buffer.
1480 */
1481void
1482dmu_return_arcbuf(arc_buf_t *buf)
1483{
1484 arc_return_buf(buf, FTAG);
 1485 arc_buf_destroy(buf, FTAG);
1486}
1487
1488void
1489dmu_assign_arcbuf_impl(dmu_buf_t *handle, arc_buf_t *buf, dmu_tx_t *tx)
1490{
1491 dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
1492 dbuf_assign_arcbuf(db, buf, tx);
1493}
1494
1495void
1496dmu_convert_to_raw(dmu_buf_t *handle, boolean_t byteorder, const uint8_t *salt,
1497 const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
1498{
1499 dmu_object_type_t type;
1500 dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
1501 uint64_t dsobj = dmu_objset_id(db->db_objset);
1502
1503 ASSERT3P(db->db_buf, !=, NULL);
1504 ASSERT3U(dsobj, !=, 0);
1505
1506 dmu_buf_will_change_crypt_params(handle, tx);
1507
1508 DB_DNODE_ENTER(db);
1509 type = DB_DNODE(db)->dn_type;
1510 DB_DNODE_EXIT(db);
1511
1512 /*
1513 * This technically violates the assumption the dmu code makes
1514 * that dnode blocks are only released in syncing context.
1515 */
1516 (void) arc_release(db->db_buf, db);
1517 arc_convert_to_raw(db->db_buf, dsobj, byteorder, type, salt, iv, mac);
1518}
1519
1520void
1521dmu_copy_from_buf(objset_t *os, uint64_t object, uint64_t offset,
1522 dmu_buf_t *handle, dmu_tx_t *tx)
1523{
1524 dmu_buf_t *dst_handle;
1525 dmu_buf_impl_t *dstdb;
1526 dmu_buf_impl_t *srcdb = (dmu_buf_impl_t *)handle;
1527 arc_buf_t *abuf;
1528 uint64_t datalen;
1529 boolean_t byteorder;
1530 uint8_t salt[ZIO_DATA_SALT_LEN];
1531 uint8_t iv[ZIO_DATA_IV_LEN];
1532 uint8_t mac[ZIO_DATA_MAC_LEN];
1533
1534 ASSERT3P(srcdb->db_buf, !=, NULL);
1535
1536 /* hold the db that we want to write to */
1537 VERIFY0(dmu_buf_hold(os, object, offset, FTAG, &dst_handle,
1538 DMU_READ_NO_DECRYPT));
1539 dstdb = (dmu_buf_impl_t *)dst_handle;
1540 datalen = arc_buf_size(srcdb->db_buf);
1541
1542 /* allocated an arc buffer that matches the type of srcdb->db_buf */
1543 if (arc_is_encrypted(srcdb->db_buf)) {
1544 arc_get_raw_params(srcdb->db_buf, &byteorder, salt, iv, mac);
1545 abuf = arc_loan_raw_buf(os->os_spa, dmu_objset_id(os),
1546 byteorder, salt, iv, mac, DB_DNODE(dstdb)->dn_type,
1547 datalen, arc_buf_lsize(srcdb->db_buf),
1548 arc_get_compression(srcdb->db_buf));
1549 } else {
1550 /* we won't get a compressed db back from dmu_buf_hold() */
1551 ASSERT3U(arc_get_compression(srcdb->db_buf),
1552 ==, ZIO_COMPRESS_OFF);
1553 abuf = arc_loan_buf(os->os_spa,
1554 DMU_OT_IS_METADATA(DB_DNODE(dstdb)->dn_type), datalen);
1555 }
1556
1557 ASSERT3U(datalen, ==, arc_buf_size(abuf));
1558
1559 /* copy the data to the new buffer and assign it to the dstdb */
1560 bcopy(srcdb->db_buf->b_data, abuf->b_data, datalen);
1561 dbuf_assign_arcbuf(dstdb, abuf, tx);
1562 dmu_buf_rele(dst_handle, FTAG);
1563}
1564
1565/*
1566 * When possible directly assign passed loaned arc buffer to a dbuf.
1567 * If this is not possible copy the contents of passed arc buf via
1568 * dmu_write().
1569 */
1570void
1571dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
1572 dmu_tx_t *tx)
1573{
 1574 dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
 1575 dnode_t *dn;
 1576 dmu_buf_impl_t *db;
 1577 uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
1578 uint64_t blkid;
1579
 1580 DB_DNODE_ENTER(dbuf);
 1581 dn = DB_DNODE(dbuf);
 1582 rw_enter(&dn->dn_struct_rwlock, RW_READER);
 1583 blkid = dbuf_whichblock(dn, 0, offset);
1584 VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
1585 rw_exit(&dn->dn_struct_rwlock);
 1586 DB_DNODE_EXIT(dbuf);
 1587
 1588 /*
 1589 * We can only assign if the offset is aligned, the arc buf is the
 1590 * same size as the dbuf, and the dbuf is not metadata.
 1591 */
 1592 if (offset == db->db.db_offset && blksz == db->db.db_size) {
1593 dbuf_assign_arcbuf(db, buf, tx);
1594 dbuf_rele(db, FTAG);
1595 } else {
1596 objset_t *os;
1597 uint64_t object;
1598
1599 /* compressed bufs must always be assignable to their dbuf */
1600 ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
 1601 ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
 1602
1603 DB_DNODE_ENTER(dbuf);
1604 dn = DB_DNODE(dbuf);
1605 os = dn->dn_objset;
1606 object = dn->dn_object;
1607 DB_DNODE_EXIT(dbuf);
1608
 1609 dbuf_rele(db, FTAG);
 1610 dmu_write(os, object, offset, blksz, buf->b_data, tx);
 1611 dmu_return_arcbuf(buf);
 1612 XUIOSTAT_BUMP(xuiostat_wbuf_copied);
1613 }
1614}
1615
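An illustrative sketch (not part of dmu.c) of the loaned-buffer fast path above: request a loaned arc buffer against a held dbuf handle, fill it, and hand ownership to the DMU. The helper is hypothetical, and `blksz' must match the object's block size for the direct-assign case (otherwise dmu_assign_arcbuf() falls back to a copy).

static void
example_loaned_write(dmu_buf_t *handle, uint64_t offset, int blksz,
    const void *src, dmu_tx_t *tx)
{
	arc_buf_t *abuf = dmu_request_arcbuf(handle, blksz);

	bcopy(src, abuf->b_data, blksz);
	/* ownership of abuf passes to the DMU here */
	dmu_assign_arcbuf(handle, offset, abuf, tx);
}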
 1616typedef struct {
1617 dbuf_dirty_record_t *dsa_dr;
1618 dmu_sync_cb_t *dsa_done;
1619 zgd_t *dsa_zgd;
1620 dmu_tx_t *dsa_tx;
1621} dmu_sync_arg_t;
1622
1623/* ARGSUSED */
1624static void
1625dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
1626{
 1627 dmu_sync_arg_t *dsa = varg;
 1628 dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
1629 blkptr_t *bp = zio->io_bp;
1630
1631 if (zio->io_error == 0) {
1632 if (BP_IS_HOLE(bp)) {
1633 /*
1634 * A block of zeros may compress to a hole, but the
1635 * block size still needs to be known for replay.
1636 */
1637 BP_SET_LSIZE(bp, db->db_size);
 1638 } else if (!BP_IS_EMBEDDED(bp)) {
 1639 ASSERT(BP_GET_LEVEL(bp) == 0);
 1640 BP_SET_FILL(bp, 1);
 1641 }
1642 }
1643}
1644
1645static void
1646dmu_sync_late_arrival_ready(zio_t *zio)
1647{
1648 dmu_sync_ready(zio, NULL, zio->io_private);
1649}
1650
1651/* ARGSUSED */
1652static void
1653dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
1654{
 1655 dmu_sync_arg_t *dsa = varg;
 1656 dbuf_dirty_record_t *dr = dsa->dsa_dr;
 1657 dmu_buf_impl_t *db = dr->dr_dbuf;
 1658
1659 mutex_enter(&db->db_mtx);
1660 ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
428870ff 1661 if (zio->io_error == 0) {
1662 dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
1663 if (dr->dt.dl.dr_nopwrite) {
1664 blkptr_t *bp = zio->io_bp;
1665 blkptr_t *bp_orig = &zio->io_bp_orig;
1666 uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
1667
1668 ASSERT(BP_EQUAL(bp, bp_orig));
 1669 VERIFY(BP_EQUAL(bp, db->db_blkptr));
 1670 ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
 1671 VERIFY(zio_checksum_table[chksum].ci_flags &
 1672 ZCHECKSUM_FLAG_NOPWRITE);
 1673 }
1674 dr->dt.dl.dr_overridden_by = *zio->io_bp;
1675 dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
1676 dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
1677
1678 /*
1679 * Old style holes are filled with all zeros, whereas
1680 * new-style holes maintain their lsize, type, level,
1681 * and birth time (see zio_write_compress). While we
1682 * need to reset the BP_SET_LSIZE() call that happened
1683 * in dmu_sync_ready for old style holes, we do *not*
1684 * want to wipe out the information contained in new
1685 * style holes. Thus, only zero out the block pointer if
1686 * it's an old style hole.
1687 */
1688 if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
1689 dr->dt.dl.dr_overridden_by.blk_birth == 0)
1690 BP_ZERO(&dr->dt.dl.dr_overridden_by);
1691 } else {
1692 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
1693 }
1694 cv_broadcast(&db->db_changed);
1695 mutex_exit(&db->db_mtx);
1696
 1697 dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 1698
1699 kmem_free(dsa, sizeof (*dsa));
1700}
1701
1702static void
1703dmu_sync_late_arrival_done(zio_t *zio)
1704{
1705 blkptr_t *bp = zio->io_bp;
1706 dmu_sync_arg_t *dsa = zio->io_private;
 1707 ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
1708
1709 if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
1710 ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
1711 ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
1712 ASSERT(zio->io_bp->blk_birth == zio->io_txg);
1713 ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
1714 zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
1715 }
1716
1717 dmu_tx_commit(dsa->dsa_tx);
1718
1719 dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1720
 1721 abd_put(zio->io_abd);
1722 kmem_free(dsa, sizeof (*dsa));
1723}
1724
1725static int
1726dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
 1727 zio_prop_t *zp, zbookmark_phys_t *zb)
1728{
1729 dmu_sync_arg_t *dsa;
1730 dmu_tx_t *tx;
1731
1732 tx = dmu_tx_create(os);
1733 dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
1734 if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
1735 dmu_tx_abort(tx);
 1736 /* Make zl_get_data do txg_wait_synced() */
1737 return (SET_ERROR(EIO));
1738 }
1739
 1740 dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1741 dsa->dsa_dr = NULL;
1742 dsa->dsa_done = done;
1743 dsa->dsa_zgd = zgd;
1744 dsa->dsa_tx = tx;
1745
1746 /*
1747 * Since we are currently syncing this txg, it's nontrivial to
1748 * determine what BP to nopwrite against, so we disable nopwrite.
1749 *
1750 * When syncing, the db_blkptr is initially the BP of the previous
1751 * txg. We can not nopwrite against it because it will be changed
1752 * (this is similar to the non-late-arrival case where the dbuf is
1753 * dirty in a future txg).
1754 *
1755 * Then dbuf_write_ready() sets bp_blkptr to the location we will write.
1756 * We can not nopwrite against it because although the BP will not
1757 * (typically) be changed, the data has not yet been persisted to this
1758 * location.
1759 *
1760 * Finally, when dbuf_write_done() is called, it is theoretically
1761 * possible to always nopwrite, because the data that was written in
1762 * this txg is the same data that we are trying to write. However we
1763 * would need to check that this dbuf is not dirty in any future
1764 * txg's (as we do in the normal dmu_sync() path). For simplicity, we
1765 * don't nopwrite in this case.
1766 */
1767 zp->zp_nopwrite = B_FALSE;
1768
1769 zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
1770 abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
1771 zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
1772 dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
1773 dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
1774
1775 return (0);
1776}
1777
1778/*
1779 * Intent log support: sync the block associated with db to disk.
1780 * N.B. and XXX: the caller is responsible for making sure that the
1781 * data isn't changing while dmu_sync() is writing it.
1782 *
1783 * Return values:
1784 *
 1785 * EEXIST: this txg has already been synced, so there's nothing to do.
1786 * The caller should not log the write.
1787 *
1788 * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
1789 * The caller should not log the write.
1790 *
1791 * EALREADY: this block is already in the process of being synced.
1792 * The caller should track its progress (somehow).
1793 *
1794 * EIO: could not do the I/O.
1795 * The caller should do a txg_wait_synced().
 1796 *
1797 * 0: the I/O has been initiated.
1798 * The caller should log this blkptr in the done callback.
1799 * It is possible that the I/O will fail, in which case
1800 * the error will be reported to the done callback and
1801 * propagated to pio from zio_done().
1802 */
1803int
 1804dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 1805{
1806 dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
1807 objset_t *os = db->db_objset;
1808 dsl_dataset_t *ds = os->os_dsl_dataset;
 1809 dbuf_dirty_record_t *dr;
 1810 dmu_sync_arg_t *dsa;
 1811 zbookmark_phys_t zb;
 1812 zio_prop_t zp;
 1813 dnode_t *dn;
 1814
 1815 ASSERT(pio != NULL);
1816 ASSERT(txg != 0);
1817
1818 SET_BOOKMARK(&zb, ds->ds_object,
1819 db->db.db_object, db->db_level, db->db_blkid);
1820
1821 DB_DNODE_ENTER(db);
1822 dn = DB_DNODE(db);
 1823 dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
 1824 DB_DNODE_EXIT(db);
1825
1826 /*
 1827 * If we're frozen (running ziltest), we always need to generate a bp.
 1828 */
1829 if (txg > spa_freeze_txg(os->os_spa))
1830 return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
1831
1832 /*
1833 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
1834 * and us. If we determine that this txg is not yet syncing,
1835 * but it begins to sync a moment later, that's OK because the
1836 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
34dc7c2f 1837 */
428870ff
BB
1838 mutex_enter(&db->db_mtx);
1839
1840 if (txg <= spa_last_synced_txg(os->os_spa)) {
34dc7c2f 1841 /*
428870ff 1842 * This txg has already synced. There's nothing to do.
34dc7c2f 1843 */
428870ff 1844 mutex_exit(&db->db_mtx);
2e528b49 1845 return (SET_ERROR(EEXIST));
34dc7c2f
BB
1846 }
1847
428870ff
BB
1848 if (txg <= spa_syncing_txg(os->os_spa)) {
1849 /*
1850 * This txg is currently syncing, so we can't mess with
1851 * the dirty record anymore; just write a new log block.
1852 */
1853 mutex_exit(&db->db_mtx);
1854 return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
34dc7c2f
BB
1855 }
1856
1857 dr = db->db_last_dirty;
428870ff 1858 while (dr && dr->dr_txg != txg)
34dc7c2f 1859 dr = dr->dr_next;
428870ff
BB
1860
1861 if (dr == NULL) {
34dc7c2f 1862 /*
428870ff 1863 * There's no dr for this dbuf, so it must have been freed.
34dc7c2f
BB
1864 * There's no need to log writes to freed blocks, so we're done.
1865 */
1866 mutex_exit(&db->db_mtx);
2e528b49 1867 return (SET_ERROR(ENOENT));
34dc7c2f
BB
1868 }
1869
03c6040b
GW
1870 ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
1871
02dc43bc
MA
1872 if (db->db_blkptr != NULL) {
1873 /*
1874 * We need to fill in zgd_bp with the current blkptr so that
1875 * the nopwrite code can check if we're writing the same
1876 * data that's already on disk. We can only nopwrite if we
1877 * are sure that after making the copy, db_blkptr will not
1878 * change until our i/o completes. We ensure this by
1879 * holding the db_mtx, and only allowing nopwrite if the
1880 * block is not already dirty (see below). This is verified
1881 * by dmu_sync_done(), which VERIFYs that the db_blkptr has
1882 * not changed.
1883 */
1884 *zgd->zgd_bp = *db->db_blkptr;
1885 }
1886
03c6040b 1887 /*
f3c517d8
MA
1888 * Assume the on-disk data is X, the current syncing data (in
1889 * txg - 1) is Y, and the current in-memory data is Z (currently
1890 * in dmu_sync).
1891 *
1892 * We usually want to perform a nopwrite if X and Z are the
1893 * same. However, if Y is different (i.e. the BP is going to
1894 * change before this write takes effect), then a nopwrite will
1895 * be incorrect - we would override with X, which could have
1896 * been freed when Y was written.
1897 *
1898 * (Note that this is not a concern when we are nop-writing from
1899 * syncing context, because X and Y must be identical, because
1900 * all previous txgs have been synced.)
1901 *
1902 * Therefore, we disable nopwrite if the current BP could change
1903 * before this TXG. There are two ways it could change: by
1904 * being dirty (dr_next is non-NULL), or by being freed
1905 * (dnode_block_freed()). This behavior is verified by
1906 * zio_done(), which VERIFYs that the override BP is identical
1907 * to the on-disk BP.
03c6040b 1908 */
f3c517d8
MA
1909 DB_DNODE_ENTER(db);
1910 dn = DB_DNODE(db);
1911 if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
03c6040b 1912 zp.zp_nopwrite = B_FALSE;
f3c517d8 1913 DB_DNODE_EXIT(db);
03c6040b 1914
34dc7c2f 1915 ASSERT(dr->dr_txg == txg);
428870ff
BB
1916 if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
1917 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
34dc7c2f 1918 /*
428870ff
BB
1919 * We have already issued a sync write for this buffer,
1920 * or this buffer has already been synced. It could not
34dc7c2f
BB
1921 * have been dirtied since, or we would have cleared the state.
1922 */
34dc7c2f 1923 mutex_exit(&db->db_mtx);
2e528b49 1924 return (SET_ERROR(EALREADY));
34dc7c2f
BB
1925 }
1926
428870ff 1927 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
34dc7c2f 1928 dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
34dc7c2f 1929 mutex_exit(&db->db_mtx);
34dc7c2f 1930
79c76d5b 1931 dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
428870ff
BB
1932 dsa->dsa_dr = dr;
1933 dsa->dsa_done = done;
1934 dsa->dsa_zgd = zgd;
1935 dsa->dsa_tx = NULL;
b128c09f 1936
428870ff 1937 zio_nowait(arc_write(pio, os->os_spa, txg,
02dc43bc 1938 zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
d3c2ae1c 1939 &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
bc77ba73 1940 ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
b128c09f 1941
428870ff 1942 return (0);
34dc7c2f
BB
1943}
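/*
 * Illustrative sketch (not part of the original source): a caller such as a
 * ZIL get-data callback would typically dispatch on the return values
 * documented above. The names "pio", "txg", "done_cb" and "zgd" stand in
 * for whatever the caller has already set up.
 *
 *	error = dmu_sync(pio, txg, done_cb, zgd);
 *	switch (error) {
 *	case 0:		// I/O issued; log the blkptr from done_cb
 *		break;
 *	case EEXIST:	// txg already synced
 *	case ENOENT:	// block was freed
 *		break;	// do not log the write
 *	case EALREADY:	// a sync write is already in flight
 *		break;	// track its completion instead
 *	case EIO:
 *		txg_wait_synced(dmu_objset_pool(os), 0);
 *		break;
 *	}
 */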
1944
b5256303
TC
1945int
1946dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx)
1947{
1948 dnode_t *dn;
1949 int err;
1950
1951 err = dnode_hold(os, object, FTAG, &dn);
1952 if (err)
1953 return (err);
1954 err = dnode_set_nlevels(dn, nlevels, tx);
1955 dnode_rele(dn, FTAG);
1956 return (err);
1957}
1958
34dc7c2f
BB
1959int
1960dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
4ea3f864 1961 dmu_tx_t *tx)
34dc7c2f
BB
1962{
1963 dnode_t *dn;
1964 int err;
1965
428870ff 1966 err = dnode_hold(os, object, FTAG, &dn);
34dc7c2f
BB
1967 if (err)
1968 return (err);
1969 err = dnode_set_blksz(dn, size, ibs, tx);
1970 dnode_rele(dn, FTAG);
1971 return (err);
1972}
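/*
 * Illustrative sketch (not part of the original source, transaction setup
 * elided): block size changes are transactional like the other setters
 * below, and can fail, e.g. once the object already has data spread over
 * multiple blocks. Passing 0 for ibs leaves the indirect block shift alone.
 *
 *	err = dmu_object_set_blocksize(os, object, 128 * 1024, 0, tx);
 */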
1973
1974void
1975dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
4ea3f864 1976 dmu_tx_t *tx)
34dc7c2f
BB
1977{
1978 dnode_t *dn;
1979
9b67f605
MA
1980 /*
1981 * Send streams include each object's checksum function. This
1982 * check ensures that the receiving system can understand the
1983 * checksum function transmitted.
1984 */
1985 ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
1986
1987 VERIFY0(dnode_hold(os, object, FTAG, &dn));
1988 ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
34dc7c2f
BB
1989 dn->dn_checksum = checksum;
1990 dnode_setdirty(dn, tx);
1991 dnode_rele(dn, FTAG);
1992}
1993
1994void
1995dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
4ea3f864 1996 dmu_tx_t *tx)
34dc7c2f
BB
1997{
1998 dnode_t *dn;
1999
9b67f605
MA
2000 /*
2001 * Send streams include each object's compression function. This
2002 * check ensures that the receiving system can understand the
2003 * compression function transmitted.
2004 */
2005 ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
2006
2007 VERIFY0(dnode_hold(os, object, FTAG, &dn));
34dc7c2f
BB
2008 dn->dn_compress = compress;
2009 dnode_setdirty(dn, tx);
2010 dnode_rele(dn, FTAG);
2011}
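/*
 * Illustrative sketch (not part of the original source): both property
 * setters above take effect only within an assigned transaction, e.g.:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_bonus(tx, object);
 *	if ((err = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
 *		dmu_tx_abort(tx);
 *		return (err);
 *	}
 *	dmu_object_set_checksum(os, object, ZIO_CHECKSUM_FLETCHER_4, tx);
 *	dmu_object_set_compress(os, object, ZIO_COMPRESS_LZ4, tx);
 *	dmu_tx_commit(tx);
 */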
2012
428870ff
BB
2013int zfs_mdcomp_disable = 0;
2014
faf0f58c
MA
2015/*
2016 * When the "redundant_metadata" property is set to "most", only indirect
2017 * blocks of this level and higher will have an additional ditto block.
2018 */
2019int zfs_redundant_metadata_most_ditto_level = 2;
2020
428870ff 2021void
82644107 2022dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
428870ff
BB
2023{
2024 dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
9ae529ec 2025 boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
572e2857 2026 (wp & WP_SPILL));
428870ff
BB
2027 enum zio_checksum checksum = os->os_checksum;
2028 enum zio_compress compress = os->os_compress;
2029 enum zio_checksum dedup_checksum = os->os_dedup_checksum;
03c6040b
GW
2030 boolean_t dedup = B_FALSE;
2031 boolean_t nopwrite = B_FALSE;
428870ff 2032 boolean_t dedup_verify = os->os_dedup_verify;
b5256303 2033 boolean_t encrypt = B_FALSE;
428870ff 2034 int copies = os->os_copies;
a7004725 2035
428870ff 2036 /*
03c6040b
GW
2037 * We maintain different write policies for each of the following
2038 * types of data:
2039 * 1. metadata
2040 * 2. preallocated blocks (i.e. level-0 blocks of a dump device)
2041 * 3. all other level 0 blocks
428870ff
BB
2042 */
2043 if (ismd) {
62bdd5eb
DL
2044 if (zfs_mdcomp_disable) {
2045 compress = ZIO_COMPRESS_EMPTY;
62bdd5eb 2046 } else {
99197f03
JG
2047 /*
2048 * XXX -- we should design a compression algorithm
2049 * that specializes in arrays of bps.
2050 */
2051 compress = zio_compress_select(os->os_spa,
2052 ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
62bdd5eb 2053 }
03c6040b 2054
428870ff
BB
2055 /*
2056 * Metadata always gets checksummed. If the data
2057 * checksum is multi-bit correctable, and it's not a
2058 * ZBT-style checksum, then it's suitable for metadata
2059 * as well. Otherwise, the metadata checksum defaults
2060 * to fletcher4.
2061 */
3c67d83a
TH
2062 if (!(zio_checksum_table[checksum].ci_flags &
2063 ZCHECKSUM_FLAG_METADATA) ||
2064 (zio_checksum_table[checksum].ci_flags &
2065 ZCHECKSUM_FLAG_EMBEDDED))
428870ff 2066 checksum = ZIO_CHECKSUM_FLETCHER_4;
faf0f58c
MA
2067
2068 if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
2069 (os->os_redundant_metadata ==
2070 ZFS_REDUNDANT_METADATA_MOST &&
2071 (level >= zfs_redundant_metadata_most_ditto_level ||
2072 DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
2073 copies++;
03c6040b
GW
2074 } else if (wp & WP_NOFILL) {
2075 ASSERT(level == 0);
428870ff 2076
428870ff 2077 /*
03c6040b
GW
2078 * If we're writing preallocated blocks, we aren't actually
2079 * writing them so don't set any policy properties. These
2080 * blocks are currently only used by an external subsystem
2081 * outside of zfs (i.e. dump) and not written by the zio
2082 * pipeline.
428870ff 2083 */
03c6040b
GW
2084 compress = ZIO_COMPRESS_OFF;
2085 checksum = ZIO_CHECKSUM_OFF;
428870ff 2086 } else {
99197f03
JG
2087 compress = zio_compress_select(os->os_spa, dn->dn_compress,
2088 compress);
428870ff 2089
03c6040b
GW
2090 checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
2091 zio_checksum_select(dn->dn_checksum, checksum) :
2092 dedup_checksum;
428870ff 2093
03c6040b
GW
2094 /*
2095 * Determine dedup setting. If we are in dmu_sync(),
2096 * we won't actually dedup now because that's all
2097 * done in syncing context; but we do want to use the
2098 * dedup checksum. If the checksum is not strong
2099 * enough to ensure unique signatures, force
2100 * dedup_verify.
2101 */
2102 if (dedup_checksum != ZIO_CHECKSUM_OFF) {
2103 dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
3c67d83a
TH
2104 if (!(zio_checksum_table[checksum].ci_flags &
2105 ZCHECKSUM_FLAG_DEDUP))
03c6040b
GW
2106 dedup_verify = B_TRUE;
2107 }
428870ff 2108
03c6040b 2109 /*
3c67d83a
TH
2110 * Enable nopwrite if we have a secure enough checksum
2111 * algorithm (see comment in zio_nop_write) and
2112 * compression is enabled. We don't enable nopwrite if
2113 * dedup is enabled as the two features are mutually
2114 * exclusive.
03c6040b 2115 */
3c67d83a
TH
2116 nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
2117 ZCHECKSUM_FLAG_NOPWRITE) &&
03c6040b 2118 compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
428870ff
BB
2119 }
2120
b5256303
TC
2121 /*
2122 * All objects in an encrypted objset are protected from modification
2123 * via a MAC. Encrypted objects store their IV and salt in the last DVA
2124 * in the bp, so we cannot use all copies. Encrypted objects are also
2125 * not subject to nopwrite since writing the same data will still
2126 * result in a new ciphertext. Only encrypted blocks can be dedup'd
2127 * to avoid ambiguity in the dedup code since the DDT does not store
2128 * object types.
2129 */
2130 if (os->os_encrypted && (wp & WP_NOFILL) == 0) {
2131 encrypt = B_TRUE;
2132
2133 if (DMU_OT_IS_ENCRYPTED(type)) {
2134 copies = MIN(copies, SPA_DVAS_PER_BP - 1);
2135 nopwrite = B_FALSE;
2136 } else {
2137 dedup = B_FALSE;
2138 }
2139
2140 if (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)
2141 compress = ZIO_COMPRESS_EMPTY;
2142 }
2aa34383 2143
b5256303
TC
2144 zp->zp_compress = compress;
2145 zp->zp_checksum = checksum;
428870ff
BB
2146 zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
2147 zp->zp_level = level;
faf0f58c 2148 zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
428870ff
BB
2149 zp->zp_dedup = dedup;
2150 zp->zp_dedup_verify = dedup && dedup_verify;
03c6040b 2151 zp->zp_nopwrite = nopwrite;
b5256303
TC
2152 zp->zp_encrypt = encrypt;
2153 zp->zp_byteorder = ZFS_HOST_BYTEORDER;
2154 bzero(zp->zp_salt, ZIO_DATA_SALT_LEN);
2155 bzero(zp->zp_iv, ZIO_DATA_IV_LEN);
2156 bzero(zp->zp_mac, ZIO_DATA_MAC_LEN);
2157
2158 ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
428870ff
BB
2159}
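/*
 * Illustrative sketch (not part of the original source): a typical caller
 * builds the write policy for a level-0 data block while the dnode is held,
 * then hands the resulting zio_prop_t to the write path:
 *
 *	zio_prop_t zp;
 *
 *	DB_DNODE_ENTER(db);
 *	dmu_write_policy(os, DB_DNODE(db), 0, 0, &zp);
 *	DB_DNODE_EXIT(db);
 *	// zp now carries the checksum, compression, copies, dedup,
 *	// nopwrite and encryption settings derived above.
 */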
2160
66aca247
DB
2161/*
2162 * This function is only called from zfs_holey_common() for zpl_llseek()
2163 * in order to determine the location of holes. In order to accurately
2164 * report holes, all dirty data must be synced to disk. This causes extremely
2165 * poor performance when seeking for holes in a dirty file. As a compromise,
2166 * only provide hole data when the dnode is clean. When a dnode is dirty,
2167 * report the dnode as having no holes, which is always a safe thing to do.
2168 */
34dc7c2f
BB
2169int
2170dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
2171{
2172 dnode_t *dn;
2173 int i, err;
66aca247 2174 boolean_t clean = B_TRUE;
34dc7c2f 2175
428870ff 2176 err = dnode_hold(os, object, FTAG, &dn);
34dc7c2f
BB
2177 if (err)
2178 return (err);
66aca247 2179
34dc7c2f 2180 /*
66aca247 2181 * Check if dnode is dirty
34dc7c2f 2182 */
66aca247
DB
2183 if (dn->dn_dirtyctx != DN_UNDIRTIED) {
2184 for (i = 0; i < TXG_SIZE; i++) {
2185 if (!list_is_empty(&dn->dn_dirty_records[i])) {
2186 clean = B_FALSE;
2187 break;
2188 }
2189 }
34dc7c2f 2190 }
66aca247
DB
2191
2192 /*
2193 * If compatibility option is on, sync any current changes before
2194 * we go trundling through the block pointers.
2195 */
2196 if (!clean && zfs_dmu_offset_next_sync) {
2197 clean = B_TRUE;
34dc7c2f
BB
2198 dnode_rele(dn, FTAG);
2199 txg_wait_synced(dmu_objset_pool(os), 0);
428870ff 2200 err = dnode_hold(os, object, FTAG, &dn);
34dc7c2f
BB
2201 if (err)
2202 return (err);
2203 }
2204
66aca247
DB
2205 if (clean)
2206 err = dnode_next_offset(dn,
2207 (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
2208 else
2209 err = SET_ERROR(EBUSY);
2210
34dc7c2f
BB
2211 dnode_rele(dn, FTAG);
2212
2213 return (err);
2214}
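/*
 * Illustrative sketch (not part of the original source, calling convention
 * assumed): the SEEK_HOLE/SEEK_DATA path passes the current file offset in
 * and reads the result back out on success; EBUSY signals that the dnode
 * was dirty and zfs_dmu_offset_next_sync was not set, so no hole data is
 * reported.
 *
 *	uint64_t off = *offset;
 *	error = dmu_offset_next(os, object, hole, &off);
 *	if (error == 0)
 *		*offset = off;
 */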
2215
2216void
e0b0ca98 2217__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
34dc7c2f 2218{
e0b0ca98 2219 dnode_phys_t *dnp = dn->dn_phys;
d6320ddb 2220 int i;
428870ff 2221
34dc7c2f
BB
2222 doi->doi_data_block_size = dn->dn_datablksz;
2223 doi->doi_metadata_block_size = dn->dn_indblkshift ?
2224 1ULL << dn->dn_indblkshift : 0;
428870ff
BB
2225 doi->doi_type = dn->dn_type;
2226 doi->doi_bonus_type = dn->dn_bonustype;
2227 doi->doi_bonus_size = dn->dn_bonuslen;
50c957f7 2228 doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
34dc7c2f
BB
2229 doi->doi_indirection = dn->dn_nlevels;
2230 doi->doi_checksum = dn->dn_checksum;
2231 doi->doi_compress = dn->dn_compress;
6c59307a 2232 doi->doi_nblkptr = dn->dn_nblkptr;
428870ff 2233 doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
d1fada1e 2234 doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
428870ff 2235 doi->doi_fill_count = 0;
d6320ddb 2236 for (i = 0; i < dnp->dn_nblkptr; i++)
9b67f605 2237 doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
e0b0ca98
BB
2238}
2239
2240void
2241dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
2242{
2243 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2244 mutex_enter(&dn->dn_mtx);
2245
2246 __dmu_object_info_from_dnode(dn, doi);
34dc7c2f
BB
2247
2248 mutex_exit(&dn->dn_mtx);
2249 rw_exit(&dn->dn_struct_rwlock);
2250}
2251
2252/*
2253 * Get information on a DMU object.
2254 * If doi is NULL, just indicates whether the object exists.
2255 */
2256int
2257dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
2258{
2259 dnode_t *dn;
428870ff 2260 int err = dnode_hold(os, object, FTAG, &dn);
34dc7c2f
BB
2261
2262 if (err)
2263 return (err);
2264
2265 if (doi != NULL)
2266 dmu_object_info_from_dnode(dn, doi);
2267
2268 dnode_rele(dn, FTAG);
2269 return (0);
2270}
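/*
 * Illustrative sketch (not part of the original source): probing an object
 * and reading back its basic geometry:
 *
 *	dmu_object_info_t doi;
 *
 *	if (dmu_object_info(os, object, &doi) == 0) {
 *		// doi.doi_type, doi.doi_data_block_size,
 *		// doi.doi_max_offset, etc. are now valid
 *	}
 *	// passing NULL for doi simply tests whether the object exists
 */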
2271
2272/*
2273 * As above, but faster; can be used when you have a held dbuf in hand.
2274 */
2275void
572e2857 2276dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
34dc7c2f 2277{
572e2857
BB
2278 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2279
2280 DB_DNODE_ENTER(db);
2281 dmu_object_info_from_dnode(DB_DNODE(db), doi);
2282 DB_DNODE_EXIT(db);
34dc7c2f
BB
2283}
2284
2285/*
2286 * Faster still when you only care about the size.
2287 * This is specifically optimized for zfs_getattr().
2288 */
2289void
572e2857
BB
2290dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
2291 u_longlong_t *nblk512)
34dc7c2f 2292{
572e2857
BB
2293 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2294 dnode_t *dn;
2295
2296 DB_DNODE_ENTER(db);
2297 dn = DB_DNODE(db);
34dc7c2f
BB
2298
2299 *blksize = dn->dn_datablksz;
50c957f7 2300 /* add in number of slots used for the dnode itself */
34dc7c2f 2301 *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
50c957f7
NB
2302 SPA_MINBLOCKSHIFT) + dn->dn_num_slots;
2303 DB_DNODE_EXIT(db);
2304}
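/*
 * Worked example (not part of the original source): with SPA_MINBLOCKSIZE
 * of 512 and SPA_MINBLOCKSHIFT of 9, an object with DN_USED_BYTES() == 3000
 * that occupies a single dnode slot reports
 *
 *	nblk512 = ((3000 + 256) >> 9) + 1 = 6 + 1 = 7
 *
 * i.e. the space used rounded to the nearest 512-byte block, plus one
 * block per slot consumed by the dnode itself.
 */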
2305
2306void
2307dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
2308{
2309 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2310 dnode_t *dn;
2311
2312 DB_DNODE_ENTER(db);
2313 dn = DB_DNODE(db);
2314 *dnsize = dn->dn_num_slots << DNODE_SHIFT;
572e2857 2315 DB_DNODE_EXIT(db);
34dc7c2f
BB
2316}
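/*
 * Worked example (not part of the original source): with DNODE_SHIFT of 9,
 * a dnode occupying two 512-byte slots reports dnsize = 2 << 9 = 1024.
 */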
2317
2318void
2319byteswap_uint64_array(void *vbuf, size_t size)
2320{
2321 uint64_t *buf = vbuf;
2322 size_t count = size >> 3;
2323 int i;
2324
2325 ASSERT((size & 7) == 0);
2326
2327 for (i = 0; i < count; i++)
2328 buf[i] = BSWAP_64(buf[i]);
2329}
2330
2331void
2332byteswap_uint32_array(void *vbuf, size_t size)
2333{
2334 uint32_t *buf = vbuf;
2335 size_t count = size >> 2;
2336 int i;
2337
2338 ASSERT((size & 3) == 0);
2339
2340 for (i = 0; i < count; i++)
2341 buf[i] = BSWAP_32(buf[i]);
2342}
2343
2344void
2345byteswap_uint16_array(void *vbuf, size_t size)
2346{
2347 uint16_t *buf = vbuf;
2348 size_t count = size >> 1;
2349 int i;
2350
2351 ASSERT((size & 1) == 0);
2352
2353 for (i = 0; i < count; i++)
2354 buf[i] = BSWAP_16(buf[i]);
2355}
2356
2357/* ARGSUSED */
2358void
2359byteswap_uint8_array(void *vbuf, size_t size)
2360{
2361}
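/*
 * Illustrative sketch (not part of the original source): these helpers
 * byteswap a buffer in place and are invoked through the per-type byteswap
 * table when reading data written on a host of the opposite endianness:
 *
 *	uint64_t vals[2] = { 0x0102030405060708ULL, 0x1112131415161718ULL };
 *
 *	byteswap_uint64_array(vals, sizeof (vals));
 *	// vals[0] is now 0x0807060504030201ULL
 */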
2362
2363void
2364dmu_init(void)
2365{
a6255b7f 2366 abd_init();
428870ff 2367 zfs_dbgmsg_init();
572e2857
BB
2368 sa_cache_init();
2369 xuio_stat_init();
2370 dmu_objset_init();
34dc7c2f 2371 dnode_init();
428870ff 2372 zfetch_init();
570827e1 2373 dmu_tx_init();
34dc7c2f 2374 l2arc_init();
29809a6c 2375 arc_init();
d3c2ae1c 2376 dbuf_init();
34dc7c2f
BB
2377}
2378
2379void
2380dmu_fini(void)
2381{
e49f1e20 2382 arc_fini(); /* arc depends on l2arc, so arc must go first */
29809a6c 2383 l2arc_fini();
570827e1 2384 dmu_tx_fini();
428870ff 2385 zfetch_fini();
34dc7c2f 2386 dbuf_fini();
572e2857
BB
2387 dnode_fini();
2388 dmu_objset_fini();
428870ff
BB
2389 xuio_stat_fini();
2390 sa_cache_fini();
2391 zfs_dbgmsg_fini();
a6255b7f 2392 abd_fini();
34dc7c2f 2393}
c28b2279
BB
2394
2395#if defined(_KERNEL) && defined(HAVE_SPL)
2396EXPORT_SYMBOL(dmu_bonus_hold);
a473d90c
AZ
2397EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
2398EXPORT_SYMBOL(dmu_buf_rele_array);
57b650b8 2399EXPORT_SYMBOL(dmu_prefetch);
c28b2279 2400EXPORT_SYMBOL(dmu_free_range);
57b650b8 2401EXPORT_SYMBOL(dmu_free_long_range);
b663a23d 2402EXPORT_SYMBOL(dmu_free_long_object);
c28b2279 2403EXPORT_SYMBOL(dmu_read);
0eef1bde 2404EXPORT_SYMBOL(dmu_read_by_dnode);
c28b2279 2405EXPORT_SYMBOL(dmu_write);
0eef1bde 2406EXPORT_SYMBOL(dmu_write_by_dnode);
57b650b8 2407EXPORT_SYMBOL(dmu_prealloc);
c28b2279
BB
2408EXPORT_SYMBOL(dmu_object_info);
2409EXPORT_SYMBOL(dmu_object_info_from_dnode);
2410EXPORT_SYMBOL(dmu_object_info_from_db);
2411EXPORT_SYMBOL(dmu_object_size_from_db);
50c957f7 2412EXPORT_SYMBOL(dmu_object_dnsize_from_db);
b5256303 2413EXPORT_SYMBOL(dmu_object_set_nlevels);
c28b2279
BB
2414EXPORT_SYMBOL(dmu_object_set_blocksize);
2415EXPORT_SYMBOL(dmu_object_set_checksum);
2416EXPORT_SYMBOL(dmu_object_set_compress);
57b650b8
BB
2417EXPORT_SYMBOL(dmu_write_policy);
2418EXPORT_SYMBOL(dmu_sync);
b10c77f7
BB
2419EXPORT_SYMBOL(dmu_request_arcbuf);
2420EXPORT_SYMBOL(dmu_return_arcbuf);
2421EXPORT_SYMBOL(dmu_assign_arcbuf);
2422EXPORT_SYMBOL(dmu_buf_hold);
c28b2279 2423EXPORT_SYMBOL(dmu_ot);
afec56b4 2424
bef78122 2425/* BEGIN CSTYLED */
afec56b4
BB
2426module_param(zfs_mdcomp_disable, int, 0644);
2427MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable metadata compression");
03c6040b
GW
2428
2429module_param(zfs_nopwrite_enabled, int, 0644);
2430MODULE_PARM_DESC(zfs_nopwrite_enabled, "Enable NOP writes");
2431
bef78122
DQ
2432module_param(zfs_per_txg_dirty_frees_percent, ulong, 0644);
2433MODULE_PARM_DESC(zfs_per_txg_dirty_frees_percent,
2434 "percentage of dirtied blocks from frees in one TXG");
66aca247
DB
2435
2436module_param(zfs_dmu_offset_next_sync, int, 0644);
2437MODULE_PARM_DESC(zfs_dmu_offset_next_sync,
2438 "Enable forcing txg sync to find holes");
2439
bef78122 2440/* END CSTYLED */
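/*
 * Note (not part of the original source): as ordinary Linux module
 * parameters with mode 0644, the tunables above can typically be set at
 * load time (e.g. "modprobe zfs zfs_nopwrite_enabled=0") or adjusted at
 * runtime through /sys/module/zfs/parameters/<name>.
 */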
66aca247 2441
c28b2279 2442#endif