[mirror_zfs-debian.git] / module / zfs / dmu.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
24 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26 * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved.
27 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
28 */
29
30 #include <sys/dmu.h>
31 #include <sys/dmu_impl.h>
32 #include <sys/dmu_tx.h>
33 #include <sys/dbuf.h>
34 #include <sys/dnode.h>
35 #include <sys/zfs_context.h>
36 #include <sys/dmu_objset.h>
37 #include <sys/dmu_traverse.h>
38 #include <sys/dsl_dataset.h>
39 #include <sys/dsl_dir.h>
40 #include <sys/dsl_pool.h>
41 #include <sys/dsl_synctask.h>
42 #include <sys/dsl_prop.h>
43 #include <sys/dmu_zfetch.h>
44 #include <sys/zfs_ioctl.h>
45 #include <sys/zap.h>
46 #include <sys/zio_checksum.h>
47 #include <sys/zio_compress.h>
48 #include <sys/sa.h>
49 #include <sys/zfeature.h>
50 #include <sys/abd.h>
51 #include <sys/trace_dmu.h>
52 #include <sys/zfs_rlock.h>
53 #ifdef _KERNEL
54 #include <sys/vmsystm.h>
55 #include <sys/zfs_znode.h>
56 #endif
57
58 /*
59 * Enable/disable nopwrite feature.
60 */
61 int zfs_nopwrite_enabled = 1;
62
63 /*
64 * Tunable to control percentage of dirtied blocks from frees in one TXG.
65 * After this threshold is crossed, additional dirty blocks from frees
66 * wait until the next TXG.
67 * A value of zero will disable this throttle.
68 */
69 unsigned long zfs_per_txg_dirty_frees_percent = 30;
70
71 /*
72 * Enable/disable forcing txg sync when dirty in dmu_offset_next.
73 */
74 int zfs_dmu_offset_next_sync = 0;
75
76 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
77 { DMU_BSWAP_UINT8, TRUE, "unallocated" },
78 { DMU_BSWAP_ZAP, TRUE, "object directory" },
79 { DMU_BSWAP_UINT64, TRUE, "object array" },
80 { DMU_BSWAP_UINT8, TRUE, "packed nvlist" },
81 { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" },
82 { DMU_BSWAP_UINT64, TRUE, "bpobj" },
83 { DMU_BSWAP_UINT64, TRUE, "bpobj header" },
84 { DMU_BSWAP_UINT64, TRUE, "SPA space map header" },
85 { DMU_BSWAP_UINT64, TRUE, "SPA space map" },
86 { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" },
87 { DMU_BSWAP_DNODE, TRUE, "DMU dnode" },
88 { DMU_BSWAP_OBJSET, TRUE, "DMU objset" },
89 { DMU_BSWAP_UINT64, TRUE, "DSL directory" },
90 { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"},
91 { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" },
92 { DMU_BSWAP_ZAP, TRUE, "DSL props" },
93 { DMU_BSWAP_UINT64, TRUE, "DSL dataset" },
94 { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" },
95 { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" },
96 { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" },
97 { DMU_BSWAP_ZAP, TRUE, "ZFS directory" },
98 { DMU_BSWAP_ZAP, TRUE, "ZFS master node" },
99 { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" },
100 { DMU_BSWAP_UINT8, FALSE, "zvol object" },
101 { DMU_BSWAP_ZAP, TRUE, "zvol prop" },
102 { DMU_BSWAP_UINT8, FALSE, "other uint8[]" },
103 { DMU_BSWAP_UINT64, FALSE, "other uint64[]" },
104 { DMU_BSWAP_ZAP, TRUE, "other ZAP" },
105 { DMU_BSWAP_ZAP, TRUE, "persistent error log" },
106 { DMU_BSWAP_UINT8, TRUE, "SPA history" },
107 { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" },
108 { DMU_BSWAP_ZAP, TRUE, "Pool properties" },
109 { DMU_BSWAP_ZAP, TRUE, "DSL permissions" },
110 { DMU_BSWAP_ACL, TRUE, "ZFS ACL" },
111 { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" },
112 { DMU_BSWAP_UINT8, TRUE, "FUID table" },
113 { DMU_BSWAP_UINT64, TRUE, "FUID table size" },
114 { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"},
115 { DMU_BSWAP_ZAP, TRUE, "scan work queue" },
116 { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" },
117 { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" },
118 { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"},
119 { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" },
120 { DMU_BSWAP_ZAP, TRUE, "DDT statistics" },
121 { DMU_BSWAP_UINT8, TRUE, "System attributes" },
122 { DMU_BSWAP_ZAP, TRUE, "SA master node" },
123 { DMU_BSWAP_ZAP, TRUE, "SA attr registration" },
124 { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" },
125 { DMU_BSWAP_ZAP, TRUE, "scan translations" },
126 { DMU_BSWAP_UINT8, FALSE, "deduplicated block" },
127 { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" },
128 { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" },
129 { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" },
130 { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" }
131 };
132
133 const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
134 { byteswap_uint8_array, "uint8" },
135 { byteswap_uint16_array, "uint16" },
136 { byteswap_uint32_array, "uint32" },
137 { byteswap_uint64_array, "uint64" },
138 { zap_byteswap, "zap" },
139 { dnode_buf_byteswap, "dnode" },
140 { dmu_objset_byteswap, "objset" },
141 { zfs_znode_byteswap, "znode" },
142 { zfs_oldacl_byteswap, "oldacl" },
143 { zfs_acl_byteswap, "acl" }
144 };
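
/*
 * Illustrative sketch (not part of the original file): how the two tables
 * above are typically consumed.  dmu_ot[] records, per object type, its
 * name, whether it is metadata, and which byteswap class it uses;
 * dmu_ot_byteswap[] maps that class to the swap routine.  The type and
 * field names (dmu_object_byteswap_t, DMU_OT_BYTESWAP(), ob_func) are
 * assumed to match sys/dmu.h.
 */
static void
dmu_example_byteswap(dmu_object_type_t type, void *buf, size_t size)
{
	dmu_object_byteswap_t bswap;

	if (!DMU_OT_IS_VALID(type))
		return;

	/* Look up the byteswap class for this object type ... */
	bswap = DMU_OT_BYTESWAP(type);

	/* ... and dispatch to the matching swap function from the table. */
	dmu_ot_byteswap[bswap].ob_func(buf, size);
}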
145
146 int
147 dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
148 void *tag, dmu_buf_t **dbp)
149 {
150 uint64_t blkid;
151 dmu_buf_impl_t *db;
152
153 blkid = dbuf_whichblock(dn, 0, offset);
154 rw_enter(&dn->dn_struct_rwlock, RW_READER);
155 db = dbuf_hold(dn, blkid, tag);
156 rw_exit(&dn->dn_struct_rwlock);
157
158 if (db == NULL) {
159 *dbp = NULL;
160 return (SET_ERROR(EIO));
161 }
162
163 *dbp = &db->db;
164 return (0);
165 }
166 int
167 dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
168 void *tag, dmu_buf_t **dbp)
169 {
170 dnode_t *dn;
171 uint64_t blkid;
172 dmu_buf_impl_t *db;
173 int err;
174
175 err = dnode_hold(os, object, FTAG, &dn);
176 if (err)
177 return (err);
178 blkid = dbuf_whichblock(dn, 0, offset);
179 rw_enter(&dn->dn_struct_rwlock, RW_READER);
180 db = dbuf_hold(dn, blkid, tag);
181 rw_exit(&dn->dn_struct_rwlock);
182 dnode_rele(dn, FTAG);
183
184 if (db == NULL) {
185 *dbp = NULL;
186 return (SET_ERROR(EIO));
187 }
188
189 *dbp = &db->db;
190 return (err);
191 }
192
193 int
194 dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
195 void *tag, dmu_buf_t **dbp, int flags)
196 {
197 int err;
198 int db_flags = DB_RF_CANFAIL;
199
200 if (flags & DMU_READ_NO_PREFETCH)
201 db_flags |= DB_RF_NOPREFETCH;
202
203 err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
204 if (err == 0) {
205 dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
206 err = dbuf_read(db, NULL, db_flags);
207 if (err != 0) {
208 dbuf_rele(db, tag);
209 *dbp = NULL;
210 }
211 }
212
213 return (err);
214 }
215
216 int
217 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
218 void *tag, dmu_buf_t **dbp, int flags)
219 {
220 int err;
221 int db_flags = DB_RF_CANFAIL;
222
223 if (flags & DMU_READ_NO_PREFETCH)
224 db_flags |= DB_RF_NOPREFETCH;
225
226 err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
227 if (err == 0) {
228 dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
229 err = dbuf_read(db, NULL, db_flags);
230 if (err != 0) {
231 dbuf_rele(db, tag);
232 *dbp = NULL;
233 }
234 }
235
236 return (err);
237 }
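
/*
 * Illustrative sketch (not from the original file): the typical caller
 * pattern for dmu_buf_hold().  The hold pins and reads the single block
 * covering 'offset'; the caller must drop it with dmu_buf_rele() using
 * the same tag.  The object number and offset are placeholders.
 */
static int
dmu_example_peek(objset_t *os, uint64_t object, uint64_t offset)
{
	dmu_buf_t *db;
	int err;

	err = dmu_buf_hold(os, object, offset, FTAG, &db,
	    DMU_READ_PREFETCH);
	if (err != 0)
		return (err);

	/*
	 * db->db_data stays valid until the hold is released; the byte at
	 * 'offset' lives at ((char *)db->db_data)[offset - db->db_offset].
	 */

	dmu_buf_rele(db, FTAG);
	return (0);
}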
238
239 int
240 dmu_bonus_max(void)
241 {
242 return (DN_OLD_MAX_BONUSLEN);
243 }
244
245 int
246 dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
247 {
248 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
249 dnode_t *dn;
250 int error;
251
252 DB_DNODE_ENTER(db);
253 dn = DB_DNODE(db);
254
255 if (dn->dn_bonus != db) {
256 error = SET_ERROR(EINVAL);
257 } else if (newsize < 0 || newsize > db_fake->db_size) {
258 error = SET_ERROR(EINVAL);
259 } else {
260 dnode_setbonuslen(dn, newsize, tx);
261 error = 0;
262 }
263
264 DB_DNODE_EXIT(db);
265 return (error);
266 }
267
268 int
269 dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
270 {
271 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
272 dnode_t *dn;
273 int error;
274
275 DB_DNODE_ENTER(db);
276 dn = DB_DNODE(db);
277
278 if (!DMU_OT_IS_VALID(type)) {
279 error = SET_ERROR(EINVAL);
280 } else if (dn->dn_bonus != db) {
281 error = SET_ERROR(EINVAL);
282 } else {
283 dnode_setbonus_type(dn, type, tx);
284 error = 0;
285 }
286
287 DB_DNODE_EXIT(db);
288 return (error);
289 }
290
291 dmu_object_type_t
292 dmu_get_bonustype(dmu_buf_t *db_fake)
293 {
294 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
295 dnode_t *dn;
296 dmu_object_type_t type;
297
298 DB_DNODE_ENTER(db);
299 dn = DB_DNODE(db);
300 type = dn->dn_bonustype;
301 DB_DNODE_EXIT(db);
302
303 return (type);
304 }
305
306 int
307 dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
308 {
309 dnode_t *dn;
310 int error;
311
312 error = dnode_hold(os, object, FTAG, &dn);
if (error != 0)
return (error);
313 dbuf_rm_spill(dn, tx);
314 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
315 dnode_rm_spill(dn, tx);
316 rw_exit(&dn->dn_struct_rwlock);
317 dnode_rele(dn, FTAG);
318 return (error);
319 }
320
321 /*
322 * returns ENOENT, EIO, or 0.
323 */
324 int
325 dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
326 {
327 dnode_t *dn;
328 dmu_buf_impl_t *db;
329 int error;
330
331 error = dnode_hold(os, object, FTAG, &dn);
332 if (error)
333 return (error);
334
335 rw_enter(&dn->dn_struct_rwlock, RW_READER);
336 if (dn->dn_bonus == NULL) {
337 rw_exit(&dn->dn_struct_rwlock);
338 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
339 if (dn->dn_bonus == NULL)
340 dbuf_create_bonus(dn);
341 }
342 db = dn->dn_bonus;
343
344 /* as long as the bonus buf is held, the dnode will be held */
345 if (refcount_add(&db->db_holds, tag) == 1) {
346 VERIFY(dnode_add_ref(dn, db));
347 atomic_inc_32(&dn->dn_dbufs_count);
348 }
349
350 /*
351 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
352 * hold and incrementing the dbuf count to ensure that dnode_move() sees
353 * a dnode hold for every dbuf.
354 */
355 rw_exit(&dn->dn_struct_rwlock);
356
357 dnode_rele(dn, FTAG);
358
359 VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
360
361 *dbp = &db->db;
362 return (0);
363 }
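
/*
 * Illustrative sketch: fetching an object's bonus buffer with
 * dmu_bonus_hold() and copying out its contents.  The bonus data stays
 * valid (and the dnode stays held) for as long as the hold is held;
 * release it with dmu_buf_rele().  The copy-length handling here is an
 * assumption for illustration.
 */
static int
dmu_example_read_bonus(objset_t *os, uint64_t object, void *buf,
    uint64_t len)
{
	dmu_buf_t *db;
	int err;

	err = dmu_bonus_hold(os, object, FTAG, &db);
	if (err != 0)
		return (err);

	/* Copy at most what the bonus buffer holds. */
	bcopy(db->db_data, buf, MIN(len, db->db_size));

	dmu_buf_rele(db, FTAG);
	return (0);
}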
364
365 /*
366 * returns ENOENT, EIO, or 0.
367 *
368 * This interface will allocate a blank spill dbuf when a spill blk
369 * doesn't already exist on the dnode.
370 *
371 * if you only want to find an already existing spill db, then
372 * dmu_spill_hold_existing() should be used.
373 */
374 int
375 dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
376 {
377 dmu_buf_impl_t *db = NULL;
378 int err;
379
380 if ((flags & DB_RF_HAVESTRUCT) == 0)
381 rw_enter(&dn->dn_struct_rwlock, RW_READER);
382
383 db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
384
385 if ((flags & DB_RF_HAVESTRUCT) == 0)
386 rw_exit(&dn->dn_struct_rwlock);
387
388 if (db == NULL) {
389 *dbp = NULL;
390 return (SET_ERROR(EIO));
391 }
392 err = dbuf_read(db, NULL, flags);
393 if (err == 0)
394 *dbp = &db->db;
395 else {
396 dbuf_rele(db, tag);
397 *dbp = NULL;
398 }
399 return (err);
400 }
401
402 int
403 dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
404 {
405 dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
406 dnode_t *dn;
407 int err;
408
409 DB_DNODE_ENTER(db);
410 dn = DB_DNODE(db);
411
412 if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
413 err = SET_ERROR(EINVAL);
414 } else {
415 rw_enter(&dn->dn_struct_rwlock, RW_READER);
416
417 if (!dn->dn_have_spill) {
418 err = SET_ERROR(ENOENT);
419 } else {
420 err = dmu_spill_hold_by_dnode(dn,
421 DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
422 }
423
424 rw_exit(&dn->dn_struct_rwlock);
425 }
426
427 DB_DNODE_EXIT(db);
428 return (err);
429 }
430
431 int
432 dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
433 {
434 dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
435 dnode_t *dn;
436 int err;
437
438 DB_DNODE_ENTER(db);
439 dn = DB_DNODE(db);
440 err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
441 DB_DNODE_EXIT(db);
442
443 return (err);
444 }
445
446 /*
447 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
448 * to take a held dnode rather than <os, object> -- the lookup is wasteful,
449 * and can induce severe lock contention when writing to several files
450 * whose dnodes are in the same block.
451 */
452 static int
453 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
454 boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
455 {
456 dmu_buf_t **dbp;
457 uint64_t blkid, nblks, i;
458 uint32_t dbuf_flags;
459 int err;
460 zio_t *zio;
461
462 ASSERT(length <= DMU_MAX_ACCESS);
463
464 /*
465 * Note: We directly notify the prefetch code of this read, so that
466 * we can tell it about the multi-block read. dbuf_read() only knows
467 * about the one block it is accessing.
468 */
469 dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
470 DB_RF_NOPREFETCH;
471
472 rw_enter(&dn->dn_struct_rwlock, RW_READER);
473 if (dn->dn_datablkshift) {
474 int blkshift = dn->dn_datablkshift;
475 nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
476 P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
477 } else {
478 if (offset + length > dn->dn_datablksz) {
479 zfs_panic_recover("zfs: accessing past end of object "
480 "%llx/%llx (size=%u access=%llu+%llu)",
481 (longlong_t)dn->dn_objset->
482 os_dsl_dataset->ds_object,
483 (longlong_t)dn->dn_object, dn->dn_datablksz,
484 (longlong_t)offset, (longlong_t)length);
485 rw_exit(&dn->dn_struct_rwlock);
486 return (SET_ERROR(EIO));
487 }
488 nblks = 1;
489 }
490 dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
491
492 zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
493 blkid = dbuf_whichblock(dn, 0, offset);
494 for (i = 0; i < nblks; i++) {
495 dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
496 if (db == NULL) {
497 rw_exit(&dn->dn_struct_rwlock);
498 dmu_buf_rele_array(dbp, nblks, tag);
499 zio_nowait(zio);
500 return (SET_ERROR(EIO));
501 }
502
503 /* initiate async i/o */
504 if (read)
505 (void) dbuf_read(db, zio, dbuf_flags);
506 dbp[i] = &db->db;
507 }
508
509 if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
510 DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
511 dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
512 read && DNODE_IS_CACHEABLE(dn));
513 }
514 rw_exit(&dn->dn_struct_rwlock);
515
516 /* wait for async i/o */
517 err = zio_wait(zio);
518 if (err) {
519 dmu_buf_rele_array(dbp, nblks, tag);
520 return (err);
521 }
522
523 /* wait for other io to complete */
524 if (read) {
525 for (i = 0; i < nblks; i++) {
526 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
527 mutex_enter(&db->db_mtx);
528 while (db->db_state == DB_READ ||
529 db->db_state == DB_FILL)
530 cv_wait(&db->db_changed, &db->db_mtx);
531 if (db->db_state == DB_UNCACHED)
532 err = SET_ERROR(EIO);
533 mutex_exit(&db->db_mtx);
534 if (err) {
535 dmu_buf_rele_array(dbp, nblks, tag);
536 return (err);
537 }
538 }
539 }
540
541 *numbufsp = nblks;
542 *dbpp = dbp;
543 return (0);
544 }
545
546 static int
547 dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
548 uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
549 {
550 dnode_t *dn;
551 int err;
552
553 err = dnode_hold(os, object, FTAG, &dn);
554 if (err)
555 return (err);
556
557 err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
558 numbufsp, dbpp, DMU_READ_PREFETCH);
559
560 dnode_rele(dn, FTAG);
561
562 return (err);
563 }
564
565 int
566 dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
567 uint64_t length, boolean_t read, void *tag, int *numbufsp,
568 dmu_buf_t ***dbpp)
569 {
570 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
571 dnode_t *dn;
572 int err;
573
574 DB_DNODE_ENTER(db);
575 dn = DB_DNODE(db);
576 err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
577 numbufsp, dbpp, DMU_READ_PREFETCH);
578 DB_DNODE_EXIT(db);
579
580 return (err);
581 }
582
583 void
584 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
585 {
586 int i;
587 dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
588
589 if (numbufs == 0)
590 return;
591
592 for (i = 0; i < numbufs; i++) {
593 if (dbp[i])
594 dbuf_rele(dbp[i], tag);
595 }
596
597 kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
598 }
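
/*
 * Illustrative sketch: holding a range of buffers through an existing
 * dbuf in the object (e.g. its bonus buffer) and walking them.  Every
 * successful dmu_buf_hold_array_by_bonus() must be paired with a
 * dmu_buf_rele_array() on the same tag.
 */
static int
dmu_example_walk_range(dmu_buf_t *bonus, uint64_t offset, uint64_t length)
{
	dmu_buf_t **dbp;
	int numbufs, i, err;

	err = dmu_buf_hold_array_by_bonus(bonus, offset, length,
	    B_TRUE, FTAG, &numbufs, &dbp);
	if (err != 0)
		return (err);

	for (i = 0; i < numbufs; i++) {
		/* dbp[i]->db_offset, db_size and db_data are valid here. */
	}

	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (0);
}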
599
600 /*
601 * Issue prefetch i/os for the given blocks. If level is greater than 0, the
602 * indirect blocks prefetched will be those that point to the blocks containing
603 * the data starting at offset, and continuing to offset + len.
604 *
605 * Note that if the indirect blocks above the blocks being prefetched are not in
606 * cache, they will be asynchronously read in.
607 */
608 void
609 dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
610 uint64_t len, zio_priority_t pri)
611 {
612 dnode_t *dn;
613 uint64_t blkid;
614 int nblks, err;
615
616 if (len == 0) { /* they're interested in the bonus buffer */
617 dn = DMU_META_DNODE(os);
618
619 if (object == 0 || object >= DN_MAX_OBJECT)
620 return;
621
622 rw_enter(&dn->dn_struct_rwlock, RW_READER);
623 blkid = dbuf_whichblock(dn, level,
624 object * sizeof (dnode_phys_t));
625 dbuf_prefetch(dn, level, blkid, pri, 0);
626 rw_exit(&dn->dn_struct_rwlock);
627 return;
628 }
629
630 /*
631 * XXX - Note, if the dnode for the requested object is not
632 * already cached, we will do a *synchronous* read in the
633 * dnode_hold() call. The same is true for any indirects.
634 */
635 err = dnode_hold(os, object, FTAG, &dn);
636 if (err != 0)
637 return;
638
639 rw_enter(&dn->dn_struct_rwlock, RW_READER);
640 /*
641 * offset + len - 1 is the last byte we want to prefetch for, and offset
642 * is the first. Then dbuf_whichblock(dn, level, offset + len - 1) is the
643 * last block we want to prefetch, and dbuf_whichblock(dn, level,
644 * offset) is the first. Then the number we need to prefetch is the
645 * last - first + 1.
646 */
647 if (level > 0 || dn->dn_datablkshift != 0) {
648 nblks = dbuf_whichblock(dn, level, offset + len - 1) -
649 dbuf_whichblock(dn, level, offset) + 1;
650 } else {
651 nblks = (offset < dn->dn_datablksz);
652 }
653
654 if (nblks != 0) {
655 int i;
656
657 blkid = dbuf_whichblock(dn, level, offset);
658 for (i = 0; i < nblks; i++)
659 dbuf_prefetch(dn, level, blkid + i, pri, 0);
660 }
661
662 rw_exit(&dn->dn_struct_rwlock);
663
664 dnode_rele(dn, FTAG);
665 }
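
/*
 * Illustrative sketch: issuing readahead for a range we expect to read
 * shortly.  Level 0 prefetches the data blocks themselves; a level > 0
 * would prefetch the indirect blocks that point at them.  The priority
 * used (ZIO_PRIORITY_ASYNC_READ) is a typical choice for speculative
 * reads, not a requirement.
 */
static void
dmu_example_readahead(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t len)
{
	/* Best effort only: dmu_prefetch() reports no errors. */
	dmu_prefetch(os, object, 0, offset, len, ZIO_PRIORITY_ASYNC_READ);
}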
666
667 /*
668 * Get the next "chunk" of file data to free. We traverse the file from
669 * the end so that the file gets shorter over time (if we crash in the
670 * middle, this will leave us in a better state). We find allocated file
671 * data by simply searching the allocated level 1 indirects.
672 *
673 * On input, *start should be the first offset that does not need to be
674 * freed (e.g. "offset + length"). On return, *start will be the first
675 * offset that should be freed.
676 */
677 static int
678 get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
679 {
680 uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
681 /* bytes of data covered by a level-1 indirect block */
682 uint64_t iblkrange =
683 dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
684 uint64_t blks;
685
686 ASSERT3U(minimum, <=, *start);
687
688 if (*start - minimum <= iblkrange * maxblks) {
689 *start = minimum;
690 return (0);
691 }
692 ASSERT(ISP2(iblkrange));
693
694 for (blks = 0; *start > minimum && blks < maxblks; blks++) {
695 int err;
696
697 /*
698 * dnode_next_offset(BACKWARDS) will find an allocated L1
699 * indirect block at or before the input offset. We must
700 * decrement *start so that it is at the end of the region
701 * to search.
702 */
703 (*start)--;
704 err = dnode_next_offset(dn,
705 DNODE_FIND_BACKWARDS, start, 2, 1, 0);
706
707 /* if there are no indirect blocks before start, we are done */
708 if (err == ESRCH) {
709 *start = minimum;
710 break;
711 } else if (err != 0) {
712 return (err);
713 }
714
715 /* set start to the beginning of this L1 indirect */
716 *start = P2ALIGN(*start, iblkrange);
717 }
718 if (*start < minimum)
719 *start = minimum;
720 return (0);
721 }
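
/*
 * Worked example (illustrative, with assumed geometry): for 128K data
 * blocks and 128K indirect blocks (dn_indblkshift == 17), an L1 indirect
 * holds 2^(17 - SPA_BLKPTRSHIFT) = 2^(17 - 7) = 1024 block pointers, so
 * iblkrange = 131072 * 1024 = 128MB of file data.  Assuming DMU_MAX_ACCESS
 * is 64MB, maxblks = 64MB >> (17 + 1) = 256, so one call walks back over
 * at most 256 allocated L1 ranges (about 32GB of file span) before handing
 * a chunk boundary back to the caller; anything closer than that to
 * 'minimum' is returned as a single chunk.
 */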
722
723 /*
724 * If this objset is of type DMU_OST_ZFS, return true if the vfs's unmounted
725 * flag is set; otherwise return false.
726 * Used below in dmu_free_long_range_impl() to allow the free to abort when unmounting.
727 */
728 /*ARGSUSED*/
729 static boolean_t
730 dmu_objset_zfs_unmounting(objset_t *os)
731 {
732 #ifdef _KERNEL
733 if (dmu_objset_type(os) == DMU_OST_ZFS)
734 return (zfs_get_vfs_flag_unmounted(os));
735 #endif
736 return (B_FALSE);
737 }
738
739 static int
740 dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
741 uint64_t length)
742 {
743 uint64_t object_size;
744 int err;
745 uint64_t dirty_frees_threshold;
746 dsl_pool_t *dp = dmu_objset_pool(os);
747 int t;
748
749 if (dn == NULL)
750 return (SET_ERROR(EINVAL));
751
752 object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
753 if (offset >= object_size)
754 return (0);
755
756 if (zfs_per_txg_dirty_frees_percent <= 100)
757 dirty_frees_threshold =
758 zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
759 else
760 dirty_frees_threshold = zfs_dirty_data_max / 4;
761
762 if (length == DMU_OBJECT_END || offset + length > object_size)
763 length = object_size - offset;
764
765 while (length != 0) {
766 uint64_t chunk_end, chunk_begin, chunk_len;
767 uint64_t long_free_dirty_all_txgs = 0;
768 dmu_tx_t *tx;
769
770 if (dmu_objset_zfs_unmounting(dn->dn_objset))
771 return (SET_ERROR(EINTR));
772
773 chunk_end = chunk_begin = offset + length;
774
775 /* move chunk_begin backwards to the beginning of this chunk */
776 err = get_next_chunk(dn, &chunk_begin, offset);
777 if (err)
778 return (err);
779 ASSERT3U(chunk_begin, >=, offset);
780 ASSERT3U(chunk_begin, <=, chunk_end);
781
782 chunk_len = chunk_end - chunk_begin;
783
784 mutex_enter(&dp->dp_lock);
785 for (t = 0; t < TXG_SIZE; t++) {
786 long_free_dirty_all_txgs +=
787 dp->dp_long_free_dirty_pertxg[t];
788 }
789 mutex_exit(&dp->dp_lock);
790
791 /*
792 * To avoid filling up a TXG with just frees, wait for
793 * the next TXG to open before freeing more chunks if
794 * we have reached the threshold of frees.
795 */
796 if (dirty_frees_threshold != 0 &&
797 long_free_dirty_all_txgs >= dirty_frees_threshold) {
798 txg_wait_open(dp, 0);
799 continue;
800 }
801
802 tx = dmu_tx_create(os);
803 dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
804
805 /*
806 * Mark this transaction as typically resulting in a net
807 * reduction in space used.
808 */
809 dmu_tx_mark_netfree(tx);
810 err = dmu_tx_assign(tx, TXG_WAIT);
811 if (err) {
812 dmu_tx_abort(tx);
813 return (err);
814 }
815
816 mutex_enter(&dp->dp_lock);
817 dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
818 chunk_len;
819 mutex_exit(&dp->dp_lock);
820 DTRACE_PROBE3(free__long__range,
821 uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
822 uint64_t, dmu_tx_get_txg(tx));
823 dnode_free_range(dn, chunk_begin, chunk_len, tx);
824 dmu_tx_commit(tx);
825
826 length -= chunk_len;
827 }
828 return (0);
829 }
830
831 int
832 dmu_free_long_range(objset_t *os, uint64_t object,
833 uint64_t offset, uint64_t length)
834 {
835 dnode_t *dn;
836 int err;
837
838 err = dnode_hold(os, object, FTAG, &dn);
839 if (err != 0)
840 return (err);
841 err = dmu_free_long_range_impl(os, dn, offset, length);
842
843 /*
844 * It is important to zero out the maxblkid when freeing the entire
845 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
846 * will take the fast path, and (b) dnode_reallocate() can verify
847 * that the entire file has been freed.
848 */
849 if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
850 dn->dn_maxblkid = 0;
851
852 dnode_rele(dn, FTAG);
853 return (err);
854 }
855
856 int
857 dmu_free_long_object(objset_t *os, uint64_t object)
858 {
859 dmu_tx_t *tx;
860 int err;
861
862 err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
863 if (err != 0)
864 return (err);
865
866 tx = dmu_tx_create(os);
867 dmu_tx_hold_bonus(tx, object);
868 dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
869 dmu_tx_mark_netfree(tx);
870 err = dmu_tx_assign(tx, TXG_WAIT);
871 if (err == 0) {
872 err = dmu_object_free(os, object, tx);
873 dmu_tx_commit(tx);
874 } else {
875 dmu_tx_abort(tx);
876 }
877
878 return (err);
879 }
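
/*
 * Illustrative sketch: punching a hole with the routine above.  The
 * caller supplies no tx; dmu_free_long_range() assigns its own, possibly
 * spread over several TXGs, and may return EINTR if the dataset is being
 * unmounted.  Deleting an object outright is the same whole-range free
 * plus the dnode free that dmu_free_long_object() wraps.
 */
static int
dmu_example_punch_hole(objset_t *os, uint64_t object, uint64_t off,
    uint64_t len)
{
	return (dmu_free_long_range(os, object, off, len));
}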
880
881 int
882 dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
883 uint64_t size, dmu_tx_t *tx)
884 {
885 dnode_t *dn;
886 int err = dnode_hold(os, object, FTAG, &dn);
887 if (err)
888 return (err);
889 ASSERT(offset < UINT64_MAX);
890 ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
891 dnode_free_range(dn, offset, size, tx);
892 dnode_rele(dn, FTAG);
893 return (0);
894 }
895
896 static int
897 dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
898 void *buf, uint32_t flags)
899 {
900 dmu_buf_t **dbp;
901 int numbufs, err = 0;
902
903 /*
904 * Deal with odd block sizes, where there can't be data past the first
905 * block. If we ever do the tail block optimization, we will need to
906 * handle that here as well.
907 */
908 if (dn->dn_maxblkid == 0) {
909 uint64_t newsz = offset > dn->dn_datablksz ? 0 :
910 MIN(size, dn->dn_datablksz - offset);
911 bzero((char *)buf + newsz, size - newsz);
912 size = newsz;
913 }
914
915 while (size > 0) {
916 uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
917 int i;
918
919 /*
920 * NB: we could do this block-at-a-time, but it's nice
921 * to be reading in parallel.
922 */
923 err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
924 TRUE, FTAG, &numbufs, &dbp, flags);
925 if (err)
926 break;
927
928 for (i = 0; i < numbufs; i++) {
929 uint64_t tocpy;
930 int64_t bufoff;
931 dmu_buf_t *db = dbp[i];
932
933 ASSERT(size > 0);
934
935 bufoff = offset - db->db_offset;
936 tocpy = MIN(db->db_size - bufoff, size);
937
938 (void) memcpy(buf, (char *)db->db_data + bufoff, tocpy);
939
940 offset += tocpy;
941 size -= tocpy;
942 buf = (char *)buf + tocpy;
943 }
944 dmu_buf_rele_array(dbp, numbufs, FTAG);
945 }
946 return (err);
947 }
948
949 int
950 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
951 void *buf, uint32_t flags)
952 {
953 dnode_t *dn;
954 int err;
955
956 err = dnode_hold(os, object, FTAG, &dn);
957 if (err != 0)
958 return (err);
959
960 err = dmu_read_impl(dn, offset, size, buf, flags);
961 dnode_rele(dn, FTAG);
962 return (err);
963 }
964
965 int
966 dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
967 uint32_t flags)
968 {
969 return (dmu_read_impl(dn, offset, size, buf, flags));
970 }
971
972 static void
973 dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
974 const void *buf, dmu_tx_t *tx)
975 {
976 int i;
977
978 for (i = 0; i < numbufs; i++) {
979 uint64_t tocpy;
980 int64_t bufoff;
981 dmu_buf_t *db = dbp[i];
982
983 ASSERT(size > 0);
984
985 bufoff = offset - db->db_offset;
986 tocpy = MIN(db->db_size - bufoff, size);
987
988 ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
989
990 if (tocpy == db->db_size)
991 dmu_buf_will_fill(db, tx);
992 else
993 dmu_buf_will_dirty(db, tx);
994
995 (void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
996
997 if (tocpy == db->db_size)
998 dmu_buf_fill_done(db, tx);
999
1000 offset += tocpy;
1001 size -= tocpy;
1002 buf = (char *)buf + tocpy;
1003 }
1004 }
1005
1006 void
1007 dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
1008 const void *buf, dmu_tx_t *tx)
1009 {
1010 dmu_buf_t **dbp;
1011 int numbufs;
1012
1013 if (size == 0)
1014 return;
1015
1016 VERIFY0(dmu_buf_hold_array(os, object, offset, size,
1017 FALSE, FTAG, &numbufs, &dbp));
1018 dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
1019 dmu_buf_rele_array(dbp, numbufs, FTAG);
1020 }
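
/*
 * Illustrative sketch: a minimal transactional overwrite built on
 * dmu_write().  The write must be covered by an assigned tx that holds
 * the written range; dmu_read(), by contrast, needs no tx.  Object,
 * offset and size are placeholders.
 */
static int
dmu_example_overwrite(objset_t *os, uint64_t object, uint64_t offset,
    int size, const void *buf)
{
	dmu_tx_t *tx;
	int err;

	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, offset, size);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_write(os, object, offset, size, buf, tx);
	dmu_tx_commit(tx);
	return (0);
}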
1021
1022 void
1023 dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
1024 const void *buf, dmu_tx_t *tx)
1025 {
1026 dmu_buf_t **dbp;
1027 int numbufs;
1028
1029 if (size == 0)
1030 return;
1031
1032 VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
1033 FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
1034 dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
1035 dmu_buf_rele_array(dbp, numbufs, FTAG);
1036 }
1037
1038 void
1039 dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
1040 dmu_tx_t *tx)
1041 {
1042 dmu_buf_t **dbp;
1043 int numbufs, i;
1044
1045 if (size == 0)
1046 return;
1047
1048 VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
1049 FALSE, FTAG, &numbufs, &dbp));
1050
1051 for (i = 0; i < numbufs; i++) {
1052 dmu_buf_t *db = dbp[i];
1053
1054 dmu_buf_will_not_fill(db, tx);
1055 }
1056 dmu_buf_rele_array(dbp, numbufs, FTAG);
1057 }
1058
1059 void
1060 dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
1061 void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
1062 int compressed_size, int byteorder, dmu_tx_t *tx)
1063 {
1064 dmu_buf_t *db;
1065
1066 ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
1067 ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
1068 VERIFY0(dmu_buf_hold_noread(os, object, offset,
1069 FTAG, &db));
1070
1071 dmu_buf_write_embedded(db,
1072 data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
1073 uncompressed_size, compressed_size, byteorder, tx);
1074
1075 dmu_buf_rele(db, FTAG);
1076 }
1077
1078 /*
1079 * DMU support for xuio
1080 */
1081 kstat_t *xuio_ksp = NULL;
1082
1083 typedef struct xuio_stats {
1084 /* loaned yet not returned arc_buf */
1085 kstat_named_t xuiostat_onloan_rbuf;
1086 kstat_named_t xuiostat_onloan_wbuf;
1087 /* whether a copy is made when loaning out a read buffer */
1088 kstat_named_t xuiostat_rbuf_copied;
1089 kstat_named_t xuiostat_rbuf_nocopy;
1090 /* whether a copy is made when assigning a write buffer */
1091 kstat_named_t xuiostat_wbuf_copied;
1092 kstat_named_t xuiostat_wbuf_nocopy;
1093 } xuio_stats_t;
1094
1095 static xuio_stats_t xuio_stats = {
1096 { "onloan_read_buf", KSTAT_DATA_UINT64 },
1097 { "onloan_write_buf", KSTAT_DATA_UINT64 },
1098 { "read_buf_copied", KSTAT_DATA_UINT64 },
1099 { "read_buf_nocopy", KSTAT_DATA_UINT64 },
1100 { "write_buf_copied", KSTAT_DATA_UINT64 },
1101 { "write_buf_nocopy", KSTAT_DATA_UINT64 }
1102 };
1103
1104 #define XUIOSTAT_INCR(stat, val) \
1105 atomic_add_64(&xuio_stats.stat.value.ui64, (val))
1106 #define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1)
1107
1108 #ifdef HAVE_UIO_ZEROCOPY
1109 int
1110 dmu_xuio_init(xuio_t *xuio, int nblk)
1111 {
1112 dmu_xuio_t *priv;
1113 uio_t *uio = &xuio->xu_uio;
1114
1115 uio->uio_iovcnt = nblk;
1116 uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
1117
1118 priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
1119 priv->cnt = nblk;
1120 priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
1121 priv->iovp = (iovec_t *)uio->uio_iov;
1122 XUIO_XUZC_PRIV(xuio) = priv;
1123
1124 if (XUIO_XUZC_RW(xuio) == UIO_READ)
1125 XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
1126 else
1127 XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
1128
1129 return (0);
1130 }
1131
1132 void
1133 dmu_xuio_fini(xuio_t *xuio)
1134 {
1135 dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1136 int nblk = priv->cnt;
1137
1138 kmem_free(priv->iovp, nblk * sizeof (iovec_t));
1139 kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
1140 kmem_free(priv, sizeof (dmu_xuio_t));
1141
1142 if (XUIO_XUZC_RW(xuio) == UIO_READ)
1143 XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
1144 else
1145 XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
1146 }
1147
1148 /*
1149 * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
1150 * and increase priv->next by 1.
1151 */
1152 int
1153 dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
1154 {
1155 struct iovec *iov;
1156 uio_t *uio = &xuio->xu_uio;
1157 dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1158 int i = priv->next++;
1159
1160 ASSERT(i < priv->cnt);
1161 ASSERT(off + n <= arc_buf_lsize(abuf));
1162 iov = (iovec_t *)uio->uio_iov + i;
1163 iov->iov_base = (char *)abuf->b_data + off;
1164 iov->iov_len = n;
1165 priv->bufs[i] = abuf;
1166 return (0);
1167 }
1168
1169 int
1170 dmu_xuio_cnt(xuio_t *xuio)
1171 {
1172 dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1173 return (priv->cnt);
1174 }
1175
1176 arc_buf_t *
1177 dmu_xuio_arcbuf(xuio_t *xuio, int i)
1178 {
1179 dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1180
1181 ASSERT(i < priv->cnt);
1182 return (priv->bufs[i]);
1183 }
1184
1185 void
1186 dmu_xuio_clear(xuio_t *xuio, int i)
1187 {
1188 dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1189
1190 ASSERT(i < priv->cnt);
1191 priv->bufs[i] = NULL;
1192 }
1193 #endif /* HAVE_UIO_ZEROCOPY */
1194
1195 static void
1196 xuio_stat_init(void)
1197 {
1198 xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
1199 KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
1200 KSTAT_FLAG_VIRTUAL);
1201 if (xuio_ksp != NULL) {
1202 xuio_ksp->ks_data = &xuio_stats;
1203 kstat_install(xuio_ksp);
1204 }
1205 }
1206
1207 static void
1208 xuio_stat_fini(void)
1209 {
1210 if (xuio_ksp != NULL) {
1211 kstat_delete(xuio_ksp);
1212 xuio_ksp = NULL;
1213 }
1214 }
1215
1216 void
1217 xuio_stat_wbuf_copied(void)
1218 {
1219 XUIOSTAT_BUMP(xuiostat_wbuf_copied);
1220 }
1221
1222 void
1223 xuio_stat_wbuf_nocopy(void)
1224 {
1225 XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
1226 }
1227
1228 #ifdef _KERNEL
1229 int
1230 dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
1231 {
1232 dmu_buf_t **dbp;
1233 int numbufs, i, err;
1234 #ifdef HAVE_UIO_ZEROCOPY
1235 xuio_t *xuio = NULL;
1236 #endif
1237
1238 /*
1239 * NB: we could do this block-at-a-time, but it's nice
1240 * to be reading in parallel.
1241 */
1242 err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
1243 TRUE, FTAG, &numbufs, &dbp, 0);
1244 if (err)
1245 return (err);
1246
1247 for (i = 0; i < numbufs; i++) {
1248 uint64_t tocpy;
1249 int64_t bufoff;
1250 dmu_buf_t *db = dbp[i];
1251
1252 ASSERT(size > 0);
1253
1254 bufoff = uio->uio_loffset - db->db_offset;
1255 tocpy = MIN(db->db_size - bufoff, size);
1256
1257 #ifdef HAVE_UIO_ZEROCOPY
1258 if (xuio) {
1259 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
1260 arc_buf_t *dbuf_abuf = dbi->db_buf;
1261 arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
1262 err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
1263 if (!err) {
1264 uio->uio_resid -= tocpy;
1265 uio->uio_loffset += tocpy;
1266 }
1267
1268 if (abuf == dbuf_abuf)
1269 XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
1270 else
1271 XUIOSTAT_BUMP(xuiostat_rbuf_copied);
1272 } else
1273 #endif
1274 err = uiomove((char *)db->db_data + bufoff, tocpy,
1275 UIO_READ, uio);
1276 if (err)
1277 break;
1278
1279 size -= tocpy;
1280 }
1281 dmu_buf_rele_array(dbp, numbufs, FTAG);
1282
1283 return (err);
1284 }
1285
1286 /*
1287 * Read 'size' bytes into the uio buffer.
1288 * From object zdb->db_object.
1289 * Starting at offset uio->uio_loffset.
1290 *
1291 * If the caller already has a dbuf in the target object
1292 * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
1293 * because we don't have to find the dnode_t for the object.
1294 */
1295 int
1296 dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
1297 {
1298 dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
1299 dnode_t *dn;
1300 int err;
1301
1302 if (size == 0)
1303 return (0);
1304
1305 DB_DNODE_ENTER(db);
1306 dn = DB_DNODE(db);
1307 err = dmu_read_uio_dnode(dn, uio, size);
1308 DB_DNODE_EXIT(db);
1309
1310 return (err);
1311 }
1312
1313 /*
1314 * Read 'size' bytes into the uio buffer.
1315 * From the specified object
1316 * Starting at offset uio->uio_loffset.
1317 */
1318 int
1319 dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
1320 {
1321 dnode_t *dn;
1322 int err;
1323
1324 if (size == 0)
1325 return (0);
1326
1327 err = dnode_hold(os, object, FTAG, &dn);
1328 if (err)
1329 return (err);
1330
1331 err = dmu_read_uio_dnode(dn, uio, size);
1332
1333 dnode_rele(dn, FTAG);
1334
1335 return (err);
1336 }
1337
1338 int
1339 dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
1340 {
1341 dmu_buf_t **dbp;
1342 int numbufs;
1343 int err = 0;
1344 int i;
1345
1346 err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
1347 FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
1348 if (err)
1349 return (err);
1350
1351 for (i = 0; i < numbufs; i++) {
1352 uint64_t tocpy;
1353 int64_t bufoff;
1354 dmu_buf_t *db = dbp[i];
1355
1356 ASSERT(size > 0);
1357
1358 bufoff = uio->uio_loffset - db->db_offset;
1359 tocpy = MIN(db->db_size - bufoff, size);
1360
1361 ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
1362
1363 if (tocpy == db->db_size)
1364 dmu_buf_will_fill(db, tx);
1365 else
1366 dmu_buf_will_dirty(db, tx);
1367
1368 /*
1369 * XXX uiomove could block forever (e.g. nfs-backed
1370 * pages). There needs to be a uiolockdown() function
1371 * to lock the pages in memory, so that uiomove won't
1372 * block.
1373 */
1374 err = uiomove((char *)db->db_data + bufoff, tocpy,
1375 UIO_WRITE, uio);
1376
1377 if (tocpy == db->db_size)
1378 dmu_buf_fill_done(db, tx);
1379
1380 if (err)
1381 break;
1382
1383 size -= tocpy;
1384 }
1385
1386 dmu_buf_rele_array(dbp, numbufs, FTAG);
1387 return (err);
1388 }
1389
1390 /*
1391 * Write 'size' bytes from the uio buffer.
1392 * To object zdb->db_object.
1393 * Starting at offset uio->uio_loffset.
1394 *
1395 * If the caller already has a dbuf in the target object
1396 * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
1397 * because we don't have to find the dnode_t for the object.
1398 */
1399 int
1400 dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
1401 dmu_tx_t *tx)
1402 {
1403 dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
1404 dnode_t *dn;
1405 int err;
1406
1407 if (size == 0)
1408 return (0);
1409
1410 DB_DNODE_ENTER(db);
1411 dn = DB_DNODE(db);
1412 err = dmu_write_uio_dnode(dn, uio, size, tx);
1413 DB_DNODE_EXIT(db);
1414
1415 return (err);
1416 }
1417
1418 /*
1419 * Write 'size' bytes from the uio buffer.
1420 * To the specified object.
1421 * Starting at offset uio->uio_loffset.
1422 */
1423 int
1424 dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
1425 dmu_tx_t *tx)
1426 {
1427 dnode_t *dn;
1428 int err;
1429
1430 if (size == 0)
1431 return (0);
1432
1433 err = dnode_hold(os, object, FTAG, &dn);
1434 if (err)
1435 return (err);
1436
1437 err = dmu_write_uio_dnode(dn, uio, size, tx);
1438
1439 dnode_rele(dn, FTAG);
1440
1441 return (err);
1442 }
1443 #endif /* _KERNEL */
1444
1445 /*
1446 * Allocate a loaned anonymous arc buffer.
1447 */
1448 arc_buf_t *
1449 dmu_request_arcbuf(dmu_buf_t *handle, int size)
1450 {
1451 dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
1452
1453 return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
1454 }
1455
1456 /*
1457 * Free a loaned arc buffer.
1458 */
1459 void
1460 dmu_return_arcbuf(arc_buf_t *buf)
1461 {
1462 arc_return_buf(buf, FTAG);
1463 arc_buf_destroy(buf, FTAG);
1464 }
1465
1466 /*
1467 * When possible directly assign passed loaned arc buffer to a dbuf.
1468 * If this is not possible copy the contents of passed arc buf via
1469 * dmu_write().
1470 */
1471 void
1472 dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
1473 dmu_tx_t *tx)
1474 {
1475 dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
1476 dnode_t *dn;
1477 dmu_buf_impl_t *db;
1478 uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
1479 uint64_t blkid;
1480
1481 DB_DNODE_ENTER(dbuf);
1482 dn = DB_DNODE(dbuf);
1483 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1484 blkid = dbuf_whichblock(dn, 0, offset);
1485 VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
1486 rw_exit(&dn->dn_struct_rwlock);
1487 DB_DNODE_EXIT(dbuf);
1488
1489 /*
1490 * We can only assign if the offset is aligned, the arc buf is the
1491 * same size as the dbuf, and the dbuf is not metadata.
1492 */
1493 if (offset == db->db.db_offset && blksz == db->db.db_size) {
1494 dbuf_assign_arcbuf(db, buf, tx);
1495 dbuf_rele(db, FTAG);
1496 } else {
1497 objset_t *os;
1498 uint64_t object;
1499
1500 /* compressed bufs must always be assignable to their dbuf */
1501 ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
1502 ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
1503
1504 DB_DNODE_ENTER(dbuf);
1505 dn = DB_DNODE(dbuf);
1506 os = dn->dn_objset;
1507 object = dn->dn_object;
1508 DB_DNODE_EXIT(dbuf);
1509
1510 dbuf_rele(db, FTAG);
1511 dmu_write(os, object, offset, blksz, buf->b_data, tx);
1512 dmu_return_arcbuf(buf);
1513 XUIOSTAT_BUMP(xuiostat_wbuf_copied);
1514 }
1515 }
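
/*
 * Illustrative sketch: the loaned-buffer write path built from the three
 * routines above.  The caller borrows an anonymous arc buffer, fills it,
 * and hands it off; if dmu_assign_arcbuf() cannot take the buffer
 * directly it falls back to a copying dmu_write() and returns the loaned
 * buffer itself, so the caller never releases it here.  'handle' is any
 * held dbuf in the target object (e.g. its bonus buffer).
 */
static void
dmu_example_loaned_write(dmu_buf_t *handle, uint64_t offset, int blksz,
    const void *src, dmu_tx_t *tx)
{
	arc_buf_t *abuf = dmu_request_arcbuf(handle, blksz);

	bcopy(src, abuf->b_data, blksz);
	dmu_assign_arcbuf(handle, offset, abuf, tx);
}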
1516
1517 typedef struct {
1518 dbuf_dirty_record_t *dsa_dr;
1519 dmu_sync_cb_t *dsa_done;
1520 zgd_t *dsa_zgd;
1521 dmu_tx_t *dsa_tx;
1522 } dmu_sync_arg_t;
1523
1524 /* ARGSUSED */
1525 static void
1526 dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
1527 {
1528 dmu_sync_arg_t *dsa = varg;
1529 dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
1530 blkptr_t *bp = zio->io_bp;
1531
1532 if (zio->io_error == 0) {
1533 if (BP_IS_HOLE(bp)) {
1534 /*
1535 * A block of zeros may compress to a hole, but the
1536 * block size still needs to be known for replay.
1537 */
1538 BP_SET_LSIZE(bp, db->db_size);
1539 } else if (!BP_IS_EMBEDDED(bp)) {
1540 ASSERT(BP_GET_LEVEL(bp) == 0);
1541 bp->blk_fill = 1;
1542 }
1543 }
1544 }
1545
1546 static void
1547 dmu_sync_late_arrival_ready(zio_t *zio)
1548 {
1549 dmu_sync_ready(zio, NULL, zio->io_private);
1550 }
1551
1552 /* ARGSUSED */
1553 static void
1554 dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
1555 {
1556 dmu_sync_arg_t *dsa = varg;
1557 dbuf_dirty_record_t *dr = dsa->dsa_dr;
1558 dmu_buf_impl_t *db = dr->dr_dbuf;
1559
1560 mutex_enter(&db->db_mtx);
1561 ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
1562 if (zio->io_error == 0) {
1563 dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
1564 if (dr->dt.dl.dr_nopwrite) {
1565 blkptr_t *bp = zio->io_bp;
1566 blkptr_t *bp_orig = &zio->io_bp_orig;
1567 uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
1568
1569 ASSERT(BP_EQUAL(bp, bp_orig));
1570 VERIFY(BP_EQUAL(bp, db->db_blkptr));
1571 ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
1572 VERIFY(zio_checksum_table[chksum].ci_flags &
1573 ZCHECKSUM_FLAG_NOPWRITE);
1574 }
1575 dr->dt.dl.dr_overridden_by = *zio->io_bp;
1576 dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
1577 dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
1578
1579 /*
1580 * Old style holes are filled with all zeros, whereas
1581 * new-style holes maintain their lsize, type, level,
1582 * and birth time (see zio_write_compress). While we
1583 * need to reset the BP_SET_LSIZE() call that happened
1584 * in dmu_sync_ready for old style holes, we do *not*
1585 * want to wipe out the information contained in new
1586 * style holes. Thus, only zero out the block pointer if
1587 * it's an old style hole.
1588 */
1589 if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
1590 dr->dt.dl.dr_overridden_by.blk_birth == 0)
1591 BP_ZERO(&dr->dt.dl.dr_overridden_by);
1592 } else {
1593 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
1594 }
1595 cv_broadcast(&db->db_changed);
1596 mutex_exit(&db->db_mtx);
1597
1598 dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1599
1600 kmem_free(dsa, sizeof (*dsa));
1601 }
1602
1603 static void
1604 dmu_sync_late_arrival_done(zio_t *zio)
1605 {
1606 blkptr_t *bp = zio->io_bp;
1607 dmu_sync_arg_t *dsa = zio->io_private;
1608 ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
1609
1610 if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
1611 ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
1612 ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
1613 ASSERT(zio->io_bp->blk_birth == zio->io_txg);
1614 ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
1615 zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
1616 }
1617
1618 dmu_tx_commit(dsa->dsa_tx);
1619
1620 dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1621
1622 abd_put(zio->io_abd);
1623 kmem_free(dsa, sizeof (*dsa));
1624 }
1625
1626 static int
1627 dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
1628 zio_prop_t *zp, zbookmark_phys_t *zb)
1629 {
1630 dmu_sync_arg_t *dsa;
1631 dmu_tx_t *tx;
1632
1633 tx = dmu_tx_create(os);
1634 dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
1635 if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
1636 dmu_tx_abort(tx);
1637 /* Make zl_get_data do txg_wait_synced() */
1638 return (SET_ERROR(EIO));
1639 }
1640
1641 dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1642 dsa->dsa_dr = NULL;
1643 dsa->dsa_done = done;
1644 dsa->dsa_zgd = zgd;
1645 dsa->dsa_tx = tx;
1646
1647 /*
1648 * Since we are currently syncing this txg, it's nontrivial to
1649 * determine what BP to nopwrite against, so we disable nopwrite.
1650 *
1651 * When syncing, the db_blkptr is initially the BP of the previous
1652 * txg. We can not nopwrite against it because it will be changed
1653 * (this is similar to the non-late-arrival case where the dbuf is
1654 * dirty in a future txg).
1655 *
1656 * Then dbuf_write_ready() sets db_blkptr to the location we will write.
1657 * We can not nopwrite against it because although the BP will not
1658 * (typically) be changed, the data has not yet been persisted to this
1659 * location.
1660 *
1661 * Finally, when dbuf_write_done() is called, it is theoretically
1662 * possible to always nopwrite, because the data that was written in
1663 * this txg is the same data that we are trying to write. However we
1664 * would need to check that this dbuf is not dirty in any future
1665 * txgs (as we do in the normal dmu_sync() path). For simplicity, we
1666 * don't nopwrite in this case.
1667 */
1668 zp->zp_nopwrite = B_FALSE;
1669
1670 zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
1671 abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
1672 zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
1673 dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
1674 dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
1675
1676 return (0);
1677 }
1678
1679 /*
1680 * Intent log support: sync the block associated with db to disk.
1681 * N.B. and XXX: the caller is responsible for making sure that the
1682 * data isn't changing while dmu_sync() is writing it.
1683 *
1684 * Return values:
1685 *
1686 * EEXIST: this txg has already been synced, so there's nothing to do.
1687 * The caller should not log the write.
1688 *
1689 * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
1690 * The caller should not log the write.
1691 *
1692 * EALREADY: this block is already in the process of being synced.
1693 * The caller should track its progress (somehow).
1694 *
1695 * EIO: could not do the I/O.
1696 * The caller should do a txg_wait_synced().
1697 *
1698 * 0: the I/O has been initiated.
1699 * The caller should log this blkptr in the done callback.
1700 * It is possible that the I/O will fail, in which case
1701 * the error will be reported to the done callback and
1702 * propagated to pio from zio_done().
1703 */
1704 int
1705 dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
1706 {
1707 dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
1708 objset_t *os = db->db_objset;
1709 dsl_dataset_t *ds = os->os_dsl_dataset;
1710 dbuf_dirty_record_t *dr;
1711 dmu_sync_arg_t *dsa;
1712 zbookmark_phys_t zb;
1713 zio_prop_t zp;
1714 dnode_t *dn;
1715
1716 ASSERT(pio != NULL);
1717 ASSERT(txg != 0);
1718
1719 /* dbuf is within the locked range */
1720 ASSERT3U(db->db.db_offset, >=, zgd->zgd_rl->r_off);
1721 ASSERT3U(db->db.db_offset + db->db.db_size, <=,
1722 zgd->zgd_rl->r_off + zgd->zgd_rl->r_len);
1723
1724 SET_BOOKMARK(&zb, ds->ds_object,
1725 db->db.db_object, db->db_level, db->db_blkid);
1726
1727 DB_DNODE_ENTER(db);
1728 dn = DB_DNODE(db);
1729 dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
1730 DB_DNODE_EXIT(db);
1731
1732 /*
1733 * If we're frozen (running ziltest), we always need to generate a bp.
1734 */
1735 if (txg > spa_freeze_txg(os->os_spa))
1736 return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
1737
1738 /*
1739 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
1740 * and us. If we determine that this txg is not yet syncing,
1741 * but it begins to sync a moment later, that's OK because the
1742 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
1743 */
1744 mutex_enter(&db->db_mtx);
1745
1746 if (txg <= spa_last_synced_txg(os->os_spa)) {
1747 /*
1748 * This txg has already synced. There's nothing to do.
1749 */
1750 mutex_exit(&db->db_mtx);
1751 return (SET_ERROR(EEXIST));
1752 }
1753
1754 if (txg <= spa_syncing_txg(os->os_spa)) {
1755 /*
1756 * This txg is currently syncing, so we can't mess with
1757 * the dirty record anymore; just write a new log block.
1758 */
1759 mutex_exit(&db->db_mtx);
1760 return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
1761 }
1762
1763 dr = db->db_last_dirty;
1764 while (dr && dr->dr_txg != txg)
1765 dr = dr->dr_next;
1766
1767 if (dr == NULL) {
1768 /*
1769 * There's no dr for this dbuf, so it must have been freed.
1770 * There's no need to log writes to freed blocks, so we're done.
1771 */
1772 mutex_exit(&db->db_mtx);
1773 return (SET_ERROR(ENOENT));
1774 }
1775
1776 ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
1777
1778 if (db->db_blkptr != NULL) {
1779 /*
1780 * We need to fill in zgd_bp with the current blkptr so that
1781 * the nopwrite code can check if we're writing the same
1782 * data that's already on disk. We can only nopwrite if we
1783 * are sure that after making the copy, db_blkptr will not
1784 * change until our i/o completes. We ensure this by
1785 * holding the db_mtx, and only allowing nopwrite if the
1786 * block is not already dirty (see below). This is verified
1787 * by dmu_sync_done(), which VERIFYs that the db_blkptr has
1788 * not changed.
1789 */
1790 *zgd->zgd_bp = *db->db_blkptr;
1791 }
1792
1793 /*
1794 * Assume the on-disk data is X, the current syncing data (in
1795 * txg - 1) is Y, and the current in-memory data is Z (currently
1796 * in dmu_sync).
1797 *
1798 * We usually want to perform a nopwrite if X and Z are the
1799 * same. However, if Y is different (i.e. the BP is going to
1800 * change before this write takes effect), then a nopwrite will
1801 * be incorrect - we would override with X, which could have
1802 * been freed when Y was written.
1803 *
1804 * (Note that this is not a concern when we are nop-writing from
1805 * syncing context, because X and Y must be identical, because
1806 * all previous txgs have been synced.)
1807 *
1808 * Therefore, we disable nopwrite if the current BP could change
1809 * before this TXG. There are two ways it could change: by
1810 * being dirty (dr_next is non-NULL), or by being freed
1811 * (dnode_block_freed()). This behavior is verified by
1812 * zio_done(), which VERIFYs that the override BP is identical
1813 * to the on-disk BP.
1814 */
1815 DB_DNODE_ENTER(db);
1816 dn = DB_DNODE(db);
1817 if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
1818 zp.zp_nopwrite = B_FALSE;
1819 DB_DNODE_EXIT(db);
1820
1821 ASSERT(dr->dr_txg == txg);
1822 if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
1823 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
1824 /*
1825 * We have already issued a sync write for this buffer,
1826 * or this buffer has already been synced. It could not
1827 * have been dirtied since, or we would have cleared the state.
1828 */
1829 mutex_exit(&db->db_mtx);
1830 return (SET_ERROR(EALREADY));
1831 }
1832
1833 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
1834 dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
1835 mutex_exit(&db->db_mtx);
1836
1837 dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1838 dsa->dsa_dr = dr;
1839 dsa->dsa_done = done;
1840 dsa->dsa_zgd = zgd;
1841 dsa->dsa_tx = NULL;
1842
1843 zio_nowait(arc_write(pio, os->os_spa, txg,
1844 zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
1845 &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
1846 ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
1847
1848 return (0);
1849 }
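
/*
 * Illustrative sketch (an assumed caller, not code from this file): how a
 * ZIL get_data-style callback would interpret dmu_sync()'s return values,
 * following the block comment above dmu_sync().
 */
static int
dmu_example_sync_caller(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done,
    zgd_t *zgd)
{
	int err = dmu_sync(pio, txg, done, zgd);

	switch (err) {
	case 0:
		/* I/O initiated; 'done' will receive the blkptr to log. */
		break;
	case EEXIST:
	case ENOENT:
		/* Already synced, or freed: do not log the write. */
		break;
	case EALREADY:
		/* A sync write is already in flight; track its progress. */
		break;
	case EIO:
	default:
		/* Could not do the I/O: fall back to txg_wait_synced(). */
		break;
	}
	return (err);
}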
1850
1851 int
1852 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
1853 dmu_tx_t *tx)
1854 {
1855 dnode_t *dn;
1856 int err;
1857
1858 err = dnode_hold(os, object, FTAG, &dn);
1859 if (err)
1860 return (err);
1861 err = dnode_set_blksz(dn, size, ibs, tx);
1862 dnode_rele(dn, FTAG);
1863 return (err);
1864 }
1865
1866 void
1867 dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
1868 dmu_tx_t *tx)
1869 {
1870 dnode_t *dn;
1871
1872 /*
1873 * Send streams include each object's checksum function. This
1874 * check ensures that the receiving system can understand the
1875 * checksum function transmitted.
1876 */
1877 ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
1878
1879 VERIFY0(dnode_hold(os, object, FTAG, &dn));
1880 ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
1881 dn->dn_checksum = checksum;
1882 dnode_setdirty(dn, tx);
1883 dnode_rele(dn, FTAG);
1884 }
1885
1886 void
1887 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
1888 dmu_tx_t *tx)
1889 {
1890 dnode_t *dn;
1891
1892 /*
1893 * Send streams include each object's compression function. This
1894 * check ensures that the receiving system can understand the
1895 * compression function transmitted.
1896 */
1897 ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
1898
1899 VERIFY0(dnode_hold(os, object, FTAG, &dn));
1900 dn->dn_compress = compress;
1901 dnode_setdirty(dn, tx);
1902 dnode_rele(dn, FTAG);
1903 }
1904
1905 int zfs_mdcomp_disable = 0;
1906
1907 /*
1908 * When the "redundant_metadata" property is set to "most", only indirect
1909 * blocks of this level and higher will have an additional ditto block.
1910 */
1911 int zfs_redundant_metadata_most_ditto_level = 2;
1912
1913 void
1914 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
1915 {
1916 dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
1917 boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
1918 (wp & WP_SPILL));
1919 enum zio_checksum checksum = os->os_checksum;
1920 enum zio_compress compress = os->os_compress;
1921 enum zio_checksum dedup_checksum = os->os_dedup_checksum;
1922 boolean_t dedup = B_FALSE;
1923 boolean_t nopwrite = B_FALSE;
1924 boolean_t dedup_verify = os->os_dedup_verify;
1925 int copies = os->os_copies;
1926
1927 /*
1928 * We maintain different write policies for each of the following
1929 * types of data:
1930 * 1. metadata
1931 * 2. preallocated blocks (i.e. level-0 blocks of a dump device)
1932 * 3. all other level 0 blocks
1933 */
1934 if (ismd) {
1935 if (zfs_mdcomp_disable) {
1936 compress = ZIO_COMPRESS_EMPTY;
1937 } else {
1938 /*
1939 * XXX -- we should design a compression algorithm
1940 * that specializes in arrays of bps.
1941 */
1942 compress = zio_compress_select(os->os_spa,
1943 ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
1944 }
1945
1946 /*
1947 * Metadata always gets checksummed. If the data
1948 * checksum is multi-bit correctable, and it's not a
1949 * ZBT-style checksum, then it's suitable for metadata
1950 * as well. Otherwise, the metadata checksum defaults
1951 * to fletcher4.
1952 */
1953 if (!(zio_checksum_table[checksum].ci_flags &
1954 ZCHECKSUM_FLAG_METADATA) ||
1955 (zio_checksum_table[checksum].ci_flags &
1956 ZCHECKSUM_FLAG_EMBEDDED))
1957 checksum = ZIO_CHECKSUM_FLETCHER_4;
1958
1959 if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
1960 (os->os_redundant_metadata ==
1961 ZFS_REDUNDANT_METADATA_MOST &&
1962 (level >= zfs_redundant_metadata_most_ditto_level ||
1963 DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
1964 copies++;
1965 } else if (wp & WP_NOFILL) {
1966 ASSERT(level == 0);
1967
1968 /*
1969 * If we're writing preallocated blocks, we aren't actually
1970 * writing them so don't set any policy properties. These
1971 * blocks are currently only used by an external subsystem
1972 * outside of zfs (i.e. dump) and not written by the zio
1973 * pipeline.
1974 */
1975 compress = ZIO_COMPRESS_OFF;
1976 checksum = ZIO_CHECKSUM_OFF;
1977 } else {
1978 compress = zio_compress_select(os->os_spa, dn->dn_compress,
1979 compress);
1980
1981 checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
1982 zio_checksum_select(dn->dn_checksum, checksum) :
1983 dedup_checksum;
1984
1985 /*
1986 * Determine the dedup setting. If we are in dmu_sync(),
1987 * we won't actually dedup now because that's all
1988 * done in syncing context; but we do want to use the
1989 * dedup checksum. If the checksum is not strong
1990 * enough to ensure unique signatures, force
1991 * dedup_verify.
1992 */
1993 if (dedup_checksum != ZIO_CHECKSUM_OFF) {
1994 dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
1995 if (!(zio_checksum_table[checksum].ci_flags &
1996 ZCHECKSUM_FLAG_DEDUP))
1997 dedup_verify = B_TRUE;
1998 }
1999
2000 /*
2001 * Enable nopwrite if we have a secure enough checksum
2002 * algorithm (see comment in zio_nop_write) and
2003 * compression is enabled. We don't enable nopwrite if
2004 * dedup is enabled, as the two features are mutually
2005 * exclusive.
2006 */
2007 nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
2008 ZCHECKSUM_FLAG_NOPWRITE) &&
2009 compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
2010 }
2011
2012 zp->zp_checksum = checksum;
2013 zp->zp_compress = compress;
2014 ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
2015
2016 zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
2017 zp->zp_level = level;
2018 zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
2019 zp->zp_dedup = dedup;
2020 zp->zp_dedup_verify = dedup && dedup_verify;
2021 zp->zp_nopwrite = nopwrite;
2022 }
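
/*
 * Editorial sketch, not part of the upstream file: how a dbuf_write()-style
 * consumer might consult dmu_write_policy() to fill a zio_prop_t before
 * issuing a write.  The function name and the bare level-0/no-flags call
 * are illustrative assumptions.
 */
#if 0
static void
example_write_policy(objset_t *os, dmu_buf_impl_t *db)
{
	zio_prop_t zp;

	DB_DNODE_ENTER(db);
	/* Plain data block: no WP_NOFILL, WP_DMU_SYNC or WP_SPILL flags. */
	dmu_write_policy(os, DB_DNODE(db), db->db_level, 0, &zp);
	DB_DNODE_EXIT(db);

	/* The real caller would now hand zp to zio_write(). */
	(void) zp;
}
#endif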
2023
2024 /*
2025 * This function is only called from zfs_holey_common() for zpl_llseek()
2026 * in order to determine the location of holes. To accurately report
2027 * holes, all dirty data must be synced to disk. This causes extremely
2028 * poor performance when seeking for holes in a dirty file. As a compromise,
2029 * only provide hole data when the dnode is clean. When a dnode is dirty,
2030 * report the dnode as having no holes, which is always a safe thing to do.
2031 */
2032 int
2033 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
2034 {
2035 dnode_t *dn;
2036 int i, err;
2037 boolean_t clean = B_TRUE;
2038
2039 err = dnode_hold(os, object, FTAG, &dn);
2040 if (err)
2041 return (err);
2042
2043 /*
2044 * Check whether the dnode is dirty
2045 */
2046 for (i = 0; i < TXG_SIZE; i++) {
2047 if (list_link_active(&dn->dn_dirty_link[i])) {
2048 clean = B_FALSE;
2049 break;
2050 }
2051 }
2052
2053 /*
2054 * If the compatibility option is on, sync any current changes before
2055 * we go trundling through the block pointers.
2056 */
2057 if (!clean && zfs_dmu_offset_next_sync) {
2058 clean = B_TRUE;
2059 dnode_rele(dn, FTAG);
2060 txg_wait_synced(dmu_objset_pool(os), 0);
2061 err = dnode_hold(os, object, FTAG, &dn);
2062 if (err)
2063 return (err);
2064 }
2065
2066 if (clean)
2067 err = dnode_next_offset(dn,
2068 (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
2069 else
2070 err = SET_ERROR(EBUSY);
2071
2072 dnode_rele(dn, FTAG);
2073
2074 return (err);
2075 }
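
/*
 * Editorial sketch, not part of the upstream file: a SEEK_HOLE-style consumer
 * of dmu_offset_next().  Treating EBUSY as "the only known hole is the
 * implicit one at end of file" is an illustrative policy choice consistent
 * with the comment above; the helper name is hypothetical.
 */
#if 0
static int
example_seek_hole(objset_t *os, uint64_t object, uint64_t filesize,
    uint64_t *off)
{
	int err = dmu_offset_next(os, object, B_TRUE /* hole */, off);

	if (err == EBUSY) {
		/* Dnode is dirty and no txg sync was forced; stay safe. */
		*off = filesize;
		err = 0;
	}
	return (err);
}
#endif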
2076
2077 void
2078 __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
2079 {
2080 dnode_phys_t *dnp = dn->dn_phys;
2081 int i;
2082
2083 doi->doi_data_block_size = dn->dn_datablksz;
2084 doi->doi_metadata_block_size = dn->dn_indblkshift ?
2085 1ULL << dn->dn_indblkshift : 0;
2086 doi->doi_type = dn->dn_type;
2087 doi->doi_bonus_type = dn->dn_bonustype;
2088 doi->doi_bonus_size = dn->dn_bonuslen;
2089 doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
2090 doi->doi_indirection = dn->dn_nlevels;
2091 doi->doi_checksum = dn->dn_checksum;
2092 doi->doi_compress = dn->dn_compress;
2093 doi->doi_nblkptr = dn->dn_nblkptr;
2094 doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
2095 doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
2096 doi->doi_fill_count = 0;
2097 for (i = 0; i < dnp->dn_nblkptr; i++)
2098 doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
2099 }
2100
2101 void
2102 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
2103 {
2104 rw_enter(&dn->dn_struct_rwlock, RW_READER);
2105 mutex_enter(&dn->dn_mtx);
2106
2107 __dmu_object_info_from_dnode(dn, doi);
2108
2109 mutex_exit(&dn->dn_mtx);
2110 rw_exit(&dn->dn_struct_rwlock);
2111 }
2112
2113 /*
2114 * Get information on a DMU object.
2115 * If doi is NULL, this call just indicates whether the object exists.
2116 */
2117 int
2118 dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
2119 {
2120 dnode_t *dn;
2121 int err = dnode_hold(os, object, FTAG, &dn);
2122
2123 if (err)
2124 return (err);
2125
2126 if (doi != NULL)
2127 dmu_object_info_from_dnode(dn, doi);
2128
2129 dnode_rele(dn, FTAG);
2130 return (0);
2131 }
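
/*
 * Editorial sketch, not part of the upstream file: using a NULL doi as a
 * cheap existence test, per the comment above.  The helper name is
 * hypothetical.
 */
#if 0
static boolean_t
example_object_exists(objset_t *os, uint64_t object)
{
	return (dmu_object_info(os, object, NULL) == 0);
}
#endif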
2132
2133 /*
2134 * As above, but faster; can be used when you have a held dbuf in hand.
2135 */
2136 void
2137 dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
2138 {
2139 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2140
2141 DB_DNODE_ENTER(db);
2142 dmu_object_info_from_dnode(DB_DNODE(db), doi);
2143 DB_DNODE_EXIT(db);
2144 }
2145
2146 /*
2147 * Faster still when you only care about the size.
2148 * This is specifically optimized for zfs_getattr().
2149 */
2150 void
2151 dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
2152 u_longlong_t *nblk512)
2153 {
2154 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2155 dnode_t *dn;
2156
2157 DB_DNODE_ENTER(db);
2158 dn = DB_DNODE(db);
2159
2160 *blksize = dn->dn_datablksz;
2161 /* add in number of slots used for the dnode itself */
2162 *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
2163 SPA_MINBLOCKSHIFT) + dn->dn_num_slots;
2164 DB_DNODE_EXIT(db);
2165 }
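
/*
 * Editorial sketch, not part of the upstream file: converting the 512-byte
 * block count returned by dmu_object_size_from_db() into bytes.  The helper
 * name is hypothetical; SPA_MINBLOCKSHIFT (9) is the shift for the 512-byte
 * units counted in nblk512.
 */
#if 0
static uint64_t
example_object_used_bytes(dmu_buf_t *db)
{
	uint32_t blksize;
	u_longlong_t nblk512;

	dmu_object_size_from_db(db, &blksize, &nblk512);

	/* nblk512 already includes the slots used by the dnode itself. */
	return ((uint64_t)nblk512 << SPA_MINBLOCKSHIFT);
}
#endif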
2166
2167 void
2168 dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
2169 {
2170 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2171 dnode_t *dn;
2172
2173 DB_DNODE_ENTER(db);
2174 dn = DB_DNODE(db);
2175 *dnsize = dn->dn_num_slots << DNODE_SHIFT;
2176 DB_DNODE_EXIT(db);
2177 }
2178
2179 void
2180 byteswap_uint64_array(void *vbuf, size_t size)
2181 {
2182 uint64_t *buf = vbuf;
2183 size_t count = size >> 3;
2184 int i;
2185
2186 ASSERT((size & 7) == 0);
2187
2188 for (i = 0; i < count; i++)
2189 buf[i] = BSWAP_64(buf[i]);
2190 }
2191
2192 void
2193 byteswap_uint32_array(void *vbuf, size_t size)
2194 {
2195 uint32_t *buf = vbuf;
2196 size_t count = size >> 2;
2197 int i;
2198
2199 ASSERT((size & 3) == 0);
2200
2201 for (i = 0; i < count; i++)
2202 buf[i] = BSWAP_32(buf[i]);
2203 }
2204
2205 void
2206 byteswap_uint16_array(void *vbuf, size_t size)
2207 {
2208 uint16_t *buf = vbuf;
2209 size_t count = size >> 1;
2210 int i;
2211
2212 ASSERT((size & 1) == 0);
2213
2214 for (i = 0; i < count; i++)
2215 buf[i] = BSWAP_16(buf[i]);
2216 }
2217
2218 /* ARGSUSED */
2219 void
2220 byteswap_uint8_array(void *vbuf, size_t size)
2221 {
2222 }
2223
2224 void
2225 dmu_init(void)
2226 {
2227 abd_init();
2228 zfs_dbgmsg_init();
2229 sa_cache_init();
2230 xuio_stat_init();
2231 dmu_objset_init();
2232 dnode_init();
2233 zfetch_init();
2234 dmu_tx_init();
2235 l2arc_init();
2236 arc_init();
2237 dbuf_init();
2238 }
2239
2240 void
2241 dmu_fini(void)
2242 {
2243 arc_fini(); /* arc depends on l2arc, so arc must go first */
2244 l2arc_fini();
2245 dmu_tx_fini();
2246 zfetch_fini();
2247 dbuf_fini();
2248 dnode_fini();
2249 dmu_objset_fini();
2250 xuio_stat_fini();
2251 sa_cache_fini();
2252 zfs_dbgmsg_fini();
2253 abd_fini();
2254 }
2255
2256 #if defined(_KERNEL) && defined(HAVE_SPL)
2257 EXPORT_SYMBOL(dmu_bonus_hold);
2258 EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
2259 EXPORT_SYMBOL(dmu_buf_rele_array);
2260 EXPORT_SYMBOL(dmu_prefetch);
2261 EXPORT_SYMBOL(dmu_free_range);
2262 EXPORT_SYMBOL(dmu_free_long_range);
2263 EXPORT_SYMBOL(dmu_free_long_object);
2264 EXPORT_SYMBOL(dmu_read);
2265 EXPORT_SYMBOL(dmu_read_by_dnode);
2266 EXPORT_SYMBOL(dmu_write);
2267 EXPORT_SYMBOL(dmu_write_by_dnode);
2268 EXPORT_SYMBOL(dmu_prealloc);
2269 EXPORT_SYMBOL(dmu_object_info);
2270 EXPORT_SYMBOL(dmu_object_info_from_dnode);
2271 EXPORT_SYMBOL(dmu_object_info_from_db);
2272 EXPORT_SYMBOL(dmu_object_size_from_db);
2273 EXPORT_SYMBOL(dmu_object_dnsize_from_db);
2274 EXPORT_SYMBOL(dmu_object_set_blocksize);
2275 EXPORT_SYMBOL(dmu_object_set_checksum);
2276 EXPORT_SYMBOL(dmu_object_set_compress);
2277 EXPORT_SYMBOL(dmu_write_policy);
2278 EXPORT_SYMBOL(dmu_sync);
2279 EXPORT_SYMBOL(dmu_request_arcbuf);
2280 EXPORT_SYMBOL(dmu_return_arcbuf);
2281 EXPORT_SYMBOL(dmu_assign_arcbuf);
2282 EXPORT_SYMBOL(dmu_buf_hold);
2283 EXPORT_SYMBOL(dmu_ot);
2284
2285 /* BEGIN CSTYLED */
2286 module_param(zfs_mdcomp_disable, int, 0644);
2287 MODULE_PARM_DESC(zfs_mdcomp_disable, "Disable metadata compression");
2288
2289 module_param(zfs_nopwrite_enabled, int, 0644);
2290 MODULE_PARM_DESC(zfs_nopwrite_enabled, "Enable NOP writes");
2291
2292 module_param(zfs_per_txg_dirty_frees_percent, ulong, 0644);
2293 MODULE_PARM_DESC(zfs_per_txg_dirty_frees_percent,
2294 "percentage of dirtied blocks from frees in one TXG");
2295
2296 module_param(zfs_dmu_offset_next_sync, int, 0644);
2297 MODULE_PARM_DESC(zfs_dmu_offset_next_sync,
2298 "Enable forcing txg sync to find holes");
2299
2300 /* END CSTYLED */
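
/*
 * Editorial note, not part of the upstream file: because the parameters
 * above are registered with mode 0644, they are expected to be adjustable
 * at runtime through /sys/module/zfs/parameters/ once the module is loaded,
 * e.g. "echo 0 > /sys/module/zfs/parameters/zfs_nopwrite_enabled".
 */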
2301
2302 #endif