]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
8d35c149 | 23 | * Copyright 2011 Nexenta Systems, Inc. All rights reserved. |
e6d3a843 | 24 | * Copyright (c) 2011, 2015 by Delphix. All rights reserved. |
788eb90c | 25 | * Copyright (c) 2014, Joyent, Inc. All rights reserved. |
47dfff3b | 26 | * Copyright 2014 HybridCluster. All rights reserved. |
b607405f | 27 | * Copyright 2016 RackTop Systems. |
a0bd735a | 28 | * Copyright (c) 2016 Actifio, Inc. All rights reserved. |
8d35c149 | 29 | */ |
34dc7c2f | 30 | |
34dc7c2f BB |
31 | #include <sys/dmu.h> |
32 | #include <sys/dmu_impl.h> | |
33 | #include <sys/dmu_tx.h> | |
34 | #include <sys/dbuf.h> | |
35 | #include <sys/dnode.h> | |
36 | #include <sys/zfs_context.h> | |
37 | #include <sys/dmu_objset.h> | |
38 | #include <sys/dmu_traverse.h> | |
39 | #include <sys/dsl_dataset.h> | |
40 | #include <sys/dsl_dir.h> | |
428870ff | 41 | #include <sys/dsl_prop.h> |
34dc7c2f BB |
42 | #include <sys/dsl_pool.h> |
43 | #include <sys/dsl_synctask.h> | |
044baf00 | 44 | #include <sys/spa_impl.h> |
34dc7c2f BB |
45 | #include <sys/zfs_ioctl.h> |
46 | #include <sys/zap.h> | |
47 | #include <sys/zio_checksum.h> | |
428870ff BB |
48 | #include <sys/zfs_znode.h> |
49 | #include <zfs_fletcher.h> | |
50 | #include <sys/avl.h> | |
51 | #include <sys/ddt.h> | |
572e2857 | 52 | #include <sys/zfs_onexit.h> |
13fe0198 MA |
53 | #include <sys/dmu_send.h> |
54 | #include <sys/dsl_destroy.h> | |
9b67f605 | 55 | #include <sys/blkptr.h> |
da536844 | 56 | #include <sys/dsl_bookmark.h> |
9b67f605 | 57 | #include <sys/zfeature.h> |
fcff0f35 | 58 | #include <sys/bqueue.h> |
a0bd735a | 59 | #include <sys/zvol.h> |
f74b821a | 60 | #include <sys/policy.h> |
34dc7c2f | 61 | |
330d06f9 MA |
/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;
/* Max bytes of send records allowed to be queued ahead of the consumer. */
int zfs_send_queue_length = SPA_MAXBLOCKSIZE;
/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
int zfs_send_set_freerecords_bit = B_TRUE;
/* Set this tunable to FALSE to disable sending unmodified spill blocks. */
int zfs_send_unmodified_spill_blocks = B_TRUE;

/*
 * Use this to override the recordsize calculation for fast zfs send estimates.
 */
unsigned long zfs_override_estimate_recordsize = 0;
74 | ||
fcff0f35 PD |
/*
 * BP_SPAN: number of bytes of the object spanned by one block pointer at
 * the given level of indirection (level 0 == data blocks).
 */
#define	BP_SPAN(datablkszsec, indblkshift, level) \
	(((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (indblkshift - SPA_BLKPTRSHIFT)))

/* Arguments handed to the traverse thread (producer side of the queue). */
struct send_thread_arg {
	bqueue_t q;			/* queue of send_block_records */
	dsl_dataset_t *ds;		/* Dataset to traverse */
	uint64_t fromtxg;		/* Traverse from this txg */
	int flags;			/* flags to pass to traverse_dataset */
	int error_code;			/* traversal error, if any (not EINTR) */
	boolean_t cancel;		/* set by consumer to abort traversal */
	zbookmark_phys_t resume;	/* bookmark to resume from, if resuming */
};

/* One block's worth of work, passed from the traverse thread to do_dump(). */
struct send_block_record {
	boolean_t eos_marker;		/* Marks the end of the stream */
	blkptr_t bp;			/* block pointer to dump */
	zbookmark_phys_t zb;		/* where bp lives in the dataset */
	uint8_t indblkshift;		/* from the owning dnode */
	uint16_t datablkszsec;		/* from the owning dnode */
	bqueue_node_t ln;		/* linkage for the bqueue */
};
97 | ||
044baf00 BB |
/* Argument bundle for dump_bytes_cb() when dispatched via taskq. */
typedef struct dump_bytes_io {
	dmu_sendarg_t	*dbi_dsp;	/* send state (vp, offset, error) */
	void		*dbi_buf;	/* data to write to the stream */
	int		dbi_len;	/* length of dbi_buf in bytes */
} dump_bytes_io_t;

/* Forward declaration: dump_dnode() may emit spill records via do_dump(). */
static int do_dump(dmu_sendarg_t *dsa, struct send_block_record *data);
105 | ||
044baf00 | 106 | static void |
b58986ee | 107 | dump_bytes_cb(void *arg) |
34dc7c2f | 108 | { |
044baf00 BB |
109 | dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; |
110 | dmu_sendarg_t *dsp = dbi->dbi_dsp; | |
47dfff3b | 111 | dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os); |
34dc7c2f | 112 | ssize_t resid; /* have to get resid to get detailed errno */ |
f8866f8a ER |
113 | |
114 | /* | |
b5256303 | 115 | * The code does not rely on len being a multiple of 8. We keep |
f8866f8a ER |
116 | * this assertion because of the corresponding assertion in |
117 | * receive_read(). Keeping this assertion ensures that we do not | |
118 | * inadvertently break backwards compatibility (causing the assertion | |
b5256303 TC |
119 | * in receive_read() to trigger on old software). Newer feature flags |
120 | * (such as raw send) may break this assertion since they were | |
121 | * introduced after the requirement was made obsolete. | |
f8866f8a ER |
122 | */ |
123 | ||
b5256303 TC |
124 | ASSERT(dbi->dbi_len % 8 == 0 || |
125 | (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0); | |
34dc7c2f | 126 | |
37abac6d | 127 | dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp, |
044baf00 | 128 | (caddr_t)dbi->dbi_buf, dbi->dbi_len, |
34dc7c2f | 129 | 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid); |
37abac6d BP |
130 | |
131 | mutex_enter(&ds->ds_sendstream_lock); | |
044baf00 | 132 | *dsp->dsa_off += dbi->dbi_len; |
37abac6d | 133 | mutex_exit(&ds->ds_sendstream_lock); |
044baf00 BB |
134 | } |
135 | ||
/*
 * Synchronously write len bytes of buf to the send stream.  Returns the
 * latched stream error (0 on success).  On small-stack platforms the actual
 * vn_rdwr() is bounced through a taskq thread to guarantee stack headroom.
 */
static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
	dump_bytes_io_t dbi;

	dbi.dbi_dsp = dsp;
	dbi.dbi_buf = buf;
	dbi.dbi_len = len;

#if defined(HAVE_LARGE_STACKS)
	dump_bytes_cb(&dbi);
#else
	/*
	 * The vn_rdwr() call is performed in a taskq to ensure that there is
	 * always enough stack space to write safely to the target filesystem.
	 * The ZIO_TYPE_FREE threads are used because there can be a lot of
	 * them and they are used in vdev_file.c for a similar purpose.
	 */
	spa_taskq_dispatch_sync(dmu_objset_spa(dsp->dsa_os), ZIO_TYPE_FREE,
	    ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP);
#endif /* HAVE_LARGE_STACKS */

	return (dsp->dsa_err);
}
160 | ||
37f8a883 MA |
/*
 * For all record types except BEGIN, fill in the checksum (overlaid in
 * drr_u.drr_checksum.drr_checksum).  The checksum verifies everything
 * up to the start of the checksum itself.
 *
 * Emits the record header (and optional payload) to the stream, folding
 * both into the running stream checksum dsa_zc.  Returns 0 or EINTR if
 * the underlying write failed.
 */
static int
dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
{
	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
	/* Fold in everything before the checksum field... */
	(void) fletcher_4_incremental_native(dsp->dsa_drr,
	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    &dsp->dsa_zc);
	if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
		dsp->dsa_sent_begin = B_TRUE;
	} else {
		/* ...then overlay the running checksum into the record. */
		ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
		    drr_checksum.drr_checksum));
		dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
	}
	if (dsp->dsa_drr->drr_type == DRR_END) {
		dsp->dsa_sent_end = B_TRUE;
	}
	/* The checksum field itself is also part of the stream checksum. */
	(void) fletcher_4_incremental_native(&dsp->dsa_drr->
	    drr_u.drr_checksum.drr_checksum,
	    sizeof (zio_cksum_t), &dsp->dsa_zc);
	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));
	if (payload_len != 0) {
		(void) fletcher_4_incremental_native(payload, payload_len,
		    &dsp->dsa_zc);
		if (dump_bytes(dsp, payload, payload_len) != 0)
			return (SET_ERROR(EINTR));
	}
	return (0);
}
197 | ||
e6d3a843 PD |
/*
 * Fill in the drr_free struct, or perform aggregation if the previous record is
 * also a free record, and the two are adjacent.
 *
 * Note that we send free records even for a full send, because we want to be
 * able to receive a full send as a clone, which requires a list of all the free
 * and freeobject records that were generated on the source.
 */
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

	/*
	 * When we receive a free record, dbuf_free_range() assumes
	 * that the receiving system doesn't have any dbufs in the range
	 * being freed.  This is always true because there is a one-record
	 * constraint: we only send one WRITE record for any given
	 * object,offset.  We know that the one-record constraint is
	 * true because we always send data in increasing order by
	 * object,offset.
	 *
	 * If the increasing-order constraint ever changes, we should find
	 * another way to assert that the one-record constraint is still
	 * satisfied.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	if (dsp->dsa_pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is
		 * DMU_OBJECT_END (because dump_dnode is the only place where
		 * this function is called with a DMU_OBJECT_END, and only after
		 * flushing any pending record).
		 */
		ASSERT(length != DMU_OBJECT_END);
		/*
		 * Check to see whether this free block can be aggregated
		 * with pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			/* Clamp to DMU_OBJECT_END on unsigned overflow. */
			if (offset + length < offset)
				drrf->drr_length = DMU_OBJECT_END;
			else
				drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_record(dsp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	/* Clamp to DMU_OBJECT_END on unsigned overflow. */
	if (offset + length < offset)
		drrf->drr_length = DMU_OBJECT_END;
	else
		drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == DMU_OBJECT_END) {
		/* Free-to-end records cannot aggregate; flush immediately. */
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}
288 | ||
/*
 * Emit a DRR_WRITE record for one data block.  Handles plain, compressed
 * (lsize != psize), and raw (encrypted) payloads; flushes any pending
 * aggregated record first since WRITE cannot be aggregated.
 */
static int
dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object,
    uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data)
{
	uint64_t payload_size;
	boolean_t raw = (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * We send data in increasing object, offset order.
	 * See comment in dump_free() for details.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));
	dsp->dsa_last_data_object = object;
	dsp->dsa_last_data_offset = offset + lsize - 1;

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a WRITE record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_logical_size = lsize;

	/* only set the compression fields if the buf is compressed or raw */
	if (raw || lsize != psize) {
		ASSERT(!BP_IS_EMBEDDED(bp));
		ASSERT3S(psize, >, 0);

		if (raw) {
			ASSERT(BP_IS_PROTECTED(bp));

			/*
			 * This is a raw protected block so we need to pass
			 * along everything the receiving side will need to
			 * interpret this block, including the byteswap, salt,
			 * IV, and MAC.
			 */
			if (BP_SHOULD_BYTESWAP(bp))
				drrw->drr_flags |= DRR_RAW_BYTESWAP;
			zio_crypt_decode_params_bp(bp, drrw->drr_salt,
			    drrw->drr_iv);
			zio_crypt_decode_mac_bp(bp, drrw->drr_mac);
		} else {
			/* this is a compressed block */
			ASSERT(dsp->dsa_featureflags &
			    DMU_BACKUP_FEATURE_COMPRESSED);
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
			ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
			ASSERT3S(lsize, >=, psize);
		}

		/* set fields common to compressed and raw sends */
		drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
		drrw->drr_compressed_size = psize;
		payload_size = drrw->drr_compressed_size;
	} else {
		payload_size = drrw->drr_logical_size;
	}

	if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) {
		/*
		 * There's no pre-computed checksum for partial-block writes,
		 * embedded BP's, or encrypted BP's that are being sent as
		 * plaintext, so (like fletcher4-checkummed blocks) userland
		 * will have to compute a dedup-capable checksum itself.
		 */
		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
	} else {
		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
		if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
		    ZCHECKSUM_FLAG_DEDUP)
			drrw->drr_flags |= DRR_CHECKSUM_DEDUP;
		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
		DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp));
		drrw->drr_key.ddk_cksum = bp->blk_cksum;
	}

	if (dump_record(dsp, data, payload_size) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
388 | ||
9b67f605 MA |
/*
 * Emit a DRR_WRITE_EMBEDDED record for a block whose data is embedded in
 * the block pointer itself; the (compressed) payload is decoded from the
 * bp and sent rounded up to 8 bytes.
 */
static int
dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    int blksz, const blkptr_t *bp)
{
	char buf[BPE_PAYLOAD_SIZE];
	struct drr_write_embedded *drrw =
	    &(dsp->dsa_drr->drr_u.drr_write_embedded);

	/* Flush any pending aggregated record; types can't be mixed. */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	ASSERT(BP_IS_EMBEDDED(bp));

	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
	drrw->drr_object = object;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_compression = BP_GET_COMPRESS(bp);
	drrw->drr_etype = BPE_GET_ETYPE(bp);
	drrw->drr_lsize = BPE_GET_LSIZE(bp);
	drrw->drr_psize = BPE_GET_PSIZE(bp);

	decode_embedded_bp_compressed(bp, buf);

	/* Payload is padded to 8 bytes to match receive_read()'s alignment. */
	if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
422 | ||
428870ff | 423 | static int |
b5256303 | 424 | dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data) |
428870ff | 425 | { |
37abac6d | 426 | struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); |
b5256303 | 427 | uint64_t blksz = BP_GET_LSIZE(bp); |
b0ee5946 | 428 | uint64_t payload_size = blksz; |
428870ff | 429 | |
37abac6d | 430 | if (dsp->dsa_pending_op != PENDING_NONE) { |
37f8a883 | 431 | if (dump_record(dsp, NULL, 0) != 0) |
2e528b49 | 432 | return (SET_ERROR(EINTR)); |
37abac6d | 433 | dsp->dsa_pending_op = PENDING_NONE; |
428870ff BB |
434 | } |
435 | ||
436 | /* write a SPILL record */ | |
37abac6d BP |
437 | bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); |
438 | dsp->dsa_drr->drr_type = DRR_SPILL; | |
428870ff BB |
439 | drrs->drr_object = object; |
440 | drrs->drr_length = blksz; | |
37abac6d | 441 | drrs->drr_toguid = dsp->dsa_toguid; |
34dc7c2f | 442 | |
caf9dd20 BB |
443 | /* See comment in dump_dnode() for full details */ |
444 | if (zfs_send_unmodified_spill_blocks && | |
445 | (bp->blk_birth <= dsp->dsa_fromtxg)) { | |
446 | drrs->drr_flags |= DRR_SPILL_UNMODIFIED; | |
447 | } | |
448 | ||
b5256303 | 449 | /* handle raw send fields */ |
9b840763 TC |
450 | if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { |
451 | ASSERT(BP_IS_PROTECTED(bp)); | |
452 | ||
b5256303 TC |
453 | if (BP_SHOULD_BYTESWAP(bp)) |
454 | drrs->drr_flags |= DRR_RAW_BYTESWAP; | |
455 | drrs->drr_compressiontype = BP_GET_COMPRESS(bp); | |
456 | drrs->drr_compressed_size = BP_GET_PSIZE(bp); | |
457 | zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv); | |
458 | zio_crypt_decode_mac_bp(bp, drrs->drr_mac); | |
b0ee5946 | 459 | payload_size = drrs->drr_compressed_size; |
b5256303 TC |
460 | } |
461 | ||
b0ee5946 | 462 | if (dump_record(dsp, data, payload_size) != 0) |
2e528b49 | 463 | return (SET_ERROR(EINTR)); |
34dc7c2f BB |
464 | return (0); |
465 | } | |
466 | ||
/*
 * Emit (or aggregate into a pending) DRR_FREEOBJECTS record covering
 * numobjs object IDs starting at firstobj.  Adjacent runs are merged
 * into a single pending record.
 */
static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
	uint64_t maxobj = DNODES_PER_BLOCK *
	    (DMU_META_DNODE(dsp->dsa_os)->dn_maxblkid + 1);

	/*
	 * ZoL < 0.7 does not handle large FREEOBJECTS records correctly,
	 * leading to zfs recv never completing.  to avoid this issue, don't
	 * send FREEOBJECTS records for object IDs which cannot exist on the
	 * receiving side.
	 */
	if (maxobj > 0) {
		if (maxobj < firstobj)
			return (0);

		if (maxobj < firstobj + numobjs)
			numobjs = maxobj - firstobj;
	}

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with pending one
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_record(dsp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dsp->dsa_toguid;

	dsp->dsa_pending_op = PENDING_FREEOBJECTS;

	return (0);
}
528 | ||
/*
 * Emit a DRR_OBJECT record describing one dnode (type, blocksize, bonus,
 * etc.), followed by a free record for everything past the end of the
 * object and, when enabled, DRR_SPILL records for unmodified spill blocks.
 * A hole/freed dnode is turned into a DRR_FREEOBJECTS record instead.
 */
static int
dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object,
    dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
	int bonuslen;

	if (object < dsp->dsa_resume_object) {
		/*
		 * Note: when resuming, we will visit all the dnodes in
		 * the block of dnodes that we are resuming from.  In
		 * this case it's unnecessary to send the dnodes prior to
		 * the one we are resuming from.  We should be at most one
		 * block's worth of dnodes behind the resume point.
		 */
		ASSERT3U(dsp->dsa_resume_object - object, <,
		    1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
		return (0);
	}

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dsp, object, 1));

	/* Flush any pending aggregated record; types can't be mixed. */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_dn_slots = dnp->dn_extra_slots + 1;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dsp->dsa_toguid;

	/* Old receivers can't handle blocks larger than 128KiB. */
	if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;

	bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8);

	if ((dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW)) {
		ASSERT(BP_IS_ENCRYPTED(bp));

		if (BP_SHOULD_BYTESWAP(bp))
			drro->drr_flags |= DRR_RAW_BYTESWAP;

		/* needed for reconstructing dnp on recv side */
		drro->drr_maxblkid = dnp->dn_maxblkid;
		drro->drr_indblkshift = dnp->dn_indblkshift;
		drro->drr_nlevels = dnp->dn_nlevels;
		drro->drr_nblkptr = dnp->dn_nblkptr;

		/*
		 * Since we encrypt the entire bonus area, the (raw) part
		 * beyond the bonuslen is actually nonzero, so we need
		 * to send it.
		 */
		if (bonuslen != 0) {
			drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp);
			bonuslen = drro->drr_raw_bonuslen;
		}
	}

	/*
	 * DRR_OBJECT_SPILL is set for every dnode which references a
	 * spill block.  This allows the receiving pool to definitively
	 * determine when a spill block should be kept or freed.
	 */
	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
		drro->drr_flags |= DRR_OBJECT_SPILL;

	if (dump_record(dsp, DN_BONUS(dnp), bonuslen) != 0)
		return (SET_ERROR(EINTR));

	/* Free anything past the end of the file. */
	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
		return (SET_ERROR(EINTR));

	/*
	 * Send DRR_SPILL records for unmodified spill blocks.  This is useful
	 * because changing certain attributes of the object (e.g. blocksize)
	 * can cause old versions of ZFS to incorrectly remove a spill block.
	 * Including these records in the stream forces an up to date version
	 * to always be written ensuring they're never lost.  Current versions
	 * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
	 * ignore these unmodified spill blocks.
	 */
	if (zfs_send_unmodified_spill_blocks &&
	    (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
	    (DN_SPILL_BLKPTR(dnp)->blk_birth <= dsp->dsa_fromtxg)) {
		struct send_block_record record;

		bzero(&record, sizeof (struct send_block_record));
		record.eos_marker = B_FALSE;
		record.bp = *DN_SPILL_BLKPTR(dnp);
		SET_BOOKMARK(&(record.zb), dmu_objset_id(dsp->dsa_os),
		    object, 0, DMU_SPILL_BLKID);

		if (do_dump(dsp, &record) != 0)
			return (SET_ERROR(EINTR));
	}

	if (dsp->dsa_err != 0)
		return (SET_ERROR(EINTR));

	return (0);
}
645 | ||
b5256303 TC |
/*
 * Emit a DRR_OBJECT_RANGE record carrying the crypt parameters (salt, IV,
 * MAC) for a raw dnode block spanning numslots dnode slots starting at
 * firstobj.  Only used for raw (encrypted) sends.
 */
static int
dump_object_range(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t firstobj,
    uint64_t numslots)
{
	struct drr_object_range *drror =
	    &(dsp->dsa_drr->drr_u.drr_object_range);

	/* we only use this record type for raw sends */
	ASSERT(BP_IS_PROTECTED(bp));
	ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
	ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
	ASSERT0(BP_GET_LEVEL(bp));

	/* Flush any pending aggregated record; types can't be mixed. */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT_RANGE;
	drror->drr_firstobj = firstobj;
	drror->drr_numslots = numslots;
	drror->drr_toguid = dsp->dsa_toguid;
	if (BP_SHOULD_BYTESWAP(bp))
		drror->drr_flags |= DRR_RAW_BYTESWAP;
	zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv);
	zio_crypt_decode_mac_bp(bp, drror->drr_mac);

	if (dump_record(dsp, NULL, 0) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
680 | ||
9b67f605 MA |
681 | static boolean_t |
682 | backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) | |
683 | { | |
684 | if (!BP_IS_EMBEDDED(bp)) | |
685 | return (B_FALSE); | |
686 | ||
687 | /* | |
688 | * Compression function must be legacy, or explicitly enabled. | |
689 | */ | |
690 | if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && | |
2aa34383 | 691 | !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4))) |
9b67f605 MA |
692 | return (B_FALSE); |
693 | ||
694 | /* | |
695 | * Embed type must be explicitly enabled. | |
696 | */ | |
697 | switch (BPE_GET_ETYPE(bp)) { | |
698 | case BP_EMBEDDED_TYPE_DATA: | |
699 | if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) | |
700 | return (B_TRUE); | |
701 | break; | |
702 | default: | |
703 | return (B_FALSE); | |
704 | } | |
705 | return (B_FALSE); | |
706 | } | |
707 | ||
fcff0f35 PD |
/*
 * This is the callback function to traverse_dataset that acts as the worker
 * thread for dmu_send_impl.
 *
 * Packages each visited block pointer into a send_block_record and enqueues
 * it for the consumer; holes at the dnode level and negative levels are
 * skipped.  The queue enqueue weight is the block's data size.
 */
/*ARGSUSED*/
static int
send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
{
	struct send_thread_arg *sta = arg;
	struct send_block_record *record;
	uint64_t record_size;
	int err = 0;

	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
	    zb->zb_object >= sta->resume.zb_object);
	ASSERT3P(sta->ds, !=, NULL);

	/* Consumer requested cancellation; abort the traversal with EINTR. */
	if (sta->cancel)
		return (SET_ERROR(EINTR));

	if (bp == NULL) {
		ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
		return (0);
	} else if (zb->zb_level < 0) {
		return (0);
	}

	record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
	record->eos_marker = B_FALSE;
	record->bp = *bp;
	record->zb = *zb;
	record->indblkshift = dnp->dn_indblkshift;
	record->datablkszsec = dnp->dn_datablkszsec;
	record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	bqueue_enqueue(&sta->q, record, record_size);

	return (err);
}
747 | ||
/*
 * This is the callback function to traverse_dataset that acts as the worker
 * thread for dmu_send_impl.  It kicks off the traverse_dataset.  It also
 * handles setting the error code of the thread in case something goes wrong,
 * and pushes the End of Stream record when the traverse_dataset call has
 * finished.  If there is no dataset to traverse, the thread immediately
 * pushes the End of Stream marker.
 */
static void
send_traverse_thread(void *arg)
{
	struct send_thread_arg *st_arg = arg;
	int err;
	struct send_block_record *data;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	if (st_arg->ds != NULL) {
		err = traverse_dataset_resume(st_arg->ds,
		    st_arg->fromtxg, &st_arg->resume,
		    st_arg->flags, send_cb, st_arg);

		/*
		 * EINTR means the consumer cancelled us via st_arg->cancel;
		 * that is not an error of the traversal itself, so don't
		 * report it back.
		 */
		if (err != EINTR)
			st_arg->error_code = err;
	}
	/*
	 * Always push the EOS marker, even on error, so the consumer's
	 * dequeue loop is guaranteed to terminate.
	 */
	data = kmem_zalloc(sizeof (*data), KM_SLEEP);
	data->eos_marker = B_TRUE;
	bqueue_enqueue(&st_arg->q, data, 1);
	spl_fstrans_unmark(cookie);
	thread_exit();
}
776 | ||
/*
 * This function actually handles figuring out what kind of record needs to be
 * dumped, reading the data (which has hopefully been prefetched), and calling
 * the appropriate helper function.
 *
 * 'data' is one block record produced by send_cb.  Depending on the block
 * pointer it emits a FREEOBJECTS, FREE, OBJECT/OBJECT_RANGE, SPILL,
 * WRITE_EMBEDDED, or WRITE record via the dump_* helpers.  Returns 0,
 * EINTR (send interrupted), or an I/O error.
 */
static int
do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
{
	dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
	const blkptr_t *bp = &data->bp;
	const zbookmark_phys_t *zb = &data->zb;
	uint8_t indblkshift = data->indblkshift;
	uint16_t dblkszsec = data->datablkszsec;
	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	ASSERT3U(zb->zb_level, >=, 0);

	/* Resumed sends must not see objects before the resume object. */
	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
	    zb->zb_object >= dsa->dsa_resume_object);

	/*
	 * All bps of an encrypted os should have the encryption bit set.
	 * If this is not true it indicates tampering and we report an error.
	 */
	if (dsa->dsa_os->os_encrypted &&
	    !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
		spa_log_error(spa, zb);
		zfs_panic_recover("unencrypted block in encrypted "
		    "object set %llu", ds->ds_object);
		return (SET_ERROR(EIO));
	}

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		/* Special (non-user-visible) objects are never sent. */
		return (0);
	} else if (BP_IS_HOLE(bp) &&
	    zb->zb_object == DMU_META_DNODE_OBJECT) {
		/*
		 * A hole in the meta-dnode means a whole span of object
		 * numbers is free.
		 */
		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
	} else if (BP_IS_HOLE(bp)) {
		/* A hole in a regular object: emit a FREE record. */
		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
		uint64_t offset = zb->zb_blkid * span;
		/* Don't dump free records for offsets > DMU_OBJECT_END */
		if (zb->zb_blkid == 0 || span <= DMU_OBJECT_END / zb->zb_blkid)
			err = dump_free(dsa, zb->zb_object, offset, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		/* Indirect blocks and the objset block carry no stream data. */
		return (0);
	} else if (type == DMU_OT_DNODE) {
		/* A block of packed dnodes: emit one record per dnode. */
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;

		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
			ASSERT(BP_IS_ENCRYPTED(bp));
			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
			zioflags |= ZIO_FLAG_RAW;
		}

		ASSERT0(zb->zb_level);

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		dnode_phys_t *blk = abuf->b_data;
		uint64_t dnobj = zb->zb_blkid * epb;

		/*
		 * Raw sends require sending encryption parameters for the
		 * block of dnodes.  Regular sends do not need to send this
		 * info.
		 */
		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
			ASSERT(arc_is_encrypted(abuf));
			err = dump_object_range(dsa, bp, dnobj, epb);
		}

		if (err == 0) {
			/* Step by dn_extra_slots to skip large-dnode slots. */
			for (int i = 0; i < epb;
			    i += blk[i].dn_extra_slots + 1) {
				err = dump_dnode(dsa, bp, dnobj + i, blk + i);
				if (err != 0)
					break;
			}
		}
		arc_buf_destroy(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		/* Spill (system-attribute) block: emit a SPILL record. */
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;

		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
			ASSERT(BP_IS_PROTECTED(bp));
			zioflags |= ZIO_FLAG_RAW;
		}

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		err = dump_spill(dsa, bp, zb->zb_object, abuf->b_data);
		arc_buf_destroy(abuf, &abuf);
	} else if (backup_do_embed(dsa, bp)) {
		/* it's an embedded level-0 block of a regular object */
		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
		ASSERT0(zb->zb_level);
		err = dump_write_embedded(dsa, zb->zb_object,
		    zb->zb_blkid * blksz, blksz, bp);
	} else {
		/* it's a level-0 block of a regular object */
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
		uint64_t offset;

		/*
		 * If we have large blocks stored on disk but the send flags
		 * don't allow us to send large blocks, we split the data from
		 * the arc buf into chunks.
		 */
		boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE &&
		    !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);

		/*
		 * Raw sends require that we always get raw data as it exists
		 * on disk, so we assert that we are not splitting blocks here.
		 */
		boolean_t request_raw =
		    (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0;

		/*
		 * We should only request compressed data from the ARC if all
		 * the following are true:
		 * - stream compression was requested
		 * - we aren't splitting large blocks into smaller chunks
		 * - the data won't need to be byteswapped before sending
		 * - this isn't an embedded block
		 * - this isn't metadata (if receiving on a different endian
		 *   system it can be byteswapped more easily)
		 */
		boolean_t request_compressed =
		    (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
		    !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
		    !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));

		IMPLY(request_raw, !split_large_blocks);
		IMPLY(request_raw, BP_IS_PROTECTED(bp));
		ASSERT0(zb->zb_level);
		ASSERT(zb->zb_object > dsa->dsa_resume_object ||
		    (zb->zb_object == dsa->dsa_resume_object &&
		    zb->zb_blkid * blksz >= dsa->dsa_resume_offset));

		ASSERT3U(blksz, ==, BP_GET_LSIZE(bp));

		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
		if (request_raw)
			zioflags |= ZIO_FLAG_RAW;
		else if (request_compressed)
			zioflags |= ZIO_FLAG_RAW_COMPRESS;

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) {
			if (zfs_send_corrupt_data) {
				/* Send a block filled with 0x"zfs badd bloc" */
				abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA,
				    blksz);
				uint64_t *ptr;
				for (ptr = abuf->b_data;
				    (char *)ptr < (char *)abuf->b_data + blksz;
				    ptr++)
					*ptr = 0x2f5baddb10cULL;
			} else {
				return (SET_ERROR(EIO));
			}
		}

		offset = zb->zb_blkid * blksz;

		if (split_large_blocks) {
			ASSERT0(arc_is_encrypted(abuf));
			ASSERT3U(arc_get_compression(abuf), ==,
			    ZIO_COMPRESS_OFF);
			char *buf = abuf->b_data;
			/* Emit one WRITE record per SPA_OLD_MAXBLOCKSIZE chunk. */
			while (blksz > 0 && err == 0) {
				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
				err = dump_write(dsa, type, zb->zb_object,
				    offset, n, n, NULL, buf);
				offset += n;
				buf += n;
				blksz -= n;
			}
		} else {
			err = dump_write(dsa, type, zb->zb_object, offset,
			    blksz, arc_buf_size(abuf), bp, abuf->b_data);
		}
		arc_buf_destroy(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}
982 | ||
6f1ffb06 | 983 | /* |
fcff0f35 PD |
984 | * Pop the new data off the queue, and free the old data. |
985 | */ | |
986 | static struct send_block_record * | |
987 | get_next_record(bqueue_t *bq, struct send_block_record *data) | |
988 | { | |
989 | struct send_block_record *tmp = bqueue_dequeue(bq); | |
990 | kmem_free(data, sizeof (*data)); | |
991 | return (tmp); | |
992 | } | |
993 | ||
/*
 * Actually do the bulk of the work in a zfs send.
 *
 * Builds and emits the DRR_BEGIN record (with an nvlist payload for
 * resuming and/or raw sends), spawns send_traverse_thread as the producer,
 * consumes block records via do_dump, and finishes with a DRR_END record.
 *
 * Note: Releases dp using the specified tag.  Takes a long hold on to_ds
 * for the duration of the send and registers dsp on to_ds's sendstream
 * list so the stream can be observed/interrupted.
 */
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
    zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone,
    boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
    boolean_t rawok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
    vnode_t *vp, offset_t *off)
{
	objset_t *os;
	dmu_replay_record_t *drr;
	dmu_sendarg_t *dsp;
	int err;
	uint64_t fromtxg = 0;
	uint64_t featureflags = 0;
	struct send_thread_arg to_arg;
	void *payload = NULL;
	size_t payload_len = 0;
	struct send_block_record *to_data;

	err = dmu_objset_from_ds(to_ds, &os);
	if (err != 0) {
		dsl_pool_rele(dp, tag);
		return (err);
	}

	/*
	 * If this is a non-raw send of an encrypted ds, we can ensure that
	 * the objset_phys_t is authenticated.  This is safe because this is
	 * either a snapshot or we have owned the dataset, ensuring that
	 * it can't be modified.
	 */
	if (!rawok && os->os_encrypted &&
	    arc_is_unauthenticated(os->os_phys_buf)) {
		zbookmark_phys_t zb;

		SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT,
		    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
		err = arc_untransform(os->os_phys_buf, os->os_spa,
		    &zb, B_FALSE);
		if (err != 0) {
			dsl_pool_rele(dp, tag);
			return (err);
		}

		ASSERT0(arc_is_unauthenticated(os->os_phys_buf));
	}

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

	bzero(&to_arg, sizeof (to_arg));

#ifdef _KERNEL
	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
			kmem_free(drr, sizeof (dmu_replay_record_t));
			dsl_pool_rele(dp, tag);
			return (SET_ERROR(EINVAL));
		}
		if (version >= ZPL_VERSION_SA) {
			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
		}
	}
#endif

	/* raw sends imply large_block_ok */
	if ((large_block_ok || rawok) &&
	    dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS))
		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
	if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE))
		featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;

	/* encrypted datasets will not have embedded blocks */
	if ((embedok || rawok) && !os->os_encrypted &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
	}

	/* raw send implies compressok */
	if (compressok || rawok)
		featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;

	if (rawok && os->os_encrypted)
		featureflags |= DMU_BACKUP_FEATURE_RAW;

	if ((featureflags &
	    (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED |
	    DMU_BACKUP_FEATURE_RAW)) != 0 &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
		featureflags |= DMU_BACKUP_FEATURE_LZ4;
	}

	if (resumeobj != 0 || resumeoff != 0) {
		featureflags |= DMU_BACKUP_FEATURE_RESUMING;
	}

	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
	    featureflags);

	drr->drr_u.drr_begin.drr_creation_time =
	    dsl_dataset_phys(to_ds)->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
	if (is_clone)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
	if (zfs_send_set_freerecords_bit)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;

	drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK;

	if (ancestor_zb != NULL) {
		drr->drr_u.drr_begin.drr_fromguid =
		    ancestor_zb->zbm_guid;
		fromtxg = ancestor_zb->zbm_creation_txg;
	}
	dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
	if (!to_ds->ds_is_snapshot) {
		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
		    sizeof (drr->drr_u.drr_begin.drr_toname));
	}

	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);

	dsp->dsa_drr = drr;
	dsp->dsa_vp = vp;
	dsp->dsa_outfd = outfd;
	dsp->dsa_proc = curproc;
	dsp->dsa_os = os;
	dsp->dsa_off = off;
	dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	dsp->dsa_fromtxg = fromtxg;
	dsp->dsa_pending_op = PENDING_NONE;
	dsp->dsa_featureflags = featureflags;
	dsp->dsa_resume_object = resumeobj;
	dsp->dsa_resume_offset = resumeoff;

	mutex_enter(&to_ds->ds_sendstream_lock);
	list_insert_head(&to_ds->ds_sendstreams, dsp);
	mutex_exit(&to_ds->ds_sendstream_lock);

	dsl_dataset_long_hold(to_ds, FTAG);
	dsl_pool_rele(dp, tag);

	/* handle features that require a DRR_BEGIN payload */
	if (featureflags &
	    (DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_RAW)) {
		nvlist_t *keynvl = NULL;
		nvlist_t *nvl = fnvlist_alloc();

		if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
			dmu_object_info_t to_doi;
			err = dmu_object_info(os, resumeobj, &to_doi);
			if (err != 0) {
				fnvlist_free(nvl);
				goto out;
			}

			SET_BOOKMARK(&to_arg.resume, to_ds->ds_object,
			    resumeobj, 0,
			    resumeoff / to_doi.doi_data_block_size);

			fnvlist_add_uint64(nvl, "resume_object", resumeobj);
			fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
		}

		if (featureflags & DMU_BACKUP_FEATURE_RAW) {
			uint64_t ivset_guid = (ancestor_zb != NULL) ?
			    ancestor_zb->zbm_ivset_guid : 0;

			ASSERT(os->os_encrypted);

			err = dsl_crypto_populate_key_nvlist(to_ds,
			    ivset_guid, &keynvl);
			if (err != 0) {
				fnvlist_free(nvl);
				goto out;
			}

			fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl);
		}

		payload = fnvlist_pack(nvl, &payload_len);
		drr->drr_payloadlen = payload_len;
		fnvlist_free(keynvl);
		fnvlist_free(nvl);
	}

	err = dump_record(dsp, payload, payload_len);
	fnvlist_pack_free(payload, payload_len);
	if (err != 0) {
		err = dsp->dsa_err;
		goto out;
	}

	/*
	 * NOTE(review): the return value of bqueue_init() is assigned to
	 * err but not checked before the queue is used — presumably it
	 * cannot fail with these arguments; confirm against bqueue_init().
	 */
	err = bqueue_init(&to_arg.q,
	    MAX(zfs_send_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct send_block_record, ln));
	to_arg.error_code = 0;
	to_arg.cancel = B_FALSE;
	to_arg.ds = to_ds;
	to_arg.fromtxg = fromtxg;
	to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
	if (rawok)
		to_arg.flags |= TRAVERSE_NO_DECRYPT;
	(void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc,
	    TS_RUN, minclsyspri);

	to_data = bqueue_dequeue(&to_arg.q);

	/* Consume records until the producer pushes the EOS marker. */
	while (!to_data->eos_marker && err == 0) {
		err = do_dump(dsp, to_data);
		to_data = get_next_record(&to_arg.q, to_data);
		if (issig(JUSTLOOKING) && issig(FORREAL))
			err = EINTR;
	}

	/*
	 * On error, cancel the producer and drain the queue so every
	 * queued record is freed before the queue is destroyed.
	 */
	if (err != 0) {
		to_arg.cancel = B_TRUE;
		while (!to_data->eos_marker) {
			to_data = get_next_record(&to_arg.q, to_data);
		}
	}
	kmem_free(to_data, sizeof (*to_data));

	bqueue_destroy(&to_arg.q);

	if (err == 0 && to_arg.error_code != 0)
		err = to_arg.error_code;

	if (err != 0)
		goto out;

	/* Flush any record still being batched before emitting DRR_END. */
	if (dsp->dsa_pending_op != PENDING_NONE)
		if (dump_record(dsp, NULL, 0) != 0)
			err = SET_ERROR(EINTR);

	if (err != 0) {
		if (err == EINTR && dsp->dsa_err != 0)
			err = dsp->dsa_err;
		goto out;
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;

	if (dump_record(dsp, NULL, 0) != 0)
		err = dsp->dsa_err;
out:
	mutex_enter(&to_ds->ds_sendstream_lock);
	list_remove(&to_ds->ds_sendstreams, dsp);
	mutex_exit(&to_ds->ds_sendstream_lock);

	VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end));

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dsp, sizeof (dmu_sendarg_t));

	dsl_dataset_long_rele(to_ds, FTAG);

	return (err);
}
1267 | ||
/*
 * Send a snapshot identified by object number, optionally incremental from
 * 'fromsnap'.  Validates that fromsnap is an ancestor of tosnap, builds the
 * ancestor bookmark (including the IVset guid if the from-snapshot is
 * zapified), and hands off to dmu_send_impl, which releases 'dp'.
 */
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
    boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
    boolean_t rawok, int outfd, vnode_t *vp, offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	dsl_dataset_t *fromds = NULL;
	/* Raw sends read ciphertext, so no key is needed to hold the ds. */
	ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
	int err;

	err = dsl_pool_hold(pool, FTAG, &dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold_obj_flags(dp, tosnap, dsflags, FTAG, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != 0) {
		zfs_bookmark_phys_t zb = { 0 };
		boolean_t is_clone;

		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		/* fromsnap must be an earlier snapshot in tosnap's history. */
		if (!dsl_dataset_is_before(ds, fromds, 0)) {
			err = SET_ERROR(EXDEV);
			dsl_dataset_rele(fromds, FTAG);
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}

		zb.zbm_creation_time =
		    dsl_dataset_phys(fromds)->ds_creation_time;
		zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
		zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;

		/* Best-effort: fetch the IVset guid for raw incrementals. */
		if (dsl_dataset_is_zapified(fromds)) {
			(void) zap_lookup(dp->dp_meta_objset,
			    fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1,
			    &zb.zbm_ivset_guid);
		}

		/* Different dsl_dir means the source is a clone origin. */
		is_clone = (fromds->ds_dir != ds->ds_dir);
		dsl_dataset_rele(fromds, FTAG);
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
		    embedok, large_block_ok, compressok, rawok, outfd,
		    0, 0, vp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
		    embedok, large_block_ok, compressok, rawok, outfd,
		    0, 0, vp, off);
	}
	dsl_dataset_rele_flags(ds, dsflags, FTAG);
	return (err);
}
1331 | ||
/*
 * Send a dataset identified by name.  'fromsnap', if non-NULL, may be a
 * snapshot ("fs@snap") or a bookmark ("fs#bm").  When sending a live
 * filesystem/volume (no '@' in tosnap) on a writable pool, the dataset is
 * owned for the duration so it cannot change underneath the send.
 * Supports resuming via resumeobj/resumeoff.
 */
int
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
    boolean_t large_block_ok, boolean_t compressok, boolean_t rawok,
    int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp,
    offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int err;
	/* Raw sends read ciphertext, so no key is needed to hold the ds. */
	ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
	boolean_t owned = B_FALSE;

	/* The incremental source must name a snapshot or a bookmark. */
	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
		return (SET_ERROR(EINVAL));

	err = dsl_pool_hold(tosnap, FTAG, &dp);
	if (err != 0)
		return (err);
	if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
		/*
		 * We are sending a filesystem or volume.  Ensure
		 * that it doesn't change by owning the dataset.
		 */
		err = dsl_dataset_own(dp, tosnap, dsflags, FTAG, &ds);
		owned = B_TRUE;
	} else {
		err = dsl_dataset_hold_flags(dp, tosnap, dsflags, FTAG, &ds);
	}
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != NULL) {
		zfs_bookmark_phys_t zb = { 0 };
		boolean_t is_clone = B_FALSE;
		int fsnamelen = strchr(tosnap, '@') - tosnap;

		/*
		 * If the fromsnap is in a different filesystem, then
		 * mark the send stream as a clone.
		 */
		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
		    (fromsnap[fsnamelen] != '@' &&
		    fromsnap[fsnamelen] != '#')) {
			is_clone = B_TRUE;
		}

		if (strchr(fromsnap, '@')) {
			/* Snapshot source: read its bookmark fields directly. */
			dsl_dataset_t *fromds;
			err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
			if (err == 0) {
				if (!dsl_dataset_is_before(ds, fromds, 0))
					err = SET_ERROR(EXDEV);
				zb.zbm_creation_time =
				    dsl_dataset_phys(fromds)->ds_creation_time;
				zb.zbm_creation_txg =
				    dsl_dataset_phys(fromds)->ds_creation_txg;
				zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
				is_clone = (ds->ds_dir != fromds->ds_dir);

				/* Best-effort IVset guid for raw sends. */
				if (dsl_dataset_is_zapified(fromds)) {
					(void) zap_lookup(dp->dp_meta_objset,
					    fromds->ds_object,
					    DS_FIELD_IVSET_GUID, 8, 1,
					    &zb.zbm_ivset_guid);
				}
				dsl_dataset_rele(fromds, FTAG);
			}
		} else {
			/* Bookmark source. */
			err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
		}
		if (err != 0) {
			if (owned)
				dsl_dataset_disown(ds, dsflags, FTAG);
			else
				dsl_dataset_rele_flags(ds, dsflags, FTAG);

			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
		    embedok, large_block_ok, compressok, rawok,
		    outfd, resumeobj, resumeoff, vp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
		    embedok, large_block_ok, compressok, rawok,
		    outfd, resumeobj, resumeoff, vp, off);
	}
	if (owned)
		dsl_dataset_disown(ds, dsflags, FTAG);
	else
		dsl_dataset_rele_flags(ds, dsflags, FTAG);

	return (err);
}
1428 | ||
/*
 * Convert raw changed-byte counts into an estimated send-stream size and
 * store it in *sizep.  Starts from either the compressed or uncompressed
 * data size (per stream_compressed), subtracts estimated indirect-block
 * overhead, and adds per-block record headers.
 */
static int
dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
    uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
{
	int err = 0;
	uint64_t size;
	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data.  We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
	 */

	uint64_t recordsize;
	uint64_t record_count;
	objset_t *os;
	VERIFY0(dmu_objset_from_ds(ds, &os));

	/* Assume all (uncompressed) blocks are recordsize. */
	if (zfs_override_estimate_recordsize != 0) {
		/* Tunable override for the assumed block size. */
		recordsize = zfs_override_estimate_recordsize;
	} else if (os->os_phys->os_type == DMU_OST_ZVOL) {
		err = dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
	} else {
		err = dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize);
	}
	if (err != 0)
		return (err);
	record_count = uncompressed / recordsize;

	/*
	 * If we're estimating a send size for a compressed stream, use the
	 * compressed data size to estimate the stream size.  Otherwise, use the
	 * uncompressed data size.
	 */
	size = stream_compressed ? compressed : uncompressed;

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume no ditto blocks or internal fragmentation.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block.
	 */
	size -= record_count * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += record_count * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}
1484 | ||
/*
 * Estimate the size of the stream that would be produced by sending 'ds'
 * (which must be a snapshot), optionally incremental from snapshot
 * 'fromds'.  The result is written to *sizep.  Caller must hold the pool
 * config lock.
 */
int
dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds,
    boolean_t stream_compressed, uint64_t *sizep)
{
	int err;
	uint64_t uncomp, comp;

	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	/* tosnap must be a snapshot */
	if (!ds->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/* fromsnap, if provided, must be a snapshot */
	if (fromds != NULL && !fromds->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/*
	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
	 * or the origin's fs.
	 */
	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
		return (SET_ERROR(EXDEV));

	/* Get compressed and uncompressed size estimates of changed data. */
	if (fromds == NULL) {
		/* Full send: the whole dataset's byte counts. */
		uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
		comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
	} else {
		/* Incremental: only the bytes written between the snapshots. */
		uint64_t used;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &uncomp);
		if (err != 0)
			return (err);
	}

	err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
	    stream_compressed, sizep);
	/*
	 * Add the size of the BEGIN and END records to the estimate.
	 *
	 * NOTE(review): this adjustment is applied even when the call above
	 * returned an error, in which case *sizep may not have been set by
	 * it — confirm callers ignore *sizep on error.
	 */
	*sizep += 2 * sizeof (dmu_replay_record_t);
	return (err);
}
330d06f9 | 1529 | |
/*
 * Accumulator passed (as the void *arg) to dmu_calculate_send_traversal()
 * while walking a dataset's blocks to estimate send-stream size.
 */
struct calculate_send_arg {
	uint64_t uncompressed;	/* running total of BP_GET_UCSIZE() */
	uint64_t compressed;	/* running total of BP_GET_PSIZE() */
};
1534 | ||
5dc8b736 MG |
1535 | /* |
1536 | * Simple callback used to traverse the blocks of a snapshot and sum their | |
2aa34383 | 1537 | * uncompressed and compressed sizes. |
5dc8b736 MG |
1538 | */ |
1539 | /* ARGSUSED */ | |
1540 | static int | |
1541 | dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, | |
1542 | const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) | |
1543 | { | |
2aa34383 | 1544 | struct calculate_send_arg *space = arg; |
5dc8b736 | 1545 | if (bp != NULL && !BP_IS_HOLE(bp)) { |
2aa34383 DK |
1546 | space->uncompressed += BP_GET_UCSIZE(bp); |
1547 | space->compressed += BP_GET_PSIZE(bp); | |
5dc8b736 MG |
1548 | } |
1549 | return (0); | |
1550 | } | |
1551 | ||
1552 | /* | |
1553 | * Given a desination snapshot and a TXG, calculate the approximate size of a | |
1554 | * send stream sent from that TXG. from_txg may be zero, indicating that the | |
1555 | * whole snapshot will be sent. | |
1556 | */ | |
1557 | int | |
1558 | dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, | |
2aa34383 | 1559 | boolean_t stream_compressed, uint64_t *sizep) |
5dc8b736 | 1560 | { |
5dc8b736 | 1561 | int err; |
2aa34383 | 1562 | struct calculate_send_arg size = { 0 }; |
5dc8b736 | 1563 | |
fd0fd646 | 1564 | ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); |
5dc8b736 MG |
1565 | |
1566 | /* tosnap must be a snapshot */ | |
1567 | if (!dsl_dataset_is_snapshot(ds)) | |
1568 | return (SET_ERROR(EINVAL)); | |
1569 | ||
1570 | /* verify that from_txg is before the provided snapshot was taken */ | |
1571 | if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) { | |
1572 | return (SET_ERROR(EXDEV)); | |
1573 | } | |
330d06f9 | 1574 | /* |
5dc8b736 MG |
1575 | * traverse the blocks of the snapshot with birth times after |
1576 | * from_txg, summing their uncompressed size | |
330d06f9 | 1577 | */ |
b5256303 TC |
1578 | err = traverse_dataset(ds, from_txg, |
1579 | TRAVERSE_POST | TRAVERSE_NO_DECRYPT, | |
5dc8b736 | 1580 | dmu_calculate_send_traversal, &size); |
2aa34383 | 1581 | |
5dc8b736 | 1582 | if (err) |
330d06f9 | 1583 | return (err); |
330d06f9 | 1584 | |
2aa34383 DK |
1585 | err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed, |
1586 | size.compressed, stream_compressed, sizep); | |
5dc8b736 | 1587 | return (err); |
330d06f9 MA |
1588 | } |
1589 | ||
47dfff3b | 1590 | |
03916905 PD |
/*
 * Linux kernel module parameters (tunable at module load time and via
 * /sys/module/zfs/parameters/*; mode 0644 = root-writable, world-readable).
 */
#if defined(_KERNEL)
/* BEGIN CSTYLED */
module_param(zfs_override_estimate_recordsize, ulong, 0644);
MODULE_PARM_DESC(zfs_override_estimate_recordsize,
	"Record size calculation override for zfs send estimates");
/* END CSTYLED */

module_param(zfs_send_corrupt_data, int, 0644);
MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data");

module_param(zfs_send_queue_length, int, 0644);
MODULE_PARM_DESC(zfs_send_queue_length, "Maximum send queue length");

module_param(zfs_send_unmodified_spill_blocks, int, 0644);
MODULE_PARM_DESC(zfs_send_unmodified_spill_blocks,
	"Send unmodified spill blocks");
#endif