]> git.proxmox.com Git - mirror_zfs.git/blob - cmd/zstream/zstream_redup.c
379025ce59e5a18eb2117b96ec2bef8f350ccb1b
[mirror_zfs.git] / cmd / zstream / zstream_redup.c
1 /*
2 * CDDL HEADER START
3 *
4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version
7 * 1.0 of the CDDL.
8 *
9 * A full copy of the text of the CDDL should have accompanied this
10 * source. A copy of the CDDL is also available via the Internet at
11 * http://www.illumos.org/license/CDDL.
12 *
13 * CDDL HEADER END
14 */
15
16 /*
17 * Copyright (c) 2020 by Delphix. All rights reserved.
18 */
19
20 #include <assert.h>
21 #include <cityhash.h>
22 #include <ctype.h>
23 #include <errno.h>
24 #include <fcntl.h>
25 #include <libzfs_impl.h>
26 #include <libzfs.h>
27 #include <libzutil.h>
28 #include <stddef.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <strings.h>
32 #include <umem.h>
33 #include <unistd.h>
34 #include <sys/debug.h>
35 #include <sys/stat.h>
36 #include <sys/zfs_ioctl.h>
37 #include <sys/zio_checksum.h>
38 #include "zfs_fletcher.h"
39 #include "zstream.h"
40
41
/*
 * Sizing budget for the redup hash table.  The budget (in bytes) divided by
 * sizeof (redup_entry_t) gives the number of hash buckets.
 */

/* Share of physical memory, in percent, used as the budget. */
#define	MAX_RDT_PHYSMEM_PERCENT		20

/*
 * Lower limit for the budget, in megabytes (shifted left by 20 at the use
 * site).  On _ILP32 builds this value is used as the budget directly.
 */
#define	SMALLEST_POSSIBLE_MAX_RDT_MB	128
44
/*
 * One entry of the redup table.  An entry records where in the input
 * stream the WRITE record for (guid, object, offset) begins, so that a
 * later WRITE_BYREF record can be expanded by re-reading it.
 */
typedef struct redup_entry {
	/* next entry on the same hash chain, or NULL */
	struct redup_entry	*rde_next;
	/* lookup key: drr_toguid of the recorded WRITE */
	uint64_t		rde_guid;
	/* lookup key: drr_object of the recorded WRITE */
	uint64_t		rde_object;
	/* lookup key: drr_offset of the recorded WRITE */
	uint64_t		rde_offset;
	/* byte offset of the WRITE record header in the input stream */
	uint64_t		rde_stream_offset;
} redup_entry_t;
52
53 typedef struct redup_table {
54 redup_entry_t **redup_hash_array;
55 umem_cache_t *ddecache;
56 uint64_t ddt_count;
57 int numhashbits;
58 } redup_table_t;
59
/*
 * Return the 1-based position of the most significant set bit of i
 * (1 for the lowest bit, 64 for the highest), or 0 when i is zero.
 */
int
highbit64(uint64_t i)
{
	/*
	 * __builtin_clzll() is undefined for a zero argument, so zero is
	 * answered without calling it.
	 */
	return (i != 0 ?
	    NBBY * sizeof (uint64_t) - __builtin_clzll(i) : 0);
}
68
/*
 * Allocate n zero-filled bytes.  If the allocation fails, print an error
 * to stderr and exit(1); the function therefore never returns NULL.
 */
static void *
safe_calloc(size_t n)
{
	void *rv = calloc(1, n);
	if (rv == NULL) {
		/* %zu matches size_t, so large sizes are reported intact. */
		(void) fprintf(stderr,
		    "Error: could not allocate %zu bytes of memory\n", n);
		exit(1);
	}
	return (rv);
}
81
/*
 * fread() wrapper that reads exactly one item of "size" bytes from fp
 * into buf.  Returns the fread() item count (1 on success, 0 if the item
 * could not be read completely).  A stream error is fatal: a message is
 * printed to stderr and the process exits with status 1.  A plain
 * end-of-file is not treated as an error here; callers detect it with
 * feof().
 */
static int
sfread(void *buf, size_t size, FILE *fp)
{
	int nitems = fread(buf, size, 1, fp);

	if (nitems != 0 || !ferror(fp))
		return (nitems);

	(void) fprintf(stderr, "Error while reading file: %s\n",
	    strerror(errno));
	exit(1);
}
96
/*
 * pread() wrapper that must read exactly "count" bytes from fd at the
 * given offset into buf.  Both a failed pread() and a short read are
 * fatal: a message is printed to stderr and the process exits with
 * status 1.
 */
static void
spread(int fd, void *buf, size_t count, off_t offset)
{
	ssize_t nread = pread(fd, buf, count, offset);
	const char *reason;

	if (nread == -1)
		reason = strerror(errno);
	else if (nread != count)
		reason = "short read";
	else
		return;

	(void) fprintf(stderr, "Error while reading file: %s\n", reason);
	exit(1);
}
115
116 static int
117 dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
118 zio_cksum_t *zc, int outfd)
119 {
120 assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum)
121 == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
122 fletcher_4_incremental_native(drr,
123 offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
124 if (drr->drr_type != DRR_BEGIN) {
125 assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
126 drr_checksum.drr_checksum));
127 drr->drr_u.drr_checksum.drr_checksum = *zc;
128 }
129 fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
130 sizeof (zio_cksum_t), zc);
131 if (write(outfd, drr, sizeof (*drr)) == -1)
132 return (errno);
133 if (payload_len != 0) {
134 fletcher_4_incremental_native(payload, payload_len, zc);
135 if (write(outfd, payload, payload_len) == -1)
136 return (errno);
137 }
138 return (0);
139 }
140
141 static void
142 rdt_insert(redup_table_t *rdt,
143 uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset)
144 {
145 uint64_t ch = cityhash4(guid, object, offset, 0);
146 uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
147 redup_entry_t **rdepp;
148
149 rdepp = &(rdt->redup_hash_array[hashcode]);
150 redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL);
151 rde->rde_next = *rdepp;
152 rde->rde_guid = guid;
153 rde->rde_object = object;
154 rde->rde_offset = offset;
155 rde->rde_stream_offset = stream_offset;
156 *rdepp = rde;
157 rdt->ddt_count++;
158 }
159
160 static void
161 rdt_lookup(redup_table_t *rdt,
162 uint64_t guid, uint64_t object, uint64_t offset,
163 uint64_t *stream_offsetp)
164 {
165 uint64_t ch = cityhash4(guid, object, offset, 0);
166 uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
167
168 for (redup_entry_t *rde = rdt->redup_hash_array[hashcode];
169 rde != NULL; rde = rde->rde_next) {
170 if (rde->rde_guid == guid &&
171 rde->rde_object == object &&
172 rde->rde_offset == offset) {
173 *stream_offsetp = rde->rde_stream_offset;
174 return;
175 }
176 }
177 assert(!"could not find expected redup table entry");
178 }
179
180 /*
181 * Convert a dedup stream (generated by "zfs send -D") to a
182 * non-deduplicated stream. The entire infd will be converted, including
183 * any substreams in a stream package (generated by "zfs send -RD"). The
184 * infd must be seekable.
185 */
186 static void
187 zfs_redup_stream(int infd, int outfd, boolean_t verbose)
188 {
189 int bufsz = SPA_MAXBLOCKSIZE;
190 dmu_replay_record_t thedrr = { 0 };
191 dmu_replay_record_t *drr = &thedrr;
192 redup_table_t rdt;
193 zio_cksum_t stream_cksum;
194 uint64_t numbuckets;
195 uint64_t num_records = 0;
196 uint64_t num_write_byref_records = 0;
197
198 #ifdef _ILP32
199 uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
200 #else
201 uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
202 uint64_t max_rde_size =
203 MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100,
204 SMALLEST_POSSIBLE_MAX_RDT_MB << 20);
205 #endif
206
207 numbuckets = max_rde_size / (sizeof (redup_entry_t));
208
209 /*
210 * numbuckets must be a power of 2. Increase number to
211 * a power of 2 if necessary.
212 */
213 if (!ISP2(numbuckets))
214 numbuckets = 1ULL << highbit64(numbuckets);
215
216 rdt.redup_hash_array =
217 safe_calloc(numbuckets * sizeof (redup_entry_t *));
218 rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0,
219 NULL, NULL, NULL, NULL, NULL, 0);
220 rdt.numhashbits = highbit64(numbuckets) - 1;
221 rdt.ddt_count = 0;
222
223 char *buf = safe_calloc(bufsz);
224 FILE *ofp = fdopen(infd, "r");
225 long offset = ftell(ofp);
226 while (sfread(drr, sizeof (*drr), ofp) != 0) {
227 num_records++;
228
229 /*
230 * We need to regenerate the checksum.
231 */
232 if (drr->drr_type != DRR_BEGIN) {
233 bzero(&drr->drr_u.drr_checksum.drr_checksum,
234 sizeof (drr->drr_u.drr_checksum.drr_checksum));
235 }
236
237 uint64_t payload_size = 0;
238 switch (drr->drr_type) {
239 case DRR_BEGIN:
240 {
241 struct drr_begin *drrb = &drr->drr_u.drr_begin;
242 int fflags;
243 ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
244
245 assert(drrb->drr_magic == DMU_BACKUP_MAGIC);
246
247 /* clear the DEDUP feature flag for this stream */
248 fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
249 fflags &= ~(DMU_BACKUP_FEATURE_DEDUP |
250 DMU_BACKUP_FEATURE_DEDUPPROPS);
251 DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
252
253 int sz = drr->drr_payloadlen;
254 if (sz != 0) {
255 if (sz > bufsz) {
256 free(buf);
257 buf = safe_calloc(sz);
258 bufsz = sz;
259 }
260 (void) sfread(buf, sz, ofp);
261 }
262 payload_size = sz;
263 break;
264 }
265
266 case DRR_END:
267 {
268 struct drr_end *drre = &drr->drr_u.drr_end;
269 /*
270 * Use the recalculated checksum, unless this is
271 * the END record of a stream package, which has
272 * no checksum.
273 */
274 if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
275 drre->drr_checksum = stream_cksum;
276 break;
277 }
278
279 case DRR_OBJECT:
280 {
281 struct drr_object *drro = &drr->drr_u.drr_object;
282
283 if (drro->drr_bonuslen > 0) {
284 payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
285 (void) sfread(buf, payload_size, ofp);
286 }
287 break;
288 }
289
290 case DRR_SPILL:
291 {
292 struct drr_spill *drrs = &drr->drr_u.drr_spill;
293 payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
294 (void) sfread(buf, payload_size, ofp);
295 break;
296 }
297
298 case DRR_WRITE_BYREF:
299 {
300 struct drr_write_byref drrwb =
301 drr->drr_u.drr_write_byref;
302
303 num_write_byref_records++;
304
305 /*
306 * Look up in hash table by drrwb->drr_refguid,
307 * drr_refobject, drr_refoffset. Replace this
308 * record with the found WRITE record, but with
309 * drr_object,drr_offset,drr_toguid replaced with ours.
310 */
311 uint64_t stream_offset = 0;
312 rdt_lookup(&rdt, drrwb.drr_refguid,
313 drrwb.drr_refobject, drrwb.drr_refoffset,
314 &stream_offset);
315
316 spread(infd, drr, sizeof (*drr), stream_offset);
317
318 assert(drr->drr_type == DRR_WRITE);
319 struct drr_write *drrw = &drr->drr_u.drr_write;
320 assert(drrw->drr_toguid == drrwb.drr_refguid);
321 assert(drrw->drr_object == drrwb.drr_refobject);
322 assert(drrw->drr_offset == drrwb.drr_refoffset);
323
324 payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
325 spread(infd, buf, payload_size,
326 stream_offset + sizeof (*drr));
327
328 drrw->drr_toguid = drrwb.drr_toguid;
329 drrw->drr_object = drrwb.drr_object;
330 drrw->drr_offset = drrwb.drr_offset;
331 break;
332 }
333
334 case DRR_WRITE:
335 {
336 struct drr_write *drrw = &drr->drr_u.drr_write;
337 payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
338 (void) sfread(buf, payload_size, ofp);
339
340 rdt_insert(&rdt, drrw->drr_toguid,
341 drrw->drr_object, drrw->drr_offset, offset);
342 break;
343 }
344
345 case DRR_WRITE_EMBEDDED:
346 {
347 struct drr_write_embedded *drrwe =
348 &drr->drr_u.drr_write_embedded;
349 payload_size =
350 P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
351 (void) sfread(buf, payload_size, ofp);
352 break;
353 }
354
355 case DRR_FREEOBJECTS:
356 case DRR_FREE:
357 case DRR_OBJECT_RANGE:
358 break;
359
360 default:
361 (void) fprintf(stderr, "INVALID record type 0x%x\n",
362 drr->drr_type);
363 /* should never happen, so assert */
364 assert(B_FALSE);
365 }
366
367 if (feof(ofp)) {
368 fprintf(stderr, "Error: unexpected end-of-file\n");
369 exit(1);
370 }
371 if (ferror(ofp)) {
372 fprintf(stderr, "Error while reading file: %s\n",
373 strerror(errno));
374 exit(1);
375 }
376
377 /*
378 * We need to recalculate the checksum, and it needs to be
379 * initially zero to do that. BEGIN records don't have
380 * a checksum.
381 */
382 if (drr->drr_type != DRR_BEGIN) {
383 bzero(&drr->drr_u.drr_checksum.drr_checksum,
384 sizeof (drr->drr_u.drr_checksum.drr_checksum));
385 }
386 if (dump_record(drr, buf, payload_size,
387 &stream_cksum, outfd) != 0)
388 break;
389 if (drr->drr_type == DRR_END) {
390 /*
391 * Typically the END record is either the last
392 * thing in the stream, or it is followed
393 * by a BEGIN record (which also zeros the checksum).
394 * However, a stream package ends with two END
395 * records. The last END record's checksum starts
396 * from zero.
397 */
398 ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
399 }
400 offset = ftell(ofp);
401 }
402
403 if (verbose) {
404 char mem_str[16];
405 zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t),
406 mem_str, sizeof (mem_str));
407 fprintf(stderr, "converted stream with %llu total records, "
408 "including %llu dedup records, using %sB memory.\n",
409 (long long)num_records,
410 (long long)num_write_byref_records,
411 mem_str);
412 }
413
414 umem_cache_destroy(rdt.ddecache);
415 free(rdt.redup_hash_array);
416 free(buf);
417 (void) fclose(ofp);
418 }
419
420 int
421 zstream_do_redup(int argc, char *argv[])
422 {
423 boolean_t verbose = B_FALSE;
424 char c;
425
426 while ((c = getopt(argc, argv, "v")) != -1) {
427 switch (c) {
428 case 'v':
429 verbose = B_TRUE;
430 break;
431 case '?':
432 (void) fprintf(stderr, "invalid option '%c'\n",
433 optopt);
434 zstream_usage();
435 break;
436 }
437 }
438
439 argc -= optind;
440 argv += optind;
441
442 if (argc != 1)
443 zstream_usage();
444
445 const char *filename = argv[0];
446
447 if (isatty(STDOUT_FILENO)) {
448 (void) fprintf(stderr,
449 "Error: Stream can not be written to a terminal.\n"
450 "You must redirect standard output.\n");
451 return (1);
452 }
453
454 int fd = open(filename, O_RDONLY);
455 if (fd == -1) {
456 (void) fprintf(stderr,
457 "Error while opening file '%s': %s\n",
458 filename, strerror(errno));
459 exit(1);
460 }
461
462 fletcher_4_init();
463 zfs_redup_stream(fd, STDOUT_FILENO, verbose);
464 fletcher_4_fini();
465
466 close(fd);
467
468 return (0);
469 }