]> git.proxmox.com Git - mirror_qemu.git/blame - block/vhdx-log.c
block: vhdx - log parsing, replay, and flush support
[mirror_qemu.git] / block / vhdx-log.c
CommitLineData
0a43a1b5
JC
1/*
2 * Block driver for Hyper-V VHDX Images
3 *
4 * Copyright (c) 2013 Red Hat, Inc.,
5 *
6 * Authors:
7 * Jeff Cody <jcody@redhat.com>
8 *
9 * This is based on the "VHDX Format Specification v1.00", published 8/25/2012
10 * by Microsoft:
11 * https://www.microsoft.com/en-us/download/details.aspx?id=34750
12 *
13 * This file covers the functionality of the metadata log writing, parsing, and
14 * replay.
15 *
16 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
17 * See the COPYING.LIB file in the top-level directory.
18 *
19 */
20#include "qemu-common.h"
21#include "block/block_int.h"
22#include "qemu/module.h"
23#include "block/vhdx.h"
24
25
26typedef struct VHDXLogSequence {
27 bool valid;
28 uint32_t count;
29 VHDXLogEntries log;
30 VHDXLogEntryHeader hdr;
31} VHDXLogSequence;
32
33typedef struct VHDXLogDescEntries {
34 VHDXLogEntryHeader hdr;
35 VHDXLogDescriptor desc[];
36} VHDXLogDescEntries;
37
38static const MSGUID zero_guid = { 0 };
39
/* The log located on the disk is a circular buffer containing
 * sectors of 4096 bytes each.
 *
 * The read/write functions below assume that the circular buffer
 * scheme keeps one sector open (unused) to indicate that the buffer
 * is full.  Given the validation methods used for each sector, this
 * should be compatible with other schemes that do not waste a sector.
 */
49
50
51/* Allow peeking at the hdr entry at the beginning of the current
52 * read index, without advancing the read index */
53static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log,
54 VHDXLogEntryHeader *hdr)
55{
56 int ret = 0;
57 uint64_t offset;
58 uint32_t read;
59
60 assert(hdr != NULL);
61
62 /* peek is only supported on sector boundaries */
63 if (log->read % VHDX_LOG_SECTOR_SIZE) {
64 ret = -EFAULT;
65 goto exit;
66 }
67
68 read = log->read;
69 /* we are guaranteed that a) log sectors are 4096 bytes,
70 * and b) the log length is a multiple of 1MB. So, there
71 * is always a round number of sectors in the buffer */
72 if ((read + sizeof(VHDXLogEntryHeader)) > log->length) {
73 read = 0;
74 }
75
76 if (read == log->write) {
77 ret = -EINVAL;
78 goto exit;
79 }
80
81 offset = log->offset + read;
82
83 ret = bdrv_pread(bs->file, offset, hdr, sizeof(VHDXLogEntryHeader));
84 if (ret < 0) {
85 goto exit;
86 }
87
88exit:
89 return ret;
90}
91
92/* Index increment for log, based on sector boundaries */
93static int vhdx_log_inc_idx(uint32_t idx, uint64_t length)
94{
95 idx += VHDX_LOG_SECTOR_SIZE;
96 /* we are guaranteed that a) log sectors are 4096 bytes,
97 * and b) the log length is a multiple of 1MB. So, there
98 * is always a round number of sectors in the buffer */
99 return idx >= length ? 0 : idx;
100}
101
102
103/* Reset the log to empty */
104static void vhdx_log_reset(BlockDriverState *bs, BDRVVHDXState *s)
105{
106 MSGUID guid = { 0 };
107 s->log.read = s->log.write = 0;
108 /* a log guid of 0 indicates an empty log to any parser of v0
109 * VHDX logs */
110 vhdx_update_headers(bs, s, false, &guid);
111}
112
113/* Reads num_sectors from the log (all log sectors are 4096 bytes),
114 * into buffer 'buffer'. Upon return, *sectors_read will contain
115 * the number of sectors successfully read.
116 *
117 * It is assumed that 'buffer' is already allocated, and of sufficient
118 * size (i.e. >= 4096*num_sectors).
119 *
120 * If 'peek' is true, then the tail (read) pointer for the circular buffer is
121 * not modified.
122 *
123 * 0 is returned on success, -errno otherwise. */
124static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log,
125 uint32_t *sectors_read, void *buffer,
126 uint32_t num_sectors, bool peek)
127{
128 int ret = 0;
129 uint64_t offset;
130 uint32_t read;
131
132 read = log->read;
133
134 *sectors_read = 0;
135 while (num_sectors) {
136 if (read == log->write) {
137 /* empty */
138 break;
139 }
140 offset = log->offset + read;
141
142 ret = bdrv_pread(bs->file, offset, buffer, VHDX_LOG_SECTOR_SIZE);
143 if (ret < 0) {
144 goto exit;
145 }
146 read = vhdx_log_inc_idx(read, log->length);
147
148 *sectors_read = *sectors_read + 1;
149 num_sectors--;
150 }
151
152exit:
153 if (!peek) {
154 log->read = read;
155 }
156 return ret;
157}
158
159/* Validates a log entry header */
160static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr,
161 BDRVVHDXState *s)
162{
163 int valid = false;
164
165 if (memcmp(&hdr->signature, "loge", 4)) {
166 goto exit;
167 }
168
169 /* if the individual entry length is larger than the whole log
170 * buffer, that is obviously invalid */
171 if (log->length < hdr->entry_length) {
172 goto exit;
173 }
174
175 /* length of entire entry must be in units of 4KB (log sector size) */
176 if (hdr->entry_length % (VHDX_LOG_SECTOR_SIZE)) {
177 goto exit;
178 }
179
180 /* per spec, sequence # must be > 0 */
181 if (hdr->sequence_number == 0) {
182 goto exit;
183 }
184
185 /* log entries are only valid if they match the file-wide log guid
186 * found in the active header */
187 if (!guid_eq(hdr->log_guid, s->headers[s->curr_header]->log_guid)) {
188 goto exit;
189 }
190
191 if (hdr->descriptor_count * sizeof(VHDXLogDescriptor) > hdr->entry_length) {
192 goto exit;
193 }
194
195 valid = true;
196
197exit:
198 return valid;
199}
200
201/*
202 * Given a log header, this will validate that the descriptors and the
203 * corresponding data sectors (if applicable)
204 *
205 * Validation consists of:
206 * 1. Making sure the sequence numbers matches the entry header
207 * 2. Verifying a valid signature ('zero' or 'desc' for descriptors)
208 * 3. File offset field is a multiple of 4KB
209 * 4. If a data descriptor, the corresponding data sector
210 * has its signature ('data') and matching sequence number
211 *
212 * @desc: the data buffer containing the descriptor
213 * @hdr: the log entry header
214 *
215 * Returns true if valid
216 */
217static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc,
218 VHDXLogEntryHeader *hdr)
219{
220 bool ret = false;
221
222 if (desc->sequence_number != hdr->sequence_number) {
223 goto exit;
224 }
225 if (desc->file_offset % VHDX_LOG_SECTOR_SIZE) {
226 goto exit;
227 }
228
229 if (!memcmp(&desc->signature, "zero", 4)) {
230 if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) {
231 /* valid */
232 ret = true;
233 }
234 } else if (!memcmp(&desc->signature, "desc", 4)) {
235 /* valid */
236 ret = true;
237 }
238
239exit:
240 return ret;
241}
242
243
/* Prior to sector data for a log entry, there is the header
 * and the descriptors referenced in the header:
 *
 * [] = 4KB sector
 *
 * [ hdr, desc ][   desc   ][ ... ][ data ][ ... ]
 *
 * The first sector in a log entry has a 64 byte header, and
 * up to 126 32-byte descriptors.  If more descriptors than
 * 126 are required, then subsequent sectors can have up to 128
 * descriptors.  Each sector is 4KB.  Data follows the descriptor
 * sectors.
 *
 * This will return the number of sectors needed to encompass
 * the passed number of descriptors in desc_cnt.
 *
 * This will never return 0, even if desc_cnt is 0.  The computation is
 * done in 64 bits so that this also holds for desc_cnt values close to
 * UINT32_MAX, where a 32-bit sum would wrap around.
 */
static int vhdx_compute_desc_sectors(uint32_t desc_cnt)
{
    enum {
        /* a 4KB sector holds 128 descriptor-sized (32 byte) slots */
        SLOTS_PER_SECTOR = 128,
        /* the 64 byte header takes the place of two descriptors */
        HDR_SLOTS = 2
    };
    uint64_t slots = (uint64_t)desc_cnt + HDR_SLOTS;

    /* round up to whole sectors */
    return (int)((slots + SLOTS_PER_SECTOR - 1) / SLOTS_PER_SECTOR);
}
274
275
276/* Reads the log header, and subsequent descriptors (if any). This
277 * will allocate all the space for buffer, which must be NULL when
278 * passed into this function. Each descriptor will also be validated,
279 * and error returned if any are invalid. */
280static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s,
281 VHDXLogEntries *log, VHDXLogDescEntries **buffer)
282{
283 int ret = 0;
284 uint32_t desc_sectors;
285 uint32_t sectors_read;
286 VHDXLogEntryHeader hdr;
287 VHDXLogDescEntries *desc_entries = NULL;
288 int i;
289
290 assert(*buffer == NULL);
291
292 ret = vhdx_log_peek_hdr(bs, log, &hdr);
293 if (ret < 0) {
294 goto exit;
295 }
296 vhdx_log_entry_hdr_le_import(&hdr);
297 if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
298 ret = -EINVAL;
299 goto exit;
300 }
301
302 desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
303 desc_entries = qemu_blockalign(bs, desc_sectors * VHDX_LOG_SECTOR_SIZE);
304
305 ret = vhdx_log_read_sectors(bs, log, &sectors_read, desc_entries,
306 desc_sectors, false);
307 if (ret < 0) {
308 goto free_and_exit;
309 }
310 if (sectors_read != desc_sectors) {
311 ret = -EINVAL;
312 goto free_and_exit;
313 }
314
315 /* put in proper endianness, and validate each desc */
316 for (i = 0; i < hdr.descriptor_count; i++) {
317 vhdx_log_desc_le_import(&desc_entries->desc[i]);
318 if (vhdx_log_desc_is_valid(&desc_entries->desc[i], &hdr) == false) {
319 ret = -EINVAL;
320 goto free_and_exit;
321 }
322 }
323
324 *buffer = desc_entries;
325 goto exit;
326
327free_and_exit:
328 qemu_vfree(desc_entries);
329exit:
330 return ret;
331}
332
333
334/* Flushes the descriptor described by desc to the VHDX image file.
335 * If the descriptor is a data descriptor, than 'data' must be non-NULL,
336 * and >= 4096 bytes (VHDX_LOG_SECTOR_SIZE), containing the data to be
337 * written.
338 *
339 * Verification is performed to make sure the sequence numbers of a data
340 * descriptor match the sequence number in the desc.
341 *
342 * For a zero descriptor, it may describe multiple sectors to fill with zeroes.
343 * In this case, it should be noted that zeroes are written to disk, and the
344 * image file is not extended as a sparse file. */
345static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc,
346 VHDXLogDataSector *data)
347{
348 int ret = 0;
349 uint64_t seq, file_offset;
350 uint32_t offset = 0;
351 void *buffer = NULL;
352 uint64_t count = 1;
353 int i;
354
355 buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
356
357 if (!memcmp(&desc->signature, "desc", 4)) {
358 /* data sector */
359 if (data == NULL) {
360 ret = -EFAULT;
361 goto exit;
362 }
363
364 /* The sequence number of the data sector must match that
365 * in the descriptor */
366 seq = data->sequence_high;
367 seq <<= 32;
368 seq |= data->sequence_low & 0xffffffff;
369
370 if (seq != desc->sequence_number) {
371 ret = -EINVAL;
372 goto exit;
373 }
374
375 /* Each data sector is in total 4096 bytes, however the first
376 * 8 bytes, and last 4 bytes, are located in the descriptor */
377 memcpy(buffer, &desc->leading_bytes, 8);
378 offset += 8;
379
380 memcpy(buffer+offset, data->data, 4084);
381 offset += 4084;
382
383 memcpy(buffer+offset, &desc->trailing_bytes, 4);
384
385 } else if (!memcmp(&desc->signature, "zero", 4)) {
386 /* write 'count' sectors of sector */
387 memset(buffer, 0, VHDX_LOG_SECTOR_SIZE);
388 count = desc->zero_length / VHDX_LOG_SECTOR_SIZE;
389 }
390
391 file_offset = desc->file_offset;
392
393 /* count is only > 1 if we are writing zeroes */
394 for (i = 0; i < count; i++) {
395 ret = bdrv_pwrite_sync(bs->file, file_offset, buffer,
396 VHDX_LOG_SECTOR_SIZE);
397 if (ret < 0) {
398 goto exit;
399 }
400 file_offset += VHDX_LOG_SECTOR_SIZE;
401 }
402
403exit:
404 qemu_vfree(buffer);
405 return ret;
406}
407
408/* Flush the entire log (as described by 'logs') to the VHDX image
409 * file, and then set the log to 'empty' status once complete.
410 *
411 * The log entries should be validate prior to flushing */
412static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s,
413 VHDXLogSequence *logs)
414{
415 int ret = 0;
416 int i;
417 uint32_t cnt, sectors_read;
418 uint64_t new_file_size;
419 void *data = NULL;
420 VHDXLogDescEntries *desc_entries = NULL;
421 VHDXLogEntryHeader hdr_tmp = { 0 };
422
423 cnt = logs->count;
424
425 data = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
426
427 ret = vhdx_user_visible_write(bs, s);
428 if (ret < 0) {
429 goto exit;
430 }
431
432 /* each iteration represents one log sequence, which may span multiple
433 * sectors */
434 while (cnt--) {
435 ret = vhdx_log_peek_hdr(bs, &logs->log, &hdr_tmp);
436 if (ret < 0) {
437 goto exit;
438 }
439 /* if the log shows a FlushedFileOffset larger than our current file
440 * size, then that means the file has been truncated / corrupted, and
441 * we must refused to open it / use it */
442 if (hdr_tmp.flushed_file_offset > bdrv_getlength(bs->file)) {
443 ret = -EINVAL;
444 goto exit;
445 }
446
447 ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries);
448 if (ret < 0) {
449 goto exit;
450 }
451
452 for (i = 0; i < desc_entries->hdr.descriptor_count; i++) {
453 if (!memcmp(&desc_entries->desc[i].signature, "desc", 4)) {
454 /* data sector, so read a sector to flush */
455 ret = vhdx_log_read_sectors(bs, &logs->log, &sectors_read,
456 data, 1, false);
457 if (ret < 0) {
458 goto exit;
459 }
460 if (sectors_read != 1) {
461 ret = -EINVAL;
462 goto exit;
463 }
464 }
465
466 ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data);
467 if (ret < 0) {
468 goto exit;
469 }
470 }
471 if (bdrv_getlength(bs->file) < desc_entries->hdr.last_file_offset) {
472 new_file_size = desc_entries->hdr.last_file_offset;
473 if (new_file_size % (1024*1024)) {
474 /* round up to nearest 1MB boundary */
475 new_file_size = ((new_file_size >> 20) + 1) << 20;
476 bdrv_truncate(bs->file, new_file_size);
477 }
478 }
479 qemu_vfree(desc_entries);
480 desc_entries = NULL;
481 }
482
483 bdrv_flush(bs);
484 /* once the log is fully flushed, indicate that we have an empty log
485 * now. This also sets the log guid to 0, to indicate an empty log */
486 vhdx_log_reset(bs, s);
487
488exit:
489 qemu_vfree(data);
490 qemu_vfree(desc_entries);
491 return ret;
492}
493
494static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s,
495 VHDXLogEntries *log, uint64_t seq,
496 bool *valid, VHDXLogEntryHeader *entry)
497{
498 int ret = 0;
499 VHDXLogEntryHeader hdr;
500 void *buffer = NULL;
501 uint32_t i, desc_sectors, total_sectors, crc;
502 uint32_t sectors_read = 0;
503 VHDXLogDescEntries *desc_buffer = NULL;
504
505 *valid = false;
506
507 ret = vhdx_log_peek_hdr(bs, log, &hdr);
508 if (ret < 0) {
509 goto inc_and_exit;
510 }
511
512 vhdx_log_entry_hdr_le_import(&hdr);
513
514
515 if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) {
516 goto inc_and_exit;
517 }
518
519 if (seq > 0) {
520 if (hdr.sequence_number != seq + 1) {
521 goto inc_and_exit;
522 }
523 }
524
525 desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count);
526
527 /* Read desc sectors, and calculate log checksum */
528
529 total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE;
530
531
532 /* read_desc() will incrememnt the read idx */
533 ret = vhdx_log_read_desc(bs, s, log, &desc_buffer);
534 if (ret < 0) {
535 goto free_and_exit;
536 }
537
538 crc = vhdx_checksum_calc(0xffffffff, (void *)desc_buffer,
539 desc_sectors * VHDX_LOG_SECTOR_SIZE, 4);
540 crc ^= 0xffffffff;
541
542 buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE);
543 if (total_sectors > desc_sectors) {
544 for (i = 0; i < total_sectors - desc_sectors; i++) {
545 sectors_read = 0;
546 ret = vhdx_log_read_sectors(bs, log, &sectors_read, buffer,
547 1, false);
548 if (ret < 0 || sectors_read != 1) {
549 goto free_and_exit;
550 }
551 crc = vhdx_checksum_calc(crc, buffer, VHDX_LOG_SECTOR_SIZE, -1);
552 crc ^= 0xffffffff;
553 }
554 }
555 crc ^= 0xffffffff;
556 if (crc != desc_buffer->hdr.checksum) {
557 goto free_and_exit;
558 }
559
560 *valid = true;
561 *entry = hdr;
562 goto free_and_exit;
563
564inc_and_exit:
565 log->read = vhdx_log_inc_idx(log->read, log->length);
566
567free_and_exit:
568 qemu_vfree(buffer);
569 qemu_vfree(desc_buffer);
570 return ret;
571}
572
573/* Search through the log circular buffer, and find the valid, active
574 * log sequence, if any exists
575 * */
576static int vhdx_log_search(BlockDriverState *bs, BDRVVHDXState *s,
577 VHDXLogSequence *logs)
578{
579 int ret = 0;
580 uint32_t tail;
581 bool seq_valid = false;
582 VHDXLogSequence candidate = { 0 };
583 VHDXLogEntryHeader hdr = { 0 };
584 VHDXLogEntries curr_log;
585
586 memcpy(&curr_log, &s->log, sizeof(VHDXLogEntries));
587 curr_log.write = curr_log.length; /* assume log is full */
588 curr_log.read = 0;
589
590
591 /* now we will go through the whole log sector by sector, until
592 * we find a valid, active log sequence, or reach the end of the
593 * log buffer */
594 for (;;) {
595 uint64_t curr_seq = 0;
596 VHDXLogSequence current = { 0 };
597
598 tail = curr_log.read;
599
600 ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq,
601 &seq_valid, &hdr);
602 if (ret < 0) {
603 goto exit;
604 }
605
606 if (seq_valid) {
607 current.valid = true;
608 current.log = curr_log;
609 current.log.read = tail;
610 current.log.write = curr_log.read;
611 current.count = 1;
612 current.hdr = hdr;
613
614
615 for (;;) {
616 ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq,
617 &seq_valid, &hdr);
618 if (ret < 0) {
619 goto exit;
620 }
621 if (seq_valid == false) {
622 break;
623 }
624 current.log.write = curr_log.read;
625 current.count++;
626
627 curr_seq = hdr.sequence_number;
628 }
629 }
630
631 if (current.valid) {
632 if (candidate.valid == false ||
633 current.hdr.sequence_number > candidate.hdr.sequence_number) {
634 candidate = current;
635 }
636 }
637
638 if (curr_log.read < tail) {
639 break;
640 }
641 }
642
643 *logs = candidate;
644
645 if (candidate.valid) {
646 /* this is the next sequence number, for writes */
647 s->log.sequence = candidate.hdr.sequence_number + 1;
648 }
649
650
651exit:
652 return ret;
653}
654
655/* Parse the replay log. Per the VHDX spec, if the log is present
656 * it must be replayed prior to opening the file, even read-only.
657 *
658 * If read-only, we must replay the log in RAM (or refuse to open
659 * a dirty VHDX file read-only) */
660int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed)
661{
662 int ret = 0;
663 VHDXHeader *hdr;
664 VHDXLogSequence logs = { 0 };
665
666 hdr = s->headers[s->curr_header];
667
668 *flushed = false;
669
670 /* s->log.hdr is freed in vhdx_close() */
671 if (s->log.hdr == NULL) {
672 s->log.hdr = qemu_blockalign(bs, sizeof(VHDXLogEntryHeader));
673 }
674
675 s->log.offset = hdr->log_offset;
676 s->log.length = hdr->log_length;
677
678 if (s->log.offset < VHDX_LOG_MIN_SIZE ||
679 s->log.offset % VHDX_LOG_MIN_SIZE) {
680 ret = -EINVAL;
681 goto exit;
682 }
683
684 /* per spec, only log version of 0 is supported */
685 if (hdr->log_version != 0) {
686 ret = -EINVAL;
687 goto exit;
688 }
689
690 /* If either the log guid, or log length is zero,
691 * then a replay log is not present */
692 if (guid_eq(hdr->log_guid, zero_guid)) {
693 goto exit;
694 }
695
696 if (hdr->log_length == 0) {
697 goto exit;
698 }
699
700 if (hdr->log_length % VHDX_LOG_MIN_SIZE) {
701 ret = -EINVAL;
702 goto exit;
703 }
704
705
706 /* The log is present, we need to find if and where there is an active
707 * sequence of valid entries present in the log. */
708
709 ret = vhdx_log_search(bs, s, &logs);
710 if (ret < 0) {
711 goto exit;
712 }
713
714 if (logs.valid) {
715 /* now flush the log */
716 ret = vhdx_log_flush(bs, s, &logs);
717 if (ret < 0) {
718 goto exit;
719 }
720 *flushed = true;
721 }
722
723
724exit:
725 return ret;
726}
727
728