2 * Block driver for Hyper-V VHDX Images
4 * Copyright (c) 2013 Red Hat, Inc.,
7 * Jeff Cody <jcody@redhat.com>
9 * This is based on the "VHDX Format Specification v1.00", published 8/25/2012
11 * https://www.microsoft.com/en-us/download/details.aspx?id=34750
13 * This file covers the functionality of the metadata log writing, parsing, and
16 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
17 * See the COPYING.LIB file in the top-level directory.
20 #include "qemu-common.h"
21 #include "block/block_int.h"
22 #include "qemu/module.h"
23 #include "block/vhdx.h"
26 typedef struct VHDXLogSequence
{
30 VHDXLogEntryHeader hdr
;
33 typedef struct VHDXLogDescEntries
{
34 VHDXLogEntryHeader hdr
;
35 VHDXLogDescriptor desc
[];
38 static const MSGUID zero_guid
= { 0 };
40 /* The log located on the disk is a circular buffer containing
41 * sectors of 4096 bytes each.
43 * It is assumed for the read/write functions below that the
44 * circular buffer scheme uses a 'one sector open' to indicate
45 * the buffer is full. Given the validation methods used for each
46 * sector, this method should be compatible with other methods that
47 * do not waste a sector.
51 /* Allow peeking at the hdr entry at the beginning of the current
52 * read index, without advancing the read index */
53 static int vhdx_log_peek_hdr(BlockDriverState
*bs
, VHDXLogEntries
*log
,
54 VHDXLogEntryHeader
*hdr
)
62 /* peek is only supported on sector boundaries */
63 if (log
->read
% VHDX_LOG_SECTOR_SIZE
) {
69 /* we are guaranteed that a) log sectors are 4096 bytes,
70 * and b) the log length is a multiple of 1MB. So, there
71 * is always a round number of sectors in the buffer */
72 if ((read
+ sizeof(VHDXLogEntryHeader
)) > log
->length
) {
76 if (read
== log
->write
) {
81 offset
= log
->offset
+ read
;
83 ret
= bdrv_pread(bs
->file
, offset
, hdr
, sizeof(VHDXLogEntryHeader
));
92 /* Index increment for log, based on sector boundaries */
93 static int vhdx_log_inc_idx(uint32_t idx
, uint64_t length
)
95 idx
+= VHDX_LOG_SECTOR_SIZE
;
96 /* we are guaranteed that a) log sectors are 4096 bytes,
97 * and b) the log length is a multiple of 1MB. So, there
98 * is always a round number of sectors in the buffer */
99 return idx
>= length
? 0 : idx
;
103 /* Reset the log to empty */
104 static void vhdx_log_reset(BlockDriverState
*bs
, BDRVVHDXState
*s
)
107 s
->log
.read
= s
->log
.write
= 0;
108 /* a log guid of 0 indicates an empty log to any parser of v0
110 vhdx_update_headers(bs
, s
, false, &guid
);
113 /* Reads num_sectors from the log (all log sectors are 4096 bytes),
114 * into buffer 'buffer'. Upon return, *sectors_read will contain
115 * the number of sectors successfully read.
117 * It is assumed that 'buffer' is already allocated, and of sufficient
118 * size (i.e. >= 4096*num_sectors).
120 * If 'peek' is true, then the tail (read) pointer for the circular buffer is
123 * 0 is returned on success, -errno otherwise. */
124 static int vhdx_log_read_sectors(BlockDriverState
*bs
, VHDXLogEntries
*log
,
125 uint32_t *sectors_read
, void *buffer
,
126 uint32_t num_sectors
, bool peek
)
135 while (num_sectors
) {
136 if (read
== log
->write
) {
140 offset
= log
->offset
+ read
;
142 ret
= bdrv_pread(bs
->file
, offset
, buffer
, VHDX_LOG_SECTOR_SIZE
);
146 read
= vhdx_log_inc_idx(read
, log
->length
);
148 *sectors_read
= *sectors_read
+ 1;
159 /* Validates a log entry header */
160 static bool vhdx_log_hdr_is_valid(VHDXLogEntries
*log
, VHDXLogEntryHeader
*hdr
,
165 if (memcmp(&hdr
->signature
, "loge", 4)) {
169 /* if the individual entry length is larger than the whole log
170 * buffer, that is obviously invalid */
171 if (log
->length
< hdr
->entry_length
) {
175 /* length of entire entry must be in units of 4KB (log sector size) */
176 if (hdr
->entry_length
% (VHDX_LOG_SECTOR_SIZE
)) {
180 /* per spec, sequence # must be > 0 */
181 if (hdr
->sequence_number
== 0) {
185 /* log entries are only valid if they match the file-wide log guid
186 * found in the active header */
187 if (!guid_eq(hdr
->log_guid
, s
->headers
[s
->curr_header
]->log_guid
)) {
191 if (hdr
->descriptor_count
* sizeof(VHDXLogDescriptor
) > hdr
->entry_length
) {
202 * Given a log header, this will validate that the descriptors and the
203 * corresponding data sectors (if applicable)
205 * Validation consists of:
206 * 1. Making sure the sequence number matches the entry header
207 * 2. Verifying a valid signature ('zero' or 'desc' for descriptors)
208 * 3. File offset field is a multiple of 4KB
209 * 4. If a data descriptor, the corresponding data sector
210 * has its signature ('data') and matching sequence number
212 * @desc: the data buffer containing the descriptor
213 * @hdr: the log entry header
215 * Returns true if valid
217 static bool vhdx_log_desc_is_valid(VHDXLogDescriptor
*desc
,
218 VHDXLogEntryHeader
*hdr
)
222 if (desc
->sequence_number
!= hdr
->sequence_number
) {
225 if (desc
->file_offset
% VHDX_LOG_SECTOR_SIZE
) {
229 if (!memcmp(&desc
->signature
, "zero", 4)) {
230 if (desc
->zero_length
% VHDX_LOG_SECTOR_SIZE
== 0) {
234 } else if (!memcmp(&desc
->signature
, "desc", 4)) {
244 /* Prior to sector data for a log entry, there is the header
245 * and the descriptors referenced in the header:
249 * [ hdr, desc ][ desc ][ ... ][ data ][ ... ]
251 * The first sector in a log entry has a 64 byte header, and
252 * up to 126 32-byte descriptors. If more descriptors than
253 * 126 are required, then subsequent sectors can have up to 128
254 * descriptors. Each sector is 4KB. Data follows the descriptor
257 * This will return the number of sectors needed to encompass
258 * the passed number of descriptors in desc_cnt.
260 * This will never return 0, even if desc_cnt is 0.
262 static int vhdx_compute_desc_sectors(uint32_t desc_cnt
)
264 uint32_t desc_sectors
;
266 desc_cnt
+= 2; /* account for header in first sector */
267 desc_sectors
= desc_cnt
/ 128;
268 if (desc_cnt
% 128) {
276 /* Reads the log header, and subsequent descriptors (if any). This
277 * will allocate all the space for buffer, which must be NULL when
278 * passed into this function. Each descriptor will also be validated,
279 * and error returned if any are invalid. */
280 static int vhdx_log_read_desc(BlockDriverState
*bs
, BDRVVHDXState
*s
,
281 VHDXLogEntries
*log
, VHDXLogDescEntries
**buffer
)
284 uint32_t desc_sectors
;
285 uint32_t sectors_read
;
286 VHDXLogEntryHeader hdr
;
287 VHDXLogDescEntries
*desc_entries
= NULL
;
290 assert(*buffer
== NULL
);
292 ret
= vhdx_log_peek_hdr(bs
, log
, &hdr
);
296 vhdx_log_entry_hdr_le_import(&hdr
);
297 if (vhdx_log_hdr_is_valid(log
, &hdr
, s
) == false) {
302 desc_sectors
= vhdx_compute_desc_sectors(hdr
.descriptor_count
);
303 desc_entries
= qemu_blockalign(bs
, desc_sectors
* VHDX_LOG_SECTOR_SIZE
);
305 ret
= vhdx_log_read_sectors(bs
, log
, §ors_read
, desc_entries
,
306 desc_sectors
, false);
310 if (sectors_read
!= desc_sectors
) {
315 /* put in proper endianness, and validate each desc */
316 for (i
= 0; i
< hdr
.descriptor_count
; i
++) {
317 vhdx_log_desc_le_import(&desc_entries
->desc
[i
]);
318 if (vhdx_log_desc_is_valid(&desc_entries
->desc
[i
], &hdr
) == false) {
324 *buffer
= desc_entries
;
328 qemu_vfree(desc_entries
);
334 /* Flushes the descriptor described by desc to the VHDX image file.
335 * If the descriptor is a data descriptor, then 'data' must be non-NULL,
336 * and >= 4096 bytes (VHDX_LOG_SECTOR_SIZE), containing the data to be
339 * Verification is performed to make sure the sequence numbers of a data
340 * descriptor match the sequence number in the desc.
342 * For a zero descriptor, it may describe multiple sectors to fill with zeroes.
343 * In this case, it should be noted that zeroes are written to disk, and the
344 * image file is not extended as a sparse file. */
345 static int vhdx_log_flush_desc(BlockDriverState
*bs
, VHDXLogDescriptor
*desc
,
346 VHDXLogDataSector
*data
)
349 uint64_t seq
, file_offset
;
355 buffer
= qemu_blockalign(bs
, VHDX_LOG_SECTOR_SIZE
);
357 if (!memcmp(&desc
->signature
, "desc", 4)) {
364 /* The sequence number of the data sector must match that
365 * in the descriptor */
366 seq
= data
->sequence_high
;
368 seq
|= data
->sequence_low
& 0xffffffff;
370 if (seq
!= desc
->sequence_number
) {
375 /* Each data sector is in total 4096 bytes, however the first
376 * 8 bytes, and last 4 bytes, are located in the descriptor */
377 memcpy(buffer
, &desc
->leading_bytes
, 8);
380 memcpy(buffer
+offset
, data
->data
, 4084);
383 memcpy(buffer
+offset
, &desc
->trailing_bytes
, 4);
385 } else if (!memcmp(&desc
->signature
, "zero", 4)) {
386 /* write 'count' sectors of zeroes */
387 memset(buffer
, 0, VHDX_LOG_SECTOR_SIZE
);
388 count
= desc
->zero_length
/ VHDX_LOG_SECTOR_SIZE
;
391 file_offset
= desc
->file_offset
;
393 /* count is only > 1 if we are writing zeroes */
394 for (i
= 0; i
< count
; i
++) {
395 ret
= bdrv_pwrite_sync(bs
->file
, file_offset
, buffer
,
396 VHDX_LOG_SECTOR_SIZE
);
400 file_offset
+= VHDX_LOG_SECTOR_SIZE
;
408 /* Flush the entire log (as described by 'logs') to the VHDX image
409 * file, and then set the log to 'empty' status once complete.
411 * The log entries should be validated prior to flushing */
412 static int vhdx_log_flush(BlockDriverState
*bs
, BDRVVHDXState
*s
,
413 VHDXLogSequence
*logs
)
417 uint32_t cnt
, sectors_read
;
418 uint64_t new_file_size
;
420 VHDXLogDescEntries
*desc_entries
= NULL
;
421 VHDXLogEntryHeader hdr_tmp
= { 0 };
425 data
= qemu_blockalign(bs
, VHDX_LOG_SECTOR_SIZE
);
427 ret
= vhdx_user_visible_write(bs
, s
);
432 /* each iteration represents one log sequence, which may span multiple
435 ret
= vhdx_log_peek_hdr(bs
, &logs
->log
, &hdr_tmp
);
439 /* if the log shows a FlushedFileOffset larger than our current file
440 * size, then that means the file has been truncated / corrupted, and
441 * we must refuse to open it / use it */
442 if (hdr_tmp
.flushed_file_offset
> bdrv_getlength(bs
->file
)) {
447 ret
= vhdx_log_read_desc(bs
, s
, &logs
->log
, &desc_entries
);
452 for (i
= 0; i
< desc_entries
->hdr
.descriptor_count
; i
++) {
453 if (!memcmp(&desc_entries
->desc
[i
].signature
, "desc", 4)) {
454 /* data sector, so read a sector to flush */
455 ret
= vhdx_log_read_sectors(bs
, &logs
->log
, §ors_read
,
460 if (sectors_read
!= 1) {
466 ret
= vhdx_log_flush_desc(bs
, &desc_entries
->desc
[i
], data
);
471 if (bdrv_getlength(bs
->file
) < desc_entries
->hdr
.last_file_offset
) {
472 new_file_size
= desc_entries
->hdr
.last_file_offset
;
473 if (new_file_size
% (1024*1024)) {
474 /* round up to nearest 1MB boundary */
475 new_file_size
= ((new_file_size
>> 20) + 1) << 20;
476 bdrv_truncate(bs
->file
, new_file_size
);
479 qemu_vfree(desc_entries
);
484 /* once the log is fully flushed, indicate that we have an empty log
485 * now. This also sets the log guid to 0, to indicate an empty log */
486 vhdx_log_reset(bs
, s
);
490 qemu_vfree(desc_entries
);
494 static int vhdx_validate_log_entry(BlockDriverState
*bs
, BDRVVHDXState
*s
,
495 VHDXLogEntries
*log
, uint64_t seq
,
496 bool *valid
, VHDXLogEntryHeader
*entry
)
499 VHDXLogEntryHeader hdr
;
501 uint32_t i
, desc_sectors
, total_sectors
, crc
;
502 uint32_t sectors_read
= 0;
503 VHDXLogDescEntries
*desc_buffer
= NULL
;
507 ret
= vhdx_log_peek_hdr(bs
, log
, &hdr
);
512 vhdx_log_entry_hdr_le_import(&hdr
);
515 if (vhdx_log_hdr_is_valid(log
, &hdr
, s
) == false) {
520 if (hdr
.sequence_number
!= seq
+ 1) {
525 desc_sectors
= vhdx_compute_desc_sectors(hdr
.descriptor_count
);
527 /* Read desc sectors, and calculate log checksum */
529 total_sectors
= hdr
.entry_length
/ VHDX_LOG_SECTOR_SIZE
;
532 /* read_desc() will increment the read idx */
533 ret
= vhdx_log_read_desc(bs
, s
, log
, &desc_buffer
);
538 crc
= vhdx_checksum_calc(0xffffffff, (void *)desc_buffer
,
539 desc_sectors
* VHDX_LOG_SECTOR_SIZE
, 4);
542 buffer
= qemu_blockalign(bs
, VHDX_LOG_SECTOR_SIZE
);
543 if (total_sectors
> desc_sectors
) {
544 for (i
= 0; i
< total_sectors
- desc_sectors
; i
++) {
546 ret
= vhdx_log_read_sectors(bs
, log
, §ors_read
, buffer
,
548 if (ret
< 0 || sectors_read
!= 1) {
551 crc
= vhdx_checksum_calc(crc
, buffer
, VHDX_LOG_SECTOR_SIZE
, -1);
556 if (crc
!= desc_buffer
->hdr
.checksum
) {
565 log
->read
= vhdx_log_inc_idx(log
->read
, log
->length
);
569 qemu_vfree(desc_buffer
);
573 /* Search through the log circular buffer, and find the valid, active
574 * log sequence, if any exists
576 static int vhdx_log_search(BlockDriverState
*bs
, BDRVVHDXState
*s
,
577 VHDXLogSequence
*logs
)
581 bool seq_valid
= false;
582 VHDXLogSequence candidate
= { 0 };
583 VHDXLogEntryHeader hdr
= { 0 };
584 VHDXLogEntries curr_log
;
586 memcpy(&curr_log
, &s
->log
, sizeof(VHDXLogEntries
));
587 curr_log
.write
= curr_log
.length
; /* assume log is full */
591 /* now we will go through the whole log sector by sector, until
592 * we find a valid, active log sequence, or reach the end of the
595 uint64_t curr_seq
= 0;
596 VHDXLogSequence current
= { 0 };
598 tail
= curr_log
.read
;
600 ret
= vhdx_validate_log_entry(bs
, s
, &curr_log
, curr_seq
,
607 current
.valid
= true;
608 current
.log
= curr_log
;
609 current
.log
.read
= tail
;
610 current
.log
.write
= curr_log
.read
;
616 ret
= vhdx_validate_log_entry(bs
, s
, &curr_log
, curr_seq
,
621 if (seq_valid
== false) {
624 current
.log
.write
= curr_log
.read
;
627 curr_seq
= hdr
.sequence_number
;
632 if (candidate
.valid
== false ||
633 current
.hdr
.sequence_number
> candidate
.hdr
.sequence_number
) {
638 if (curr_log
.read
< tail
) {
645 if (candidate
.valid
) {
646 /* this is the next sequence number, for writes */
647 s
->log
.sequence
= candidate
.hdr
.sequence_number
+ 1;
655 /* Parse the replay log. Per the VHDX spec, if the log is present
656 * it must be replayed prior to opening the file, even read-only.
658 * If read-only, we must replay the log in RAM (or refuse to open
659 * a dirty VHDX file read-only) */
660 int vhdx_parse_log(BlockDriverState
*bs
, BDRVVHDXState
*s
, bool *flushed
)
664 VHDXLogSequence logs
= { 0 };
666 hdr
= s
->headers
[s
->curr_header
];
670 /* s->log.hdr is freed in vhdx_close() */
671 if (s
->log
.hdr
== NULL
) {
672 s
->log
.hdr
= qemu_blockalign(bs
, sizeof(VHDXLogEntryHeader
));
675 s
->log
.offset
= hdr
->log_offset
;
676 s
->log
.length
= hdr
->log_length
;
678 if (s
->log
.offset
< VHDX_LOG_MIN_SIZE
||
679 s
->log
.offset
% VHDX_LOG_MIN_SIZE
) {
684 /* per spec, only log version of 0 is supported */
685 if (hdr
->log_version
!= 0) {
690 /* If either the log guid, or log length is zero,
691 * then a replay log is not present */
692 if (guid_eq(hdr
->log_guid
, zero_guid
)) {
696 if (hdr
->log_length
== 0) {
700 if (hdr
->log_length
% VHDX_LOG_MIN_SIZE
) {
706 /* The log is present, we need to find if and where there is an active
707 * sequence of valid entries present in the log. */
709 ret
= vhdx_log_search(bs
, s
, &logs
);
715 /* now flush the log */
716 ret
= vhdx_log_flush(bs
, s
, &logs
);