]> git.proxmox.com Git - qemu.git/blame - block/vmdk.c
VMDK: introduce VmdkExtent
[qemu.git] / block / vmdk.c
CommitLineData
ea2384d3
FB
1/*
2 * Block driver for the VMDK format
5fafdf24 3 *
ea2384d3 4 * Copyright (c) 2004 Fabrice Bellard
ff1afc72 5 * Copyright (c) 2005 Filip Navara
5fafdf24 6 *
ea2384d3
FB
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 * THE SOFTWARE.
24 */
5f4da8c0 25
faf07963 26#include "qemu-common.h"
ea2384d3 27#include "block_int.h"
5efa9d5a 28#include "module.h"
ea2384d3 29
ea2384d3
FB
/* Image magic numbers: the first four bytes of the file read as a
 * big-endian 32-bit value. */
#define VMDK3_MAGIC 0x434f5744 /* 'C' 'O' 'W' 'D' */
#define VMDK4_MAGIC 0x4b444d56 /* 'K' 'D' 'M' 'V' */
32
/* Header of a VMDK3 image as stored right after the 4-byte magic.
 * vmdk_open() converts the fields it uses with le32_to_cpu(). */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint32_t disk_sectors;      /* passed to vmdk_add_extent() as extent size */
    uint32_t granularity;       /* passed as the extent's cluster_sectors */
    uint32_t l1dir_offset;      /* L1 table position, in 512-byte sectors */
    uint32_t l1dir_size;
    uint32_t file_sectors;
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} VMDK3Header;
45
/* Header of a VMDK4 image as stored right after the 4-byte magic.
 * The structure is packed so that it can be read and written directly;
 * vmdk_create() stores every multi-byte field in little-endian order. */
typedef struct {
    uint32_t version;
    uint32_t flags;
    int64_t capacity;           /* disk size in 512-byte sectors */
    int64_t granularity;        /* sectors per grain (cluster) */
    int64_t desc_offset;        /* descriptor position, in sectors */
    int64_t desc_size;          /* descriptor length, in sectors */
    int32_t num_gtes_per_gte;   /* entries per grain table */
    int64_t rgd_offset;         /* sector of the first grain directory written
                                 * by vmdk_create(); vmdk_open() uses it as the
                                 * backup L1 table */
    int64_t gd_offset;          /* sector of the second grain directory;
                                 * vmdk_open() uses it as the L1 table */
    int64_t grain_offset;       /* vmdk_create() sizes the new file to this
                                 * many sectors */
    char filler[1];
    char check_bytes[4];        /* vmdk_create() writes 0x0a 0x20 0x0d 0x0a */
} __attribute__((packed)) VMDK4Header;
ea2384d3
FB
60
/* Number of L2 tables each extent keeps cached in memory. */
#define L2_CACHE_SIZE 16
62
b3976d3c
FZ
63typedef struct VmdkExtent {
64 BlockDriverState *file;
65 bool flat;
66 int64_t sectors;
67 int64_t end_sector;
ea2384d3 68 int64_t l1_table_offset;
ff1afc72 69 int64_t l1_backup_table_offset;
ea2384d3 70 uint32_t *l1_table;
ff1afc72 71 uint32_t *l1_backup_table;
ea2384d3
FB
72 unsigned int l1_size;
73 uint32_t l1_entry_sectors;
74
75 unsigned int l2_size;
76 uint32_t *l2_cache;
77 uint32_t l2_cache_offsets[L2_CACHE_SIZE];
78 uint32_t l2_cache_counts[L2_CACHE_SIZE];
79
80 unsigned int cluster_sectors;
b3976d3c
FZ
81} VmdkExtent;
82
83typedef struct BDRVVmdkState {
5f4da8c0 84 uint32_t parent_cid;
b3976d3c
FZ
85 int num_extents;
86 /* Extent array with num_extents entries, ascend ordered by address */
87 VmdkExtent *extents;
ea2384d3
FB
88} BDRVVmdkState;
89
630530a6
TS
/* Describes an L2 entry that get_cluster_offset() assigned in memory and
 * that vmdk_L2update() still has to write to disk. */
typedef struct VmdkMetaData {
    uint32_t offset;        /* new L2 entry value, already little-endian */
    unsigned int l1_index;  /* index of the covering L1 entry */
    unsigned int l2_index;  /* index of the entry inside its L2 table */
    unsigned int l2_offset; /* sector number of the L2 table */
    int valid;              /* non-zero when the other fields are filled in */
} VmdkMetaData;
97
ea2384d3
FB
98static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
99{
100 uint32_t magic;
101
102 if (buf_size < 4)
103 return 0;
104 magic = be32_to_cpu(*(uint32_t *)buf);
105 if (magic == VMDK3_MAGIC ||
106 magic == VMDK4_MAGIC)
107 return 100;
108 else
109 return 0;
110}
111
5f4da8c0
TS
/* When defined, vmdk_is_cid_valid() compares the stored parent CID with the
 * CID currently found in the backing image. */
#define CHECK_CID 1

#define SECTOR_SIZE 512
/* Parenthesized so the macro stays a single operand inside larger
 * expressions (e.g. x % DESC_SIZE). */
#define DESC_SIZE (20 * SECTOR_SIZE)    /* 20 sectors of 512 bytes each */
#define HEADER_SIZE 512                 /* first sector of 512 bytes */
5f4da8c0 117
b3976d3c
FZ
118static void vmdk_free_extents(BlockDriverState *bs)
119{
120 int i;
121 BDRVVmdkState *s = bs->opaque;
122
123 for (i = 0; i < s->num_extents; i++) {
124 qemu_free(s->extents[i].l1_table);
125 qemu_free(s->extents[i].l2_cache);
126 qemu_free(s->extents[i].l1_backup_table);
127 }
128 qemu_free(s->extents);
129}
130
5f4da8c0 131static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
ea2384d3 132{
5f4da8c0
TS
133 char desc[DESC_SIZE];
134 uint32_t cid;
7ccfb2eb 135 const char *p_name, *cid_str;
5f4da8c0
TS
136 size_t cid_str_size;
137
138 /* the descriptor offset = 0x200 */
6511ef77 139 if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
5f4da8c0
TS
140 return 0;
141
142 if (parent) {
143 cid_str = "parentCID";
144 cid_str_size = sizeof("parentCID");
145 } else {
146 cid_str = "CID";
147 cid_str_size = sizeof("CID");
148 }
149
511d2b14 150 if ((p_name = strstr(desc,cid_str)) != NULL) {
5f4da8c0
TS
151 p_name += cid_str_size;
152 sscanf(p_name,"%x",&cid);
153 }
154
155 return cid;
156}
157
158static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
159{
5f4da8c0
TS
160 char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
161 char *p_name, *tmp_str;
162
163 /* the descriptor offset = 0x200 */
6511ef77 164 if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
5f4da8c0
TS
165 return -1;
166
167 tmp_str = strstr(desc,"parentCID");
363a37d5 168 pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
511d2b14 169 if ((p_name = strstr(desc,"CID")) != NULL) {
5f4da8c0 170 p_name += sizeof("CID");
363a37d5
BS
171 snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid);
172 pstrcat(desc, sizeof(desc), tmp_desc);
5f4da8c0
TS
173 }
174
b8852e87 175 if (bdrv_pwrite_sync(bs->file, 0x200, desc, DESC_SIZE) < 0)
5f4da8c0
TS
176 return -1;
177 return 0;
178}
179
180static int vmdk_is_cid_valid(BlockDriverState *bs)
181{
182#ifdef CHECK_CID
183 BDRVVmdkState *s = bs->opaque;
b171271a 184 BlockDriverState *p_bs = bs->backing_hd;
5f4da8c0
TS
185 uint32_t cur_pcid;
186
187 if (p_bs) {
188 cur_pcid = vmdk_read_cid(p_bs,0);
189 if (s->parent_cid != cur_pcid)
190 // CID not valid
191 return 0;
192 }
193#endif
194 // CID valid
195 return 1;
196}
197
198static int vmdk_snapshot_create(const char *filename, const char *backing_file)
199{
200 int snp_fd, p_fd;
53c2e716 201 int ret;
5f4da8c0 202 uint32_t p_cid;
5fafdf24 203 char *p_name, *gd_buf, *rgd_buf;
5f4da8c0
TS
204 const char *real_filename, *temp_str;
205 VMDK4Header header;
206 uint32_t gde_entries, gd_size;
207 int64_t gd_offset, rgd_offset, capacity, gt_size;
208 char p_desc[DESC_SIZE], s_desc[DESC_SIZE], hdr[HEADER_SIZE];
7ccfb2eb 209 static const char desc_template[] =
5f4da8c0
TS
210 "# Disk DescriptorFile\n"
211 "version=1\n"
212 "CID=%x\n"
213 "parentCID=%x\n"
214 "createType=\"monolithicSparse\"\n"
215 "parentFileNameHint=\"%s\"\n"
216 "\n"
217 "# Extent description\n"
7ccfb2eb 218 "RW %u SPARSE \"%s\"\n"
5f4da8c0
TS
219 "\n"
220 "# The Disk Data Base \n"
221 "#DDB\n"
222 "\n";
223
224 snp_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 0644);
225 if (snp_fd < 0)
53c2e716 226 return -errno;
5f4da8c0
TS
227 p_fd = open(backing_file, O_RDONLY | O_BINARY | O_LARGEFILE);
228 if (p_fd < 0) {
229 close(snp_fd);
53c2e716 230 return -errno;
5f4da8c0
TS
231 }
232
233 /* read the header */
53c2e716
JQ
234 if (lseek(p_fd, 0x0, SEEK_SET) == -1) {
235 ret = -errno;
5f4da8c0 236 goto fail;
53c2e716
JQ
237 }
238 if (read(p_fd, hdr, HEADER_SIZE) != HEADER_SIZE) {
239 ret = -errno;
5f4da8c0 240 goto fail;
53c2e716 241 }
5f4da8c0
TS
242
243 /* write the header */
53c2e716
JQ
244 if (lseek(snp_fd, 0x0, SEEK_SET) == -1) {
245 ret = -errno;
5f4da8c0 246 goto fail;
53c2e716
JQ
247 }
248 if (write(snp_fd, hdr, HEADER_SIZE) == -1) {
249 ret = -errno;
5f4da8c0 250 goto fail;
53c2e716 251 }
5f4da8c0
TS
252
253 memset(&header, 0, sizeof(header));
254 memcpy(&header,&hdr[4], sizeof(header)); // skip the VMDK4_MAGIC
255
53c2e716
JQ
256 if (ftruncate(snp_fd, header.grain_offset << 9)) {
257 ret = -errno;
1640366c 258 goto fail;
53c2e716 259 }
5f4da8c0 260 /* the descriptor offset = 0x200 */
53c2e716
JQ
261 if (lseek(p_fd, 0x200, SEEK_SET) == -1) {
262 ret = -errno;
5f4da8c0 263 goto fail;
53c2e716
JQ
264 }
265 if (read(p_fd, p_desc, DESC_SIZE) != DESC_SIZE) {
266 ret = -errno;
5f4da8c0 267 goto fail;
53c2e716 268 }
5f4da8c0 269
511d2b14 270 if ((p_name = strstr(p_desc,"CID")) != NULL) {
5f4da8c0
TS
271 p_name += sizeof("CID");
272 sscanf(p_name,"%x",&p_cid);
273 }
274
275 real_filename = filename;
276 if ((temp_str = strrchr(real_filename, '\\')) != NULL)
277 real_filename = temp_str + 1;
278 if ((temp_str = strrchr(real_filename, '/')) != NULL)
279 real_filename = temp_str + 1;
280 if ((temp_str = strrchr(real_filename, ':')) != NULL)
281 real_filename = temp_str + 1;
282
363a37d5
BS
283 snprintf(s_desc, sizeof(s_desc), desc_template, p_cid, p_cid, backing_file,
284 (uint32_t)header.capacity, real_filename);
5f4da8c0
TS
285
286 /* write the descriptor */
53c2e716
JQ
287 if (lseek(snp_fd, 0x200, SEEK_SET) == -1) {
288 ret = -errno;
5f4da8c0 289 goto fail;
53c2e716
JQ
290 }
291 if (write(snp_fd, s_desc, strlen(s_desc)) == -1) {
292 ret = -errno;
5f4da8c0 293 goto fail;
53c2e716 294 }
ea2384d3 295
5f4da8c0
TS
296 gd_offset = header.gd_offset * SECTOR_SIZE; // offset of GD table
297 rgd_offset = header.rgd_offset * SECTOR_SIZE; // offset of RGD table
298 capacity = header.capacity * SECTOR_SIZE; // Extent size
299 /*
300 * Each GDE span 32M disk, means:
301 * 512 GTE per GT, each GTE points to grain
302 */
303 gt_size = (int64_t)header.num_gtes_per_gte * header.granularity * SECTOR_SIZE;
53c2e716
JQ
304 if (!gt_size) {
305 ret = -EINVAL;
5f4da8c0 306 goto fail;
53c2e716 307 }
5fafdf24 308 gde_entries = (uint32_t)(capacity / gt_size); // number of gde/rgde
5f4da8c0
TS
309 gd_size = gde_entries * sizeof(uint32_t);
310
311 /* write RGD */
312 rgd_buf = qemu_malloc(gd_size);
53c2e716
JQ
313 if (lseek(p_fd, rgd_offset, SEEK_SET) == -1) {
314 ret = -errno;
5f4da8c0 315 goto fail_rgd;
53c2e716
JQ
316 }
317 if (read(p_fd, rgd_buf, gd_size) != gd_size) {
318 ret = -errno;
5f4da8c0 319 goto fail_rgd;
53c2e716
JQ
320 }
321 if (lseek(snp_fd, rgd_offset, SEEK_SET) == -1) {
322 ret = -errno;
5f4da8c0 323 goto fail_rgd;
53c2e716
JQ
324 }
325 if (write(snp_fd, rgd_buf, gd_size) == -1) {
326 ret = -errno;
5f4da8c0 327 goto fail_rgd;
53c2e716 328 }
5f4da8c0
TS
329
330 /* write GD */
331 gd_buf = qemu_malloc(gd_size);
53c2e716
JQ
332 if (lseek(p_fd, gd_offset, SEEK_SET) == -1) {
333 ret = -errno;
5f4da8c0 334 goto fail_gd;
53c2e716
JQ
335 }
336 if (read(p_fd, gd_buf, gd_size) != gd_size) {
337 ret = -errno;
5f4da8c0 338 goto fail_gd;
53c2e716
JQ
339 }
340 if (lseek(snp_fd, gd_offset, SEEK_SET) == -1) {
341 ret = -errno;
5f4da8c0 342 goto fail_gd;
53c2e716
JQ
343 }
344 if (write(snp_fd, gd_buf, gd_size) == -1) {
345 ret = -errno;
5f4da8c0 346 goto fail_gd;
53c2e716 347 }
3829cb46 348 ret = 0;
5f4da8c0 349
3829cb46 350fail_gd:
5f4da8c0 351 qemu_free(gd_buf);
3829cb46 352fail_rgd:
5f4da8c0 353 qemu_free(rgd_buf);
3829cb46 354fail:
5f4da8c0
TS
355 close(p_fd);
356 close(snp_fd);
53c2e716 357 return ret;
5f4da8c0
TS
358}
359
9949f97e 360static int vmdk_parent_open(BlockDriverState *bs)
5f4da8c0 361{
5fafdf24 362 char *p_name;
5f4da8c0 363 char desc[DESC_SIZE];
5f4da8c0
TS
364
365 /* the descriptor offset = 0x200 */
6511ef77 366 if (bdrv_pread(bs->file, 0x200, desc, DESC_SIZE) != DESC_SIZE)
5f4da8c0
TS
367 return -1;
368
511d2b14 369 if ((p_name = strstr(desc,"parentFileNameHint")) != NULL) {
5f4da8c0 370 char *end_name;
5f4da8c0
TS
371
372 p_name += sizeof("parentFileNameHint") + 1;
511d2b14 373 if ((end_name = strchr(p_name,'\"')) == NULL)
5f4da8c0 374 return -1;
b171271a 375 if ((end_name - p_name) > sizeof (bs->backing_file) - 1)
b34d259a 376 return -1;
3b46e624 377
b171271a 378 pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
ff1afc72 379 }
5f4da8c0
TS
380
381 return 0;
382}
383
b3976d3c
FZ
384/* Create and append extent to the extent array. Return the added VmdkExtent
385 * address. return NULL if allocation failed. */
386static VmdkExtent *vmdk_add_extent(BlockDriverState *bs,
387 BlockDriverState *file, bool flat, int64_t sectors,
388 int64_t l1_offset, int64_t l1_backup_offset,
389 uint32_t l1_size,
390 int l2_size, unsigned int cluster_sectors)
391{
392 VmdkExtent *extent;
393 BDRVVmdkState *s = bs->opaque;
394
395 s->extents = qemu_realloc(s->extents,
396 (s->num_extents + 1) * sizeof(VmdkExtent));
397 extent = &s->extents[s->num_extents];
398 s->num_extents++;
399
400 memset(extent, 0, sizeof(VmdkExtent));
401 extent->file = file;
402 extent->flat = flat;
403 extent->sectors = sectors;
404 extent->l1_table_offset = l1_offset;
405 extent->l1_backup_table_offset = l1_backup_offset;
406 extent->l1_size = l1_size;
407 extent->l1_entry_sectors = l2_size * cluster_sectors;
408 extent->l2_size = l2_size;
409 extent->cluster_sectors = cluster_sectors;
410
411 if (s->num_extents > 1) {
412 extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
413 } else {
414 extent->end_sector = extent->sectors;
415 }
416 bs->total_sectors = extent->end_sector;
417 return extent;
418}
419
420
6511ef77 421static int vmdk_open(BlockDriverState *bs, int flags)
5f4da8c0
TS
422{
423 BDRVVmdkState *s = bs->opaque;
424 uint32_t magic;
b3976d3c
FZ
425 int i;
426 uint32_t l1_size, l1_entry_sectors;
427 VmdkExtent *extent = NULL;
5f4da8c0 428
6511ef77 429 if (bdrv_pread(bs->file, 0, &magic, sizeof(magic)) != sizeof(magic))
ea2384d3 430 goto fail;
5f4da8c0 431
7143c62c 432 magic = be32_to_cpu(magic);
ea2384d3
FB
433 if (magic == VMDK3_MAGIC) {
434 VMDK3Header header;
b3976d3c
FZ
435 if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header))
436 != sizeof(header)) {
ea2384d3 437 goto fail;
b3976d3c
FZ
438 }
439 extent = vmdk_add_extent(bs, bs->file, false,
440 le32_to_cpu(header.disk_sectors),
441 le32_to_cpu(header.l1dir_offset) << 9, 0,
442 1 << 6, 1 << 9, le32_to_cpu(header.granularity));
ea2384d3
FB
443 } else if (magic == VMDK4_MAGIC) {
444 VMDK4Header header;
b3976d3c
FZ
445 if (bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header))
446 != sizeof(header)) {
ea2384d3 447 goto fail;
b3976d3c
FZ
448 }
449 l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gte)
450 * le64_to_cpu(header.granularity);
451 l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
452 / l1_entry_sectors;
453 extent = vmdk_add_extent(bs, bs->file, false,
454 le64_to_cpu(header.capacity),
455 le64_to_cpu(header.gd_offset) << 9,
456 le64_to_cpu(header.rgd_offset) << 9,
457 l1_size,
458 le32_to_cpu(header.num_gtes_per_gte),
459 le64_to_cpu(header.granularity));
460 if (extent->l1_entry_sectors <= 0) {
ea2384d3 461 goto fail;
b3976d3c 462 }
5f4da8c0 463 // try to open parent images, if exist
9949f97e 464 if (vmdk_parent_open(bs) != 0)
5f4da8c0
TS
465 goto fail;
466 // write the CID once after the image creation
467 s->parent_cid = vmdk_read_cid(bs,1);
ea2384d3
FB
468 } else {
469 goto fail;
470 }
5f4da8c0 471
ea2384d3 472 /* read the L1 table */
b3976d3c
FZ
473 l1_size = extent->l1_size * sizeof(uint32_t);
474 extent->l1_table = qemu_malloc(l1_size);
475 if (bdrv_pread(bs->file,
476 extent->l1_table_offset,
477 extent->l1_table,
478 l1_size)
479 != l1_size) {
ea2384d3 480 goto fail;
b3976d3c
FZ
481 }
482 for (i = 0; i < extent->l1_size; i++) {
483 le32_to_cpus(&extent->l1_table[i]);
ea2384d3
FB
484 }
485
b3976d3c
FZ
486 if (extent->l1_backup_table_offset) {
487 extent->l1_backup_table = qemu_malloc(l1_size);
488 if (bdrv_pread(bs->file,
489 extent->l1_backup_table_offset,
490 extent->l1_backup_table,
491 l1_size)
492 != l1_size) {
ff1afc72 493 goto fail;
b3976d3c
FZ
494 }
495 for (i = 0; i < extent->l1_size; i++) {
496 le32_to_cpus(&extent->l1_backup_table[i]);
ff1afc72
FB
497 }
498 }
499
b3976d3c
FZ
500 extent->l2_cache =
501 qemu_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
ea2384d3
FB
502 return 0;
503 fail:
b3976d3c 504 vmdk_free_extents(bs);
ea2384d3
FB
505 return -1;
506}
507
b3976d3c
FZ
508static int get_whole_cluster(BlockDriverState *bs,
509 VmdkExtent *extent,
510 uint64_t cluster_offset,
511 uint64_t offset,
512 bool allocate)
5f4da8c0 513{
b3976d3c
FZ
514 /* 128 sectors * 512 bytes each = grain size 64KB */
515 uint8_t whole_grain[extent->cluster_sectors * 512];
5f4da8c0
TS
516
517 // we will be here if it's first write on non-exist grain(cluster).
518 // try to read from parent image, if exist
b171271a 519 if (bs->backing_hd) {
c336500d 520 int ret;
5f4da8c0
TS
521
522 if (!vmdk_is_cid_valid(bs))
523 return -1;
5f4da8c0 524
c336500d 525 ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
b3976d3c 526 extent->cluster_sectors);
c336500d
KW
527 if (ret < 0) {
528 return -1;
529 }
630530a6 530
c336500d 531 //Write grain only into the active image
b3976d3c
FZ
532 ret = bdrv_write(extent->file, cluster_offset, whole_grain,
533 extent->cluster_sectors);
c336500d
KW
534 if (ret < 0) {
535 return -1;
630530a6
TS
536 }
537 }
538 return 0;
539}
540
b3976d3c 541static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
630530a6 542{
630530a6 543 /* update L2 table */
b3976d3c
FZ
544 if (bdrv_pwrite_sync(
545 extent->file,
546 ((int64_t)m_data->l2_offset * 512)
547 + (m_data->l2_index * sizeof(m_data->offset)),
548 &(m_data->offset),
549 sizeof(m_data->offset)
550 ) < 0) {
630530a6 551 return -1;
b3976d3c 552 }
630530a6 553 /* update backup L2 table */
b3976d3c
FZ
554 if (extent->l1_backup_table_offset != 0) {
555 m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
556 if (bdrv_pwrite_sync(
557 extent->file,
558 ((int64_t)m_data->l2_offset * 512)
559 + (m_data->l2_index * sizeof(m_data->offset)),
560 &(m_data->offset), sizeof(m_data->offset)
561 ) < 0) {
5f4da8c0 562 return -1;
b3976d3c 563 }
5f4da8c0 564 }
630530a6 565
5f4da8c0
TS
566 return 0;
567}
568
b3976d3c
FZ
569static uint64_t get_cluster_offset(BlockDriverState *bs,
570 VmdkExtent *extent,
571 VmdkMetaData *m_data,
572 uint64_t offset, int allocate)
ea2384d3 573{
ea2384d3
FB
574 unsigned int l1_index, l2_offset, l2_index;
575 int min_index, i, j;
630530a6 576 uint32_t min_count, *l2_table, tmp = 0;
ea2384d3 577 uint64_t cluster_offset;
630530a6
TS
578
579 if (m_data)
580 m_data->valid = 0;
581
b3976d3c
FZ
582 l1_index = (offset >> 9) / extent->l1_entry_sectors;
583 if (l1_index >= extent->l1_size) {
ea2384d3 584 return 0;
b3976d3c
FZ
585 }
586 l2_offset = extent->l1_table[l1_index];
587 if (!l2_offset) {
ea2384d3 588 return 0;
b3976d3c 589 }
ea2384d3 590 for(i = 0; i < L2_CACHE_SIZE; i++) {
b3976d3c 591 if (l2_offset == extent->l2_cache_offsets[i]) {
ea2384d3 592 /* increment the hit count */
b3976d3c 593 if (++extent->l2_cache_counts[i] == 0xffffffff) {
ea2384d3 594 for(j = 0; j < L2_CACHE_SIZE; j++) {
b3976d3c 595 extent->l2_cache_counts[j] >>= 1;
ea2384d3
FB
596 }
597 }
b3976d3c 598 l2_table = extent->l2_cache + (i * extent->l2_size);
ea2384d3
FB
599 goto found;
600 }
601 }
602 /* not found: load a new entry in the least used one */
603 min_index = 0;
604 min_count = 0xffffffff;
605 for(i = 0; i < L2_CACHE_SIZE; i++) {
b3976d3c
FZ
606 if (extent->l2_cache_counts[i] < min_count) {
607 min_count = extent->l2_cache_counts[i];
ea2384d3
FB
608 min_index = i;
609 }
610 }
b3976d3c
FZ
611 l2_table = extent->l2_cache + (min_index * extent->l2_size);
612 if (bdrv_pread(
613 extent->file,
614 (int64_t)l2_offset * 512,
615 l2_table,
616 extent->l2_size * sizeof(uint32_t)
617 ) != extent->l2_size * sizeof(uint32_t)) {
ea2384d3 618 return 0;
b3976d3c 619 }
5f4da8c0 620
b3976d3c
FZ
621 extent->l2_cache_offsets[min_index] = l2_offset;
622 extent->l2_cache_counts[min_index] = 1;
ea2384d3 623 found:
b3976d3c 624 l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
ea2384d3 625 cluster_offset = le32_to_cpu(l2_table[l2_index]);
630530a6 626
ff1afc72
FB
627 if (!cluster_offset) {
628 if (!allocate)
629 return 0;
9949f97e 630
630530a6 631 // Avoid the L2 tables update for the images that have snapshots.
b3976d3c
FZ
632 cluster_offset = bdrv_getlength(extent->file);
633 bdrv_truncate(
634 extent->file,
635 cluster_offset + (extent->cluster_sectors << 9)
636 );
9949f97e
KW
637
638 cluster_offset >>= 9;
639 tmp = cpu_to_le32(cluster_offset);
640 l2_table[l2_index] = tmp;
630530a6 641
630530a6
TS
642 /* First of all we write grain itself, to avoid race condition
643 * that may to corrupt the image.
644 * This problem may occur because of insufficient space on host disk
645 * or inappropriate VM shutdown.
646 */
b3976d3c
FZ
647 if (get_whole_cluster(
648 bs, extent, cluster_offset, offset, allocate) == -1)
5f4da8c0 649 return 0;
630530a6
TS
650
651 if (m_data) {
652 m_data->offset = tmp;
653 m_data->l1_index = l1_index;
654 m_data->l2_index = l2_index;
655 m_data->l2_offset = l2_offset;
656 m_data->valid = 1;
657 }
ff1afc72 658 }
ea2384d3
FB
659 cluster_offset <<= 9;
660 return cluster_offset;
661}
662
b3976d3c
FZ
663static VmdkExtent *find_extent(BDRVVmdkState *s,
664 int64_t sector_num, VmdkExtent *start_hint)
665{
666 VmdkExtent *extent = start_hint;
667
668 if (!extent) {
669 extent = &s->extents[0];
670 }
671 while (extent < &s->extents[s->num_extents]) {
672 if (sector_num < extent->end_sector) {
673 return extent;
674 }
675 extent++;
676 }
677 return NULL;
678}
679
5fafdf24 680static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
ea2384d3
FB
681 int nb_sectors, int *pnum)
682{
683 BDRVVmdkState *s = bs->opaque;
ea2384d3 684
b3976d3c
FZ
685 int64_t index_in_cluster, n, ret;
686 uint64_t offset;
687 VmdkExtent *extent;
688
689 extent = find_extent(s, sector_num, NULL);
690 if (!extent) {
691 return 0;
692 }
693 if (extent->flat) {
694 n = extent->end_sector - sector_num;
695 ret = 1;
696 } else {
697 offset = get_cluster_offset(bs, extent, NULL, sector_num * 512, 0);
698 index_in_cluster = sector_num % extent->cluster_sectors;
699 n = extent->cluster_sectors - index_in_cluster;
700 ret = offset ? 1 : 0;
701 }
ea2384d3
FB
702 if (n > nb_sectors)
703 n = nb_sectors;
704 *pnum = n;
b3976d3c 705 return ret;
ea2384d3
FB
706}
707
5fafdf24 708static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
ea2384d3
FB
709 uint8_t *buf, int nb_sectors)
710{
711 BDRVVmdkState *s = bs->opaque;
b3976d3c
FZ
712 int ret;
713 uint64_t n, index_in_cluster;
714 VmdkExtent *extent = NULL;
ea2384d3 715 uint64_t cluster_offset;
5f4da8c0 716
ea2384d3 717 while (nb_sectors > 0) {
b3976d3c
FZ
718 extent = find_extent(s, sector_num, extent);
719 if (!extent) {
720 return -EIO;
721 }
722 cluster_offset = get_cluster_offset(
723 bs, extent, NULL, sector_num << 9, 0);
724 index_in_cluster = sector_num % extent->cluster_sectors;
725 n = extent->cluster_sectors - index_in_cluster;
ea2384d3
FB
726 if (n > nb_sectors)
727 n = nb_sectors;
728 if (!cluster_offset) {
5f4da8c0 729 // try to read from parent image, if exist
b171271a 730 if (bs->backing_hd) {
5f4da8c0
TS
731 if (!vmdk_is_cid_valid(bs))
732 return -1;
b171271a 733 ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
5f4da8c0
TS
734 if (ret < 0)
735 return -1;
736 } else {
737 memset(buf, 0, 512 * n);
738 }
ea2384d3 739 } else {
6511ef77 740 if(bdrv_pread(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512)
ea2384d3
FB
741 return -1;
742 }
743 nb_sectors -= n;
744 sector_num += n;
745 buf += n * 512;
746 }
747 return 0;
748}
749
5fafdf24 750static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
ea2384d3
FB
751 const uint8_t *buf, int nb_sectors)
752{
ff1afc72 753 BDRVVmdkState *s = bs->opaque;
b3976d3c
FZ
754 VmdkExtent *extent = NULL;
755 int n;
756 int64_t index_in_cluster;
ff1afc72 757 uint64_t cluster_offset;
5f4da8c0 758 static int cid_update = 0;
b3976d3c 759 VmdkMetaData m_data;
ff1afc72 760
630530a6
TS
761 if (sector_num > bs->total_sectors) {
762 fprintf(stderr,
92868412
JM
763 "(VMDK) Wrong offset: sector_num=0x%" PRIx64
764 " total_sectors=0x%" PRIx64 "\n",
630530a6
TS
765 sector_num, bs->total_sectors);
766 return -1;
767 }
768
ff1afc72 769 while (nb_sectors > 0) {
b3976d3c
FZ
770 extent = find_extent(s, sector_num, extent);
771 if (!extent) {
772 return -EIO;
773 }
774 cluster_offset = get_cluster_offset(
775 bs,
776 extent,
777 &m_data,
778 sector_num << 9, 1);
779 if (!cluster_offset) {
ff1afc72 780 return -1;
b3976d3c
FZ
781 }
782 index_in_cluster = sector_num % extent->cluster_sectors;
783 n = extent->cluster_sectors - index_in_cluster;
784 if (n > nb_sectors) {
785 n = nb_sectors;
786 }
630530a6 787
b3976d3c
FZ
788 if (bdrv_pwrite(bs->file,
789 cluster_offset + index_in_cluster * 512,
790 buf, n * 512)
791 != n * 512) {
ff1afc72 792 return -1;
b3976d3c 793 }
630530a6
TS
794 if (m_data.valid) {
795 /* update L2 tables */
b3976d3c 796 if (vmdk_L2update(extent, &m_data) == -1) {
630530a6 797 return -1;
b3976d3c 798 }
630530a6 799 }
ff1afc72
FB
800 nb_sectors -= n;
801 sector_num += n;
802 buf += n * 512;
5f4da8c0
TS
803
804 // update CID on the first write every time the virtual disk is opened
805 if (!cid_update) {
806 vmdk_write_cid(bs, time(NULL));
807 cid_update++;
808 }
ff1afc72
FB
809 }
810 return 0;
ea2384d3
FB
811}
812
0e7e1989 813static int vmdk_create(const char *filename, QEMUOptionParameter *options)
8979b227
FB
814{
815 int fd, i;
816 VMDK4Header header;
817 uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
7ccfb2eb 818 static const char desc_template[] =
8979b227
FB
819 "# Disk DescriptorFile\n"
820 "version=1\n"
821 "CID=%x\n"
822 "parentCID=ffffffff\n"
823 "createType=\"monolithicSparse\"\n"
824 "\n"
825 "# Extent description\n"
7fd6d9fc 826 "RW %" PRId64 " SPARSE \"%s\"\n"
8979b227
FB
827 "\n"
828 "# The Disk Data Base \n"
829 "#DDB\n"
830 "\n"
ec36ba14 831 "ddb.virtualHWVersion = \"%d\"\n"
7fd6d9fc 832 "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
8979b227
FB
833 "ddb.geometry.heads = \"16\"\n"
834 "ddb.geometry.sectors = \"63\"\n"
835 "ddb.adapterType = \"ide\"\n";
836 char desc[1024];
837 const char *real_filename, *temp_str;
0e7e1989
KW
838 int64_t total_size = 0;
839 const char *backing_file = NULL;
840 int flags = 0;
1640366c 841 int ret;
0e7e1989
KW
842
843 // Read out options
844 while (options && options->name) {
845 if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
846 total_size = options->value.n / 512;
847 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
848 backing_file = options->value.s;
849 } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
850 flags |= options->value.n ? BLOCK_FLAG_COMPAT6: 0;
851 }
852 options++;
853 }
8979b227
FB
854
855 /* XXX: add support for backing file */
5f4da8c0
TS
856 if (backing_file) {
857 return vmdk_snapshot_create(filename, backing_file);
858 }
8979b227
FB
859
860 fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
861 0644);
862 if (fd < 0)
b781cce5 863 return -errno;
8979b227
FB
864 magic = cpu_to_be32(VMDK4_MAGIC);
865 memset(&header, 0, sizeof(header));
16372ff0
AG
866 header.version = 1;
867 header.flags = 3; /* ?? */
868 header.capacity = total_size;
869 header.granularity = 128;
870 header.num_gtes_per_gte = 512;
8979b227
FB
871
872 grains = (total_size + header.granularity - 1) / header.granularity;
873 gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
874 gt_count = (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
875 gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
876
877 header.desc_offset = 1;
878 header.desc_size = 20;
879 header.rgd_offset = header.desc_offset + header.desc_size;
880 header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count);
881 header.grain_offset =
882 ((header.gd_offset + gd_size + (gt_size * gt_count) +
883 header.granularity - 1) / header.granularity) *
884 header.granularity;
885
16372ff0
AG
886 /* swap endianness for all header fields */
887 header.version = cpu_to_le32(header.version);
888 header.flags = cpu_to_le32(header.flags);
889 header.capacity = cpu_to_le64(header.capacity);
890 header.granularity = cpu_to_le64(header.granularity);
891 header.num_gtes_per_gte = cpu_to_le32(header.num_gtes_per_gte);
8979b227
FB
892 header.desc_offset = cpu_to_le64(header.desc_offset);
893 header.desc_size = cpu_to_le64(header.desc_size);
894 header.rgd_offset = cpu_to_le64(header.rgd_offset);
895 header.gd_offset = cpu_to_le64(header.gd_offset);
896 header.grain_offset = cpu_to_le64(header.grain_offset);
897
898 header.check_bytes[0] = 0xa;
899 header.check_bytes[1] = 0x20;
900 header.check_bytes[2] = 0xd;
901 header.check_bytes[3] = 0xa;
3b46e624
TS
902
903 /* write all the data */
1640366c
KS
904 ret = qemu_write_full(fd, &magic, sizeof(magic));
905 if (ret != sizeof(magic)) {
b781cce5 906 ret = -errno;
1640366c
KS
907 goto exit;
908 }
909 ret = qemu_write_full(fd, &header, sizeof(header));
910 if (ret != sizeof(header)) {
b781cce5 911 ret = -errno;
1640366c
KS
912 goto exit;
913 }
8979b227 914
16372ff0 915 ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9);
1640366c 916 if (ret < 0) {
b781cce5 917 ret = -errno;
1640366c
KS
918 goto exit;
919 }
8979b227
FB
920
921 /* write grain directory */
922 lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET);
16372ff0 923 for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size;
1640366c
KS
924 i < gt_count; i++, tmp += gt_size) {
925 ret = qemu_write_full(fd, &tmp, sizeof(tmp));
926 if (ret != sizeof(tmp)) {
b781cce5 927 ret = -errno;
1640366c
KS
928 goto exit;
929 }
930 }
3b46e624 931
8979b227
FB
932 /* write backup grain directory */
933 lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET);
16372ff0 934 for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size;
1640366c
KS
935 i < gt_count; i++, tmp += gt_size) {
936 ret = qemu_write_full(fd, &tmp, sizeof(tmp));
937 if (ret != sizeof(tmp)) {
b781cce5 938 ret = -errno;
1640366c
KS
939 goto exit;
940 }
941 }
8979b227
FB
942
943 /* compose the descriptor */
944 real_filename = filename;
945 if ((temp_str = strrchr(real_filename, '\\')) != NULL)
946 real_filename = temp_str + 1;
947 if ((temp_str = strrchr(real_filename, '/')) != NULL)
948 real_filename = temp_str + 1;
949 if ((temp_str = strrchr(real_filename, ':')) != NULL)
950 real_filename = temp_str + 1;
7ccfb2eb 951 snprintf(desc, sizeof(desc), desc_template, (unsigned int)time(NULL),
7fd6d9fc
BS
952 total_size, real_filename,
953 (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
954 total_size / (int64_t)(63 * 16));
8979b227
FB
955
956 /* write the descriptor */
957 lseek(fd, le64_to_cpu(header.desc_offset) << 9, SEEK_SET);
1640366c
KS
958 ret = qemu_write_full(fd, desc, strlen(desc));
959 if (ret != strlen(desc)) {
b781cce5 960 ret = -errno;
1640366c
KS
961 goto exit;
962 }
8979b227 963
1640366c
KS
964 ret = 0;
965exit:
8979b227 966 close(fd);
1640366c 967 return ret;
8979b227
FB
968}
969
e2731add 970static void vmdk_close(BlockDriverState *bs)
ea2384d3 971{
b3976d3c 972 vmdk_free_extents(bs);
ea2384d3
FB
973}
974
205ef796 975static int vmdk_flush(BlockDriverState *bs)
7a6cba61 976{
205ef796 977 return bdrv_flush(bs->file);
7a6cba61
PB
978}
979
0e7e1989
KW
980
981static QEMUOptionParameter vmdk_create_options[] = {
db08adf5
KW
982 {
983 .name = BLOCK_OPT_SIZE,
984 .type = OPT_SIZE,
985 .help = "Virtual disk size"
986 },
987 {
988 .name = BLOCK_OPT_BACKING_FILE,
989 .type = OPT_STRING,
990 .help = "File name of a base image"
991 },
992 {
993 .name = BLOCK_OPT_COMPAT6,
994 .type = OPT_FLAG,
995 .help = "VMDK version 6 image"
996 },
0e7e1989
KW
997 { NULL }
998};
999
5efa9d5a 1000static BlockDriver bdrv_vmdk = {
e60f469c
AJ
1001 .format_name = "vmdk",
1002 .instance_size = sizeof(BDRVVmdkState),
1003 .bdrv_probe = vmdk_probe,
6511ef77 1004 .bdrv_open = vmdk_open,
e60f469c
AJ
1005 .bdrv_read = vmdk_read,
1006 .bdrv_write = vmdk_write,
1007 .bdrv_close = vmdk_close,
1008 .bdrv_create = vmdk_create,
1009 .bdrv_flush = vmdk_flush,
1010 .bdrv_is_allocated = vmdk_is_allocated,
0e7e1989
KW
1011
1012 .create_options = vmdk_create_options,
ea2384d3 1013};
5efa9d5a
AL
1014
1015static void bdrv_vmdk_init(void)
1016{
1017 bdrv_register(&bdrv_vmdk);
1018}
1019
1020block_init(bdrv_vmdk_init);