]> git.proxmox.com Git - qemu.git/blame - block/vmdk.c
VMDK: add field BDRVVmdkState.desc_offset
[qemu.git] / block / vmdk.c
CommitLineData
ea2384d3
FB
1/*
2 * Block driver for the VMDK format
5fafdf24 3 *
ea2384d3 4 * Copyright (c) 2004 Fabrice Bellard
ff1afc72 5 * Copyright (c) 2005 Filip Navara
5fafdf24 6 *
ea2384d3
FB
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 * THE SOFTWARE.
24 */
5f4da8c0 25
faf07963 26#include "qemu-common.h"
ea2384d3 27#include "block_int.h"
5efa9d5a 28#include "module.h"
ea2384d3 29
ea2384d3
FB
/*
 * Image magic numbers, expressed as the 32-bit value obtained when the first
 * four bytes of the file are read big-endian: ASCII "COWD" for VMDK3 and
 * "KDMV" for VMDK4.
 */
#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
32
/*
 * On-disk VMDK3 header as read by vmdk_open_vmdk3(), which loads it from the
 * bytes immediately following the 4-byte magic and decodes the fields it uses
 * with le32_to_cpu().
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    uint32_t disk_sectors;      /* virtual disk size, in sectors */
    uint32_t granularity;       /* sectors per cluster */
    uint32_t l1dir_offset;      /* L1 table position, in sectors */
    uint32_t l1dir_size;
    uint32_t file_sectors;
    uint32_t cylinders;
    uint32_t heads;
    uint32_t sectors_per_track;
} VMDK3Header;
45
/*
 * On-disk VMDK4 sparse-extent header, located right after the 4-byte magic.
 * The struct is packed so that it can be read and written as a raw byte
 * image; the users in this file treat the multi-byte fields as little-endian
 * and the *_offset / capacity / desc_size fields as sector counts.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
    int64_t capacity;           /* virtual disk size, in sectors */
    int64_t granularity;        /* sectors per grain (cluster) */
    int64_t desc_offset;        /* embedded descriptor position, in sectors */
    int64_t desc_size;          /* embedded descriptor size, in sectors */
    int32_t num_gtes_per_gte;   /* entries per grain table */
    int64_t rgd_offset;         /* redundant grain directory, in sectors */
    int64_t gd_offset;          /* grain directory, in sectors */
    int64_t grain_offset;       /* first sector usable for grain data */
    char filler[1];
    char check_bytes[4];        /* vmdk_create() stores 0x0a 0x20 0x0d 0x0a */
} __attribute__((packed)) VMDK4Header;
ea2384d3
FB
60
/* Number of L2 tables each extent keeps cached in memory. */
#define L2_CACHE_SIZE 16
62
b3976d3c
FZ
63typedef struct VmdkExtent {
64 BlockDriverState *file;
65 bool flat;
66 int64_t sectors;
67 int64_t end_sector;
ea2384d3 68 int64_t l1_table_offset;
ff1afc72 69 int64_t l1_backup_table_offset;
ea2384d3 70 uint32_t *l1_table;
ff1afc72 71 uint32_t *l1_backup_table;
ea2384d3
FB
72 unsigned int l1_size;
73 uint32_t l1_entry_sectors;
74
75 unsigned int l2_size;
76 uint32_t *l2_cache;
77 uint32_t l2_cache_offsets[L2_CACHE_SIZE];
78 uint32_t l2_cache_counts[L2_CACHE_SIZE];
79
80 unsigned int cluster_sectors;
b3976d3c
FZ
81} VmdkExtent;
82
83typedef struct BDRVVmdkState {
e1da9b24 84 int desc_offset;
5f4da8c0 85 uint32_t parent_cid;
b3976d3c
FZ
86 int num_extents;
87 /* Extent array with num_extents entries, ascend ordered by address */
88 VmdkExtent *extents;
ea2384d3
FB
89} BDRVVmdkState;
90
630530a6
TS
/*
 * Describes a pending L2 table update produced by get_cluster_offset() when
 * it allocates a new cluster; vmdk_L2update() writes it to disk.
 */
typedef struct VmdkMetaData {
    uint32_t offset;        /* new L2 entry value, already little-endian */
    unsigned int l1_index;  /* index of the L1 entry that was used */
    unsigned int l2_index;  /* index of the entry inside the L2 table */
    unsigned int l2_offset; /* sector number of the L2 table */
    int valid;              /* non-zero when the fields above are filled in */
} VmdkMetaData;
98
ea2384d3
FB
99static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
100{
101 uint32_t magic;
102
103 if (buf_size < 4)
104 return 0;
105 magic = be32_to_cpu(*(uint32_t *)buf);
106 if (magic == VMDK3_MAGIC ||
01fc99d6 107 magic == VMDK4_MAGIC) {
ea2384d3 108 return 100;
01fc99d6
FZ
109 } else {
110 const char *p = (const char *)buf;
111 const char *end = p + buf_size;
112 while (p < end) {
113 if (*p == '#') {
114 /* skip comment line */
115 while (p < end && *p != '\n') {
116 p++;
117 }
118 p++;
119 continue;
120 }
121 if (*p == ' ') {
122 while (p < end && *p == ' ') {
123 p++;
124 }
125 /* skip '\r' if windows line endings used. */
126 if (p < end && *p == '\r') {
127 p++;
128 }
129 /* only accept blank lines before 'version=' line */
130 if (p == end || *p != '\n') {
131 return 0;
132 }
133 p++;
134 continue;
135 }
136 if (end - p >= strlen("version=X\n")) {
137 if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 ||
138 strncmp("version=2\n", p, strlen("version=2\n")) == 0) {
139 return 100;
140 }
141 }
142 if (end - p >= strlen("version=X\r\n")) {
143 if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 ||
144 strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) {
145 return 100;
146 }
147 }
148 return 0;
149 }
ea2384d3 150 return 0;
01fc99d6 151 }
ea2384d3
FB
152}
153
5f4da8c0
TS
/* When defined, vmdk_is_cid_valid() compares the stored parent CID against
 * the CID currently recorded in the backing image. */
#define CHECK_CID 1

#define SECTOR_SIZE 512
/* Parenthesized so the macro expands safely inside larger expressions. */
#define DESC_SIZE (20 * SECTOR_SIZE)    /* 20 sectors of 512 bytes each */
#define HEADER_SIZE 512                 /* first sector of 512 bytes */
5f4da8c0 159
b3976d3c
FZ
160static void vmdk_free_extents(BlockDriverState *bs)
161{
162 int i;
163 BDRVVmdkState *s = bs->opaque;
164
165 for (i = 0; i < s->num_extents; i++) {
166 qemu_free(s->extents[i].l1_table);
167 qemu_free(s->extents[i].l2_cache);
168 qemu_free(s->extents[i].l1_backup_table);
169 }
170 qemu_free(s->extents);
171}
172
5f4da8c0 173static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
ea2384d3 174{
5f4da8c0
TS
175 char desc[DESC_SIZE];
176 uint32_t cid;
7ccfb2eb 177 const char *p_name, *cid_str;
5f4da8c0 178 size_t cid_str_size;
e1da9b24 179 BDRVVmdkState *s = bs->opaque;
5f4da8c0 180
e1da9b24 181 if (bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE) != DESC_SIZE) {
5f4da8c0 182 return 0;
e1da9b24 183 }
5f4da8c0
TS
184
185 if (parent) {
186 cid_str = "parentCID";
187 cid_str_size = sizeof("parentCID");
188 } else {
189 cid_str = "CID";
190 cid_str_size = sizeof("CID");
191 }
192
511d2b14 193 if ((p_name = strstr(desc,cid_str)) != NULL) {
5f4da8c0
TS
194 p_name += cid_str_size;
195 sscanf(p_name,"%x",&cid);
196 }
197
198 return cid;
199}
200
201static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
202{
5f4da8c0
TS
203 char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
204 char *p_name, *tmp_str;
e1da9b24 205 BDRVVmdkState *s = bs->opaque;
5f4da8c0 206
e1da9b24
FZ
207 memset(desc, 0, sizeof(desc));
208 if (bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE) != DESC_SIZE) {
209 return -EIO;
210 }
5f4da8c0
TS
211
212 tmp_str = strstr(desc,"parentCID");
363a37d5 213 pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
511d2b14 214 if ((p_name = strstr(desc,"CID")) != NULL) {
5f4da8c0 215 p_name += sizeof("CID");
363a37d5
BS
216 snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid);
217 pstrcat(desc, sizeof(desc), tmp_desc);
5f4da8c0
TS
218 }
219
e1da9b24
FZ
220 if (bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE) < 0) {
221 return -EIO;
222 }
5f4da8c0
TS
223 return 0;
224}
225
226static int vmdk_is_cid_valid(BlockDriverState *bs)
227{
228#ifdef CHECK_CID
229 BDRVVmdkState *s = bs->opaque;
b171271a 230 BlockDriverState *p_bs = bs->backing_hd;
5f4da8c0
TS
231 uint32_t cur_pcid;
232
233 if (p_bs) {
234 cur_pcid = vmdk_read_cid(p_bs,0);
235 if (s->parent_cid != cur_pcid)
236 // CID not valid
237 return 0;
238 }
239#endif
240 // CID valid
241 return 1;
242}
243
244static int vmdk_snapshot_create(const char *filename, const char *backing_file)
245{
246 int snp_fd, p_fd;
53c2e716 247 int ret;
5f4da8c0 248 uint32_t p_cid;
5fafdf24 249 char *p_name, *gd_buf, *rgd_buf;
5f4da8c0
TS
250 const char *real_filename, *temp_str;
251 VMDK4Header header;
252 uint32_t gde_entries, gd_size;
253 int64_t gd_offset, rgd_offset, capacity, gt_size;
254 char p_desc[DESC_SIZE], s_desc[DESC_SIZE], hdr[HEADER_SIZE];
7ccfb2eb 255 static const char desc_template[] =
5f4da8c0
TS
256 "# Disk DescriptorFile\n"
257 "version=1\n"
258 "CID=%x\n"
259 "parentCID=%x\n"
260 "createType=\"monolithicSparse\"\n"
261 "parentFileNameHint=\"%s\"\n"
262 "\n"
263 "# Extent description\n"
7ccfb2eb 264 "RW %u SPARSE \"%s\"\n"
5f4da8c0
TS
265 "\n"
266 "# The Disk Data Base \n"
267 "#DDB\n"
268 "\n";
269
270 snp_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 0644);
271 if (snp_fd < 0)
53c2e716 272 return -errno;
5f4da8c0
TS
273 p_fd = open(backing_file, O_RDONLY | O_BINARY | O_LARGEFILE);
274 if (p_fd < 0) {
275 close(snp_fd);
53c2e716 276 return -errno;
5f4da8c0
TS
277 }
278
279 /* read the header */
53c2e716
JQ
280 if (lseek(p_fd, 0x0, SEEK_SET) == -1) {
281 ret = -errno;
5f4da8c0 282 goto fail;
53c2e716
JQ
283 }
284 if (read(p_fd, hdr, HEADER_SIZE) != HEADER_SIZE) {
285 ret = -errno;
5f4da8c0 286 goto fail;
53c2e716 287 }
5f4da8c0
TS
288
289 /* write the header */
53c2e716
JQ
290 if (lseek(snp_fd, 0x0, SEEK_SET) == -1) {
291 ret = -errno;
5f4da8c0 292 goto fail;
53c2e716
JQ
293 }
294 if (write(snp_fd, hdr, HEADER_SIZE) == -1) {
295 ret = -errno;
5f4da8c0 296 goto fail;
53c2e716 297 }
5f4da8c0
TS
298
299 memset(&header, 0, sizeof(header));
300 memcpy(&header,&hdr[4], sizeof(header)); // skip the VMDK4_MAGIC
301
53c2e716
JQ
302 if (ftruncate(snp_fd, header.grain_offset << 9)) {
303 ret = -errno;
1640366c 304 goto fail;
53c2e716 305 }
5f4da8c0 306 /* the descriptor offset = 0x200 */
53c2e716
JQ
307 if (lseek(p_fd, 0x200, SEEK_SET) == -1) {
308 ret = -errno;
5f4da8c0 309 goto fail;
53c2e716
JQ
310 }
311 if (read(p_fd, p_desc, DESC_SIZE) != DESC_SIZE) {
312 ret = -errno;
5f4da8c0 313 goto fail;
53c2e716 314 }
5f4da8c0 315
511d2b14 316 if ((p_name = strstr(p_desc,"CID")) != NULL) {
5f4da8c0
TS
317 p_name += sizeof("CID");
318 sscanf(p_name,"%x",&p_cid);
319 }
320
321 real_filename = filename;
322 if ((temp_str = strrchr(real_filename, '\\')) != NULL)
323 real_filename = temp_str + 1;
324 if ((temp_str = strrchr(real_filename, '/')) != NULL)
325 real_filename = temp_str + 1;
326 if ((temp_str = strrchr(real_filename, ':')) != NULL)
327 real_filename = temp_str + 1;
328
363a37d5
BS
329 snprintf(s_desc, sizeof(s_desc), desc_template, p_cid, p_cid, backing_file,
330 (uint32_t)header.capacity, real_filename);
5f4da8c0
TS
331
332 /* write the descriptor */
53c2e716
JQ
333 if (lseek(snp_fd, 0x200, SEEK_SET) == -1) {
334 ret = -errno;
5f4da8c0 335 goto fail;
53c2e716
JQ
336 }
337 if (write(snp_fd, s_desc, strlen(s_desc)) == -1) {
338 ret = -errno;
5f4da8c0 339 goto fail;
53c2e716 340 }
ea2384d3 341
5f4da8c0
TS
342 gd_offset = header.gd_offset * SECTOR_SIZE; // offset of GD table
343 rgd_offset = header.rgd_offset * SECTOR_SIZE; // offset of RGD table
344 capacity = header.capacity * SECTOR_SIZE; // Extent size
345 /*
346 * Each GDE span 32M disk, means:
347 * 512 GTE per GT, each GTE points to grain
348 */
349 gt_size = (int64_t)header.num_gtes_per_gte * header.granularity * SECTOR_SIZE;
53c2e716
JQ
350 if (!gt_size) {
351 ret = -EINVAL;
5f4da8c0 352 goto fail;
53c2e716 353 }
5fafdf24 354 gde_entries = (uint32_t)(capacity / gt_size); // number of gde/rgde
5f4da8c0
TS
355 gd_size = gde_entries * sizeof(uint32_t);
356
357 /* write RGD */
358 rgd_buf = qemu_malloc(gd_size);
53c2e716
JQ
359 if (lseek(p_fd, rgd_offset, SEEK_SET) == -1) {
360 ret = -errno;
5f4da8c0 361 goto fail_rgd;
53c2e716
JQ
362 }
363 if (read(p_fd, rgd_buf, gd_size) != gd_size) {
364 ret = -errno;
5f4da8c0 365 goto fail_rgd;
53c2e716
JQ
366 }
367 if (lseek(snp_fd, rgd_offset, SEEK_SET) == -1) {
368 ret = -errno;
5f4da8c0 369 goto fail_rgd;
53c2e716
JQ
370 }
371 if (write(snp_fd, rgd_buf, gd_size) == -1) {
372 ret = -errno;
5f4da8c0 373 goto fail_rgd;
53c2e716 374 }
5f4da8c0
TS
375
376 /* write GD */
377 gd_buf = qemu_malloc(gd_size);
53c2e716
JQ
378 if (lseek(p_fd, gd_offset, SEEK_SET) == -1) {
379 ret = -errno;
5f4da8c0 380 goto fail_gd;
53c2e716
JQ
381 }
382 if (read(p_fd, gd_buf, gd_size) != gd_size) {
383 ret = -errno;
5f4da8c0 384 goto fail_gd;
53c2e716
JQ
385 }
386 if (lseek(snp_fd, gd_offset, SEEK_SET) == -1) {
387 ret = -errno;
5f4da8c0 388 goto fail_gd;
53c2e716
JQ
389 }
390 if (write(snp_fd, gd_buf, gd_size) == -1) {
391 ret = -errno;
5f4da8c0 392 goto fail_gd;
53c2e716 393 }
3829cb46 394 ret = 0;
5f4da8c0 395
3829cb46 396fail_gd:
5f4da8c0 397 qemu_free(gd_buf);
3829cb46 398fail_rgd:
5f4da8c0 399 qemu_free(rgd_buf);
3829cb46 400fail:
5f4da8c0
TS
401 close(p_fd);
402 close(snp_fd);
53c2e716 403 return ret;
5f4da8c0
TS
404}
405
9949f97e 406static int vmdk_parent_open(BlockDriverState *bs)
5f4da8c0 407{
5fafdf24 408 char *p_name;
5f4da8c0 409 char desc[DESC_SIZE];
e1da9b24 410 BDRVVmdkState *s = bs->opaque;
5f4da8c0 411
e1da9b24 412 if (bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE) != DESC_SIZE) {
5f4da8c0 413 return -1;
e1da9b24 414 }
5f4da8c0 415
511d2b14 416 if ((p_name = strstr(desc,"parentFileNameHint")) != NULL) {
5f4da8c0 417 char *end_name;
5f4da8c0
TS
418
419 p_name += sizeof("parentFileNameHint") + 1;
511d2b14 420 if ((end_name = strchr(p_name,'\"')) == NULL)
5f4da8c0 421 return -1;
b171271a 422 if ((end_name - p_name) > sizeof (bs->backing_file) - 1)
b34d259a 423 return -1;
3b46e624 424
b171271a 425 pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
ff1afc72 426 }
5f4da8c0
TS
427
428 return 0;
429}
430
b3976d3c
FZ
431/* Create and append extent to the extent array. Return the added VmdkExtent
432 * address. return NULL if allocation failed. */
433static VmdkExtent *vmdk_add_extent(BlockDriverState *bs,
434 BlockDriverState *file, bool flat, int64_t sectors,
435 int64_t l1_offset, int64_t l1_backup_offset,
436 uint32_t l1_size,
437 int l2_size, unsigned int cluster_sectors)
438{
439 VmdkExtent *extent;
440 BDRVVmdkState *s = bs->opaque;
441
442 s->extents = qemu_realloc(s->extents,
443 (s->num_extents + 1) * sizeof(VmdkExtent));
444 extent = &s->extents[s->num_extents];
445 s->num_extents++;
446
447 memset(extent, 0, sizeof(VmdkExtent));
448 extent->file = file;
449 extent->flat = flat;
450 extent->sectors = sectors;
451 extent->l1_table_offset = l1_offset;
452 extent->l1_backup_table_offset = l1_backup_offset;
453 extent->l1_size = l1_size;
454 extent->l1_entry_sectors = l2_size * cluster_sectors;
455 extent->l2_size = l2_size;
456 extent->cluster_sectors = cluster_sectors;
457
458 if (s->num_extents > 1) {
459 extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
460 } else {
461 extent->end_sector = extent->sectors;
462 }
463 bs->total_sectors = extent->end_sector;
464 return extent;
465}
466
b4b3ab14 467static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent)
5f4da8c0 468{
b4b3ab14
FZ
469 int ret;
470 int l1_size, i;
5f4da8c0 471
ea2384d3 472 /* read the L1 table */
b3976d3c
FZ
473 l1_size = extent->l1_size * sizeof(uint32_t);
474 extent->l1_table = qemu_malloc(l1_size);
b4b3ab14
FZ
475 ret = bdrv_pread(extent->file,
476 extent->l1_table_offset,
477 extent->l1_table,
478 l1_size);
479 if (ret < 0) {
480 goto fail_l1;
b3976d3c
FZ
481 }
482 for (i = 0; i < extent->l1_size; i++) {
483 le32_to_cpus(&extent->l1_table[i]);
ea2384d3
FB
484 }
485
b3976d3c
FZ
486 if (extent->l1_backup_table_offset) {
487 extent->l1_backup_table = qemu_malloc(l1_size);
b4b3ab14
FZ
488 ret = bdrv_pread(extent->file,
489 extent->l1_backup_table_offset,
490 extent->l1_backup_table,
491 l1_size);
492 if (ret < 0) {
493 goto fail_l1b;
b3976d3c
FZ
494 }
495 for (i = 0; i < extent->l1_size; i++) {
496 le32_to_cpus(&extent->l1_backup_table[i]);
ff1afc72
FB
497 }
498 }
499
b3976d3c
FZ
500 extent->l2_cache =
501 qemu_malloc(extent->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
ea2384d3 502 return 0;
b4b3ab14
FZ
503 fail_l1b:
504 qemu_free(extent->l1_backup_table);
505 fail_l1:
506 qemu_free(extent->l1_table);
507 return ret;
508}
509
510static int vmdk_open_vmdk3(BlockDriverState *bs, int flags)
511{
512 int ret;
513 uint32_t magic;
514 VMDK3Header header;
e1da9b24 515 BDRVVmdkState *s = bs->opaque;
b4b3ab14
FZ
516 VmdkExtent *extent;
517
e1da9b24 518 s->desc_offset = 0x200;
b4b3ab14
FZ
519 ret = bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header));
520 if (ret < 0) {
521 goto fail;
522 }
523 extent = vmdk_add_extent(bs,
524 bs->file, false,
525 le32_to_cpu(header.disk_sectors),
526 le32_to_cpu(header.l1dir_offset) << 9,
527 0, 1 << 6, 1 << 9,
528 le32_to_cpu(header.granularity));
529 ret = vmdk_init_tables(bs, extent);
530 if (ret) {
531 /* vmdk_init_tables cleans up on fail, so only free allocation of
532 * vmdk_add_extent here. */
533 goto fail;
534 }
535 return 0;
ea2384d3 536 fail:
b3976d3c 537 vmdk_free_extents(bs);
b4b3ab14
FZ
538 return ret;
539}
540
541static int vmdk_open_vmdk4(BlockDriverState *bs, int flags)
542{
543 int ret;
544 uint32_t magic;
545 uint32_t l1_size, l1_entry_sectors;
546 VMDK4Header header;
547 BDRVVmdkState *s = bs->opaque;
548 VmdkExtent *extent;
549
e1da9b24 550 s->desc_offset = 0x200;
b4b3ab14
FZ
551 ret = bdrv_pread(bs->file, sizeof(magic), &header, sizeof(header));
552 if (ret < 0) {
553 goto fail;
554 }
555 l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gte)
556 * le64_to_cpu(header.granularity);
557 l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1)
558 / l1_entry_sectors;
559 extent = vmdk_add_extent(bs, bs->file, false,
560 le64_to_cpu(header.capacity),
561 le64_to_cpu(header.gd_offset) << 9,
562 le64_to_cpu(header.rgd_offset) << 9,
563 l1_size,
564 le32_to_cpu(header.num_gtes_per_gte),
565 le64_to_cpu(header.granularity));
566 if (extent->l1_entry_sectors <= 0) {
567 ret = -EINVAL;
568 goto fail;
569 }
570 /* try to open parent images, if exist */
571 ret = vmdk_parent_open(bs);
572 if (ret) {
573 goto fail;
574 }
575 s->parent_cid = vmdk_read_cid(bs, 1);
576 ret = vmdk_init_tables(bs, extent);
577 if (ret) {
578 goto fail;
579 }
580 return 0;
581 fail:
582 vmdk_free_extents(bs);
583 return ret;
584}
585
586static int vmdk_open(BlockDriverState *bs, int flags)
587{
588 uint32_t magic;
589
590 if (bdrv_pread(bs->file, 0, &magic, sizeof(magic)) != sizeof(magic)) {
591 return -EIO;
592 }
593
594 magic = be32_to_cpu(magic);
595 if (magic == VMDK3_MAGIC) {
596 return vmdk_open_vmdk3(bs, flags);
597 } else if (magic == VMDK4_MAGIC) {
598 return vmdk_open_vmdk4(bs, flags);
599 } else {
600 return -EINVAL;
601 }
ea2384d3
FB
602}
603
b3976d3c
FZ
604static int get_whole_cluster(BlockDriverState *bs,
605 VmdkExtent *extent,
606 uint64_t cluster_offset,
607 uint64_t offset,
608 bool allocate)
5f4da8c0 609{
b3976d3c
FZ
610 /* 128 sectors * 512 bytes each = grain size 64KB */
611 uint8_t whole_grain[extent->cluster_sectors * 512];
5f4da8c0 612
0e69c543
FZ
613 /* we will be here if it's first write on non-exist grain(cluster).
614 * try to read from parent image, if exist */
b171271a 615 if (bs->backing_hd) {
c336500d 616 int ret;
5f4da8c0
TS
617
618 if (!vmdk_is_cid_valid(bs))
619 return -1;
5f4da8c0 620
0e69c543
FZ
621 /* floor offset to cluster */
622 offset -= offset % (extent->cluster_sectors * 512);
c336500d 623 ret = bdrv_read(bs->backing_hd, offset >> 9, whole_grain,
b3976d3c 624 extent->cluster_sectors);
c336500d
KW
625 if (ret < 0) {
626 return -1;
627 }
630530a6 628
0e69c543 629 /* Write grain only into the active image */
b3976d3c
FZ
630 ret = bdrv_write(extent->file, cluster_offset, whole_grain,
631 extent->cluster_sectors);
c336500d
KW
632 if (ret < 0) {
633 return -1;
630530a6
TS
634 }
635 }
636 return 0;
637}
638
b3976d3c 639static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data)
630530a6 640{
630530a6 641 /* update L2 table */
b3976d3c
FZ
642 if (bdrv_pwrite_sync(
643 extent->file,
644 ((int64_t)m_data->l2_offset * 512)
645 + (m_data->l2_index * sizeof(m_data->offset)),
646 &(m_data->offset),
647 sizeof(m_data->offset)
648 ) < 0) {
630530a6 649 return -1;
b3976d3c 650 }
630530a6 651 /* update backup L2 table */
b3976d3c
FZ
652 if (extent->l1_backup_table_offset != 0) {
653 m_data->l2_offset = extent->l1_backup_table[m_data->l1_index];
654 if (bdrv_pwrite_sync(
655 extent->file,
656 ((int64_t)m_data->l2_offset * 512)
657 + (m_data->l2_index * sizeof(m_data->offset)),
658 &(m_data->offset), sizeof(m_data->offset)
659 ) < 0) {
5f4da8c0 660 return -1;
b3976d3c 661 }
5f4da8c0 662 }
630530a6 663
5f4da8c0
TS
664 return 0;
665}
666
b3976d3c
FZ
667static uint64_t get_cluster_offset(BlockDriverState *bs,
668 VmdkExtent *extent,
669 VmdkMetaData *m_data,
670 uint64_t offset, int allocate)
ea2384d3 671{
ea2384d3
FB
672 unsigned int l1_index, l2_offset, l2_index;
673 int min_index, i, j;
630530a6 674 uint32_t min_count, *l2_table, tmp = 0;
ea2384d3 675 uint64_t cluster_offset;
630530a6
TS
676
677 if (m_data)
678 m_data->valid = 0;
679
b3976d3c
FZ
680 l1_index = (offset >> 9) / extent->l1_entry_sectors;
681 if (l1_index >= extent->l1_size) {
ea2384d3 682 return 0;
b3976d3c
FZ
683 }
684 l2_offset = extent->l1_table[l1_index];
685 if (!l2_offset) {
ea2384d3 686 return 0;
b3976d3c 687 }
b4b3ab14 688 for (i = 0; i < L2_CACHE_SIZE; i++) {
b3976d3c 689 if (l2_offset == extent->l2_cache_offsets[i]) {
ea2384d3 690 /* increment the hit count */
b3976d3c 691 if (++extent->l2_cache_counts[i] == 0xffffffff) {
b4b3ab14 692 for (j = 0; j < L2_CACHE_SIZE; j++) {
b3976d3c 693 extent->l2_cache_counts[j] >>= 1;
ea2384d3
FB
694 }
695 }
b3976d3c 696 l2_table = extent->l2_cache + (i * extent->l2_size);
ea2384d3
FB
697 goto found;
698 }
699 }
700 /* not found: load a new entry in the least used one */
701 min_index = 0;
702 min_count = 0xffffffff;
b4b3ab14 703 for (i = 0; i < L2_CACHE_SIZE; i++) {
b3976d3c
FZ
704 if (extent->l2_cache_counts[i] < min_count) {
705 min_count = extent->l2_cache_counts[i];
ea2384d3
FB
706 min_index = i;
707 }
708 }
b3976d3c
FZ
709 l2_table = extent->l2_cache + (min_index * extent->l2_size);
710 if (bdrv_pread(
711 extent->file,
712 (int64_t)l2_offset * 512,
713 l2_table,
714 extent->l2_size * sizeof(uint32_t)
715 ) != extent->l2_size * sizeof(uint32_t)) {
ea2384d3 716 return 0;
b3976d3c 717 }
5f4da8c0 718
b3976d3c
FZ
719 extent->l2_cache_offsets[min_index] = l2_offset;
720 extent->l2_cache_counts[min_index] = 1;
ea2384d3 721 found:
b3976d3c 722 l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
ea2384d3 723 cluster_offset = le32_to_cpu(l2_table[l2_index]);
630530a6 724
ff1afc72
FB
725 if (!cluster_offset) {
726 if (!allocate)
727 return 0;
9949f97e 728
630530a6 729 // Avoid the L2 tables update for the images that have snapshots.
b3976d3c
FZ
730 cluster_offset = bdrv_getlength(extent->file);
731 bdrv_truncate(
732 extent->file,
733 cluster_offset + (extent->cluster_sectors << 9)
734 );
9949f97e
KW
735
736 cluster_offset >>= 9;
737 tmp = cpu_to_le32(cluster_offset);
738 l2_table[l2_index] = tmp;
630530a6 739
630530a6
TS
740 /* First of all we write grain itself, to avoid race condition
741 * that may to corrupt the image.
742 * This problem may occur because of insufficient space on host disk
743 * or inappropriate VM shutdown.
744 */
b3976d3c
FZ
745 if (get_whole_cluster(
746 bs, extent, cluster_offset, offset, allocate) == -1)
5f4da8c0 747 return 0;
630530a6
TS
748
749 if (m_data) {
750 m_data->offset = tmp;
751 m_data->l1_index = l1_index;
752 m_data->l2_index = l2_index;
753 m_data->l2_offset = l2_offset;
754 m_data->valid = 1;
755 }
ff1afc72 756 }
ea2384d3
FB
757 cluster_offset <<= 9;
758 return cluster_offset;
759}
760
b3976d3c
FZ
761static VmdkExtent *find_extent(BDRVVmdkState *s,
762 int64_t sector_num, VmdkExtent *start_hint)
763{
764 VmdkExtent *extent = start_hint;
765
766 if (!extent) {
767 extent = &s->extents[0];
768 }
769 while (extent < &s->extents[s->num_extents]) {
770 if (sector_num < extent->end_sector) {
771 return extent;
772 }
773 extent++;
774 }
775 return NULL;
776}
777
5fafdf24 778static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
ea2384d3
FB
779 int nb_sectors, int *pnum)
780{
781 BDRVVmdkState *s = bs->opaque;
ea2384d3 782
b3976d3c
FZ
783 int64_t index_in_cluster, n, ret;
784 uint64_t offset;
785 VmdkExtent *extent;
786
787 extent = find_extent(s, sector_num, NULL);
788 if (!extent) {
789 return 0;
790 }
791 if (extent->flat) {
792 n = extent->end_sector - sector_num;
793 ret = 1;
794 } else {
795 offset = get_cluster_offset(bs, extent, NULL, sector_num * 512, 0);
796 index_in_cluster = sector_num % extent->cluster_sectors;
797 n = extent->cluster_sectors - index_in_cluster;
798 ret = offset ? 1 : 0;
799 }
ea2384d3
FB
800 if (n > nb_sectors)
801 n = nb_sectors;
802 *pnum = n;
b3976d3c 803 return ret;
ea2384d3
FB
804}
805
5fafdf24 806static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
ea2384d3
FB
807 uint8_t *buf, int nb_sectors)
808{
809 BDRVVmdkState *s = bs->opaque;
b3976d3c
FZ
810 int ret;
811 uint64_t n, index_in_cluster;
812 VmdkExtent *extent = NULL;
ea2384d3 813 uint64_t cluster_offset;
5f4da8c0 814
ea2384d3 815 while (nb_sectors > 0) {
b3976d3c
FZ
816 extent = find_extent(s, sector_num, extent);
817 if (!extent) {
818 return -EIO;
819 }
820 cluster_offset = get_cluster_offset(
821 bs, extent, NULL, sector_num << 9, 0);
822 index_in_cluster = sector_num % extent->cluster_sectors;
823 n = extent->cluster_sectors - index_in_cluster;
ea2384d3
FB
824 if (n > nb_sectors)
825 n = nb_sectors;
826 if (!cluster_offset) {
5f4da8c0 827 // try to read from parent image, if exist
b171271a 828 if (bs->backing_hd) {
5f4da8c0
TS
829 if (!vmdk_is_cid_valid(bs))
830 return -1;
b171271a 831 ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
5f4da8c0
TS
832 if (ret < 0)
833 return -1;
834 } else {
835 memset(buf, 0, 512 * n);
836 }
ea2384d3 837 } else {
6511ef77 838 if(bdrv_pread(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512)
ea2384d3
FB
839 return -1;
840 }
841 nb_sectors -= n;
842 sector_num += n;
843 buf += n * 512;
844 }
845 return 0;
846}
847
5fafdf24 848static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
ea2384d3
FB
849 const uint8_t *buf, int nb_sectors)
850{
ff1afc72 851 BDRVVmdkState *s = bs->opaque;
b3976d3c
FZ
852 VmdkExtent *extent = NULL;
853 int n;
854 int64_t index_in_cluster;
ff1afc72 855 uint64_t cluster_offset;
5f4da8c0 856 static int cid_update = 0;
b3976d3c 857 VmdkMetaData m_data;
ff1afc72 858
630530a6
TS
859 if (sector_num > bs->total_sectors) {
860 fprintf(stderr,
92868412
JM
861 "(VMDK) Wrong offset: sector_num=0x%" PRIx64
862 " total_sectors=0x%" PRIx64 "\n",
630530a6
TS
863 sector_num, bs->total_sectors);
864 return -1;
865 }
866
ff1afc72 867 while (nb_sectors > 0) {
b3976d3c
FZ
868 extent = find_extent(s, sector_num, extent);
869 if (!extent) {
870 return -EIO;
871 }
872 cluster_offset = get_cluster_offset(
873 bs,
874 extent,
875 &m_data,
876 sector_num << 9, 1);
877 if (!cluster_offset) {
ff1afc72 878 return -1;
b3976d3c
FZ
879 }
880 index_in_cluster = sector_num % extent->cluster_sectors;
881 n = extent->cluster_sectors - index_in_cluster;
882 if (n > nb_sectors) {
883 n = nb_sectors;
884 }
630530a6 885
b3976d3c
FZ
886 if (bdrv_pwrite(bs->file,
887 cluster_offset + index_in_cluster * 512,
888 buf, n * 512)
889 != n * 512) {
ff1afc72 890 return -1;
b3976d3c 891 }
630530a6
TS
892 if (m_data.valid) {
893 /* update L2 tables */
b3976d3c 894 if (vmdk_L2update(extent, &m_data) == -1) {
630530a6 895 return -1;
b3976d3c 896 }
630530a6 897 }
ff1afc72
FB
898 nb_sectors -= n;
899 sector_num += n;
900 buf += n * 512;
5f4da8c0
TS
901
902 // update CID on the first write every time the virtual disk is opened
903 if (!cid_update) {
904 vmdk_write_cid(bs, time(NULL));
905 cid_update++;
906 }
ff1afc72
FB
907 }
908 return 0;
ea2384d3
FB
909}
910
0e7e1989 911static int vmdk_create(const char *filename, QEMUOptionParameter *options)
8979b227
FB
912{
913 int fd, i;
914 VMDK4Header header;
915 uint32_t tmp, magic, grains, gd_size, gt_size, gt_count;
7ccfb2eb 916 static const char desc_template[] =
8979b227
FB
917 "# Disk DescriptorFile\n"
918 "version=1\n"
919 "CID=%x\n"
920 "parentCID=ffffffff\n"
921 "createType=\"monolithicSparse\"\n"
922 "\n"
923 "# Extent description\n"
7fd6d9fc 924 "RW %" PRId64 " SPARSE \"%s\"\n"
8979b227
FB
925 "\n"
926 "# The Disk Data Base \n"
927 "#DDB\n"
928 "\n"
ec36ba14 929 "ddb.virtualHWVersion = \"%d\"\n"
7fd6d9fc 930 "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
8979b227
FB
931 "ddb.geometry.heads = \"16\"\n"
932 "ddb.geometry.sectors = \"63\"\n"
933 "ddb.adapterType = \"ide\"\n";
934 char desc[1024];
935 const char *real_filename, *temp_str;
0e7e1989
KW
936 int64_t total_size = 0;
937 const char *backing_file = NULL;
938 int flags = 0;
1640366c 939 int ret;
0e7e1989
KW
940
941 // Read out options
942 while (options && options->name) {
943 if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
944 total_size = options->value.n / 512;
945 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
946 backing_file = options->value.s;
947 } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
948 flags |= options->value.n ? BLOCK_FLAG_COMPAT6: 0;
949 }
950 options++;
951 }
8979b227
FB
952
953 /* XXX: add support for backing file */
5f4da8c0
TS
954 if (backing_file) {
955 return vmdk_snapshot_create(filename, backing_file);
956 }
8979b227
FB
957
958 fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
959 0644);
960 if (fd < 0)
b781cce5 961 return -errno;
8979b227
FB
962 magic = cpu_to_be32(VMDK4_MAGIC);
963 memset(&header, 0, sizeof(header));
16372ff0
AG
964 header.version = 1;
965 header.flags = 3; /* ?? */
966 header.capacity = total_size;
967 header.granularity = 128;
968 header.num_gtes_per_gte = 512;
8979b227
FB
969
970 grains = (total_size + header.granularity - 1) / header.granularity;
971 gt_size = ((header.num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
972 gt_count = (grains + header.num_gtes_per_gte - 1) / header.num_gtes_per_gte;
973 gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
974
975 header.desc_offset = 1;
976 header.desc_size = 20;
977 header.rgd_offset = header.desc_offset + header.desc_size;
978 header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count);
979 header.grain_offset =
980 ((header.gd_offset + gd_size + (gt_size * gt_count) +
981 header.granularity - 1) / header.granularity) *
982 header.granularity;
983
16372ff0
AG
984 /* swap endianness for all header fields */
985 header.version = cpu_to_le32(header.version);
986 header.flags = cpu_to_le32(header.flags);
987 header.capacity = cpu_to_le64(header.capacity);
988 header.granularity = cpu_to_le64(header.granularity);
989 header.num_gtes_per_gte = cpu_to_le32(header.num_gtes_per_gte);
8979b227
FB
990 header.desc_offset = cpu_to_le64(header.desc_offset);
991 header.desc_size = cpu_to_le64(header.desc_size);
992 header.rgd_offset = cpu_to_le64(header.rgd_offset);
993 header.gd_offset = cpu_to_le64(header.gd_offset);
994 header.grain_offset = cpu_to_le64(header.grain_offset);
995
996 header.check_bytes[0] = 0xa;
997 header.check_bytes[1] = 0x20;
998 header.check_bytes[2] = 0xd;
999 header.check_bytes[3] = 0xa;
3b46e624
TS
1000
1001 /* write all the data */
1640366c
KS
1002 ret = qemu_write_full(fd, &magic, sizeof(magic));
1003 if (ret != sizeof(magic)) {
b781cce5 1004 ret = -errno;
1640366c
KS
1005 goto exit;
1006 }
1007 ret = qemu_write_full(fd, &header, sizeof(header));
1008 if (ret != sizeof(header)) {
b781cce5 1009 ret = -errno;
1640366c
KS
1010 goto exit;
1011 }
8979b227 1012
16372ff0 1013 ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9);
1640366c 1014 if (ret < 0) {
b781cce5 1015 ret = -errno;
1640366c
KS
1016 goto exit;
1017 }
8979b227
FB
1018
1019 /* write grain directory */
1020 lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET);
16372ff0 1021 for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size;
1640366c
KS
1022 i < gt_count; i++, tmp += gt_size) {
1023 ret = qemu_write_full(fd, &tmp, sizeof(tmp));
1024 if (ret != sizeof(tmp)) {
b781cce5 1025 ret = -errno;
1640366c
KS
1026 goto exit;
1027 }
1028 }
3b46e624 1029
8979b227
FB
1030 /* write backup grain directory */
1031 lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET);
16372ff0 1032 for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size;
1640366c
KS
1033 i < gt_count; i++, tmp += gt_size) {
1034 ret = qemu_write_full(fd, &tmp, sizeof(tmp));
1035 if (ret != sizeof(tmp)) {
b781cce5 1036 ret = -errno;
1640366c
KS
1037 goto exit;
1038 }
1039 }
8979b227
FB
1040
1041 /* compose the descriptor */
1042 real_filename = filename;
1043 if ((temp_str = strrchr(real_filename, '\\')) != NULL)
1044 real_filename = temp_str + 1;
1045 if ((temp_str = strrchr(real_filename, '/')) != NULL)
1046 real_filename = temp_str + 1;
1047 if ((temp_str = strrchr(real_filename, ':')) != NULL)
1048 real_filename = temp_str + 1;
7ccfb2eb 1049 snprintf(desc, sizeof(desc), desc_template, (unsigned int)time(NULL),
7fd6d9fc
BS
1050 total_size, real_filename,
1051 (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
1052 total_size / (int64_t)(63 * 16));
8979b227
FB
1053
1054 /* write the descriptor */
1055 lseek(fd, le64_to_cpu(header.desc_offset) << 9, SEEK_SET);
1640366c
KS
1056 ret = qemu_write_full(fd, desc, strlen(desc));
1057 if (ret != strlen(desc)) {
b781cce5 1058 ret = -errno;
1640366c
KS
1059 goto exit;
1060 }
8979b227 1061
1640366c
KS
1062 ret = 0;
1063exit:
8979b227 1064 close(fd);
1640366c 1065 return ret;
8979b227
FB
1066}
1067
e2731add 1068static void vmdk_close(BlockDriverState *bs)
ea2384d3 1069{
b3976d3c 1070 vmdk_free_extents(bs);
ea2384d3
FB
1071}
1072
205ef796 1073static int vmdk_flush(BlockDriverState *bs)
7a6cba61 1074{
205ef796 1075 return bdrv_flush(bs->file);
7a6cba61
PB
1076}
1077
0e7e1989
KW
1078
1079static QEMUOptionParameter vmdk_create_options[] = {
db08adf5
KW
1080 {
1081 .name = BLOCK_OPT_SIZE,
1082 .type = OPT_SIZE,
1083 .help = "Virtual disk size"
1084 },
1085 {
1086 .name = BLOCK_OPT_BACKING_FILE,
1087 .type = OPT_STRING,
1088 .help = "File name of a base image"
1089 },
1090 {
1091 .name = BLOCK_OPT_COMPAT6,
1092 .type = OPT_FLAG,
1093 .help = "VMDK version 6 image"
1094 },
0e7e1989
KW
1095 { NULL }
1096};
1097
5efa9d5a 1098static BlockDriver bdrv_vmdk = {
e60f469c
AJ
1099 .format_name = "vmdk",
1100 .instance_size = sizeof(BDRVVmdkState),
1101 .bdrv_probe = vmdk_probe,
6511ef77 1102 .bdrv_open = vmdk_open,
e60f469c
AJ
1103 .bdrv_read = vmdk_read,
1104 .bdrv_write = vmdk_write,
1105 .bdrv_close = vmdk_close,
1106 .bdrv_create = vmdk_create,
1107 .bdrv_flush = vmdk_flush,
1108 .bdrv_is_allocated = vmdk_is_allocated,
0e7e1989
KW
1109
1110 .create_options = vmdk_create_options,
ea2384d3 1111};
5efa9d5a
AL
1112
1113static void bdrv_vmdk_init(void)
1114{
1115 bdrv_register(&bdrv_vmdk);
1116}
1117
1118block_init(bdrv_vmdk_init);