]> git.proxmox.com Git - mirror_qemu.git/blobdiff - block/vmdk.c
i386: Fix signedness of hyperv_spinlock_attempts
[mirror_qemu.git] / block / vmdk.c
index ef15ddbfd3d55290022377ac352df355e1696143..bd36ece125a6e9992419445c4275c980aaa20c5d 100644 (file)
@@ -27,6 +27,7 @@
 #include "qapi/error.h"
 #include "block/block_int.h"
 #include "sysemu/block-backend.h"
+#include "qapi/qmp/qdict.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/error-report.h"
 #include "qemu/module.h"
@@ -47,6 +48,8 @@
 #define VMDK4_FLAG_MARKER (1 << 17)
 #define VMDK4_GD_AT_END 0xffffffffffffffffULL
 
+#define VMDK_EXTENT_MAX_SECTORS (1ULL << 32)
+
 #define VMDK_GTE_ZEROED 0x1
 
 /* VMDK internal error codes */
@@ -88,6 +91,44 @@ typedef struct {
     uint16_t compressAlgorithm;
 } QEMU_PACKED VMDK4Header;
 
+typedef struct VMDKSESparseConstHeader {
+    uint64_t magic;
+    uint64_t version;
+    uint64_t capacity;
+    uint64_t grain_size;
+    uint64_t grain_table_size;
+    uint64_t flags;
+    uint64_t reserved1;
+    uint64_t reserved2;
+    uint64_t reserved3;
+    uint64_t reserved4;
+    uint64_t volatile_header_offset;
+    uint64_t volatile_header_size;
+    uint64_t journal_header_offset;
+    uint64_t journal_header_size;
+    uint64_t journal_offset;
+    uint64_t journal_size;
+    uint64_t grain_dir_offset;
+    uint64_t grain_dir_size;
+    uint64_t grain_tables_offset;
+    uint64_t grain_tables_size;
+    uint64_t free_bitmap_offset;
+    uint64_t free_bitmap_size;
+    uint64_t backmap_offset;
+    uint64_t backmap_size;
+    uint64_t grains_offset;
+    uint64_t grains_size;
+    uint8_t pad[304];
+} QEMU_PACKED VMDKSESparseConstHeader;
+
+typedef struct VMDKSESparseVolatileHeader {
+    uint64_t magic;
+    uint64_t free_gt_number;
+    uint64_t next_txn_seq_number;
+    uint64_t replay_journal;
+    uint8_t pad[480];
+} QEMU_PACKED VMDKSESparseVolatileHeader;
+
 #define L2_CACHE_SIZE 16
 
 typedef struct VmdkExtent {
@@ -96,19 +137,23 @@ typedef struct VmdkExtent {
     bool compressed;
     bool has_marker;
     bool has_zero_grain;
+    bool sesparse;
+    uint64_t sesparse_l2_tables_offset;
+    uint64_t sesparse_clusters_offset;
+    int32_t entry_size;
     int version;
     int64_t sectors;
     int64_t end_sector;
     int64_t flat_start_offset;
     int64_t l1_table_offset;
     int64_t l1_backup_table_offset;
-    uint32_t *l1_table;
+    void *l1_table;
     uint32_t *l1_backup_table;
     unsigned int l1_size;
     uint32_t l1_entry_sectors;
 
     unsigned int l2_size;
-    uint32_t *l2_cache;
+    void *l2_cache;
     uint32_t l2_cache_offsets[L2_CACHE_SIZE];
     uint32_t l2_cache_counts[L2_CACHE_SIZE];
 
@@ -192,13 +237,15 @@ static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
             }
             if (end - p >= strlen("version=X\n")) {
                 if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 ||
-                    strncmp("version=2\n", p, strlen("version=2\n")) == 0) {
+                    strncmp("version=2\n", p, strlen("version=2\n")) == 0 ||
+                    strncmp("version=3\n", p, strlen("version=3\n")) == 0) {
                     return 100;
                 }
             }
             if (end - p >= strlen("version=X\r\n")) {
                 if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 ||
-                    strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) {
+                    strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0 ||
+                    strncmp("version=3\r\n", p, strlen("version=3\r\n")) == 0) {
                     return 100;
                 }
             }
@@ -331,6 +378,12 @@ static int vmdk_is_cid_valid(BlockDriverState *bs)
     if (!s->cid_checked && bs->backing) {
         BlockDriverState *p_bs = bs->backing->bs;
 
+        if (strcmp(p_bs->drv->format_name, "vmdk")) {
+            /* Backing file is not in vmdk format, so it does not have
+             * a CID, which makes the overlay's parent CID invalid */
+            return 0;
+        }
+
         if (vmdk_read_cid(p_bs, 0, &cur_pcid) != 0) {
             /* read failure: report as not valid */
             return 0;
@@ -378,12 +431,16 @@ static int vmdk_parent_open(BlockDriverState *bs)
             ret = -EINVAL;
             goto out;
         }
-        if ((end_name - p_name) > sizeof(bs->backing_file) - 1) {
+        if ((end_name - p_name) > sizeof(bs->auto_backing_file) - 1) {
             ret = -EINVAL;
             goto out;
         }
 
-        pstrcpy(bs->backing_file, end_name - p_name + 1, p_name);
+        pstrcpy(bs->auto_backing_file, end_name - p_name + 1, p_name);
+        pstrcpy(bs->backing_file, sizeof(bs->backing_file),
+                bs->auto_backing_file);
+        pstrcpy(bs->backing_format, sizeof(bs->backing_format),
+                "vmdk");
     }
 
 out:
@@ -410,11 +467,22 @@ static int vmdk_add_extent(BlockDriverState *bs,
         error_setg(errp, "Invalid granularity, image may be corrupt");
         return -EFBIG;
     }
-    if (l1_size > 512 * 1024 * 1024) {
-        /* Although with big capacity and small l1_entry_sectors, we can get a
+    if (l1_size > 32 * 1024 * 1024) {
+        /*
+         * Although with big capacity and small l1_entry_sectors, we can get a
          * big l1_size, we don't want unbounded value to allocate the table.
-         * Limit it to 512M, which is 16PB for default cluster and L2 table
-         * size */
+         * Limit it to 32M, which is enough to store:
+         *     8TB  - for both VMDK3 & VMDK4 with
+         *            minimal cluster size: 512B
+         *            minimal L2 table size: 512 entries
+         *            8 TB is still more than the maximal value supported for
+         *            VMDK3 & VMDK4 which is 2TB.
+         *     64TB - for "ESXi seSparse Extent"
+         *            minimal cluster size: 512B (default is 4KB)
+         *            L2 table size: 4096 entries (const).
+         *            64TB is more than the maximal value supported for
+         *            seSparse VMDKs (which is slightly less than 64TB)
+         */
         error_setg(errp, "L1 size too big");
         return -EFBIG;
     }
@@ -439,6 +507,7 @@ static int vmdk_add_extent(BlockDriverState *bs,
     extent->l2_size = l2_size;
     extent->cluster_sectors = flat ? sectors : cluster_sectors;
     extent->next_cluster_sector = ROUND_UP(nb_sectors, cluster_sectors);
+    extent->entry_size = sizeof(uint32_t);
 
     if (s->num_extents > 1) {
         extent->end_sector = (*(extent - 1)).end_sector + extent->sectors;
@@ -460,7 +529,7 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
     int i;
 
     /* read the L1 table */
-    l1_size = extent->l1_size * sizeof(uint32_t);
+    l1_size = extent->l1_size * extent->entry_size;
     extent->l1_table = g_try_malloc(l1_size);
     if (l1_size && extent->l1_table == NULL) {
         return -ENOMEM;
@@ -471,16 +540,23 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
                      extent->l1_table,
                      l1_size);
     if (ret < 0) {
+        bdrv_refresh_filename(extent->file->bs);
         error_setg_errno(errp, -ret,
                          "Could not read l1 table from extent '%s'",
                          extent->file->bs->filename);
         goto fail_l1;
     }
     for (i = 0; i < extent->l1_size; i++) {
-        le32_to_cpus(&extent->l1_table[i]);
+        if (extent->entry_size == sizeof(uint64_t)) {
+            le64_to_cpus((uint64_t *)extent->l1_table + i);
+        } else {
+            assert(extent->entry_size == sizeof(uint32_t));
+            le32_to_cpus((uint32_t *)extent->l1_table + i);
+        }
     }
 
     if (extent->l1_backup_table_offset) {
+        assert(!extent->sesparse);
         extent->l1_backup_table = g_try_malloc(l1_size);
         if (l1_size && extent->l1_backup_table == NULL) {
             ret = -ENOMEM;
@@ -491,6 +567,7 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
                          extent->l1_backup_table,
                          l1_size);
         if (ret < 0) {
+            bdrv_refresh_filename(extent->file->bs);
             error_setg_errno(errp, -ret,
                              "Could not read l1 backup table from extent '%s'",
                              extent->file->bs->filename);
@@ -502,7 +579,7 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent,
     }
 
     extent->l2_cache =
-        g_new(uint32_t, extent->l2_size * L2_CACHE_SIZE);
+        g_malloc(extent->entry_size * extent->l2_size * L2_CACHE_SIZE);
     return 0;
  fail_l1b:
     g_free(extent->l1_backup_table);
@@ -522,6 +599,7 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
 
     ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
     if (ret < 0) {
+        bdrv_refresh_filename(file->bs);
         error_setg_errno(errp, -ret,
                          "Could not read header from file '%s'",
                          file->bs->filename);
@@ -547,6 +625,205 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs,
     return ret;
 }
 
+#define SESPARSE_CONST_HEADER_MAGIC UINT64_C(0x00000000cafebabe)
+#define SESPARSE_VOLATILE_HEADER_MAGIC UINT64_C(0x00000000cafecafe)
+
+/* Strict checks - format not officially documented */
+static int check_se_sparse_const_header(VMDKSESparseConstHeader *header,
+                                        Error **errp)
+{
+    header->magic = le64_to_cpu(header->magic);
+    header->version = le64_to_cpu(header->version);
+    header->grain_size = le64_to_cpu(header->grain_size);
+    header->grain_table_size = le64_to_cpu(header->grain_table_size);
+    header->flags = le64_to_cpu(header->flags);
+    header->reserved1 = le64_to_cpu(header->reserved1);
+    header->reserved2 = le64_to_cpu(header->reserved2);
+    header->reserved3 = le64_to_cpu(header->reserved3);
+    header->reserved4 = le64_to_cpu(header->reserved4);
+
+    header->volatile_header_offset =
+        le64_to_cpu(header->volatile_header_offset);
+    header->volatile_header_size = le64_to_cpu(header->volatile_header_size);
+
+    header->journal_header_offset = le64_to_cpu(header->journal_header_offset);
+    header->journal_header_size = le64_to_cpu(header->journal_header_size);
+
+    header->journal_offset = le64_to_cpu(header->journal_offset);
+    header->journal_size = le64_to_cpu(header->journal_size);
+
+    header->grain_dir_offset = le64_to_cpu(header->grain_dir_offset);
+    header->grain_dir_size = le64_to_cpu(header->grain_dir_size);
+
+    header->grain_tables_offset = le64_to_cpu(header->grain_tables_offset);
+    header->grain_tables_size = le64_to_cpu(header->grain_tables_size);
+
+    header->free_bitmap_offset = le64_to_cpu(header->free_bitmap_offset);
+    header->free_bitmap_size = le64_to_cpu(header->free_bitmap_size);
+
+    header->backmap_offset = le64_to_cpu(header->backmap_offset);
+    header->backmap_size = le64_to_cpu(header->backmap_size);
+
+    header->grains_offset = le64_to_cpu(header->grains_offset);
+    header->grains_size = le64_to_cpu(header->grains_size);
+
+    if (header->magic != SESPARSE_CONST_HEADER_MAGIC) {
+        error_setg(errp, "Bad const header magic: 0x%016" PRIx64,
+                   header->magic);
+        return -EINVAL;
+    }
+
+    if (header->version != 0x0000000200000001) {
+        error_setg(errp, "Unsupported version: 0x%016" PRIx64,
+                   header->version);
+        return -ENOTSUP;
+    }
+
+    if (header->grain_size != 8) {
+        error_setg(errp, "Unsupported grain size: %" PRIu64,
+                   header->grain_size);
+        return -ENOTSUP;
+    }
+
+    if (header->grain_table_size != 64) {
+        error_setg(errp, "Unsupported grain table size: %" PRIu64,
+                   header->grain_table_size);
+        return -ENOTSUP;
+    }
+
+    if (header->flags != 0) {
+        error_setg(errp, "Unsupported flags: 0x%016" PRIx64,
+                   header->flags);
+        return -ENOTSUP;
+    }
+
+    if (header->reserved1 != 0 || header->reserved2 != 0 ||
+        header->reserved3 != 0 || header->reserved4 != 0) {
+        error_setg(errp, "Unsupported reserved bits:"
+                   " 0x%016" PRIx64 " 0x%016" PRIx64
+                   " 0x%016" PRIx64 " 0x%016" PRIx64,
+                   header->reserved1, header->reserved2,
+                   header->reserved3, header->reserved4);
+        return -ENOTSUP;
+    }
+
+    /* check that padding is 0 */
+    if (!buffer_is_zero(header->pad, sizeof(header->pad))) {
+        error_setg(errp, "Unsupported non-zero const header padding");
+        return -ENOTSUP;
+    }
+
+    return 0;
+}
+
+static int check_se_sparse_volatile_header(VMDKSESparseVolatileHeader *header,
+                                           Error **errp)
+{
+    header->magic = le64_to_cpu(header->magic);
+    header->free_gt_number = le64_to_cpu(header->free_gt_number);
+    header->next_txn_seq_number = le64_to_cpu(header->next_txn_seq_number);
+    header->replay_journal = le64_to_cpu(header->replay_journal);
+
+    if (header->magic != SESPARSE_VOLATILE_HEADER_MAGIC) {
+        error_setg(errp, "Bad volatile header magic: 0x%016" PRIx64,
+                   header->magic);
+        return -EINVAL;
+    }
+
+    if (header->replay_journal) {
+        error_setg(errp, "Image is dirty, Replaying journal not supported");
+        return -ENOTSUP;
+    }
+
+    /* check that padding is 0 */
+    if (!buffer_is_zero(header->pad, sizeof(header->pad))) {
+        error_setg(errp, "Unsupported non-zero volatile header padding");
+        return -ENOTSUP;
+    }
+
+    return 0;
+}
+
+static int vmdk_open_se_sparse(BlockDriverState *bs,
+                               BdrvChild *file,
+                               int flags, Error **errp)
+{
+    int ret;
+    VMDKSESparseConstHeader const_header;
+    VMDKSESparseVolatileHeader volatile_header;
+    VmdkExtent *extent;
+
+    ret = bdrv_apply_auto_read_only(bs,
+            "No write support for seSparse images available", errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    assert(sizeof(const_header) == SECTOR_SIZE);
+
+    ret = bdrv_pread(file, 0, &const_header, sizeof(const_header));
+    if (ret < 0) {
+        bdrv_refresh_filename(file->bs);
+        error_setg_errno(errp, -ret,
+                         "Could not read const header from file '%s'",
+                         file->bs->filename);
+        return ret;
+    }
+
+    /* check const header */
+    ret = check_se_sparse_const_header(&const_header, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    assert(sizeof(volatile_header) == SECTOR_SIZE);
+
+    ret = bdrv_pread(file,
+                     const_header.volatile_header_offset * SECTOR_SIZE,
+                     &volatile_header, sizeof(volatile_header));
+    if (ret < 0) {
+        bdrv_refresh_filename(file->bs);
+        error_setg_errno(errp, -ret,
+                         "Could not read volatile header from file '%s'",
+                         file->bs->filename);
+        return ret;
+    }
+
+    /* check volatile header */
+    ret = check_se_sparse_volatile_header(&volatile_header, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = vmdk_add_extent(bs, file, false,
+                          const_header.capacity,
+                          const_header.grain_dir_offset * SECTOR_SIZE,
+                          0,
+                          const_header.grain_dir_size *
+                          SECTOR_SIZE / sizeof(uint64_t),
+                          const_header.grain_table_size *
+                          SECTOR_SIZE / sizeof(uint64_t),
+                          const_header.grain_size,
+                          &extent,
+                          errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    extent->sesparse = true;
+    extent->sesparse_l2_tables_offset = const_header.grain_tables_offset;
+    extent->sesparse_clusters_offset = const_header.grains_offset;
+    extent->entry_size = sizeof(uint64_t);
+
+    ret = vmdk_init_tables(bs, extent, errp);
+    if (ret) {
+        /* free extent allocated by vmdk_add_extent */
+        vmdk_free_last_extent(bs);
+    }
+
+    return ret;
+}
+
 static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
                                QDict *options, Error **errp);
 
@@ -599,6 +876,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs,
 
     ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header));
     if (ret < 0) {
+        bdrv_refresh_filename(file->bs);
         error_setg_errno(errp, -ret,
                          "Could not read header from file '%s'",
                          file->bs->filename);
@@ -823,6 +1101,7 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
          * RW [size in sectors] SPARSE "file-name.vmdk"
          * RW [size in sectors] VMFS "file-name.vmdk"
          * RW [size in sectors] VMFSSPARSE "file-name.vmdk"
+         * RW [size in sectors] SESPARSE "file-name.vmdk"
          */
         flat_offset = -1;
         matches = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64,
@@ -845,7 +1124,8 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
 
         if (sectors <= 0 ||
             (strcmp(type, "FLAT") && strcmp(type, "SPARSE") &&
-             strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE")) ||
+             strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE") &&
+             strcmp(type, "SESPARSE")) ||
             (strcmp(access, "RW"))) {
             continue;
         }
@@ -853,13 +1133,13 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
         if (!path_is_absolute(fname) && !path_has_protocol(fname) &&
             !desc_file_path[0])
         {
+            bdrv_refresh_filename(bs->file->bs);
             error_setg(errp, "Cannot use relative extent paths with VMDK "
                        "descriptor file '%s'", bs->file->bs->filename);
             return -EINVAL;
         }
 
-        extent_path = g_malloc0(PATH_MAX);
-        path_combine(extent_path, PATH_MAX, desc_file_path, fname);
+        extent_path = path_combine(desc_file_path, fname);
 
         ret = snprintf(extent_opt_prefix, 32, "extents.%d", s->num_extents);
         assert(ret < 32);
@@ -898,6 +1178,13 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs,
                 return ret;
             }
             extent = &s->extents[s->num_extents - 1];
+        } else if (!strcmp(type, "SESPARSE")) {
+            ret = vmdk_open_se_sparse(bs, extent_file, bs->open_flags, errp);
+            if (ret) {
+                bdrv_unref_child(bs, extent_file);
+                return ret;
+            }
+            extent = &s->extents[s->num_extents - 1];
         } else {
             error_setg(errp, "Unsupported extent type '%s'", type);
             bdrv_unref_child(bs, extent_file);
@@ -932,6 +1219,7 @@ static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf,
     if (strcmp(ct, "monolithicFlat") &&
         strcmp(ct, "vmfs") &&
         strcmp(ct, "vmfsSparse") &&
+        strcmp(ct, "seSparse") &&
         strcmp(ct, "twoGbMaxExtentSparse") &&
         strcmp(ct, "twoGbMaxExtentFlat")) {
         error_setg(errp, "Unsupported image type '%s'", ct);
@@ -1182,10 +1470,12 @@ static int get_cluster_offset(BlockDriverState *bs,
 {
     unsigned int l1_index, l2_offset, l2_index;
     int min_index, i, j;
-    uint32_t min_count, *l2_table;
+    uint32_t min_count;
+    void *l2_table;
     bool zeroed = false;
     int64_t ret;
     int64_t cluster_sector;
+    unsigned int l2_size_bytes = extent->l2_size * extent->entry_size;
 
     if (m_data) {
         m_data->valid = 0;
@@ -1200,7 +1490,36 @@ static int get_cluster_offset(BlockDriverState *bs,
     if (l1_index >= extent->l1_size) {
         return VMDK_ERROR;
     }
-    l2_offset = extent->l1_table[l1_index];
+    if (extent->sesparse) {
+        uint64_t l2_offset_u64;
+
+        assert(extent->entry_size == sizeof(uint64_t));
+
+        l2_offset_u64 = ((uint64_t *)extent->l1_table)[l1_index];
+        if (l2_offset_u64 == 0) {
+            l2_offset = 0;
+        } else if ((l2_offset_u64 & 0xffffffff00000000) != 0x1000000000000000) {
+            /*
+             * Top most nibble is 0x1 if grain table is allocated.
+             * strict check - top most 4 bytes must be 0x10000000 since max
+             * supported size is 64TB for disk - so no more than 64TB / 16MB
+             * grain directories which is smaller than uint32,
+             * where 16MB is the only supported default grain table coverage.
+             */
+            return VMDK_ERROR;
+        } else {
+            l2_offset_u64 = l2_offset_u64 & 0x00000000ffffffff;
+            l2_offset_u64 = extent->sesparse_l2_tables_offset +
+                l2_offset_u64 * l2_size_bytes / SECTOR_SIZE;
+            if (l2_offset_u64 > 0x00000000ffffffff) {
+                return VMDK_ERROR;
+            }
+            l2_offset = (unsigned int)(l2_offset_u64);
+        }
+    } else {
+        assert(extent->entry_size == sizeof(uint32_t));
+        l2_offset = ((uint32_t *)extent->l1_table)[l1_index];
+    }
     if (!l2_offset) {
         return VMDK_UNALLOC;
     }
@@ -1212,7 +1531,7 @@ static int get_cluster_offset(BlockDriverState *bs,
                     extent->l2_cache_counts[j] >>= 1;
                 }
             }
-            l2_table = extent->l2_cache + (i * extent->l2_size);
+            l2_table = (char *)extent->l2_cache + (i * l2_size_bytes);
             goto found;
         }
     }
@@ -1225,13 +1544,13 @@ static int get_cluster_offset(BlockDriverState *bs,
             min_index = i;
         }
     }
-    l2_table = extent->l2_cache + (min_index * extent->l2_size);
+    l2_table = (char *)extent->l2_cache + (min_index * l2_size_bytes);
     BLKDBG_EVENT(extent->file, BLKDBG_L2_LOAD);
     if (bdrv_pread(extent->file,
                 (int64_t)l2_offset * 512,
                 l2_table,
-                extent->l2_size * sizeof(uint32_t)
-            ) != extent->l2_size * sizeof(uint32_t)) {
+                l2_size_bytes
+            ) != l2_size_bytes) {
         return VMDK_ERROR;
     }
 
@@ -1239,16 +1558,49 @@ static int get_cluster_offset(BlockDriverState *bs,
     extent->l2_cache_counts[min_index] = 1;
  found:
     l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size;
-    cluster_sector = le32_to_cpu(l2_table[l2_index]);
 
-    if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
-        zeroed = true;
+    if (extent->sesparse) {
+        cluster_sector = le64_to_cpu(((uint64_t *)l2_table)[l2_index]);
+        switch (cluster_sector & 0xf000000000000000) {
+        case 0x0000000000000000:
+            /* unallocated grain */
+            if (cluster_sector != 0) {
+                return VMDK_ERROR;
+            }
+            break;
+        case 0x1000000000000000:
+            /* scsi-unmapped grain - fallthrough */
+        case 0x2000000000000000:
+            /* zero grain */
+            zeroed = true;
+            break;
+        case 0x3000000000000000:
+            /* allocated grain */
+            cluster_sector = (((cluster_sector & 0x0fff000000000000) >> 48) |
+                              ((cluster_sector & 0x0000ffffffffffff) << 12));
+            cluster_sector = extent->sesparse_clusters_offset +
+                cluster_sector * extent->cluster_sectors;
+            break;
+        default:
+            return VMDK_ERROR;
+        }
+    } else {
+        cluster_sector = le32_to_cpu(((uint32_t *)l2_table)[l2_index]);
+
+        if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) {
+            zeroed = true;
+        }
     }
 
     if (!cluster_sector || zeroed) {
         if (!allocate) {
             return zeroed ? VMDK_ZEROED : VMDK_UNALLOC;
         }
+        assert(!extent->sesparse);
+
+        if (extent->next_cluster_sector >= VMDK_EXTENT_MAX_SECTORS) {
+            return VMDK_ERROR;
+        }
 
         cluster_sector = extent->next_cluster_sector;
         extent->next_cluster_sector += extent->cluster_sectors;
@@ -1268,7 +1620,7 @@ static int get_cluster_offset(BlockDriverState *bs,
             m_data->l1_index = l1_index;
             m_data->l2_index = l2_index;
             m_data->l2_offset = l2_offset;
-            m_data->l2_cache_entry = &l2_table[l2_index];
+            m_data->l2_cache_entry = ((uint32_t *)l2_table) + l2_index;
         }
     }
     *cluster_offset = cluster_sector << BDRV_SECTOR_BITS;
@@ -1304,33 +1656,27 @@ static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent,
     return extent_relative_offset % cluster_size;
 }
 
-static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent,
-                                                  int64_t sector_num)
-{
-    uint64_t offset;
-    offset = vmdk_find_offset_in_cluster(extent, sector_num * BDRV_SECTOR_SIZE);
-    return offset / BDRV_SECTOR_SIZE;
-}
-
-static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
+static int coroutine_fn vmdk_co_block_status(BlockDriverState *bs,
+                                             bool want_zero,
+                                             int64_t offset, int64_t bytes,
+                                             int64_t *pnum, int64_t *map,
+                                             BlockDriverState **file)
 {
     BDRVVmdkState *s = bs->opaque;
     int64_t index_in_cluster, n, ret;
-    uint64_t offset;
+    uint64_t cluster_offset;
     VmdkExtent *extent;
 
-    extent = find_extent(s, sector_num, NULL);
+    extent = find_extent(s, offset >> BDRV_SECTOR_BITS, NULL);
     if (!extent) {
-        return 0;
+        return -EIO;
     }
     qemu_co_mutex_lock(&s->lock);
-    ret = get_cluster_offset(bs, extent, NULL,
-                             sector_num * 512, false, &offset,
+    ret = get_cluster_offset(bs, extent, NULL, offset, false, &cluster_offset,
                              0, 0);
     qemu_co_mutex_unlock(&s->lock);
 
-    index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
+    index_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
     switch (ret) {
     case VMDK_ERROR:
         ret = -EIO;
@@ -1345,18 +1691,14 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
         ret = BDRV_BLOCK_DATA;
         if (!extent->compressed) {
             ret |= BDRV_BLOCK_OFFSET_VALID;
-            ret |= (offset + (index_in_cluster << BDRV_SECTOR_BITS))
-                    & BDRV_BLOCK_OFFSET_MASK;
+            *map = cluster_offset + index_in_cluster;
         }
         *file = extent->file->bs;
         break;
     }
 
-    n = extent->cluster_sectors - index_in_cluster;
-    if (n > nb_sectors) {
-        n = nb_sectors;
-    }
-    *pnum = n;
+    n = extent->cluster_sectors * BDRV_SECTOR_SIZE - index_in_cluster;
+    *pnum = MIN(n, bytes);
     return ret;
 }
 
@@ -1369,7 +1711,6 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
     VmdkGrainMarker *data = NULL;
     uLongf buf_len;
     QEMUIOVector local_qiov;
-    struct iovec iov;
     int64_t write_offset;
     int64_t write_end_sector;
 
@@ -1397,11 +1738,7 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset,
         data->size = cpu_to_le32(buf_len);
 
         n_bytes = buf_len + sizeof(VmdkGrainMarker);
-        iov = (struct iovec) {
-            .iov_base   = data,
-            .iov_len    = n_bytes,
-        };
-        qemu_iovec_init_external(&local_qiov, &iov, 1);
+        qemu_iovec_init_buf(&local_qiov, data, n_bytes);
 
         BLKDBG_EVENT(extent->file, BLKDBG_WRITE_COMPRESSED);
     } else {
@@ -1609,6 +1946,9 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset,
         if (!extent) {
             return -EIO;
         }
+        if (extent->sesparse) {
+            return -ENOTSUP;
+        }
         offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
         n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE
                              - offset_in_cluster);
@@ -1696,6 +2036,27 @@ static int coroutine_fn
 vmdk_co_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
                            uint64_t bytes, QEMUIOVector *qiov)
 {
+    if (bytes == 0) {
+        /* The caller will write bytes 0 to signal EOF.
+         * When receive it, we align EOF to a sector boundary. */
+        BDRVVmdkState *s = bs->opaque;
+        int i, ret;
+        int64_t length;
+
+        for (i = 0; i < s->num_extents; i++) {
+            length = bdrv_getlength(s->extents[i].file->bs);
+            if (length < 0) {
+                return length;
+            }
+            length = QEMU_ALIGN_UP(length, BDRV_SECTOR_SIZE);
+            ret = bdrv_truncate(s->extents[i].file, length,
+                                PREALLOC_MODE_OFF, NULL);
+            if (ret < 0) {
+                return ret;
+            }
+        }
+        return 0;
+    }
     return vmdk_co_pwritev(bs, offset, bytes, qiov, 0);
 }
 
@@ -1718,35 +2079,17 @@ static int coroutine_fn vmdk_co_pwrite_zeroes(BlockDriverState *bs,
     return ret;
 }
 
-static int vmdk_create_extent(const char *filename, int64_t filesize,
-                              bool flat, bool compress, bool zeroed_grain,
-                              QemuOpts *opts, Error **errp)
+static int vmdk_init_extent(BlockBackend *blk,
+                            int64_t filesize, bool flat,
+                            bool compress, bool zeroed_grain,
+                            Error **errp)
 {
     int ret, i;
-    BlockBackend *blk = NULL;
     VMDK4Header header;
-    Error *local_err = NULL;
     uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count;
     uint32_t *gd_buf = NULL;
     int gd_buf_size;
 
-    ret = bdrv_create_file(filename, opts, &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
-        goto exit;
-    }
-
-    blk = blk_new_open(filename, NULL, NULL,
-                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
-                       &local_err);
-    if (blk == NULL) {
-        error_propagate(errp, local_err);
-        ret = -EIO;
-        goto exit;
-    }
-
-    blk_set_allow_write_beyond_eof(blk, true);
-
     if (flat) {
         ret = blk_truncate(blk, filesize, PREALLOC_MODE_OFF, errp);
         goto exit;
@@ -1840,15 +2183,50 @@ static int vmdk_create_extent(const char *filename, int64_t filesize,
                      gd_buf, gd_buf_size, 0);
     if (ret < 0) {
         error_setg(errp, QERR_IO_ERROR);
-        goto exit;
     }
 
     ret = 0;
+exit:
+    g_free(gd_buf);
+    return ret;
+}
+
+static int vmdk_create_extent(const char *filename, int64_t filesize,
+                              bool flat, bool compress, bool zeroed_grain,
+                              BlockBackend **pbb,
+                              QemuOpts *opts, Error **errp)
+{
+    int ret;
+    BlockBackend *blk = NULL;
+    Error *local_err = NULL;
+
+    ret = bdrv_create_file(filename, opts, &local_err);
+    if (ret < 0) {
+        error_propagate(errp, local_err);
+        goto exit;
+    }
+
+    blk = blk_new_open(filename, NULL, NULL,
+                       BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
+                       &local_err);
+    if (blk == NULL) {
+        error_propagate(errp, local_err);
+        ret = -EIO;
+        goto exit;
+    }
+
+    blk_set_allow_write_beyond_eof(blk, true);
+
+    ret = vmdk_init_extent(blk, filesize, flat, compress, zeroed_grain, errp);
 exit:
     if (blk) {
-        blk_unref(blk);
+        if (pbb) {
+            *pbb = blk;
+        } else {
+            blk_unref(blk);
+            blk = NULL;
+        }
     }
-    g_free(gd_buf);
     return ret;
 }
 
@@ -1892,32 +2270,57 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
     return VMDK_OK;
 }
 
-static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
+/*
+ * idx == 0: get or create the descriptor file (also the image file if in a
+ *           non-split format.
+ * idx >= 1: get the n-th extent if in a split subformat
+ */
+typedef BlockBackend *(*vmdk_create_extent_fn)(int64_t size,
+                                               int idx,
+                                               bool flat,
+                                               bool split,
+                                               bool compress,
+                                               bool zeroed_grain,
+                                               void *opaque,
+                                               Error **errp);
+
+static void vmdk_desc_add_extent(GString *desc,
+                                 const char *extent_line_fmt,
+                                 int64_t size, const char *filename)
 {
-    int idx = 0;
-    BlockBackend *new_blk = NULL;
+    char *basename = g_path_get_basename(filename);
+
+    g_string_append_printf(desc, extent_line_fmt,
+                           DIV_ROUND_UP(size, BDRV_SECTOR_SIZE), basename);
+    g_free(basename);
+}
+
+static int coroutine_fn vmdk_co_do_create(int64_t size,
+                                          BlockdevVmdkSubformat subformat,
+                                          BlockdevVmdkAdapterType adapter_type,
+                                          const char *backing_file,
+                                          const char *hw_version,
+                                          bool compat6,
+                                          bool zeroed_grain,
+                                          vmdk_create_extent_fn extent_fn,
+                                          void *opaque,
+                                          Error **errp)
+{
+    int extent_idx;
+    BlockBackend *blk = NULL;
+    BlockBackend *extent_blk;
     Error *local_err = NULL;
     char *desc = NULL;
-    int64_t total_size = 0, filesize;
-    char *adapter_type = NULL;
-    char *backing_file = NULL;
-    char *hw_version = NULL;
-    char *fmt = NULL;
     int ret = 0;
     bool flat, split, compress;
     GString *ext_desc_lines;
-    char *path = g_malloc0(PATH_MAX);
-    char *prefix = g_malloc0(PATH_MAX);
-    char *postfix = g_malloc0(PATH_MAX);
-    char *desc_line = g_malloc0(BUF_SIZE);
-    char *ext_filename = g_malloc0(PATH_MAX);
-    char *desc_filename = g_malloc0(PATH_MAX);
     const int64_t split_size = 0x80000000;  /* VMDK has constant split size */
-    const char *desc_extent_line;
+    int64_t extent_size;
+    int64_t created_size = 0;
+    const char *extent_line_fmt;
     char *parent_desc_line = g_malloc0(BUF_SIZE);
     uint32_t parent_cid = 0xffffffff;
     uint32_t number_heads = 16;
-    bool zeroed_grain = false;
     uint32_t desc_offset = 0, desc_len;
     const char desc_template[] =
         "# Disk DescriptorFile\n"
@@ -1941,71 +2344,35 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
 
     ext_desc_lines = g_string_new(NULL);
 
-    if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) {
-        ret = -EINVAL;
-        goto exit;
-    }
     /* Read out options */
-    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
-                          BDRV_SECTOR_SIZE);
-    adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE);
-    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
-    hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION);
-    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) {
-        if (strcmp(hw_version, "undefined")) {
+    if (compat6) {
+        if (hw_version) {
             error_setg(errp,
                        "compat6 cannot be enabled with hwversion set");
             ret = -EINVAL;
             goto exit;
         }
-        g_free(hw_version);
-        hw_version = g_strdup("6");
+        hw_version = "6";
     }
-    if (strcmp(hw_version, "undefined") == 0) {
-        g_free(hw_version);
-        hw_version = g_strdup("4");
-    }
-    fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
-    if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) {
-        zeroed_grain = true;
+    if (!hw_version) {
+        hw_version = "4";
     }
 
-    if (!adapter_type) {
-        adapter_type = g_strdup("ide");
-    } else if (strcmp(adapter_type, "ide") &&
-               strcmp(adapter_type, "buslogic") &&
-               strcmp(adapter_type, "lsilogic") &&
-               strcmp(adapter_type, "legacyESX")) {
-        error_setg(errp, "Unknown adapter type: '%s'", adapter_type);
-        ret = -EINVAL;
-        goto exit;
-    }
-    if (strcmp(adapter_type, "ide") != 0) {
+    if (adapter_type != BLOCKDEV_VMDK_ADAPTER_TYPE_IDE) {
         /* that's the number of heads with which vmware operates when
            creating, exporting, etc. vmdk files with a non-ide adapter type */
         number_heads = 255;
     }
-    if (!fmt) {
-        /* Default format to monolithicSparse */
-        fmt = g_strdup("monolithicSparse");
-    } else if (strcmp(fmt, "monolithicFlat") &&
-               strcmp(fmt, "monolithicSparse") &&
-               strcmp(fmt, "twoGbMaxExtentSparse") &&
-               strcmp(fmt, "twoGbMaxExtentFlat") &&
-               strcmp(fmt, "streamOptimized")) {
-        error_setg(errp, "Unknown subformat: '%s'", fmt);
-        ret = -EINVAL;
-        goto exit;
-    }
-    split = !(strcmp(fmt, "twoGbMaxExtentFlat") &&
-              strcmp(fmt, "twoGbMaxExtentSparse"));
-    flat = !(strcmp(fmt, "monolithicFlat") &&
-             strcmp(fmt, "twoGbMaxExtentFlat"));
-    compress = !strcmp(fmt, "streamOptimized");
+    split = (subformat == BLOCKDEV_VMDK_SUBFORMAT_TWOGBMAXEXTENTFLAT) ||
+            (subformat == BLOCKDEV_VMDK_SUBFORMAT_TWOGBMAXEXTENTSPARSE);
+    flat = (subformat == BLOCKDEV_VMDK_SUBFORMAT_MONOLITHICFLAT) ||
+           (subformat == BLOCKDEV_VMDK_SUBFORMAT_TWOGBMAXEXTENTFLAT);
+    compress = subformat == BLOCKDEV_VMDK_SUBFORMAT_STREAMOPTIMIZED;
+
     if (flat) {
-        desc_extent_line = "RW %" PRId64 " FLAT \"%s\" 0\n";
+        extent_line_fmt = "RW %" PRId64 " FLAT \"%s\" 0\n";
     } else {
-        desc_extent_line = "RW %" PRId64 " SPARSE \"%s\"\n";
+        extent_line_fmt = "RW %" PRId64 " SPARSE \"%s\"\n";
     }
     if (flat && backing_file) {
         error_setg(errp, "Flat image can't have backing file");
@@ -2017,119 +2384,264 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
         ret = -ENOTSUP;
         goto exit;
     }
+
+    /* Create extents */
+    if (split) {
+        extent_size = split_size;
+    } else {
+        extent_size = size;
+    }
+    if (!split && !flat) {
+        created_size = extent_size;
+    } else {
+        created_size = 0;
+    }
+    /* Get the descriptor file BDS */
+    blk = extent_fn(created_size, 0, flat, split, compress, zeroed_grain,
+                    opaque, errp);
+    if (!blk) {
+        ret = -EIO;
+        goto exit;
+    }
+    if (!split && !flat) {
+        vmdk_desc_add_extent(ext_desc_lines, extent_line_fmt, created_size,
+                             blk_bs(blk)->filename);
+    }
+
     if (backing_file) {
-        BlockBackend *blk;
-        char *full_backing = g_new0(char, PATH_MAX);
-        bdrv_get_full_backing_filename_from_filename(filename, backing_file,
-                                                     full_backing, PATH_MAX,
-                                                     &local_err);
+        BlockBackend *backing;
+        char *full_backing =
+            bdrv_get_full_backing_filename_from_filename(blk_bs(blk)->filename,
+                                                         backing_file,
+                                                         &local_err);
         if (local_err) {
-            g_free(full_backing);
             error_propagate(errp, local_err);
             ret = -ENOENT;
             goto exit;
         }
+        assert(full_backing);
 
-        blk = blk_new_open(full_backing, NULL, NULL,
-                           BDRV_O_NO_BACKING, errp);
+        backing = blk_new_open(full_backing, NULL, NULL,
+                               BDRV_O_NO_BACKING, errp);
         g_free(full_backing);
-        if (blk == NULL) {
+        if (backing == NULL) {
             ret = -EIO;
             goto exit;
         }
-        if (strcmp(blk_bs(blk)->drv->format_name, "vmdk")) {
-            blk_unref(blk);
+        if (strcmp(blk_bs(backing)->drv->format_name, "vmdk")) {
+            error_setg(errp, "Invalid backing file format: %s. Must be vmdk",
+                       blk_bs(backing)->drv->format_name);
+            blk_unref(backing);
             ret = -EINVAL;
             goto exit;
         }
-        ret = vmdk_read_cid(blk_bs(blk), 0, &parent_cid);
-        blk_unref(blk);
+        ret = vmdk_read_cid(blk_bs(backing), 0, &parent_cid);
+        blk_unref(backing);
         if (ret) {
+            error_setg(errp, "Failed to read parent CID");
             goto exit;
         }
         snprintf(parent_desc_line, BUF_SIZE,
                 "parentFileNameHint=\"%s\"", backing_file);
     }
-
-    /* Create extents */
-    filesize = total_size;
-    while (filesize > 0) {
-        int64_t size = filesize;
-
-        if (split && size > split_size) {
-            size = split_size;
-        }
-        if (split) {
-            snprintf(desc_filename, PATH_MAX, "%s-%c%03d%s",
-                    prefix, flat ? 'f' : 's', ++idx, postfix);
-        } else if (flat) {
-            snprintf(desc_filename, PATH_MAX, "%s-flat%s", prefix, postfix);
-        } else {
-            snprintf(desc_filename, PATH_MAX, "%s%s", prefix, postfix);
-        }
-        snprintf(ext_filename, PATH_MAX, "%s%s", path, desc_filename);
-
-        if (vmdk_create_extent(ext_filename, size,
-                               flat, compress, zeroed_grain, opts, errp)) {
+    extent_idx = 1;
+    while (created_size < size) {
+        int64_t cur_size = MIN(size - created_size, extent_size);
+        extent_blk = extent_fn(cur_size, extent_idx, flat, split, compress,
+                               zeroed_grain, opaque, errp);
+        if (!extent_blk) {
             ret = -EINVAL;
             goto exit;
         }
-        filesize -= size;
-
-        /* Format description line */
-        snprintf(desc_line, BUF_SIZE,
-                    desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename);
-        g_string_append(ext_desc_lines, desc_line);
+        vmdk_desc_add_extent(ext_desc_lines, extent_line_fmt, cur_size,
+                             blk_bs(extent_blk)->filename);
+        created_size += cur_size;
+        extent_idx++;
+        blk_unref(extent_blk);
+    }
+
+    /* Check whether we got excess extents */
+    extent_blk = extent_fn(-1, extent_idx, flat, split, compress, zeroed_grain,
+                           opaque, NULL);
+    if (extent_blk) {
+        blk_unref(extent_blk);
+        error_setg(errp, "List of extents contains unused extents");
+        ret = -EINVAL;
+        goto exit;
     }
+
     /* generate descriptor file */
     desc = g_strdup_printf(desc_template,
                            g_random_int(),
                            parent_cid,
-                           fmt,
+                           BlockdevVmdkSubformat_str(subformat),
                            parent_desc_line,
                            ext_desc_lines->str,
                            hw_version,
-                           total_size /
+                           size /
                                (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE),
                            number_heads,
-                           adapter_type);
+                           BlockdevVmdkAdapterType_str(adapter_type));
     desc_len = strlen(desc);
     /* the descriptor offset = 0x200 */
     if (!split && !flat) {
         desc_offset = 0x200;
-    } else {
-        ret = bdrv_create_file(filename, opts, &local_err);
+    }
+
+    ret = blk_pwrite(blk, desc_offset, desc, desc_len, 0);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Could not write description");
+        goto exit;
+    }
+    /* bdrv_pwrite write padding zeros to align to sector, we don't need that
+     * for description file */
+    if (desc_offset == 0) {
+        ret = blk_truncate(blk, desc_len, PREALLOC_MODE_OFF, errp);
         if (ret < 0) {
-            error_propagate(errp, local_err);
             goto exit;
         }
     }
+    ret = 0;
+exit:
+    if (blk) {
+        blk_unref(blk);
+    }
+    g_free(desc);
+    g_free(parent_desc_line);
+    g_string_free(ext_desc_lines, true);
+    return ret;
+}
 
-    new_blk = blk_new_open(filename, NULL, NULL,
-                           BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
-                           &local_err);
-    if (new_blk == NULL) {
-        error_propagate(errp, local_err);
-        ret = -EIO;
+typedef struct {
+    char *path;
+    char *prefix;
+    char *postfix;
+    QemuOpts *opts;
+} VMDKCreateOptsData;
+
+static BlockBackend *vmdk_co_create_opts_cb(int64_t size, int idx,
+                                            bool flat, bool split, bool compress,
+                                            bool zeroed_grain, void *opaque,
+                                            Error **errp)
+{
+    BlockBackend *blk = NULL;
+    BlockDriverState *bs = NULL;
+    VMDKCreateOptsData *data = opaque;
+    char *ext_filename = NULL;
+    char *rel_filename = NULL;
+
+    /* We're done, don't create excess extents. */
+    if (size == -1) {
+        assert(errp == NULL);
+        return NULL;
+    }
+
+    if (idx == 0) {
+        rel_filename = g_strdup_printf("%s%s", data->prefix, data->postfix);
+    } else if (split) {
+        rel_filename = g_strdup_printf("%s-%c%03d%s",
+                                       data->prefix,
+                                       flat ? 'f' : 's', idx, data->postfix);
+    } else {
+        assert(idx == 1);
+        rel_filename = g_strdup_printf("%s-flat%s", data->prefix, data->postfix);
+    }
+
+    ext_filename = g_strdup_printf("%s%s", data->path, rel_filename);
+    g_free(rel_filename);
+
+    if (vmdk_create_extent(ext_filename, size,
+                           flat, compress, zeroed_grain, &blk, data->opts,
+                           errp)) {
         goto exit;
     }
+    bdrv_unref(bs);
+exit:
+    g_free(ext_filename);
+    return blk;
+}
 
-    blk_set_allow_write_beyond_eof(new_blk, true);
+static int coroutine_fn vmdk_co_create_opts(const char *filename, QemuOpts *opts,
+                                            Error **errp)
+{
+    Error *local_err = NULL;
+    char *desc = NULL;
+    int64_t total_size = 0;
+    char *adapter_type = NULL;
+    BlockdevVmdkAdapterType adapter_type_enum;
+    char *backing_file = NULL;
+    char *hw_version = NULL;
+    char *fmt = NULL;
+    BlockdevVmdkSubformat subformat;
+    int ret = 0;
+    char *path = g_malloc0(PATH_MAX);
+    char *prefix = g_malloc0(PATH_MAX);
+    char *postfix = g_malloc0(PATH_MAX);
+    char *desc_line = g_malloc0(BUF_SIZE);
+    char *ext_filename = g_malloc0(PATH_MAX);
+    char *desc_filename = g_malloc0(PATH_MAX);
+    char *parent_desc_line = g_malloc0(BUF_SIZE);
+    bool zeroed_grain;
+    bool compat6;
+    VMDKCreateOptsData data;
 
-    ret = blk_pwrite(new_blk, desc_offset, desc, desc_len, 0);
-    if (ret < 0) {
-        error_setg_errno(errp, -ret, "Could not write description");
+    if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) {
+        ret = -EINVAL;
         goto exit;
     }
-    /* bdrv_pwrite write padding zeros to align to sector, we don't need that
-     * for description file */
-    if (desc_offset == 0) {
-        ret = blk_truncate(new_blk, desc_len, PREALLOC_MODE_OFF, errp);
+    /* Read out options */
+    total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
+                          BDRV_SECTOR_SIZE);
+    adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE);
+    backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
+    hw_version = qemu_opt_get_del(opts, BLOCK_OPT_HWVERSION);
+    compat6 = qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false);
+    if (strcmp(hw_version, "undefined") == 0) {
+        g_free(hw_version);
+        hw_version = NULL;
     }
-exit:
-    if (new_blk) {
-        blk_unref(new_blk);
+    fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT);
+    zeroed_grain = qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false);
+
+    if (adapter_type) {
+        adapter_type_enum = qapi_enum_parse(&BlockdevVmdkAdapterType_lookup,
+                                            adapter_type,
+                                            BLOCKDEV_VMDK_ADAPTER_TYPE_IDE,
+                                            &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            ret = -EINVAL;
+            goto exit;
+        }
+    } else {
+        adapter_type_enum = BLOCKDEV_VMDK_ADAPTER_TYPE_IDE;
+    }
+
+    if (!fmt) {
+        /* Default format to monolithicSparse */
+        subformat = BLOCKDEV_VMDK_SUBFORMAT_MONOLITHICSPARSE;
+    } else {
+        subformat = qapi_enum_parse(&BlockdevVmdkSubformat_lookup,
+                                    fmt,
+                                    BLOCKDEV_VMDK_SUBFORMAT_MONOLITHICSPARSE,
+                                    &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            ret = -EINVAL;
+            goto exit;
+        }
     }
+    data = (VMDKCreateOptsData){
+        .prefix = prefix,
+        .postfix = postfix,
+        .path = path,
+        .opts = opts,
+    };
+    ret = vmdk_co_do_create(total_size, subformat, adapter_type_enum,
+                            backing_file, hw_version, compat6, zeroed_grain,
+                            vmdk_co_create_opts_cb, &data, errp);
+
+exit:
     g_free(adapter_type);
     g_free(backing_file);
     g_free(hw_version);
@@ -2142,7 +2654,87 @@ exit:
     g_free(ext_filename);
     g_free(desc_filename);
     g_free(parent_desc_line);
-    g_string_free(ext_desc_lines, true);
+    return ret;
+}
+
+static BlockBackend *vmdk_co_create_cb(int64_t size, int idx,
+                                       bool flat, bool split, bool compress,
+                                       bool zeroed_grain, void *opaque,
+                                       Error **errp)
+{
+    int ret;
+    BlockDriverState *bs;
+    BlockBackend *blk;
+    BlockdevCreateOptionsVmdk *opts = opaque;
+
+    if (idx == 0) {
+        bs = bdrv_open_blockdev_ref(opts->file, errp);
+    } else {
+        int i;
+        BlockdevRefList *list = opts->extents;
+        for (i = 1; i < idx; i++) {
+            if (!list || !list->next) {
+                error_setg(errp, "Extent [%d] not specified", i);
+                return NULL;
+            }
+            list = list->next;
+        }
+        if (!list) {
+            error_setg(errp, "Extent [%d] not specified", idx - 1);
+            return NULL;
+        }
+        bs = bdrv_open_blockdev_ref(list->value, errp);
+    }
+    if (!bs) {
+        return NULL;
+    }
+    blk = blk_new(bdrv_get_aio_context(bs),
+                  BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE | BLK_PERM_RESIZE,
+                  BLK_PERM_ALL);
+    if (blk_insert_bs(blk, bs, errp)) {
+        bdrv_unref(bs);
+        return NULL;
+    }
+    blk_set_allow_write_beyond_eof(blk, true);
+    bdrv_unref(bs);
+
+    if (size != -1) {
+        ret = vmdk_init_extent(blk, size, flat, compress, zeroed_grain, errp);
+        if (ret) {
+            blk_unref(blk);
+            blk = NULL;
+        }
+    }
+    return blk;
+}
+
+static int coroutine_fn vmdk_co_create(BlockdevCreateOptions *create_options,
+                                       Error **errp)
+{
+    int ret;
+    BlockdevCreateOptionsVmdk *opts;
+
+    opts = &create_options->u.vmdk;
+
+    /* Validate options */
+    if (!QEMU_IS_ALIGNED(opts->size, BDRV_SECTOR_SIZE)) {
+        error_setg(errp, "Image size must be a multiple of 512 bytes");
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = vmdk_co_do_create(opts->size,
+                            opts->subformat,
+                            opts->adapter_type,
+                            opts->backing_file,
+                            opts->hwversion,
+                            false,
+                            opts->zeroed_grain,
+                            vmdk_co_create_cb,
+                            opts, errp);
+    return ret;
+
+out:
     return ret;
 }
 
@@ -2217,6 +2809,7 @@ static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent)
 {
     ImageInfo *info = g_new0(ImageInfo, 1);
 
+    bdrv_refresh_filename(extent->file->bs);
     *info = (ImageInfo){
         .filename         = g_strdup(extent->file->bs->filename),
         .format           = g_strdup(extent->type),
@@ -2230,8 +2823,9 @@ static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent)
     return info;
 }
 
-static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
-                      BdrvCheckMode fix)
+static int coroutine_fn vmdk_co_check(BlockDriverState *bs,
+                                      BdrvCheckResult *result,
+                                      BdrvCheckMode fix)
 {
     BDRVVmdkState *s = bs->opaque;
     VmdkExtent *extent = NULL;
@@ -2289,7 +2883,8 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result,
     return ret;
 }
 
-static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs)
+static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs,
+                                                 Error **errp)
 {
     int i;
     BDRVVmdkState *s = bs->opaque;
@@ -2346,6 +2941,23 @@ static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
     return 0;
 }
 
+static void vmdk_gather_child_options(BlockDriverState *bs, QDict *target,
+                                      bool backing_overridden)
+{
+    /* No children but file and backing can be explicitly specified (TODO) */
+    qdict_put(target, "file",
+              qobject_ref(bs->file->bs->full_open_options));
+
+    if (backing_overridden) {
+        if (bs->backing) {
+            qdict_put(target, "backing",
+                      qobject_ref(bs->backing->bs->full_open_options));
+        } else {
+            qdict_put_null(target, "backing");
+        }
+    }
+}
+
 static QemuOptsList vmdk_create_opts = {
     .name = "vmdk-create-opts",
     .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head),
@@ -2400,7 +3012,7 @@ static BlockDriver bdrv_vmdk = {
     .instance_size                = sizeof(BDRVVmdkState),
     .bdrv_probe                   = vmdk_probe,
     .bdrv_open                    = vmdk_open,
-    .bdrv_check                   = vmdk_check,
+    .bdrv_co_check                = vmdk_co_check,
     .bdrv_reopen_prepare          = vmdk_reopen_prepare,
     .bdrv_child_perm              = bdrv_format_default_perms,
     .bdrv_co_preadv               = vmdk_co_preadv,
@@ -2408,14 +3020,16 @@ static BlockDriver bdrv_vmdk = {
     .bdrv_co_pwritev_compressed   = vmdk_co_pwritev_compressed,
     .bdrv_co_pwrite_zeroes        = vmdk_co_pwrite_zeroes,
     .bdrv_close                   = vmdk_close,
-    .bdrv_create                  = vmdk_create,
+    .bdrv_co_create_opts          = vmdk_co_create_opts,
+    .bdrv_co_create               = vmdk_co_create,
     .bdrv_co_flush_to_disk        = vmdk_co_flush,
-    .bdrv_co_get_block_status     = vmdk_co_get_block_status,
+    .bdrv_co_block_status         = vmdk_co_block_status,
     .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
     .bdrv_has_zero_init           = vmdk_has_zero_init,
     .bdrv_get_specific_info       = vmdk_get_specific_info,
     .bdrv_refresh_limits          = vmdk_refresh_limits,
     .bdrv_get_info                = vmdk_get_info,
+    .bdrv_gather_child_options    = vmdk_gather_child_options,
 
     .supports_backing             = true,
     .create_opts                  = &vmdk_create_opts,