]> git.proxmox.com Git - pve-qemu-kvm.git/blame - debian/patches/pve/0011-introduce-new-vma-archive-format.patch
bump version to 2.9.0-1~rc2+5
[pve-qemu-kvm.git] / debian / patches / pve / 0011-introduce-new-vma-archive-format.patch
CommitLineData
1a91ab45 1From c1338b34ccac2c5e6d7d1aca3ca3e3457a3f744c Mon Sep 17 00:00:00 2001
ca0fe5f5
WB
2From: Dietmar Maurer <dietmar@proxmox.com>
3Date: Tue, 13 Nov 2012 11:11:38 +0100
adeb0c7a 4Subject: [PATCH 11/48] introduce new vma archive format
ca0fe5f5
WB
5
6This is a very simple archive format, see docs/specs/vma_spec.txt
7
8Signed-off-by: Dietmar Maurer <dietmar@proxmox.com>
9---
10 Makefile | 3 +-
11 Makefile.objs | 1 +
68a30562
WB
12 vma-reader.c | 797 +++++++++++++++++++++++++++++++++++++++++++++++++++++
13 vma-writer.c | 870 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1a91ab45 14 vma.c | 586 +++++++++++++++++++++++++++++++++++++++
ca0fe5f5 15 vma.h | 146 ++++++++++
1a91ab45 16 6 files changed, 2402 insertions(+), 1 deletion(-)
ca0fe5f5
WB
17 create mode 100644 vma-reader.c
18 create mode 100644 vma-writer.c
19 create mode 100644 vma.c
20 create mode 100644 vma.h
21
22diff --git a/Makefile b/Makefile
1a91ab45 23index 6c359b2..edbc8b5 100644
ca0fe5f5
WB
24--- a/Makefile
25+++ b/Makefile
1a91ab45 26@@ -284,7 +284,7 @@ ifneq ($(wildcard config-host.mak),)
68a30562 27 include $(SRC_PATH)/tests/Makefile.include
ca0fe5f5
WB
28 endif
29
30-all: $(DOCS) $(TOOLS) $(HELPERS-y) recurse-all modules
31+all: $(DOCS) $(TOOLS) vma$(EXESUF) $(HELPERS-y) recurse-all modules
32
68a30562
WB
33 qemu-version.h: FORCE
34 $(call quiet-command, \
1a91ab45
WB
35@@ -377,6 +377,7 @@ qemu-img.o: qemu-img-cmds.h
36 qemu-img$(EXESUF): qemu-img.o $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) $(COMMON_LDADDS)
37 qemu-nbd$(EXESUF): qemu-nbd.o $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) $(COMMON_LDADDS)
38 qemu-io$(EXESUF): qemu-io.o $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) $(COMMON_LDADDS)
39+vma$(EXESUF): vma.o vma-reader.o $(block-obj-y) $(crypto-obj-y) $(io-obj-y) $(qom-obj-y) $(COMMON_LDADDS)
ca0fe5f5 40
1a91ab45 41 qemu-bridge-helper$(EXESUF): qemu-bridge-helper.o $(COMMON_LDADDS)
ca0fe5f5
WB
42
43diff --git a/Makefile.objs b/Makefile.objs
1a91ab45 44index 6167e7b..9b12ee6 100644
ca0fe5f5
WB
45--- a/Makefile.objs
46+++ b/Makefile.objs
1a91ab45 47@@ -14,6 +14,7 @@ block-obj-y += block.o blockjob.o
ca0fe5f5
WB
48 block-obj-y += block/
49 block-obj-y += qemu-io-cmds.o
1a91ab45 50 block-obj-$(CONFIG_REPLICATION) += replication.o
ca0fe5f5
WB
51+block-obj-y += vma-writer.o
52
53 block-obj-m = block/
54
55diff --git a/vma-reader.c b/vma-reader.c
56new file mode 100644
68a30562 57index 0000000..51dd8fe
ca0fe5f5
WB
58--- /dev/null
59+++ b/vma-reader.c
68a30562 60@@ -0,0 +1,797 @@
ca0fe5f5
WB
61+/*
62+ * VMA: Virtual Machine Archive
63+ *
64+ * Copyright (C) 2012 Proxmox Server Solutions
65+ *
66+ * Authors:
67+ * Dietmar Maurer (dietmar@proxmox.com)
68+ *
69+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
70+ * See the COPYING file in the top-level directory.
71+ *
72+ */
73+
68a30562 74+#include "qemu/osdep.h"
ca0fe5f5
WB
75+#include <glib.h>
76+#include <uuid/uuid.h>
77+
78+#include "qemu-common.h"
79+#include "qemu/timer.h"
80+#include "qemu/ratelimit.h"
81+#include "vma.h"
82+#include "block/block.h"
68a30562 83+#include "sysemu/block-backend.h"
ca0fe5f5
WB
84+
85+static unsigned char zero_vma_block[VMA_BLOCK_SIZE];
86+
87+typedef struct VmaRestoreState {
88+ BlockDriverState *bs;
89+ bool write_zeroes;
90+ unsigned long *bitmap;
91+ int bitmap_size;
92+} VmaRestoreState;
93+
94+struct VmaReader {
95+ int fd;
96+ GChecksum *md5csum;
97+ GHashTable *blob_hash;
98+ unsigned char *head_data;
99+ VmaDeviceInfo devinfo[256];
100+ VmaRestoreState rstate[256];
101+ GList *cdata_list;
102+ guint8 vmstate_stream;
103+ uint32_t vmstate_clusters;
104+ /* to show restore percentage if run with -v */
105+ time_t start_time;
106+ int64_t cluster_count;
107+ int64_t clusters_read;
108+ int clusters_read_per;
109+};
110+
111+static guint
112+g_int32_hash(gconstpointer v)
113+{
114+ return *(const uint32_t *)v;
115+}
116+
117+static gboolean
118+g_int32_equal(gconstpointer v1, gconstpointer v2)
119+{
120+ return *((const uint32_t *)v1) == *((const uint32_t *)v2);
121+}
122+
123+static int vma_reader_get_bitmap(VmaRestoreState *rstate, int64_t cluster_num)
124+{
125+ assert(rstate);
126+ assert(rstate->bitmap);
127+
128+ unsigned long val, idx, bit;
129+
130+ idx = cluster_num / BITS_PER_LONG;
131+
132+ assert(rstate->bitmap_size > idx);
133+
134+ bit = cluster_num % BITS_PER_LONG;
135+ val = rstate->bitmap[idx];
136+
137+ return !!(val & (1UL << bit));
138+}
139+
140+static void vma_reader_set_bitmap(VmaRestoreState *rstate, int64_t cluster_num,
141+ int dirty)
142+{
143+ assert(rstate);
144+ assert(rstate->bitmap);
145+
146+ unsigned long val, idx, bit;
147+
148+ idx = cluster_num / BITS_PER_LONG;
149+
150+ assert(rstate->bitmap_size > idx);
151+
152+ bit = cluster_num % BITS_PER_LONG;
153+ val = rstate->bitmap[idx];
154+ if (dirty) {
155+ if (!(val & (1UL << bit))) {
156+ val |= 1UL << bit;
157+ }
158+ } else {
159+ if (val & (1UL << bit)) {
160+ val &= ~(1UL << bit);
161+ }
162+ }
163+ rstate->bitmap[idx] = val;
164+}
165+
/* One length-prefixed entry of the header blob buffer. */
typedef struct VmaBlob {
    uint32_t start;  /* offset relative to the blob buffer; used as hash key */
    uint32_t len;    /* payload length in bytes */
    void *data;      /* points into VmaReader.head_data */
} VmaBlob;
171+
172+static const VmaBlob *get_header_blob(VmaReader *vmar, uint32_t pos)
173+{
174+ assert(vmar);
175+ assert(vmar->blob_hash);
176+
177+ return g_hash_table_lookup(vmar->blob_hash, &pos);
178+}
179+
180+static const char *get_header_str(VmaReader *vmar, uint32_t pos)
181+{
182+ const VmaBlob *blob = get_header_blob(vmar, pos);
183+ if (!blob) {
184+ return NULL;
185+ }
186+ const char *res = (char *)blob->data;
187+ if (res[blob->len-1] != '\0') {
188+ return NULL;
189+ }
190+ return res;
191+}
192+
/* read(2) wrapper that restarts the call when it is interrupted (EINTR). */
static ssize_t
safe_read(int fd, unsigned char *buf, size_t count)
{
    for (;;) {
        ssize_t got = read(fd, buf, count);

        if (got >= 0 || errno != EINTR) {
            return got;
        }
    }
}
204+
/*
 * Read up to len bytes, looping over short reads.
 *
 * Returns the number of bytes read, which is less than len only when EOF
 * was reached, or -1 on a read error.
 */
static ssize_t
full_read(int fd, unsigned char *buf, size_t len)
{
    size_t done = 0;

    while (done < len) {
        ssize_t got = safe_read(fd, buf + done, len - done);

        if (got < 0) {
            return -1;
        }
        if (got == 0) {
            break; /* EOF: report the short count */
        }
        done += got;
    }

    return done;
}
235+
236+void vma_reader_destroy(VmaReader *vmar)
237+{
238+ assert(vmar);
239+
240+ if (vmar->fd >= 0) {
241+ close(vmar->fd);
242+ }
243+
244+ if (vmar->cdata_list) {
245+ g_list_free(vmar->cdata_list);
246+ }
247+
248+ int i;
249+ for (i = 1; i < 256; i++) {
250+ if (vmar->rstate[i].bitmap) {
251+ g_free(vmar->rstate[i].bitmap);
252+ }
253+ }
254+
255+ if (vmar->md5csum) {
256+ g_checksum_free(vmar->md5csum);
257+ }
258+
259+ if (vmar->blob_hash) {
260+ g_hash_table_destroy(vmar->blob_hash);
261+ }
262+
263+ if (vmar->head_data) {
264+ g_free(vmar->head_data);
265+ }
266+
267+ g_free(vmar);
268+
269+};
270+
271+static int vma_reader_read_head(VmaReader *vmar, Error **errp)
272+{
273+ assert(vmar);
274+ assert(errp);
275+ assert(*errp == NULL);
276+
277+ unsigned char md5sum[16];
278+ int i;
279+ int ret = 0;
280+
281+ vmar->head_data = g_malloc(sizeof(VmaHeader));
282+
283+ if (full_read(vmar->fd, vmar->head_data, sizeof(VmaHeader)) !=
284+ sizeof(VmaHeader)) {
285+ error_setg(errp, "can't read vma header - %s",
286+ errno ? g_strerror(errno) : "got EOF");
287+ return -1;
288+ }
289+
290+ VmaHeader *h = (VmaHeader *)vmar->head_data;
291+
292+ if (h->magic != VMA_MAGIC) {
293+ error_setg(errp, "not a vma file - wrong magic number");
294+ return -1;
295+ }
296+
297+ uint32_t header_size = GUINT32_FROM_BE(h->header_size);
298+ int need = header_size - sizeof(VmaHeader);
299+ if (need <= 0) {
300+ error_setg(errp, "wrong vma header size %d", header_size);
301+ return -1;
302+ }
303+
304+ vmar->head_data = g_realloc(vmar->head_data, header_size);
305+ h = (VmaHeader *)vmar->head_data;
306+
307+ if (full_read(vmar->fd, vmar->head_data + sizeof(VmaHeader), need) !=
308+ need) {
309+ error_setg(errp, "can't read vma header data - %s",
310+ errno ? g_strerror(errno) : "got EOF");
311+ return -1;
312+ }
313+
314+ memcpy(md5sum, h->md5sum, 16);
315+ memset(h->md5sum, 0, 16);
316+
317+ g_checksum_reset(vmar->md5csum);
318+ g_checksum_update(vmar->md5csum, vmar->head_data, header_size);
319+ gsize csize = 16;
320+ g_checksum_get_digest(vmar->md5csum, (guint8 *)(h->md5sum), &csize);
321+
322+ if (memcmp(md5sum, h->md5sum, 16) != 0) {
323+ error_setg(errp, "wrong vma header chechsum");
324+ return -1;
325+ }
326+
327+ /* we can modify header data after checksum verify */
328+ h->header_size = header_size;
329+
330+ h->version = GUINT32_FROM_BE(h->version);
331+ if (h->version != 1) {
332+ error_setg(errp, "wrong vma version %d", h->version);
333+ return -1;
334+ }
335+
336+ h->ctime = GUINT64_FROM_BE(h->ctime);
337+ h->blob_buffer_offset = GUINT32_FROM_BE(h->blob_buffer_offset);
338+ h->blob_buffer_size = GUINT32_FROM_BE(h->blob_buffer_size);
339+
340+ uint32_t bstart = h->blob_buffer_offset + 1;
341+ uint32_t bend = h->blob_buffer_offset + h->blob_buffer_size;
342+
343+ if (bstart <= sizeof(VmaHeader)) {
344+ error_setg(errp, "wrong vma blob buffer offset %d",
345+ h->blob_buffer_offset);
346+ return -1;
347+ }
348+
349+ if (bend > header_size) {
350+ error_setg(errp, "wrong vma blob buffer size %d/%d",
351+ h->blob_buffer_offset, h->blob_buffer_size);
352+ return -1;
353+ }
354+
355+ while ((bstart + 2) <= bend) {
356+ uint32_t size = vmar->head_data[bstart] +
357+ (vmar->head_data[bstart+1] << 8);
358+ if ((bstart + size + 2) <= bend) {
359+ VmaBlob *blob = g_new0(VmaBlob, 1);
360+ blob->start = bstart - h->blob_buffer_offset;
361+ blob->len = size;
362+ blob->data = vmar->head_data + bstart + 2;
363+ g_hash_table_insert(vmar->blob_hash, &blob->start, blob);
364+ }
365+ bstart += size + 2;
366+ }
367+
368+
369+ int count = 0;
370+ for (i = 1; i < 256; i++) {
371+ VmaDeviceInfoHeader *dih = &h->dev_info[i];
372+ uint32_t devname_ptr = GUINT32_FROM_BE(dih->devname_ptr);
373+ uint64_t size = GUINT64_FROM_BE(dih->size);
374+ const char *devname = get_header_str(vmar, devname_ptr);
375+
376+ if (size && devname) {
377+ count++;
378+ vmar->devinfo[i].size = size;
379+ vmar->devinfo[i].devname = devname;
380+
381+ if (strcmp(devname, "vmstate") == 0) {
382+ vmar->vmstate_stream = i;
383+ }
384+ }
385+ }
386+
387+ if (!count) {
388+ error_setg(errp, "vma does not contain data");
389+ return -1;
390+ }
391+
392+ for (i = 0; i < VMA_MAX_CONFIGS; i++) {
393+ uint32_t name_ptr = GUINT32_FROM_BE(h->config_names[i]);
394+ uint32_t data_ptr = GUINT32_FROM_BE(h->config_data[i]);
395+
396+ if (!(name_ptr && data_ptr)) {
397+ continue;
398+ }
399+ const char *name = get_header_str(vmar, name_ptr);
400+ const VmaBlob *blob = get_header_blob(vmar, data_ptr);
401+
402+ if (!(name && blob)) {
403+ error_setg(errp, "vma contains invalid data pointers");
404+ return -1;
405+ }
406+
407+ VmaConfigData *cdata = g_new0(VmaConfigData, 1);
408+ cdata->name = name;
409+ cdata->data = blob->data;
410+ cdata->len = blob->len;
411+
412+ vmar->cdata_list = g_list_append(vmar->cdata_list, cdata);
413+ }
414+
415+ return ret;
416+};
417+
418+VmaReader *vma_reader_create(const char *filename, Error **errp)
419+{
420+ assert(filename);
421+ assert(errp);
422+
423+ VmaReader *vmar = g_new0(VmaReader, 1);
424+
425+ if (strcmp(filename, "-") == 0) {
426+ vmar->fd = dup(0);
427+ } else {
428+ vmar->fd = open(filename, O_RDONLY);
429+ }
430+
431+ if (vmar->fd < 0) {
432+ error_setg(errp, "can't open file %s - %s\n", filename,
433+ g_strerror(errno));
434+ goto err;
435+ }
436+
437+ vmar->md5csum = g_checksum_new(G_CHECKSUM_MD5);
438+ if (!vmar->md5csum) {
439+ error_setg(errp, "can't allocate cmsum\n");
440+ goto err;
441+ }
442+
443+ vmar->blob_hash = g_hash_table_new_full(g_int32_hash, g_int32_equal,
444+ NULL, g_free);
445+
446+ if (vma_reader_read_head(vmar, errp) < 0) {
447+ goto err;
448+ }
449+
450+ return vmar;
451+
452+err:
453+ if (vmar) {
454+ vma_reader_destroy(vmar);
455+ }
456+
457+ return NULL;
458+}
459+
460+VmaHeader *vma_reader_get_header(VmaReader *vmar)
461+{
462+ assert(vmar);
463+ assert(vmar->head_data);
464+
465+ return (VmaHeader *)(vmar->head_data);
466+}
467+
468+GList *vma_reader_get_config_data(VmaReader *vmar)
469+{
470+ assert(vmar);
471+ assert(vmar->head_data);
472+
473+ return vmar->cdata_list;
474+}
475+
476+VmaDeviceInfo *vma_reader_get_device_info(VmaReader *vmar, guint8 dev_id)
477+{
478+ assert(vmar);
479+ assert(dev_id);
480+
481+ if (vmar->devinfo[dev_id].size && vmar->devinfo[dev_id].devname) {
482+ return &vmar->devinfo[dev_id];
483+ }
484+
485+ return NULL;
486+}
487+
488+int vma_reader_register_bs(VmaReader *vmar, guint8 dev_id, BlockDriverState *bs,
489+ bool write_zeroes, Error **errp)
490+{
491+ assert(vmar);
492+ assert(bs != NULL);
493+ assert(dev_id);
494+ assert(vmar->rstate[dev_id].bs == NULL);
495+
496+ int64_t size = bdrv_getlength(bs);
68a30562
WB
497+ int64_t size_diff = size - vmar->devinfo[dev_id].size;
498+
499+ /* storage types can have different size restrictions, so it
500+ * is not always possible to create an image with exact size.
501+ * So we tolerate a size difference up to 4MB.
502+ */
503+ if ((size_diff < 0) || (size_diff > 4*1024*1024)) {
ca0fe5f5
WB
504+ error_setg(errp, "vma_reader_register_bs for stream %s failed - "
505+ "unexpected size %zd != %zd", vmar->devinfo[dev_id].devname,
506+ size, vmar->devinfo[dev_id].size);
507+ return -1;
508+ }
509+
510+ vmar->rstate[dev_id].bs = bs;
511+ vmar->rstate[dev_id].write_zeroes = write_zeroes;
512+
513+ int64_t bitmap_size = (size/BDRV_SECTOR_SIZE) +
514+ (VMA_CLUSTER_SIZE/BDRV_SECTOR_SIZE) * BITS_PER_LONG - 1;
515+ bitmap_size /= (VMA_CLUSTER_SIZE/BDRV_SECTOR_SIZE) * BITS_PER_LONG;
516+
517+ vmar->rstate[dev_id].bitmap_size = bitmap_size;
518+ vmar->rstate[dev_id].bitmap = g_new0(unsigned long, bitmap_size);
519+
520+ vmar->cluster_count += size/VMA_CLUSTER_SIZE;
521+
522+ return 0;
523+}
524+
/* write(2) wrapper that restarts the call when it is interrupted (EINTR). */
static ssize_t safe_write(int fd, void *buf, size_t count)
{
    ssize_t n;

    do {
        n = write(fd, buf, count);
    } while (n < 0 && errno == EINTR);

    return n;
}

/*
 * Write all len bytes of buf, looping over short writes.
 *
 * Returns len on success or -1 on a write error. The return type is
 * signed so that the error value survives in any caller, not only in
 * those that narrow the result to int.
 */
static ssize_t full_write(int fd, void *buf, size_t len)
{
    unsigned char *cursor = buf;
    size_t total = 0;

    while (total < len) {
        ssize_t n = safe_write(fd, cursor + total, len - total);
        if (n < 0) {
            return -1;
        }
        total += n;
    }

    return total;
}
560+
561+static int restore_write_data(VmaReader *vmar, guint8 dev_id,
562+ BlockDriverState *bs, int vmstate_fd,
563+ unsigned char *buf, int64_t sector_num,
564+ int nb_sectors, Error **errp)
565+{
566+ assert(vmar);
567+
568+ if (dev_id == vmar->vmstate_stream) {
569+ if (vmstate_fd >= 0) {
570+ int len = nb_sectors * BDRV_SECTOR_SIZE;
571+ int res = full_write(vmstate_fd, buf, len);
572+ if (res < 0) {
573+ error_setg(errp, "write vmstate failed %d", res);
574+ return -1;
575+ }
576+ }
577+ } else {
578+ int res = bdrv_write(bs, sector_num, buf, nb_sectors);
579+ if (res < 0) {
580+ error_setg(errp, "bdrv_write to %s failed (%d)",
581+ bdrv_get_device_name(bs), res);
582+ return -1;
583+ }
584+ }
585+ return 0;
586+}
587+static int restore_extent(VmaReader *vmar, unsigned char *buf,
588+ int extent_size, int vmstate_fd,
589+ bool verbose, Error **errp)
590+{
591+ assert(vmar);
592+ assert(buf);
593+
594+ VmaExtentHeader *ehead = (VmaExtentHeader *)buf;
595+ int start = VMA_EXTENT_HEADER_SIZE;
596+ int i;
597+
598+ for (i = 0; i < VMA_BLOCKS_PER_EXTENT; i++) {
599+ uint64_t block_info = GUINT64_FROM_BE(ehead->blockinfo[i]);
600+ uint64_t cluster_num = block_info & 0xffffffff;
601+ uint8_t dev_id = (block_info >> 32) & 0xff;
602+ uint16_t mask = block_info >> (32+16);
603+ int64_t max_sector;
604+
605+ if (!dev_id) {
606+ continue;
607+ }
608+
609+ VmaRestoreState *rstate = &vmar->rstate[dev_id];
610+ BlockDriverState *bs = NULL;
611+
612+ if (dev_id != vmar->vmstate_stream) {
613+ bs = rstate->bs;
614+ if (!bs) {
615+ error_setg(errp, "got wrong dev id %d", dev_id);
616+ return -1;
617+ }
618+
619+ if (vma_reader_get_bitmap(rstate, cluster_num)) {
620+ error_setg(errp, "found duplicated cluster %zd for stream %s",
621+ cluster_num, vmar->devinfo[dev_id].devname);
622+ return -1;
623+ }
624+ vma_reader_set_bitmap(rstate, cluster_num, 1);
625+
626+ max_sector = vmar->devinfo[dev_id].size/BDRV_SECTOR_SIZE;
627+ } else {
628+ max_sector = G_MAXINT64;
629+ if (cluster_num != vmar->vmstate_clusters) {
630+ error_setg(errp, "found out of order vmstate data");
631+ return -1;
632+ }
633+ vmar->vmstate_clusters++;
634+ }
635+
636+ vmar->clusters_read++;
637+
638+ if (verbose) {
639+ time_t duration = time(NULL) - vmar->start_time;
640+ int percent = (vmar->clusters_read*100)/vmar->cluster_count;
641+ if (percent != vmar->clusters_read_per) {
642+ printf("progress %d%% (read %zd bytes, duration %zd sec)\n",
643+ percent, vmar->clusters_read*VMA_CLUSTER_SIZE,
644+ duration);
645+ fflush(stdout);
646+ vmar->clusters_read_per = percent;
647+ }
648+ }
649+
650+ /* try to write whole clusters to speedup restore */
651+ if (mask == 0xffff) {
652+ if ((start + VMA_CLUSTER_SIZE) > extent_size) {
653+ error_setg(errp, "short vma extent - too many blocks");
654+ return -1;
655+ }
656+ int64_t sector_num = (cluster_num * VMA_CLUSTER_SIZE) /
657+ BDRV_SECTOR_SIZE;
658+ int64_t end_sector = sector_num +
659+ VMA_CLUSTER_SIZE/BDRV_SECTOR_SIZE;
660+
661+ if (end_sector > max_sector) {
662+ end_sector = max_sector;
663+ }
664+
665+ if (end_sector <= sector_num) {
666+ error_setg(errp, "got wrong block address - write bejond end");
667+ return -1;
668+ }
669+
670+ int nb_sectors = end_sector - sector_num;
671+ if (restore_write_data(vmar, dev_id, bs, vmstate_fd, buf + start,
672+ sector_num, nb_sectors, errp) < 0) {
673+ return -1;
674+ }
675+
676+ start += VMA_CLUSTER_SIZE;
677+ } else {
678+ int j;
679+ int bit = 1;
680+
681+ for (j = 0; j < 16; j++) {
682+ int64_t sector_num = (cluster_num*VMA_CLUSTER_SIZE +
683+ j*VMA_BLOCK_SIZE)/BDRV_SECTOR_SIZE;
684+
685+ int64_t end_sector = sector_num +
686+ VMA_BLOCK_SIZE/BDRV_SECTOR_SIZE;
687+ if (end_sector > max_sector) {
688+ end_sector = max_sector;
689+ }
690+
691+ if (mask & bit) {
692+ if ((start + VMA_BLOCK_SIZE) > extent_size) {
693+ error_setg(errp, "short vma extent - too many blocks");
694+ return -1;
695+ }
696+
697+ if (end_sector <= sector_num) {
698+ error_setg(errp, "got wrong block address - "
699+ "write bejond end");
700+ return -1;
701+ }
702+
703+ int nb_sectors = end_sector - sector_num;
704+ if (restore_write_data(vmar, dev_id, bs, vmstate_fd,
705+ buf + start, sector_num,
706+ nb_sectors, errp) < 0) {
707+ return -1;
708+ }
709+
710+ start += VMA_BLOCK_SIZE;
711+
712+ } else {
713+
714+ if (rstate->write_zeroes && (end_sector > sector_num)) {
715+ /* Todo: use bdrv_co_write_zeroes (but that need to
716+ * be run inside coroutine?)
717+ */
718+ int nb_sectors = end_sector - sector_num;
719+ if (restore_write_data(vmar, dev_id, bs, vmstate_fd,
720+ zero_vma_block, sector_num,
721+ nb_sectors, errp) < 0) {
722+ return -1;
723+ }
724+ }
725+ }
726+
727+ bit = bit << 1;
728+ }
729+ }
730+ }
731+
732+ if (start != extent_size) {
733+ error_setg(errp, "vma extent error - missing blocks");
734+ return -1;
735+ }
736+
737+ return 0;
738+}
739+
740+int vma_reader_restore(VmaReader *vmar, int vmstate_fd, bool verbose,
741+ Error **errp)
742+{
743+ assert(vmar);
744+ assert(vmar->head_data);
745+
746+ int ret = 0;
747+ unsigned char buf[VMA_MAX_EXTENT_SIZE];
748+ int buf_pos = 0;
749+ unsigned char md5sum[16];
750+ VmaHeader *h = (VmaHeader *)vmar->head_data;
751+
752+ vmar->start_time = time(NULL);
753+
754+ while (1) {
755+ int bytes = full_read(vmar->fd, buf + buf_pos, sizeof(buf) - buf_pos);
756+ if (bytes < 0) {
757+ error_setg(errp, "read failed - %s", g_strerror(errno));
758+ return -1;
759+ }
760+
761+ buf_pos += bytes;
762+
763+ if (!buf_pos) {
764+ break; /* EOF */
765+ }
766+
767+ if (buf_pos < VMA_EXTENT_HEADER_SIZE) {
768+ error_setg(errp, "read short extent (%d bytes)", buf_pos);
769+ return -1;
770+ }
771+
772+ VmaExtentHeader *ehead = (VmaExtentHeader *)buf;
773+
774+ /* extract md5sum */
775+ memcpy(md5sum, ehead->md5sum, sizeof(ehead->md5sum));
776+ memset(ehead->md5sum, 0, sizeof(ehead->md5sum));
777+
778+ g_checksum_reset(vmar->md5csum);
779+ g_checksum_update(vmar->md5csum, buf, VMA_EXTENT_HEADER_SIZE);
780+ gsize csize = 16;
781+ g_checksum_get_digest(vmar->md5csum, ehead->md5sum, &csize);
782+
783+ if (memcmp(md5sum, ehead->md5sum, 16) != 0) {
784+ error_setg(errp, "wrong vma extent header chechsum");
785+ return -1;
786+ }
787+
788+ if (memcmp(h->uuid, ehead->uuid, sizeof(ehead->uuid)) != 0) {
789+ error_setg(errp, "wrong vma extent uuid");
790+ return -1;
791+ }
792+
793+ if (ehead->magic != VMA_EXTENT_MAGIC || ehead->reserved1 != 0) {
794+ error_setg(errp, "wrong vma extent header magic");
795+ return -1;
796+ }
797+
798+ int block_count = GUINT16_FROM_BE(ehead->block_count);
799+ int extent_size = VMA_EXTENT_HEADER_SIZE + block_count*VMA_BLOCK_SIZE;
800+
801+ if (buf_pos < extent_size) {
802+ error_setg(errp, "short vma extent (%d < %d)", buf_pos,
803+ extent_size);
804+ return -1;
805+ }
806+
807+ if (restore_extent(vmar, buf, extent_size, vmstate_fd, verbose,
808+ errp) < 0) {
809+ return -1;
810+ }
811+
812+ if (buf_pos > extent_size) {
813+ memmove(buf, buf + extent_size, buf_pos - extent_size);
814+ buf_pos = buf_pos - extent_size;
815+ } else {
816+ buf_pos = 0;
817+ }
818+ }
819+
820+ bdrv_drain_all();
821+
822+ int i;
823+ for (i = 1; i < 256; i++) {
824+ VmaRestoreState *rstate = &vmar->rstate[i];
825+ if (!rstate->bs) {
826+ continue;
827+ }
828+
829+ if (bdrv_flush(rstate->bs) < 0) {
830+ error_setg(errp, "vma bdrv_flush %s failed",
831+ vmar->devinfo[i].devname);
832+ return -1;
833+ }
834+
835+ if (vmar->devinfo[i].size &&
836+ (strcmp(vmar->devinfo[i].devname, "vmstate") != 0)) {
837+ assert(rstate->bitmap);
838+
839+ int64_t cluster_num, end;
840+
841+ end = (vmar->devinfo[i].size + VMA_CLUSTER_SIZE - 1) /
842+ VMA_CLUSTER_SIZE;
843+
844+ for (cluster_num = 0; cluster_num < end; cluster_num++) {
845+ if (!vma_reader_get_bitmap(rstate, cluster_num)) {
846+ error_setg(errp, "detected missing cluster %zd "
847+ "for stream %s", cluster_num,
848+ vmar->devinfo[i].devname);
849+ return -1;
850+ }
851+ }
852+ }
853+ }
854+
855+ return ret;
856+}
857+
858diff --git a/vma-writer.c b/vma-writer.c
859new file mode 100644
68a30562 860index 0000000..b0cf529
ca0fe5f5
WB
861--- /dev/null
862+++ b/vma-writer.c
68a30562 863@@ -0,0 +1,870 @@
ca0fe5f5
WB
864+/*
865+ * VMA: Virtual Machine Archive
866+ *
867+ * Copyright (C) 2012 Proxmox Server Solutions
868+ *
869+ * Authors:
870+ * Dietmar Maurer (dietmar@proxmox.com)
871+ *
872+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
873+ * See the COPYING file in the top-level directory.
874+ *
875+ */
876+
68a30562 877+#include "qemu/osdep.h"
ca0fe5f5
WB
878+#include <glib.h>
879+#include <uuid/uuid.h>
880+
ca0fe5f5
WB
881+#include "vma.h"
882+#include "block/block.h"
883+#include "monitor/monitor.h"
884+#include "qemu/main-loop.h"
68a30562
WB
885+#include "qemu/coroutine.h"
886+#include "qemu/cutils.h"
ca0fe5f5
WB
887+
888+#define DEBUG_VMA 0
889+
890+#define DPRINTF(fmt, ...)\
891+ do { if (DEBUG_VMA) { printf("vma: " fmt, ## __VA_ARGS__); } } while (0)
892+
893+#define WRITE_BUFFERS 5
894+
895+typedef struct VmaAIOCB VmaAIOCB;
896+struct VmaAIOCB {
897+ unsigned char buffer[VMA_MAX_EXTENT_SIZE];
898+ VmaWriter *vmaw;
899+ size_t bytes;
900+ Coroutine *co;
901+};
902+
903+struct VmaWriter {
904+ int fd;
905+ FILE *cmd;
906+ int status;
907+ char errmsg[8192];
908+ uuid_t uuid;
909+ bool header_written;
910+ bool closed;
911+
912+ /* we always write extents */
913+ unsigned char outbuf[VMA_MAX_EXTENT_SIZE];
914+ int outbuf_pos; /* in bytes */
915+ int outbuf_count; /* in VMA_BLOCKS */
916+ uint64_t outbuf_block_info[VMA_BLOCKS_PER_EXTENT];
917+
918+ VmaAIOCB *aiocbs[WRITE_BUFFERS];
919+ CoQueue wqueue;
920+
921+ GChecksum *md5csum;
922+ CoMutex writer_lock;
923+ CoMutex flush_lock;
924+ Coroutine *co_writer;
925+
926+ /* drive informations */
927+ VmaStreamInfo stream_info[256];
928+ guint stream_count;
929+
930+ guint8 vmstate_stream;
931+ uint32_t vmstate_clusters;
932+
933+ /* header blob table */
934+ char *header_blob_table;
935+ uint32_t header_blob_table_size;
936+ uint32_t header_blob_table_pos;
937+
938+ /* store for config blobs */
939+ uint32_t config_names[VMA_MAX_CONFIGS]; /* offset into blob_buffer table */
940+ uint32_t config_data[VMA_MAX_CONFIGS]; /* offset into blob_buffer table */
941+ uint32_t config_count;
942+};
943+
944+void vma_writer_set_error(VmaWriter *vmaw, const char *fmt, ...)
945+{
946+ va_list ap;
947+
948+ if (vmaw->status < 0) {
949+ return;
950+ }
951+
952+ vmaw->status = -1;
953+
954+ va_start(ap, fmt);
955+ g_vsnprintf(vmaw->errmsg, sizeof(vmaw->errmsg), fmt, ap);
956+ va_end(ap);
957+
958+ DPRINTF("vma_writer_set_error: %s\n", vmaw->errmsg);
959+}
960+
961+static uint32_t allocate_header_blob(VmaWriter *vmaw, const char *data,
962+ size_t len)
963+{
964+ if (len > 65535) {
965+ return 0;
966+ }
967+
968+ if (!vmaw->header_blob_table ||
969+ (vmaw->header_blob_table_size <
970+ (vmaw->header_blob_table_pos + len + 2))) {
971+ int newsize = vmaw->header_blob_table_size + ((len + 2 + 511)/512)*512;
972+
973+ vmaw->header_blob_table = g_realloc(vmaw->header_blob_table, newsize);
974+ memset(vmaw->header_blob_table + vmaw->header_blob_table_size,
975+ 0, newsize - vmaw->header_blob_table_size);
976+ vmaw->header_blob_table_size = newsize;
977+ }
978+
979+ uint32_t cpos = vmaw->header_blob_table_pos;
980+ vmaw->header_blob_table[cpos] = len & 255;
981+ vmaw->header_blob_table[cpos+1] = (len >> 8) & 255;
982+ memcpy(vmaw->header_blob_table + cpos + 2, data, len);
983+ vmaw->header_blob_table_pos += len + 2;
984+ return cpos;
985+}
986+
987+static uint32_t allocate_header_string(VmaWriter *vmaw, const char *str)
988+{
989+ assert(vmaw);
990+
991+ size_t len = strlen(str) + 1;
992+
993+ return allocate_header_blob(vmaw, str, len);
994+}
995+
996+int vma_writer_add_config(VmaWriter *vmaw, const char *name, gpointer data,
997+ gsize len)
998+{
999+ assert(vmaw);
1000+ assert(!vmaw->header_written);
1001+ assert(vmaw->config_count < VMA_MAX_CONFIGS);
1002+ assert(name);
1003+ assert(data);
1004+ assert(len);
1005+
1006+ gchar *basename = g_path_get_basename(name);
1007+ uint32_t name_ptr = allocate_header_string(vmaw, basename);
1008+ g_free(basename);
1009+
1010+ if (!name_ptr) {
1011+ return -1;
1012+ }
1013+
1014+ uint32_t data_ptr = allocate_header_blob(vmaw, data, len);
1015+ if (!data_ptr) {
1016+ return -1;
1017+ }
1018+
1019+ vmaw->config_names[vmaw->config_count] = name_ptr;
1020+ vmaw->config_data[vmaw->config_count] = data_ptr;
1021+
1022+ vmaw->config_count++;
1023+
1024+ return 0;
1025+}
1026+
1027+int vma_writer_register_stream(VmaWriter *vmaw, const char *devname,
1028+ size_t size)
1029+{
1030+ assert(vmaw);
1031+ assert(devname);
1032+ assert(!vmaw->status);
1033+
1034+ if (vmaw->header_written) {
1035+ vma_writer_set_error(vmaw, "vma_writer_register_stream: header "
1036+ "already written");
1037+ return -1;
1038+ }
1039+
1040+ guint n = vmaw->stream_count + 1;
1041+
1042+ /* we can have dev_ids form 1 to 255 (0 reserved)
1043+ * 255(-1) reseverd for safety
1044+ */
1045+ if (n > 254) {
1046+ vma_writer_set_error(vmaw, "vma_writer_register_stream: "
1047+ "too many drives");
1048+ return -1;
1049+ }
1050+
1051+ if (size <= 0) {
1052+ vma_writer_set_error(vmaw, "vma_writer_register_stream: "
1053+ "got strange size %zd", size);
1054+ return -1;
1055+ }
1056+
1057+ DPRINTF("vma_writer_register_stream %s %zu %d\n", devname, size, n);
1058+
1059+ vmaw->stream_info[n].devname = g_strdup(devname);
1060+ vmaw->stream_info[n].size = size;
1061+
1062+ vmaw->stream_info[n].cluster_count = (size + VMA_CLUSTER_SIZE - 1) /
1063+ VMA_CLUSTER_SIZE;
1064+
1065+ vmaw->stream_count = n;
1066+
1067+ if (strcmp(devname, "vmstate") == 0) {
1068+ vmaw->vmstate_stream = n;
1069+ }
1070+
1071+ return n;
1072+}
1073+
1074+static void vma_co_continue_write(void *opaque)
1075+{
1076+ VmaWriter *vmaw = opaque;
1077+
1078+ DPRINTF("vma_co_continue_write\n");
68a30562 1079+ qemu_coroutine_enter(vmaw->co_writer);
ca0fe5f5
WB
1080+}
1081+
1082+static ssize_t coroutine_fn
1083+vma_co_write(VmaWriter *vmaw, const void *buf, size_t bytes)
1084+{
1085+ size_t done = 0;
1086+ ssize_t ret;
1087+
1088+ /* atomic writes (we cannot interleave writes) */
1089+ qemu_co_mutex_lock(&vmaw->writer_lock);
1090+
1091+ DPRINTF("vma_co_write enter %zd\n", bytes);
1092+
1093+ assert(vmaw->co_writer == NULL);
1094+
1095+ vmaw->co_writer = qemu_coroutine_self();
1096+
68a30562 1097+ aio_set_fd_handler(qemu_get_aio_context(), vmaw->fd, false, NULL, vma_co_continue_write, vmaw);
ca0fe5f5
WB
1098+
1099+ DPRINTF("vma_co_write wait until writable\n");
1100+ qemu_coroutine_yield();
1101+ DPRINTF("vma_co_write starting %zd\n", bytes);
1102+
1103+ while (done < bytes) {
1104+ ret = write(vmaw->fd, buf + done, bytes - done);
1105+ if (ret > 0) {
1106+ done += ret;
1107+ DPRINTF("vma_co_write written %zd %zd\n", done, ret);
1108+ } else if (ret < 0) {
1109+ if (errno == EAGAIN || errno == EWOULDBLOCK) {
1110+ DPRINTF("vma_co_write yield %zd\n", done);
1111+ qemu_coroutine_yield();
1112+ DPRINTF("vma_co_write restart %zd\n", done);
1113+ } else {
1114+ vma_writer_set_error(vmaw, "vma_co_write write error - %s",
1115+ g_strerror(errno));
1116+ done = -1; /* always return failure for partial writes */
1117+ break;
1118+ }
1119+ } else if (ret == 0) {
1120+ /* should not happen - simply try again */
1121+ }
1122+ }
1123+
68a30562 1124+ aio_set_fd_handler(qemu_get_aio_context(), vmaw->fd, false, NULL, NULL, NULL);
ca0fe5f5
WB
1125+
1126+ vmaw->co_writer = NULL;
1127+
1128+ qemu_co_mutex_unlock(&vmaw->writer_lock);
1129+
1130+ DPRINTF("vma_co_write leave %zd\n", done);
1131+ return done;
1132+}
1133+
1134+static void coroutine_fn vma_co_writer_task(void *opaque)
1135+{
1136+ VmaAIOCB *cb = opaque;
1137+
1138+ DPRINTF("vma_co_writer_task start\n");
1139+
1140+ int64_t done = vma_co_write(cb->vmaw, cb->buffer, cb->bytes);
1141+ DPRINTF("vma_co_writer_task write done %zd\n", done);
1142+
1143+ if (done != cb->bytes) {
1144+ DPRINTF("vma_co_writer_task failed write %zd %zd", cb->bytes, done);
1145+ vma_writer_set_error(cb->vmaw, "vma_co_writer_task failed write %zd",
1146+ done);
1147+ }
1148+
1149+ cb->bytes = 0;
1150+
1151+ qemu_co_queue_next(&cb->vmaw->wqueue);
1152+
1153+ DPRINTF("vma_co_writer_task end\n");
1154+}
1155+
1156+static void coroutine_fn vma_queue_flush(VmaWriter *vmaw)
1157+{
1158+ DPRINTF("vma_queue_flush enter\n");
1159+
1160+ assert(vmaw);
1161+
1162+ while (1) {
1163+ int i;
1164+ VmaAIOCB *cb = NULL;
1165+ for (i = 0; i < WRITE_BUFFERS; i++) {
1166+ if (vmaw->aiocbs[i]->bytes) {
1167+ cb = vmaw->aiocbs[i];
1168+ DPRINTF("FOUND USED AIO BUFFER %d %zd\n", i,
1169+ vmaw->aiocbs[i]->bytes);
1170+ break;
1171+ }
1172+ }
1173+ if (!cb) {
1174+ break;
1175+ }
1176+ qemu_co_queue_wait(&vmaw->wqueue);
1177+ }
1178+
1179+ DPRINTF("vma_queue_flush leave\n");
1180+}
1181+
1182+/**
1183+ * NOTE: pipe buffer size in only 4096 bytes on linux (see 'ulimit -a')
1184+ * So we need to create a coroutione to allow 'parallel' execution.
1185+ */
1186+static ssize_t coroutine_fn
1187+vma_queue_write(VmaWriter *vmaw, const void *buf, size_t bytes)
1188+{
1189+ DPRINTF("vma_queue_write enter %zd\n", bytes);
1190+
1191+ assert(vmaw);
1192+ assert(buf);
1193+ assert(bytes <= VMA_MAX_EXTENT_SIZE);
1194+
1195+ VmaAIOCB *cb = NULL;
1196+ while (!cb) {
1197+ int i;
1198+ for (i = 0; i < WRITE_BUFFERS; i++) {
1199+ if (!vmaw->aiocbs[i]->bytes) {
1200+ cb = vmaw->aiocbs[i];
1201+ break;
1202+ }
1203+ }
1204+ if (!cb) {
1205+ qemu_co_queue_wait(&vmaw->wqueue);
1206+ }
1207+ }
1208+
1209+ memcpy(cb->buffer, buf, bytes);
1210+ cb->bytes = bytes;
1211+ cb->vmaw = vmaw;
1212+
1213+ DPRINTF("vma_queue_write start %zd\n", bytes);
1214+ cb->co = qemu_coroutine_create(vma_co_writer_task);
1215+ qemu_coroutine_enter(cb->co, cb);
1216+
1217+ DPRINTF("vma_queue_write leave\n");
1218+
1219+ return bytes;
1220+}
1221+
1222+VmaWriter *vma_writer_create(const char *filename, uuid_t uuid, Error **errp)
1223+{
1224+ const char *p;
1225+
1226+ assert(sizeof(VmaHeader) == (4096 + 8192));
1227+ assert(G_STRUCT_OFFSET(VmaHeader, config_names) == 2044);
1228+ assert(G_STRUCT_OFFSET(VmaHeader, config_data) == 3068);
1229+ assert(G_STRUCT_OFFSET(VmaHeader, dev_info) == 4096);
1230+ assert(sizeof(VmaExtentHeader) == 512);
1231+
1232+ VmaWriter *vmaw = g_new0(VmaWriter, 1);
1233+ vmaw->fd = -1;
1234+
1235+ vmaw->md5csum = g_checksum_new(G_CHECKSUM_MD5);
1236+ if (!vmaw->md5csum) {
1237+ error_setg(errp, "can't allocate cmsum\n");
1238+ goto err;
1239+ }
1240+
1241+ if (strstart(filename, "exec:", &p)) {
1242+ vmaw->cmd = popen(p, "w");
1243+ if (vmaw->cmd == NULL) {
1244+ error_setg(errp, "can't popen command '%s' - %s\n", p,
1245+ g_strerror(errno));
1246+ goto err;
1247+ }
1248+ vmaw->fd = fileno(vmaw->cmd);
1249+
1250+ /* try to use O_NONBLOCK and O_DIRECT */
1251+ fcntl(vmaw->fd, F_SETFL, fcntl(vmaw->fd, F_GETFL)|O_NONBLOCK);
1252+ fcntl(vmaw->fd, F_SETFL, fcntl(vmaw->fd, F_GETFL)|O_DIRECT);
1253+
1254+ } else {
1255+ struct stat st;
1256+ int oflags;
1257+ const char *tmp_id_str;
1258+
1259+ if ((stat(filename, &st) == 0) && S_ISFIFO(st.st_mode)) {
1260+ oflags = O_NONBLOCK|O_DIRECT|O_WRONLY;
1261+ vmaw->fd = qemu_open(filename, oflags, 0644);
1262+ } else if (strstart(filename, "/dev/fdset/", &tmp_id_str)) {
1263+ oflags = O_NONBLOCK|O_DIRECT|O_WRONLY;
1264+ vmaw->fd = qemu_open(filename, oflags, 0644);
1265+ } else if (strstart(filename, "/dev/fdname/", &tmp_id_str)) {
1266+ vmaw->fd = monitor_get_fd(cur_mon, tmp_id_str, errp);
1267+ if (vmaw->fd < 0) {
1268+ goto err;
1269+ }
1270+ /* try to use O_NONBLOCK and O_DIRECT */
1271+ fcntl(vmaw->fd, F_SETFL, fcntl(vmaw->fd, F_GETFL)|O_NONBLOCK);
1272+ fcntl(vmaw->fd, F_SETFL, fcntl(vmaw->fd, F_GETFL)|O_DIRECT);
1273+ } else {
1274+ oflags = O_NONBLOCK|O_DIRECT|O_WRONLY|O_CREAT|O_EXCL;
1275+ vmaw->fd = qemu_open(filename, oflags, 0644);
1276+ }
1277+
1278+ if (vmaw->fd < 0) {
1279+ error_setg(errp, "can't open file %s - %s\n", filename,
1280+ g_strerror(errno));
1281+ goto err;
1282+ }
1283+ }
1284+
1285+ /* we use O_DIRECT, so we need to align IO buffers */
1286+ int i;
1287+ for (i = 0; i < WRITE_BUFFERS; i++) {
1288+ vmaw->aiocbs[i] = qemu_memalign(512, sizeof(VmaAIOCB));
1289+ memset(vmaw->aiocbs[i], 0, sizeof(VmaAIOCB));
1290+ }
1291+
1292+ vmaw->outbuf_count = 0;
1293+ vmaw->outbuf_pos = VMA_EXTENT_HEADER_SIZE;
1294+
1295+ vmaw->header_blob_table_pos = 1; /* start at pos 1 */
1296+
1297+ qemu_co_mutex_init(&vmaw->writer_lock);
1298+ qemu_co_mutex_init(&vmaw->flush_lock);
1299+ qemu_co_queue_init(&vmaw->wqueue);
1300+
1301+ uuid_copy(vmaw->uuid, uuid);
1302+
1303+ return vmaw;
1304+
1305+err:
1306+ if (vmaw) {
1307+ if (vmaw->cmd) {
1308+ pclose(vmaw->cmd);
1309+ } else if (vmaw->fd >= 0) {
1310+ close(vmaw->fd);
1311+ }
1312+
1313+ if (vmaw->md5csum) {
1314+ g_checksum_free(vmaw->md5csum);
1315+ }
1316+
1317+ g_free(vmaw);
1318+ }
1319+
1320+ return NULL;
1321+}
1322+
1323+static int coroutine_fn vma_write_header(VmaWriter *vmaw)
1324+{
1325+ assert(vmaw);
1326+ int header_clusters = 8;
1327+ char buf[65536*header_clusters];
1328+ VmaHeader *head = (VmaHeader *)buf;
1329+
1330+ int i;
1331+
1332+ DPRINTF("VMA WRITE HEADER\n");
1333+
1334+ if (vmaw->status < 0) {
1335+ return vmaw->status;
1336+ }
1337+
1338+ memset(buf, 0, sizeof(buf));
1339+
1340+ head->magic = VMA_MAGIC;
1341+ head->version = GUINT32_TO_BE(1); /* v1 */
1342+ memcpy(head->uuid, vmaw->uuid, 16);
1343+
1344+ time_t ctime = time(NULL);
1345+ head->ctime = GUINT64_TO_BE(ctime);
1346+
1347+ if (!vmaw->stream_count) {
1348+ return -1;
1349+ }
1350+
1351+ for (i = 0; i < VMA_MAX_CONFIGS; i++) {
1352+ head->config_names[i] = GUINT32_TO_BE(vmaw->config_names[i]);
1353+ head->config_data[i] = GUINT32_TO_BE(vmaw->config_data[i]);
1354+ }
1355+
1356+ /* 32 bytes per device (12 used currently) = 8192 bytes max */
1357+ for (i = 1; i <= 254; i++) {
1358+ VmaStreamInfo *si = &vmaw->stream_info[i];
1359+ if (si->size) {
1360+ assert(si->devname);
1361+ uint32_t devname_ptr = allocate_header_string(vmaw, si->devname);
1362+ if (!devname_ptr) {
1363+ return -1;
1364+ }
1365+ head->dev_info[i].devname_ptr = GUINT32_TO_BE(devname_ptr);
1366+ head->dev_info[i].size = GUINT64_TO_BE(si->size);
1367+ }
1368+ }
1369+
1370+ uint32_t header_size = sizeof(VmaHeader) + vmaw->header_blob_table_size;
1371+ head->header_size = GUINT32_TO_BE(header_size);
1372+
1373+ if (header_size > sizeof(buf)) {
1374+ return -1; /* just to be sure */
1375+ }
1376+
1377+ uint32_t blob_buffer_offset = sizeof(VmaHeader);
1378+ memcpy(buf + blob_buffer_offset, vmaw->header_blob_table,
1379+ vmaw->header_blob_table_size);
1380+ head->blob_buffer_offset = GUINT32_TO_BE(blob_buffer_offset);
1381+ head->blob_buffer_size = GUINT32_TO_BE(vmaw->header_blob_table_pos);
1382+
1383+ g_checksum_reset(vmaw->md5csum);
1384+ g_checksum_update(vmaw->md5csum, (const guchar *)buf, header_size);
1385+ gsize csize = 16;
1386+ g_checksum_get_digest(vmaw->md5csum, (guint8 *)(head->md5sum), &csize);
1387+
1388+ return vma_queue_write(vmaw, buf, header_size);
1389+}
1390+
1391+static int coroutine_fn vma_writer_flush(VmaWriter *vmaw)
1392+{
1393+ assert(vmaw);
1394+
1395+ int ret;
1396+ int i;
1397+
1398+ if (vmaw->status < 0) {
1399+ return vmaw->status;
1400+ }
1401+
1402+ if (!vmaw->header_written) {
1403+ vmaw->header_written = true;
1404+ ret = vma_write_header(vmaw);
1405+ if (ret < 0) {
1406+ vma_writer_set_error(vmaw, "vma_writer_flush: write header failed");
1407+ return ret;
1408+ }
1409+ }
1410+
1411+ DPRINTF("VMA WRITE FLUSH %d %d\n", vmaw->outbuf_count, vmaw->outbuf_pos);
1412+
1413+
1414+ VmaExtentHeader *ehead = (VmaExtentHeader *)vmaw->outbuf;
1415+
1416+ ehead->magic = VMA_EXTENT_MAGIC;
1417+ ehead->reserved1 = 0;
1418+
1419+ for (i = 0; i < VMA_BLOCKS_PER_EXTENT; i++) {
1420+ ehead->blockinfo[i] = GUINT64_TO_BE(vmaw->outbuf_block_info[i]);
1421+ }
1422+
1423+ guint16 block_count = (vmaw->outbuf_pos - VMA_EXTENT_HEADER_SIZE) /
1424+ VMA_BLOCK_SIZE;
1425+
1426+ ehead->block_count = GUINT16_TO_BE(block_count);
1427+
1428+ memcpy(ehead->uuid, vmaw->uuid, sizeof(ehead->uuid));
1429+ memset(ehead->md5sum, 0, sizeof(ehead->md5sum));
1430+
1431+ g_checksum_reset(vmaw->md5csum);
1432+ g_checksum_update(vmaw->md5csum, vmaw->outbuf, VMA_EXTENT_HEADER_SIZE);
1433+ gsize csize = 16;
1434+ g_checksum_get_digest(vmaw->md5csum, ehead->md5sum, &csize);
1435+
1436+ int bytes = vmaw->outbuf_pos;
1437+ ret = vma_queue_write(vmaw, vmaw->outbuf, bytes);
1438+ if (ret != bytes) {
1439+ vma_writer_set_error(vmaw, "vma_writer_flush: failed write");
1440+ }
1441+
1442+ vmaw->outbuf_count = 0;
1443+ vmaw->outbuf_pos = VMA_EXTENT_HEADER_SIZE;
1444+
1445+ for (i = 0; i < VMA_BLOCKS_PER_EXTENT; i++) {
1446+ vmaw->outbuf_block_info[i] = 0;
1447+ }
1448+
1449+ return vmaw->status;
1450+}
1451+
1452+static int vma_count_open_streams(VmaWriter *vmaw)
1453+{
1454+ g_assert(vmaw != NULL);
1455+
1456+ int i;
1457+ int open_drives = 0;
1458+ for (i = 0; i <= 255; i++) {
1459+ if (vmaw->stream_info[i].size && !vmaw->stream_info[i].finished) {
1460+ open_drives++;
1461+ }
1462+ }
1463+
1464+ return open_drives;
1465+}
1466+
1467+/**
1468+ * all jobs should call this when there is no more data
1469+ * Returns: number of remaining stream (0 ==> finished)
1470+ */
1471+int coroutine_fn
1472+vma_writer_close_stream(VmaWriter *vmaw, uint8_t dev_id)
1473+{
1474+ g_assert(vmaw != NULL);
1475+
1476+ DPRINTF("vma_writer_set_status %d\n", dev_id);
1477+ if (!vmaw->stream_info[dev_id].size) {
1478+ vma_writer_set_error(vmaw, "vma_writer_close_stream: "
1479+ "no such stream %d", dev_id);
1480+ return -1;
1481+ }
1482+ if (vmaw->stream_info[dev_id].finished) {
1483+ vma_writer_set_error(vmaw, "vma_writer_close_stream: "
1484+ "stream already closed %d", dev_id);
1485+ return -1;
1486+ }
1487+
1488+ vmaw->stream_info[dev_id].finished = true;
1489+
1490+ int open_drives = vma_count_open_streams(vmaw);
1491+
1492+ if (open_drives <= 0) {
1493+ DPRINTF("vma_writer_set_status all drives completed\n");
1494+ qemu_co_mutex_lock(&vmaw->flush_lock);
1495+ int ret = vma_writer_flush(vmaw);
1496+ qemu_co_mutex_unlock(&vmaw->flush_lock);
1497+ if (ret < 0) {
1498+ vma_writer_set_error(vmaw, "vma_writer_close_stream: flush failed");
1499+ }
1500+ }
1501+
1502+ return open_drives;
1503+}
1504+
1505+int vma_writer_get_status(VmaWriter *vmaw, VmaStatus *status)
1506+{
1507+ int i;
1508+
1509+ g_assert(vmaw != NULL);
1510+
1511+ if (status) {
1512+ status->status = vmaw->status;
1513+ g_strlcpy(status->errmsg, vmaw->errmsg, sizeof(status->errmsg));
1514+ for (i = 0; i <= 255; i++) {
1515+ status->stream_info[i] = vmaw->stream_info[i];
1516+ }
1517+
1518+ uuid_unparse_lower(vmaw->uuid, status->uuid_str);
1519+ }
1520+
1521+ status->closed = vmaw->closed;
1522+
1523+ return vmaw->status;
1524+}
1525+
1526+static int vma_writer_get_buffer(VmaWriter *vmaw)
1527+{
1528+ int ret = 0;
1529+
1530+ qemu_co_mutex_lock(&vmaw->flush_lock);
1531+
1532+ /* wait until buffer is available */
1533+ while (vmaw->outbuf_count >= (VMA_BLOCKS_PER_EXTENT - 1)) {
1534+ ret = vma_writer_flush(vmaw);
1535+ if (ret < 0) {
1536+ vma_writer_set_error(vmaw, "vma_writer_get_buffer: flush failed");
1537+ break;
1538+ }
1539+ }
1540+
1541+ qemu_co_mutex_unlock(&vmaw->flush_lock);
1542+
1543+ return ret;
1544+}
1545+
1546+
1547+int64_t coroutine_fn
1548+vma_writer_write(VmaWriter *vmaw, uint8_t dev_id, int64_t cluster_num,
1549+ unsigned char *buf, size_t *zero_bytes)
1550+{
1551+ g_assert(vmaw != NULL);
1552+ g_assert(zero_bytes != NULL);
1553+
1554+ *zero_bytes = 0;
1555+
1556+ if (vmaw->status < 0) {
1557+ return vmaw->status;
1558+ }
1559+
1560+ if (!dev_id || !vmaw->stream_info[dev_id].size) {
1561+ vma_writer_set_error(vmaw, "vma_writer_write: "
1562+ "no such stream %d", dev_id);
1563+ return -1;
1564+ }
1565+
1566+ if (vmaw->stream_info[dev_id].finished) {
1567+ vma_writer_set_error(vmaw, "vma_writer_write: "
1568+ "stream already closed %d", dev_id);
1569+ return -1;
1570+ }
1571+
1572+
1573+ if (cluster_num >= (((uint64_t)1)<<32)) {
1574+ vma_writer_set_error(vmaw, "vma_writer_write: "
1575+ "cluster number out of range");
1576+ return -1;
1577+ }
1578+
1579+ if (dev_id == vmaw->vmstate_stream) {
1580+ if (cluster_num != vmaw->vmstate_clusters) {
1581+ vma_writer_set_error(vmaw, "vma_writer_write: "
1582+ "non sequential vmstate write");
1583+ }
1584+ vmaw->vmstate_clusters++;
1585+ } else if (cluster_num >= vmaw->stream_info[dev_id].cluster_count) {
1586+ vma_writer_set_error(vmaw, "vma_writer_write: cluster number too big");
1587+ return -1;
1588+ }
1589+
1590+ /* wait until buffer is available */
1591+ if (vma_writer_get_buffer(vmaw) < 0) {
1592+ vma_writer_set_error(vmaw, "vma_writer_write: "
1593+ "vma_writer_get_buffer failed");
1594+ return -1;
1595+ }
1596+
1597+ DPRINTF("VMA WRITE %d %zd\n", dev_id, cluster_num);
1598+
1599+ uint16_t mask = 0;
1600+
1601+ if (buf) {
1602+ int i;
1603+ int bit = 1;
1604+ for (i = 0; i < 16; i++) {
1605+ unsigned char *vmablock = buf + (i*VMA_BLOCK_SIZE);
1606+ if (!buffer_is_zero(vmablock, VMA_BLOCK_SIZE)) {
1607+ mask |= bit;
1608+ memcpy(vmaw->outbuf + vmaw->outbuf_pos, vmablock,
1609+ VMA_BLOCK_SIZE);
1610+ vmaw->outbuf_pos += VMA_BLOCK_SIZE;
1611+ } else {
1612+ DPRINTF("VMA WRITE %zd ZERO BLOCK %d\n", cluster_num, i);
1613+ vmaw->stream_info[dev_id].zero_bytes += VMA_BLOCK_SIZE;
1614+ *zero_bytes += VMA_BLOCK_SIZE;
1615+ }
1616+
1617+ bit = bit << 1;
1618+ }
1619+ } else {
1620+ DPRINTF("VMA WRITE %zd ZERO CLUSTER\n", cluster_num);
1621+ vmaw->stream_info[dev_id].zero_bytes += VMA_CLUSTER_SIZE;
1622+ *zero_bytes += VMA_CLUSTER_SIZE;
1623+ }
1624+
1625+ uint64_t block_info = ((uint64_t)mask) << (32+16);
1626+ block_info |= ((uint64_t)dev_id) << 32;
1627+ block_info |= (cluster_num & 0xffffffff);
1628+ vmaw->outbuf_block_info[vmaw->outbuf_count] = block_info;
1629+
1630+ DPRINTF("VMA WRITE MASK %zd %zx\n", cluster_num, block_info);
1631+
1632+ vmaw->outbuf_count++;
1633+
1634+ /** NOTE: We allways write whole clusters, but we correctly set
1635+ * transferred bytes. So transferred == size when when everything
1636+ * went OK.
1637+ */
1638+ size_t transferred = VMA_CLUSTER_SIZE;
1639+
1640+ if (dev_id != vmaw->vmstate_stream) {
1641+ uint64_t last = (cluster_num + 1) * VMA_CLUSTER_SIZE;
1642+ if (last > vmaw->stream_info[dev_id].size) {
1643+ uint64_t diff = last - vmaw->stream_info[dev_id].size;
1644+ if (diff >= VMA_CLUSTER_SIZE) {
1645+ vma_writer_set_error(vmaw, "vma_writer_write: "
1646+ "read after last cluster");
1647+ return -1;
1648+ }
1649+ transferred -= diff;
1650+ }
1651+ }
1652+
1653+ vmaw->stream_info[dev_id].transferred += transferred;
1654+
1655+ return transferred;
1656+}
1657+
1658+int vma_writer_close(VmaWriter *vmaw, Error **errp)
1659+{
1660+ g_assert(vmaw != NULL);
1661+
1662+ int i;
1663+
1664+ vma_queue_flush(vmaw);
1665+
1666+ /* this should not happen - just to be sure */
1667+ while (!qemu_co_queue_empty(&vmaw->wqueue)) {
1668+ DPRINTF("vma_writer_close wait\n");
1669+ co_aio_sleep_ns(qemu_get_aio_context(), QEMU_CLOCK_REALTIME, 1000000);
1670+ }
1671+
1672+ if (vmaw->cmd) {
1673+ if (pclose(vmaw->cmd) < 0) {
1674+ vma_writer_set_error(vmaw, "vma_writer_close: "
1675+ "pclose failed - %s", g_strerror(errno));
1676+ }
1677+ } else {
1678+ if (close(vmaw->fd) < 0) {
1679+ vma_writer_set_error(vmaw, "vma_writer_close: "
1680+ "close failed - %s", g_strerror(errno));
1681+ }
1682+ }
1683+
1684+ for (i = 0; i <= 255; i++) {
1685+ VmaStreamInfo *si = &vmaw->stream_info[i];
1686+ if (si->size) {
1687+ if (!si->finished) {
1688+ vma_writer_set_error(vmaw, "vma_writer_close: "
1689+ "detected open stream '%s'", si->devname);
1690+ } else if ((si->transferred != si->size) &&
1691+ (i != vmaw->vmstate_stream)) {
1692+ vma_writer_set_error(vmaw, "vma_writer_close: "
1693+ "incomplete stream '%s' (%zd != %zd)",
1694+ si->devname, si->transferred, si->size);
1695+ }
1696+ }
1697+ }
1698+
1699+ for (i = 0; i <= 255; i++) {
1700+ vmaw->stream_info[i].finished = 1; /* mark as closed */
1701+ }
1702+
1703+ vmaw->closed = 1;
1704+
1705+ if (vmaw->status < 0 && *errp == NULL) {
1706+ error_setg(errp, "%s", vmaw->errmsg);
1707+ }
1708+
1709+ return vmaw->status;
1710+}
1711+
1712+void vma_writer_destroy(VmaWriter *vmaw)
1713+{
1714+ assert(vmaw);
1715+
1716+ int i;
1717+
1718+ for (i = 0; i <= 255; i++) {
1719+ if (vmaw->stream_info[i].devname) {
1720+ g_free(vmaw->stream_info[i].devname);
1721+ }
1722+ }
1723+
1724+ if (vmaw->md5csum) {
1725+ g_checksum_free(vmaw->md5csum);
1726+ }
1727+
1728+ for (i = 0; i < WRITE_BUFFERS; i++) {
1729+ free(vmaw->aiocbs[i]);
1730+ }
1731+
1732+ g_free(vmaw);
1733+}
1734diff --git a/vma.c b/vma.c
1735new file mode 100644
1a91ab45 1736index 0000000..8732bfa
ca0fe5f5
WB
1737--- /dev/null
1738+++ b/vma.c
1a91ab45 1739@@ -0,0 +1,586 @@
ca0fe5f5
WB
1740+/*
1741+ * VMA: Virtual Machine Archive
1742+ *
1743+ * Copyright (C) 2012-2013 Proxmox Server Solutions
1744+ *
1745+ * Authors:
1746+ * Dietmar Maurer (dietmar@proxmox.com)
1747+ *
1748+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
1749+ * See the COPYING file in the top-level directory.
1750+ *
1751+ */
1752+
68a30562 1753+#include "qemu/osdep.h"
ca0fe5f5
WB
1754+#include <glib.h>
1755+
1756+#include "vma.h"
1757+#include "qemu-common.h"
1758+#include "qemu/error-report.h"
1759+#include "qemu/main-loop.h"
1a91ab45 1760+#include "qapi/qmp/qstring.h"
68a30562 1761+#include "sysemu/char.h" /* qstring_from_str */
ca0fe5f5
WB
1762+
/*
 * Print the usage text to stdout and terminate the process with exit
 * code 1.  This function does not return.
 */
static void help(void)
{
    static const char usage[] =
        "usage: vma command [command options]\n"
        "\n"
        "vma list <filename>\n"
        "vma create <filename> [-c config] <archive> pathname ...\n"
        "vma extract <filename> [-r <fifo>] <targetdir>\n";

    fputs(usage, stdout);
    exit(1);
}
1776+
/*
 * Split an argument of the form "[devname=]path".
 *
 * @devname receives a newly allocated device name (the caller owns it):
 *   - the text before the first '=' if there is one,
 *   - otherwise "disk<index>" when @index >= 0,
 *   - otherwise NULL.
 *
 * Returns: pointer to the path part inside @path (not a copy).
 */
static const char *extract_devname(const char *path, char **devname, int index)
{
    const char *eq;

    assert(path);

    eq = strchr(path, '=');
    if (eq) {
        *devname = g_strndup(path, eq - path);
        return eq + 1;
    }

    *devname = (index >= 0) ? g_strdup_printf("disk%d", index) : NULL;

    return path;
}
1796+
1797+static void print_content(VmaReader *vmar)
1798+{
1799+ assert(vmar);
1800+
1801+ VmaHeader *head = vma_reader_get_header(vmar);
1802+
1803+ GList *l = vma_reader_get_config_data(vmar);
1804+ while (l && l->data) {
1805+ VmaConfigData *cdata = (VmaConfigData *)l->data;
1806+ l = g_list_next(l);
1807+ printf("CFG: size: %d name: %s\n", cdata->len, cdata->name);
1808+ }
1809+
1810+ int i;
1811+ VmaDeviceInfo *di;
1812+ for (i = 1; i < 255; i++) {
1813+ di = vma_reader_get_device_info(vmar, i);
1814+ if (di) {
1815+ if (strcmp(di->devname, "vmstate") == 0) {
1816+ printf("VMSTATE: dev_id=%d memory: %zd\n", i, di->size);
1817+ } else {
1818+ printf("DEV: dev_id=%d size: %zd devname: %s\n",
1819+ i, di->size, di->devname);
1820+ }
1821+ }
1822+ }
1823+ /* ctime is the last entry we print */
1824+ printf("CTIME: %s", ctime(&head->ctime));
1825+ fflush(stdout);
1826+}
1827+
1828+static int list_content(int argc, char **argv)
1829+{
1830+ int c, ret = 0;
1831+ const char *filename;
1832+
1833+ for (;;) {
1834+ c = getopt(argc, argv, "h");
1835+ if (c == -1) {
1836+ break;
1837+ }
1838+ switch (c) {
1839+ case '?':
1840+ case 'h':
1841+ help();
1842+ break;
1843+ default:
1844+ g_assert_not_reached();
1845+ }
1846+ }
1847+
1848+ /* Get the filename */
1849+ if ((optind + 1) != argc) {
1850+ help();
1851+ }
1852+ filename = argv[optind++];
1853+
1854+ Error *errp = NULL;
1855+ VmaReader *vmar = vma_reader_create(filename, &errp);
1856+
1857+ if (!vmar) {
1858+ g_error("%s", error_get_pretty(errp));
1859+ }
1860+
1861+ print_content(vmar);
1862+
1863+ vma_reader_destroy(vmar);
1864+
1865+ return ret;
1866+}
1867+
/* One line of the restore map read by extract_content() ("-r <fifo>"). */
1868+typedef struct RestoreMap {
1869+ char *devname; /* device name as stored in the archive; hash table key */
1870+ char *path; /* target path the device is restored to */
1871+ bool write_zero; /* line prefix "1:" => true, "0:" => false */
1872+} RestoreMap;
1873+
1874+static int extract_content(int argc, char **argv)
1875+{
1876+ int c, ret = 0;
1877+ int verbose = 0;
1878+ const char *filename;
1879+ const char *dirname;
1880+ const char *readmap = NULL;
1881+
1882+ for (;;) {
1883+ c = getopt(argc, argv, "hvr:");
1884+ if (c == -1) {
1885+ break;
1886+ }
1887+ switch (c) {
1888+ case '?':
1889+ case 'h':
1890+ help();
1891+ break;
1892+ case 'r':
1893+ readmap = optarg;
1894+ break;
1895+ case 'v':
1896+ verbose = 1;
1897+ break;
1898+ default:
1899+ help();
1900+ }
1901+ }
1902+
1903+ /* Get the filename */
1904+ if ((optind + 2) != argc) {
1905+ help();
1906+ }
1907+ filename = argv[optind++];
1908+ dirname = argv[optind++];
1909+
1910+ Error *errp = NULL;
1911+ VmaReader *vmar = vma_reader_create(filename, &errp);
1912+
1913+ if (!vmar) {
1914+ g_error("%s", error_get_pretty(errp));
1915+ }
1916+
1917+ if (mkdir(dirname, 0777) < 0) {
1918+ g_error("unable to create target directory %s - %s",
1919+ dirname, g_strerror(errno));
1920+ }
1921+
1922+ GList *l = vma_reader_get_config_data(vmar);
1923+ while (l && l->data) {
1924+ VmaConfigData *cdata = (VmaConfigData *)l->data;
1925+ l = g_list_next(l);
1926+ char *cfgfn = g_strdup_printf("%s/%s", dirname, cdata->name);
1927+ GError *err = NULL;
1928+ if (!g_file_set_contents(cfgfn, (gchar *)cdata->data, cdata->len,
1929+ &err)) {
1930+ g_error("unable to write file: %s", err->message);
1931+ }
1932+ }
1933+
1934+ GHashTable *devmap = g_hash_table_new(g_str_hash, g_str_equal);
1935+
1936+ if (readmap) {
1937+ print_content(vmar);
1938+
1939+ FILE *map = fopen(readmap, "r");
1940+ if (!map) {
1941+ g_error("unable to open fifo %s - %s", readmap, g_strerror(errno));
1942+ }
1943+
1944+ while (1) {
1945+ char inbuf[8192];
1946+ char *line = fgets(inbuf, sizeof(inbuf), map);
1947+ if (!line || line[0] == '\0' || !strcmp(line, "done\n")) {
1948+ break;
1949+ }
1950+ int len = strlen(line);
1951+ if (line[len - 1] == '\n') {
1952+ line[len - 1] = '\0';
1953+ if (len == 1) {
1954+ break;
1955+ }
1956+ }
1957+
1958+ const char *path;
1959+ bool write_zero;
1960+ if (line[0] == '0' && line[1] == ':') {
1961+ path = inbuf + 2;
1962+ write_zero = false;
1963+ } else if (line[0] == '1' && line[1] == ':') {
1964+ path = inbuf + 2;
1965+ write_zero = true;
1966+ } else {
1967+ g_error("read map failed - parse error ('%s')", inbuf);
1968+ }
1969+
1970+ char *devname = NULL;
1971+ path = extract_devname(path, &devname, -1);
1972+ if (!devname) {
1973+ g_error("read map failed - no dev name specified ('%s')",
1974+ inbuf);
1975+ }
1976+
1977+ RestoreMap *map = g_new0(RestoreMap, 1);
1978+ map->devname = g_strdup(devname);
1979+ map->path = g_strdup(path);
1980+ map->write_zero = write_zero;
1981+
1982+ g_hash_table_insert(devmap, map->devname, map);
1983+
1984+ };
1985+ }
1986+
1987+ int i;
1988+ int vmstate_fd = -1;
1989+ guint8 vmstate_stream = 0;
1990+
1991+ for (i = 1; i < 255; i++) {
1992+ VmaDeviceInfo *di = vma_reader_get_device_info(vmar, i);
1993+ if (di && (strcmp(di->devname, "vmstate") == 0)) {
1994+ vmstate_stream = i;
1995+ char *statefn = g_strdup_printf("%s/vmstate.bin", dirname);
1996+ vmstate_fd = open(statefn, O_WRONLY|O_CREAT|O_EXCL, 0644);
1997+ if (vmstate_fd < 0) {
1998+ g_error("create vmstate file '%s' failed - %s", statefn,
1999+ g_strerror(errno));
2000+ }
2001+ g_free(statefn);
2002+ } else if (di) {
2003+ char *devfn = NULL;
2004+ int flags = BDRV_O_RDWR|BDRV_O_CACHE_WB;
2005+ bool write_zero = true;
2006+
2007+ if (readmap) {
2008+ RestoreMap *map;
2009+ map = (RestoreMap *)g_hash_table_lookup(devmap, di->devname);
2010+ if (map == NULL) {
2011+ g_error("no device name mapping for %s", di->devname);
2012+ }
2013+ devfn = map->path;
2014+ write_zero = map->write_zero;
2015+ } else {
2016+ devfn = g_strdup_printf("%s/tmp-disk-%s.raw",
2017+ dirname, di->devname);
2018+ printf("DEVINFO %s %zd\n", devfn, di->size);
2019+
2020+ bdrv_img_create(devfn, "raw", NULL, NULL, NULL, di->size,
2021+ flags, &errp, 0);
2022+ if (errp) {
2023+ g_error("can't create file %s: %s", devfn,
2024+ error_get_pretty(errp));
2025+ }
2026+
2027+ /* Note: we created an empty file above, so there is no
2028+ * need to write zeroes (so we generate a sparse file)
2029+ */
2030+ write_zero = false;
2031+ }
2032+
2033+ BlockDriverState *bs = bdrv_new();
68a30562 2034+ if (errp || bdrv_open(&bs, devfn, NULL, NULL, flags, &errp)) {
ca0fe5f5
WB
2035+ g_error("can't open file %s - %s", devfn,
2036+ error_get_pretty(errp));
2037+ }
2038+ if (vma_reader_register_bs(vmar, i, bs, write_zero, &errp) < 0) {
2039+ g_error("%s", error_get_pretty(errp));
2040+ }
2041+
2042+ if (!readmap) {
2043+ g_free(devfn);
2044+ }
2045+ }
2046+ }
2047+
2048+ if (vma_reader_restore(vmar, vmstate_fd, verbose, &errp) < 0) {
2049+ g_error("restore failed - %s", error_get_pretty(errp));
2050+ }
2051+
2052+ if (!readmap) {
2053+ for (i = 1; i < 255; i++) {
2054+ VmaDeviceInfo *di = vma_reader_get_device_info(vmar, i);
2055+ if (di && (i != vmstate_stream)) {
2056+ char *tmpfn = g_strdup_printf("%s/tmp-disk-%s.raw",
2057+ dirname, di->devname);
2058+ char *fn = g_strdup_printf("%s/disk-%s.raw",
2059+ dirname, di->devname);
2060+ if (rename(tmpfn, fn) != 0) {
2061+ g_error("rename %s to %s failed - %s",
2062+ tmpfn, fn, g_strerror(errno));
2063+ }
2064+ }
2065+ }
2066+ }
2067+
2068+ vma_reader_destroy(vmar);
2069+
2070+ bdrv_close_all();
2071+
2072+ return ret;
2073+}
2074+
/* Per-device state handed to the backup_run() coroutine by create_archive(). */
2075+typedef struct BackupJob {
2076+ BlockDriverState *bs; /* source block device that is read */
2077+ int64_t len; /* device length in bytes, from bdrv_getlength() */
2078+ VmaWriter *vmaw; /* archive writer shared by all jobs */
2079+ uint8_t dev_id; /* stream id from vma_writer_register_stream() */
2080+} BackupJob;
2081+
2082+#define BACKUP_SECTORS_PER_CLUSTER (VMA_CLUSTER_SIZE / BDRV_SECTOR_SIZE)
2083+
2084+static void coroutine_fn backup_run(void *opaque)
2085+{
2086+ BackupJob *job = (BackupJob *)opaque;
2087+ struct iovec iov;
2088+ QEMUIOVector qiov;
2089+
2090+ int64_t start, end;
2091+ int ret = 0;
2092+
2093+ unsigned char *buf = qemu_blockalign(job->bs, VMA_CLUSTER_SIZE);
2094+
2095+ start = 0;
2096+ end = DIV_ROUND_UP(job->len / BDRV_SECTOR_SIZE,
2097+ BACKUP_SECTORS_PER_CLUSTER);
2098+
2099+ for (; start < end; start++) {
2100+ iov.iov_base = buf;
2101+ iov.iov_len = VMA_CLUSTER_SIZE;
2102+ qemu_iovec_init_external(&qiov, &iov, 1);
2103+
2104+ ret = bdrv_co_readv(job->bs, start * BACKUP_SECTORS_PER_CLUSTER,
2105+ BACKUP_SECTORS_PER_CLUSTER, &qiov);
2106+ if (ret < 0) {
2107+ vma_writer_set_error(job->vmaw, "read error", -1);
2108+ goto out;
2109+ }
2110+
2111+ size_t zb = 0;
2112+ if (vma_writer_write(job->vmaw, job->dev_id, start, buf, &zb) < 0) {
2113+ vma_writer_set_error(job->vmaw, "backup_dump_cb vma_writer_write failed", -1);
2114+ goto out;
2115+ }
2116+ }
2117+
2118+
2119+out:
2120+ if (vma_writer_close_stream(job->vmaw, job->dev_id) <= 0) {
2121+ Error *err = NULL;
2122+ if (vma_writer_close(job->vmaw, &err) != 0) {
2123+ g_warning("vma_writer_close failed %s", error_get_pretty(err));
2124+ }
2125+ }
2126+}
2127+
/*
 * Implementation of "vma create": options -h (help), -v (verbose progress)
 * and -c <file> (add a config file, may be repeated), followed by the
 * archive name and at least one "[devname=]path" argument.
 *
 * A writer with a fresh uuid is created, the config files are added, and
 * for every path a block device is opened, registered as a stream and
 * handed to a backup_run() coroutine.  The main loop is then polled until
 * the writer reports 'closed'.  On failure the archive file is unlinked and
 * the program terminates via g_error().
 *
 * NOTE(review): as shown here the function mixes lines from different
 * revisions of the patch: there is a "} else {" without a visible matching
 * "if", backup_run_empty() is not defined in the visible source, and
 * qemu_coroutine_create() is used with a different signature than in
 * vma-writer.c.  Verify against the real file before relying on it.
 *
 * NOTE(review): with -v, "percent = (transferred*100)/total" divides by
 * zero if no registered stream has a non-zero size - confirm this cannot
 * happen.
 */
2128+static int create_archive(int argc, char **argv)
2129+{
68a30562 2130+ int i, c;
ca0fe5f5
WB
2131+ int verbose = 0;
2132+ const char *archivename;
2133+ GList *config_files = NULL;
2134+
2135+ for (;;) {
2136+ c = getopt(argc, argv, "hvc:");
2137+ if (c == -1) {
2138+ break;
2139+ }
2140+ switch (c) {
2141+ case '?':
2142+ case 'h':
2143+ help();
2144+ break;
2145+ case 'c':
2146+ config_files = g_list_append(config_files, optarg);
2147+ break;
2148+ case 'v':
2149+ verbose = 1;
2150+ break;
2151+ default:
2152+ g_assert_not_reached();
2153+ }
2154+ }
2155+
2156+
2157+ /* make sure we have archive name and at least one path */
2158+ if ((optind + 2) > argc) {
2159+ help();
2160+ }
2161+
2162+ archivename = argv[optind++];
2163+
2164+ uuid_t uuid;
2165+ uuid_generate(uuid);
2166+
2167+ Error *local_err = NULL;
2168+ VmaWriter *vmaw = vma_writer_create(archivename, uuid, &local_err);
2169+
2170+ if (vmaw == NULL) {
2171+ g_error("%s", error_get_pretty(local_err));
2172+ }
2173+
2174+ GList *l = config_files;
2175+ while (l && l->data) {
2176+ char *name = l->data;
2177+ char *cdata = NULL;
2178+ gsize clen = 0;
2179+ GError *err = NULL;
2180+ if (!g_file_get_contents(name, &cdata, &clen, &err)) {
2181+ unlink(archivename);
2182+ g_error("Unable to read file: %s", err->message);
2183+ }
2184+
2185+ if (vma_writer_add_config(vmaw, name, cdata, clen) != 0) {
2186+ unlink(archivename);
2187+ g_error("Unable to append config data %s (len = %zd)",
2188+ name, clen);
2189+ }
2190+ l = g_list_next(l);
2191+ }
2192+
2193+ int ind = 0;
2194+ while (optind < argc) {
2195+ const char *path = argv[optind++];
2196+ char *devname = NULL;
2197+ path = extract_devname(path, &devname, ind++);
2198+
ca0fe5f5 2199+ Error *errp = NULL;
68a30562 2200+ BlockDriverState *bs;
ca0fe5f5 2201+
68a30562
WB
2202+ bs = bdrv_open(path, NULL, NULL, 0, &errp);
2203+ if (!bs) {
ca0fe5f5
WB
2204+ unlink(archivename);
2205+ g_error("bdrv_open '%s' failed - %s", path, error_get_pretty(errp));
2206+ }
2207+ int64_t size = bdrv_getlength(bs);
2208+ int dev_id = vma_writer_register_stream(vmaw, devname, size);
2209+ if (dev_id <= 0) {
2210+ unlink(archivename);
2211+ g_error("vma_writer_register_stream '%s' failed", devname);
2212+ }
2213+
2214+ BackupJob *job = g_new0(BackupJob, 1);
2215+ job->len = size;
2216+ job->bs = bs;
2217+ job->vmaw = vmaw;
2218+ job->dev_id = dev_id;
2219+
68a30562
WB
2220+ Coroutine *co = qemu_coroutine_create(backup_run, job);
2221+ qemu_coroutine_enter(co);
ca0fe5f5
WB
2222+ }
2223+
2224+ VmaStatus vmastat;
2225+ int percent = 0;
2226+ int last_percent = -1;
2227+
2228+ while (1) {
2229+ main_loop_wait(false);
2230+ vma_writer_get_status(vmaw, &vmastat);
2231+
2232+ if (verbose) {
2233+
2234+ uint64_t total = 0;
2235+ uint64_t transferred = 0;
2236+ uint64_t zero_bytes = 0;
2237+
2238+ int i;
2239+ for (i = 0; i < 256; i++) {
2240+ if (vmastat.stream_info[i].size) {
2241+ total += vmastat.stream_info[i].size;
2242+ transferred += vmastat.stream_info[i].transferred;
2243+ zero_bytes += vmastat.stream_info[i].zero_bytes;
2244+ }
2245+ }
2246+ percent = (transferred*100)/total;
2247+ if (percent != last_percent) {
2248+ fprintf(stderr, "progress %d%% %zd/%zd %zd\n", percent,
2249+ transferred, total, zero_bytes);
2250+ fflush(stderr);
2251+
2252+ last_percent = percent;
2253+ }
2254+ }
2255+
2256+ if (vmastat.closed) {
2257+ break;
2258+ }
68a30562
WB
2259+ } else {
2260+ Coroutine *co = qemu_coroutine_create(backup_run_empty, vmaw);
2261+ qemu_coroutine_enter(co);
2262+ while (1) {
2263+ main_loop_wait(false);
2264+ vma_writer_get_status(vmaw, &vmastat);
2265+ if (vmastat.closed) {
2266+ break;
2267+ }
2268+ }
ca0fe5f5
WB
2269+ }
2270+
2271+ bdrv_drain_all();
2272+
2273+ vma_writer_get_status(vmaw, &vmastat);
2274+
2275+ if (verbose) {
2276+ for (i = 0; i < 256; i++) {
2277+ VmaStreamInfo *si = &vmastat.stream_info[i];
2278+ if (si->size) {
2279+ fprintf(stderr, "image %s: size=%zd zeros=%zd saved=%zd\n",
2280+ si->devname, si->size, si->zero_bytes,
2281+ si->size - si->zero_bytes);
2282+ }
2283+ }
2284+ }
2285+
2286+ if (vmastat.status < 0) {
2287+ unlink(archivename);
2288+ g_error("creating vma archive failed");
2289+ }
2290+
2291+ return 0;
2292+}
2293+
2294+int main(int argc, char **argv)
2295+{
2296+ const char *cmdname;
2297+ Error *main_loop_err = NULL;
2298+
2299+ error_set_progname(argv[0]);
2300+
2301+ if (qemu_init_main_loop(&main_loop_err)) {
2302+ g_error("%s", error_get_pretty(main_loop_err));
2303+ }
2304+
2305+ bdrv_init();
2306+
2307+ if (argc < 2) {
2308+ help();
2309+ }
2310+
2311+ cmdname = argv[1];
2312+ argc--; argv++;
2313+
2314+
2315+ if (!strcmp(cmdname, "list")) {
2316+ return list_content(argc, argv);
2317+ } else if (!strcmp(cmdname, "create")) {
2318+ return create_archive(argc, argv);
2319+ } else if (!strcmp(cmdname, "extract")) {
2320+ return extract_content(argc, argv);
2321+ }
2322+
2323+ help();
2324+ return 0;
2325+}
2326diff --git a/vma.h b/vma.h
2327new file mode 100644
2328index 0000000..6625eb9
2329--- /dev/null
2330+++ b/vma.h
2331@@ -0,0 +1,146 @@
2332+/*
2333+ * VMA: Virtual Machine Archive
2334+ *
2335+ * Copyright (C) Proxmox Server Solutions
2336+ *
2337+ * Authors:
2338+ * Dietmar Maurer (dietmar@proxmox.com)
2339+ *
2340+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
2341+ * See the COPYING file in the top-level directory.
2342+ *
2343+ */
2344+
2345+#ifndef BACKUP_VMA_H
2346+#define BACKUP_VMA_H
2347+
2348+#include <uuid/uuid.h>
2349+#include "qapi/error.h"
2350+#include "block/block.h"
2351+
/* A block is 4096 bytes; a cluster is 16 blocks (64 KiB, checked below). */
2352+#define VMA_BLOCK_BITS 12
2353+#define VMA_BLOCK_SIZE (1<<VMA_BLOCK_BITS)
2354+#define VMA_CLUSTER_BITS (VMA_BLOCK_BITS+4)
2355+#define VMA_CLUSTER_SIZE (1<<VMA_CLUSTER_BITS)
2356+
2357+#if VMA_CLUSTER_SIZE != 65536
2358+#error unexpected cluster size
2359+#endif
2360+
/*
 * An extent is a 512 byte header followed by the data of up to
 * VMA_BLOCKS_PER_EXTENT clusters (see VMA_MAX_EXTENT_SIZE below, which
 * multiplies by the cluster size).
 */
2361+#define VMA_EXTENT_HEADER_SIZE 512
2362+#define VMA_BLOCKS_PER_EXTENT 59
2363+#define VMA_MAX_CONFIGS 256
2364+
2365+#define VMA_MAX_EXTENT_SIZE \
2366+ (VMA_EXTENT_HEADER_SIZE+VMA_CLUSTER_SIZE*VMA_BLOCKS_PER_EXTENT)
2367+#if VMA_MAX_EXTENT_SIZE != 3867136
2368+#error unexpected VMA_EXTENT_SIZE
2369+#endif
2370+
2371+/* File Format Definitions */
2372+
/* Magic numbers "VMA\0" and "VMAE", already converted to big-endian. */
2373+#define VMA_MAGIC (GUINT32_TO_BE(('V'<<24)|('M'<<16)|('A'<<8)|0x00))
2374+#define VMA_EXTENT_MAGIC (GUINT32_TO_BE(('V'<<24)|('M'<<16)|('A'<<8)|'E'))
2375+
2376+typedef struct VmaDeviceInfoHeader { /* element type of VmaHeader.dev_info[] */
2377+ uint32_t devname_ptr; /* offset into blob_buffer table */
2378+ uint32_t reserved0; /* unused here; also keeps the 64-bit size field at offset 8 */
2379+ uint64_t size; /* device size in bytes */
2380+ uint64_t reserved1;
2381+ uint64_t reserved2;
2382+} VmaDeviceInfoHeader; /* field sizes sum to 32 bytes */
2383+
2384+typedef struct VmaHeader { /* archive header layout; offsets noted below assume no compiler padding */
2385+ uint32_t magic; /* presumably VMA_MAGIC -- TODO confirm against reader/writer */
2386+ uint32_t version;
2387+ unsigned char uuid[16]; /* 16 raw bytes, the size of a binary UUID */
2388+ int64_t ctime; /* presumably the creation time -- TODO confirm unit and epoch */
2389+ unsigned char md5sum[16]; /* 16 bytes, the size of one MD5 digest; what it covers is not visible here */
2390+
2391+ uint32_t blob_buffer_offset;
2392+ uint32_t blob_buffer_size;
2393+ uint32_t header_size;
2394+
2395+ unsigned char reserved[1984]; /* pads the 60 bytes of fields above up to offset 2044 */
2396+
2397+ uint32_t config_names[VMA_MAX_CONFIGS]; /* offset into blob_buffer table */
2398+ uint32_t config_data[VMA_MAX_CONFIGS]; /* offset into blob_buffer table */
2399+
2400+ uint32_t reserved1; /* brings the running offset to 4096 */
2401+
2402+ VmaDeviceInfoHeader dev_info[256]; /* 256 entries of 32 bytes; presumably indexed by the 8-bit dev_id -- TODO confirm */
2403+} VmaHeader;
2404+
2405+typedef struct VmaExtentHeader { /* field sizes sum to 512 bytes, i.e. VMA_EXTENT_HEADER_SIZE */
2406+ uint32_t magic; /* presumably VMA_EXTENT_MAGIC -- TODO confirm against reader/writer */
2407+ uint16_t reserved1;
2408+ uint16_t block_count;
2409+ unsigned char uuid[16]; /* presumably the UUID of the owning archive -- TODO confirm */
2410+ unsigned char md5sum[16]; /* 16 bytes, the size of one MD5 digest; what it covers is not visible here */
2411+ uint64_t blockinfo[VMA_BLOCKS_PER_EXTENT]; /* one 64-bit entry per slot; bit layout is not defined in this header */
2412+} VmaExtentHeader;
2413+
2414+/* functions/definitions to read/write vma files */
2415+
2416+typedef struct VmaReader VmaReader; /* opaque here: no struct body is given in this header */
2417+
2418+typedef struct VmaWriter VmaWriter; /* opaque here: no struct body is given in this header */
2419+
2420+typedef struct VmaConfigData { /* one named configuration blob */
2421+ const char *name;
2422+ const void *data;
2423+ uint32_t len; /* presumably the size of data in bytes -- TODO confirm */
2424+} VmaConfigData;
2425+
2426+typedef struct VmaStreamInfo { /* element type of VmaStatus.stream_info[] */
2427+ uint64_t size; /* presumably the device size in bytes -- TODO confirm */
2428+ uint64_t cluster_count;
2429+ uint64_t transferred;
2430+ uint64_t zero_bytes;
2431+ int finished; /* NOTE(review): looks like a boolean flag stored as int -- confirm */
2432+ char *devname; /* ownership of the string is not visible from this header */
2433+} VmaStreamInfo;
2434+
2435+typedef struct VmaStatus { /* status record passed by pointer to vma_writer_get_status() */
2436+ int status;
2437+ bool closed;
2438+ char errmsg[8192]; /* fixed-size message buffer embedded in the struct */
2439+ char uuid_str[37]; /* presumably a 36-character textual UUID plus NUL -- TODO confirm */
2440+ VmaStreamInfo stream_info[256]; /* presumably indexed by the 8-bit dev_id -- TODO confirm */
2441+} VmaStatus;
2442+
2443+typedef struct VmaDeviceInfo { /* return type (by pointer) of vma_reader_get_device_info() */
2444+ uint64_t size; /* device size in bytes */
2445+ const char *devname;
2446+} VmaDeviceInfo;
2447+
2448+VmaWriter *vma_writer_create(const char *filename, uuid_t uuid, Error **errp); /* writer API starts here */
2449+int vma_writer_close(VmaWriter *vmaw, Error **errp);
2450+void vma_writer_destroy(VmaWriter *vmaw);
2451+int vma_writer_add_config(VmaWriter *vmaw, const char *name, gpointer data,
2452+ size_t len);
2453+int vma_writer_register_stream(VmaWriter *vmaw, const char *devname,
2454+ size_t size); /* NOTE(review): size is size_t while the header fields use uint64_t -- confirm on 32-bit hosts */
2455+
2456+int64_t coroutine_fn vma_writer_write(VmaWriter *vmaw, uint8_t dev_id,
2457+ int64_t cluster_num, unsigned char *buf,
2458+ size_t *zero_bytes); /* marked coroutine_fn; zero_bytes is presumably an out parameter -- TODO confirm */
2459+
2460+int coroutine_fn vma_writer_close_stream(VmaWriter *vmaw, uint8_t dev_id); /* marked coroutine_fn */
2461+
2462+int vma_writer_get_status(VmaWriter *vmaw, VmaStatus *status);
2463+void vma_writer_set_error(VmaWriter *vmaw, const char *fmt, ...); /* variadic; fmt is presumably printf-style -- TODO confirm */
2464+
2465+
2466+VmaReader *vma_reader_create(const char *filename, Error **errp); /* reader API starts here */
2467+void vma_reader_destroy(VmaReader *vmar);
2468+VmaHeader *vma_reader_get_header(VmaReader *vmar);
2469+GList *vma_reader_get_config_data(VmaReader *vmar); /* element type and ownership of the list are not visible here */
2470+VmaDeviceInfo *vma_reader_get_device_info(VmaReader *vmar, guint8 dev_id);
2471+int vma_reader_register_bs(VmaReader *vmar, guint8 dev_id,
2472+ BlockDriverState *bs, bool write_zeroes,
2473+ Error **errp);
2474+int vma_reader_restore(VmaReader *vmar, int vmstate_fd, bool verbose,
2475+ Error **errp);
2476+
2477+#endif /* BACKUP_VMA_H */
2478--
24792.1.4
2480