git.proxmox.com Git - pve-qemu.git/commitdiff
backup: improve error when copy-before-write fails for fleecing master
author: Fiona Ebner <f.ebner@proxmox.com>
Mon, 29 Apr 2024 15:20:22 +0000 (17:20 +0200)
committer: Thomas Lamprecht <t.lamprecht@proxmox.com>
Mon, 29 Apr 2024 15:25:20 +0000 (17:25 +0200)
With fleecing, failure for copy-before-write does not fail the guest
write, but only sets the snapshot error that is associated with the
copy-before-write filter, making further requests to the snapshot
access fail with EACCES, which then also fails the job. But that error
code is not the root cause of why the backup failed, so bubble up the
original snapshot error instead.

Reported-by: Friedrich Weber <f.weber@proxmox.com>
Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
90 files changed:
Makefile
debian/changelog
debian/patches/bitmap-mirror/0001-drive-mirror-add-support-for-sync-bitmap-mode-never.patch
debian/patches/bitmap-mirror/0002-drive-mirror-add-support-for-conditional-and-always-.patch
debian/patches/bitmap-mirror/0003-mirror-add-check-for-bitmap-mode-without-bitmap.patch
debian/patches/bitmap-mirror/0004-mirror-switch-to-bdrv_dirty_bitmap_merge_internal.patch
debian/patches/bitmap-mirror/0006-mirror-move-some-checks-to-qmp.patch
debian/patches/extra/0001-monitor-qmp-fix-race-with-clients-disconnecting-earl.patch
debian/patches/extra/0002-scsi-megasas-Internal-cdbs-have-16-byte-length.patch
debian/patches/extra/0003-ide-avoid-potential-deadlock-when-draining-during-tr.patch
debian/patches/extra/0004-Revert-x86-acpi-workaround-Windows-not-handling-name.patch [new file with mode: 0644]
debian/patches/extra/0004-migration-block-dirty-bitmap-fix-loading-bitmap-when.patch [deleted file]
debian/patches/extra/0005-Revert-Revert-graph-lock-Disable-locking-for-now.patch [deleted file]
debian/patches/extra/0005-block-copy-before-write-use-uint64_t-for-timeout-in-.patch [new file with mode: 0644]
debian/patches/extra/0006-migration-states-workaround-snapshot-performance-reg.patch [deleted file]
debian/patches/extra/0007-Revert-x86-acpi-workaround-Windows-not-handling-name.patch [deleted file]
debian/patches/extra/0008-target-i386-the-sgx_epc_get_section-stub-is-reachabl.patch [deleted file]
debian/patches/extra/0009-ui-clipboard-mark-type-as-not-available-when-there-i.patch [deleted file]
debian/patches/extra/0010-virtio-scsi-Attach-event-vq-notifier-with-no_poll.patch [deleted file]
debian/patches/extra/0011-virtio-Re-enable-notifications-after-drain.patch [deleted file]
debian/patches/extra/0012-qemu_init-increase-NOFILE-soft-limit-on-POSIX.patch [deleted file]
debian/patches/extra/0013-virtio-blk-avoid-using-ioeventfd-state-in-irqfd-cond.patch [deleted file]
debian/patches/pve/0001-PVE-Config-block-file-change-locking-default-to-off.patch
debian/patches/pve/0002-PVE-Config-Adjust-network-script-path-to-etc-kvm.patch
debian/patches/pve/0003-PVE-Config-set-the-CPU-model-to-kvm64-32-instead-of-.patch
debian/patches/pve/0004-PVE-Config-ui-spice-default-to-pve-certificates.patch
debian/patches/pve/0005-PVE-Config-glusterfs-no-default-logfile-if-daemonize.patch
debian/patches/pve/0006-PVE-Config-rbd-block-rbd-disable-rbd_cache_writethro.patch
debian/patches/pve/0007-PVE-Up-glusterfs-allow-partial-reads.patch
debian/patches/pve/0008-PVE-Up-qemu-img-return-success-on-info-without-snaps.patch
debian/patches/pve/0009-PVE-Up-qemu-img-dd-add-osize-and-read-from-to-stdin-.patch
debian/patches/pve/0010-PVE-Up-qemu-img-dd-add-isize-parameter.patch
debian/patches/pve/0011-PVE-Up-qemu-img-dd-add-n-skip_create.patch
debian/patches/pve/0012-qemu-img-dd-add-l-option-for-loading-a-snapshot.patch
debian/patches/pve/0013-PVE-virtio-balloon-improve-query-balloon.patch
debian/patches/pve/0014-PVE-qapi-modify-query-machines.patch
debian/patches/pve/0015-PVE-qapi-modify-spice-query.patch
debian/patches/pve/0016-PVE-add-IOChannel-implementation-for-savevm-async.patch
debian/patches/pve/0017-PVE-add-savevm-async-for-background-state-snapshots.patch
debian/patches/pve/0018-PVE-add-optional-buffer-size-to-QEMUFile.patch
debian/patches/pve/0019-PVE-block-add-the-zeroinit-block-driver-filter.patch
debian/patches/pve/0020-PVE-Add-dummy-id-command-line-parameter.patch
debian/patches/pve/0021-PVE-Config-Revert-target-i386-disable-LINT0-after-re.patch
debian/patches/pve/0022-PVE-Up-Config-file-posix-make-locking-optiono-on-cre.patch
debian/patches/pve/0024-PVE-Compat-4.0-used-balloon-qemu-4-0-config-size-fal.patch
debian/patches/pve/0025-PVE-Allow-version-code-in-machine-type.patch
debian/patches/pve/0026-block-backup-move-bcs-bitmap-initialization-to-job-c.patch
debian/patches/pve/0027-PVE-Backup-add-vma-backup-format-code.patch
debian/patches/pve/0028-PVE-Backup-add-backup-dump-block-driver.patch
debian/patches/pve/0029-PVE-Add-sequential-job-transaction-support.patch
debian/patches/pve/0030-PVE-Backup-Proxmox-backup-patches-for-QEMU.patch
debian/patches/pve/0031-PVE-Backup-pbs-restore-new-command-to-restore-from-p.patch
debian/patches/pve/0032-PVE-Add-PBS-block-driver-to-map-backup-archives-into.patch
debian/patches/pve/0033-PVE-redirect-stderr-to-journal-when-daemonized.patch
debian/patches/pve/0034-PVE-Migrate-dirty-bitmap-state-via-savevm.patch
debian/patches/pve/0035-migration-block-dirty-bitmap-migrate-other-bitmaps-e.patch
debian/patches/pve/0036-PVE-fall-back-to-open-iscsi-initiatorname.patch
debian/patches/pve/0037-PVE-block-stream-increase-chunk-size.patch
debian/patches/pve/0038-block-add-alloc-track-driver.patch [new file with mode: 0644]
debian/patches/pve/0038-block-io-accept-NULL-qiov-in-bdrv_pad_request.patch [deleted file]
debian/patches/pve/0039-Revert-block-rbd-workaround-for-ceph-issue-53784.patch [new file with mode: 0644]
debian/patches/pve/0039-block-add-alloc-track-driver.patch [deleted file]
debian/patches/pve/0040-Revert-block-rbd-fix-handling-of-holes-in-.bdrv_co_b.patch [new file with mode: 0644]
debian/patches/pve/0040-Revert-block-rbd-workaround-for-ceph-issue-53784.patch [deleted file]
debian/patches/pve/0041-Revert-block-rbd-fix-handling-of-holes-in-.bdrv_co_b.patch [deleted file]
debian/patches/pve/0041-Revert-block-rbd-implement-bdrv_co_block_status.patch [new file with mode: 0644]
debian/patches/pve/0042-Revert-block-rbd-implement-bdrv_co_block_status.patch [deleted file]
debian/patches/pve/0042-alloc-track-error-out-when-auto-remove-is-not-set.patch [new file with mode: 0644]
debian/patches/pve/0043-alloc-track-avoid-seemingly-superfluous-child-permis.patch [new file with mode: 0644]
debian/patches/pve/0043-alloc-track-fix-deadlock-during-drop.patch [deleted file]
debian/patches/pve/0044-block-copy-before-write-fix-permission.patch [new file with mode: 0644]
debian/patches/pve/0044-migration-for-snapshots-hold-the-BQL-during-setup-ca.patch [deleted file]
debian/patches/pve/0045-block-copy-before-write-support-unligned-snapshot-di.patch [new file with mode: 0644]
debian/patches/pve/0045-savevm-async-don-t-hold-BQL-during-setup.patch [deleted file]
debian/patches/pve/0046-block-copy-before-write-create-block_copy-bitmap-in-.patch [new file with mode: 0644]
debian/patches/pve/0046-block-copy-before-write-fix-permission.patch [deleted file]
debian/patches/pve/0047-block-copy-before-write-support-unligned-snapshot-di.patch [deleted file]
debian/patches/pve/0047-qapi-blockdev-backup-add-discard-source-parameter.patch [new file with mode: 0644]
debian/patches/pve/0048-block-copy-before-write-create-block_copy-bitmap-in-.patch [deleted file]
debian/patches/pve/0048-copy-before-write-allow-specifying-minimum-cluster-s.patch [new file with mode: 0644]
debian/patches/pve/0049-backup-add-minimum-cluster-size-to-performance-optio.patch [new file with mode: 0644]
debian/patches/pve/0049-qapi-blockdev-backup-add-discard-source-parameter.patch [deleted file]
debian/patches/pve/0050-PVE-backup-add-fleecing-option.patch [new file with mode: 0644]
debian/patches/pve/0050-copy-before-write-allow-specifying-minimum-cluster-s.patch [deleted file]
debian/patches/pve/0051-PVE-backup-improve-error-when-copy-before-write-fail.patch [new file with mode: 0644]
debian/patches/pve/0051-backup-add-minimum-cluster-size-to-performance-optio.patch [deleted file]
debian/patches/pve/0052-PVE-backup-add-fleecing-option.patch [deleted file]
debian/patches/series
debian/source/lintian-overrides
qemu

index cad130e9ea7ccfc0f1057b6d2012465650f62913..f23b041616775e7b7b9ba07d57352b25945e70b9 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -24,6 +24,7 @@ endif
 
 PC_BIOS_FW_PURGE_LIST_IN = \
        hppa-firmware.img \
+       hppa-firmware64.img \
        openbios-ppc \
        openbios-sparc32 \
        openbios-sparc64 \
@@ -31,7 +32,8 @@ PC_BIOS_FW_PURGE_LIST_IN = \
        s390-ccw.img \
        s390-netboot.img \
        u-boot.e500 \
-       .*\.dtb \
+       .*[a-zA-Z0-9]\.dtb \
+       .*[a-zA-Z0-9]\.dts \
        qemu_vga.ndrv \
        slof.bin \
        opensbi-riscv.*-generic-fw_dynamic.bin \
@@ -56,7 +58,7 @@ $(BUILDDIR): submodule
 deb kvm: $(DEBS)
 $(DEB_DBG): $(DEB)
 $(DEB): $(BUILDDIR)
-       cd $(BUILDDIR); dpkg-buildpackage -b -us -uc -j
+       cd $(BUILDDIR); dpkg-buildpackage -b -us -uc
        lintian $(DEBS)
 
 sbuild: $(DSC)
index 39cd8b954da15dbde776b30f9ac8c999e1c71b94..9fb475737602831fea71d50922158d56c3594c76 100644 (file)
@@ -1,3 +1,15 @@
+pve-qemu-kvm (9.0.0-1) bookworm; urgency=medium
+
+  * update submodule and patches to QEMU 9.0.0
+
+ -- Proxmox Support Team <support@proxmox.com>  Mon, 29 Apr 2024 10:51:37 +0200
+
+pve-qemu-kvm (8.2.2-1) bookworm; urgency=medium
+
+  * update submodule and patches to QEMU 8.2.2
+
+ -- Proxmox Support Team <support@proxmox.com>  Sat, 27 Apr 2024 12:44:30 +0200
+
 pve-qemu-kvm (8.1.5-5) bookworm; urgency=medium
 
   * implement support for backup fleecing
index c9c63b5638ddab2da2f2e314e998e743270112e7..6789ac58d2693d90f806844366cd2cb86f3be9be 100644 (file)
@@ -27,18 +27,18 @@ Signed-off-by: Ma Haocong <mahaocong@didichuxing.com>
 Signed-off-by: John Snow <jsnow@redhat.com>
 Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
-[FE: rebased for 8.1.1]
+[FE: rebased for 8.2.2]
 Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
 ---
- block/mirror.c                         | 98 +++++++++++++++++++++-----
+ block/mirror.c                         | 99 ++++++++++++++++++++------
  blockdev.c                             | 38 +++++++++-
  include/block/block_int-global-state.h |  4 +-
  qapi/block-core.json                   | 25 ++++++-
  tests/unit/test-block-iothread.c       |  4 +-
- 5 files changed, 142 insertions(+), 27 deletions(-)
+ 5 files changed, 142 insertions(+), 28 deletions(-)
 
 diff --git a/block/mirror.c b/block/mirror.c
-index d3cacd1708..1ff42c8af1 100644
+index 1bdce3b657..0c5c72df2e 100644
 --- a/block/mirror.c
 +++ b/block/mirror.c
 @@ -51,7 +51,7 @@ typedef struct MirrorBlockJob {
@@ -50,7 +50,7 @@ index d3cacd1708..1ff42c8af1 100644
      BlockMirrorBackingMode backing_mode;
      /* Whether the target image requires explicit zero-initialization */
      bool zero_target;
-@@ -65,6 +65,8 @@ typedef struct MirrorBlockJob {
+@@ -73,6 +73,8 @@ typedef struct MirrorBlockJob {
      size_t buf_size;
      int64_t bdev_length;
      unsigned long *cow_bitmap;
@@ -59,9 +59,9 @@ index d3cacd1708..1ff42c8af1 100644
      BdrvDirtyBitmap *dirty_bitmap;
      BdrvDirtyBitmapIter *dbi;
      uint8_t *buf;
-@@ -705,7 +707,8 @@ static int mirror_exit_common(Job *job)
-     bdrv_child_refresh_perms(mirror_top_bs, mirror_top_bs->backing,
+@@ -722,7 +724,8 @@ static int mirror_exit_common(Job *job)
                               &error_abort);
      if (!abort && s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
 -        BlockDriverState *backing = s->is_none_mode ? src : s->base;
 +        BlockDriverState *backing;
@@ -69,7 +69,7 @@ index d3cacd1708..1ff42c8af1 100644
          BlockDriverState *unfiltered_target = bdrv_skip_filters(target_bs);
  
          if (bdrv_cow_bs(unfiltered_target) != backing) {
-@@ -809,6 +812,16 @@ static void mirror_abort(Job *job)
+@@ -819,6 +822,16 @@ static void mirror_abort(Job *job)
      assert(ret == 0);
  }
  
@@ -86,7 +86,7 @@ index d3cacd1708..1ff42c8af1 100644
  static void coroutine_fn mirror_throttle(MirrorBlockJob *s)
  {
      int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-@@ -997,7 +1010,8 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
+@@ -1015,7 +1028,8 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
      mirror_free_init(s);
  
      s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
@@ -96,7 +96,7 @@ index d3cacd1708..1ff42c8af1 100644
          ret = mirror_dirty_init(s);
          if (ret < 0 || job_is_cancelled(&s->common.job)) {
              goto immediate_exit;
-@@ -1251,6 +1265,7 @@ static const BlockJobDriver mirror_job_driver = {
+@@ -1304,6 +1318,7 @@ static const BlockJobDriver mirror_job_driver = {
          .run                    = mirror_run,
          .prepare                = mirror_prepare,
          .abort                  = mirror_abort,
@@ -104,7 +104,7 @@ index d3cacd1708..1ff42c8af1 100644
          .pause                  = mirror_pause,
          .complete               = mirror_complete,
          .cancel                 = mirror_cancel,
-@@ -1267,6 +1282,7 @@ static const BlockJobDriver commit_active_job_driver = {
+@@ -1322,6 +1337,7 @@ static const BlockJobDriver commit_active_job_driver = {
          .run                    = mirror_run,
          .prepare                = mirror_prepare,
          .abort                  = mirror_abort,
@@ -112,7 +112,7 @@ index d3cacd1708..1ff42c8af1 100644
          .pause                  = mirror_pause,
          .complete               = mirror_complete,
          .cancel                 = commit_active_cancel,
-@@ -1658,7 +1674,10 @@ static BlockJob *mirror_start_job(
+@@ -1714,7 +1730,10 @@ static BlockJob *mirror_start_job(
                               BlockCompletionFunc *cb,
                               void *opaque,
                               const BlockJobDriver *driver,
@@ -124,9 +124,9 @@ index d3cacd1708..1ff42c8af1 100644
                               bool auto_complete, const char *filter_node_name,
                               bool is_mirror, MirrorCopyMode copy_mode,
                               Error **errp)
-@@ -1670,10 +1689,39 @@ static BlockJob *mirror_start_job(
-     uint64_t target_perms, target_shared_perms;
-     int ret;
+@@ -1728,10 +1747,39 @@ static BlockJob *mirror_start_job(
+     GLOBAL_STATE_CODE();
  
 -    if (granularity == 0) {
 -        granularity = bdrv_get_default_bitmap_granularity(target);
@@ -166,7 +166,7 @@ index d3cacd1708..1ff42c8af1 100644
      assert(is_power_of_2(granularity));
  
      if (buf_size < 0) {
-@@ -1804,7 +1852,9 @@ static BlockJob *mirror_start_job(
+@@ -1871,7 +1919,9 @@ static BlockJob *mirror_start_job(
      s->replaces = g_strdup(replaces);
      s->on_source_error = on_source_error;
      s->on_target_error = on_target_error;
@@ -176,10 +176,10 @@ index d3cacd1708..1ff42c8af1 100644
 +    s->bitmap_mode = bitmap_mode;
      s->backing_mode = backing_mode;
      s->zero_target = zero_target;
-     s->copy_mode = copy_mode;
-@@ -1825,6 +1875,18 @@ static BlockJob *mirror_start_job(
-         bdrv_disable_dirty_bitmap(s->dirty_bitmap);
-     }
+     qatomic_set(&s->copy_mode, copy_mode);
+@@ -1897,6 +1947,18 @@ static BlockJob *mirror_start_job(
+      */
+     bdrv_disable_dirty_bitmap(s->dirty_bitmap);
  
 +    if (s->sync_bitmap) {
 +        bdrv_dirty_bitmap_set_busy(s->sync_bitmap, true);
@@ -193,10 +193,10 @@ index d3cacd1708..1ff42c8af1 100644
 +        }
 +    }
 +
+     bdrv_graph_wrlock();
      ret = block_job_add_bdrv(&s->common, "source", bs, 0,
                               BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE |
-                              BLK_PERM_CONSISTENT_READ,
-@@ -1902,6 +1964,9 @@ fail:
+@@ -1979,6 +2041,9 @@ fail:
          if (s->dirty_bitmap) {
              bdrv_release_dirty_bitmap(s->dirty_bitmap);
          }
@@ -206,7 +206,7 @@ index d3cacd1708..1ff42c8af1 100644
          job_early_fail(&s->common.job);
      }
  
-@@ -1919,31 +1984,25 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
+@@ -2001,35 +2066,28 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
                    BlockDriverState *target, const char *replaces,
                    int creation_flags, int64_t speed,
                    uint32_t granularity, int64_t buf_size,
@@ -231,8 +231,12 @@ index d3cacd1708..1ff42c8af1 100644
 -                   MirrorSyncMode_str(mode));
 -        return;
 -    }
+-
+     bdrv_graph_rdlock_main_loop();
 -    is_none_mode = mode == MIRROR_SYNC_MODE_NONE;
      base = mode == MIRROR_SYNC_MODE_TOP ? bdrv_backing_chain_next(bs) : NULL;
+     bdrv_graph_rdunlock_main_loop();
      mirror_start_job(job_id, bs, creation_flags, target, replaces,
                       speed, granularity, buf_size, backing_mode, zero_target,
                       on_source_error, on_target_error, unmap, NULL, NULL,
@@ -243,7 +247,7 @@ index d3cacd1708..1ff42c8af1 100644
  }
  
  BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
-@@ -1970,7 +2029,8 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
+@@ -2056,7 +2114,8 @@ BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
                       job_id, bs, creation_flags, base, NULL, speed, 0, 0,
                       MIRROR_LEAVE_BACKING_CHAIN, false,
                       on_error, on_error, true, cb, opaque,
@@ -254,10 +258,10 @@ index d3cacd1708..1ff42c8af1 100644
                       errp);
      if (!job) {
 diff --git a/blockdev.c b/blockdev.c
-index c28462a633..a402fa4bf7 100644
+index 057601dcf0..8682814a7a 100644
 --- a/blockdev.c
 +++ b/blockdev.c
-@@ -2849,6 +2849,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2776,6 +2776,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
                                     BlockDriverState *target,
                                     const char *replaces,
                                     enum MirrorSyncMode sync,
@@ -267,7 +271,7 @@ index c28462a633..a402fa4bf7 100644
                                     BlockMirrorBackingMode backing_mode,
                                     bool zero_target,
                                     bool has_speed, int64_t speed,
-@@ -2867,6 +2870,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2794,6 +2797,7 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
  {
      BlockDriverState *unfiltered_bs;
      int job_flags = JOB_DEFAULT;
@@ -275,7 +279,7 @@ index c28462a633..a402fa4bf7 100644
  
      GLOBAL_STATE_CODE();
      GRAPH_RDLOCK_GUARD_MAINLOOP();
-@@ -2921,6 +2925,29 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2848,6 +2852,29 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
          sync = MIRROR_SYNC_MODE_FULL;
      }
  
@@ -305,7 +309,7 @@ index c28462a633..a402fa4bf7 100644
      if (!replaces) {
          /* We want to mirror from @bs, but keep implicit filters on top */
          unfiltered_bs = bdrv_skip_implicit_filters(bs);
-@@ -2966,8 +2993,8 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2889,8 +2916,8 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
       * and will allow to check whether the node still exist at mirror completion
       */
      mirror_start(job_id, bs, target,
@@ -316,7 +320,7 @@ index c28462a633..a402fa4bf7 100644
                   on_source_error, on_target_error, unmap, filter_node_name,
                   copy_mode, errp);
  }
-@@ -3115,6 +3142,8 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
+@@ -3034,6 +3061,8 @@ void qmp_drive_mirror(DriveMirror *arg, Error **errp)
  
      blockdev_mirror_common(arg->job_id, bs, target_bs,
                             arg->replaces, arg->sync,
@@ -325,7 +329,7 @@ index c28462a633..a402fa4bf7 100644
                             backing_mode, zero_target,
                             arg->has_speed, arg->speed,
                             arg->has_granularity, arg->granularity,
-@@ -3136,6 +3165,8 @@ void qmp_blockdev_mirror(const char *job_id,
+@@ -3053,6 +3082,8 @@ void qmp_blockdev_mirror(const char *job_id,
                           const char *device, const char *target,
                           const char *replaces,
                           MirrorSyncMode sync,
@@ -334,7 +338,7 @@ index c28462a633..a402fa4bf7 100644
                           bool has_speed, int64_t speed,
                           bool has_granularity, uint32_t granularity,
                           bool has_buf_size, int64_t buf_size,
-@@ -3184,7 +3215,8 @@ void qmp_blockdev_mirror(const char *job_id,
+@@ -3093,7 +3124,8 @@ void qmp_blockdev_mirror(const char *job_id,
      }
  
      blockdev_mirror_common(job_id, bs, target_bs,
@@ -345,10 +349,10 @@ index c28462a633..a402fa4bf7 100644
                             has_granularity, granularity,
                             has_buf_size, buf_size,
 diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
-index da5fb31089..32f0f9858a 100644
+index d2201e27f4..cc1387ae02 100644
 --- a/include/block/block_int-global-state.h
 +++ b/include/block/block_int-global-state.h
-@@ -152,7 +152,9 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
+@@ -158,7 +158,9 @@ void mirror_start(const char *job_id, BlockDriverState *bs,
                    BlockDriverState *target, const char *replaces,
                    int creation_flags, int64_t speed,
                    uint32_t granularity, int64_t buf_size,
@@ -360,10 +364,10 @@ index da5fb31089..32f0f9858a 100644
                    BlockdevOnError on_source_error,
                    BlockdevOnError on_target_error,
 diff --git a/qapi/block-core.json b/qapi/block-core.json
-index bca1a0c372..a5cea82139 100644
+index 746d1694c2..45ab548dfe 100644
 --- a/qapi/block-core.json
 +++ b/qapi/block-core.json
-@@ -2145,6 +2145,15 @@
+@@ -2174,6 +2174,15 @@
  #     destination (all the disk, only the sectors allocated in the
  #     topmost image, or only new I/O).
  #
@@ -379,7 +383,7 @@ index bca1a0c372..a5cea82139 100644
  # @granularity: granularity of the dirty bitmap, default is 64K if the
  #     image format doesn't have clusters, 4K if the clusters are
  #     smaller than that, else the cluster size.  Must be a power of 2
-@@ -2187,7 +2196,9 @@
+@@ -2216,7 +2225,9 @@
  { 'struct': 'DriveMirror',
    'data': { '*job-id': 'str', 'device': 'str', 'target': 'str',
              '*format': 'str', '*node-name': 'str', '*replaces': 'str',
@@ -390,7 +394,7 @@ index bca1a0c372..a5cea82139 100644
              '*speed': 'int', '*granularity': 'uint32',
              '*buf-size': 'int', '*on-source-error': 'BlockdevOnError',
              '*on-target-error': 'BlockdevOnError',
-@@ -2471,6 +2482,15 @@
+@@ -2496,6 +2507,15 @@
  #     destination (all the disk, only the sectors allocated in the
  #     topmost image, or only new I/O).
  #
@@ -406,7 +410,7 @@ index bca1a0c372..a5cea82139 100644
  # @granularity: granularity of the dirty bitmap, default is 64K if the
  #     image format doesn't have clusters, 4K if the clusters are
  #     smaller than that, else the cluster size.  Must be a power of 2
-@@ -2521,7 +2541,8 @@
+@@ -2544,7 +2564,8 @@
  { 'command': 'blockdev-mirror',
    'data': { '*job-id': 'str', 'device': 'str', 'target': 'str',
              '*replaces': 'str',
@@ -417,10 +421,10 @@ index bca1a0c372..a5cea82139 100644
              '*buf-size': 'int', '*on-source-error': 'BlockdevOnError',
              '*on-target-error': 'BlockdevOnError',
 diff --git a/tests/unit/test-block-iothread.c b/tests/unit/test-block-iothread.c
-index d727a5fee8..8a34aa2328 100644
+index 3766d5de6b..afa44cbd34 100644
 --- a/tests/unit/test-block-iothread.c
 +++ b/tests/unit/test-block-iothread.c
-@@ -757,8 +757,8 @@ static void test_propagate_mirror(void)
+@@ -755,8 +755,8 @@ static void test_propagate_mirror(void)
  
      /* Start a mirror job */
      mirror_start("job0", src, target, NULL, JOB_DEFAULT, 0, 0, 0,
@@ -430,4 +434,4 @@ index d727a5fee8..8a34aa2328 100644
 +                 false, BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
                   false, "filter_node", MIRROR_COPY_MODE_BACKGROUND,
                   &error_abort);
-     WITH_JOB_LOCK_GUARD() {
index a4a5a0b460d9ad64d758be58d7d26de0ae6660a6..8a1b5d8006b001fed79cf82a12a0c66f149f113a 100644 (file)
@@ -24,10 +24,10 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 18 insertions(+), 6 deletions(-)
 
 diff --git a/block/mirror.c b/block/mirror.c
-index 1ff42c8af1..11b8a8e959 100644
+index 0c5c72df2e..37fee3fa25 100644
 --- a/block/mirror.c
 +++ b/block/mirror.c
-@@ -682,8 +682,6 @@ static int mirror_exit_common(Job *job)
+@@ -693,8 +693,6 @@ static int mirror_exit_common(Job *job)
          bdrv_unfreeze_backing_chain(mirror_top_bs, target_bs);
      }
  
@@ -36,9 +36,9 @@ index 1ff42c8af1..11b8a8e959 100644
      /* Make sure that the source BDS doesn't go away during bdrv_replace_node,
       * before we can call bdrv_drained_end */
      bdrv_ref(src);
-@@ -788,6 +786,18 @@ static int mirror_exit_common(Job *job)
-     block_job_remove_all_bdrv(bjob);
-     bdrv_replace_node(mirror_top_bs, mirror_top_bs->backing->bs, &error_abort);
+@@ -800,6 +798,18 @@ static int mirror_exit_common(Job *job)
+     bdrv_drained_end(target_bs);
+     bdrv_unref(target_bs);
  
 +    if (s->sync_bitmap) {
 +        if (s->bitmap_mode == BITMAP_SYNC_MODE_ALWAYS ||
@@ -55,7 +55,7 @@ index 1ff42c8af1..11b8a8e959 100644
      bs_opaque->job = NULL;
  
      bdrv_drained_end(src);
-@@ -1699,10 +1709,6 @@ static BlockJob *mirror_start_job(
+@@ -1757,10 +1767,6 @@ static BlockJob *mirror_start_job(
                         " sync mode",
                         MirrorSyncMode_str(sync_mode));
              return NULL;
@@ -66,7 +66,7 @@ index 1ff42c8af1..11b8a8e959 100644
          }
      } else if (bitmap) {
          error_setg(errp,
-@@ -1719,6 +1725,12 @@ static BlockJob *mirror_start_job(
+@@ -1777,6 +1783,12 @@ static BlockJob *mirror_start_job(
              return NULL;
          }
          granularity = bdrv_dirty_bitmap_granularity(bitmap);
index 3bf9797541006200e7338178404fc6f513b16d78..ddaf702893f8c0bf251ef57c3bfb272d852a7ad3 100644 (file)
@@ -16,10 +16,10 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 3 insertions(+)
 
 diff --git a/blockdev.c b/blockdev.c
-index a402fa4bf7..01b0ab0549 100644
+index 8682814a7a..5b75a085ee 100644
 --- a/blockdev.c
 +++ b/blockdev.c
-@@ -2946,6 +2946,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2873,6 +2873,9 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
          if (bdrv_dirty_bitmap_check(bitmap, BDRV_BITMAP_ALLOW_RO, errp)) {
              return;
          }
index 93a1524fab8e419cd8bd63352de67397f5d8256e..33e692358fa74b7d25e6a0302f8c00dee849162d 100644 (file)
@@ -16,10 +16,10 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 4 insertions(+), 7 deletions(-)
 
 diff --git a/block/mirror.c b/block/mirror.c
-index 11b8a8e959..00f2665ca4 100644
+index 37fee3fa25..6b3cce1007 100644
 --- a/block/mirror.c
 +++ b/block/mirror.c
-@@ -792,8 +792,8 @@ static int mirror_exit_common(Job *job)
+@@ -804,8 +804,8 @@ static int mirror_exit_common(Job *job)
               job->ret == 0 && ret == 0)) {
              /* Success; synchronize copy back to sync. */
              bdrv_clear_dirty_bitmap(s->sync_bitmap, NULL);
@@ -30,7 +30,7 @@ index 11b8a8e959..00f2665ca4 100644
          }
      }
      bdrv_release_dirty_bitmap(s->dirty_bitmap);
-@@ -1892,11 +1892,8 @@ static BlockJob *mirror_start_job(
+@@ -1964,11 +1964,8 @@ static BlockJob *mirror_start_job(
      }
  
      if (s->sync_mode == MIRROR_SYNC_MODE_BITMAP) {
@@ -43,4 +43,4 @@ index 11b8a8e959..00f2665ca4 100644
 +                                         NULL, true);
      }
  
-     ret = block_job_add_bdrv(&s->common, "source", bs, 0,
+     bdrv_graph_wrlock();
index f9a6e20a6b7050dd8726e96738d7dbbf28e4c50e..05a73d16c286ad5cb720a742c33668cf22c797dd 100644 (file)
@@ -12,7 +12,7 @@ uniform w.r.t. backup block jobs.
 
 Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
-[FE: rebase for 8.0]
+[FE: rebase for 8.2.2]
 Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
 ---
  block/mirror.c             | 28 +++------------
@@ -21,12 +21,12 @@ Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
  3 files changed, 70 insertions(+), 59 deletions(-)
 
 diff --git a/block/mirror.c b/block/mirror.c
-index 00f2665ca4..60cf574de5 100644
+index 6b3cce1007..2f1223852b 100644
 --- a/block/mirror.c
 +++ b/block/mirror.c
-@@ -1699,31 +1699,13 @@ static BlockJob *mirror_start_job(
-     uint64_t target_perms, target_shared_perms;
-     int ret;
+@@ -1757,31 +1757,13 @@ static BlockJob *mirror_start_job(
+     GLOBAL_STATE_CODE();
  
 -    if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
 -        error_setg(errp, "Sync mode '%s' not supported",
@@ -62,10 +62,10 @@ index 00f2665ca4..60cf574de5 100644
  
          if (bitmap_mode != BITMAP_SYNC_MODE_NEVER) {
 diff --git a/blockdev.c b/blockdev.c
-index 01b0ab0549..cd5f205ad1 100644
+index 5b75a085ee..d27d8c38ec 100644
 --- a/blockdev.c
 +++ b/blockdev.c
-@@ -2925,7 +2925,36 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
+@@ -2852,7 +2852,36 @@ static void blockdev_mirror_common(const char *job_id, BlockDriverState *bs,
          sync = MIRROR_SYNC_MODE_FULL;
      }
  
index cd9797ac392a8ff8d6d61a63f90f099da9d0744c..54f06315059bacececb436708cb6fd3a7e294164 100644 (file)
@@ -78,7 +78,7 @@ index 252de85681..8db28f9272 100644
  
  /**
 diff --git a/monitor/monitor.c b/monitor/monitor.c
-index dc352f9e9d..56e1307014 100644
+index 01ede1babd..5681bca346 100644
 --- a/monitor/monitor.c
 +++ b/monitor/monitor.c
 @@ -117,6 +117,21 @@ bool monitor_cur_is_qmp(void)
@@ -144,7 +144,7 @@ index a239945e8d..589c9524f8 100644
          monitor_qmp_caps_reset(mon);
          data = qmp_greeting(mon);
 diff --git a/qapi/qmp-dispatch.c b/qapi/qmp-dispatch.c
-index 176b549473..790bb7d1da 100644
+index f3488afeef..2624eb3470 100644
 --- a/qapi/qmp-dispatch.c
 +++ b/qapi/qmp-dispatch.c
 @@ -117,16 +117,28 @@ typedef struct QmpDispatchBH {
@@ -180,7 +180,7 @@ index 176b549473..790bb7d1da 100644
      aio_co_wake(data->co);
  }
  
-@@ -253,6 +265,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ
+@@ -250,6 +262,7 @@ QDict *coroutine_mixed_fn qmp_dispatch(const QmpCommandList *cmds, QObject *requ
              .ret        = &ret,
              .errp       = &err,
              .co         = qemu_coroutine_self(),
index 328f6fb3877094aeeb85397b46cbbd6f05c116a0..a8a66627f308252d4aa123aa848ec8f7427045c4 100644 (file)
@@ -22,7 +22,7 @@ Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
  1 file changed, 2 insertions(+), 12 deletions(-)
 
 diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c
-index 32c70c9e99..984b6a3145 100644
+index 2d0c607177..97e51733af 100644
 --- a/hw/scsi/megasas.c
 +++ b/hw/scsi/megasas.c
 @@ -1781,7 +1781,7 @@ static int megasas_handle_io(MegasasState *s, MegasasCmd *cmd, int frame_cmd)
index 7705d72db940ee831e2776fd4a231fb77e98926a..502c9d23dbff51e8de521a55a089e20302eb3972 100644 (file)
@@ -55,10 +55,10 @@ Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
  1 file changed, 6 insertions(+), 6 deletions(-)
 
 diff --git a/hw/ide/core.c b/hw/ide/core.c
-index c3508acbb1..289347af58 100644
+index e8cb2dac92..3b21acf651 100644
 --- a/hw/ide/core.c
 +++ b/hw/ide/core.c
-@@ -444,7 +444,7 @@ static void ide_trim_bh_cb(void *opaque)
+@@ -456,7 +456,7 @@ static void ide_trim_bh_cb(void *opaque)
      iocb->bh = NULL;
      qemu_aio_unref(iocb);
  
@@ -67,7 +67,7 @@ index c3508acbb1..289347af58 100644
      blk_dec_in_flight(blk);
  }
  
-@@ -504,6 +504,8 @@ static void ide_issue_trim_cb(void *opaque, int ret)
+@@ -516,6 +516,8 @@ static void ide_issue_trim_cb(void *opaque, int ret)
  done:
      iocb->aiocb = NULL;
      if (iocb->bh) {
@@ -76,7 +76,7 @@ index c3508acbb1..289347af58 100644
          replay_bh_schedule_event(iocb->bh);
      }
  }
-@@ -516,9 +518,6 @@ BlockAIOCB *ide_issue_trim(
+@@ -528,9 +530,6 @@ BlockAIOCB *ide_issue_trim(
      IDEDevice *dev = s->unit ? s->bus->slave : s->bus->master;
      TrimAIOCB *iocb;
  
@@ -86,7 +86,7 @@ index c3508acbb1..289347af58 100644
      iocb = blk_aio_get(&trim_aiocb_info, s->blk, cb, cb_opaque);
      iocb->s = s;
      iocb->bh = qemu_bh_new_guarded(ide_trim_bh_cb, iocb,
-@@ -742,8 +741,9 @@ void ide_cancel_dma_sync(IDEState *s)
+@@ -754,8 +753,9 @@ void ide_cancel_dma_sync(IDEState *s)
       */
      if (s->bus->dma->aiocb) {
          trace_ide_cancel_dma_sync_remaining();
diff --git a/debian/patches/extra/0004-Revert-x86-acpi-workaround-Windows-not-handling-name.patch b/debian/patches/extra/0004-Revert-x86-acpi-workaround-Windows-not-handling-name.patch
new file mode 100644 (file)
index 0000000..22eb1e7
--- /dev/null
@@ -0,0 +1,45 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Fri, 17 Nov 2023 11:18:06 +0100
+Subject: [PATCH] Revert "x86: acpi: workaround Windows not handling name
+ references in Package properly"
+
+This reverts commit 44d975ef340e2f21f236f9520c53e1b30d2213a4.
+
+As reported in the community forum [0] and reproduced locally this
+breaks VirtIO network adapters in (at least) the German ISO of Windows
+Server 2022. The fix itself was for
+
+> Issue is not fatal but as result acpi-index/"PCI Label ID" property
+> is either not shown in device details page or shows incorrect value.
+
+so revert and tolerate that as a stop-gap, rather than have the
+devices not working at all.
+
+[0]: https://forum.proxmox.com/threads/92094/post-605684
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+---
+ hw/i386/acpi-build.c | 8 ++------
+ 1 file changed, 2 insertions(+), 6 deletions(-)
+
+diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
+index 53f804ac16..9b1b9f0412 100644
+--- a/hw/i386/acpi-build.c
++++ b/hw/i386/acpi-build.c
+@@ -347,13 +347,9 @@ Aml *aml_pci_device_dsm(void)
+     {
+         Aml *params = aml_local(0);
+         Aml *pkg = aml_package(2);
+-        aml_append(pkg, aml_int(0));
+-        aml_append(pkg, aml_int(0));
++        aml_append(pkg, aml_name("BSEL"));
++        aml_append(pkg, aml_name("ASUN"));
+         aml_append(method, aml_store(pkg, params));
+-        aml_append(method,
+-            aml_store(aml_name("BSEL"), aml_index(params, aml_int(0))));
+-        aml_append(method,
+-            aml_store(aml_name("ASUN"), aml_index(params, aml_int(1))));
+         aml_append(method,
+             aml_return(aml_call5("PDSM", aml_arg(0), aml_arg(1),
+                                  aml_arg(2), aml_arg(3), params))
diff --git a/debian/patches/extra/0004-migration-block-dirty-bitmap-fix-loading-bitmap-when.patch b/debian/patches/extra/0004-migration-block-dirty-bitmap-fix-loading-bitmap-when.patch
deleted file mode 100644 (file)
index 4dae6ca..0000000
+++ /dev/null
@@ -1,48 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Fri, 28 Jul 2023 10:47:48 +0200
-Subject: [PATCH] migration/block-dirty-bitmap: fix loading bitmap when there
- is an iothread
-
-The bdrv_create_dirty_bitmap() function (which is also called by
-bdrv_dirty_bitmap_create_successor()) uses bdrv_getlength(bs). This is
-a wrapper around a coroutine, and thus uses bdrv_poll_co(). Polling
-tries to release the AioContext which will trigger an assert() if it
-hasn't been acquired before.
-
-The issue does not happen for migration, because there we are in a
-coroutine already, so the wrapper will just call bdrv_co_getlength()
-directly without polling.
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- migration/block-dirty-bitmap.c | 6 ++++++
- 1 file changed, 6 insertions(+)
-
-diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
-index 032fc5f405..e1ae3b7316 100644
---- a/migration/block-dirty-bitmap.c
-+++ b/migration/block-dirty-bitmap.c
-@@ -805,8 +805,11 @@ static int dirty_bitmap_load_start(QEMUFile *f, DBMLoadState *s)
-                      "destination", bdrv_dirty_bitmap_name(s->bitmap));
-         return -EINVAL;
-     } else {
-+        AioContext *ctx = bdrv_get_aio_context(s->bs);
-+        aio_context_acquire(ctx);
-         s->bitmap = bdrv_create_dirty_bitmap(s->bs, granularity,
-                                              s->bitmap_name, &local_err);
-+        aio_context_release(ctx);
-         if (!s->bitmap) {
-             error_report_err(local_err);
-             return -EINVAL;
-@@ -833,7 +836,10 @@ static int dirty_bitmap_load_start(QEMUFile *f, DBMLoadState *s)
-     bdrv_disable_dirty_bitmap(s->bitmap);
-     if (flags & DIRTY_BITMAP_MIG_START_FLAG_ENABLED) {
-+        AioContext *ctx = bdrv_get_aio_context(s->bs);
-+        aio_context_acquire(ctx);
-         bdrv_dirty_bitmap_create_successor(s->bitmap, &local_err);
-+        aio_context_release(ctx);
-         if (local_err) {
-             error_report_err(local_err);
-             return -EINVAL;
diff --git a/debian/patches/extra/0005-Revert-Revert-graph-lock-Disable-locking-for-now.patch b/debian/patches/extra/0005-Revert-Revert-graph-lock-Disable-locking-for-now.patch
deleted file mode 100644 (file)
index f0648d2..0000000
+++ /dev/null
@@ -1,140 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Thu, 28 Sep 2023 10:07:03 +0200
-Subject: [PATCH] Revert "Revert "graph-lock: Disable locking for now""
-
-This reverts commit 3cce22defb4b0e47cf135444e30cc673cff5ebad.
-
-There are still some issues with graph locking, e.g. deadlocks during
-backup canceling [0]. Because the AioContext locks still exist, it
-should be safe to disable locking again.
-
-From the original 80fc5d2600 ("graph-lock: Disable locking for now"):
-
-> We don't currently rely on graph locking yet. It is supposed to replace
-> the AioContext lock eventually to enable multiqueue support, but as long
-> as we still have the AioContext lock, it is sufficient without the graph
-> lock. Once the AioContext lock goes away, the deadlock doesn't exist any
-> more either and this commit can be reverted. (Of course, it can also be
-> reverted while the AioContext lock still exists if the callers have been
-> fixed.)
-
-[0]: https://lists.nongnu.org/archive/html/qemu-devel/2023-09/msg00729.html
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- block/graph-lock.c | 24 ++++++++++++++++++++++++
- 1 file changed, 24 insertions(+)
-
-diff --git a/block/graph-lock.c b/block/graph-lock.c
-index 5e66f01ae8..5c2873262a 100644
---- a/block/graph-lock.c
-+++ b/block/graph-lock.c
-@@ -30,8 +30,10 @@ BdrvGraphLock graph_lock;
- /* Protects the list of aiocontext and orphaned_reader_count */
- static QemuMutex aio_context_list_lock;
-+#if 0
- /* Written and read with atomic operations. */
- static int has_writer;
-+#endif
- /*
-  * A reader coroutine could move from an AioContext to another.
-@@ -88,6 +90,7 @@ void unregister_aiocontext(AioContext *ctx)
-     g_free(ctx->bdrv_graph);
- }
-+#if 0
- static uint32_t reader_count(void)
- {
-     BdrvGraphRWlock *brdv_graph;
-@@ -105,12 +108,19 @@ static uint32_t reader_count(void)
-     assert((int32_t)rd >= 0);
-     return rd;
- }
-+#endif
- void bdrv_graph_wrlock(BlockDriverState *bs)
- {
-+#if 0
-     AioContext *ctx = NULL;
-     GLOBAL_STATE_CODE();
-+    /*
-+     * TODO Some callers hold an AioContext lock when this is called, which
-+     * causes deadlocks. Reenable once the AioContext locking is cleaned up (or
-+     * AioContext locks are gone).
-+     */
-     assert(!qatomic_read(&has_writer));
-     /*
-@@ -158,11 +168,13 @@ void bdrv_graph_wrlock(BlockDriverState *bs)
-     if (ctx) {
-         aio_context_acquire(bdrv_get_aio_context(bs));
-     }
-+#endif
- }
- void bdrv_graph_wrunlock(void)
- {
-     GLOBAL_STATE_CODE();
-+#if 0
-     QEMU_LOCK_GUARD(&aio_context_list_lock);
-     assert(qatomic_read(&has_writer));
-@@ -174,10 +186,13 @@ void bdrv_graph_wrunlock(void)
-     /* Wake up all coroutine that are waiting to read the graph */
-     qemu_co_enter_all(&reader_queue, &aio_context_list_lock);
-+#endif
- }
- void coroutine_fn bdrv_graph_co_rdlock(void)
- {
-+    /* TODO Reenable when wrlock is reenabled */
-+#if 0
-     BdrvGraphRWlock *bdrv_graph;
-     bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;
-@@ -237,10 +252,12 @@ void coroutine_fn bdrv_graph_co_rdlock(void)
-             qemu_co_queue_wait(&reader_queue, &aio_context_list_lock);
-         }
-     }
-+#endif
- }
- void coroutine_fn bdrv_graph_co_rdunlock(void)
- {
-+#if 0
-     BdrvGraphRWlock *bdrv_graph;
-     bdrv_graph = qemu_get_current_aio_context()->bdrv_graph;
-@@ -258,6 +275,7 @@ void coroutine_fn bdrv_graph_co_rdunlock(void)
-     if (qatomic_read(&has_writer)) {
-         aio_wait_kick();
-     }
-+#endif
- }
- void bdrv_graph_rdlock_main_loop(void)
-@@ -275,13 +293,19 @@ void bdrv_graph_rdunlock_main_loop(void)
- void assert_bdrv_graph_readable(void)
- {
-     /* reader_count() is slow due to aio_context_list_lock lock contention */
-+    /* TODO Reenable when wrlock is reenabled */
-+#if 0
- #ifdef CONFIG_DEBUG_GRAPH_LOCK
-     assert(qemu_in_main_thread() || reader_count());
- #endif
-+#endif
- }
- void assert_bdrv_graph_writable(void)
- {
-     assert(qemu_in_main_thread());
-+    /* TODO Reenable when wrlock is reenabled */
-+#if 0
-     assert(qatomic_read(&has_writer));
-+#endif
- }
diff --git a/debian/patches/extra/0005-block-copy-before-write-use-uint64_t-for-timeout-in-.patch b/debian/patches/extra/0005-block-copy-before-write-use-uint64_t-for-timeout-in-.patch
new file mode 100644 (file)
index 0000000..a8bdd85
--- /dev/null
@@ -0,0 +1,35 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Mon, 29 Apr 2024 15:41:11 +0200
+Subject: [PATCH] block/copy-before-write: use uint64_t for timeout in
+ nanoseconds
+
+rather than the uint32_t for which the maximum is slightly more than 4
+seconds and larger values would overflow. The QAPI interface allows
+specifying the number of seconds, so only values 0 to 4 are safe right
+now, other values lead to a much lower timeout than a user expects.
+
+The block_copy() call where this is used already takes a uint64_t for
+the timeout, so no change required there.
+
+Fixes: 6db7fd1ca9 ("block/copy-before-write: implement cbw-timeout option")
+Reported-by: Friedrich Weber <f.weber@proxmox.com>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Tested-by: Friedrich Weber <f.weber@proxmox.com>
+---
+ block/copy-before-write.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index 8aba27a71d..026fa9840f 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -43,7 +43,7 @@ typedef struct BDRVCopyBeforeWriteState {
+     BlockCopyState *bcs;
+     BdrvChild *target;
+     OnCbwError on_cbw_error;
+-    uint32_t cbw_timeout_ns;
++    uint64_t cbw_timeout_ns;
+     /*
+      * @lock: protects access to @access_bitmap, @done_bitmap and
diff --git a/debian/patches/extra/0006-migration-states-workaround-snapshot-performance-reg.patch b/debian/patches/extra/0006-migration-states-workaround-snapshot-performance-reg.patch
deleted file mode 100644 (file)
index 8031837..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Thu, 28 Sep 2023 11:19:14 +0200
-Subject: [PATCH] migration states: workaround snapshot performance regression
-
-Commit 813cd616 ("migration: Use migration_transferred_bytes() to
-calculate rate_limit") introduced a prohibitive performance regression
-when taking a snapshot [0]. The reason turns out to be the flushing
-done by migration_transferred_bytes()
-
-Just use a _noflush version of the relevant function as a workaround
-until upstream fixes the issue. This is inspired by a not-applied
-upstream series [1], but doing the very minimum to avoid the
-regression.
-
-[0]: https://gitlab.com/qemu-project/qemu/-/issues/1821
-[1]: https://lists.nongnu.org/archive/html/qemu-devel/2023-05/msg07708.html
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- migration/migration-stats.c | 16 +++++++++++++++-
- 1 file changed, 15 insertions(+), 1 deletion(-)
-
-diff --git a/migration/migration-stats.c b/migration/migration-stats.c
-index 095d6d75bb..8073c8ebaa 100644
---- a/migration/migration-stats.c
-+++ b/migration/migration-stats.c
-@@ -18,6 +18,20 @@
- MigrationAtomicStats mig_stats;
-+/*
-+ * Same as migration_transferred_bytes below, but using the _noflush
-+ * variant of qemu_file_transferred() to avoid a performance
-+ * regression in migration_rate_exceeded().
-+ */
-+static uint64_t migration_transferred_bytes_noflush(QEMUFile *f)
-+{
-+    uint64_t multifd = stat64_get(&mig_stats.multifd_bytes);
-+    uint64_t qemu_file = qemu_file_transferred_noflush(f);
-+
-+    trace_migration_transferred_bytes(qemu_file, multifd);
-+    return qemu_file + multifd;
-+}
-+
- bool migration_rate_exceeded(QEMUFile *f)
- {
-     if (qemu_file_get_error(f)) {
-@@ -25,7 +39,7 @@ bool migration_rate_exceeded(QEMUFile *f)
-     }
-     uint64_t rate_limit_start = stat64_get(&mig_stats.rate_limit_start);
--    uint64_t rate_limit_current = migration_transferred_bytes(f);
-+    uint64_t rate_limit_current = migration_transferred_bytes_noflush(f);
-     uint64_t rate_limit_used = rate_limit_current - rate_limit_start;
-     uint64_t rate_limit_max = stat64_get(&mig_stats.rate_limit_max);
diff --git a/debian/patches/extra/0007-Revert-x86-acpi-workaround-Windows-not-handling-name.patch b/debian/patches/extra/0007-Revert-x86-acpi-workaround-Windows-not-handling-name.patch
deleted file mode 100644 (file)
index 02d9b6b..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Fri, 17 Nov 2023 11:18:06 +0100
-Subject: [PATCH] Revert "x86: acpi: workaround Windows not handling name
- references in Package properly"
-
-This reverts commit 44d975ef340e2f21f236f9520c53e1b30d2213a4.
-
-As reported in the community forum [0] and reproduced locally this
-breaks VirtIO network adapters in (at least) the German ISO of Windows
-Server 2022. The fix itself was for
-
-> Issue is not fatal but as result acpi-index/"PCI Label ID" property
-> is either not shown in device details page or shows incorrect value.
-
-so revert and tolerate that as a stop-gap, rather than have the
-devices not working at all.
-
-[0]: https://forum.proxmox.com/threads/92094/post-605684
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- hw/i386/acpi-build.c | 8 ++------
- 1 file changed, 2 insertions(+), 6 deletions(-)
-
-diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
-index bb12b0ad43..de14d3c3da 100644
---- a/hw/i386/acpi-build.c
-+++ b/hw/i386/acpi-build.c
-@@ -362,13 +362,9 @@ Aml *aml_pci_device_dsm(void)
-     {
-         Aml *params = aml_local(0);
-         Aml *pkg = aml_package(2);
--        aml_append(pkg, aml_int(0));
--        aml_append(pkg, aml_int(0));
-+        aml_append(pkg, aml_name("BSEL"));
-+        aml_append(pkg, aml_name("ASUN"));
-         aml_append(method, aml_store(pkg, params));
--        aml_append(method,
--            aml_store(aml_name("BSEL"), aml_index(params, aml_int(0))));
--        aml_append(method,
--            aml_store(aml_name("ASUN"), aml_index(params, aml_int(1))));
-         aml_append(method,
-             aml_return(aml_call5("PDSM", aml_arg(0), aml_arg(1),
-                                  aml_arg(2), aml_arg(3), params))
diff --git a/debian/patches/extra/0008-target-i386-the-sgx_epc_get_section-stub-is-reachabl.patch b/debian/patches/extra/0008-target-i386-the-sgx_epc_get_section-stub-is-reachabl.patch
deleted file mode 100644 (file)
index 194635f..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Paolo Bonzini <pbonzini@redhat.com>
-Date: Tue, 1 Feb 2022 20:09:41 +0100
-Subject: [PATCH] target/i386: the sgx_epc_get_section stub is reachable
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-The sgx_epc_get_section stub is reachable from cpu_x86_cpuid.  It
-should not assert, instead it should just return true just like
-the "real" sgx_epc_get_section does when SGX is disabled.
-
-Reported-by: Vladimír Beneš <vbenes@redhat.com>
-Cc: qemu-stable@nongnu.org
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-Message-ID: <20220201190941.106001-1-pbonzini@redhat.com>
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-(cherry-picked from commit 219615740425d9683588207b40a365e6741691a6)
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- hw/i386/sgx-stub.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/hw/i386/sgx-stub.c b/hw/i386/sgx-stub.c
-index 26833eb233..16b1dfd90b 100644
---- a/hw/i386/sgx-stub.c
-+++ b/hw/i386/sgx-stub.c
-@@ -34,5 +34,5 @@ void pc_machine_init_sgx_epc(PCMachineState *pcms)
- bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size)
- {
--    g_assert_not_reached();
-+    return true;
- }
diff --git a/debian/patches/extra/0009-ui-clipboard-mark-type-as-not-available-when-there-i.patch b/debian/patches/extra/0009-ui-clipboard-mark-type-as-not-available-when-there-i.patch
deleted file mode 100644 (file)
index 4b09063..0000000
+++ /dev/null
@@ -1,86 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Wed, 24 Jan 2024 11:57:48 +0100
-Subject: [PATCH] ui/clipboard: mark type as not available when there is no
- data
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-With VNC, a client can send a non-extended VNC_MSG_CLIENT_CUT_TEXT
-message with len=0. In qemu_clipboard_set_data(), the clipboard info
-will be updated setting data to NULL (because g_memdup(data, size)
-returns NULL when size is 0). If the client does not set the
-VNC_ENCODING_CLIPBOARD_EXT feature when setting up the encodings, then
-the 'request' callback for the clipboard peer is not initialized.
-Later, because data is NULL, qemu_clipboard_request() can be reached
-via vdagent_chr_write() and vdagent_clipboard_recv_request() and
-there, the clipboard owner's 'request' callback will be attempted to
-be called, but that is a NULL pointer.
-
-In particular, this can happen when using the KRDC (22.12.3) VNC
-client.
-
-Another scenario leading to the same issue is with two clients (say
-noVNC and KRDC):
-
-The noVNC client sets the extension VNC_FEATURE_CLIPBOARD_EXT and
-initializes its cbpeer.
-
-The KRDC client does not, but triggers a vnc_client_cut_text() (note
-it's not the _ext variant)). There, a new clipboard info with it as
-the 'owner' is created and via qemu_clipboard_set_data() is called,
-which in turn calls qemu_clipboard_update() with that info.
-
-In qemu_clipboard_update(), the notifier for the noVNC client will be
-called, i.e. vnc_clipboard_notify() and also set vs->cbinfo for the
-noVNC client. The 'owner' in that clipboard info is the clipboard peer
-for the KRDC client, which did not initialize the 'request' function.
-That sounds correct to me, it is the owner of that clipboard info.
-
-Then when noVNC sends a VNC_MSG_CLIENT_CUT_TEXT message (it did set
-the VNC_FEATURE_CLIPBOARD_EXT feature correctly, so a check for it
-passes), that clipboard info is passed to qemu_clipboard_request() and
-the original segfault still happens.
-
-Fix the issue by handling updates with size 0 differently. In
-particular, mark in the clipboard info that the type is not available.
-
-While at it, switch to g_memdup2(), because g_memdup() is deprecated.
-
-Cc: qemu-stable@nongnu.org
-Fixes: CVE-2023-6683
-Reported-by: Markus Frank <m.frank@proxmox.com>
-Suggested-by: Marc-André Lureau <marcandre.lureau@redhat.com>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Reviewed-by: Marc-André Lureau <marcandre.lureau@redhat.com>
-Tested-by: Markus Frank <m.frank@proxmox.com>
-(picked from https://lists.nongnu.org/archive/html/qemu-stable/2024-01/msg00228.html)
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- ui/clipboard.c | 12 +++++++++---
- 1 file changed, 9 insertions(+), 3 deletions(-)
-
-diff --git a/ui/clipboard.c b/ui/clipboard.c
-index 3d14bffaf8..b3f6fa3c9e 100644
---- a/ui/clipboard.c
-+++ b/ui/clipboard.c
-@@ -163,9 +163,15 @@ void qemu_clipboard_set_data(QemuClipboardPeer *peer,
-     }
-     g_free(info->types[type].data);
--    info->types[type].data = g_memdup(data, size);
--    info->types[type].size = size;
--    info->types[type].available = true;
-+    if (size) {
-+        info->types[type].data = g_memdup2(data, size);
-+        info->types[type].size = size;
-+        info->types[type].available = true;
-+    } else {
-+        info->types[type].data = NULL;
-+        info->types[type].size = 0;
-+        info->types[type].available = false;
-+    }
-     if (update) {
-         qemu_clipboard_update(info);
diff --git a/debian/patches/extra/0010-virtio-scsi-Attach-event-vq-notifier-with-no_poll.patch b/debian/patches/extra/0010-virtio-scsi-Attach-event-vq-notifier-with-no_poll.patch
deleted file mode 100644 (file)
index 85d80e8..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Hanna Czenczek <hreitz@redhat.com>
-Date: Fri, 2 Feb 2024 16:31:56 +0100
-Subject: [PATCH] virtio-scsi: Attach event vq notifier with no_poll
-
-As of commit 38738f7dbbda90fbc161757b7f4be35b52205552 ("virtio-scsi:
-don't waste CPU polling the event virtqueue"), we only attach an io_read
-notifier for the virtio-scsi event virtqueue instead, and no polling
-notifiers.  During operation, the event virtqueue is typically
-non-empty, but none of the buffers are intended to be used immediately.
-Instead, they only get used when certain events occur.  Therefore, it
-makes no sense to continuously poll it when non-empty, because it is
-supposed to be and stay non-empty.
-
-We do this by using virtio_queue_aio_attach_host_notifier_no_poll()
-instead of virtio_queue_aio_attach_host_notifier() for the event
-virtqueue.
-
-Commit 766aa2de0f29b657148e04599320d771c36fd126 ("virtio-scsi: implement
-BlockDevOps->drained_begin()") however has virtio_scsi_drained_end() use
-virtio_queue_aio_attach_host_notifier() for all virtqueues, including
-the event virtqueue.  This can lead to it being polled again, undoing
-the benefit of commit 38738f7dbbda90fbc161757b7f4be35b52205552.
-
-Fix it by using virtio_queue_aio_attach_host_notifier_no_poll() for the
-event virtqueue.
-
-       ("virtio-scsi: implement BlockDevOps->drained_begin()")
-
-Reported-by: Fiona Ebner <f.ebner@proxmox.com>
-Fixes: 766aa2de0f29b657148e04599320d771c36fd126
-Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
-Tested-by: Fiona Ebner <f.ebner@proxmox.com>
-Reviewed-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- hw/scsi/virtio-scsi.c | 7 ++++++-
- 1 file changed, 6 insertions(+), 1 deletion(-)
-
-diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
-index 45b95ea070..ad24a882fd 100644
---- a/hw/scsi/virtio-scsi.c
-+++ b/hw/scsi/virtio-scsi.c
-@@ -1148,6 +1148,7 @@ static void virtio_scsi_drained_begin(SCSIBus *bus)
- static void virtio_scsi_drained_end(SCSIBus *bus)
- {
-     VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus);
-+    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
-     VirtIODevice *vdev = VIRTIO_DEVICE(s);
-     uint32_t total_queues = VIRTIO_SCSI_VQ_NUM_FIXED +
-                             s->parent_obj.conf.num_queues;
-@@ -1165,7 +1166,11 @@ static void virtio_scsi_drained_end(SCSIBus *bus)
-     for (uint32_t i = 0; i < total_queues; i++) {
-         VirtQueue *vq = virtio_get_queue(vdev, i);
--        virtio_queue_aio_attach_host_notifier(vq, s->ctx);
-+        if (vq == vs->event_vq) {
-+            virtio_queue_aio_attach_host_notifier_no_poll(vq, s->ctx);
-+        } else {
-+            virtio_queue_aio_attach_host_notifier(vq, s->ctx);
-+        }
-     }
- }
diff --git a/debian/patches/extra/0011-virtio-Re-enable-notifications-after-drain.patch b/debian/patches/extra/0011-virtio-Re-enable-notifications-after-drain.patch
deleted file mode 100644 (file)
index 618ccb2..0000000
+++ /dev/null
@@ -1,125 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Hanna Czenczek <hreitz@redhat.com>
-Date: Fri, 2 Feb 2024 16:31:57 +0100
-Subject: [PATCH] virtio: Re-enable notifications after drain
-
-During drain, we do not care about virtqueue notifications, which is why
-we remove the handlers on it.  When removing those handlers, whether vq
-notifications are enabled or not depends on whether we were in polling
-mode or not; if not, they are enabled (by default); if so, they have
-been disabled by the io_poll_start callback.
-
-Because we do not care about those notifications after removing the
-handlers, this is fine.  However, we have to explicitly ensure they are
-enabled when re-attaching the handlers, so we will resume receiving
-notifications.  We do this in virtio_queue_aio_attach_host_notifier*().
-If such a function is called while we are in a polling section,
-attaching the notifiers will then invoke the io_poll_start callback,
-re-disabling notifications.
-
-Because we will always miss virtqueue updates in the drained section, we
-also need to poll the virtqueue once after attaching the notifiers.
-
-Buglink: https://issues.redhat.com/browse/RHEL-3934
-Signed-off-by: Hanna Czenczek <hreitz@redhat.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- hw/virtio/virtio.c  | 42 ++++++++++++++++++++++++++++++++++++++++++
- include/block/aio.h |  7 ++++++-
- 2 files changed, 48 insertions(+), 1 deletion(-)
-
-diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
-index 969c25f4cf..02cce83111 100644
---- a/hw/virtio/virtio.c
-+++ b/hw/virtio/virtio.c
-@@ -3526,6 +3526,17 @@ static void virtio_queue_host_notifier_aio_poll_end(EventNotifier *n)
- void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx)
- {
-+    /*
-+     * virtio_queue_aio_detach_host_notifier() can leave notifications disabled.
-+     * Re-enable them.  (And if detach has not been used before, notifications
-+     * being enabled is still the default state while a notifier is attached;
-+     * see virtio_queue_host_notifier_aio_poll_end(), which will always leave
-+     * notifications enabled once the polling section is left.)
-+     */
-+    if (!virtio_queue_get_notification(vq)) {
-+        virtio_queue_set_notification(vq, 1);
-+    }
-+
-     aio_set_event_notifier(ctx, &vq->host_notifier,
-                            virtio_queue_host_notifier_read,
-                            virtio_queue_host_notifier_aio_poll,
-@@ -3533,6 +3544,13 @@ void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx)
-     aio_set_event_notifier_poll(ctx, &vq->host_notifier,
-                                 virtio_queue_host_notifier_aio_poll_begin,
-                                 virtio_queue_host_notifier_aio_poll_end);
-+
-+    /*
-+     * We will have ignored notifications about new requests from the guest
-+     * while no notifiers were attached, so "kick" the virt queue to process
-+     * those requests now.
-+     */
-+    event_notifier_set(&vq->host_notifier);
- }
- /*
-@@ -3543,14 +3561,38 @@ void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx)
-  */
- void virtio_queue_aio_attach_host_notifier_no_poll(VirtQueue *vq, AioContext *ctx)
- {
-+    /* See virtio_queue_aio_attach_host_notifier() */
-+    if (!virtio_queue_get_notification(vq)) {
-+        virtio_queue_set_notification(vq, 1);
-+    }
-+
-     aio_set_event_notifier(ctx, &vq->host_notifier,
-                            virtio_queue_host_notifier_read,
-                            NULL, NULL);
-+
-+    /*
-+     * See virtio_queue_aio_attach_host_notifier().
-+     * Note that this may be unnecessary for the type of virtqueues this
-+     * function is used for.  Still, it will not hurt to have a quick look into
-+     * whether we can/should process any of the virtqueue elements.
-+     */
-+    event_notifier_set(&vq->host_notifier);
- }
- void virtio_queue_aio_detach_host_notifier(VirtQueue *vq, AioContext *ctx)
- {
-     aio_set_event_notifier(ctx, &vq->host_notifier, NULL, NULL, NULL);
-+
-+    /*
-+     * aio_set_event_notifier_poll() does not guarantee whether io_poll_end()
-+     * will run after io_poll_begin(), so by removing the notifier, we do not
-+     * know whether virtio_queue_host_notifier_aio_poll_end() has run after a
-+     * previous virtio_queue_host_notifier_aio_poll_begin(), i.e. whether
-+     * notifications are enabled or disabled.  It does not really matter anyway;
-+     * we just removed the notifier, so we do not care about notifications until
-+     * we potentially re-attach it.  The attach_host_notifier functions will
-+     * ensure that notifications are enabled again when they are needed.
-+     */
- }
- void virtio_queue_host_notifier_read(EventNotifier *n)
-diff --git a/include/block/aio.h b/include/block/aio.h
-index 32042e8905..79efadfa48 100644
---- a/include/block/aio.h
-+++ b/include/block/aio.h
-@@ -498,9 +498,14 @@ void aio_set_event_notifier(AioContext *ctx,
-                             AioPollFn *io_poll,
-                             EventNotifierHandler *io_poll_ready);
--/* Set polling begin/end callbacks for an event notifier that has already been
-+/*
-+ * Set polling begin/end callbacks for an event notifier that has already been
-  * registered with aio_set_event_notifier.  Do nothing if the event notifier is
-  * not registered.
-+ *
-+ * Note that if the io_poll_end() callback (or the entire notifier) is removed
-+ * during polling, it will not be called, so an io_poll_begin() is not
-+ * necessarily always followed by an io_poll_end().
-  */
- void aio_set_event_notifier_poll(AioContext *ctx,
-                                  EventNotifier *notifier,
diff --git a/debian/patches/extra/0012-qemu_init-increase-NOFILE-soft-limit-on-POSIX.patch b/debian/patches/extra/0012-qemu_init-increase-NOFILE-soft-limit-on-POSIX.patch
deleted file mode 100644 (file)
index cdc1e06..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Mon, 18 Dec 2023 11:13:40 +0100
-Subject: [PATCH] qemu_init: increase NOFILE soft limit on POSIX
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-In many configurations, e.g. multiple vNICs with multiple queues or
-with many Ceph OSDs, the default soft limit of 1024 is not enough.
-QEMU is supposed to work fine with file descriptors >= 1024 and does
-not use select() on POSIX. Bump the soft limit to the allowed hard
-limit to avoid issues with the aforementioned configurations.
-
-Of course the limit could be raised from the outside, but the man page
-of systemd.exec states about 'LimitNOFILE=':
-
-> Don't use.
-> [...]
-> Typically applications should increase their soft limit to the hard
-> limit on their own, if they are OK with working with file
-> descriptors above 1023,
-
-If the soft limit is already the same as the hard limit, avoid the
-superfluous setrlimit call. This can avoid a warning with a strict
-seccomp filter blocking setrlimit if NOFILE was already raised before
-executing QEMU.
-
-Buglink: https://bugzilla.proxmox.com/show_bug.cgi?id=4507
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
----
- include/sysemu/os-posix.h |  1 +
- include/sysemu/os-win32.h |  5 +++++
- os-posix.c                | 22 ++++++++++++++++++++++
- softmmu/vl.c              |  2 ++
- 4 files changed, 30 insertions(+)
-
-diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h
-index 1030d39904..edc415aff5 100644
---- a/include/sysemu/os-posix.h
-+++ b/include/sysemu/os-posix.h
-@@ -48,6 +48,7 @@ void os_setup_early_signal_handling(void);
- void os_set_proc_name(const char *s);
- void os_setup_signal_handling(void);
- void os_daemonize(void);
-+void os_setup_limits(void);
- void os_setup_post(void);
- int os_mlock(void);
-diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h
-index 91aa0d7ec0..f6e23fe01e 100644
---- a/include/sysemu/os-win32.h
-+++ b/include/sysemu/os-win32.h
-@@ -129,6 +129,11 @@ static inline int os_mlock(void)
-     return -ENOSYS;
- }
-+void os_setup_limits(void)
-+{
-+    return;
-+}
-+
- #define fsync _commit
- #if !defined(lseek)
-diff --git a/os-posix.c b/os-posix.c
-index cfcb96533c..0cc1d991b1 100644
---- a/os-posix.c
-+++ b/os-posix.c
-@@ -24,6 +24,7 @@
-  */
- #include "qemu/osdep.h"
-+#include <sys/resource.h>
- #include <sys/wait.h>
- #include <pwd.h>
- #include <grp.h>
-@@ -286,6 +287,27 @@ void os_daemonize(void)
-     }
- }
-+void os_setup_limits(void)
-+{
-+    struct rlimit nofile;
-+
-+    if (getrlimit(RLIMIT_NOFILE, &nofile) < 0) {
-+        warn_report("unable to query NOFILE limit: %s", strerror(errno));
-+        return;
-+    }
-+
-+    if (nofile.rlim_cur == nofile.rlim_max) {
-+        return;
-+    }
-+
-+    nofile.rlim_cur = nofile.rlim_max;
-+
-+    if (setrlimit(RLIMIT_NOFILE, &nofile) < 0) {
-+        warn_report("unable to set NOFILE limit: %s", strerror(errno));
-+        return;
-+    }
-+}
-+
- void os_setup_post(void)
- {
-     int fd = 0;
-diff --git a/softmmu/vl.c b/softmmu/vl.c
-index c9e9ede237..ba6ad8a8df 100644
---- a/softmmu/vl.c
-+++ b/softmmu/vl.c
-@@ -2713,6 +2713,8 @@ void qemu_init(int argc, char **argv)
-     error_init(argv[0]);
-     qemu_init_exec_dir(argv[0]);
-+    os_setup_limits();
-+
-     qemu_init_arch_modules();
-     qemu_init_subsystems();
diff --git a/debian/patches/extra/0013-virtio-blk-avoid-using-ioeventfd-state-in-irqfd-cond.patch b/debian/patches/extra/0013-virtio-blk-avoid-using-ioeventfd-state-in-irqfd-cond.patch
deleted file mode 100644 (file)
index 8109e7d..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Stefan Hajnoczi <stefanha@redhat.com>
-Date: Mon, 22 Jan 2024 12:26:25 -0500
-Subject: [PATCH] virtio-blk: avoid using ioeventfd state in irqfd conditional
-
-Requests that complete in an IOThread use irqfd to notify the guest
-while requests that complete in the main loop thread use the traditional
-qdev irq code path. The reason for this conditional is that the irq code
-path requires the BQL:
-
-  if (s->ioeventfd_started && !s->ioeventfd_disabled) {
-      virtio_notify_irqfd(vdev, req->vq);
-  } else {
-      virtio_notify(vdev, req->vq);
-  }
-
-There is a corner case where the conditional invokes the irq code path
-instead of the irqfd code path:
-
-  static void virtio_blk_stop_ioeventfd(VirtIODevice *vdev)
-  {
-      ...
-      /*
-       * Set ->ioeventfd_started to false before draining so that host notifiers
-       * are not detached/attached anymore.
-       */
-      s->ioeventfd_started = false;
-
-      /* Wait for virtio_blk_dma_restart_bh() and in flight I/O to complete */
-      blk_drain(s->conf.conf.blk);
-
-During blk_drain() the conditional produces the wrong result because
-ioeventfd_started is false.
-
-Use qemu_in_iothread() instead of checking the ioeventfd state.
-
-Cc: qemu-stable@nongnu.org
-Buglink: https://issues.redhat.com/browse/RHEL-15394
-Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
-Message-ID: <20240122172625.415386-1-stefanha@redhat.com>
-Reviewed-by: Kevin Wolf <kwolf@redhat.com>
-Signed-off-by: Kevin Wolf <kwolf@redhat.com>
-[FE: backport: dataplane -> ioeventfd rework didn't happen yet]
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- hw/block/virtio-blk.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
-index 39e7f23fab..61bd1f6859 100644
---- a/hw/block/virtio-blk.c
-+++ b/hw/block/virtio-blk.c
-@@ -64,7 +64,7 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, unsigned char status)
-     iov_discard_undo(&req->inhdr_undo);
-     iov_discard_undo(&req->outhdr_undo);
-     virtqueue_push(req->vq, &req->elem, req->in_len);
--    if (s->dataplane_started && !s->dataplane_disabled) {
-+    if (qemu_in_iothread()) {
-         virtio_blk_data_plane_notify(s->dataplane, req->vq);
-     } else {
-         virtio_notify(vdev, req->vq);
index 30dd2d4b76522a097842faf9f33512001596483d..f68e0df1bed7faac92f6db0fed9672be3e725374 100644 (file)
@@ -14,7 +14,7 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 2 insertions(+), 2 deletions(-)
 
 diff --git a/block/file-posix.c b/block/file-posix.c
-index 7f540b03ed..ca551baa42 100644
+index 35684f7e21..43bc0bd520 100644
 --- a/block/file-posix.c
 +++ b/block/file-posix.c
 @@ -563,7 +563,7 @@ static QemuOptsList raw_runtime_opts = {
index f7c9754212c16d9db4617c4d89cd4cf5eb87b259..62bbda8d141ffe12fa8ff9987bca9dd1ebf6d01d 100644 (file)
@@ -9,10 +9,10 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 2 insertions(+), 2 deletions(-)
 
 diff --git a/include/net/net.h b/include/net/net.h
-index 685ec58318..22edf4ee96 100644
+index b1f9b35fcc..096c0d52e4 100644
 --- a/include/net/net.h
 +++ b/include/net/net.h
-@@ -260,8 +260,8 @@ void netdev_add(QemuOpts *opts, Error **errp);
+@@ -317,8 +317,8 @@ void netdev_add(QemuOpts *opts, Error **errp);
  int net_hub_id_for_client(NetClientState *nc, int *id);
  NetClientState *net_hub_port_find(int hub_id);
  
index 4955ba3d1f99a5d2815337fba0e6b1f45ae9c255..71236cfb2118faf4d3766055fb97496038e36dac 100644 (file)
@@ -10,10 +10,10 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 2 insertions(+), 2 deletions(-)
 
 diff --git a/target/i386/cpu.h b/target/i386/cpu.h
-index 0893b794e9..6d650a58b9 100644
+index 6b05738079..d82869900a 100644
 --- a/target/i386/cpu.h
 +++ b/target/i386/cpu.h
-@@ -2243,9 +2243,9 @@ uint64_t cpu_get_tsc(CPUX86State *env);
+@@ -2291,9 +2291,9 @@ uint64_t cpu_get_tsc(CPUX86State *env);
  #define CPU_RESOLVING_TYPE TYPE_X86_CPU
  
  #ifdef TARGET_X86_64
index 6405a25815f7059c814f41faaa0dcef2bb7a6291..d555c4815e4d8d5e29acb99c881a263d35d5de37 100644 (file)
@@ -9,10 +9,10 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 9 insertions(+), 6 deletions(-)
 
 diff --git a/ui/spice-core.c b/ui/spice-core.c
-index 52a59386d7..b20c25aee0 100644
+index 15be640286..ea20e6153c 100644
 --- a/ui/spice-core.c
 +++ b/ui/spice-core.c
-@@ -691,32 +691,35 @@ static void qemu_spice_init(void)
+@@ -690,32 +690,35 @@ static void qemu_spice_init(void)
  
      if (tls_port) {
          x509_dir = qemu_opt_get(opts, "x509-dir");
index 947fc9076c653fb696546b7af3f8218ef877c30b..cb9497608862d84508249702d280bfd4e1b6d935 100644 (file)
@@ -9,7 +9,7 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 11 insertions(+), 4 deletions(-)
 
 diff --git a/block/gluster.c b/block/gluster.c
-index ad5fadbe79..d0011085c4 100644
+index cc74af06dc..3ba9bbfa5e 100644
 --- a/block/gluster.c
 +++ b/block/gluster.c
 @@ -43,7 +43,7 @@
index 4bdb7b5cdc4d73f34d34a2588b789cc5cd3bec95..8881ab8c2c7f2d95f74732e83e1a8df7fadf6df6 100644 (file)
@@ -18,7 +18,7 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 2 insertions(+)
 
 diff --git a/block/rbd.c b/block/rbd.c
-index 978671411e..a4749f3b1b 100644
+index 84bb2fa5d7..63f60d41be 100644
 --- a/block/rbd.c
 +++ b/block/rbd.c
 @@ -963,6 +963,8 @@ static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
index c4e67299e77785bee5b66b9adeef12ed46b1c53d..56f56f6e39a0a4c75a4cd22a50789222a012e587 100644 (file)
@@ -16,7 +16,7 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 9 insertions(+), 1 deletion(-)
 
 diff --git a/block/gluster.c b/block/gluster.c
-index d0011085c4..2df3d6e35d 100644
+index 3ba9bbfa5e..34936eb855 100644
 --- a/block/gluster.c
 +++ b/block/gluster.c
 @@ -58,6 +58,7 @@ typedef struct GlusterAIOCB {
@@ -39,7 +39,7 @@ index d0011085c4..2df3d6e35d 100644
      }
  
      aio_co_schedule(acb->aio_context, acb->coroutine);
-@@ -1021,6 +1024,7 @@ static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
+@@ -1023,6 +1026,7 @@ static coroutine_fn int qemu_gluster_co_pwrite_zeroes(BlockDriverState *bs,
      acb.ret = 0;
      acb.coroutine = qemu_coroutine_self();
      acb.aio_context = bdrv_get_aio_context(bs);
@@ -47,7 +47,7 @@ index d0011085c4..2df3d6e35d 100644
  
      ret = glfs_zerofill_async(s->fd, offset, bytes, gluster_finish_aiocb, &acb);
      if (ret < 0) {
-@@ -1201,9 +1205,11 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
+@@ -1203,9 +1207,11 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
      acb.aio_context = bdrv_get_aio_context(bs);
  
      if (write) {
@@ -59,7 +59,7 @@ index d0011085c4..2df3d6e35d 100644
          ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
                                  gluster_finish_aiocb, &acb);
      }
-@@ -1266,6 +1272,7 @@ static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
+@@ -1268,6 +1274,7 @@ static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
      acb.ret = 0;
      acb.coroutine = qemu_coroutine_self();
      acb.aio_context = bdrv_get_aio_context(bs);
@@ -67,7 +67,7 @@ index d0011085c4..2df3d6e35d 100644
  
      ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb);
      if (ret < 0) {
-@@ -1314,6 +1321,7 @@ static coroutine_fn int qemu_gluster_co_pdiscard(BlockDriverState *bs,
+@@ -1316,6 +1323,7 @@ static coroutine_fn int qemu_gluster_co_pdiscard(BlockDriverState *bs,
      acb.ret = 0;
      acb.coroutine = qemu_coroutine_self();
      acb.aio_context = bdrv_get_aio_context(bs);
index 3bdb7ee6712772ebbe450b4572060616f8e0fce1..45eacb96ead2806bcd5eb4084a0faf84cea80474 100644 (file)
@@ -9,10 +9,10 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 2 insertions(+), 1 deletion(-)
 
 diff --git a/qemu-img.c b/qemu-img.c
-index 78433f3746..25d427edd1 100644
+index 7668f86769..2575e97b43 100644
 --- a/qemu-img.c
 +++ b/qemu-img.c
-@@ -3062,7 +3062,8 @@ static int img_info(int argc, char **argv)
+@@ -3075,7 +3075,8 @@ static int img_info(int argc, char **argv)
      list = collect_image_info_list(image_opts, filename, fmt, chain,
                                     force_share);
      if (!list) {
index 5255268dc0d37919a58273b98e8c8204e8e7a8e1..bc03837d92f5f6a1823214561ed2cbc37a5af0bd 100644 (file)
@@ -38,10 +38,10 @@ Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
  2 files changed, 133 insertions(+), 73 deletions(-)
 
 diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
-index 1b1dab5b17..d1616c045a 100644
+index c9dd70a892..048788b23d 100644
 --- a/qemu-img-cmds.hx
 +++ b/qemu-img-cmds.hx
-@@ -58,9 +58,9 @@ SRST
+@@ -60,9 +60,9 @@ SRST
  ERST
  
  DEF("dd", img_dd,
@@ -54,10 +54,10 @@ index 1b1dab5b17..d1616c045a 100644
  
  DEF("info", img_info,
 diff --git a/qemu-img.c b/qemu-img.c
-index 25d427edd1..220e6ec577 100644
+index 2575e97b43..8ec68b346f 100644
 --- a/qemu-img.c
 +++ b/qemu-img.c
-@@ -4899,10 +4899,12 @@ static int img_bitmap(int argc, char **argv)
+@@ -4993,10 +4993,12 @@ static int img_bitmap(int argc, char **argv)
  #define C_IF      04
  #define C_OF      010
  #define C_SKIP    020
@@ -70,7 +70,7 @@ index 25d427edd1..220e6ec577 100644
  };
  
  struct DdIo {
-@@ -4978,6 +4980,19 @@ static int img_dd_skip(const char *arg,
+@@ -5072,6 +5074,19 @@ static int img_dd_skip(const char *arg,
      return 0;
  }
  
@@ -90,7 +90,7 @@ index 25d427edd1..220e6ec577 100644
  static int img_dd(int argc, char **argv)
  {
      int ret = 0;
-@@ -5018,6 +5033,7 @@ static int img_dd(int argc, char **argv)
+@@ -5112,6 +5127,7 @@ static int img_dd(int argc, char **argv)
          { "if", img_dd_if, C_IF },
          { "of", img_dd_of, C_OF },
          { "skip", img_dd_skip, C_SKIP },
@@ -98,7 +98,7 @@ index 25d427edd1..220e6ec577 100644
          { NULL, NULL, 0 }
      };
      const struct option long_options[] = {
-@@ -5093,91 +5109,112 @@ static int img_dd(int argc, char **argv)
+@@ -5187,91 +5203,112 @@ static int img_dd(int argc, char **argv)
          arg = NULL;
      }
  
@@ -275,7 +275,7 @@ index 25d427edd1..220e6ec577 100644
      }
  
      if (dd.flags & C_SKIP && (in.offset > INT64_MAX / in.bsz ||
-@@ -5194,20 +5231,43 @@ static int img_dd(int argc, char **argv)
+@@ -5288,20 +5325,43 @@ static int img_dd(int argc, char **argv)
      in.buf = g_new(uint8_t, in.bsz);
  
      for (out_pos = 0; in_pos < size; ) {
index d68e2aae47a6596b7e85ad4357f5c950a39f1564..31bbce28457282b290edb3e3ff6dbff3eb926988 100644 (file)
@@ -16,10 +16,10 @@ Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
  1 file changed, 25 insertions(+), 3 deletions(-)
 
 diff --git a/qemu-img.c b/qemu-img.c
-index 220e6ec577..58bf9b43d1 100644
+index 8ec68b346f..b98184bba1 100644
 --- a/qemu-img.c
 +++ b/qemu-img.c
-@@ -4900,11 +4900,13 @@ static int img_bitmap(int argc, char **argv)
+@@ -4994,11 +4994,13 @@ static int img_bitmap(int argc, char **argv)
  #define C_OF      010
  #define C_SKIP    020
  #define C_OSIZE   040
@@ -33,7 +33,7 @@ index 220e6ec577..58bf9b43d1 100644
  };
  
  struct DdIo {
-@@ -4993,6 +4995,19 @@ static int img_dd_osize(const char *arg,
+@@ -5087,6 +5089,19 @@ static int img_dd_osize(const char *arg,
      return 0;
  }
  
@@ -53,7 +53,7 @@ index 220e6ec577..58bf9b43d1 100644
  static int img_dd(int argc, char **argv)
  {
      int ret = 0;
-@@ -5007,12 +5022,14 @@ static int img_dd(int argc, char **argv)
+@@ -5101,12 +5116,14 @@ static int img_dd(int argc, char **argv)
      int c, i;
      const char *out_fmt = "raw";
      const char *fmt = NULL;
@@ -69,7 +69,7 @@ index 220e6ec577..58bf9b43d1 100644
      };
      struct DdIo in = {
          .bsz = 512, /* Block size is by default 512 bytes */
-@@ -5034,6 +5051,7 @@ static int img_dd(int argc, char **argv)
+@@ -5128,6 +5145,7 @@ static int img_dd(int argc, char **argv)
          { "of", img_dd_of, C_OF },
          { "skip", img_dd_skip, C_SKIP },
          { "osize", img_dd_osize, C_OSIZE },
@@ -77,7 +77,7 @@ index 220e6ec577..58bf9b43d1 100644
          { NULL, NULL, 0 }
      };
      const struct option long_options[] = {
-@@ -5230,9 +5248,10 @@ static int img_dd(int argc, char **argv)
+@@ -5324,9 +5342,10 @@ static int img_dd(int argc, char **argv)
  
      in.buf = g_new(uint8_t, in.bsz);
  
@@ -90,7 +90,7 @@ index 220e6ec577..58bf9b43d1 100644
          if (blk1) {
              in_ret = blk_pread(blk1, in_pos, bytes, in.buf, 0);
              if (in_ret == 0) {
-@@ -5241,6 +5260,9 @@ static int img_dd(int argc, char **argv)
+@@ -5335,6 +5354,9 @@ static int img_dd(int argc, char **argv)
          } else {
              in_ret = read(STDIN_FILENO, in.buf, bytes);
              if (in_ret == 0) {
index 5131d98a00ca241167958fabf41756f258c24287..a5bf7693db0e1c06c4eccc3a35802f3aaae292c2 100644 (file)
@@ -13,10 +13,10 @@ Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
  3 files changed, 26 insertions(+), 12 deletions(-)
 
 diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst
-index 15aeddc6d8..5e713e231d 100644
+index 3653adb963..d83e8fb3c0 100644
 --- a/docs/tools/qemu-img.rst
 +++ b/docs/tools/qemu-img.rst
-@@ -208,6 +208,10 @@ Parameters to convert subcommand:
+@@ -212,6 +212,10 @@ Parameters to convert subcommand:
  
  Parameters to dd subcommand:
  
@@ -27,7 +27,7 @@ index 15aeddc6d8..5e713e231d 100644
  .. program:: qemu-img-dd
  
  .. option:: bs=BLOCK_SIZE
-@@ -488,7 +492,7 @@ Command description:
+@@ -492,7 +496,7 @@ Command description:
    it doesn't need to be specified separately in this case.
  
  
@@ -36,7 +36,7 @@ index 15aeddc6d8..5e713e231d 100644
  
    dd copies from *INPUT* file to *OUTPUT* file converting it from
    *FMT* format to *OUTPUT_FMT* format.
-@@ -499,6 +503,11 @@ Command description:
+@@ -503,6 +507,11 @@ Command description:
  
    The size syntax is similar to :manpage:`dd(1)`'s size syntax.
  
@@ -49,10 +49,10 @@ index 15aeddc6d8..5e713e231d 100644
  
    Give information about the disk image *FILENAME*. Use it in
 diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
-index d1616c045a..b5b0bb4467 100644
+index 048788b23d..0b29a67a06 100644
 --- a/qemu-img-cmds.hx
 +++ b/qemu-img-cmds.hx
-@@ -58,9 +58,9 @@ SRST
+@@ -60,9 +60,9 @@ SRST
  ERST
  
  DEF("dd", img_dd,
@@ -65,10 +65,10 @@ index d1616c045a..b5b0bb4467 100644
  
  DEF("info", img_info,
 diff --git a/qemu-img.c b/qemu-img.c
-index 58bf9b43d1..9d414d639b 100644
+index b98184bba1..6fc8384f64 100644
 --- a/qemu-img.c
 +++ b/qemu-img.c
-@@ -5024,7 +5024,7 @@ static int img_dd(int argc, char **argv)
+@@ -5118,7 +5118,7 @@ static int img_dd(int argc, char **argv)
      const char *fmt = NULL;
      int64_t size = 0, readsize = 0;
      int64_t out_pos, in_pos;
@@ -77,7 +77,7 @@ index 58bf9b43d1..9d414d639b 100644
      struct DdInfo dd = {
          .flags = 0,
          .count = 0,
-@@ -5062,7 +5062,7 @@ static int img_dd(int argc, char **argv)
+@@ -5156,7 +5156,7 @@ static int img_dd(int argc, char **argv)
          { 0, 0, 0, 0 }
      };
  
@@ -86,7 +86,7 @@ index 58bf9b43d1..9d414d639b 100644
          if (c == EOF) {
              break;
          }
-@@ -5082,6 +5082,9 @@ static int img_dd(int argc, char **argv)
+@@ -5176,6 +5176,9 @@ static int img_dd(int argc, char **argv)
          case 'h':
              help();
              break;
@@ -96,7 +96,7 @@ index 58bf9b43d1..9d414d639b 100644
          case 'U':
              force_share = true;
              break;
-@@ -5212,13 +5215,15 @@ static int img_dd(int argc, char **argv)
+@@ -5306,13 +5309,15 @@ static int img_dd(int argc, char **argv)
                                  size - in.bsz * in.offset, &error_abort);
          }
  
index a9567952388f6ef7e0aeed05ba0e5122f7fb4be4..93a738094bd34529c672d9ffccb7a117e964a55d 100644 (file)
@@ -12,10 +12,10 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  3 files changed, 36 insertions(+), 7 deletions(-)
 
 diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst
-index 5e713e231d..9390d5e5cf 100644
+index d83e8fb3c0..61c6b21859 100644
 --- a/docs/tools/qemu-img.rst
 +++ b/docs/tools/qemu-img.rst
-@@ -492,10 +492,10 @@ Command description:
+@@ -496,10 +496,10 @@ Command description:
    it doesn't need to be specified separately in this case.
  
  
@@ -30,10 +30,10 @@ index 5e713e231d..9390d5e5cf 100644
    The data is by default read and written using blocks of 512 bytes but can be
    modified by specifying *BLOCK_SIZE*. If count=\ *BLOCKS* is specified
 diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
-index b5b0bb4467..36f97e1f19 100644
+index 0b29a67a06..758f397232 100644
 --- a/qemu-img-cmds.hx
 +++ b/qemu-img-cmds.hx
-@@ -58,9 +58,9 @@ SRST
+@@ -60,9 +60,9 @@ SRST
  ERST
  
  DEF("dd", img_dd,
@@ -46,10 +46,10 @@ index b5b0bb4467..36f97e1f19 100644
  
  DEF("info", img_info,
 diff --git a/qemu-img.c b/qemu-img.c
-index 9d414d639b..e13a12137b 100644
+index 6fc8384f64..a6c88e0860 100644
 --- a/qemu-img.c
 +++ b/qemu-img.c
-@@ -5016,6 +5016,7 @@ static int img_dd(int argc, char **argv)
+@@ -5110,6 +5110,7 @@ static int img_dd(int argc, char **argv)
      BlockDriver *drv = NULL, *proto_drv = NULL;
      BlockBackend *blk1 = NULL, *blk2 = NULL;
      QemuOpts *opts = NULL;
@@ -57,7 +57,7 @@ index 9d414d639b..e13a12137b 100644
      QemuOptsList *create_opts = NULL;
      Error *local_err = NULL;
      bool image_opts = false;
-@@ -5025,6 +5026,7 @@ static int img_dd(int argc, char **argv)
+@@ -5119,6 +5120,7 @@ static int img_dd(int argc, char **argv)
      int64_t size = 0, readsize = 0;
      int64_t out_pos, in_pos;
      bool force_share = false, skip_create = false;
@@ -65,7 +65,7 @@ index 9d414d639b..e13a12137b 100644
      struct DdInfo dd = {
          .flags = 0,
          .count = 0,
-@@ -5062,7 +5064,7 @@ static int img_dd(int argc, char **argv)
+@@ -5156,7 +5158,7 @@ static int img_dd(int argc, char **argv)
          { 0, 0, 0, 0 }
      };
  
@@ -74,7 +74,7 @@ index 9d414d639b..e13a12137b 100644
          if (c == EOF) {
              break;
          }
-@@ -5085,6 +5087,19 @@ static int img_dd(int argc, char **argv)
+@@ -5179,6 +5181,19 @@ static int img_dd(int argc, char **argv)
          case 'n':
              skip_create = true;
              break;
@@ -94,7 +94,7 @@ index 9d414d639b..e13a12137b 100644
          case 'U':
              force_share = true;
              break;
-@@ -5144,11 +5159,24 @@ static int img_dd(int argc, char **argv)
+@@ -5238,11 +5253,24 @@ static int img_dd(int argc, char **argv)
      if (dd.flags & C_IF) {
          blk1 = img_open(image_opts, in.filename, fmt, 0, false, false,
                          force_share);
@@ -120,7 +120,7 @@ index 9d414d639b..e13a12137b 100644
      }
  
      if (dd.flags & C_OSIZE) {
-@@ -5303,6 +5331,7 @@ static int img_dd(int argc, char **argv)
+@@ -5397,6 +5425,7 @@ static int img_dd(int argc, char **argv)
  out:
      g_free(arg);
      qemu_opts_del(opts);
index 2c1524f55ff541a9290e37557cafeb0c913cc0cb..4fc62157911e608ad2543f2dea80603e14cec74b 100644 (file)
@@ -18,10 +18,10 @@ Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
  4 files changed, 82 insertions(+), 4 deletions(-)
 
 diff --git a/hw/core/machine-hmp-cmds.c b/hw/core/machine-hmp-cmds.c
-index c3e55ef9e9..0e32e6201f 100644
+index a6ff6a4875..e7f74d1c63 100644
 --- a/hw/core/machine-hmp-cmds.c
 +++ b/hw/core/machine-hmp-cmds.c
-@@ -169,7 +169,35 @@ void hmp_info_balloon(Monitor *mon, const QDict *qdict)
+@@ -175,7 +175,35 @@ void hmp_info_balloon(Monitor *mon, const QDict *qdict)
          return;
      }
  
@@ -59,10 +59,10 @@ index c3e55ef9e9..0e32e6201f 100644
      qapi_free_BalloonInfo(info);
  }
 diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
-index d004cf29d2..2660ed520b 100644
+index 609e39a821..8cb6dfcac3 100644
 --- a/hw/virtio/virtio-balloon.c
 +++ b/hw/virtio/virtio-balloon.c
-@@ -782,8 +782,37 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
+@@ -781,8 +781,37 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
  static void virtio_balloon_stat(void *opaque, BalloonInfo *info)
  {
      VirtIOBalloon *dev = opaque;
@@ -103,10 +103,10 @@ index d004cf29d2..2660ed520b 100644
  
  static void virtio_balloon_to_target(void *opaque, ram_addr_t target)
 diff --git a/qapi/machine.json b/qapi/machine.json
-index a08b6576ca..5c9a4d55f4 100644
+index e8b60641f2..2054cdc70d 100644
 --- a/qapi/machine.json
 +++ b/qapi/machine.json
-@@ -1063,9 +1063,29 @@
+@@ -1079,9 +1079,29 @@
  # @actual: the logical size of the VM in bytes Formula used:
  #     logical_vm_size = vm_ram_size - balloon_size
  #
@@ -138,10 +138,10 @@ index a08b6576ca..5c9a4d55f4 100644
  ##
  # @query-balloon:
 diff --git a/qapi/pragma.json b/qapi/pragma.json
-index 7f810b0e97..325e684411 100644
+index 59fbe74b8c..be8fa304c5 100644
 --- a/qapi/pragma.json
 +++ b/qapi/pragma.json
-@@ -35,6 +35,7 @@
+@@ -90,6 +90,7 @@
      'member-name-exceptions': [     # visible in:
          'ACPISlotType',             # query-acpi-ospm-status
          'AcpiTableOptions',         # -acpitable
index ab331f3c18eeb9d467d36bc4b041d565abd616ea..255faf52486b1a26ff276e8b9a591c40f2370d48 100644 (file)
@@ -13,10 +13,10 @@ Signed-off-by: Dietmar Maurer <dietmar@proxmox.com>
  2 files changed, 9 insertions(+), 1 deletion(-)
 
 diff --git a/hw/core/machine-qmp-cmds.c b/hw/core/machine-qmp-cmds.c
-index 3860a50c3b..40821e2317 100644
+index 4b72009cd3..314351cdff 100644
 --- a/hw/core/machine-qmp-cmds.c
 +++ b/hw/core/machine-qmp-cmds.c
-@@ -91,6 +91,12 @@ MachineInfoList *qmp_query_machines(Error **errp)
+@@ -90,6 +90,12 @@ MachineInfoList *qmp_query_machines(Error **errp)
          info->numa_mem_supported = mc->numa_mem_supported;
          info->deprecated = !!mc->deprecation_reason;
          info->acpi = !!object_class_property_find(OBJECT_CLASS(mc), "acpi");
@@ -30,10 +30,10 @@ index 3860a50c3b..40821e2317 100644
              info->default_cpu_type = g_strdup(mc->default_cpu_type);
          }
 diff --git a/qapi/machine.json b/qapi/machine.json
-index 5c9a4d55f4..fbb61f18e4 100644
+index 2054cdc70d..a024d5b05d 100644
 --- a/qapi/machine.json
 +++ b/qapi/machine.json
-@@ -139,6 +139,8 @@
+@@ -146,6 +146,8 @@
  #
  # @is-default: whether the machine is default
  #
@@ -42,7 +42,7 @@ index 5c9a4d55f4..fbb61f18e4 100644
  # @cpu-max: maximum number of CPUs supported by the machine type
  #     (since 1.5)
  #
-@@ -163,7 +165,7 @@
+@@ -170,7 +172,7 @@
  ##
  { 'struct': 'MachineInfo',
    'data': { 'name': 'str', '*alias': 'str',
index 26c6840da8e36c0c9d802ce0d8194e3107794d4b..b1aff6a1a45193025492f6449cceb8876e238dd1 100644 (file)
@@ -14,10 +14,10 @@ Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
  2 files changed, 7 insertions(+)
 
 diff --git a/qapi/ui.json b/qapi/ui.json
-index 006616aa77..dfd1d3e36b 100644
+index f610bce118..6ea26a9acb 100644
 --- a/qapi/ui.json
 +++ b/qapi/ui.json
-@@ -317,11 +317,14 @@
+@@ -314,11 +314,14 @@
  #
  # @channels: a list of @SpiceChannel for each active spice channel
  #
@@ -33,7 +33,7 @@ index 006616aa77..dfd1d3e36b 100644
    'if': 'CONFIG_SPICE' }
  
 diff --git a/ui/spice-core.c b/ui/spice-core.c
-index b20c25aee0..26baeb7846 100644
+index ea20e6153c..55a15fba8b 100644
 --- a/ui/spice-core.c
 +++ b/ui/spice-core.c
 @@ -548,6 +548,10 @@ static SpiceInfo *qmp_query_spice_real(Error **errp)
index df086e3140d436148680f867ee38e27e1e6d796c..875fe263a6ed6674db5beb95a3a62a9ec9538c45 100644 (file)
@@ -16,19 +16,19 @@ Additionally, allows tracking the current position from the outside
 Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 ---
- migration/channel-savevm-async.c | 183 +++++++++++++++++++++++++++++++
+ migration/channel-savevm-async.c | 184 +++++++++++++++++++++++++++++++
  migration/channel-savevm-async.h |  51 +++++++++
  migration/meson.build            |   1 +
- 3 files changed, 235 insertions(+)
+ 3 files changed, 236 insertions(+)
  create mode 100644 migration/channel-savevm-async.c
  create mode 100644 migration/channel-savevm-async.h
 
 diff --git a/migration/channel-savevm-async.c b/migration/channel-savevm-async.c
 new file mode 100644
-index 0000000000..aab081ce07
+index 0000000000..081a192f49
 --- /dev/null
 +++ b/migration/channel-savevm-async.c
-@@ -0,0 +1,183 @@
+@@ -0,0 +1,184 @@
 +/*
 + * QIO Channel implementation to be used by savevm-async QMP calls
 + */
@@ -175,8 +175,9 @@ index 0000000000..aab081ce07
 +
 +static void
 +qio_channel_savevm_async_set_aio_fd_handler(QIOChannel *ioc,
-+                                            AioContext *ctx,
++                                            AioContext *read_ctx,
 +                                            IOHandler *io_read,
++                                            AioContext *write_ctx,
 +                                            IOHandler *io_write,
 +                                            void *opaque)
 +{
@@ -270,7 +271,7 @@ index 0000000000..17ae2cb261
 +
 +#endif /* QIO_CHANNEL_SAVEVM_ASYNC_H */
 diff --git a/migration/meson.build b/migration/meson.build
-index 1ae28523a1..37ddcb5d60 100644
+index 1eeb915ff6..95d1cf2250 100644
 --- a/migration/meson.build
 +++ b/migration/meson.build
 @@ -13,6 +13,7 @@ system_ss.add(files(
index df12d851ba0d5d9bdbb0d2172a4d144fe3cf430c..f97226bd5a478ade10651ebdf3c9f9fc4893b7e0 100644 (file)
@@ -27,7 +27,7 @@ Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
 [FE: further improve aborting
      adapt to removal of QEMUFileOps
      improve condition for entering final stage
-     adapt to QAPI and other changes for 8.0]
+     adapt to QAPI and other changes for 8.2]
 Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
 ---
  hmp-commands-info.hx         |  13 +
@@ -38,14 +38,14 @@ Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
  migration/savevm-async.c     | 531 +++++++++++++++++++++++++++++++++++
  monitor/hmp-cmds.c           |  38 +++
  qapi/migration.json          |  34 +++
- qapi/misc.json               |  16 ++
+ qapi/misc.json               |  18 ++
  qemu-options.hx              |  12 +
- softmmu/vl.c                 |  10 +
- 11 files changed, 677 insertions(+)
+ system/vl.c                  |  10 +
+ 11 files changed, 679 insertions(+)
  create mode 100644 migration/savevm-async.c
 
 diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
-index f5b37eb74a..10fdd822e0 100644
+index ad1b1306e3..d5ab880492 100644
 --- a/hmp-commands-info.hx
 +++ b/hmp-commands-info.hx
 @@ -525,6 +525,19 @@ SRST
@@ -69,10 +69,10 @@ index f5b37eb74a..10fdd822e0 100644
          .name       = "balloon",
          .args_type  = "",
 diff --git a/hmp-commands.hx b/hmp-commands.hx
-index 2cbd0f77a0..e352f86872 100644
+index 2e2a3bcf98..7506de251c 100644
 --- a/hmp-commands.hx
 +++ b/hmp-commands.hx
-@@ -1865,3 +1865,20 @@ SRST
+@@ -1862,3 +1862,20 @@ SRST
    List event channels in the guest
  ERST
  #endif
@@ -94,12 +94,12 @@ index 2cbd0f77a0..e352f86872 100644
 +        .coroutine  = true,
 +    },
 diff --git a/include/migration/snapshot.h b/include/migration/snapshot.h
-index e72083b117..c846d37806 100644
+index 9e4dcaaa75..2581730d74 100644
 --- a/include/migration/snapshot.h
 +++ b/include/migration/snapshot.h
-@@ -61,4 +61,6 @@ bool delete_snapshot(const char *name,
-                     bool has_devices, strList *devices,
                    Error **errp);
+@@ -68,4 +68,6 @@ bool delete_snapshot(const char *name,
+  */
void load_snapshot_resume(RunState state);
  
 +int load_snapshot_from_blockdev(const char *filename, Error **errp);
 +
@@ -126,10 +126,10 @@ index 13f9a2dedb..7a7def7530 100644
  void coroutine_fn hmp_screendump(Monitor *mon, const QDict *qdict);
  void hmp_chardev_add(Monitor *mon, const QDict *qdict);
 diff --git a/migration/meson.build b/migration/meson.build
-index 37ddcb5d60..07f6057acc 100644
+index 95d1cf2250..800f12a60d 100644
 --- a/migration/meson.build
 +++ b/migration/meson.build
-@@ -26,6 +26,7 @@ system_ss.add(files(
+@@ -28,6 +28,7 @@ system_ss.add(files(
    'options.c',
    'postcopy-ram.c',
    'savevm.c',
@@ -139,7 +139,7 @@ index 37ddcb5d60..07f6057acc 100644
    'threadinfo.c',
 diff --git a/migration/savevm-async.c b/migration/savevm-async.c
 new file mode 100644
-index 0000000000..e9fc18fb10
+index 0000000000..779e4e2a78
 --- /dev/null
 +++ b/migration/savevm-async.c
 @@ -0,0 +1,531 @@
@@ -300,7 +300,6 @@ index 0000000000..e9fc18fb10
 +static void process_savevm_finalize(void *opaque)
 +{
 +    int ret;
-+    AioContext *iohandler_ctx = iohandler_get_aio_context();
 +    MigrationState *ms = migrate_get_current();
 +
 +    bool aborted = savevm_aborted();
@@ -317,9 +316,7 @@ index 0000000000..e9fc18fb10
 +     * so move it back. It can stay in the main context and live out its life
 +     * there, since we're done with it after this method ends anyway.
 +     */
-+    aio_context_acquire(iohandler_ctx);
 +    blk_set_aio_context(snap_state.target, qemu_get_aio_context(), NULL);
-+    aio_context_release(iohandler_ctx);
 +
 +    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
 +    if (ret < 0) {
@@ -396,12 +393,12 @@ index 0000000000..e9fc18fb10
 +         * lock. Similar to what is done in migration.c, call the exact variant
 +         * only once pend_precopy in the estimate is below the threshold.
 +         */
-+        qemu_mutex_unlock_iothread();
++        bql_unlock();
 +        qemu_savevm_state_pending_estimate(&pend_precopy, &pend_postcopy);
 +        if (pend_precopy <= threshold) {
 +            qemu_savevm_state_pending_exact(&pend_precopy, &pend_postcopy);
 +        }
-+        qemu_mutex_lock_iothread();
++        bql_lock();
 +        pending_size = pend_precopy + pend_postcopy;
 +
 +        /*
@@ -441,21 +438,25 @@ index 0000000000..e9fc18fb10
 +     * so move there now and after every flush.
 +     */
 +    aio_co_reschedule_self(qemu_get_aio_context());
-+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
++    bdrv_graph_co_rdlock();
++    bs = bdrv_first(&it);
++    bdrv_graph_co_rdunlock();
++    while (bs) {
 +        /* target has BDRV_O_NO_FLUSH, no sense calling bdrv_flush on it */
-+        if (bs == blk_bs(snap_state.target)) {
-+            continue;
-+        }
-+
-+        AioContext *bs_ctx = bdrv_get_aio_context(bs);
-+        if (bs_ctx != qemu_get_aio_context()) {
-+            DPRINTF("savevm: async flushing drive %s\n", bs->filename);
-+            aio_co_reschedule_self(bs_ctx);
-+            bdrv_graph_co_rdlock();
-+            bdrv_flush(bs);
-+            bdrv_graph_co_rdunlock();
-+            aio_co_reschedule_self(qemu_get_aio_context());
++        if (bs != blk_bs(snap_state.target)) {
++            AioContext *bs_ctx = bdrv_get_aio_context(bs);
++            if (bs_ctx != qemu_get_aio_context()) {
++                DPRINTF("savevm: async flushing drive %s\n", bs->filename);
++                aio_co_reschedule_self(bs_ctx);
++                bdrv_graph_co_rdlock();
++                bdrv_flush(bs);
++                bdrv_graph_co_rdunlock();
++                aio_co_reschedule_self(qemu_get_aio_context());
++            }
 +        }
++        bdrv_graph_co_rdlock();
++        bs = bdrv_next(&it);
++        bdrv_graph_co_rdunlock();
 +    }
 +
 +    DPRINTF("timing: async flushing took %ld ms\n",
@@ -478,7 +479,7 @@ index 0000000000..e9fc18fb10
 +        return;
 +    }
 +
-+    if (migration_is_running(ms->state)) {
++    if (migration_is_running()) {
 +        error_set(errp, ERROR_CLASS_GENERIC_ERROR, QERR_MIGRATION_ACTIVE);
 +        return;
 +    }
@@ -535,9 +536,10 @@ index 0000000000..e9fc18fb10
 +     * State is cleared in process_savevm_co, but has to be initialized
 +     * here (blocking main thread, from QMP) to avoid race conditions.
 +     */
-+    migrate_init(ms);
++    if (migrate_init(ms, errp)) {
++        return;
++    }
 +    memset(&mig_stats, 0, sizeof(mig_stats));
-+    memset(&compression_counters, 0, sizeof(compression_counters));
 +    ms->to_dst_file = snap_state.file;
 +
 +    error_setg(&snap_state.blocker, "block device is in use by savevm");
@@ -546,10 +548,8 @@ index 0000000000..e9fc18fb10
 +    snap_state.state = SAVE_STATE_ACTIVE;
 +    snap_state.finalize_bh = qemu_bh_new(process_savevm_finalize, &snap_state);
 +    snap_state.co = qemu_coroutine_create(&process_savevm_co, NULL);
-+    qemu_mutex_unlock_iothread();
 +    qemu_savevm_state_header(snap_state.file);
 +    qemu_savevm_state_setup(snap_state.file);
-+    qemu_mutex_lock_iothread();
 +
 +    /* Async processing from here on out happens in iohandler context, so let
 +     * the target bdrv have its home there.
@@ -675,7 +675,7 @@ index 0000000000..e9fc18fb10
 +    return ret;
 +}
 diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
-index 6c559b48c8..91be698308 100644
+index 871898ac46..ef4634e5c1 100644
 --- a/monitor/hmp-cmds.c
 +++ b/monitor/hmp-cmds.c
 @@ -22,6 +22,7 @@
@@ -685,7 +685,7 @@ index 6c559b48c8..91be698308 100644
 +#include "qapi/qapi-commands-migration.h"
  #include "qapi/qapi-commands-misc.h"
  #include "qapi/qmp/qdict.h"
- #include "qapi/qmp/qerror.h"
+ #include "qemu/cutils.h"
 @@ -443,3 +444,40 @@ void hmp_info_mtree(Monitor *mon, const QDict *qdict)
  
      mtree_info(flatview, dispatch_tree, owner, disabled);
@@ -728,10 +728,10 @@ index 6c559b48c8..91be698308 100644
 +    }
 +}
 diff --git a/qapi/migration.json b/qapi/migration.json
-index 8843e74b59..aca0ca1ac1 100644
+index 8c65b90328..ed20d066cd 100644
 --- a/qapi/migration.json
 +++ b/qapi/migration.json
-@@ -291,6 +291,40 @@
+@@ -297,6 +297,40 @@
             '*dirty-limit-throttle-time-per-round': 'uint64',
             '*dirty-limit-ring-full-time': 'uint64'} }
  
@@ -773,10 +773,10 @@ index 8843e74b59..aca0ca1ac1 100644
  # @query-migrate:
  #
 diff --git a/qapi/misc.json b/qapi/misc.json
-index cda2effa81..94a58bb0bf 100644
+index ec30e5c570..7147199a12 100644
 --- a/qapi/misc.json
 +++ b/qapi/misc.json
-@@ -456,6 +456,22 @@
+@@ -454,6 +454,24 @@
  ##
  { 'command': 'query-fdsets', 'returns': ['FdsetInfo'] }
  
@@ -785,6 +785,8 @@ index cda2effa81..94a58bb0bf 100644
 +#
 +# Prepare for snapshot and halt VM. Save VM state to statefile.
 +#
++# @statefile: target file that state should be written to.
++#
 +##
 +{ 'command': 'savevm-start', 'data': { '*statefile': 'str' } }
 +
@@ -800,10 +802,10 @@ index cda2effa81..94a58bb0bf 100644
  # @CommandLineParameterType:
  #
 diff --git a/qemu-options.hx b/qemu-options.hx
-index 8073f5edf5..dc1ececc9c 100644
+index 8ce85d4559..511ab9415e 100644
 --- a/qemu-options.hx
 +++ b/qemu-options.hx
-@@ -4483,6 +4483,18 @@ SRST
+@@ -4610,6 +4610,18 @@ SRST
      Start right away with a saved state (``loadvm`` in monitor)
  ERST
  
@@ -822,11 +824,11 @@ index 8073f5edf5..dc1ececc9c 100644
  #ifndef _WIN32
  DEF("daemonize", 0, QEMU_OPTION_daemonize, \
      "-daemonize      daemonize QEMU after initializing\n", QEMU_ARCH_ALL)
-diff --git a/softmmu/vl.c b/softmmu/vl.c
-index ba6ad8a8df..ddeace306e 100644
---- a/softmmu/vl.c
-+++ b/softmmu/vl.c
-@@ -164,6 +164,7 @@ static const char *accelerators;
+diff --git a/system/vl.c b/system/vl.c
+index c644222982..2738ab7c91 100644
+--- a/system/vl.c
++++ b/system/vl.c
+@@ -163,6 +163,7 @@ static const char *accelerators;
  static bool have_custom_ram_size;
  static const char *ram_memdev_id;
  static QDict *machine_opts_dict;
@@ -834,10 +836,10 @@ index ba6ad8a8df..ddeace306e 100644
  static QTAILQ_HEAD(, ObjectOption) object_opts = QTAILQ_HEAD_INITIALIZER(object_opts);
  static QTAILQ_HEAD(, DeviceOption) device_opts = QTAILQ_HEAD_INITIALIZER(device_opts);
  static int display_remote;
-@@ -2647,6 +2648,12 @@ void qmp_x_exit_preconfig(Error **errp)
-     if (loadvm) {
+@@ -2712,6 +2713,12 @@ void qmp_x_exit_preconfig(Error **errp)
+         RunState state = autostart ? RUN_STATE_RUNNING : runstate_get();
          load_snapshot(loadvm, NULL, false, NULL, &error_fatal);
+         load_snapshot_resume(state);
 +    } else if (loadstate) {
 +        Error *local_err = NULL;
 +        if (load_snapshot_from_blockdev(loadstate, &local_err) < 0) {
@@ -847,7 +849,7 @@ index ba6ad8a8df..ddeace306e 100644
      }
      if (replay_mode != REPLAY_MODE_NONE) {
          replay_vmstate_init();
-@@ -3196,6 +3203,9 @@ void qemu_init(int argc, char **argv)
+@@ -3259,6 +3266,9 @@ void qemu_init(int argc, char **argv)
              case QEMU_OPTION_loadvm:
                  loadvm = optarg;
                  break;
index c946137e82c9df3e2a9a82384844131cd4d28fae..1c0e9ef2ddc6ccb54b3fd990f2ef7155157d3b00 100644 (file)
@@ -13,18 +13,18 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 [FE: adapt to removal of QEMUFileOps]
 Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
 ---
- migration/qemu-file.c    | 49 +++++++++++++++++++++++++++-------------
+ migration/qemu-file.c    | 50 +++++++++++++++++++++++++++-------------
  migration/qemu-file.h    |  2 ++
  migration/savevm-async.c |  5 ++--
- 3 files changed, 38 insertions(+), 18 deletions(-)
+ 3 files changed, 39 insertions(+), 18 deletions(-)
 
 diff --git a/migration/qemu-file.c b/migration/qemu-file.c
-index 19c33c9985..e9ffff0f0a 100644
+index a10882d47f..19c1de0472 100644
 --- a/migration/qemu-file.c
 +++ b/migration/qemu-file.c
-@@ -33,8 +33,8 @@
- #include "options.h"
- #include "qapi/error.h"
+@@ -35,8 +35,8 @@
+ #include "rdma.h"
+ #include "io/channel-file.h"
  
 -#define IO_BUF_SIZE 32768
 -#define MAX_IOV_SIZE MIN_CONST(IOV_MAX, 64)
@@ -32,8 +32,8 @@ index 19c33c9985..e9ffff0f0a 100644
 +#define MAX_IOV_SIZE MIN_CONST(IOV_MAX, 256)
  
  struct QEMUFile {
-     const QEMUFileHooks *hooks;
-@@ -46,7 +46,8 @@ struct QEMUFile {
+     QIOChannel *ioc;
+@@ -44,7 +44,8 @@ struct QEMUFile {
  
      int buf_index;
      int buf_size; /* 0 when writing */
@@ -43,7 +43,7 @@ index 19c33c9985..e9ffff0f0a 100644
  
      DECLARE_BITMAP(may_free, MAX_IOV_SIZE);
      struct iovec iov[MAX_IOV_SIZE];
-@@ -100,7 +101,9 @@ int qemu_file_shutdown(QEMUFile *f)
+@@ -101,7 +102,9 @@ int qemu_file_shutdown(QEMUFile *f)
      return 0;
  }
  
@@ -54,7 +54,7 @@ index 19c33c9985..e9ffff0f0a 100644
  {
      QEMUFile *f;
  
-@@ -109,6 +112,8 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable)
+@@ -110,6 +113,8 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable)
      object_ref(ioc);
      f->ioc = ioc;
      f->is_writable = is_writable;
@@ -63,7 +63,7 @@ index 19c33c9985..e9ffff0f0a 100644
  
      return f;
  }
-@@ -119,17 +124,27 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable)
+@@ -120,17 +125,27 @@ static QEMUFile *qemu_file_new_impl(QIOChannel *ioc, bool is_writable)
   */
  QEMUFile *qemu_file_get_return_path(QEMUFile *f)
  {
@@ -93,8 +93,8 @@ index 19c33c9985..e9ffff0f0a 100644
 +    return qemu_file_new_impl(ioc, false, buffer_size);
  }
  
- void qemu_file_set_hooks(QEMUFile *f, const QEMUFileHooks *hooks)
-@@ -375,7 +390,7 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f)
+ /*
+@@ -328,7 +343,7 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile *f)
      do {
          len = qio_channel_read(f->ioc,
                                 (char *)f->buf + pending,
@@ -103,16 +103,17 @@ index 19c33c9985..e9ffff0f0a 100644
                                 &local_error);
          if (len == QIO_CHANNEL_ERR_BLOCK) {
              if (qemu_in_coroutine()) {
-@@ -425,6 +440,8 @@ int qemu_fclose(QEMUFile *f)
+@@ -368,6 +383,9 @@ int qemu_fclose(QEMUFile *f)
+         ret = ret2;
      }
      g_clear_pointer(&f->ioc, object_unref);
++
 +    free(f->buf);
 +
-     /* If any error was spotted before closing, we should report it
-      * instead of the close() return value.
-      */
-@@ -479,7 +496,7 @@ static void add_buf_to_iovec(QEMUFile *f, size_t len)
+     error_free(f->last_error_obj);
+     g_free(f);
+     trace_qemu_file_fclose();
+@@ -416,7 +434,7 @@ static void add_buf_to_iovec(QEMUFile *f, size_t len)
  {
      if (!add_to_iovec(f, f->buf + f->buf_index, len, false)) {
          f->buf_index += len;
@@ -121,7 +122,7 @@ index 19c33c9985..e9ffff0f0a 100644
              qemu_fflush(f);
          }
      }
-@@ -504,7 +521,7 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
+@@ -441,7 +459,7 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
      }
  
      while (size > 0) {
@@ -130,7 +131,7 @@ index 19c33c9985..e9ffff0f0a 100644
          if (l > size) {
              l = size;
          }
-@@ -549,8 +566,8 @@ size_t coroutine_mixed_fn qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t si
+@@ -587,8 +605,8 @@ size_t coroutine_mixed_fn qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t si
      size_t index;
  
      assert(!qemu_file_is_writable(f));
@@ -141,7 +142,7 @@ index 19c33c9985..e9ffff0f0a 100644
  
      /* The 1st byte to read from */
      index = f->buf_index + offset;
-@@ -600,7 +617,7 @@ size_t coroutine_mixed_fn qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size
+@@ -638,7 +656,7 @@ size_t coroutine_mixed_fn qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size
          size_t res;
          uint8_t *src;
  
@@ -150,7 +151,7 @@ index 19c33c9985..e9ffff0f0a 100644
          if (res == 0) {
              return done;
          }
-@@ -634,7 +651,7 @@ size_t coroutine_mixed_fn qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size
+@@ -672,7 +690,7 @@ size_t coroutine_mixed_fn qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size
   */
  size_t coroutine_mixed_fn qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size)
  {
@@ -159,7 +160,7 @@ index 19c33c9985..e9ffff0f0a 100644
          size_t res;
          uint8_t *src = NULL;
  
-@@ -659,7 +676,7 @@ int coroutine_mixed_fn qemu_peek_byte(QEMUFile *f, int offset)
+@@ -697,7 +715,7 @@ int coroutine_mixed_fn qemu_peek_byte(QEMUFile *f, int offset)
      int index = f->buf_index + offset;
  
      assert(!qemu_file_is_writable(f));
@@ -168,7 +169,7 @@ index 19c33c9985..e9ffff0f0a 100644
  
      if (index >= f->buf_size) {
          qemu_fill_buffer(f);
-@@ -777,7 +794,7 @@ static int qemu_compress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
+@@ -811,7 +829,7 @@ static int qemu_compress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
  ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream,
                                    const uint8_t *p, size_t size)
  {
@@ -178,24 +179,24 @@ index 19c33c9985..e9ffff0f0a 100644
      if (blen < compressBound(size)) {
          return -1;
 diff --git a/migration/qemu-file.h b/migration/qemu-file.h
-index 47015f5201..1312b7c903 100644
+index 32fd4a34fd..36a0cd8cc8 100644
 --- a/migration/qemu-file.h
 +++ b/migration/qemu-file.h
-@@ -63,7 +63,9 @@ typedef struct QEMUFileHooks {
- } QEMUFileHooks;
+@@ -30,7 +30,9 @@
+ #include "io/channel.h"
  
  QEMUFile *qemu_file_new_input(QIOChannel *ioc);
 +QEMUFile *qemu_file_new_input_sized(QIOChannel *ioc, size_t buffer_size);
  QEMUFile *qemu_file_new_output(QIOChannel *ioc);
 +QEMUFile *qemu_file_new_output_sized(QIOChannel *ioc, size_t buffer_size);
- void qemu_file_set_hooks(QEMUFile *f, const QEMUFileHooks *hooks);
  int qemu_fclose(QEMUFile *f);
  
+ /*
 diff --git a/migration/savevm-async.c b/migration/savevm-async.c
-index e9fc18fb10..80624fada8 100644
+index 779e4e2a78..bf36fc06d2 100644
 --- a/migration/savevm-async.c
 +++ b/migration/savevm-async.c
-@@ -378,7 +378,7 @@ void qmp_savevm_start(const char *statefile, Error **errp)
+@@ -379,7 +379,7 @@ void qmp_savevm_start(const char *statefile, Error **errp)
  
      QIOChannel *ioc = QIO_CHANNEL(qio_channel_savevm_async_new(snap_state.target,
                                                                 &snap_state.bs_pos));
index 1cb816632500186565967a97881474743d74d73d..34a7efe3fdfdb2a9e2f60f24c5f1273ce4316353 100644 (file)
@@ -4,21 +4,22 @@ Date: Mon, 6 Apr 2020 12:16:47 +0200
 Subject: [PATCH] PVE: block: add the zeroinit block driver filter
 
 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
-[FE: adapt to changed function signatures]
+[FE: adapt to changed function signatures
+     adhere to block graph lock requirements]
 Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
 ---
  block/meson.build |   1 +
- block/zeroinit.c  | 200 ++++++++++++++++++++++++++++++++++++++++++++++
- 2 files changed, 201 insertions(+)
+ block/zeroinit.c  | 214 ++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 215 insertions(+)
  create mode 100644 block/zeroinit.c
 
 diff --git a/block/meson.build b/block/meson.build
-index 529fc172c6..1833c71ce9 100644
+index e1f03fd773..b530e117b5 100644
 --- a/block/meson.build
 +++ b/block/meson.build
-@@ -40,6 +40,7 @@ block_ss.add(files(
-   'throttle-groups.c',
+@@ -39,6 +39,7 @@ block_ss.add(files(
    'throttle.c',
+   'throttle-groups.c',
    'write-threshold.c',
 +  'zeroinit.c',
  ), zstd, zlib, gnutls)
@@ -26,10 +27,10 @@ index 529fc172c6..1833c71ce9 100644
  system_ss.add(when: 'CONFIG_TCG', if_true: files('blkreplay.c'))
 diff --git a/block/zeroinit.c b/block/zeroinit.c
 new file mode 100644
-index 0000000000..1257342724
+index 0000000000..696558d8d6
 --- /dev/null
 +++ b/block/zeroinit.c
-@@ -0,0 +1,200 @@
+@@ -0,0 +1,214 @@
 +/*
 + * Filter to fake a zero-initialized block device.
 + *
@@ -44,6 +45,7 @@ index 0000000000..1257342724
 +#include "qapi/error.h"
 +#include "block/block_int.h"
 +#include "block/block-io.h"
++#include "block/graph-lock.h"
 +#include "qapi/qmp/qdict.h"
 +#include "qapi/qmp/qstring.h"
 +#include "qemu/cutils.h"
@@ -94,6 +96,7 @@ index 0000000000..1257342724
 +                          Error **errp)
 +{
 +    BDRVZeroinitState *s = bs->opaque;
++    BdrvChild *file = NULL;
 +    QemuOpts *opts;
 +    Error *local_err = NULL;
 +    int ret;
@@ -109,10 +112,13 @@ index 0000000000..1257342724
 +    }
 +
 +    /* Open the raw file */
-+    bs->file = bdrv_open_child(qemu_opt_get(opts, "x-next"), options, "next",
-+                               bs, &child_of_bds,
-+                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
-+                               false, &local_err);
++    file = bdrv_open_child(qemu_opt_get(opts, "x-next"), options, "next", bs,
++                           &child_of_bds,
++                           BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, false,
++                           &local_err);
++    bdrv_graph_wrlock();
++    bs->file = file;
++    bdrv_graph_wrunlock();
 +    if (local_err) {
 +        ret = -EINVAL;
 +        error_propagate(errp, local_err);
@@ -125,7 +131,9 @@ index 0000000000..1257342724
 +    ret = 0;
 +fail:
 +    if (ret < 0) {
++        bdrv_graph_wrlock();
 +        bdrv_unref_child(bs, bs->file);
++        bdrv_graph_wrunlock();
 +    }
 +    qemu_opts_del(opts);
 +    return ret;
@@ -137,19 +145,22 @@ index 0000000000..1257342724
 +    (void)s;
 +}
 +
-+static coroutine_fn int64_t zeroinit_co_getlength(BlockDriverState *bs)
++static coroutine_fn int64_t GRAPH_RDLOCK
++zeroinit_co_getlength(BlockDriverState *bs)
 +{
 +    return bdrv_co_getlength(bs->file->bs);
 +}
 +
-+static int coroutine_fn zeroinit_co_preadv(BlockDriverState *bs,
-+    int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
++static int coroutine_fn GRAPH_RDLOCK
++zeroinit_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
++                   QEMUIOVector *qiov, BdrvRequestFlags flags)
 +{
 +    return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
 +}
 +
-+static int coroutine_fn zeroinit_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
-+                                                 int64_t bytes, BdrvRequestFlags flags)
++static int coroutine_fn GRAPH_RDLOCK
++zeroinit_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
++                          BdrvRequestFlags flags)
 +{
 +    BDRVZeroinitState *s = bs->opaque;
 +    if (offset >= s->extents)
@@ -157,8 +168,9 @@ index 0000000000..1257342724
 +    return bdrv_pwrite_zeroes(bs->file, offset, bytes, flags);
 +}
 +
-+static int coroutine_fn zeroinit_co_pwritev(BlockDriverState *bs,
-+    int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
++static int coroutine_fn GRAPH_RDLOCK
++zeroinit_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
++                    QEMUIOVector *qiov, BdrvRequestFlags flags)
 +{
 +    BDRVZeroinitState *s = bs->opaque;
 +    int64_t extents = offset + bytes;
@@ -167,32 +179,35 @@ index 0000000000..1257342724
 +    return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
 +}
 +
-+static coroutine_fn int zeroinit_co_flush(BlockDriverState *bs)
++static coroutine_fn int GRAPH_RDLOCK
++zeroinit_co_flush(BlockDriverState *bs)
 +{
 +    return bdrv_co_flush(bs->file->bs);
 +}
 +
-+static int zeroinit_has_zero_init(BlockDriverState *bs)
++static int GRAPH_RDLOCK
++zeroinit_has_zero_init(BlockDriverState *bs)
 +{
 +    BDRVZeroinitState *s = bs->opaque;
 +    return s->has_zero_init;
 +}
 +
-+static int coroutine_fn zeroinit_co_pdiscard(BlockDriverState *bs,
-+                                             int64_t offset, int64_t bytes)
++static int coroutine_fn GRAPH_RDLOCK
++zeroinit_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
 +{
 +    return bdrv_co_pdiscard(bs->file, offset, bytes);
 +}
 +
-+static int zeroinit_co_truncate(BlockDriverState *bs, int64_t offset,
-+                                _Bool exact, PreallocMode prealloc,
-+                                BdrvRequestFlags req_flags, Error **errp)
++static int GRAPH_RDLOCK
++zeroinit_co_truncate(BlockDriverState *bs, int64_t offset, _Bool exact,
++                     PreallocMode prealloc, BdrvRequestFlags req_flags,
++                     Error **errp)
 +{
 +    return bdrv_co_truncate(bs->file, offset, exact, prealloc, req_flags, errp);
 +}
 +
-+static coroutine_fn int zeroinit_co_get_info(BlockDriverState *bs,
-+                                             BlockDriverInfo *bdi)
++static coroutine_fn int GRAPH_RDLOCK
++zeroinit_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 +{
 +    return bdrv_co_get_info(bs->file->bs, bdi);
 +}
index 17f5de1352d7e05fbd25f1bf5bca388cda6bfb84..bc472b0447f29cb37dfbc7ca4db40bc92a937da4 100644 (file)
@@ -10,14 +10,14 @@ Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 ---
  qemu-options.hx | 3 +++
- softmmu/vl.c    | 8 ++++++++
+ system/vl.c     | 8 ++++++++
  2 files changed, 11 insertions(+)
 
 diff --git a/qemu-options.hx b/qemu-options.hx
-index dc1ececc9c..848d2dfdd1 100644
+index 511ab9415e..92e301d545 100644
 --- a/qemu-options.hx
 +++ b/qemu-options.hx
-@@ -1197,6 +1197,9 @@ legacy PC, they are not recommended for modern configurations.
+@@ -1237,6 +1237,9 @@ legacy PC, they are not recommended for modern configurations.
  
  ERST
  
@@ -27,11 +27,11 @@ index dc1ececc9c..848d2dfdd1 100644
  DEF("fda", HAS_ARG, QEMU_OPTION_fda,
      "-fda/-fdb file  use 'file' as floppy disk 0/1 image\n", QEMU_ARCH_ALL)
  DEF("fdb", HAS_ARG, QEMU_OPTION_fdb, "", QEMU_ARCH_ALL)
-diff --git a/softmmu/vl.c b/softmmu/vl.c
-index ddeace306e..3ee90b3b94 100644
---- a/softmmu/vl.c
-+++ b/softmmu/vl.c
-@@ -2683,6 +2683,7 @@ void qemu_init(int argc, char **argv)
+diff --git a/system/vl.c b/system/vl.c
+index 2738ab7c91..20ebf2c920 100644
+--- a/system/vl.c
++++ b/system/vl.c
+@@ -2748,6 +2748,7 @@ void qemu_init(int argc, char **argv)
      MachineClass *machine_class;
      bool userconfig = true;
      FILE *vmstate_dump_file = NULL;
@@ -39,7 +39,7 @@ index ddeace306e..3ee90b3b94 100644
  
      qemu_add_opts(&qemu_drive_opts);
      qemu_add_drive_opts(&qemu_legacy_drive_opts);
-@@ -3308,6 +3309,13 @@ void qemu_init(int argc, char **argv)
+@@ -3371,6 +3372,13 @@ void qemu_init(int argc, char **argv)
                  machine_parse_property_opt(qemu_find_opts("smp-opts"),
                                             "smp", optarg);
                  break;
@@ -50,6 +50,6 @@ index ddeace306e..3ee90b3b94 100644
 +                    exit(1);
 +                }
 +                break;
+ #ifdef CONFIG_VNC
              case QEMU_OPTION_vnc:
                  vnc_parse(optarg);
-                 break;
index 3c28401f32f3e8e4792605bb20e9d0cec1a26cdb..9845cf2e287599d982b4a752057c20473c21a9a5 100644 (file)
@@ -11,10 +11,10 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 9 insertions(+)
 
 diff --git a/hw/intc/apic_common.c b/hw/intc/apic_common.c
-index 4a34f03047..59b917e50c 100644
+index d8fc1e2815..789694b8b3 100644
 --- a/hw/intc/apic_common.c
 +++ b/hw/intc/apic_common.c
-@@ -252,6 +252,15 @@ static void apic_reset_common(DeviceState *dev)
+@@ -263,6 +263,15 @@ static void apic_reset_common(DeviceState *dev)
      info->vapic_base_update(s);
  
      apic_init_reset(dev);
index f48fe4f18bb88eef8fe19c0182c423cea1c42a74..9230b01b8b8f0690650ae9c53d4bcd36b9f0b71a 100644 (file)
@@ -9,14 +9,14 @@ Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 ---
  block/file-posix.c   | 59 ++++++++++++++++++++++++++++++--------------
- qapi/block-core.json |  3 ++-
- 2 files changed, 42 insertions(+), 20 deletions(-)
+ qapi/block-core.json |  7 +++++-
+ 2 files changed, 46 insertions(+), 20 deletions(-)
 
 diff --git a/block/file-posix.c b/block/file-posix.c
-index ca551baa42..8b3b83e9d4 100644
+index 43bc0bd520..60e98c87f1 100644
 --- a/block/file-posix.c
 +++ b/block/file-posix.c
-@@ -2873,6 +2873,7 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
+@@ -2876,6 +2876,7 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
      int fd;
      uint64_t perm, shared;
      int result = 0;
@@ -24,7 +24,7 @@ index ca551baa42..8b3b83e9d4 100644
  
      /* Validate options and set default values */
      assert(options->driver == BLOCKDEV_DRIVER_FILE);
-@@ -2913,19 +2914,22 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
+@@ -2916,19 +2917,22 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
      perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
      shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
  
@@ -59,7 +59,7 @@ index ca551baa42..8b3b83e9d4 100644
      }
  
      /* Clear the file by truncating it to 0 */
-@@ -2979,13 +2983,15 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
+@@ -2982,13 +2986,15 @@ raw_co_create(BlockdevCreateOptions *options, Error **errp)
      }
  
  out_unlock:
@@ -82,7 +82,7 @@ index ca551baa42..8b3b83e9d4 100644
      }
  
  out_close:
-@@ -3009,6 +3015,7 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
+@@ -3012,6 +3018,7 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
      PreallocMode prealloc;
      char *buf = NULL;
      Error *local_err = NULL;
@@ -90,7 +90,7 @@ index ca551baa42..8b3b83e9d4 100644
  
      /* Skip file: protocol prefix */
      strstart(filename, "file:", &filename);
-@@ -3031,6 +3038,18 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
+@@ -3034,6 +3041,18 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
          return -EINVAL;
      }
  
@@ -109,7 +109,7 @@ index ca551baa42..8b3b83e9d4 100644
      options = (BlockdevCreateOptions) {
          .driver     = BLOCKDEV_DRIVER_FILE,
          .u.file     = {
-@@ -3042,6 +3061,8 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
+@@ -3045,6 +3064,8 @@ raw_co_create_opts(BlockDriver *drv, const char *filename,
              .nocow              = nocow,
              .has_extent_size_hint = has_extent_size_hint,
              .extent_size_hint   = extent_size_hint,
@@ -119,10 +119,21 @@ index ca551baa42..8b3b83e9d4 100644
      };
      return raw_co_create(&options, errp);
 diff --git a/qapi/block-core.json b/qapi/block-core.json
-index a5cea82139..bb471c078d 100644
+index 45ab548dfe..f7c2b63c5d 100644
 --- a/qapi/block-core.json
 +++ b/qapi/block-core.json
-@@ -4880,7 +4880,8 @@
+@@ -4956,6 +4956,10 @@
+ # @extent-size-hint: Extent size hint to add to the image file; 0 for
+ #     not adding an extent size hint (default: 1 MB, since 5.1)
+ #
++# @locking: whether to enable file locking.  If set to 'auto', only
++#     enable when Open File Descriptor (OFD) locking API is available
++#     (default: auto).
++#
+ # Since: 2.12
+ ##
+ { 'struct': 'BlockdevCreateOptionsFile',
+@@ -4963,7 +4967,8 @@
              'size':                 'size',
              '*preallocation':       'PreallocMode',
              '*nocow':               'bool',
index 277fa3fa740668b84ad414fa1a1f2bcd214ec2cc..0b7c435f680e7bc7d646477af0d8df31743816b3 100644 (file)
@@ -26,10 +26,10 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 2 insertions(+), 1 deletion(-)
 
 diff --git a/hw/core/machine.c b/hw/core/machine.c
-index f0d35c6401..1427983543 100644
+index 37ede0e7d4..513e49bab1 100644
 --- a/hw/core/machine.c
 +++ b/hw/core/machine.c
-@@ -148,7 +148,8 @@ GlobalProperty hw_compat_4_0[] = {
+@@ -161,7 +161,8 @@ GlobalProperty hw_compat_4_0[] = {
      { "virtio-vga",     "edid", "false" },
      { "virtio-gpu-device", "edid", "false" },
      { "virtio-device", "use-started", "false" },
index 507a5e353faab74deaaa27d948d7c2d8d306b0c5..eb2730437c3fad6bde51ef7ff33cef80b3421526 100644 (file)
@@ -17,14 +17,14 @@ Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
  hw/core/machine-qmp-cmds.c |  5 +++++
  include/hw/boards.h        |  2 ++
  qapi/machine.json          |  4 +++-
- softmmu/vl.c               | 25 +++++++++++++++++++++++++
+ system/vl.c                | 25 +++++++++++++++++++++++++
  4 files changed, 35 insertions(+), 1 deletion(-)
 
 diff --git a/hw/core/machine-qmp-cmds.c b/hw/core/machine-qmp-cmds.c
-index 40821e2317..ee93ddd69a 100644
+index 314351cdff..628a3537c5 100644
 --- a/hw/core/machine-qmp-cmds.c
 +++ b/hw/core/machine-qmp-cmds.c
-@@ -95,6 +95,11 @@ MachineInfoList *qmp_query_machines(Error **errp)
+@@ -94,6 +94,11 @@ MachineInfoList *qmp_query_machines(Error **errp)
          if (strcmp(mc->name, MACHINE_GET_CLASS(current_machine)->name) == 0) {
              info->has_is_current = true;
              info->is_current = true;
@@ -37,10 +37,10 @@ index 40821e2317..ee93ddd69a 100644
  
          if (mc->default_cpu_type) {
 diff --git a/include/hw/boards.h b/include/hw/boards.h
-index ed83360198..f8b88cd86a 100644
+index 8b8f6d5c00..dd6d0a1447 100644
 --- a/include/hw/boards.h
 +++ b/include/hw/boards.h
-@@ -235,6 +235,8 @@ struct MachineClass {
+@@ -246,6 +246,8 @@ struct MachineClass {
      const char *desc;
      const char *deprecation_reason;
  
@@ -50,10 +50,10 @@ index ed83360198..f8b88cd86a 100644
      void (*reset)(MachineState *state, ShutdownCause reason);
      void (*wakeup)(MachineState *state);
 diff --git a/qapi/machine.json b/qapi/machine.json
-index fbb61f18e4..7da3c519ba 100644
+index a024d5b05d..1d69bffaa0 100644
 --- a/qapi/machine.json
 +++ b/qapi/machine.json
-@@ -161,6 +161,8 @@
+@@ -168,6 +168,8 @@
  #
  # @acpi: machine type supports ACPI (since 8.0)
  #
@@ -62,7 +62,7 @@ index fbb61f18e4..7da3c519ba 100644
  # Since: 1.2
  ##
  { 'struct': 'MachineInfo',
-@@ -168,7 +170,7 @@
+@@ -175,7 +177,7 @@
              '*is-default': 'bool', '*is-current': 'bool', 'cpu-max': 'int',
              'hotpluggable-cpus': 'bool',  'numa-mem-supported': 'bool',
              'deprecated': 'bool', '*default-cpu-type': 'str',
@@ -71,19 +71,19 @@ index fbb61f18e4..7da3c519ba 100644
  
  ##
  # @query-machines:
-diff --git a/softmmu/vl.c b/softmmu/vl.c
-index 3ee90b3b94..4b6d0b82fd 100644
---- a/softmmu/vl.c
-+++ b/softmmu/vl.c
-@@ -1597,6 +1597,7 @@ static const QEMUOption *lookup_opt(int argc, char **argv,
+diff --git a/system/vl.c b/system/vl.c
+index 20ebf2c920..4d39e32097 100644
+--- a/system/vl.c
++++ b/system/vl.c
+@@ -1659,6 +1659,7 @@ static const QEMUOption *lookup_opt(int argc, char **argv,
  static MachineClass *select_machine(QDict *qdict, Error **errp)
  {
-     const char *optarg = qdict_get_try_str(qdict, "type");
+     const char *machine_type = qdict_get_try_str(qdict, "type");
 +    const char *pvever = qdict_get_try_str(qdict, "pvever");
      GSList *machines = object_class_get_list(TYPE_MACHINE, false);
      MachineClass *machine_class;
      Error *local_err = NULL;
-@@ -1614,6 +1615,11 @@ static MachineClass *select_machine(QDict *qdict, Error **errp)
+@@ -1676,6 +1677,11 @@ static MachineClass *select_machine(QDict *qdict, Error **errp)
          }
      }
  
@@ -95,7 +95,7 @@ index 3ee90b3b94..4b6d0b82fd 100644
      g_slist_free(machines);
      if (local_err) {
          error_append_hint(&local_err, "Use -machine help to list supported machines\n");
-@@ -3250,12 +3256,31 @@ void qemu_init(int argc, char **argv)
+@@ -3313,12 +3319,31 @@ void qemu_init(int argc, char **argv)
              case QEMU_OPTION_machine:
                  {
                      bool help;
index ef3fb89a3850eeb7b9e61470eb7a067eacf00ce1..8bff4f37bb9787b6cb0fbb59997f86e7aa945efb 100644 (file)
@@ -25,7 +25,7 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 4 insertions(+), 4 deletions(-)
 
 diff --git a/block/backup.c b/block/backup.c
-index db3791f4d1..39410dcf8d 100644
+index ec29d6b810..270957c0cd 100644
 --- a/block/backup.c
 +++ b/block/backup.c
 @@ -237,8 +237,8 @@ static void backup_init_bcs_bitmap(BackupBlockJob *job)
@@ -48,9 +48,9 @@ index db3791f4d1..39410dcf8d 100644
      if (s->sync_mode == MIRROR_SYNC_MODE_TOP) {
          int64_t offset = 0;
          int64_t count;
-@@ -495,6 +493,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
-     block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
+@@ -501,6 +499,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
                         &error_abort);
+     bdrv_graph_wrunlock();
  
 +    backup_init_bcs_bitmap(job);
 +
index 1620a5655e876d07f8c81d6f536a395fa02f3f4e..ee40ab85c2f3c349588082407a22dc29dfe93d85 100644 (file)
@@ -15,21 +15,21 @@ Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
 ---
  block/meson.build |   2 +
  meson.build       |   5 +
- vma-reader.c      | 867 ++++++++++++++++++++++++++++++++++++++++++++
+ vma-reader.c      | 870 ++++++++++++++++++++++++++++++++++++++++++++
  vma-writer.c      | 818 +++++++++++++++++++++++++++++++++++++++++
- vma.c             | 900 ++++++++++++++++++++++++++++++++++++++++++++++
+ vma.c             | 901 ++++++++++++++++++++++++++++++++++++++++++++++
  vma.h             | 150 ++++++++
- 6 files changed, 2742 insertions(+)
+ 6 files changed, 2746 insertions(+)
  create mode 100644 vma-reader.c
  create mode 100644 vma-writer.c
  create mode 100644 vma.c
  create mode 100644 vma.h
 
 diff --git a/block/meson.build b/block/meson.build
-index 1833c71ce9..59b71ba9f3 100644
+index b530e117b5..b245daa98e 100644
 --- a/block/meson.build
 +++ b/block/meson.build
-@@ -43,6 +43,8 @@ block_ss.add(files(
+@@ -42,6 +42,8 @@ block_ss.add(files(
    'zeroinit.c',
  ), zstd, zlib, gnutls)
  
@@ -39,10 +39,10 @@ index 1833c71ce9..59b71ba9f3 100644
  system_ss.add(files('block-ram-registrar.c'))
  
 diff --git a/meson.build b/meson.build
-index a9c4f28247..cd95530d3b 100644
+index 91a0aa64c6..620cc594b2 100644
 --- a/meson.build
 +++ b/meson.build
-@@ -1778,6 +1778,8 @@ endif
+@@ -1922,6 +1922,8 @@ endif
  
  has_gettid = cc.has_function('gettid')
  
@@ -51,7 +51,7 @@ index a9c4f28247..cd95530d3b 100644
  # libselinux
  selinux = dependency('libselinux',
                       required: get_option('selinux'),
-@@ -3908,6 +3910,9 @@ if have_tools
+@@ -4023,6 +4025,9 @@ if have_tools
                 dependencies: [blockdev, qemuutil, gnutls, selinux],
                 install: true)
  
@@ -59,14 +59,14 @@ index a9c4f28247..cd95530d3b 100644
 +                   dependencies: [authz, block, crypto, io, qom], install: true)
 +
    subdir('storage-daemon')
-   subdir('contrib/rdmacm-mux')
-   subdir('contrib/elf2dmp')
+   foreach exe: [ 'qemu-img', 'qemu-io', 'qemu-nbd', 'qemu-storage-daemon']
 diff --git a/vma-reader.c b/vma-reader.c
 new file mode 100644
-index 0000000000..81a891c6b1
+index 0000000000..d0b6721812
 --- /dev/null
 +++ b/vma-reader.c
-@@ -0,0 +1,867 @@
+@@ -0,0 +1,870 @@
 +/*
 + * VMA: Virtual Machine Archive
 + *
@@ -88,6 +88,7 @@ index 0000000000..81a891c6b1
 +#include "qemu/ratelimit.h"
 +#include "vma.h"
 +#include "block/block.h"
++#include "block/graph-lock.h"
 +#include "sysemu/block-backend.h"
 +
 +static unsigned char zero_vma_block[VMA_BLOCK_SIZE];
@@ -600,8 +601,10 @@ index 0000000000..81a891c6b1
 +    } else {
 +        int res = blk_pwrite(target, sector_num * BDRV_SECTOR_SIZE, nb_sectors * BDRV_SECTOR_SIZE, buf, 0);
 +        if (res < 0) {
++            bdrv_graph_rdlock_main_loop();
 +            error_setg(errp, "blk_pwrite to %s failed (%d)",
 +                       bdrv_get_device_name(blk_bs(target)), res);
++            bdrv_graph_rdunlock_main_loop();
 +            return -1;
 +        }
 +    }
@@ -1760,10 +1763,10 @@ index 0000000000..126b296647
 +}
 diff --git a/vma.c b/vma.c
 new file mode 100644
-index 0000000000..347f6283ca
+index 0000000000..bb715e9061
 --- /dev/null
 +++ b/vma.c
-@@ -0,0 +1,900 @@
+@@ -0,0 +1,901 @@
 +/*
 + * VMA: Virtual Machine Archive
 + *
@@ -2076,17 +2079,17 @@ index 0000000000..347f6283ca
 +                        inbuf);
 +            }
 +
-+            RestoreMap *map = g_new0(RestoreMap, 1);
-+            map->devname = g_strdup(devname);
-+            map->path = g_strdup(path);
-+            map->format = format;
-+            map->throttling_bps = bps_value;
-+            map->throttling_group = group;
-+            map->cache = cache;
-+            map->write_zero = write_zero;
-+            map->skip = skip;
++            RestoreMap *restore_map = g_new0(RestoreMap, 1);
++            restore_map->devname = g_strdup(devname);
++            restore_map->path = g_strdup(path);
++            restore_map->format = format;
++            restore_map->throttling_bps = bps_value;
++            restore_map->throttling_group = group;
++            restore_map->cache = cache;
++            restore_map->write_zero = write_zero;
++            restore_map->skip = skip;
 +
-+            g_hash_table_insert(devmap, map->devname, map);
++            g_hash_table_insert(devmap, restore_map->devname, restore_map);
 +
 +        };
 +    }
@@ -2385,7 +2388,7 @@ index 0000000000..347f6283ca
 +
 +static int create_archive(int argc, char **argv)
 +{
-+    int i, c;
++    int c;
 +    int verbose = 0;
 +    const char *archivename;
 +    GList *backup_coroutines = NULL;
@@ -2543,6 +2546,7 @@ index 0000000000..347f6283ca
 +    vma_writer_get_status(vmaw, &vmastat);
 +
 +    if (verbose) {
++        int i;
 +        for (i = 0; i < 256; i++) {
 +            VmaStreamInfo *si = &vmastat.stream_info[i];
 +            if (si->size) {
index 1adb45d290940df1661f802ee5cfa3caf5129544..357f9d69d8d1b40147ec975c421fa9aa7f77e6f5 100644 (file)
@@ -199,7 +199,7 @@ index 0000000000..e46abf1070
 +    return bs;
 +}
 diff --git a/block/backup.c b/block/backup.c
-index 39410dcf8d..af87fa6aa9 100644
+index 270957c0cd..16d611c4ca 100644
 --- a/block/backup.c
 +++ b/block/backup.c
 @@ -29,28 +29,6 @@
@@ -231,7 +231,7 @@ index 39410dcf8d..af87fa6aa9 100644
  static const BlockJobDriver backup_job_driver;
  
  static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
-@@ -457,6 +435,14 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+@@ -461,6 +439,14 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
      }
  
      cluster_size = block_copy_cluster_size(bcs);
@@ -247,7 +247,7 @@ index 39410dcf8d..af87fa6aa9 100644
      if (perf->max_chunk && perf->max_chunk < cluster_size) {
          error_setg(errp, "Required max-chunk (%" PRIi64 ") is less than backup "
 diff --git a/block/meson.build b/block/meson.build
-index 59b71ba9f3..6fde9f7dcd 100644
+index b245daa98e..e99914eaa4 100644
 --- a/block/meson.build
 +++ b/block/meson.build
 @@ -4,6 +4,7 @@ block_ss.add(files(
@@ -255,11 +255,11 @@ index 59b71ba9f3..6fde9f7dcd 100644
    'amend.c',
    'backup.c',
 +  'backup-dump.c',
-   'copy-before-write.c',
    'blkdebug.c',
    'blklogwrites.c',
+   'blkverify.c',
 diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
-index 74195c3004..0a0339eee4 100644
+index 761276127e..b3e6697613 100644
 --- a/include/block/block_int-common.h
 +++ b/include/block/block_int-common.h
 @@ -26,6 +26,7 @@
@@ -312,10 +312,10 @@ index 74195c3004..0a0339eee4 100644
      BDRV_TRACKED_READ,
      BDRV_TRACKED_WRITE,
 diff --git a/job.c b/job.c
-index 72d57f0934..93e22d180b 100644
+index 660ce22c56..baf54c8d60 100644
 --- a/job.c
 +++ b/job.c
-@@ -330,7 +330,8 @@ static bool job_started_locked(Job *job)
+@@ -331,7 +331,8 @@ static bool job_started_locked(Job *job)
  }
  
  /* Called with job_mutex held. */
index fbf610e5107d549ef0ad1e7cfd528ab0a75a1444..db86c7f8822a78cd514d0b0edf7666e1a4ec3552 100644 (file)
@@ -11,10 +11,10 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  2 files changed, 46 insertions(+)
 
 diff --git a/include/qemu/job.h b/include/qemu/job.h
-index e502787dd8..963cf2bef5 100644
+index 2b873f2576..528cd6acb9 100644
 --- a/include/qemu/job.h
 +++ b/include/qemu/job.h
-@@ -381,6 +381,18 @@ void job_unlock(void);
+@@ -362,6 +362,18 @@ void job_unlock(void);
   */
  JobTxn *job_txn_new(void);
  
@@ -34,10 +34,10 @@ index e502787dd8..963cf2bef5 100644
   * Release a reference that was previously acquired with job_txn_add_job or
   * job_txn_new. If it's the last reference to the object, it will be freed.
 diff --git a/job.c b/job.c
-index 93e22d180b..2b31f1e14f 100644
+index baf54c8d60..3ac5e5cde2 100644
 --- a/job.c
 +++ b/job.c
-@@ -93,6 +93,8 @@ struct JobTxn {
+@@ -94,6 +94,8 @@ struct JobTxn {
  
      /* Reference count */
      int refcnt;
@@ -46,7 +46,7 @@ index 93e22d180b..2b31f1e14f 100644
  };
  
  void job_lock(void)
-@@ -118,6 +120,25 @@ JobTxn *job_txn_new(void)
+@@ -119,6 +121,25 @@ JobTxn *job_txn_new(void)
      return txn;
  }
  
@@ -72,7 +72,7 @@ index 93e22d180b..2b31f1e14f 100644
  /* Called with job_mutex held. */
  static void job_txn_ref_locked(JobTxn *txn)
  {
-@@ -1057,6 +1078,12 @@ static void job_completed_txn_success_locked(Job *job)
+@@ -1042,6 +1063,12 @@ static void job_completed_txn_success_locked(Job *job)
       */
      QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
          if (!job_is_completed_locked(other_job)) {
@@ -85,7 +85,7 @@ index 93e22d180b..2b31f1e14f 100644
              return;
          }
          assert(other_job->ret == 0);
-@@ -1268,6 +1295,13 @@ int job_finish_sync_locked(Job *job,
+@@ -1253,6 +1280,13 @@ int job_finish_sync_locked(Job *job,
          return -EBUSY;
      }
  
index 8bc528e33760fae9a29cf50bf863b0668f779cac..c2f2f9397b1a81c412fc170ecb83bc1612a74c67 100644 (file)
@@ -94,20 +94,20 @@ Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
  monitor/hmp-cmds.c             |   72 +++
  proxmox-backup-client.c        |  146 +++++
  proxmox-backup-client.h        |   60 ++
- pve-backup.c                   | 1089 ++++++++++++++++++++++++++++++++
- qapi/block-core.json           |  229 +++++++
+ pve-backup.c                   | 1098 ++++++++++++++++++++++++++++++++
+ qapi/block-core.json           |  233 +++++++
  qapi/common.json               |   14 +
  qapi/machine.json              |   16 +-
- 14 files changed, 1704 insertions(+), 14 deletions(-)
+ 14 files changed, 1717 insertions(+), 14 deletions(-)
  create mode 100644 proxmox-backup-client.c
  create mode 100644 proxmox-backup-client.h
  create mode 100644 pve-backup.c
 
 diff --git a/block/meson.build b/block/meson.build
-index 6fde9f7dcd..6d468f89e5 100644
+index e99914eaa4..6bba803f94 100644
 --- a/block/meson.build
 +++ b/block/meson.build
-@@ -45,6 +45,11 @@ block_ss.add(files(
+@@ -44,6 +44,11 @@ block_ss.add(files(
  ), zstd, zlib, gnutls)
  
  block_ss.add(files('../vma-writer.c'), libuuid)
@@ -120,10 +120,10 @@ index 6fde9f7dcd..6d468f89e5 100644
  system_ss.add(when: 'CONFIG_TCG', if_true: files('blkreplay.c'))
  system_ss.add(files('block-ram-registrar.c'))
 diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
-index ca2599de44..6efe28cef5 100644
+index d954bec6f1..5000c084c5 100644
 --- a/block/monitor/block-hmp-cmds.c
 +++ b/block/monitor/block-hmp-cmds.c
-@@ -1029,3 +1029,42 @@ void hmp_change_medium(Monitor *mon, const char *device, const char *target,
+@@ -1008,3 +1008,42 @@ void hmp_change_medium(Monitor *mon, const char *device, const char *target,
      qmp_blockdev_change_medium(device, NULL, target, arg, true, force,
                                 !!read_only, read_only_mode, errp);
  }
@@ -167,7 +167,7 @@ index ca2599de44..6efe28cef5 100644
 +    hmp_handle_error(mon, error);
 +}
 diff --git a/blockdev.c b/blockdev.c
-index cd5f205ad1..7793143d76 100644
+index d27d8c38ec..5e5dbc1da9 100644
 --- a/blockdev.c
 +++ b/blockdev.c
 @@ -37,6 +37,7 @@
@@ -179,7 +179,7 @@ index cd5f205ad1..7793143d76 100644
  #include "monitor/monitor.h"
  #include "qemu/error-report.h"
 diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
-index 10fdd822e0..15937793c1 100644
+index d5ab880492..6c97248d1b 100644
 --- a/hmp-commands-info.hx
 +++ b/hmp-commands-info.hx
 @@ -471,6 +471,20 @@ SRST
@@ -204,7 +204,7 @@ index 10fdd822e0..15937793c1 100644
      {
          .name       = "usernet",
 diff --git a/hmp-commands.hx b/hmp-commands.hx
-index e352f86872..0c8b6725fb 100644
+index 7506de251c..d5f9c28194 100644
 --- a/hmp-commands.hx
 +++ b/hmp-commands.hx
 @@ -101,6 +101,35 @@ ERST
@@ -265,10 +265,10 @@ index 7a7def7530..cba7afe70c 100644
  void hmp_device_add(Monitor *mon, const QDict *qdict);
  void hmp_device_del(Monitor *mon, const QDict *qdict);
 diff --git a/meson.build b/meson.build
-index cd95530d3b..d53976d621 100644
+index 620cc594b2..d16b97cf3c 100644
 --- a/meson.build
 +++ b/meson.build
-@@ -1779,6 +1779,7 @@ endif
+@@ -1923,6 +1923,7 @@ endif
  has_gettid = cc.has_function('gettid')
  
  libuuid = cc.find_library('uuid', required: true)
@@ -277,7 +277,7 @@ index cd95530d3b..d53976d621 100644
  # libselinux
  selinux = dependency('libselinux',
 diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
-index 91be698308..5b9c231a4c 100644
+index ef4634e5c1..6e25279f42 100644
 --- a/monitor/hmp-cmds.c
 +++ b/monitor/hmp-cmds.c
 @@ -21,6 +21,7 @@
@@ -586,10 +586,10 @@ index 0000000000..8cbf645b2c
 +#endif /* PROXMOX_BACKUP_CLIENT_H */
 diff --git a/pve-backup.c b/pve-backup.c
 new file mode 100644
-index 0000000000..ae3d137e12
+index 0000000000..9c13a92623
 --- /dev/null
 +++ b/pve-backup.c
-@@ -0,0 +1,1089 @@
+@@ -0,0 +1,1098 @@
 +#include "proxmox-backup-client.h"
 +#include "vma.h"
 +
@@ -600,6 +600,7 @@ index 0000000000..ae3d137e12
 +#include "block/block_int-global-state.h"
 +#include "block/blockjob.h"
 +#include "block/dirty-bitmap.h"
++#include "block/graph-lock.h"
 +#include "qapi/qapi-commands-block.h"
 +#include "qapi/qmp/qerror.h"
 +#include "qemu/cutils.h"
@@ -928,13 +929,6 @@ index 0000000000..ae3d137e12
 +        }
 +    }
 +
-+    if (di->job) {
-+        WITH_JOB_LOCK_GUARD() {
-+            job_unref_locked(&di->job->job);
-+            di->job = NULL;
-+        }
-+    }
-+
 +    // remove self from job list
 +    backup_state.di_list = g_list_remove(backup_state.di_list, di);
 +
@@ -954,6 +948,16 @@ index 0000000000..ae3d137e12
 +    di->completed_ret = ret;
 +
 +    /*
++     * Needs to happen outside of coroutine, because it takes the graph write lock.
++     */
++    if (di->job) {
++        WITH_JOB_LOCK_GUARD() {
++            job_unref_locked(&di->job->job);
++            di->job = NULL;
++        }
++    }
++
++    /*
 +     * Schedule stream cleanup in async coroutine. close_image and finish might
 +     * take a while, so we can't block on them here. This way it also doesn't
 +     * matter if we're already running in a coroutine or not.
@@ -1108,9 +1112,6 @@ index 0000000000..ae3d137e12
 +            sync_mode = MIRROR_SYNC_MODE_BITMAP;
 +            bitmap_mode = BITMAP_SYNC_MODE_ON_SUCCESS;
 +        }
-+        AioContext *aio_context = bdrv_get_aio_context(di->bs);
-+        aio_context_acquire(aio_context);
-+
 +        bdrv_drained_begin(di->bs);
 +
 +        BlockJob *job = backup_job_create(
@@ -1121,8 +1122,6 @@ index 0000000000..ae3d137e12
 +
 +        bdrv_drained_end(di->bs);
 +
-+        aio_context_release(aio_context);
-+
 +        di->job = job;
 +        if (job) {
 +            WITH_JOB_LOCK_GUARD() {
@@ -1178,7 +1177,7 @@ index 0000000000..ae3d137e12
 + * case of an error, errp will be set, but the returned value might still be a
 + * list.
 + */
-+static GList coroutine_fn *get_device_info(
++static GList coroutine_fn GRAPH_RDLOCK *get_device_info(
 +    const char *devlist,
 +    Error **errp)
 +{
@@ -1279,7 +1278,9 @@ index 0000000000..ae3d137e12
 +    /* Todo: try to auto-detect format based on file name */
 +    format = has_format ? format : BACKUP_FORMAT_VMA;
 +
++    bdrv_graph_co_rdlock();
 +    di_list = get_device_info(devlist, &local_err);
++    bdrv_graph_co_rdunlock();
 +    if (local_err) {
 +        error_propagate(errp, local_err);
 +        goto err;
@@ -1292,7 +1293,11 @@ index 0000000000..ae3d137e12
 +    while (l) {
 +        PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data;
 +        l = g_list_next(l);
-+        if (bdrv_op_is_blocked(di->bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
++
++        bdrv_graph_co_rdlock();
++        bool blocked = bdrv_op_is_blocked(di->bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp);
++        bdrv_graph_co_rdunlock();
++        if (blocked) {
 +            goto err;
 +        }
 +
@@ -1376,7 +1381,9 @@ index 0000000000..ae3d137e12
 +
 +            di->block_size = dump_cb_block_size;
 +
++            bdrv_graph_co_rdlock();
 +            const char *devname = bdrv_get_device_name(di->bs);
++            bdrv_graph_co_rdunlock();
 +            PBSBitmapAction action = PBS_BITMAP_ACTION_NOT_USED;
 +            size_t dirty = di->size;
 +
@@ -1452,7 +1459,9 @@ index 0000000000..ae3d137e12
 +                goto err_mutex;
 +            }
 +
++            bdrv_graph_co_rdlock();
 +            const char *devname = bdrv_get_device_name(di->bs);
++            bdrv_graph_co_rdunlock();
 +            di->dev_id = vma_writer_register_stream(vmaw, devname, di->size);
 +            if (di->dev_id <= 0) {
 +                error_set(errp, ERROR_CLASS_GENERIC_ERROR,
@@ -1680,10 +1689,10 @@ index 0000000000..ae3d137e12
 +    return ret;
 +}
 diff --git a/qapi/block-core.json b/qapi/block-core.json
-index bb471c078d..1b8462a51b 100644
+index f7c2b63c5d..e49c7b5bc9 100644
 --- a/qapi/block-core.json
 +++ b/qapi/block-core.json
-@@ -839,6 +839,235 @@
+@@ -851,6 +851,239 @@
  { 'command': 'query-block', 'returns': ['BlockInfo'],
    'allow-preconfig': true }
  
@@ -1752,6 +1761,9 @@ index bb471c078d..1b8462a51b 100644
 +# @config-file: a configuration file to include into
 +#               the backup archive.
 +#
++# @firewall-file: a firewall configuration file to include into the backup
++#     archive.
++#
 +# @speed: the maximum speed, in bytes per second
 +#
 +# @devlist: list of block device names (separated by ',', ';'
@@ -1819,8 +1831,6 @@ index bb471c078d..1b8462a51b 100644
 +#
 +# Cancel the current executing backup process.
 +#
-+# Returns: nothing on success
-+#
 +# Notes: This command succeeds even if there is no backup process running.
 +#
 +##
@@ -1844,6 +1854,9 @@ index bb471c078d..1b8462a51b 100644
 +#
 +# @pbs-library-version: Running version of libproxmox-backup-qemu0 library.
 +#
++# @backup-max-workers: Whether the 'max-workers' @BackupPerf setting is
++#     supported or not.
++#
 +##
 +{ 'struct': 'ProxmoxSupportStatus',
 +  'data': { 'pbs-dirty-bitmap': 'bool',
@@ -1920,10 +1933,10 @@ index bb471c078d..1b8462a51b 100644
  # @BlockDeviceTimedStats:
  #
 diff --git a/qapi/common.json b/qapi/common.json
-index 6fed9cde1a..630a2a8f9a 100644
+index 7558ce5430..6e3d800373 100644
 --- a/qapi/common.json
 +++ b/qapi/common.json
-@@ -207,3 +207,17 @@
+@@ -200,3 +200,17 @@
  ##
  { 'struct': 'HumanReadableText',
    'data': { 'human-readable-text': 'str' } }
@@ -1942,7 +1955,7 @@ index 6fed9cde1a..630a2a8f9a 100644
 +##
 +{ 'struct': 'UuidInfo', 'data': {'UUID': 'str'} }
 diff --git a/qapi/machine.json b/qapi/machine.json
-index 7da3c519ba..888457f810 100644
+index 1d69bffaa0..731d8d2f60 100644
 --- a/qapi/machine.json
 +++ b/qapi/machine.json
 @@ -4,6 +4,8 @@
@@ -1954,7 +1967,7 @@ index 7da3c519ba..888457f810 100644
  ##
  # = Machines
  ##
-@@ -230,20 +232,6 @@
+@@ -237,20 +239,6 @@
  ##
  { 'command': 'query-target', 'returns': 'TargetInfo' }
  
index 2d35795a231884b8d4c4a9255fc98fc9ff6ccfe8..bde2cb2dc19994fd577280060f67b456281da47a 100644 (file)
@@ -14,10 +14,10 @@ Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
  create mode 100644 pbs-restore.c
 
 diff --git a/meson.build b/meson.build
-index d53976d621..c3330310d9 100644
+index d16b97cf3c..6de51c34cb 100644
 --- a/meson.build
 +++ b/meson.build
-@@ -3914,6 +3914,10 @@ if have_tools
+@@ -4029,6 +4029,10 @@ if have_tools
    vma = executable('vma', files('vma.c', 'vma-reader.c') + genh,
                     dependencies: [authz, block, crypto, io, qom], install: true)
  
@@ -26,8 +26,8 @@ index d53976d621..c3330310d9 100644
 +                    libproxmox_backup_qemu], install: true)
 +
    subdir('storage-daemon')
-   subdir('contrib/rdmacm-mux')
-   subdir('contrib/elf2dmp')
+   foreach exe: [ 'qemu-img', 'qemu-io', 'qemu-nbd', 'qemu-storage-daemon']
 diff --git a/pbs-restore.c b/pbs-restore.c
 new file mode 100644
 index 0000000000..f03d9bab8d
index 0927c4d37b924e848f8604631cdcedc620a07615..b9578ba211f84718b573cced1edddab58b6b9d27 100644 (file)
@@ -14,35 +14,33 @@ Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
      getlength is now a coroutine function]
 Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
 ---
- block/meson.build    |   3 +
- block/pbs.c          | 305 +++++++++++++++++++++++++++++++++++++++++++
- configure            |   9 ++
+ block/meson.build    |   2 +
+ block/pbs.c          | 307 +++++++++++++++++++++++++++++++++++++++++++
  meson.build          |   2 +-
- qapi/block-core.json |  13 ++
+ qapi/block-core.json |  29 ++++
  qapi/pragma.json     |   1 +
- 6 files changed, 332 insertions(+), 1 deletion(-)
+ 5 files changed, 340 insertions(+), 1 deletion(-)
  create mode 100644 block/pbs.c
 
 diff --git a/block/meson.build b/block/meson.build
-index 6d468f89e5..becc99ac4e 100644
+index 6bba803f94..1945e04eeb 100644
 --- a/block/meson.build
 +++ b/block/meson.build
-@@ -50,6 +50,9 @@ block_ss.add(files(
+@@ -49,6 +49,8 @@ block_ss.add(files(
    '../pve-backup.c',
  ), libproxmox_backup_qemu)
  
-+block_ss.add(when: 'CONFIG_PBS_BDRV', if_true: files('pbs.c'))
-+block_ss.add(when: 'CONFIG_PBS_BDRV', if_true: libproxmox_backup_qemu)
++block_ss.add(files('pbs.c'), libproxmox_backup_qemu)
 +
  
  system_ss.add(when: 'CONFIG_TCG', if_true: files('blkreplay.c'))
  system_ss.add(files('block-ram-registrar.c'))
 diff --git a/block/pbs.c b/block/pbs.c
 new file mode 100644
-index 0000000000..a2211e0f3b
+index 0000000000..dd72356bd3
 --- /dev/null
 +++ b/block/pbs.c
-@@ -0,0 +1,305 @@
+@@ -0,0 +1,307 @@
 +/*
 + * Proxmox Backup Server read-only block driver
 + */
@@ -234,7 +232,8 @@ index 0000000000..a2211e0f3b
 +    proxmox_restore_disconnect(s->conn);
 +}
 +
-+static coroutine_fn int64_t pbs_co_getlength(BlockDriverState *bs)
++static coroutine_fn int64_t GRAPH_RDLOCK
++pbs_co_getlength(BlockDriverState *bs)
 +{
 +    BDRVPBSState *s = bs->opaque;
 +    return s->length;
@@ -251,9 +250,9 @@ index 0000000000..a2211e0f3b
 +    aio_co_schedule(rcb->ctx, rcb->co);
 +}
 +
-+static coroutine_fn int pbs_co_preadv(BlockDriverState *bs,
-+                                      int64_t offset, int64_t bytes,
-+                                      QEMUIOVector *qiov, BdrvRequestFlags flags)
++static coroutine_fn int GRAPH_RDLOCK
++pbs_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
++              QEMUIOVector *qiov, BdrvRequestFlags flags)
 +{
 +    BDRVPBSState *s = bs->opaque;
 +    int ret;
@@ -298,16 +297,17 @@ index 0000000000..a2211e0f3b
 +    return 0;
 +}
 +
-+static coroutine_fn int pbs_co_pwritev(BlockDriverState *bs,
-+                                       int64_t offset, int64_t bytes,
-+                                       QEMUIOVector *qiov, BdrvRequestFlags flags)
++static coroutine_fn int GRAPH_RDLOCK
++pbs_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
++               QEMUIOVector *qiov, BdrvRequestFlags flags)
 +{
 +    fprintf(stderr, "pbs-bdrv: cannot write to backup file, make sure "
 +           "any attached disk devices are set to read-only!\n");
 +    return -EPERM;
 +}
 +
-+static void pbs_refresh_filename(BlockDriverState *bs)
++static void GRAPH_RDLOCK
++pbs_refresh_filename(BlockDriverState *bs)
 +{
 +    BDRVPBSState *s = bs->opaque;
 +    if (s->namespace) {
@@ -348,52 +348,11 @@ index 0000000000..a2211e0f3b
 +}
 +
 +block_init(bdrv_pbs_init);
-diff --git a/configure b/configure
-index 133f4e3235..f5a830c1f3 100755
---- a/configure
-+++ b/configure
-@@ -256,6 +256,7 @@ qemu_suffix="qemu"
- softmmu="yes"
- linux_user=""
- bsd_user=""
-+pbs_bdrv="yes"
- plugins="$default_feature"
- ninja=""
- python=
-@@ -809,6 +810,10 @@ for opt do
-   ;;
-   --enable-download) download="enabled"; git_submodules_action=update;
-   ;;
-+  --disable-pbs-bdrv) pbs_bdrv="no"
-+  ;;
-+  --enable-pbs-bdrv) pbs_bdrv="yes"
-+  ;;
-   --enable-plugins) if test "$mingw32" = "yes"; then
-                         error_exit "TCG plugins not currently supported on Windows platforms"
-                     else
-@@ -959,6 +964,7 @@ cat << EOF
-   bsd-user        all BSD usermode emulation targets
-   pie             Position Independent Executables
-   debug-tcg       TCG debugging (default is disabled)
-+  pbs-bdrv        Proxmox backup server read-only block driver support
- NOTE: The object files are built at the place where configure is launched
- EOF
-@@ -1744,6 +1750,9 @@ if test "$solaris" = "yes" ; then
- fi
- echo "SRC_PATH=$source_path" >> $config_host_mak
- echo "TARGET_DIRS=$target_list" >> $config_host_mak
-+if test "$pbs_bdrv" = "yes" ; then
-+  echo "CONFIG_PBS_BDRV=y" >> $config_host_mak
-+fi
- # XXX: suppress that
- if [ "$bsd" = "yes" ] ; then
 diff --git a/meson.build b/meson.build
-index c3330310d9..cbfc9a43fb 100644
+index 6de51c34cb..3bc039f60f 100644
 --- a/meson.build
 +++ b/meson.build
-@@ -4319,7 +4319,7 @@ summary_info += {'bzip2 support':     libbzip2}
+@@ -4477,7 +4477,7 @@ summary_info += {'bzip2 support':     libbzip2}
  summary_info += {'lzfse support':     liblzfse}
  summary_info += {'zstd support':      zstd}
  summary_info += {'NUMA host support': numa}
@@ -403,10 +362,10 @@ index c3330310d9..cbfc9a43fb 100644
  summary_info += {'libdaxctl support': libdaxctl}
  summary_info += {'libudev':           libudev}
 diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 1b8462a51b..d67a6d448a 100644
+index e49c7b5bc9..fc32ff9957 100644
 --- a/qapi/block-core.json
 +++ b/qapi/block-core.json
-@@ -3396,6 +3396,7 @@
+@@ -3457,6 +3457,7 @@
              'parallels', 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum',
              'raw', 'rbd',
              { 'name': 'replication', 'if': 'CONFIG_REPLICATION' },
@@ -414,7 +373,7 @@ index 1b8462a51b..d67a6d448a 100644
              'ssh', 'throttle', 'vdi', 'vhdx',
              { 'name': 'virtio-blk-vfio-pci', 'if': 'CONFIG_BLKIO' },
              { 'name': 'virtio-blk-vhost-user', 'if': 'CONFIG_BLKIO' },
-@@ -3482,6 +3483,17 @@
+@@ -3543,6 +3544,33 @@
  { 'struct': 'BlockdevOptionsNull',
    'data': { '*size': 'int', '*latency-ns': 'uint64', '*read-zeroes': 'bool' } }
  
@@ -423,6 +382,22 @@ index 1b8462a51b..d67a6d448a 100644
 +#
 +# Driver specific block device options for the PBS backend.
 +#
++# @repository: Proxmox Backup Server repository.
++#
++# @snapshot: backup snapshots ID.
++#
++# @archive: archive name.
++#
++# @keyfile: keyfile to use for encryption.
++#
++# @password: password to use for connection.
++#
++# @fingerprint: backup server fingerprint.
++#
++# @key_password: password to unlock key.
++#
++# @namespace: namespace where backup snapshot lives.
++#
 +##
 +{ 'struct': 'BlockdevOptionsPbs',
 +  'data': { 'repository': 'str', 'snapshot': 'str', 'archive': 'str',
@@ -432,7 +407,7 @@ index 1b8462a51b..d67a6d448a 100644
  ##
  # @BlockdevOptionsNVMe:
  #
-@@ -4890,6 +4902,7 @@
+@@ -4977,6 +5005,7 @@
        'nfs':        'BlockdevOptionsNfs',
        'null-aio':   'BlockdevOptionsNull',
        'null-co':    'BlockdevOptionsNull',
@@ -441,10 +416,10 @@ index 1b8462a51b..d67a6d448a 100644
        'nvme-io_uring': { 'type': 'BlockdevOptionsNvmeIoUring',
                           'if': 'CONFIG_BLKIO' },
 diff --git a/qapi/pragma.json b/qapi/pragma.json
-index 325e684411..b6079f6a0e 100644
+index be8fa304c5..7ff46bd128 100644
 --- a/qapi/pragma.json
 +++ b/qapi/pragma.json
-@@ -45,6 +45,7 @@
+@@ -100,6 +100,7 @@
          'BlockInfo',                # query-block
          'BlockdevAioOptions',       # blockdev-add, -blockdev
          'BlockdevDriver',           # blockdev-add, query-blockstats, ...
index 24ec76175bc4eb54dd588cb0810b37b5e6333c7b..f56437353f7e4389d45eee92794bad7231532933 100644 (file)
@@ -9,15 +9,15 @@ fitting.
 Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
 ---
- meson.build | 2 ++
+ meson.build | 3 ++-
  os-posix.c  | 7 +++++--
- 2 files changed, 7 insertions(+), 2 deletions(-)
+ 2 files changed, 7 insertions(+), 3 deletions(-)
 
 diff --git a/meson.build b/meson.build
-index cbfc9a43fb..8206270272 100644
+index 3bc039f60f..067e8956a7 100644
 --- a/meson.build
 +++ b/meson.build
-@@ -1779,6 +1779,7 @@ endif
+@@ -1923,6 +1923,7 @@ endif
  has_gettid = cc.has_function('gettid')
  
  libuuid = cc.find_library('uuid', required: true)
@@ -25,16 +25,17 @@ index cbfc9a43fb..8206270272 100644
  libproxmox_backup_qemu = cc.find_library('proxmox_backup_qemu', required: true)
  
  # libselinux
-@@ -3406,6 +3407,7 @@ if have_block
-   # os-posix.c contains POSIX-specific functions used by qemu-storage-daemon,
-   # os-win32.c does not
-   blockdev_ss.add(when: 'CONFIG_POSIX', if_true: files('os-posix.c'))
-+  blockdev_ss.add(when: 'CONFIG_POSIX', if_true: libsystemd)
-   system_ss.add(when: 'CONFIG_WIN32', if_true: [files('os-win32.c')])
+@@ -3530,7 +3531,7 @@ if have_block
+   if host_os == 'windows'
+     system_ss.add(files('os-win32.c'))
+   else
+-    blockdev_ss.add(files('os-posix.c'))
++    blockdev_ss.add(files('os-posix.c'), libsystemd)
+   endif
  endif
  
 diff --git a/os-posix.c b/os-posix.c
-index 0cc1d991b1..f33d9901cf 100644
+index a4284e2c07..197a2120fd 100644
 --- a/os-posix.c
 +++ b/os-posix.c
 @@ -29,6 +29,8 @@
@@ -44,9 +45,9 @@ index 0cc1d991b1..f33d9901cf 100644
 +#include <systemd/sd-journal.h>
 +#include <syslog.h>
  
- /* Needed early for CONFIG_BSD etc. */
- #include "net/slirp.h"
-@@ -332,9 +334,10 @@ void os_setup_post(void)
+ #include "qemu/error-report.h"
+ #include "qemu/log.h"
+@@ -302,9 +304,10 @@ void os_setup_post(void)
  
          dup2(fd, 0);
          dup2(fd, 1);
index 1015db38575b6359f49e5e396b9f246437e7331f..1c55a418684e679443f58e1f7a1840af63fa4ebf 100644 (file)
@@ -26,10 +26,10 @@ Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
  create mode 100644 migration/pbs-state.c
 
 diff --git a/include/migration/misc.h b/include/migration/misc.h
-index 7dcc0b5c2c..4c940b2475 100644
+index c9e200f4eb..12c99ebc69 100644
 --- a/include/migration/misc.h
 +++ b/include/migration/misc.h
-@@ -77,4 +77,7 @@ bool migration_in_bg_snapshot(void);
+@@ -117,4 +117,7 @@ bool migration_in_bg_snapshot(void);
  /* migration/block-dirty-bitmap.c */
  void dirty_bitmap_mig_init(void);
  
@@ -38,7 +38,7 @@ index 7dcc0b5c2c..4c940b2475 100644
 +
  #endif
 diff --git a/migration/meson.build b/migration/meson.build
-index 07f6057acc..343994d891 100644
+index 800f12a60d..35a4306183 100644
 --- a/migration/meson.build
 +++ b/migration/meson.build
 @@ -7,7 +7,9 @@ migration_files = files(
@@ -52,17 +52,17 @@ index 07f6057acc..343994d891 100644
  system_ss.add(files(
    'block-dirty-bitmap.c',
 diff --git a/migration/migration.c b/migration/migration.c
-index 7a4c8beb5d..0a955a2a18 100644
+index 86bf76e925..b8d7e471a4 100644
 --- a/migration/migration.c
 +++ b/migration/migration.c
-@@ -162,6 +162,7 @@ void migration_object_init(void)
+@@ -239,6 +239,7 @@ void migration_object_init(void)
      blk_mig_init();
      ram_mig_init();
      dirty_bitmap_mig_init();
 +    pbs_state_mig_init();
  }
  
- void migration_cancel(const Error *error)
+ typedef struct {
 diff --git a/migration/pbs-state.c b/migration/pbs-state.c
 new file mode 100644
 index 0000000000..887e998b9e
@@ -174,10 +174,10 @@ index 0000000000..887e998b9e
 +                         NULL);
 +}
 diff --git a/pve-backup.c b/pve-backup.c
-index ae3d137e12..e6b17b797e 100644
+index 9c13a92623..9d480a8eec 100644
 --- a/pve-backup.c
 +++ b/pve-backup.c
-@@ -1082,6 +1082,7 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
+@@ -1091,6 +1091,7 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
      ret->pbs_library_version = g_strdup(proxmox_backup_qemu_version());
      ret->pbs_dirty_bitmap = true;
      ret->pbs_dirty_bitmap_savevm = true;
@@ -186,10 +186,10 @@ index ae3d137e12..e6b17b797e 100644
      ret->pbs_masterkey = true;
      ret->backup_max_workers = true;
 diff --git a/qapi/block-core.json b/qapi/block-core.json
-index d67a6d448a..09de550c95 100644
+index fc32ff9957..f516d8e95a 100644
 --- a/qapi/block-core.json
 +++ b/qapi/block-core.json
-@@ -991,6 +991,11 @@
+@@ -1004,6 +1004,11 @@
  # @pbs-dirty-bitmap-savevm: True if 'dirty-bitmaps' migration capability can
  #                           safely be set for savevm-async.
  #
@@ -201,7 +201,7 @@ index d67a6d448a..09de550c95 100644
  # @pbs-masterkey: True if the QMP backup call supports the 'master_keyfile'
  #                 parameter.
  #
-@@ -1001,6 +1006,7 @@
+@@ -1017,6 +1022,7 @@
    'data': { 'pbs-dirty-bitmap': 'bool',
              'query-bitmap-info': 'bool',
              'pbs-dirty-bitmap-savevm': 'bool',
index 75f5c28f7466d2e6f8e246e1207726448527ed7f..4a5b701bdf55281296782c569866d3b6349b1299 100644 (file)
@@ -19,7 +19,7 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
-index e1ae3b7316..285dd1d148 100644
+index 2708abf3d7..fb17c01308 100644
 --- a/migration/block-dirty-bitmap.c
 +++ b/migration/block-dirty-bitmap.c
 @@ -540,7 +540,7 @@ static int add_bitmaps_to_list(DBMSaveState *s, BlockDriverState *bs,
index f9fee14217ac616fc8c78110c95953903374f1db..c78bc03e92d1782215465eeea5f959828df7e805 100644 (file)
@@ -21,10 +21,10 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 30 insertions(+)
 
 diff --git a/block/iscsi.c b/block/iscsi.c
-index 34f97ab646..398782963d 100644
+index 2ff14b7472..46f275fbf7 100644
 --- a/block/iscsi.c
 +++ b/block/iscsi.c
-@@ -1391,12 +1391,42 @@ static char *get_initiator_name(QemuOpts *opts)
+@@ -1392,12 +1392,42 @@ static char *get_initiator_name(QemuOpts *opts)
      const char *name;
      char *iscsi_name;
      UuidInfo *uuid_info;
index 28dd8d161ee13b0ff6174f3888cfb65a80e41e24..5663dcc5ef683f53dbb452382a315858524310cc 100644 (file)
@@ -11,7 +11,7 @@ Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/block/stream.c b/block/stream.c
-index e522bbdec5..afed72db55 100644
+index 7031eef12b..d2da83ae7c 100644
 --- a/block/stream.c
 +++ b/block/stream.c
 @@ -27,7 +27,7 @@ enum {
diff --git a/debian/patches/pve/0038-block-add-alloc-track-driver.patch b/debian/patches/pve/0038-block-add-alloc-track-driver.patch
new file mode 100644 (file)
index 0000000..d302c8e
--- /dev/null
@@ -0,0 +1,471 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Stefan Reiter <s.reiter@proxmox.com>
+Date: Mon, 7 Dec 2020 15:21:03 +0100
+Subject: [PATCH] block: add alloc-track driver
+
+Add a new filter node 'alloc-track', which separates reads and writes to
+different children, thus allowing to put a backing image behind any
+blockdev (regardless of driver support). Since we can't detect any
+pre-allocated blocks, we can only track new writes, hence the write
+target ('file') for this node must always be empty.
+
+Intended use case is for live restoring, i.e. add a backup image as a
+block device into a VM, then put an alloc-track on the restore target
+and set the backup as backing. With this, one can use a regular
+'block-stream' to restore the image, while the VM can already run in the
+background. Copy-on-read will help make progress as the VM reads as
+well.
+
+Previously, this only worked if the target supported backing images, i.e.
+only for qcow2. With alloc-track, any driver for the target can be used.
+
+Replacing the node cannot be done in the
+track_co_change_backing_file() callback, because replacing a node
+cannot happen in a coroutine and requires the block graph lock
+exclusively. Could either become a special option for the stream job,
+or maybe the upcoming blockdev-replace QMP command can be used in the
+future.
+
+Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+[FE: adapt to changed function signatures
+     make error return value consistent with QEMU
+     avoid premature break during read
+     adhere to block graph lock requirements]
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+---
+ block/alloc-track.c | 366 ++++++++++++++++++++++++++++++++++++++++++++
+ block/meson.build   |   1 +
+ block/stream.c      |  34 ++++
+ 3 files changed, 401 insertions(+)
+ create mode 100644 block/alloc-track.c
+
+diff --git a/block/alloc-track.c b/block/alloc-track.c
+new file mode 100644
+index 0000000000..b9f8ea9137
+--- /dev/null
++++ b/block/alloc-track.c
+@@ -0,0 +1,366 @@
++/*
++ * Node to allow backing images to be applied to any node. Assumes a blank
++ * image to begin with, only new writes are tracked as allocated, thus this
++ * must never be put on a node that already contains data.
++ *
++ * Copyright (c) 2020 Proxmox Server Solutions GmbH
++ * Copyright (c) 2020 Stefan Reiter <s.reiter@proxmox.com>
++ *
++ * This work is licensed under the terms of the GNU GPL, version 2 or later.
++ * See the COPYING file in the top-level directory.
++ */
++
++#include "qemu/osdep.h"
++#include "qapi/error.h"
++#include "block/block_int.h"
++#include "block/dirty-bitmap.h"
++#include "block/graph-lock.h"
++#include "qapi/qmp/qdict.h"
++#include "qapi/qmp/qstring.h"
++#include "qemu/cutils.h"
++#include "qemu/error-report.h"
++#include "qemu/option.h"
++#include "qemu/module.h"
++#include "sysemu/block-backend.h"
++
++#define TRACK_OPT_AUTO_REMOVE "auto-remove"
++
++typedef enum DropState {
++    DropNone,
++    DropInProgress,
++} DropState;
++
++typedef struct {
++    BdrvDirtyBitmap *bitmap;
++    uint64_t granularity;
++    DropState drop_state;
++    bool auto_remove;
++} BDRVAllocTrackState;
++
++static QemuOptsList runtime_opts = {
++    .name = "alloc-track",
++    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
++    .desc = {
++        {
++            .name = TRACK_OPT_AUTO_REMOVE,
++            .type = QEMU_OPT_BOOL,
++            .help = "automatically replace this node with 'file' when 'backing'"
++                    " is detached",
++        },
++        { /* end of list */ }
++    },
++};
++
++static void GRAPH_RDLOCK
++track_refresh_limits(BlockDriverState *bs, Error **errp)
++{
++    BDRVAllocTrackState *s = bs->opaque;
++
++    if (!bs->file) {
++        return;
++    }
++
++    /*
++     * Always use alignment from underlying write device so RMW cycle for
++     * bdrv_pwritev reads data from our backing via track_co_preadv. Also use at
++     * least the bitmap granularity.
++     */
++    bs->bl.request_alignment = MAX(bs->file->bs->bl.request_alignment,
++                                   s->granularity);
++}
++
++static int track_open(BlockDriverState *bs, QDict *options, int flags,
++                      Error **errp)
++{
++    BDRVAllocTrackState *s = bs->opaque;
++    BdrvChild *file = NULL;
++    QemuOpts *opts;
++    Error *local_err = NULL;
++    int ret = 0;
++
++    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
++    qemu_opts_absorb_qdict(opts, options, &local_err);
++    if (local_err) {
++        error_propagate(errp, local_err);
++        ret = -EINVAL;
++        goto fail;
++    }
++
++    s->auto_remove = qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false);
++
++    /* open the target (write) node, backing will be attached by block layer */
++    file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
++                           BDRV_CHILD_DATA | BDRV_CHILD_METADATA, false,
++                           &local_err);
++    bdrv_graph_wrlock();
++    bs->file = file;
++    bdrv_graph_wrunlock();
++    if (local_err) {
++        ret = -EINVAL;
++        error_propagate(errp, local_err);
++        goto fail;
++    }
++
++    bdrv_graph_rdlock_main_loop();
++    BlockDriverInfo bdi = {0};
++    ret = bdrv_get_info(bs->file->bs, &bdi);
++    if (ret < 0) {
++        /*
++         * Not a hard failure. Worst that can happen is partial cluster
++         * allocation in the write target. However, the driver here returns its
++         * allocation status based on the dirty bitmap, so any other data that
++         * maps to such a cluster will still be copied later by a stream job (or
++         * during writes to that cluster).
++         */
++        warn_report("alloc-track: unable to query cluster size for write target: %s",
++                    strerror(-ret)); /* ret is a negative errno */
++    }
++    ret = 0;
++    /*
++     * Always consider alignment from underlying write device so RMW cycle for
++     * bdrv_pwritev reads data from our backing via track_co_preadv. Also try to
++     * avoid partial cluster allocation in the write target by considering the
++     * cluster size.
++     */
++    s->granularity = MAX(bs->file->bs->bl.request_alignment,
++                         MAX(bdi.cluster_size, BDRV_SECTOR_SIZE));
++    track_refresh_limits(bs, errp);
++    s->bitmap = bdrv_create_dirty_bitmap(bs->file->bs, s->granularity, NULL,
++                                         &local_err);
++    bdrv_graph_rdunlock_main_loop();
++    if (local_err) {
++        ret = -EIO;
++        error_propagate(errp, local_err);
++        goto fail;
++    }
++
++    s->drop_state = DropNone;
++
++fail:
++    if (ret < 0) {
++        bdrv_graph_wrlock();
++        bdrv_unref_child(bs, bs->file);
++        bdrv_graph_wrunlock();
++        if (s->bitmap) {
++            bdrv_release_dirty_bitmap(s->bitmap);
++        }
++    }
++    qemu_opts_del(opts);
++    return ret;
++}
++
++static void track_close(BlockDriverState *bs)
++{
++    BDRVAllocTrackState *s = bs->opaque;
++    if (s->bitmap) {
++        bdrv_release_dirty_bitmap(s->bitmap);
++    }
++}
++
++static coroutine_fn int64_t GRAPH_RDLOCK
++track_co_getlength(BlockDriverState *bs)
++{
++    return bdrv_co_getlength(bs->file->bs);
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
++                QEMUIOVector *qiov, BdrvRequestFlags flags)
++{
++    BDRVAllocTrackState *s = bs->opaque;
++    QEMUIOVector local_qiov;
++    int ret = 0; /* a zero-length request never enters the loop below */
++
++    /* 'cur_offset' is relative to 'offset', 'local_offset' to image start */
++    uint64_t cur_offset, local_offset;
++    int64_t local_bytes;
++    bool alloc;
++
++    if (offset < 0 || bytes < 0) {
++        fprintf(stderr, "unexpected negative 'offset' or 'bytes' value!\n");
++        return -EIO;
++    }
++
++    /* a read request can span multiple granularity-sized chunks, and can thus
++     * contain blocks with different allocation status - we could just iterate
++     * granularity-wise, but for better performance use bdrv_dirty_bitmap_next_X
++     * to find the next flip and consider everything up to that in one go */
++    for (cur_offset = 0; cur_offset < bytes; cur_offset += local_bytes) {
++        local_offset = offset + cur_offset;
++        alloc = bdrv_dirty_bitmap_get(s->bitmap, local_offset);
++        if (alloc) {
++            local_bytes = bdrv_dirty_bitmap_next_zero(s->bitmap, local_offset,
++                                                      bytes - cur_offset);
++        } else {
++            local_bytes = bdrv_dirty_bitmap_next_dirty(s->bitmap, local_offset,
++                                                       bytes - cur_offset);
++        }
++
++        /* _bitmap_next_X return is -1 if no end found within limit, otherwise
++         * offset of next flip (to start of image) */
++        local_bytes = local_bytes < 0 ? bytes - cur_offset
++                                      : local_bytes - local_offset;
++
++        qemu_iovec_init_slice(&local_qiov, qiov, cur_offset, local_bytes);
++        if (alloc) {
++            ret = bdrv_co_preadv(bs->file, local_offset, local_bytes,
++                                 &local_qiov, flags);
++        } else if (bs->backing) {
++            ret = bdrv_co_preadv(bs->backing, local_offset, local_bytes,
++                                 &local_qiov, flags);
++        } else {
++            /* local_qiov is already the slice at cur_offset: fill from 0 */
++            qemu_iovec_memset(&local_qiov, 0, 0, local_bytes);
++            ret = 0;
++        }
++        qemu_iovec_destroy(&local_qiov); /* slice may own an iov array */
++
++        if (ret != 0) {
++            break;
++        }
++    }
++
++    return ret;
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
++                 QEMUIOVector *qiov, BdrvRequestFlags flags)
++{
++    return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
++                       BdrvRequestFlags flags)
++{
++    return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
++{
++    return bdrv_co_pdiscard(bs->file, offset, bytes);
++}
++
++static coroutine_fn int GRAPH_RDLOCK
++track_co_flush(BlockDriverState *bs)
++{
++    return bdrv_co_flush(bs->file->bs);
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_block_status(BlockDriverState *bs, bool want_zero,
++                                            int64_t offset,
++                                            int64_t bytes,
++                                            int64_t *pnum,
++                                            int64_t *map,
++                                            BlockDriverState **file)
++{
++    BDRVAllocTrackState *s = bs->opaque;
++
++    bool alloc = bdrv_dirty_bitmap_get(s->bitmap, offset);
++    int64_t next_flipped;
++    if (alloc) {
++        next_flipped = bdrv_dirty_bitmap_next_zero(s->bitmap, offset, bytes);
++    } else {
++        next_flipped = bdrv_dirty_bitmap_next_dirty(s->bitmap, offset, bytes);
++    }
++
++    /* in case not the entire region has the same state, we need to set pnum to
++     * indicate for how many bytes our result is valid */
++    *pnum = next_flipped == -1 ? bytes : next_flipped - offset;
++    *map = offset;
++
++    if (alloc) {
++        *file = bs->file->bs;
++        return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
++    } else if (bs->backing) {
++        *file = bs->backing->bs;
++    }
++    return 0;
++}
++
++static void GRAPH_RDLOCK
++track_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
++                 BlockReopenQueue *reopen_queue, uint64_t perm, uint64_t shared,
++                 uint64_t *nperm, uint64_t *nshared)
++{
++    BDRVAllocTrackState *s = bs->opaque;
++
++    *nshared = BLK_PERM_ALL;
++
++    /* in case we're currently dropping ourselves, claim to not use any
++     * permissions at all - which is fine, since from this point on we will
++     * never issue a read or write anymore */
++    if (s->drop_state == DropInProgress) {
++        *nperm = 0;
++        return;
++    }
++
++    if (role & BDRV_CHILD_DATA) {
++        *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
++    } else {
++        /* 'backing' is also a child of our BDS, but we don't expect it to be
++         * writeable, so we only forward 'consistent read' */
++        *nperm = perm & BLK_PERM_CONSISTENT_READ;
++    }
++}
++
++static int coroutine_fn GRAPH_RDLOCK
++track_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
++                             const char *backing_fmt)
++{
++    /*
++     * Note that the actual backing file graph change is already done in the
++     * stream job itself with bdrv_set_backing_hd_drained(), so no need to
++     * actually do anything here. But still needs to be implemented, to make
++     * our caller (i.e. bdrv_co_change_backing_file() do the right thing).
++     *
++     * FIXME
++     * We'd like to auto-remove ourselves from the block graph, but it cannot
++     * be done from a coroutine. Currently done in the stream job, where it
++     * kinda fits better, but in the long-term, a special parameter would be
++     * nice (or done via qemu-server via upcoming blockdev-replace QMP command).
++     */
++    if (backing_file == NULL) {
++        BDRVAllocTrackState *s = bs->opaque;
++        bdrv_drained_begin(bs);
++        s->drop_state = DropInProgress;
++        bdrv_child_refresh_perms(bs, bs->file, &error_abort);
++        bdrv_drained_end(bs);
++    }
++
++    return 0;
++}
++
++static BlockDriver bdrv_alloc_track = {
++    .format_name                      = "alloc-track",
++    .instance_size                    = sizeof(BDRVAllocTrackState),
++
++    .bdrv_file_open                   = track_open,
++    .bdrv_close                       = track_close,
++    .bdrv_co_getlength                = track_co_getlength,
++    .bdrv_child_perm                  = track_child_perm,
++    .bdrv_refresh_limits              = track_refresh_limits,
++
++    .bdrv_co_pwrite_zeroes            = track_co_pwrite_zeroes,
++    .bdrv_co_pwritev                  = track_co_pwritev,
++    .bdrv_co_preadv                   = track_co_preadv,
++    .bdrv_co_pdiscard                 = track_co_pdiscard,
++
++    .bdrv_co_flush                    = track_co_flush,
++    .bdrv_co_flush_to_disk            = track_co_flush,
++
++    .supports_backing                 = true,
++
++    .bdrv_co_block_status             = track_co_block_status,
++    .bdrv_co_change_backing_file      = track_co_change_backing_file,
++};
++
++static void bdrv_alloc_track_init(void)
++{
++    bdrv_register(&bdrv_alloc_track);
++}
++
++block_init(bdrv_alloc_track_init);
+diff --git a/block/meson.build b/block/meson.build
+index 1945e04eeb..2873f3a25a 100644
+--- a/block/meson.build
++++ b/block/meson.build
+@@ -2,6 +2,7 @@ block_ss.add(genh)
+ block_ss.add(files(
+   'accounting.c',
+   'aio_task.c',
++  'alloc-track.c',
+   'amend.c',
+   'backup.c',
+   'backup-dump.c',
+diff --git a/block/stream.c b/block/stream.c
+index d2da83ae7c..f941cba14e 100644
+--- a/block/stream.c
++++ b/block/stream.c
+@@ -120,6 +120,40 @@ static int stream_prepare(Job *job)
+             ret = -EPERM;
+             goto out;
+         }
++
++        /*
++         * This cannot be done in the co_change_backing_file callback, because
++         * bdrv_replace_node() cannot be done in a coroutine. The latter also
++         * requires the graph lock exclusively. Only required for the
++         * alloc-track driver.
++         *
++         * The long-term plan is to either have an explicit parameter for the
++         * stream job or use the upcoming blockdev-replace QMP command.
++         */
++        if (base_id == NULL && strcmp(unfiltered_bs->drv->format_name, "alloc-track") == 0) {
++            BlockDriverState *file_bs;
++
++            bdrv_graph_rdlock_main_loop();
++            file_bs = unfiltered_bs->file->bs;
++            bdrv_graph_rdunlock_main_loop();
++
++            bdrv_ref(unfiltered_bs); // unrefed by bdrv_replace_node()
++            bdrv_drained_begin(file_bs);
++            bdrv_graph_wrlock();
++
++            bdrv_replace_node(unfiltered_bs, file_bs, &local_err);
++
++            bdrv_graph_wrunlock();
++            bdrv_drained_end(file_bs);
++            bdrv_unref(unfiltered_bs);
++
++            if (local_err) {
++                error_prepend(&local_err, "failed to replace alloc-track node: ");
++                error_report_err(local_err);
++                ret = -EPERM;
++                goto out;
++            }
++        }
+     }
+ out:
diff --git a/debian/patches/pve/0038-block-io-accept-NULL-qiov-in-bdrv_pad_request.patch b/debian/patches/pve/0038-block-io-accept-NULL-qiov-in-bdrv_pad_request.patch
deleted file mode 100644 (file)
index bb9b72c..0000000
+++ /dev/null
@@ -1,64 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Stefan Reiter <s.reiter@proxmox.com>
-Date: Tue, 2 Mar 2021 16:11:54 +0100
-Subject: [PATCH] block/io: accept NULL qiov in bdrv_pad_request
-
-Some operations, e.g. block-stream, perform reads while discarding the
-results (only copy-on-read matters). In this case they will pass NULL as
-the target QEMUIOVector, which will however trip bdrv_pad_request, since
-it wants to extend its passed vector.
-
-If there is no qiov, no operation can be done with it, but the bytes
-and offset still need to be updated, so the subsequent aligned read
-will actually be aligned and not run into an assertion failure.
-
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
-[FE: do update bytes and offset in any case]
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- block/io.c | 29 ++++++++++++++++-------------
- 1 file changed, 16 insertions(+), 13 deletions(-)
-
-diff --git a/block/io.c b/block/io.c
-index 83d1b1dfdc..e927881e40 100644
---- a/block/io.c
-+++ b/block/io.c
-@@ -1723,22 +1723,25 @@ static int bdrv_pad_request(BlockDriverState *bs,
-         return 0;
-     }
--    sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
--                                  &sliced_head, &sliced_tail,
--                                  &sliced_niov);
--
--    /* Guaranteed by bdrv_check_request32() */
--    assert(*bytes <= SIZE_MAX);
--    ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
--                                  sliced_head, *bytes);
--    if (ret < 0) {
--        bdrv_padding_finalize(pad);
--        return ret;
-+    if (qiov && *qiov) {
-+        sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
-+                                      &sliced_head, &sliced_tail,
-+                                      &sliced_niov);
-+
-+        /* Guaranteed by bdrv_check_request32() */
-+        assert(*bytes <= SIZE_MAX);
-+        ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
-+                                      sliced_head, *bytes);
-+        if (ret < 0) {
-+            bdrv_padding_finalize(pad);
-+            return ret;
-+        }
-+        *qiov = &pad->local_qiov;
-+        *qiov_offset = 0;
-     }
-+
-     *bytes += pad->head + pad->tail;
-     *offset -= pad->head;
--    *qiov = &pad->local_qiov;
--    *qiov_offset = 0;
-     if (padded) {
-         *padded = true;
-     }
diff --git a/debian/patches/pve/0039-Revert-block-rbd-workaround-for-ceph-issue-53784.patch b/debian/patches/pve/0039-Revert-block-rbd-workaround-for-ceph-issue-53784.patch
new file mode 100644 (file)
index 0000000..f99f717
--- /dev/null
@@ -0,0 +1,81 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fabian Ebner <f.ebner@proxmox.com>
+Date: Thu, 23 Jun 2022 14:00:05 +0200
+Subject: [PATCH] Revert "block/rbd: workaround for ceph issue #53784"
+
+This reverts commit fc176116cdea816ceb8dd969080b2b95f58edbc0 in
+preparation to revert 0347a8fd4c3faaedf119be04c197804be40a384b.
+
+Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/rbd.c | 42 ++----------------------------------------
+ 1 file changed, 2 insertions(+), 40 deletions(-)
+
+diff --git a/block/rbd.c b/block/rbd.c
+index 63f60d41be..367db42dce 100644
+--- a/block/rbd.c
++++ b/block/rbd.c
+@@ -1515,7 +1515,6 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+     int status, r;
+     RBDDiffIterateReq req = { .offs = offset };
+     uint64_t features, flags;
+-    uint64_t head = 0;
+     assert(offset + bytes <= s->image_size);
+@@ -1543,43 +1542,7 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+         return status;
+     }
+-#if LIBRBD_VERSION_CODE < LIBRBD_VERSION(1, 17, 0)
+-    /*
+-     * librbd had a bug until early 2022 that affected all versions of ceph that
+-     * supported fast-diff. This bug results in reporting of incorrect offsets
+-     * if the offset parameter to rbd_diff_iterate2 is not object aligned.
+-     * Work around this bug by rounding down the offset to object boundaries.
+-     * This is OK because we call rbd_diff_iterate2 with whole_object = true.
+-     * However, this workaround only works for non cloned images with default
+-     * striping.
+-     *
+-     * See: https://tracker.ceph.com/issues/53784
+-     */
+-
+-    /* check if RBD image has non-default striping enabled */
+-    if (features & RBD_FEATURE_STRIPINGV2) {
+-        return status;
+-    }
+-
+-#pragma GCC diagnostic push
+-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+-    /*
+-     * check if RBD image is a clone (= has a parent).
+-     *
+-     * rbd_get_parent_info is deprecated from Nautilus onwards, but the
+-     * replacement rbd_get_parent is not present in Luminous and Mimic.
+-     */
+-    if (rbd_get_parent_info(s->image, NULL, 0, NULL, 0, NULL, 0) != -ENOENT) {
+-        return status;
+-    }
+-#pragma GCC diagnostic pop
+-
+-    head = req.offs & (s->object_size - 1);
+-    req.offs -= head;
+-    bytes += head;
+-#endif
+-
+-    r = rbd_diff_iterate2(s->image, NULL, req.offs, bytes, true, true,
++    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
+                           qemu_rbd_diff_iterate_cb, &req);
+     if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
+         return status;
+@@ -1598,8 +1561,7 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+         status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
+     }
+-    assert(req.bytes > head);
+-    *pnum = req.bytes - head;
++    *pnum = req.bytes;
+     return status;
+ }
diff --git a/debian/patches/pve/0039-block-add-alloc-track-driver.patch b/debian/patches/pve/0039-block-add-alloc-track-driver.patch
deleted file mode 100644 (file)
index ea5f105..0000000
+++ /dev/null
@@ -1,406 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Stefan Reiter <s.reiter@proxmox.com>
-Date: Mon, 7 Dec 2020 15:21:03 +0100
-Subject: [PATCH] block: add alloc-track driver
-
-Add a new filter node 'alloc-track', which seperates reads and writes to
-different children, thus allowing to put a backing image behind any
-blockdev (regardless of driver support). Since we can't detect any
-pre-allocated blocks, we can only track new writes, hence the write
-target ('file') for this node must always be empty.
-
-Intended use case is for live restoring, i.e. add a backup image as a
-block device into a VM, then put an alloc-track on the restore target
-and set the backup as backing. With this, one can use a regular
-'block-stream' to restore the image, while the VM can already run in the
-background. Copy-on-read will help make progress as the VM reads as
-well.
-
-This only worked if the target supports backing images, so up until now
-only for qcow2, with alloc-track any driver for the target can be used.
-
-If 'auto-remove' is set, alloc-track will automatically detach itself
-once the backing image is removed. It will be replaced by 'file'.
-
-Signed-off-by: Stefan Reiter <s.reiter@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
-[FE: adapt to changed function signatures
-     make error return value consistent with QEMU
-     avoid premature break during read]
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
----
- block/alloc-track.c | 352 ++++++++++++++++++++++++++++++++++++++++++++
- block/meson.build   |   1 +
- 2 files changed, 353 insertions(+)
- create mode 100644 block/alloc-track.c
-
-diff --git a/block/alloc-track.c b/block/alloc-track.c
-new file mode 100644
-index 0000000000..b75d7c6460
---- /dev/null
-+++ b/block/alloc-track.c
-@@ -0,0 +1,352 @@
-+/*
-+ * Node to allow backing images to be applied to any node. Assumes a blank
-+ * image to begin with, only new writes are tracked as allocated, thus this
-+ * must never be put on a node that already contains data.
-+ *
-+ * Copyright (c) 2020 Proxmox Server Solutions GmbH
-+ * Copyright (c) 2020 Stefan Reiter <s.reiter@proxmox.com>
-+ *
-+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
-+ * See the COPYING file in the top-level directory.
-+ */
-+
-+#include "qemu/osdep.h"
-+#include "qapi/error.h"
-+#include "block/block_int.h"
-+#include "block/dirty-bitmap.h"
-+#include "qapi/qmp/qdict.h"
-+#include "qapi/qmp/qstring.h"
-+#include "qemu/cutils.h"
-+#include "qemu/option.h"
-+#include "qemu/module.h"
-+#include "sysemu/block-backend.h"
-+
-+#define TRACK_OPT_AUTO_REMOVE "auto-remove"
-+
-+typedef enum DropState {
-+    DropNone,
-+    DropRequested,
-+    DropInProgress,
-+} DropState;
-+
-+typedef struct {
-+    BdrvDirtyBitmap *bitmap;
-+    DropState drop_state;
-+    bool auto_remove;
-+} BDRVAllocTrackState;
-+
-+static QemuOptsList runtime_opts = {
-+    .name = "alloc-track",
-+    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
-+    .desc = {
-+        {
-+            .name = TRACK_OPT_AUTO_REMOVE,
-+            .type = QEMU_OPT_BOOL,
-+            .help = "automatically replace this node with 'file' when 'backing'"
-+                    "is detached",
-+        },
-+        { /* end of list */ }
-+    },
-+};
-+
-+static void track_refresh_limits(BlockDriverState *bs, Error **errp)
-+{
-+    BlockDriverInfo bdi;
-+
-+    if (!bs->file) {
-+        return;
-+    }
-+
-+    /* always use alignment from underlying write device so RMW cycle for
-+     * bdrv_pwritev reads data from our backing via track_co_preadv (no partial
-+     * cluster allocation in 'file') */
-+    bdrv_get_info(bs->file->bs, &bdi);
-+    bs->bl.request_alignment = MAX(bs->file->bs->bl.request_alignment,
-+                                   MAX(bdi.cluster_size, BDRV_SECTOR_SIZE));
-+}
-+
-+static int track_open(BlockDriverState *bs, QDict *options, int flags,
-+                      Error **errp)
-+{
-+    BDRVAllocTrackState *s = bs->opaque;
-+    QemuOpts *opts;
-+    Error *local_err = NULL;
-+    int ret = 0;
-+
-+    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
-+    qemu_opts_absorb_qdict(opts, options, &local_err);
-+    if (local_err) {
-+        error_propagate(errp, local_err);
-+        ret = -EINVAL;
-+        goto fail;
-+    }
-+
-+    s->auto_remove = qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false);
-+
-+    /* open the target (write) node, backing will be attached by block layer */
-+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
-+                               BDRV_CHILD_DATA | BDRV_CHILD_METADATA, false,
-+                               &local_err);
-+    if (local_err) {
-+        ret = -EINVAL;
-+        error_propagate(errp, local_err);
-+        goto fail;
-+    }
-+
-+    track_refresh_limits(bs, errp);
-+    uint64_t gran = bs->bl.request_alignment;
-+    s->bitmap = bdrv_create_dirty_bitmap(bs->file->bs, gran, NULL, &local_err);
-+    if (local_err) {
-+        ret = -EIO;
-+        error_propagate(errp, local_err);
-+        goto fail;
-+    }
-+
-+    s->drop_state = DropNone;
-+
-+fail:
-+    if (ret < 0) {
-+        bdrv_unref_child(bs, bs->file);
-+        if (s->bitmap) {
-+            bdrv_release_dirty_bitmap(s->bitmap);
-+        }
-+    }
-+    qemu_opts_del(opts);
-+    return ret;
-+}
-+
-+static void track_close(BlockDriverState *bs)
-+{
-+    BDRVAllocTrackState *s = bs->opaque;
-+    if (s->bitmap) {
-+        bdrv_release_dirty_bitmap(s->bitmap);
-+    }
-+}
-+
-+static coroutine_fn int64_t track_co_getlength(BlockDriverState *bs)
-+{
-+    return bdrv_co_getlength(bs->file->bs);
-+}
-+
-+static int coroutine_fn track_co_preadv(BlockDriverState *bs,
-+    int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
-+{
-+    BDRVAllocTrackState *s = bs->opaque;
-+    QEMUIOVector local_qiov;
-+    int ret;
-+
-+    /* 'cur_offset' is relative to 'offset', 'local_offset' to image start */
-+    uint64_t cur_offset, local_offset;
-+    int64_t local_bytes;
-+    bool alloc;
-+
-+    if (offset < 0 || bytes < 0) {
-+        fprintf(stderr, "unexpected negative 'offset' or 'bytes' value!\n");
-+        return -EIO;
-+    }
-+
-+    /* a read request can span multiple granularity-sized chunks, and can thus
-+     * contain blocks with different allocation status - we could just iterate
-+     * granularity-wise, but for better performance use bdrv_dirty_bitmap_next_X
-+     * to find the next flip and consider everything up to that in one go */
-+    for (cur_offset = 0; cur_offset < bytes; cur_offset += local_bytes) {
-+        local_offset = offset + cur_offset;
-+        alloc = bdrv_dirty_bitmap_get(s->bitmap, local_offset);
-+        if (alloc) {
-+            local_bytes = bdrv_dirty_bitmap_next_zero(s->bitmap, local_offset,
-+                                                      bytes - cur_offset);
-+        } else {
-+            local_bytes = bdrv_dirty_bitmap_next_dirty(s->bitmap, local_offset,
-+                                                       bytes - cur_offset);
-+        }
-+
-+        /* _bitmap_next_X return is -1 if no end found within limit, otherwise
-+         * offset of next flip (to start of image) */
-+        local_bytes = local_bytes < 0 ?
-+            bytes - cur_offset :
-+            local_bytes - local_offset;
-+
-+        qemu_iovec_init_slice(&local_qiov, qiov, cur_offset, local_bytes);
-+
-+        if (alloc) {
-+            ret = bdrv_co_preadv(bs->file, local_offset, local_bytes,
-+                                 &local_qiov, flags);
-+        } else if (bs->backing) {
-+            ret = bdrv_co_preadv(bs->backing, local_offset, local_bytes,
-+                                 &local_qiov, flags);
-+        } else {
-+            qemu_iovec_memset(&local_qiov, cur_offset, 0, local_bytes);
-+            ret = 0;
-+        }
-+
-+        if (ret != 0) {
-+            break;
-+        }
-+    }
-+
-+    return ret;
-+}
-+
-+static int coroutine_fn track_co_pwritev(BlockDriverState *bs,
-+    int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
-+{
-+    return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
-+}
-+
-+static int coroutine_fn track_co_pwrite_zeroes(BlockDriverState *bs,
-+    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
-+{
-+    return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
-+}
-+
-+static int coroutine_fn track_co_pdiscard(BlockDriverState *bs,
-+    int64_t offset, int64_t bytes)
-+{
-+    return bdrv_co_pdiscard(bs->file, offset, bytes);
-+}
-+
-+static coroutine_fn int track_co_flush(BlockDriverState *bs)
-+{
-+    return bdrv_co_flush(bs->file->bs);
-+}
-+
-+static int coroutine_fn track_co_block_status(BlockDriverState *bs,
-+                                              bool want_zero,
-+                                              int64_t offset,
-+                                              int64_t bytes,
-+                                              int64_t *pnum,
-+                                              int64_t *map,
-+                                              BlockDriverState **file)
-+{
-+    BDRVAllocTrackState *s = bs->opaque;
-+
-+    bool alloc = bdrv_dirty_bitmap_get(s->bitmap, offset);
-+    int64_t next_flipped;
-+    if (alloc) {
-+        next_flipped = bdrv_dirty_bitmap_next_zero(s->bitmap, offset, bytes);
-+    } else {
-+        next_flipped = bdrv_dirty_bitmap_next_dirty(s->bitmap, offset, bytes);
-+    }
-+
-+    /* in case not the entire region has the same state, we need to set pnum to
-+     * indicate for how many bytes our result is valid */
-+    *pnum = next_flipped == -1 ? bytes : next_flipped - offset;
-+    *map = offset;
-+
-+    if (alloc) {
-+        *file = bs->file->bs;
-+        return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
-+    } else if (bs->backing) {
-+        *file = bs->backing->bs;
-+    }
-+    return 0;
-+}
-+
-+static void track_child_perm(BlockDriverState *bs, BdrvChild *c,
-+                             BdrvChildRole role, BlockReopenQueue *reopen_queue,
-+                             uint64_t perm, uint64_t shared,
-+                             uint64_t *nperm, uint64_t *nshared)
-+{
-+    BDRVAllocTrackState *s = bs->opaque;
-+
-+    *nshared = BLK_PERM_ALL;
-+
-+    /* in case we're currently dropping ourselves, claim to not use any
-+     * permissions at all - which is fine, since from this point on we will
-+     * never issue a read or write anymore */
-+    if (s->drop_state == DropInProgress) {
-+        *nperm = 0;
-+        return;
-+    }
-+
-+    if (role & BDRV_CHILD_DATA) {
-+        *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
-+    } else {
-+        /* 'backing' is also a child of our BDS, but we don't expect it to be
-+         * writeable, so we only forward 'consistent read' */
-+        *nperm = perm & BLK_PERM_CONSISTENT_READ;
-+    }
-+}
-+
-+static void track_drop(void *opaque)
-+{
-+    BlockDriverState *bs = (BlockDriverState*)opaque;
-+    BlockDriverState *file = bs->file->bs;
-+    BDRVAllocTrackState *s = bs->opaque;
-+
-+    assert(file);
-+
-+    /* we rely on the fact that we're not used anywhere else, so let's wait
-+     * until we're only used once - in the drive connected to the guest (and one
-+     * ref is held by bdrv_ref in track_change_backing_file) */
-+    if (bs->refcnt > 2) {
-+        aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, opaque);
-+        return;
-+    }
-+    AioContext *aio_context = bdrv_get_aio_context(bs);
-+    aio_context_acquire(aio_context);
-+
-+    bdrv_drained_begin(bs);
-+
-+    /* now that we're drained, we can safely set 'DropInProgress' */
-+    s->drop_state = DropInProgress;
-+    bdrv_child_refresh_perms(bs, bs->file, &error_abort);
-+
-+    bdrv_replace_node(bs, file, &error_abort);
-+    bdrv_set_backing_hd(bs, NULL, &error_abort);
-+    bdrv_drained_end(bs);
-+    bdrv_unref(bs);
-+    aio_context_release(aio_context);
-+}
-+
-+static int track_change_backing_file(BlockDriverState *bs,
-+                                     const char *backing_file,
-+                                     const char *backing_fmt)
-+{
-+    BDRVAllocTrackState *s = bs->opaque;
-+    if (s->auto_remove && s->drop_state == DropNone &&
-+        backing_file == NULL && backing_fmt == NULL)
-+    {
-+        /* backing file has been disconnected, there's no longer any use for
-+         * this node, so let's remove ourselves from the block graph - we need
-+         * to schedule this for later however, since when this function is
-+         * called, the blockjob modifying us is probably not done yet and has a
-+         * blocker on 'bs' */
-+        s->drop_state = DropRequested;
-+        bdrv_ref(bs);
-+        aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, (void*)bs);
-+    }
-+
-+    return 0;
-+}
-+
-+static BlockDriver bdrv_alloc_track = {
-+    .format_name                      = "alloc-track",
-+    .instance_size                    = sizeof(BDRVAllocTrackState),
-+
-+    .bdrv_file_open                   = track_open,
-+    .bdrv_close                       = track_close,
-+    .bdrv_co_getlength                = track_co_getlength,
-+    .bdrv_child_perm                  = track_child_perm,
-+    .bdrv_refresh_limits              = track_refresh_limits,
-+
-+    .bdrv_co_pwrite_zeroes            = track_co_pwrite_zeroes,
-+    .bdrv_co_pwritev                  = track_co_pwritev,
-+    .bdrv_co_preadv                   = track_co_preadv,
-+    .bdrv_co_pdiscard                 = track_co_pdiscard,
-+
-+    .bdrv_co_flush                    = track_co_flush,
-+    .bdrv_co_flush_to_disk            = track_co_flush,
-+
-+    .supports_backing                 = true,
-+
-+    .bdrv_co_block_status             = track_co_block_status,
-+    .bdrv_change_backing_file         = track_change_backing_file,
-+};
-+
-+static void bdrv_alloc_track_init(void)
-+{
-+    bdrv_register(&bdrv_alloc_track);
-+}
-+
-+block_init(bdrv_alloc_track_init);
-diff --git a/block/meson.build b/block/meson.build
-index becc99ac4e..0a69836593 100644
---- a/block/meson.build
-+++ b/block/meson.build
-@@ -2,6 +2,7 @@ block_ss.add(genh)
- block_ss.add(files(
-   'accounting.c',
-   'aio_task.c',
-+  'alloc-track.c',
-   'amend.c',
-   'backup.c',
-   'backup-dump.c',
diff --git a/debian/patches/pve/0040-Revert-block-rbd-fix-handling-of-holes-in-.bdrv_co_b.patch b/debian/patches/pve/0040-Revert-block-rbd-fix-handling-of-holes-in-.bdrv_co_b.patch
new file mode 100644 (file)
index 0000000..5ae0bff
--- /dev/null
@@ -0,0 +1,36 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fabian Ebner <f.ebner@proxmox.com>
+Date: Thu, 23 Jun 2022 14:00:07 +0200
+Subject: [PATCH] Revert "block/rbd: fix handling of holes in
+ .bdrv_co_block_status"
+
+This reverts commit 9e302f64bb407a9bb097b626da97228c2654cfee in
+preparation to revert 0347a8fd4c3faaedf119be04c197804be40a384b.
+
+Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/rbd.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/block/rbd.c b/block/rbd.c
+index 367db42dce..347b121626 100644
+--- a/block/rbd.c
++++ b/block/rbd.c
+@@ -1474,11 +1474,11 @@ static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
+     RBDDiffIterateReq *req = opaque;
+     assert(req->offs + req->bytes <= offs);
+-
+-    /* treat a hole like an unallocated area and bail out */
+-    if (!exists) {
+-        return 0;
+-    }
++    /*
++     * we do not diff against a snapshot so we should never receive a callback
++     * for a hole.
++     */
++    assert(exists);
+     if (!req->exists && offs > req->offs) {
+         /*
diff --git a/debian/patches/pve/0040-Revert-block-rbd-workaround-for-ceph-issue-53784.patch b/debian/patches/pve/0040-Revert-block-rbd-workaround-for-ceph-issue-53784.patch
deleted file mode 100644 (file)
index 094f353..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fabian Ebner <f.ebner@proxmox.com>
-Date: Thu, 23 Jun 2022 14:00:05 +0200
-Subject: [PATCH] Revert "block/rbd: workaround for ceph issue #53784"
-
-This reverts commit fc176116cdea816ceb8dd969080b2b95f58edbc0 in
-preparation to revert 0347a8fd4c3faaedf119be04c197804be40a384b.
-
-Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/rbd.c | 42 ++----------------------------------------
- 1 file changed, 2 insertions(+), 40 deletions(-)
-
-diff --git a/block/rbd.c b/block/rbd.c
-index a4749f3b1b..53e0396b51 100644
---- a/block/rbd.c
-+++ b/block/rbd.c
-@@ -1511,7 +1511,6 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
-     int status, r;
-     RBDDiffIterateReq req = { .offs = offset };
-     uint64_t features, flags;
--    uint64_t head = 0;
-     assert(offset + bytes <= s->image_size);
-@@ -1539,43 +1538,7 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
-         return status;
-     }
--#if LIBRBD_VERSION_CODE < LIBRBD_VERSION(1, 17, 0)
--    /*
--     * librbd had a bug until early 2022 that affected all versions of ceph that
--     * supported fast-diff. This bug results in reporting of incorrect offsets
--     * if the offset parameter to rbd_diff_iterate2 is not object aligned.
--     * Work around this bug by rounding down the offset to object boundaries.
--     * This is OK because we call rbd_diff_iterate2 with whole_object = true.
--     * However, this workaround only works for non cloned images with default
--     * striping.
--     *
--     * See: https://tracker.ceph.com/issues/53784
--     */
--
--    /* check if RBD image has non-default striping enabled */
--    if (features & RBD_FEATURE_STRIPINGV2) {
--        return status;
--    }
--
--#pragma GCC diagnostic push
--#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
--    /*
--     * check if RBD image is a clone (= has a parent).
--     *
--     * rbd_get_parent_info is deprecated from Nautilus onwards, but the
--     * replacement rbd_get_parent is not present in Luminous and Mimic.
--     */
--    if (rbd_get_parent_info(s->image, NULL, 0, NULL, 0, NULL, 0) != -ENOENT) {
--        return status;
--    }
--#pragma GCC diagnostic pop
--
--    head = req.offs & (s->object_size - 1);
--    req.offs -= head;
--    bytes += head;
--#endif
--
--    r = rbd_diff_iterate2(s->image, NULL, req.offs, bytes, true, true,
-+    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
-                           qemu_rbd_diff_iterate_cb, &req);
-     if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
-         return status;
-@@ -1594,8 +1557,7 @@ static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
-         status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
-     }
--    assert(req.bytes > head);
--    *pnum = req.bytes - head;
-+    *pnum = req.bytes;
-     return status;
- }
diff --git a/debian/patches/pve/0041-Revert-block-rbd-fix-handling-of-holes-in-.bdrv_co_b.patch b/debian/patches/pve/0041-Revert-block-rbd-fix-handling-of-holes-in-.bdrv_co_b.patch
deleted file mode 100644 (file)
index e4aca73..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fabian Ebner <f.ebner@proxmox.com>
-Date: Thu, 23 Jun 2022 14:00:07 +0200
-Subject: [PATCH] Revert "block/rbd: fix handling of holes in
- .bdrv_co_block_status"
-
-This reverts commit 9e302f64bb407a9bb097b626da97228c2654cfee in
-preparation to revert 0347a8fd4c3faaedf119be04c197804be40a384b.
-
-Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/rbd.c | 10 +++++-----
- 1 file changed, 5 insertions(+), 5 deletions(-)
-
-diff --git a/block/rbd.c b/block/rbd.c
-index 53e0396b51..0913a0af39 100644
---- a/block/rbd.c
-+++ b/block/rbd.c
-@@ -1470,11 +1470,11 @@ static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
-     RBDDiffIterateReq *req = opaque;
-     assert(req->offs + req->bytes <= offs);
--
--    /* treat a hole like an unallocated area and bail out */
--    if (!exists) {
--        return 0;
--    }
-+    /*
-+     * we do not diff against a snapshot so we should never receive a callback
-+     * for a hole.
-+     */
-+    assert(exists);
-     if (!req->exists && offs > req->offs) {
-         /*
diff --git a/debian/patches/pve/0041-Revert-block-rbd-implement-bdrv_co_block_status.patch b/debian/patches/pve/0041-Revert-block-rbd-implement-bdrv_co_block_status.patch
new file mode 100644 (file)
index 0000000..38966fe
--- /dev/null
@@ -0,0 +1,162 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fabian Ebner <f.ebner@proxmox.com>
+Date: Tue, 17 May 2022 09:46:02 +0200
+Subject: [PATCH] Revert "block/rbd: implement bdrv_co_block_status"
+
+During backup, bdrv_co_block_status is called for each block copy
+chunk. When RBD is used, the current implementation with
+rbd_diff_iterate2() using whole_object=true takes about linearly more
+time, depending on the image size. Since there are linearly more
+chunks, the slowdown is quadratic, becoming unacceptable for large
+images (starting somewhere between 500-1000 GiB in my testing).
+
+This reverts commit 0347a8fd4c3faaedf119be04c197804be40a384b as a
+stop-gap measure, until it's clear how to make the implementation
+more efficient.
+
+Upstream bug report:
+https://gitlab.com/qemu-project/qemu/-/issues/1026
+
+Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/rbd.c | 112 ----------------------------------------------------
+ 1 file changed, 112 deletions(-)
+
+diff --git a/block/rbd.c b/block/rbd.c
+index 347b121626..e61b359b97 100644
+--- a/block/rbd.c
++++ b/block/rbd.c
+@@ -108,12 +108,6 @@ typedef struct RBDTask {
+     int64_t ret;
+ } RBDTask;
+-typedef struct RBDDiffIterateReq {
+-    uint64_t offs;
+-    uint64_t bytes;
+-    bool exists;
+-} RBDDiffIterateReq;
+-
+ static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
+                             BlockdevOptionsRbd *opts, bool cache,
+                             const char *keypairs, const char *secretid,
+@@ -1460,111 +1454,6 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
+     return spec_info;
+ }
+-/*
+- * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
+- * value in the callback routine. Choose a value that does not conflict with
+- * an existing exitcode and return it if we want to prematurely stop the
+- * execution because we detected a change in the allocation status.
+- */
+-#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
+-
+-static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
+-                                    int exists, void *opaque)
+-{
+-    RBDDiffIterateReq *req = opaque;
+-
+-    assert(req->offs + req->bytes <= offs);
+-    /*
+-     * we do not diff against a snapshot so we should never receive a callback
+-     * for a hole.
+-     */
+-    assert(exists);
+-
+-    if (!req->exists && offs > req->offs) {
+-        /*
+-         * we started in an unallocated area and hit the first allocated
+-         * block. req->bytes must be set to the length of the unallocated area
+-         * before the allocated area. stop further processing.
+-         */
+-        req->bytes = offs - req->offs;
+-        return QEMU_RBD_EXIT_DIFF_ITERATE2;
+-    }
+-
+-    if (req->exists && offs > req->offs + req->bytes) {
+-        /*
+-         * we started in an allocated area and jumped over an unallocated area,
+-         * req->bytes contains the length of the allocated area before the
+-         * unallocated area. stop further processing.
+-         */
+-        return QEMU_RBD_EXIT_DIFF_ITERATE2;
+-    }
+-
+-    req->bytes += len;
+-    req->exists = true;
+-
+-    return 0;
+-}
+-
+-static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+-                                                 bool want_zero, int64_t offset,
+-                                                 int64_t bytes, int64_t *pnum,
+-                                                 int64_t *map,
+-                                                 BlockDriverState **file)
+-{
+-    BDRVRBDState *s = bs->opaque;
+-    int status, r;
+-    RBDDiffIterateReq req = { .offs = offset };
+-    uint64_t features, flags;
+-
+-    assert(offset + bytes <= s->image_size);
+-
+-    /* default to all sectors allocated */
+-    status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+-    *map = offset;
+-    *file = bs;
+-    *pnum = bytes;
+-
+-    /* check if RBD image supports fast-diff */
+-    r = rbd_get_features(s->image, &features);
+-    if (r < 0) {
+-        return status;
+-    }
+-    if (!(features & RBD_FEATURE_FAST_DIFF)) {
+-        return status;
+-    }
+-
+-    /* check if RBD fast-diff result is valid */
+-    r = rbd_get_flags(s->image, &flags);
+-    if (r < 0) {
+-        return status;
+-    }
+-    if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
+-        return status;
+-    }
+-
+-    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
+-                          qemu_rbd_diff_iterate_cb, &req);
+-    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
+-        return status;
+-    }
+-    assert(req.bytes <= bytes);
+-    if (!req.exists) {
+-        if (r == 0) {
+-            /*
+-             * rbd_diff_iterate2 does not invoke callbacks for unallocated
+-             * areas. This here catches the case where no callback was
+-             * invoked at all (req.bytes == 0).
+-             */
+-            assert(req.bytes == 0);
+-            req.bytes = bytes;
+-        }
+-        status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
+-    }
+-
+-    *pnum = req.bytes;
+-    return status;
+-}
+-
+ static int64_t coroutine_fn qemu_rbd_co_getlength(BlockDriverState *bs)
+ {
+     BDRVRBDState *s = bs->opaque;
+@@ -1800,7 +1689,6 @@ static BlockDriver bdrv_rbd = {
+ #ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
+     .bdrv_co_pwrite_zeroes  = qemu_rbd_co_pwrite_zeroes,
+ #endif
+-    .bdrv_co_block_status   = qemu_rbd_co_block_status,
+     .bdrv_snapshot_create   = qemu_rbd_snap_create,
+     .bdrv_snapshot_delete   = qemu_rbd_snap_remove,
diff --git a/debian/patches/pve/0042-Revert-block-rbd-implement-bdrv_co_block_status.patch b/debian/patches/pve/0042-Revert-block-rbd-implement-bdrv_co_block_status.patch
deleted file mode 100644 (file)
index 9e9a385..0000000
+++ /dev/null
@@ -1,162 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fabian Ebner <f.ebner@proxmox.com>
-Date: Tue, 17 May 2022 09:46:02 +0200
-Subject: [PATCH] Revert "block/rbd: implement bdrv_co_block_status"
-
-During backup, bdrv_co_block_status is called for each block copy
-chunk. When RBD is used, the current implementation with
-rbd_diff_iterate2() using whole_object=true takes about linearly more
-time, depending on the image size. Since there are linearly more
-chunks, the slowdown is quadratic, becoming unacceptable for large
-images (starting somewhere between 500-1000 GiB in my testing).
-
-This reverts commit 0347a8fd4c3faaedf119be04c197804be40a384b as a
-stop-gap measure, until it's clear how to make the implemenation
-more efficient.
-
-Upstream bug report:
-https://gitlab.com/qemu-project/qemu/-/issues/1026
-
-Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/rbd.c | 112 ----------------------------------------------------
- 1 file changed, 112 deletions(-)
-
-diff --git a/block/rbd.c b/block/rbd.c
-index 0913a0af39..1dab254517 100644
---- a/block/rbd.c
-+++ b/block/rbd.c
-@@ -108,12 +108,6 @@ typedef struct RBDTask {
-     int64_t ret;
- } RBDTask;
--typedef struct RBDDiffIterateReq {
--    uint64_t offs;
--    uint64_t bytes;
--    bool exists;
--} RBDDiffIterateReq;
--
- static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
-                             BlockdevOptionsRbd *opts, bool cache,
-                             const char *keypairs, const char *secretid,
-@@ -1456,111 +1450,6 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
-     return spec_info;
- }
--/*
-- * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
-- * value in the callback routine. Choose a value that does not conflict with
-- * an existing exitcode and return it if we want to prematurely stop the
-- * execution because we detected a change in the allocation status.
-- */
--#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
--
--static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
--                                    int exists, void *opaque)
--{
--    RBDDiffIterateReq *req = opaque;
--
--    assert(req->offs + req->bytes <= offs);
--    /*
--     * we do not diff against a snapshot so we should never receive a callback
--     * for a hole.
--     */
--    assert(exists);
--
--    if (!req->exists && offs > req->offs) {
--        /*
--         * we started in an unallocated area and hit the first allocated
--         * block. req->bytes must be set to the length of the unallocated area
--         * before the allocated area. stop further processing.
--         */
--        req->bytes = offs - req->offs;
--        return QEMU_RBD_EXIT_DIFF_ITERATE2;
--    }
--
--    if (req->exists && offs > req->offs + req->bytes) {
--        /*
--         * we started in an allocated area and jumped over an unallocated area,
--         * req->bytes contains the length of the allocated area before the
--         * unallocated area. stop further processing.
--         */
--        return QEMU_RBD_EXIT_DIFF_ITERATE2;
--    }
--
--    req->bytes += len;
--    req->exists = true;
--
--    return 0;
--}
--
--static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
--                                                 bool want_zero, int64_t offset,
--                                                 int64_t bytes, int64_t *pnum,
--                                                 int64_t *map,
--                                                 BlockDriverState **file)
--{
--    BDRVRBDState *s = bs->opaque;
--    int status, r;
--    RBDDiffIterateReq req = { .offs = offset };
--    uint64_t features, flags;
--
--    assert(offset + bytes <= s->image_size);
--
--    /* default to all sectors allocated */
--    status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
--    *map = offset;
--    *file = bs;
--    *pnum = bytes;
--
--    /* check if RBD image supports fast-diff */
--    r = rbd_get_features(s->image, &features);
--    if (r < 0) {
--        return status;
--    }
--    if (!(features & RBD_FEATURE_FAST_DIFF)) {
--        return status;
--    }
--
--    /* check if RBD fast-diff result is valid */
--    r = rbd_get_flags(s->image, &flags);
--    if (r < 0) {
--        return status;
--    }
--    if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
--        return status;
--    }
--
--    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
--                          qemu_rbd_diff_iterate_cb, &req);
--    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
--        return status;
--    }
--    assert(req.bytes <= bytes);
--    if (!req.exists) {
--        if (r == 0) {
--            /*
--             * rbd_diff_iterate2 does not invoke callbacks for unallocated
--             * areas. This here catches the case where no callback was
--             * invoked at all (req.bytes == 0).
--             */
--            assert(req.bytes == 0);
--            req.bytes = bytes;
--        }
--        status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
--    }
--
--    *pnum = req.bytes;
--    return status;
--}
--
- static int64_t coroutine_fn qemu_rbd_co_getlength(BlockDriverState *bs)
- {
-     BDRVRBDState *s = bs->opaque;
-@@ -1796,7 +1685,6 @@ static BlockDriver bdrv_rbd = {
- #ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
-     .bdrv_co_pwrite_zeroes  = qemu_rbd_co_pwrite_zeroes,
- #endif
--    .bdrv_co_block_status   = qemu_rbd_co_block_status,
-     .bdrv_snapshot_create   = qemu_rbd_snap_create,
-     .bdrv_snapshot_delete   = qemu_rbd_snap_remove,
diff --git a/debian/patches/pve/0042-alloc-track-error-out-when-auto-remove-is-not-set.patch b/debian/patches/pve/0042-alloc-track-error-out-when-auto-remove-is-not-set.patch
new file mode 100644 (file)
index 0000000..812026d
--- /dev/null
@@ -0,0 +1,43 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Tue, 26 Mar 2024 14:57:51 +0100
+Subject: [PATCH] alloc-track: error out when auto-remove is not set
+
+Since replacing the node now happens in the stream job, where the
+option cannot be read from (it's internal to the driver), it will
+always be treated as on.
+
+qemu-server will always set it, make sure to have other users notice
+the change (should they even exist). The option can be fully dropped
+in the future while adding a version guard in qemu-server.
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+---
+ block/alloc-track.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/block/alloc-track.c b/block/alloc-track.c
+index b9f8ea9137..f3ed2935c4 100644
+--- a/block/alloc-track.c
++++ b/block/alloc-track.c
+@@ -34,7 +34,6 @@ typedef struct {
+     BdrvDirtyBitmap *bitmap;
+     uint64_t granularity;
+     DropState drop_state;
+-    bool auto_remove;
+ } BDRVAllocTrackState;
+ static QemuOptsList runtime_opts = {
+@@ -86,7 +85,11 @@ static int track_open(BlockDriverState *bs, QDict *options, int flags,
+         goto fail;
+     }
+-    s->auto_remove = qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false);
++    if (!qemu_opt_get_bool(opts, TRACK_OPT_AUTO_REMOVE, false)) {
++        error_setg(errp, "alloc-track: requires auto-remove option to be set to on");
++        ret = -EINVAL;
++        goto fail;
++    }
+     /* open the target (write) node, backing will be attached by block layer */
+     file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
diff --git a/debian/patches/pve/0043-alloc-track-avoid-seemingly-superfluous-child-permis.patch b/debian/patches/pve/0043-alloc-track-avoid-seemingly-superfluous-child-permis.patch
new file mode 100644 (file)
index 0000000..295319c
--- /dev/null
@@ -0,0 +1,84 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Wed, 27 Mar 2024 11:15:39 +0100
+Subject: [PATCH] alloc-track: avoid seemingly superfluous child permission
+ update
+
+Doesn't seem necessary nowadays (maybe after commit "alloc-track: fix
+deadlock during drop" where the dropping is not rescheduled and delayed
+anymore or some upstream change). Should there really be some issue,
+instead of having a drop state, this could also be just based off the
+fact whether there is still a backing child.
+
+Dumping the cumulative (shared) permissions for the BDS with a debug
+print yields the same values after this patch and with QEMU 8.1,
+namely 3 and 5.
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+---
+ block/alloc-track.c | 26 --------------------------
+ 1 file changed, 26 deletions(-)
+
+diff --git a/block/alloc-track.c b/block/alloc-track.c
+index f3ed2935c4..29138dcc49 100644
+--- a/block/alloc-track.c
++++ b/block/alloc-track.c
+@@ -25,15 +25,9 @@
+ #define TRACK_OPT_AUTO_REMOVE "auto-remove"
+-typedef enum DropState {
+-    DropNone,
+-    DropInProgress,
+-} DropState;
+-
+ typedef struct {
+     BdrvDirtyBitmap *bitmap;
+     uint64_t granularity;
+-    DropState drop_state;
+ } BDRVAllocTrackState;
+ static QemuOptsList runtime_opts = {
+@@ -137,8 +131,6 @@ static int track_open(BlockDriverState *bs, QDict *options, int flags,
+         goto fail;
+     }
+-    s->drop_state = DropNone;
+-
+ fail:
+     if (ret < 0) {
+         bdrv_graph_wrlock();
+@@ -289,18 +281,8 @@ track_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
+                  BlockReopenQueue *reopen_queue, uint64_t perm, uint64_t shared,
+                  uint64_t *nperm, uint64_t *nshared)
+ {
+-    BDRVAllocTrackState *s = bs->opaque;
+-
+     *nshared = BLK_PERM_ALL;
+-    /* in case we're currently dropping ourselves, claim to not use any
+-     * permissions at all - which is fine, since from this point on we will
+-     * never issue a read or write anymore */
+-    if (s->drop_state == DropInProgress) {
+-        *nperm = 0;
+-        return;
+-    }
+-
+     if (role & BDRV_CHILD_DATA) {
+         *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
+     } else {
+@@ -326,14 +308,6 @@ track_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
+      * kinda fits better, but in the long-term, a special parameter would be
+      * nice (or done via qemu-server via upcoming blockdev-replace QMP command).
+      */
+-    if (backing_file == NULL) {
+-        BDRVAllocTrackState *s = bs->opaque;
+-        bdrv_drained_begin(bs);
+-        s->drop_state = DropInProgress;
+-        bdrv_child_refresh_perms(bs, bs->file, &error_abort);
+-        bdrv_drained_end(bs);
+-    }
+-
+     return 0;
+ }
diff --git a/debian/patches/pve/0043-alloc-track-fix-deadlock-during-drop.patch b/debian/patches/pve/0043-alloc-track-fix-deadlock-during-drop.patch
deleted file mode 100644 (file)
index 153b8ef..0000000
+++ /dev/null
@@ -1,154 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Thu, 6 Apr 2023 14:59:31 +0200
-Subject: [PATCH] alloc-track: fix deadlock during drop
-
-by replacing the block node directly after changing the backing file
-instead of rescheduling it.
-
-With changes in QEMU 8.0, calling bdrv_get_info (and bdrv_unref)
-during drop can lead to a deadlock when using iothread (only triggered
-with multiple disks, except during debugging where it also triggered
-with one disk sometimes):
-1. job_unref_locked acquires the AioContext and calls job->driver->free
-2. track_drop gets scheduled
-3. bdrv_graph_wrlock is called and polls which leads to track_drop being
-   called
-4. track_drop acquires the AioContext recursively
-5. bdrv_get_info is a wrapped coroutine (since 8.0) and thus polls for
-   bdrv_co_get_info. This releases the AioContext, but only once! The
-   documentation for the AIO_WAIT_WHILE macro states that the
-   AioContext lock needs to be acquired exactly once, but there does
-   not seem to be a way for track_drop to know if it acquired the lock
-   recursively or not (without adding further hacks).
-6. Because the AioContext is still held by the main thread once, it can't
-   be acquired before entering bdrv_co_get_info in co_schedule_bh_cb
-   which happens in the iothread
-
-When doing the operation in change_backing_file, the AioContext has
-already been acquired by the caller, so the issue with the recursive
-lock goes away.
-
-The comment explaining why delaying the replace is necessary is
-> we need to schedule this for later however, since when this function
-> is called, the blockjob modifying us is probably not done yet and
-> has a blocker on 'bs'
-
-However, there is no check for blockers in bdrv_replace_node. It would
-need to be done by us, the caller, with check_to_replace_node.
-Furthermore, the mirror job also does its call to bdrv_replace_node
-while there is an active blocker (inserted by mirror itself) and they
-use a specialized version to check for blockers instead of
-check_to_replace_node there. Alloc-track could also do something
-similar to check for other blockers, but it should be fine to rely on
-Proxmox VE that no other operation with the blockdev is going on.
-
-Mirror also drains the target before replacing the node, but the
-target can have other users. In case of alloc-track the file child
-should not be accessible by anybody else and so there can't be an
-in-flight operation for the file child when alloc-track is drained.
-
-The rescheduling based on refcounting is a hack and it doesn't seem to
-be necessary anymore. It's not clear what the original issue from the
-comment was. Testing with older builds with track_drop done directly
-without rescheduling also didn't lead to any noticable issue for me.
-
-One issue it might have been is the one fixed by b1e1af394d
-("block/stream: Drain subtree around graph change"), where
-block-stream had a use-after-free if the base node changed at an
-inconvenient time (which alloc-track's auto-drop does).
-
-It's also not possible to just not auto-replace the alloc-track. Not
-replacing it at all leads to other operations like block resize
-hanging, and there is no good way to replace it manually via QMP
-(there is x-blockdev-change, but it is experimental and doesn't
-implement the required operation yet). Also, it's just cleaner in
-general to not leave unnecessary block nodes lying around.
-
-Suggested-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/alloc-track.c | 54 ++++++++++++++-------------------------------
- 1 file changed, 16 insertions(+), 38 deletions(-)
-
-diff --git a/block/alloc-track.c b/block/alloc-track.c
-index b75d7c6460..76da140a68 100644
---- a/block/alloc-track.c
-+++ b/block/alloc-track.c
-@@ -25,7 +25,6 @@
- typedef enum DropState {
-     DropNone,
--    DropRequested,
-     DropInProgress,
- } DropState;
-@@ -268,37 +267,6 @@ static void track_child_perm(BlockDriverState *bs, BdrvChild *c,
-     }
- }
--static void track_drop(void *opaque)
--{
--    BlockDriverState *bs = (BlockDriverState*)opaque;
--    BlockDriverState *file = bs->file->bs;
--    BDRVAllocTrackState *s = bs->opaque;
--
--    assert(file);
--
--    /* we rely on the fact that we're not used anywhere else, so let's wait
--     * until we're only used once - in the drive connected to the guest (and one
--     * ref is held by bdrv_ref in track_change_backing_file) */
--    if (bs->refcnt > 2) {
--        aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, opaque);
--        return;
--    }
--    AioContext *aio_context = bdrv_get_aio_context(bs);
--    aio_context_acquire(aio_context);
--
--    bdrv_drained_begin(bs);
--
--    /* now that we're drained, we can safely set 'DropInProgress' */
--    s->drop_state = DropInProgress;
--    bdrv_child_refresh_perms(bs, bs->file, &error_abort);
--
--    bdrv_replace_node(bs, file, &error_abort);
--    bdrv_set_backing_hd(bs, NULL, &error_abort);
--    bdrv_drained_end(bs);
--    bdrv_unref(bs);
--    aio_context_release(aio_context);
--}
--
- static int track_change_backing_file(BlockDriverState *bs,
-                                      const char *backing_file,
-                                      const char *backing_fmt)
-@@ -308,13 +276,23 @@ static int track_change_backing_file(BlockDriverState *bs,
-         backing_file == NULL && backing_fmt == NULL)
-     {
-         /* backing file has been disconnected, there's no longer any use for
--         * this node, so let's remove ourselves from the block graph - we need
--         * to schedule this for later however, since when this function is
--         * called, the blockjob modifying us is probably not done yet and has a
--         * blocker on 'bs' */
--        s->drop_state = DropRequested;
-+         * this node, so let's remove ourselves from the block graph */
-+        BlockDriverState *file = bs->file->bs;
-+
-+        /* Just to be sure, because bdrv_replace_node unrefs it */
-         bdrv_ref(bs);
--        aio_bh_schedule_oneshot(qemu_get_aio_context(), track_drop, (void*)bs);
-+        bdrv_drained_begin(bs);
-+
-+        /* now that we're drained, we can safely set 'DropInProgress' */
-+        s->drop_state = DropInProgress;
-+
-+        bdrv_child_refresh_perms(bs, bs->file, &error_abort);
-+
-+        bdrv_replace_node(bs, file, &error_abort);
-+        bdrv_set_backing_hd(bs, NULL, &error_abort);
-+
-+        bdrv_drained_end(bs);
-+        bdrv_unref(bs);
-     }
-     return 0;
diff --git a/debian/patches/pve/0044-block-copy-before-write-fix-permission.patch b/debian/patches/pve/0044-block-copy-before-write-fix-permission.patch
new file mode 100644 (file)
index 0000000..6a759a4
--- /dev/null
@@ -0,0 +1,55 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Date: Thu, 11 Apr 2024 11:29:22 +0200
+Subject: [PATCH] block/copy-before-write: fix permission
+
+In case the source node does not have any parents, the condition still
+works as required: the backup job does create the parent by
+
+  block_job_create -> block_job_add_bdrv -> bdrv_root_attach_child
+
+Still, in this case checking the @perm variable doesn't work, as the
+backup job creates the root blk with empty permissions (as it relies on
+the CBW filter to require correct permissions and doesn't want to create
+extra conflicts).
+
+So, we should not check @perm.
+
+The hack may be dropped entirely when transactional insertion of
+filter (when we don't try to recalculate permissions in intermediate
+state, when filter does conflict with original parent of the source
+node) is merged (old big series
+"[PATCH v5 00/45] Transactional block-graph modifying API"[1] and its
+current in-flight part is "[PATCH v8 0/7] blockdev-replace"[2])
+
+[1] https://patchew.org/QEMU/20220330212902.590099-1-vsementsov@openvz.org/
+[2] https://patchew.org/QEMU/20231017184444.932733-1-vsementsov@yandex-team.ru/
+
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/copy-before-write.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index 026fa9840f..5a9456d426 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -364,9 +364,13 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
+                            perm, shared, nperm, nshared);
+         if (!QLIST_EMPTY(&bs->parents)) {
+-            if (perm & BLK_PERM_WRITE) {
+-                *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
+-            }
++            /*
++             * Note, that source child may be shared with backup job. Backup job
++             * does create own blk parent on copy-before-write node, so this
++             * works even if source node does not have any parents before backup
++             * start
++             */
++            *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
+             *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
+         }
+     }
diff --git a/debian/patches/pve/0044-migration-for-snapshots-hold-the-BQL-during-setup-ca.patch b/debian/patches/pve/0044-migration-for-snapshots-hold-the-BQL-during-setup-ca.patch
deleted file mode 100644 (file)
index 635f64a..0000000
+++ /dev/null
@@ -1,191 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Fri, 5 May 2023 13:39:53 +0200
-Subject: [PATCH] migration: for snapshots, hold the BQL during setup callbacks
-
-In spirit, this is a partial revert of commit 9b09503752 ("migration:
-run setup callbacks out of big lock"), but only for the snapshot case.
-
-For snapshots, the bdrv_writev_vmstate() function is used during setup
-(in QIOChannelBlock backing the QEMUFile), but not holding the BQL
-while calling it could lead to an assertion failure. To understand
-how, first note the following:
-
-1. Generated coroutine wrappers for block layer functions spawn the
-coroutine and use AIO_WAIT_WHILE()/aio_poll() to wait for it.
-2. If the host OS switches threads at an inconvenient time, it can
-happen that a bottom half scheduled for the main thread's AioContext
-is executed as part of a vCPU thread's aio_poll().
-
-An example leading to the assertion failure is as follows:
-
-main thread:
-1. A snapshot-save QMP command gets issued.
-2. snapshot_save_job_bh() is scheduled.
-
-vCPU thread:
-3. aio_poll() for the main thread's AioContext is called (e.g. when
-the guest writes to a pflash device, as part of blk_pwrite which is a
-generated coroutine wrapper).
-4. snapshot_save_job_bh() is executed as part of aio_poll().
-3. qemu_savevm_state() is called.
-4. qemu_mutex_unlock_iothread() is called. Now
-qemu_get_current_aio_context() returns 0x0.
-5. bdrv_writev_vmstate() is executed during the usual savevm setup.
-But this function is a generated coroutine wrapper, so it uses
-AIO_WAIT_WHILE. There, the assertion
-assert(qemu_get_current_aio_context() == qemu_get_aio_context());
-will fail.
-
-To fix it, ensure that the BQL is held during setup. To avoid changing
-the behavior for migration too, introduce conditionals for the setup
-callbacks that need the BQL and only take the lock if it's not already
-held.
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- include/migration/register.h   |  2 +-
- migration/block-dirty-bitmap.c | 15 ++++++++++++---
- migration/block.c              | 15 ++++++++++++---
- migration/ram.c                | 16 +++++++++++++---
- migration/savevm.c             |  2 --
- 5 files changed, 38 insertions(+), 12 deletions(-)
-
-diff --git a/include/migration/register.h b/include/migration/register.h
-index 90914f32f5..c728fd9120 100644
---- a/include/migration/register.h
-+++ b/include/migration/register.h
-@@ -43,9 +43,9 @@ typedef struct SaveVMHandlers {
-      * by other locks.
-      */
-     int (*save_live_iterate)(QEMUFile *f, void *opaque);
-+    int (*save_setup)(QEMUFile *f, void *opaque);
-     /* This runs outside the iothread lock!  */
--    int (*save_setup)(QEMUFile *f, void *opaque);
-     /* Note for save_live_pending:
-      * must_precopy:
-      * - must be migrated in precopy or in stopped state
-diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
-index 285dd1d148..f7ee5a74d9 100644
---- a/migration/block-dirty-bitmap.c
-+++ b/migration/block-dirty-bitmap.c
-@@ -1219,10 +1219,17 @@ static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
- {
-     DBMSaveState *s = &((DBMState *)opaque)->save;
-     SaveBitmapState *dbms = NULL;
-+    bool release_lock = false;
--    qemu_mutex_lock_iothread();
-+    /* For snapshots, the BQL is held during setup. */
-+    if (!qemu_mutex_iothread_locked()) {
-+        qemu_mutex_lock_iothread();
-+        release_lock = true;
-+    }
-     if (init_dirty_bitmap_migration(s) < 0) {
--        qemu_mutex_unlock_iothread();
-+        if (release_lock) {
-+            qemu_mutex_unlock_iothread();
-+        }
-         return -1;
-     }
-@@ -1230,7 +1237,9 @@ static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
-         send_bitmap_start(f, s, dbms);
-     }
-     qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
--    qemu_mutex_unlock_iothread();
-+    if (release_lock) {
-+        qemu_mutex_unlock_iothread();
-+    }
-     return 0;
- }
-diff --git a/migration/block.c b/migration/block.c
-index 86c2256a2b..8423e0c9f9 100644
---- a/migration/block.c
-+++ b/migration/block.c
-@@ -725,21 +725,30 @@ static void block_migration_cleanup(void *opaque)
- static int block_save_setup(QEMUFile *f, void *opaque)
- {
-     int ret;
-+    bool release_lock = false;
-     trace_migration_block_save("setup", block_mig_state.submitted,
-                                block_mig_state.transferred);
--    qemu_mutex_lock_iothread();
-+    /* For snapshots, the BQL is held during setup. */
-+    if (!qemu_mutex_iothread_locked()) {
-+        qemu_mutex_lock_iothread();
-+        release_lock = true;
-+    }
-     ret = init_blk_migration(f);
-     if (ret < 0) {
--        qemu_mutex_unlock_iothread();
-+        if (release_lock) {
-+            qemu_mutex_unlock_iothread();
-+        }
-         return ret;
-     }
-     /* start track dirty blocks */
-     ret = set_dirty_tracking();
--    qemu_mutex_unlock_iothread();
-+    if (release_lock) {
-+        qemu_mutex_unlock_iothread();
-+    }
-     if (ret) {
-         return ret;
-diff --git a/migration/ram.c b/migration/ram.c
-index 6e1514f69f..6a1aec7031 100644
---- a/migration/ram.c
-+++ b/migration/ram.c
-@@ -2896,8 +2896,16 @@ static void migration_bitmap_clear_discarded_pages(RAMState *rs)
- static void ram_init_bitmaps(RAMState *rs)
- {
--    /* For memory_global_dirty_log_start below.  */
--    qemu_mutex_lock_iothread();
-+    bool release_lock = false;
-+
-+    /*
-+     * For memory_global_dirty_log_start below.
-+     * For snapshots, the BQL is held during setup.
-+     */
-+    if (!qemu_mutex_iothread_locked()) {
-+        qemu_mutex_lock_iothread();
-+        release_lock = true;
-+    }
-     qemu_mutex_lock_ramlist();
-     WITH_RCU_READ_LOCK_GUARD() {
-@@ -2909,7 +2917,9 @@ static void ram_init_bitmaps(RAMState *rs)
-         }
-     }
-     qemu_mutex_unlock_ramlist();
--    qemu_mutex_unlock_iothread();
-+    if (release_lock) {
-+        qemu_mutex_unlock_iothread();
-+    }
-     /*
-      * After an eventual first bitmap sync, fixup the initial bitmap
-diff --git a/migration/savevm.c b/migration/savevm.c
-index d60c4f487a..3c015722f7 100644
---- a/migration/savevm.c
-+++ b/migration/savevm.c
-@@ -1625,10 +1625,8 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
-     reset_vfio_bytes_transferred();
-     ms->to_dst_file = f;
--    qemu_mutex_unlock_iothread();
-     qemu_savevm_state_header(f);
-     qemu_savevm_state_setup(f);
--    qemu_mutex_lock_iothread();
-     while (qemu_file_get_error(f) == 0) {
-         if (qemu_savevm_state_iterate(f, false) > 0) {
diff --git a/debian/patches/pve/0045-block-copy-before-write-support-unligned-snapshot-di.patch b/debian/patches/pve/0045-block-copy-before-write-support-unligned-snapshot-di.patch
new file mode 100644 (file)
index 0000000..f651c58
--- /dev/null
@@ -0,0 +1,48 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Date: Thu, 11 Apr 2024 11:29:23 +0200
+Subject: [PATCH] block/copy-before-write: support unligned snapshot-discard
+
+The first thing that crashes on unaligned access here is
+bdrv_reset_dirty_bitmap(). The correct way is to shrink the
+snapshot-discard request to its cluster-aligned interior.
+
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/copy-before-write.c | 16 +++++++++++++---
+ 1 file changed, 13 insertions(+), 3 deletions(-)
+
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index 5a9456d426..c0e70669a2 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -325,14 +325,24 @@ static int coroutine_fn GRAPH_RDLOCK
+ cbw_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes)
+ {
+     BDRVCopyBeforeWriteState *s = bs->opaque;
++    uint32_t cluster_size = block_copy_cluster_size(s->bcs);
++    int64_t aligned_offset = QEMU_ALIGN_UP(offset, cluster_size);
++    int64_t aligned_end = QEMU_ALIGN_DOWN(offset + bytes, cluster_size);
++    int64_t aligned_bytes;
++
++    if (aligned_end <= aligned_offset) {
++        return 0;
++    }
++    aligned_bytes = aligned_end - aligned_offset;
+     WITH_QEMU_LOCK_GUARD(&s->lock) {
+-        bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes);
++        bdrv_reset_dirty_bitmap(s->access_bitmap, aligned_offset,
++                                aligned_bytes);
+     }
+-    block_copy_reset(s->bcs, offset, bytes);
++    block_copy_reset(s->bcs, aligned_offset, aligned_bytes);
+-    return bdrv_co_pdiscard(s->target, offset, bytes);
++    return bdrv_co_pdiscard(s->target, aligned_offset, aligned_bytes);
+ }
+ static void GRAPH_RDLOCK cbw_refresh_filename(BlockDriverState *bs)
diff --git a/debian/patches/pve/0045-savevm-async-don-t-hold-BQL-during-setup.patch b/debian/patches/pve/0045-savevm-async-don-t-hold-BQL-during-setup.patch
deleted file mode 100644 (file)
index ac00f1a..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Fri, 5 May 2023 15:30:16 +0200
-Subject: [PATCH] savevm-async: don't hold BQL during setup
-
-See commit "migration: for snapshots, hold the BQL during setup
-callbacks" for why. This is separate, because a version of that one
-will hopefully land upstream.
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- migration/savevm-async.c | 2 --
- 1 file changed, 2 deletions(-)
-
-diff --git a/migration/savevm-async.c b/migration/savevm-async.c
-index 80624fada8..b1d85a4b41 100644
---- a/migration/savevm-async.c
-+++ b/migration/savevm-async.c
-@@ -401,10 +401,8 @@ void qmp_savevm_start(const char *statefile, Error **errp)
-     snap_state.state = SAVE_STATE_ACTIVE;
-     snap_state.finalize_bh = qemu_bh_new(process_savevm_finalize, &snap_state);
-     snap_state.co = qemu_coroutine_create(&process_savevm_co, NULL);
--    qemu_mutex_unlock_iothread();
-     qemu_savevm_state_header(snap_state.file);
-     qemu_savevm_state_setup(snap_state.file);
--    qemu_mutex_lock_iothread();
-     /* Async processing from here on out happens in iohandler context, so let
-      * the target bdrv have its home there.
diff --git a/debian/patches/pve/0046-block-copy-before-write-create-block_copy-bitmap-in-.patch b/debian/patches/pve/0046-block-copy-before-write-create-block_copy-bitmap-in-.patch
new file mode 100644 (file)
index 0000000..7cd24d0
--- /dev/null
@@ -0,0 +1,373 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Date: Thu, 11 Apr 2024 11:29:24 +0200
+Subject: [PATCH] block/copy-before-write: create block_copy bitmap in filter
+ node
+
+Currently block_copy creates copy_bitmap in the source node. But that
+conflicts with .independent_close=true of the copy-before-write filter:
+the source node may be detached and removed before the .bdrv_close()
+handler is called, which should call block_copy_state_free(), which in
+turn should remove copy_bitmap.
+
+That's all not ideal: it would be better if internal bitmap of
+block-copy object is not attached to any node. But that is not possible
+now.
+
+The simplest solution is to just create copy_bitmap in the filter node,
+where two other bitmaps are created anyway.
+
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/block-copy.c         |   3 +-
+ block/copy-before-write.c  |   2 +-
+ include/block/block-copy.h |   1 +
+ tests/qemu-iotests/257.out | 112 ++++++++++++++++++-------------------
+ 4 files changed, 60 insertions(+), 58 deletions(-)
+
+diff --git a/block/block-copy.c b/block/block-copy.c
+index 9ee3dd7ef5..8fca2c3698 100644
+--- a/block/block-copy.c
++++ b/block/block-copy.c
+@@ -351,6 +351,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
+ }
+ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
++                                     BlockDriverState *copy_bitmap_bs,
+                                      const BdrvDirtyBitmap *bitmap,
+                                      Error **errp)
+ {
+@@ -367,7 +368,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+         return NULL;
+     }
+-    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
++    copy_bitmap = bdrv_create_dirty_bitmap(copy_bitmap_bs, cluster_size, NULL,
+                                            errp);
+     if (!copy_bitmap) {
+         return NULL;
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index c0e70669a2..94db31512d 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -468,7 +468,7 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
+             ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
+              bs->file->bs->supported_zero_flags);
+-    s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp);
++    s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap, errp);
+     if (!s->bcs) {
+         error_prepend(errp, "Cannot create block-copy-state: ");
+         return -EINVAL;
+diff --git a/include/block/block-copy.h b/include/block/block-copy.h
+index 0700953ab8..8b41643bfa 100644
+--- a/include/block/block-copy.h
++++ b/include/block/block-copy.h
+@@ -25,6 +25,7 @@ typedef struct BlockCopyState BlockCopyState;
+ typedef struct BlockCopyCallState BlockCopyCallState;
+ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
++                                     BlockDriverState *copy_bitmap_bs,
+                                      const BdrvDirtyBitmap *bitmap,
+                                      Error **errp);
+diff --git a/tests/qemu-iotests/257.out b/tests/qemu-iotests/257.out
+index aa76131ca9..c33dd7f3a9 100644
+--- a/tests/qemu-iotests/257.out
++++ b/tests/qemu-iotests/257.out
+@@ -120,16 +120,16 @@ write -P0x67 0x3fe0000 0x20000
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      }
+-    ],
+-    "drive0": [
++      },
+       {
+         "busy": false,
+         "count": 0,
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      },
++      }
++    ],
++    "drive0": [
+       {
+         "busy": false,
+         "count": 458752,
+@@ -596,16 +596,16 @@ write -P0x67 0x3fe0000 0x20000
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      }
+-    ],
+-    "drive0": [
++      },
+       {
+         "busy": false,
+         "count": 0,
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      },
++      }
++    ],
++    "drive0": [
+       {
+         "busy": false,
+         "count": 458752,
+@@ -865,16 +865,16 @@ write -P0x67 0x3fe0000 0x20000
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      }
+-    ],
+-    "drive0": [
++      },
+       {
+         "busy": false,
+         "count": 0,
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      },
++      }
++    ],
++    "drive0": [
+       {
+         "busy": false,
+         "count": 458752,
+@@ -1341,16 +1341,16 @@ write -P0x67 0x3fe0000 0x20000
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      }
+-    ],
+-    "drive0": [
++      },
+       {
+         "busy": false,
+         "count": 0,
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      },
++      }
++    ],
++    "drive0": [
+       {
+         "busy": false,
+         "count": 458752,
+@@ -1610,16 +1610,16 @@ write -P0x67 0x3fe0000 0x20000
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      }
+-    ],
+-    "drive0": [
++      },
+       {
+         "busy": false,
+         "count": 0,
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      },
++      }
++    ],
++    "drive0": [
+       {
+         "busy": false,
+         "count": 458752,
+@@ -2086,16 +2086,16 @@ write -P0x67 0x3fe0000 0x20000
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      }
+-    ],
+-    "drive0": [
++      },
+       {
+         "busy": false,
+         "count": 0,
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      },
++      }
++    ],
++    "drive0": [
+       {
+         "busy": false,
+         "count": 458752,
+@@ -2355,16 +2355,16 @@ write -P0x67 0x3fe0000 0x20000
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      }
+-    ],
+-    "drive0": [
++      },
+       {
+         "busy": false,
+         "count": 0,
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      },
++      }
++    ],
++    "drive0": [
+       {
+         "busy": false,
+         "count": 458752,
+@@ -2831,16 +2831,16 @@ write -P0x67 0x3fe0000 0x20000
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      }
+-    ],
+-    "drive0": [
++      },
+       {
+         "busy": false,
+         "count": 0,
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      },
++      }
++    ],
++    "drive0": [
+       {
+         "busy": false,
+         "count": 458752,
+@@ -3100,16 +3100,16 @@ write -P0x67 0x3fe0000 0x20000
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      }
+-    ],
+-    "drive0": [
++      },
+       {
+         "busy": false,
+         "count": 0,
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      },
++      }
++    ],
++    "drive0": [
+       {
+         "busy": false,
+         "count": 458752,
+@@ -3576,16 +3576,16 @@ write -P0x67 0x3fe0000 0x20000
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      }
+-    ],
+-    "drive0": [
++      },
+       {
+         "busy": false,
+         "count": 0,
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      },
++      }
++    ],
++    "drive0": [
+       {
+         "busy": false,
+         "count": 458752,
+@@ -3845,16 +3845,16 @@ write -P0x67 0x3fe0000 0x20000
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      }
+-    ],
+-    "drive0": [
++      },
+       {
+         "busy": false,
+         "count": 0,
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      },
++      }
++    ],
++    "drive0": [
+       {
+         "busy": false,
+         "count": 458752,
+@@ -4321,16 +4321,16 @@ write -P0x67 0x3fe0000 0x20000
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      }
+-    ],
+-    "drive0": [
++      },
+       {
+         "busy": false,
+         "count": 0,
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      },
++      }
++    ],
++    "drive0": [
+       {
+         "busy": false,
+         "count": 458752,
+@@ -4590,16 +4590,16 @@ write -P0x67 0x3fe0000 0x20000
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      }
+-    ],
+-    "drive0": [
++      },
+       {
+         "busy": false,
+         "count": 0,
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      },
++      }
++    ],
++    "drive0": [
+       {
+         "busy": false,
+         "count": 458752,
+@@ -5066,16 +5066,16 @@ write -P0x67 0x3fe0000 0x20000
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      }
+-    ],
+-    "drive0": [
++      },
+       {
+         "busy": false,
+         "count": 0,
+         "granularity": 65536,
+         "persistent": false,
+         "recording": false
+-      },
++      }
++    ],
++    "drive0": [
+       {
+         "busy": false,
+         "count": 458752,
diff --git a/debian/patches/pve/0046-block-copy-before-write-fix-permission.patch b/debian/patches/pve/0046-block-copy-before-write-fix-permission.patch
deleted file mode 100644 (file)
index 1c2e5bd..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Date: Thu, 11 Apr 2024 11:29:22 +0200
-Subject: [PATCH] block/copy-before-write: fix permission
-
-In case when source node does not have any parents, the condition still
-works as required: backup job do create the parent by
-
-  block_job_create -> block_job_add_bdrv -> bdrv_root_attach_child
-
-Still, in this case checking @perm variable doesn't work, as backup job
-creates the root blk with empty permissions (as it rely on CBW filter
-to require correct permissions and don't want to create extra
-conflicts).
-
-So, we should not check @perm.
-
-The hack may be dropped entirely when transactional insertion of
-filter (when we don't try to recalculate permissions in intermediate
-state, when filter does conflict with original parent of the source
-node) merged (old big series
-"[PATCH v5 00/45] Transactional block-graph modifying API"[1] and it's
-current in-flight part is "[PATCH v8 0/7] blockdev-replace"[2])
-
-[1] https://patchew.org/QEMU/20220330212902.590099-1-vsementsov@openvz.org/
-[2] https://patchew.org/QEMU/20231017184444.932733-1-vsementsov@yandex-team.ru/
-
-Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/copy-before-write.c | 10 +++++++---
- 1 file changed, 7 insertions(+), 3 deletions(-)
-
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index b866e42271..a2dddf6f57 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -364,9 +364,13 @@ static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c,
-                            perm, shared, nperm, nshared);
-         if (!QLIST_EMPTY(&bs->parents)) {
--            if (perm & BLK_PERM_WRITE) {
--                *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
--            }
-+            /*
-+             * Note, that source child may be shared with backup job. Backup job
-+             * does create own blk parent on copy-before-write node, so this
-+             * works even if source node does not have any parents before backup
-+             * start
-+             */
-+            *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
-             *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
-         }
-     }
diff --git a/debian/patches/pve/0047-block-copy-before-write-support-unligned-snapshot-di.patch b/debian/patches/pve/0047-block-copy-before-write-support-unligned-snapshot-di.patch
deleted file mode 100644 (file)
index 4656750..0000000
+++ /dev/null
@@ -1,48 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Date: Thu, 11 Apr 2024 11:29:23 +0200
-Subject: [PATCH] block/copy-before-write: support unligned snapshot-discard
-
-First thing that crashes on unligned access here is
-bdrv_reset_dirty_bitmap(). Correct way is to align-down the
-snapshot-discard request.
-
-Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/copy-before-write.c | 16 +++++++++++++---
- 1 file changed, 13 insertions(+), 3 deletions(-)
-
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index a2dddf6f57..0a219c2b75 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -325,14 +325,24 @@ static int coroutine_fn GRAPH_RDLOCK
- cbw_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes)
- {
-     BDRVCopyBeforeWriteState *s = bs->opaque;
-+    uint32_t cluster_size = block_copy_cluster_size(s->bcs);
-+    int64_t aligned_offset = QEMU_ALIGN_UP(offset, cluster_size);
-+    int64_t aligned_end = QEMU_ALIGN_DOWN(offset + bytes, cluster_size);
-+    int64_t aligned_bytes;
-+
-+    if (aligned_end <= aligned_offset) {
-+        return 0;
-+    }
-+    aligned_bytes = aligned_end - aligned_offset;
-     WITH_QEMU_LOCK_GUARD(&s->lock) {
--        bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes);
-+        bdrv_reset_dirty_bitmap(s->access_bitmap, aligned_offset,
-+                                aligned_bytes);
-     }
--    block_copy_reset(s->bcs, offset, bytes);
-+    block_copy_reset(s->bcs, aligned_offset, aligned_bytes);
--    return bdrv_co_pdiscard(s->target, offset, bytes);
-+    return bdrv_co_pdiscard(s->target, aligned_offset, aligned_bytes);
- }
- static void cbw_refresh_filename(BlockDriverState *bs)
diff --git a/debian/patches/pve/0047-qapi-blockdev-backup-add-discard-source-parameter.patch b/debian/patches/pve/0047-qapi-blockdev-backup-add-discard-source-parameter.patch
new file mode 100644 (file)
index 0000000..ef44f42
--- /dev/null
@@ -0,0 +1,277 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Date: Thu, 11 Apr 2024 11:29:25 +0200
+Subject: [PATCH] qapi: blockdev-backup: add discard-source parameter
+
+Add a parameter that enables discard-after-copy. That is mostly useful
+in "push backup with fleecing" scheme, when source is snapshot-access
+format driver node, based on copy-before-write filter snapshot-access
+API:
+
+[guest]      [snapshot-access] ~~ blockdev-backup ~~> [backup target]
+   |            |
+   | root       | file
+   v            v
+[copy-before-write]
+   |             |
+   | file        | target
+   v             v
+[active disk]   [temp.img]
+
+In this case discard-after-copy does two things:
+
+ - discard data in temp.img to save disk space
+ - avoid further copy-before-write operation in discarded area
+
+Note that we have to declare WRITE permission on source in
+copy-before-write filter, for discard to work. Still we can't take it
+unconditionally, as it will break normal backup from RO source. So, we
+have to add a parameter and pass it through bdrv_open flags.
+
+Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/backup.c                         |  5 +++--
+ block/block-copy.c                     |  9 +++++++++
+ block/copy-before-write.c              | 15 +++++++++++++--
+ block/copy-before-write.h              |  1 +
+ block/replication.c                    |  4 ++--
+ blockdev.c                             |  2 +-
+ include/block/block-common.h           |  2 ++
+ include/block/block-copy.h             |  1 +
+ include/block/block_int-global-state.h |  2 +-
+ qapi/block-core.json                   |  4 ++++
+ 10 files changed, 37 insertions(+), 8 deletions(-)
+
+diff --git a/block/backup.c b/block/backup.c
+index 16d611c4ca..1963e47ab9 100644
+--- a/block/backup.c
++++ b/block/backup.c
+@@ -332,7 +332,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+                   BlockDriverState *target, int64_t speed,
+                   MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
+                   BitmapSyncMode bitmap_mode,
+-                  bool compress,
++                  bool compress, bool discard_source,
+                   const char *filter_node_name,
+                   BackupPerf *perf,
+                   BlockdevOnError on_source_error,
+@@ -433,7 +433,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+         goto error;
+     }
+-    cbw = bdrv_cbw_append(bs, target, filter_node_name, &bcs, errp);
++    cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source,
++                          &bcs, errp);
+     if (!cbw) {
+         goto error;
+     }
+diff --git a/block/block-copy.c b/block/block-copy.c
+index 8fca2c3698..7e3b378528 100644
+--- a/block/block-copy.c
++++ b/block/block-copy.c
+@@ -137,6 +137,7 @@ typedef struct BlockCopyState {
+     CoMutex lock;
+     int64_t in_flight_bytes;
+     BlockCopyMethod method;
++    bool discard_source;
+     BlockReqList reqs;
+     QLIST_HEAD(, BlockCopyCallState) calls;
+     /*
+@@ -353,6 +354,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
+ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+                                      BlockDriverState *copy_bitmap_bs,
+                                      const BdrvDirtyBitmap *bitmap,
++                                     bool discard_source,
+                                      Error **errp)
+ {
+     ERRP_GUARD();
+@@ -418,6 +420,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+                                     cluster_size),
+     };
++    s->discard_source = discard_source;
+     block_copy_set_copy_opts(s, false, false);
+     ratelimit_init(&s->rate_limit);
+@@ -589,6 +592,12 @@ static coroutine_fn int block_copy_task_entry(AioTask *task)
+     co_put_to_shres(s->mem, t->req.bytes);
+     block_copy_task_end(t, ret);
++    if (s->discard_source && ret == 0) {
++        int64_t nbytes =
++            MIN(t->req.offset + t->req.bytes, s->len) - t->req.offset;
++        bdrv_co_pdiscard(s->source, t->req.offset, nbytes);
++    }
++
+     return ret;
+ }
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index 94db31512d..853e01a1eb 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -44,6 +44,7 @@ typedef struct BDRVCopyBeforeWriteState {
+     BdrvChild *target;
+     OnCbwError on_cbw_error;
+     uint64_t cbw_timeout_ns;
++    bool discard_source;
+     /*
+      * @lock: protects access to @access_bitmap, @done_bitmap and
+@@ -357,6 +358,8 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
+                uint64_t perm, uint64_t shared,
+                uint64_t *nperm, uint64_t *nshared)
+ {
++    BDRVCopyBeforeWriteState *s = bs->opaque;
++
+     if (!(role & BDRV_CHILD_FILTERED)) {
+         /*
+          * Target child
+@@ -381,6 +384,10 @@ cbw_child_perm(BlockDriverState *bs, BdrvChild *c, BdrvChildRole role,
+              * start
+              */
+             *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
++            if (s->discard_source) {
++                *nperm = *nperm | BLK_PERM_WRITE;
++            }
++
+             *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
+         }
+     }
+@@ -468,7 +475,9 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
+             ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
+              bs->file->bs->supported_zero_flags);
+-    s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap, errp);
++    s->discard_source = flags & BDRV_O_CBW_DISCARD_SOURCE;
++    s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap,
++                                  flags & BDRV_O_CBW_DISCARD_SOURCE, errp);
+     if (!s->bcs) {
+         error_prepend(errp, "Cannot create block-copy-state: ");
+         return -EINVAL;
+@@ -535,12 +544,14 @@ static BlockDriver bdrv_cbw_filter = {
+ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+                                   BlockDriverState *target,
+                                   const char *filter_node_name,
++                                  bool discard_source,
+                                   BlockCopyState **bcs,
+                                   Error **errp)
+ {
+     BDRVCopyBeforeWriteState *state;
+     BlockDriverState *top;
+     QDict *opts;
++    int flags = BDRV_O_RDWR | (discard_source ? BDRV_O_CBW_DISCARD_SOURCE : 0);
+     assert(source->total_sectors == target->total_sectors);
+     GLOBAL_STATE_CODE();
+@@ -553,7 +564,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+     qdict_put_str(opts, "file", bdrv_get_node_name(source));
+     qdict_put_str(opts, "target", bdrv_get_node_name(target));
+-    top = bdrv_insert_node(source, opts, BDRV_O_RDWR, errp);
++    top = bdrv_insert_node(source, opts, flags, errp);
+     if (!top) {
+         return NULL;
+     }
+diff --git a/block/copy-before-write.h b/block/copy-before-write.h
+index 6e72bb25e9..01af0cd3c4 100644
+--- a/block/copy-before-write.h
++++ b/block/copy-before-write.h
+@@ -39,6 +39,7 @@
+ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+                                   BlockDriverState *target,
+                                   const char *filter_node_name,
++                                  bool discard_source,
+                                   BlockCopyState **bcs,
+                                   Error **errp);
+ void bdrv_cbw_drop(BlockDriverState *bs);
+diff --git a/block/replication.c b/block/replication.c
+index ca6bd0a720..0415a5e8b7 100644
+--- a/block/replication.c
++++ b/block/replication.c
+@@ -582,8 +582,8 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
+         s->backup_job = backup_job_create(
+                                 NULL, s->secondary_disk->bs, s->hidden_disk->bs,
+-                                0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, NULL,
+-                                &perf,
++                                0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, false,
++                                NULL, &perf,
+                                 BLOCKDEV_ON_ERROR_REPORT,
+                                 BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
+                                 backup_job_completed, bs, NULL, &local_err);
+diff --git a/blockdev.c b/blockdev.c
+index 5e5dbc1da9..1054a69279 100644
+--- a/blockdev.c
++++ b/blockdev.c
+@@ -2727,7 +2727,7 @@ static BlockJob *do_backup_common(BackupCommon *backup,
+     job = backup_job_create(backup->job_id, bs, target_bs, backup->speed,
+                             backup->sync, bmap, backup->bitmap_mode,
+-                            backup->compress,
++                            backup->compress, backup->discard_source,
+                             backup->filter_node_name,
+                             &perf,
+                             backup->on_source_error,
+diff --git a/include/block/block-common.h b/include/block/block-common.h
+index a846023a09..338fe5ff7a 100644
+--- a/include/block/block-common.h
++++ b/include/block/block-common.h
+@@ -243,6 +243,8 @@ typedef enum {
+                                       read-write fails */
+ #define BDRV_O_IO_URING    0x40000 /* use io_uring instead of the thread pool */
++#define BDRV_O_CBW_DISCARD_SOURCE 0x80000 /* for copy-before-write filter */
++
+ #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
+diff --git a/include/block/block-copy.h b/include/block/block-copy.h
+index 8b41643bfa..bdc703bacd 100644
+--- a/include/block/block-copy.h
++++ b/include/block/block-copy.h
+@@ -27,6 +27,7 @@ typedef struct BlockCopyCallState BlockCopyCallState;
+ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+                                      BlockDriverState *copy_bitmap_bs,
+                                      const BdrvDirtyBitmap *bitmap,
++                                     bool discard_source,
+                                      Error **errp);
+ /* Function should be called prior any actual copy request */
+diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
+index cc1387ae02..f0c642b194 100644
+--- a/include/block/block_int-global-state.h
++++ b/include/block/block_int-global-state.h
+@@ -195,7 +195,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+                             MirrorSyncMode sync_mode,
+                             BdrvDirtyBitmap *sync_bitmap,
+                             BitmapSyncMode bitmap_mode,
+-                            bool compress,
++                            bool compress, bool discard_source,
+                             const char *filter_node_name,
+                             BackupPerf *perf,
+                             BlockdevOnError on_source_error,
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index f516d8e95a..d796d49abb 100644
+--- a/qapi/block-core.json
++++ b/qapi/block-core.json
+@@ -1849,6 +1849,9 @@
+ #     node specified by @drive.  If this option is not given, a node
+ #     name is autogenerated.  (Since: 4.2)
+ #
++# @discard-source: Discard blocks on source which are already copied
++#     to the target.  (Since 9.0)
++#
+ # @x-perf: Performance options.  (Since 6.0)
+ #
+ # Features:
+@@ -1870,6 +1873,7 @@
+             '*on-target-error': 'BlockdevOnError',
+             '*auto-finalize': 'bool', '*auto-dismiss': 'bool',
+             '*filter-node-name': 'str',
++            '*discard-source': 'bool',
+             '*x-perf': { 'type': 'BackupPerf',
+                          'features': [ 'unstable' ] } } }
diff --git a/debian/patches/pve/0048-block-copy-before-write-create-block_copy-bitmap-in-.patch b/debian/patches/pve/0048-block-copy-before-write-create-block_copy-bitmap-in-.patch
deleted file mode 100644 (file)
index ab7c0bf..0000000
+++ /dev/null
@@ -1,373 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Date: Thu, 11 Apr 2024 11:29:24 +0200
-Subject: [PATCH] block/copy-before-write: create block_copy bitmap in filter
- node
-
-Currently block_copy creates copy_bitmap in source node. But that is in
-bad relation with .independent_close=true of copy-before-write filter:
-source node may be detached and removed before .bdrv_close() handler
-called, which should call block_copy_state_free(), which in turn should
-remove copy_bitmap.
-
-That's all not ideal: it would be better if internal bitmap of
-block-copy object is not attached to any node. But that is not possible
-now.
-
-The simplest solution is just create copy_bitmap in filter node, where
-anyway two other bitmaps are created.
-
-Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/block-copy.c         |   3 +-
- block/copy-before-write.c  |   2 +-
- include/block/block-copy.h |   1 +
- tests/qemu-iotests/257.out | 112 ++++++++++++++++++-------------------
- 4 files changed, 60 insertions(+), 58 deletions(-)
-
-diff --git a/block/block-copy.c b/block/block-copy.c
-index e13d7bc6b6..b61685f1a2 100644
---- a/block/block-copy.c
-+++ b/block/block-copy.c
-@@ -346,6 +346,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
- }
- BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
-+                                     BlockDriverState *copy_bitmap_bs,
-                                      const BdrvDirtyBitmap *bitmap,
-                                      Error **errp)
- {
-@@ -360,7 +361,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
-         return NULL;
-     }
--    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
-+    copy_bitmap = bdrv_create_dirty_bitmap(copy_bitmap_bs, cluster_size, NULL,
-                                            errp);
-     if (!copy_bitmap) {
-         return NULL;
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index 0a219c2b75..d3b95bd600 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -470,7 +470,7 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
-             ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
-              bs->file->bs->supported_zero_flags);
--    s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp);
-+    s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap, errp);
-     if (!s->bcs) {
-         error_prepend(errp, "Cannot create block-copy-state: ");
-         ret = -EINVAL;
-diff --git a/include/block/block-copy.h b/include/block/block-copy.h
-index 0700953ab8..8b41643bfa 100644
---- a/include/block/block-copy.h
-+++ b/include/block/block-copy.h
-@@ -25,6 +25,7 @@ typedef struct BlockCopyState BlockCopyState;
- typedef struct BlockCopyCallState BlockCopyCallState;
- BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
-+                                     BlockDriverState *copy_bitmap_bs,
-                                      const BdrvDirtyBitmap *bitmap,
-                                      Error **errp);
-diff --git a/tests/qemu-iotests/257.out b/tests/qemu-iotests/257.out
-index aa76131ca9..c33dd7f3a9 100644
---- a/tests/qemu-iotests/257.out
-+++ b/tests/qemu-iotests/257.out
-@@ -120,16 +120,16 @@ write -P0x67 0x3fe0000 0x20000
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      }
--    ],
--    "drive0": [
-+      },
-       {
-         "busy": false,
-         "count": 0,
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      },
-+      }
-+    ],
-+    "drive0": [
-       {
-         "busy": false,
-         "count": 458752,
-@@ -596,16 +596,16 @@ write -P0x67 0x3fe0000 0x20000
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      }
--    ],
--    "drive0": [
-+      },
-       {
-         "busy": false,
-         "count": 0,
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      },
-+      }
-+    ],
-+    "drive0": [
-       {
-         "busy": false,
-         "count": 458752,
-@@ -865,16 +865,16 @@ write -P0x67 0x3fe0000 0x20000
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      }
--    ],
--    "drive0": [
-+      },
-       {
-         "busy": false,
-         "count": 0,
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      },
-+      }
-+    ],
-+    "drive0": [
-       {
-         "busy": false,
-         "count": 458752,
-@@ -1341,16 +1341,16 @@ write -P0x67 0x3fe0000 0x20000
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      }
--    ],
--    "drive0": [
-+      },
-       {
-         "busy": false,
-         "count": 0,
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      },
-+      }
-+    ],
-+    "drive0": [
-       {
-         "busy": false,
-         "count": 458752,
-@@ -1610,16 +1610,16 @@ write -P0x67 0x3fe0000 0x20000
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      }
--    ],
--    "drive0": [
-+      },
-       {
-         "busy": false,
-         "count": 0,
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      },
-+      }
-+    ],
-+    "drive0": [
-       {
-         "busy": false,
-         "count": 458752,
-@@ -2086,16 +2086,16 @@ write -P0x67 0x3fe0000 0x20000
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      }
--    ],
--    "drive0": [
-+      },
-       {
-         "busy": false,
-         "count": 0,
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      },
-+      }
-+    ],
-+    "drive0": [
-       {
-         "busy": false,
-         "count": 458752,
-@@ -2355,16 +2355,16 @@ write -P0x67 0x3fe0000 0x20000
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      }
--    ],
--    "drive0": [
-+      },
-       {
-         "busy": false,
-         "count": 0,
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      },
-+      }
-+    ],
-+    "drive0": [
-       {
-         "busy": false,
-         "count": 458752,
-@@ -2831,16 +2831,16 @@ write -P0x67 0x3fe0000 0x20000
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      }
--    ],
--    "drive0": [
-+      },
-       {
-         "busy": false,
-         "count": 0,
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      },
-+      }
-+    ],
-+    "drive0": [
-       {
-         "busy": false,
-         "count": 458752,
-@@ -3100,16 +3100,16 @@ write -P0x67 0x3fe0000 0x20000
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      }
--    ],
--    "drive0": [
-+      },
-       {
-         "busy": false,
-         "count": 0,
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      },
-+      }
-+    ],
-+    "drive0": [
-       {
-         "busy": false,
-         "count": 458752,
-@@ -3576,16 +3576,16 @@ write -P0x67 0x3fe0000 0x20000
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      }
--    ],
--    "drive0": [
-+      },
-       {
-         "busy": false,
-         "count": 0,
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      },
-+      }
-+    ],
-+    "drive0": [
-       {
-         "busy": false,
-         "count": 458752,
-@@ -3845,16 +3845,16 @@ write -P0x67 0x3fe0000 0x20000
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      }
--    ],
--    "drive0": [
-+      },
-       {
-         "busy": false,
-         "count": 0,
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      },
-+      }
-+    ],
-+    "drive0": [
-       {
-         "busy": false,
-         "count": 458752,
-@@ -4321,16 +4321,16 @@ write -P0x67 0x3fe0000 0x20000
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      }
--    ],
--    "drive0": [
-+      },
-       {
-         "busy": false,
-         "count": 0,
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      },
-+      }
-+    ],
-+    "drive0": [
-       {
-         "busy": false,
-         "count": 458752,
-@@ -4590,16 +4590,16 @@ write -P0x67 0x3fe0000 0x20000
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      }
--    ],
--    "drive0": [
-+      },
-       {
-         "busy": false,
-         "count": 0,
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      },
-+      }
-+    ],
-+    "drive0": [
-       {
-         "busy": false,
-         "count": 458752,
-@@ -5066,16 +5066,16 @@ write -P0x67 0x3fe0000 0x20000
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      }
--    ],
--    "drive0": [
-+      },
-       {
-         "busy": false,
-         "count": 0,
-         "granularity": 65536,
-         "persistent": false,
-         "recording": false
--      },
-+      }
-+    ],
-+    "drive0": [
-       {
-         "busy": false,
-         "count": 458752,
diff --git a/debian/patches/pve/0048-copy-before-write-allow-specifying-minimum-cluster-s.patch b/debian/patches/pve/0048-copy-before-write-allow-specifying-minimum-cluster-s.patch
new file mode 100644 (file)
index 0000000..50a8cd2
--- /dev/null
@@ -0,0 +1,133 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Thu, 11 Apr 2024 11:29:26 +0200
+Subject: [PATCH] copy-before-write: allow specifying minimum cluster size
+
+Useful to make discard-source work in the context of backup fleecing
+when the fleecing image has a larger granularity than the backup
+target.
+
+Copy-before-write operations will use at least this granularity and in
+particular, discard requests to the source node will too. If the
+granularity is too small, they will just be aligned down in
+cbw_co_pdiscard_snapshot() and thus effectively ignored.
+
+The QAPI uses uint32 so the value will be non-negative, but still fit
+into an int64_t.
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/block-copy.c         | 17 +++++++++++++----
+ block/copy-before-write.c  |  3 ++-
+ include/block/block-copy.h |  1 +
+ qapi/block-core.json       |  8 +++++++-
+ 4 files changed, 23 insertions(+), 6 deletions(-)
+
+diff --git a/block/block-copy.c b/block/block-copy.c
+index 7e3b378528..adb1cbb440 100644
+--- a/block/block-copy.c
++++ b/block/block-copy.c
+@@ -310,6 +310,7 @@ void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range,
+ }
+ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
++                                                 int64_t min_cluster_size,
+                                                  Error **errp)
+ {
+     int ret;
+@@ -335,7 +336,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
+                     "used. If the actual block size of the target exceeds "
+                     "this default, the backup may be unusable",
+                     BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
+-        return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
++        return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
+     } else if (ret < 0 && !target_does_cow) {
+         error_setg_errno(errp, -ret,
+             "Couldn't determine the cluster size of the target image, "
+@@ -345,16 +346,18 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
+         return ret;
+     } else if (ret < 0 && target_does_cow) {
+         /* Not fatal; just trudge on ahead. */
+-        return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
++        return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
+     }
+-    return MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
++    return MAX(min_cluster_size,
++               MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size));
+ }
+ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+                                      BlockDriverState *copy_bitmap_bs,
+                                      const BdrvDirtyBitmap *bitmap,
+                                      bool discard_source,
++                                     int64_t min_cluster_size,
+                                      Error **errp)
+ {
+     ERRP_GUARD();
+@@ -365,7 +368,13 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+     GLOBAL_STATE_CODE();
+-    cluster_size = block_copy_calculate_cluster_size(target->bs, errp);
++    if (min_cluster_size && !is_power_of_2(min_cluster_size)) {
++        error_setg(errp, "min-cluster-size needs to be a power of 2");
++        return NULL;
++    }
++
++    cluster_size = block_copy_calculate_cluster_size(target->bs,
++                                                     min_cluster_size, errp);
+     if (cluster_size < 0) {
+         return NULL;
+     }
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index 853e01a1eb..47b3cdd09f 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -477,7 +477,8 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
+     s->discard_source = flags & BDRV_O_CBW_DISCARD_SOURCE;
+     s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap,
+-                                  flags & BDRV_O_CBW_DISCARD_SOURCE, errp);
++                                  flags & BDRV_O_CBW_DISCARD_SOURCE,
++                                  opts->min_cluster_size, errp);
+     if (!s->bcs) {
+         error_prepend(errp, "Cannot create block-copy-state: ");
+         return -EINVAL;
+diff --git a/include/block/block-copy.h b/include/block/block-copy.h
+index bdc703bacd..77857c6c68 100644
+--- a/include/block/block-copy.h
++++ b/include/block/block-copy.h
+@@ -28,6 +28,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+                                      BlockDriverState *copy_bitmap_bs,
+                                      const BdrvDirtyBitmap *bitmap,
+                                      bool discard_source,
++                                     int64_t min_cluster_size,
+                                      Error **errp);
+ /* Function should be called prior any actual copy request */
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index d796d49abb..edbf6e78b9 100644
+--- a/qapi/block-core.json
++++ b/qapi/block-core.json
+@@ -4930,12 +4930,18 @@
+ #     @on-cbw-error parameter will decide how this failure is handled.
+ #     Default 0.  (Since 7.1)
+ #
++# @min-cluster-size: Minimum size of blocks used by copy-before-write
++#     operations.  Has to be a power of 2.  No effect if smaller than
++#     the maximum of the target's cluster size and 64 KiB.  Default 0.
++#     (Since 8.1)
++#
+ # Since: 6.2
+ ##
+ { 'struct': 'BlockdevOptionsCbw',
+   'base': 'BlockdevOptionsGenericFormat',
+   'data': { 'target': 'BlockdevRef', '*bitmap': 'BlockDirtyBitmap',
+-            '*on-cbw-error': 'OnCbwError', '*cbw-timeout': 'uint32' } }
++            '*on-cbw-error': 'OnCbwError', '*cbw-timeout': 'uint32',
++            '*min-cluster-size': 'uint32' } }
+ ##
+ # @BlockdevOptions:
diff --git a/debian/patches/pve/0049-backup-add-minimum-cluster-size-to-performance-optio.patch b/debian/patches/pve/0049-backup-add-minimum-cluster-size-to-performance-optio.patch
new file mode 100644 (file)
index 0000000..fe3ff95
--- /dev/null
@@ -0,0 +1,106 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Thu, 11 Apr 2024 11:29:27 +0200
+Subject: [PATCH] backup: add minimum cluster size to performance options
+
+Useful to make discard-source work in the context of backup fleecing
+when the fleecing image has a larger granularity than the backup
+target.
+
+Backup/block-copy will use at least this granularity for copy operations
+and in particular, discard requests to the backup source will too. If
+the granularity is too small, they will just be aligned down in
+cbw_co_pdiscard_snapshot() and thus effectively ignored.
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/backup.c            | 2 +-
+ block/copy-before-write.c | 2 ++
+ block/copy-before-write.h | 1 +
+ blockdev.c                | 3 +++
+ qapi/block-core.json      | 9 +++++++--
+ 5 files changed, 14 insertions(+), 3 deletions(-)
+
+diff --git a/block/backup.c b/block/backup.c
+index 1963e47ab9..fe69723ada 100644
+--- a/block/backup.c
++++ b/block/backup.c
+@@ -434,7 +434,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+     }
+     cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source,
+-                          &bcs, errp);
++                          perf->min_cluster_size, &bcs, errp);
+     if (!cbw) {
+         goto error;
+     }
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index 47b3cdd09f..bba58326d7 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -546,6 +546,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+                                   BlockDriverState *target,
+                                   const char *filter_node_name,
+                                   bool discard_source,
++                                  int64_t min_cluster_size,
+                                   BlockCopyState **bcs,
+                                   Error **errp)
+ {
+@@ -564,6 +565,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+     }
+     qdict_put_str(opts, "file", bdrv_get_node_name(source));
+     qdict_put_str(opts, "target", bdrv_get_node_name(target));
++    qdict_put_int(opts, "min-cluster-size", min_cluster_size);
+     top = bdrv_insert_node(source, opts, flags, errp);
+     if (!top) {
+diff --git a/block/copy-before-write.h b/block/copy-before-write.h
+index 01af0cd3c4..dc6cafe7fa 100644
+--- a/block/copy-before-write.h
++++ b/block/copy-before-write.h
+@@ -40,6 +40,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+                                   BlockDriverState *target,
+                                   const char *filter_node_name,
+                                   bool discard_source,
++                                  int64_t min_cluster_size,
+                                   BlockCopyState **bcs,
+                                   Error **errp);
+ void bdrv_cbw_drop(BlockDriverState *bs);
+diff --git a/blockdev.c b/blockdev.c
+index 1054a69279..cbe224387b 100644
+--- a/blockdev.c
++++ b/blockdev.c
+@@ -2654,6 +2654,9 @@ static BlockJob *do_backup_common(BackupCommon *backup,
+         if (backup->x_perf->has_max_chunk) {
+             perf.max_chunk = backup->x_perf->max_chunk;
+         }
++        if (backup->x_perf->has_min_cluster_size) {
++            perf.min_cluster_size = backup->x_perf->min_cluster_size;
++        }
+     }
+     if ((backup->sync == MIRROR_SYNC_MODE_BITMAP) ||
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index edbf6e78b9..6e7ee87633 100644
+--- a/qapi/block-core.json
++++ b/qapi/block-core.json
+@@ -1790,11 +1790,16 @@
+ #     it should not be less than job cluster size which is calculated
+ #     as maximum of target image cluster size and 64k.  Default 0.
+ #
++# @min-cluster-size: Minimum size of blocks used by copy-before-write
++#     and background copy operations.  Has to be a power of 2.  No
++#     effect if smaller than the maximum of the target's cluster size
++#     and 64 KiB.  Default 0. (Since 8.1)
++#
+ # Since: 6.0
+ ##
+ { 'struct': 'BackupPerf',
+-  'data': { '*use-copy-range': 'bool',
+-            '*max-workers': 'int', '*max-chunk': 'int64' } }
++  'data': { '*use-copy-range': 'bool', '*max-workers': 'int',
++            '*max-chunk': 'int64', '*min-cluster-size': 'uint32' } }
+ ##
+ # @BackupCommon:
diff --git a/debian/patches/pve/0049-qapi-blockdev-backup-add-discard-source-parameter.patch b/debian/patches/pve/0049-qapi-blockdev-backup-add-discard-source-parameter.patch
deleted file mode 100644 (file)
index b6eee3e..0000000
+++ /dev/null
@@ -1,277 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Date: Thu, 11 Apr 2024 11:29:25 +0200
-Subject: [PATCH] qapi: blockdev-backup: add discard-source parameter
-
-Add a parameter that enables discard-after-copy. That is mostly useful
-in "push backup with fleecing" scheme, when source is snapshot-access
-format driver node, based on copy-before-write filter snapshot-access
-API:
-
-[guest]      [snapshot-access] ~~ blockdev-backup ~~> [backup target]
-   |            |
-   | root       | file
-   v            v
-[copy-before-write]
-   |             |
-   | file        | target
-   v             v
-[active disk]   [temp.img]
-
-In this case discard-after-copy does two things:
-
- - discard data in temp.img to save disk space
- - avoid further copy-before-write operation in discarded area
-
-Note that we have to declare WRITE permission on source in
-copy-before-write filter, for discard to work. Still we can't take it
-unconditionally, as it will break normal backup from RO source. So, we
-have to add a parameter and pass it thorough bdrv_open flags.
-
-Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@yandex-team.ru>
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/backup.c                         |  5 +++--
- block/block-copy.c                     |  9 +++++++++
- block/copy-before-write.c              | 15 +++++++++++++--
- block/copy-before-write.h              |  1 +
- block/replication.c                    |  4 ++--
- blockdev.c                             |  2 +-
- include/block/block-common.h           |  2 ++
- include/block/block-copy.h             |  1 +
- include/block/block_int-global-state.h |  2 +-
- qapi/block-core.json                   |  4 ++++
- 10 files changed, 37 insertions(+), 8 deletions(-)
-
-diff --git a/block/backup.c b/block/backup.c
-index af87fa6aa9..3dc955f625 100644
---- a/block/backup.c
-+++ b/block/backup.c
-@@ -332,7 +332,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
-                   BlockDriverState *target, int64_t speed,
-                   MirrorSyncMode sync_mode, BdrvDirtyBitmap *sync_bitmap,
-                   BitmapSyncMode bitmap_mode,
--                  bool compress,
-+                  bool compress, bool discard_source,
-                   const char *filter_node_name,
-                   BackupPerf *perf,
-                   BlockdevOnError on_source_error,
-@@ -429,7 +429,8 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
-         goto error;
-     }
--    cbw = bdrv_cbw_append(bs, target, filter_node_name, &bcs, errp);
-+    cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source,
-+                          &bcs, errp);
-     if (!cbw) {
-         goto error;
-     }
-diff --git a/block/block-copy.c b/block/block-copy.c
-index b61685f1a2..3c61e52bae 100644
---- a/block/block-copy.c
-+++ b/block/block-copy.c
-@@ -137,6 +137,7 @@ typedef struct BlockCopyState {
-     CoMutex lock;
-     int64_t in_flight_bytes;
-     BlockCopyMethod method;
-+    bool discard_source;
-     BlockReqList reqs;
-     QLIST_HEAD(, BlockCopyCallState) calls;
-     /*
-@@ -348,6 +349,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
- BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
-                                      BlockDriverState *copy_bitmap_bs,
-                                      const BdrvDirtyBitmap *bitmap,
-+                                     bool discard_source,
-                                      Error **errp)
- {
-     ERRP_GUARD();
-@@ -409,6 +411,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
-                                     cluster_size),
-     };
-+    s->discard_source = discard_source;
-     block_copy_set_copy_opts(s, false, false);
-     ratelimit_init(&s->rate_limit);
-@@ -580,6 +583,12 @@ static coroutine_fn int block_copy_task_entry(AioTask *task)
-     co_put_to_shres(s->mem, t->req.bytes);
-     block_copy_task_end(t, ret);
-+    if (s->discard_source && ret == 0) {
-+        int64_t nbytes =
-+            MIN(t->req.offset + t->req.bytes, s->len) - t->req.offset;
-+        bdrv_co_pdiscard(s->source, t->req.offset, nbytes);
-+    }
-+
-     return ret;
- }
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index d3b95bd600..3503702d71 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -44,6 +44,7 @@ typedef struct BDRVCopyBeforeWriteState {
-     BdrvChild *target;
-     OnCbwError on_cbw_error;
-     uint32_t cbw_timeout_ns;
-+    bool discard_source;
-     /*
-      * @lock: protects access to @access_bitmap, @done_bitmap and
-@@ -357,6 +358,8 @@ static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c,
-                            uint64_t perm, uint64_t shared,
-                            uint64_t *nperm, uint64_t *nshared)
- {
-+    BDRVCopyBeforeWriteState *s = bs->opaque;
-+
-     if (!(role & BDRV_CHILD_FILTERED)) {
-         /*
-          * Target child
-@@ -381,6 +384,10 @@ static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c,
-              * start
-              */
-             *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
-+            if (s->discard_source) {
-+                *nperm = *nperm | BLK_PERM_WRITE;
-+            }
-+
-             *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
-         }
-     }
-@@ -470,7 +477,9 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
-             ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
-              bs->file->bs->supported_zero_flags);
--    s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap, errp);
-+    s->discard_source = flags & BDRV_O_CBW_DISCARD_SOURCE;
-+    s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap,
-+                                  flags & BDRV_O_CBW_DISCARD_SOURCE, errp);
-     if (!s->bcs) {
-         error_prepend(errp, "Cannot create block-copy-state: ");
-         ret = -EINVAL;
-@@ -544,12 +553,14 @@ BlockDriver bdrv_cbw_filter = {
- BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
-                                   BlockDriverState *target,
-                                   const char *filter_node_name,
-+                                  bool discard_source,
-                                   BlockCopyState **bcs,
-                                   Error **errp)
- {
-     BDRVCopyBeforeWriteState *state;
-     BlockDriverState *top;
-     QDict *opts;
-+    int flags = BDRV_O_RDWR | (discard_source ? BDRV_O_CBW_DISCARD_SOURCE : 0);
-     assert(source->total_sectors == target->total_sectors);
-     GLOBAL_STATE_CODE();
-@@ -562,7 +573,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
-     qdict_put_str(opts, "file", bdrv_get_node_name(source));
-     qdict_put_str(opts, "target", bdrv_get_node_name(target));
--    top = bdrv_insert_node(source, opts, BDRV_O_RDWR, errp);
-+    top = bdrv_insert_node(source, opts, flags, errp);
-     if (!top) {
-         return NULL;
-     }
-diff --git a/block/copy-before-write.h b/block/copy-before-write.h
-index 6e72bb25e9..01af0cd3c4 100644
---- a/block/copy-before-write.h
-+++ b/block/copy-before-write.h
-@@ -39,6 +39,7 @@
- BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
-                                   BlockDriverState *target,
-                                   const char *filter_node_name,
-+                                  bool discard_source,
-                                   BlockCopyState **bcs,
-                                   Error **errp);
- void bdrv_cbw_drop(BlockDriverState *bs);
-diff --git a/block/replication.c b/block/replication.c
-index ea4bf1aa80..39ad78cf98 100644
---- a/block/replication.c
-+++ b/block/replication.c
-@@ -579,8 +579,8 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode,
-         s->backup_job = backup_job_create(
-                                 NULL, s->secondary_disk->bs, s->hidden_disk->bs,
--                                0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, NULL,
--                                &perf,
-+                                0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, false,
-+                                NULL, &perf,
-                                 BLOCKDEV_ON_ERROR_REPORT,
-                                 BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
-                                 backup_job_completed, bs, NULL, &local_err);
-diff --git a/blockdev.c b/blockdev.c
-index 7793143d76..ce3fef924c 100644
---- a/blockdev.c
-+++ b/blockdev.c
-@@ -2802,7 +2802,7 @@ static BlockJob *do_backup_common(BackupCommon *backup,
-     job = backup_job_create(backup->job_id, bs, target_bs, backup->speed,
-                             backup->sync, bmap, backup->bitmap_mode,
--                            backup->compress,
-+                            backup->compress, backup->discard_source,
-                             backup->filter_node_name,
-                             &perf,
-                             backup->on_source_error,
-diff --git a/include/block/block-common.h b/include/block/block-common.h
-index e15395f2cb..913a8b259c 100644
---- a/include/block/block-common.h
-+++ b/include/block/block-common.h
-@@ -234,6 +234,8 @@ typedef enum {
-                                       read-write fails */
- #define BDRV_O_IO_URING    0x40000 /* use io_uring instead of the thread pool */
-+#define BDRV_O_CBW_DISCARD_SOURCE 0x80000 /* for copy-before-write filter */
-+
- #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
-diff --git a/include/block/block-copy.h b/include/block/block-copy.h
-index 8b41643bfa..bdc703bacd 100644
---- a/include/block/block-copy.h
-+++ b/include/block/block-copy.h
-@@ -27,6 +27,7 @@ typedef struct BlockCopyCallState BlockCopyCallState;
- BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
-                                      BlockDriverState *copy_bitmap_bs,
-                                      const BdrvDirtyBitmap *bitmap,
-+                                     bool discard_source,
-                                      Error **errp);
- /* Function should be called prior any actual copy request */
-diff --git a/include/block/block_int-global-state.h b/include/block/block_int-global-state.h
-index 32f0f9858a..546f2b5532 100644
---- a/include/block/block_int-global-state.h
-+++ b/include/block/block_int-global-state.h
-@@ -189,7 +189,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
-                             MirrorSyncMode sync_mode,
-                             BdrvDirtyBitmap *sync_bitmap,
-                             BitmapSyncMode bitmap_mode,
--                            bool compress,
-+                            bool compress, bool discard_source,
-                             const char *filter_node_name,
-                             BackupPerf *perf,
-                             BlockdevOnError on_source_error,
-diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 09de550c95..4297e5beda 100644
---- a/qapi/block-core.json
-+++ b/qapi/block-core.json
-@@ -1816,6 +1816,9 @@
- #     node specified by @drive.  If this option is not given, a node
- #     name is autogenerated.  (Since: 4.2)
- #
-+# @discard-source: Discard blocks on source which are already copied
-+#     to the target.  (Since 9.0)
-+#
- # @x-perf: Performance options.  (Since 6.0)
- #
- # Features:
-@@ -1837,6 +1840,7 @@
-             '*on-target-error': 'BlockdevOnError',
-             '*auto-finalize': 'bool', '*auto-dismiss': 'bool',
-             '*filter-node-name': 'str',
-+            '*discard-source': 'bool',
-             '*x-perf': { 'type': 'BackupPerf',
-                          'features': [ 'unstable' ] } } }
diff --git a/debian/patches/pve/0050-PVE-backup-add-fleecing-option.patch b/debian/patches/pve/0050-PVE-backup-add-fleecing-option.patch
new file mode 100644 (file)
index 0000000..d5d2f4a
--- /dev/null
@@ -0,0 +1,345 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Thu, 11 Apr 2024 11:29:28 +0200
+Subject: [PATCH] PVE backup: add fleecing option
+
+When a fleecing option is given, it is expected that each device has
+a corresponding "-fleecing" block device already attached, except for
+EFI disk and TPM state, where fleecing is never used.
+
+The following graph was adapted from [0] which also contains more
+details about fleecing.
+
+[guest]
+   |
+   | root
+   v                 file
+[copy-before-write]<------[snapshot-access]
+   |           |
+   | file      | target
+   v           v
+[source] [fleecing]
+
+For fleecing, a copy-before-write filter is inserted on top of the
+source node, as well as a snapshot-access node pointing to the filter
+node which allows to read the consistent state of the image at the
+time it was inserted. New guest writes are passed through the
+copy-before-write filter which will first copy over old data to the
+fleecing image in case that old data is still needed by the
+snapshot-access node.
+
+The backup process will sequentially read from the snapshot access,
+which has a bitmap and knows whether to read from the original image
+or the fleecing image to get the "snapshot" state, i.e. data from the
+source image at the time when the copy-before-write filter was
+inserted. After reading, the copied sections are discarded from the
+fleecing image to reduce space usage.
+
+All of this can be restricted by an initial dirty bitmap to parts of
+the source image that are required for an incremental backup.
+
+For discard to work, it is necessary that the fleecing image does not
+have a larger cluster size than the backup job granularity. Querying
+that size does not always work (e.g. for RBD with krbd, the cluster
+size is not reported), so a minimum of 4 MiB is used. A job
+with PBS target already has at least this granularity, so it's just
+relevant for other targets. I.e. edge cases where this minimum is not
+enough should be very rare in practice. If it ever becomes necessary,
+a passed-in value for the backup QMP command can still be added to
+override it.
+
+Additionally, the cbw-timeout and on-cbw-error=break-snapshot options
+are set when installing the copy-before-write filter and
+snapshot-access. When an error or timeout occurs, the problematic (and
+each further) snapshot operation will fail and thus cancel the backup
+instead of breaking the guest write.
+
+Note that job_id cannot be inferred from the snapshot-access bs because
+it has no parent, so just pass the one from the original bs.
+
+[0]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg876056.html
+
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ block/monitor/block-hmp-cmds.c |   1 +
+ pve-backup.c                   | 143 ++++++++++++++++++++++++++++++++-
+ qapi/block-core.json           |  10 ++-
+ 3 files changed, 150 insertions(+), 4 deletions(-)
+
+diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
+index 5000c084c5..70b3de4c7e 100644
+--- a/block/monitor/block-hmp-cmds.c
++++ b/block/monitor/block-hmp-cmds.c
+@@ -1043,6 +1043,7 @@ void coroutine_fn hmp_backup(Monitor *mon, const QDict *qdict)
+         NULL, NULL,
+         devlist, qdict_haskey(qdict, "speed"), speed,
+         false, 0, // BackupPerf max-workers
++        false, false, // fleecing
+         &error);
+     hmp_handle_error(mon, error);
+diff --git a/pve-backup.c b/pve-backup.c
+index 9d480a8eec..7cc1dd3724 100644
+--- a/pve-backup.c
++++ b/pve-backup.c
+@@ -7,9 +7,11 @@
+ #include "sysemu/blockdev.h"
+ #include "block/block_int-global-state.h"
+ #include "block/blockjob.h"
++#include "block/copy-before-write.h"
+ #include "block/dirty-bitmap.h"
+ #include "block/graph-lock.h"
+ #include "qapi/qapi-commands-block.h"
++#include "qapi/qmp/qdict.h"
+ #include "qapi/qmp/qerror.h"
+ #include "qemu/cutils.h"
+@@ -81,8 +83,15 @@ static void pvebackup_init(void)
+ // initialize PVEBackupState at startup
+ opts_init(pvebackup_init);
++typedef struct PVEBackupFleecingInfo {
++    BlockDriverState *bs;
++    BlockDriverState *cbw;
++    BlockDriverState *snapshot_access;
++} PVEBackupFleecingInfo;
++
+ typedef struct PVEBackupDevInfo {
+     BlockDriverState *bs;
++    PVEBackupFleecingInfo fleecing;
+     size_t size;
+     uint64_t block_size;
+     uint8_t dev_id;
+@@ -355,6 +364,25 @@ static void pvebackup_complete_cb(void *opaque, int ret)
+     PVEBackupDevInfo *di = opaque;
+     di->completed_ret = ret;
++    /*
++     * Handle block-graph specific cleanup (for fleecing) outside of the coroutine, because the work
++     * won't be done as a coroutine anyways:
++     * - For snapshot_access, allows doing bdrv_unref() directly. Doing it via bdrv_co_unref() would
++     *   just spawn a BH calling bdrv_unref().
++     * - For cbw, draining would need to spawn a BH.
++     *
++     * Note that the AioContext lock is already acquired by our caller, i.e.
++     * job_finalize_single_locked()
++     */
++    if (di->fleecing.snapshot_access) {
++        bdrv_unref(di->fleecing.snapshot_access);
++        di->fleecing.snapshot_access = NULL;
++    }
++    if (di->fleecing.cbw) {
++        bdrv_cbw_drop(di->fleecing.cbw);
++        di->fleecing.cbw = NULL;
++    }
++
+     /*
+      * Needs to happen outside of coroutine, because it takes the graph write lock.
+      */
+@@ -522,9 +550,82 @@ static void create_backup_jobs_bh(void *opaque) {
+         }
+         bdrv_drained_begin(di->bs);
++        BackupPerf perf = (BackupPerf){ .max_workers = backup_state.perf.max_workers };
++
++        BlockDriverState *source_bs = di->bs;
++        bool discard_source = false;
++        bdrv_graph_co_rdlock();
++        const char *job_id = bdrv_get_device_name(di->bs);
++        bdrv_graph_co_rdunlock();
++        if (di->fleecing.bs) {
++            QDict *cbw_opts = qdict_new();
++            qdict_put_str(cbw_opts, "driver", "copy-before-write");
++            qdict_put_str(cbw_opts, "file", bdrv_get_node_name(di->bs));
++            qdict_put_str(cbw_opts, "target", bdrv_get_node_name(di->fleecing.bs));
++
++            if (di->bitmap) {
++                /*
++                 * Only guest writes to parts relevant for the backup need to be intercepted with
++                 * old data being copied to the fleecing image.
++                 */
++                qdict_put_str(cbw_opts, "bitmap.node", bdrv_get_node_name(di->bs));
++                qdict_put_str(cbw_opts, "bitmap.name", bdrv_dirty_bitmap_name(di->bitmap));
++            }
++            /*
++             * Fleecing storage is supposed to be fast and it's better to break backup than guest
++             * writes. Certain guest drivers like VirtIO-win have 60 seconds timeout by default, so
++             * abort a bit before that.
++             */
++            qdict_put_str(cbw_opts, "on-cbw-error", "break-snapshot");
++            qdict_put_int(cbw_opts, "cbw-timeout", 45);
++
++            di->fleecing.cbw = bdrv_insert_node(di->bs, cbw_opts, BDRV_O_RDWR, &local_err);
++
++            if (!di->fleecing.cbw) {
++                error_setg(errp, "appending cbw node for fleecing failed: %s",
++                           local_err ? error_get_pretty(local_err) : "unknown error");
++                break;
++            }
++
++            QDict *snapshot_access_opts = qdict_new();
++            qdict_put_str(snapshot_access_opts, "driver", "snapshot-access");
++            qdict_put_str(snapshot_access_opts, "file", bdrv_get_node_name(di->fleecing.cbw));
++
++            /*
++             * Holding the AioContext lock here would cause a deadlock, because bdrv_open_driver()
++             * will acquire it a second time. But it's allowed to be held exactly once when polling
++             * and that happens when the bdrv_refresh_total_sectors() call is made there.
++             */
++            di->fleecing.snapshot_access =
++                bdrv_open(NULL, NULL, snapshot_access_opts, BDRV_O_RDWR | BDRV_O_UNMAP, &local_err);
++            if (!di->fleecing.snapshot_access) {
++                error_setg(errp, "setting up snapshot access for fleecing failed: %s",
++                           local_err ? error_get_pretty(local_err) : "unknown error");
++                break;
++            }
++            source_bs = di->fleecing.snapshot_access;
++            discard_source = true;
++
++            /*
++             * bdrv_get_info() just returns 0 (= doesn't matter) for RBD when using krbd. But discard
++             * on the fleecing image won't work if the backup job's granularity is less than the RBD
++             * object size (default 4 MiB), so it does matter. Always use at least 4 MiB. With a PBS
++             * target, the backup job granularity would already be at least this much.
++             */
++            perf.min_cluster_size = 4 * 1024 * 1024;
++            /*
++             * For discard to work, cluster size for the backup job must be at least the same as for
++             * the fleecing image.
++             */
++            BlockDriverInfo bdi;
++            if (bdrv_get_info(di->fleecing.bs, &bdi) >= 0) {
++                perf.min_cluster_size = MAX(perf.min_cluster_size, bdi.cluster_size);
++            }
++        }
++
+         BlockJob *job = backup_job_create(
+-            NULL, di->bs, di->target, backup_state.speed, sync_mode, di->bitmap,
+-            bitmap_mode, false, NULL, &backup_state.perf, BLOCKDEV_ON_ERROR_REPORT,
++            job_id, source_bs, di->target, backup_state.speed, sync_mode, di->bitmap,
++            bitmap_mode, false, discard_source, NULL, &perf, BLOCKDEV_ON_ERROR_REPORT,
+             BLOCKDEV_ON_ERROR_REPORT, JOB_DEFAULT, pvebackup_complete_cb, di, backup_state.txn,
+             &local_err);
+@@ -580,6 +681,14 @@ static void create_backup_jobs_bh(void *opaque) {
+     aio_co_enter(data->ctx, data->co);
+ }
++/*
++ * EFI disk and TPM state are small and it's just not worth setting up fleecing for them.
++ */
++static bool device_uses_fleecing(const char *device_id)
++{
++    return strncmp(device_id, "drive-efidisk", 13) && strncmp(device_id, "drive-tpmstate", 14);
++}
++
+ /*
+  * Returns a list of device infos, which needs to be freed by the caller. In
+  * case of an error, errp will be set, but the returned value might still be a
+@@ -587,6 +696,7 @@ static void create_backup_jobs_bh(void *opaque) {
+  */
+ static GList coroutine_fn GRAPH_RDLOCK *get_device_info(
+     const char *devlist,
++    bool fleecing,
+     Error **errp)
+ {
+     gchar **devs = NULL;
+@@ -610,6 +720,31 @@ static GList coroutine_fn GRAPH_RDLOCK *get_device_info(
+             }
+             PVEBackupDevInfo *di = g_new0(PVEBackupDevInfo, 1);
+             di->bs = bs;
++
++            if (fleecing && device_uses_fleecing(*d)) {
++                g_autofree gchar *fleecing_devid = g_strconcat(*d, "-fleecing", NULL);
++                BlockBackend *fleecing_blk = blk_by_name(fleecing_devid);
++                if (!fleecing_blk) {
++                    error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
++                              "Device '%s' not found", fleecing_devid);
++                    goto err;
++                }
++                BlockDriverState *fleecing_bs = blk_bs(fleecing_blk);
++                if (!bdrv_co_is_inserted(fleecing_bs)) {
++                    error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, fleecing_devid);
++                    goto err;
++                }
++                /*
++                 * Fleecing image needs to be the same size to act as a cbw target.
++                 */
++                if (bs->total_sectors != fleecing_bs->total_sectors) {
++                    error_setg(errp, "Size mismatch for '%s' - sector count %ld != %ld",
++                               fleecing_devid, fleecing_bs->total_sectors, bs->total_sectors);
++                    goto err;
++                }
++                di->fleecing.bs = fleecing_bs;
++            }
++
+             di_list = g_list_append(di_list, di);
+             d++;
+         }
+@@ -659,6 +794,7 @@ UuidInfo coroutine_fn *qmp_backup(
+     const char *devlist,
+     bool has_speed, int64_t speed,
+     bool has_max_workers, int64_t max_workers,
++    bool has_fleecing, bool fleecing,
+     Error **errp)
+ {
+     assert(qemu_in_coroutine());
+@@ -687,7 +823,7 @@ UuidInfo coroutine_fn *qmp_backup(
+     format = has_format ? format : BACKUP_FORMAT_VMA;
+     bdrv_graph_co_rdlock();
+-    di_list = get_device_info(devlist, &local_err);
++    di_list = get_device_info(devlist, has_fleecing && fleecing, &local_err);
+     bdrv_graph_co_rdunlock();
+     if (local_err) {
+         error_propagate(errp, local_err);
+@@ -1095,5 +1231,6 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
+     ret->query_bitmap_info = true;
+     ret->pbs_masterkey = true;
+     ret->backup_max_workers = true;
++    ret->backup_fleecing = true;
+     return ret;
+ }
+diff --git a/qapi/block-core.json b/qapi/block-core.json
+index 6e7ee87633..dc5f75cd39 100644
+--- a/qapi/block-core.json
++++ b/qapi/block-core.json
+@@ -948,6 +948,10 @@
+ #
+ # @max-workers: see @BackupPerf for details. Default 16.
+ #
++# @fleecing: perform a backup with fleecing. For each device in @devlist, a
++#            corresponding '-fleecing' device with the same size already needs to
++#            be present.
++#
+ # Returns: the uuid of the backup job
+ #
+ ##
+@@ -968,7 +972,8 @@
+                                     '*firewall-file': 'str',
+                                     '*devlist': 'str',
+                                     '*speed': 'int',
+-                                    '*max-workers': 'int' },
++                                    '*max-workers': 'int',
++                                    '*fleecing': 'bool' },
+   'returns': 'UuidInfo', 'coroutine': true }
+ ##
+@@ -1014,6 +1019,8 @@
+ #
+ # @pbs-library-version: Running version of libproxmox-backup-qemu0 library.
+ #
++# @backup-fleecing: Whether backup fleecing is supported or not.
++#
+ # @backup-max-workers: Whether the 'max-workers' @BackupPerf setting is
+ #     supported or not.
+ #
+@@ -1025,6 +1032,7 @@
+             'pbs-dirty-bitmap-migration': 'bool',
+             'pbs-masterkey': 'bool',
+             'pbs-library-version': 'str',
++            'backup-fleecing': 'bool',
+             'backup-max-workers': 'bool' } }
+ ##
diff --git a/debian/patches/pve/0050-copy-before-write-allow-specifying-minimum-cluster-s.patch b/debian/patches/pve/0050-copy-before-write-allow-specifying-minimum-cluster-s.patch
deleted file mode 100644 (file)
index 17949f4..0000000
+++ /dev/null
@@ -1,133 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Thu, 11 Apr 2024 11:29:26 +0200
-Subject: [PATCH] copy-before-write: allow specifying minimum cluster size
-
-Useful to make discard-source work in the context of backup fleecing
-when the fleecing image has a larger granularity than the backup
-target.
-
-Copy-before-write operations will use at least this granularity and in
-particular, discard requests to the source node will too. If the
-granularity is too small, they will just be aligned down in
-cbw_co_pdiscard_snapshot() and thus effectively ignored.
-
-The QAPI uses uint32 so the value will be non-negative, but still fit
-into a uint64_t.
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/block-copy.c         | 17 +++++++++++++----
- block/copy-before-write.c  |  3 ++-
- include/block/block-copy.h |  1 +
- qapi/block-core.json       |  8 +++++++-
- 4 files changed, 23 insertions(+), 6 deletions(-)
-
-diff --git a/block/block-copy.c b/block/block-copy.c
-index 3c61e52bae..c9a722a5a6 100644
---- a/block/block-copy.c
-+++ b/block/block-copy.c
-@@ -310,6 +310,7 @@ void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range,
- }
- static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
-+                                                 int64_t min_cluster_size,
-                                                  Error **errp)
- {
-     int ret;
-@@ -330,7 +331,7 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
-                     "used. If the actual block size of the target exceeds "
-                     "this default, the backup may be unusable",
-                     BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
--        return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
-+        return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
-     } else if (ret < 0 && !target_does_cow) {
-         error_setg_errno(errp, -ret,
-             "Couldn't determine the cluster size of the target image, "
-@@ -340,16 +341,18 @@ static int64_t block_copy_calculate_cluster_size(BlockDriverState *target,
-         return ret;
-     } else if (ret < 0 && target_does_cow) {
-         /* Not fatal; just trudge on ahead. */
--        return BLOCK_COPY_CLUSTER_SIZE_DEFAULT;
-+        return MAX(min_cluster_size, BLOCK_COPY_CLUSTER_SIZE_DEFAULT);
-     }
--    return MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
-+    return MAX(min_cluster_size,
-+               MAX(BLOCK_COPY_CLUSTER_SIZE_DEFAULT, bdi.cluster_size));
- }
- BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
-                                      BlockDriverState *copy_bitmap_bs,
-                                      const BdrvDirtyBitmap *bitmap,
-                                      bool discard_source,
-+                                     int64_t min_cluster_size,
-                                      Error **errp)
- {
-     ERRP_GUARD();
-@@ -358,7 +361,13 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
-     BdrvDirtyBitmap *copy_bitmap;
-     bool is_fleecing;
--    cluster_size = block_copy_calculate_cluster_size(target->bs, errp);
-+    if (min_cluster_size && !is_power_of_2(min_cluster_size)) {
-+        error_setg(errp, "min-cluster-size needs to be a power of 2");
-+        return NULL;
-+    }
-+
-+    cluster_size = block_copy_calculate_cluster_size(target->bs,
-+                                                     min_cluster_size, errp);
-     if (cluster_size < 0) {
-         return NULL;
-     }
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index 3503702d71..4a8c5bdb62 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -479,7 +479,8 @@ static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
-     s->discard_source = flags & BDRV_O_CBW_DISCARD_SOURCE;
-     s->bcs = block_copy_state_new(bs->file, s->target, bs, bitmap,
--                                  flags & BDRV_O_CBW_DISCARD_SOURCE, errp);
-+                                  flags & BDRV_O_CBW_DISCARD_SOURCE,
-+                                  opts->min_cluster_size, errp);
-     if (!s->bcs) {
-         error_prepend(errp, "Cannot create block-copy-state: ");
-         ret = -EINVAL;
-diff --git a/include/block/block-copy.h b/include/block/block-copy.h
-index bdc703bacd..77857c6c68 100644
---- a/include/block/block-copy.h
-+++ b/include/block/block-copy.h
-@@ -28,6 +28,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
-                                      BlockDriverState *copy_bitmap_bs,
-                                      const BdrvDirtyBitmap *bitmap,
-                                      bool discard_source,
-+                                     int64_t min_cluster_size,
-                                      Error **errp);
- /* Function should be called prior any actual copy request */
-diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 4297e5beda..33e7e3c090 100644
---- a/qapi/block-core.json
-+++ b/qapi/block-core.json
-@@ -4825,12 +4825,18 @@
- #     @on-cbw-error parameter will decide how this failure is handled.
- #     Default 0. (Since 7.1)
- #
-+# @min-cluster-size: Minimum size of blocks used by copy-before-write
-+#     operations.  Has to be a power of 2.  No effect if smaller than
-+#     the maximum of the target's cluster size and 64 KiB.  Default 0.
-+#     (Since 8.1)
-+#
- # Since: 6.2
- ##
- { 'struct': 'BlockdevOptionsCbw',
-   'base': 'BlockdevOptionsGenericFormat',
-   'data': { 'target': 'BlockdevRef', '*bitmap': 'BlockDirtyBitmap',
--            '*on-cbw-error': 'OnCbwError', '*cbw-timeout': 'uint32' } }
-+            '*on-cbw-error': 'OnCbwError', '*cbw-timeout': 'uint32',
-+            '*min-cluster-size': 'uint32' } }
- ##
- # @BlockdevOptions:
diff --git a/debian/patches/pve/0051-PVE-backup-improve-error-when-copy-before-write-fail.patch b/debian/patches/pve/0051-PVE-backup-improve-error-when-copy-before-write-fail.patch
new file mode 100644 (file)
index 0000000..c7f2ccb
--- /dev/null
@@ -0,0 +1,117 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fiona Ebner <f.ebner@proxmox.com>
+Date: Mon, 29 Apr 2024 14:43:58 +0200
+Subject: [PATCH] PVE backup: improve error when copy-before-write fails for
+ fleecing
+
+With fleecing, failure for copy-before-write does not fail the guest
+write, but only sets the snapshot error that is associated to the
+copy-before-write filter, making further requests to the snapshot
+access fail with EACCES, which then also fails the job. But that error
+code is not the root cause of why the backup failed, so bubble up the
+original snapshot error instead.
+
+Reported-by: Friedrich Weber <f.weber@proxmox.com>
+Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
+Tested-by: Friedrich Weber <f.weber@proxmox.com>
+---
+ block/copy-before-write.c | 18 ++++++++++++------
+ block/copy-before-write.h |  1 +
+ pve-backup.c              |  9 +++++++++
+ 3 files changed, 22 insertions(+), 6 deletions(-)
+
+diff --git a/block/copy-before-write.c b/block/copy-before-write.c
+index bba58326d7..50cc4c7aae 100644
+--- a/block/copy-before-write.c
++++ b/block/copy-before-write.c
+@@ -27,6 +27,7 @@
+ #include "qapi/qmp/qjson.h"
+ #include "sysemu/block-backend.h"
++#include "qemu/atomic.h"
+ #include "qemu/cutils.h"
+ #include "qapi/error.h"
+ #include "block/block_int.h"
+@@ -74,7 +75,8 @@ typedef struct BDRVCopyBeforeWriteState {
+      * @snapshot_error is normally zero. But on first copy-before-write failure
+      * when @on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT, @snapshot_error takes
+      * value of this error (<0). After that all in-flight and further
+-     * snapshot-API requests will fail with that error.
++     * snapshot-API requests will fail with that error. To be accessed with
++     * atomics.
+      */
+     int snapshot_error;
+ } BDRVCopyBeforeWriteState;
+@@ -114,7 +116,7 @@ static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs,
+         return 0;
+     }
+-    if (s->snapshot_error) {
++    if (qatomic_read(&s->snapshot_error)) {
+         return 0;
+     }
+@@ -138,9 +140,7 @@ static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs,
+     WITH_QEMU_LOCK_GUARD(&s->lock) {
+         if (ret < 0) {
+             assert(s->on_cbw_error == ON_CBW_ERROR_BREAK_SNAPSHOT);
+-            if (!s->snapshot_error) {
+-                s->snapshot_error = ret;
+-            }
++            qatomic_cmpxchg(&s->snapshot_error, 0, ret);
+         } else {
+             bdrv_set_dirty_bitmap(s->done_bitmap, off, end - off);
+         }
+@@ -214,7 +214,7 @@ cbw_snapshot_read_lock(BlockDriverState *bs, int64_t offset, int64_t bytes,
+     QEMU_LOCK_GUARD(&s->lock);
+-    if (s->snapshot_error) {
++    if (qatomic_read(&s->snapshot_error)) {
+         g_free(req);
+         return NULL;
+     }
+@@ -585,6 +585,12 @@ void bdrv_cbw_drop(BlockDriverState *bs)
+     bdrv_unref(bs);
+ }
++int bdrv_cbw_snapshot_error(BlockDriverState *bs)
++{
++    BDRVCopyBeforeWriteState *s = bs->opaque;
++    return qatomic_read(&s->snapshot_error);
++}
++
+ static void cbw_init(void)
+ {
+     bdrv_register(&bdrv_cbw_filter);
+diff --git a/block/copy-before-write.h b/block/copy-before-write.h
+index dc6cafe7fa..a27d2d7d9f 100644
+--- a/block/copy-before-write.h
++++ b/block/copy-before-write.h
+@@ -44,5 +44,6 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
+                                   BlockCopyState **bcs,
+                                   Error **errp);
+ void bdrv_cbw_drop(BlockDriverState *bs);
++int bdrv_cbw_snapshot_error(BlockDriverState *bs);
+ #endif /* COPY_BEFORE_WRITE_H */
+diff --git a/pve-backup.c b/pve-backup.c
+index 7cc1dd3724..07709aa350 100644
+--- a/pve-backup.c
++++ b/pve-backup.c
+@@ -379,6 +379,15 @@ static void pvebackup_complete_cb(void *opaque, int ret)
+         di->fleecing.snapshot_access = NULL;
+     }
+     if (di->fleecing.cbw) {
++        /*
++         * With fleecing, failure for cbw does not fail the guest write, but only sets the snapshot
++         * error, making further requests to the snapshot fail with EACCES, which then also fail the
++         * job. But that code is not the root cause and just confusing, so update it.
++         */
++        int snapshot_error = bdrv_cbw_snapshot_error(di->fleecing.cbw);
++        if (di->completed_ret == -EACCES && snapshot_error) {
++            di->completed_ret = snapshot_error;
++        }
+         bdrv_cbw_drop(di->fleecing.cbw);
+         di->fleecing.cbw = NULL;
+     }
diff --git a/debian/patches/pve/0051-backup-add-minimum-cluster-size-to-performance-optio.patch b/debian/patches/pve/0051-backup-add-minimum-cluster-size-to-performance-optio.patch
deleted file mode 100644 (file)
index d760e45..0000000
+++ /dev/null
@@ -1,106 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Thu, 11 Apr 2024 11:29:27 +0200
-Subject: [PATCH] backup: add minimum cluster size to performance options
-
-Useful to make discard-source work in the context of backup fleecing
-when the fleecing image has a larger granularity than the backup
-target.
-
-Backup/block-copy will use at least this granularity for copy operations
-and in particular, discard requests to the backup source will too. If
-the granularity is too small, they will just be aligned down in
-cbw_co_pdiscard_snapshot() and thus effectively ignored.
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/backup.c            | 2 +-
- block/copy-before-write.c | 2 ++
- block/copy-before-write.h | 1 +
- blockdev.c                | 3 +++
- qapi/block-core.json      | 9 +++++++--
- 5 files changed, 14 insertions(+), 3 deletions(-)
-
-diff --git a/block/backup.c b/block/backup.c
-index 3dc955f625..ac5bd81338 100644
---- a/block/backup.c
-+++ b/block/backup.c
-@@ -430,7 +430,7 @@ BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
-     }
-     cbw = bdrv_cbw_append(bs, target, filter_node_name, discard_source,
--                          &bcs, errp);
-+                          perf->min_cluster_size, &bcs, errp);
-     if (!cbw) {
-         goto error;
-     }
-diff --git a/block/copy-before-write.c b/block/copy-before-write.c
-index 4a8c5bdb62..9ca5ec5e5c 100644
---- a/block/copy-before-write.c
-+++ b/block/copy-before-write.c
-@@ -555,6 +555,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
-                                   BlockDriverState *target,
-                                   const char *filter_node_name,
-                                   bool discard_source,
-+                                  int64_t min_cluster_size,
-                                   BlockCopyState **bcs,
-                                   Error **errp)
- {
-@@ -573,6 +574,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
-     }
-     qdict_put_str(opts, "file", bdrv_get_node_name(source));
-     qdict_put_str(opts, "target", bdrv_get_node_name(target));
-+    qdict_put_int(opts, "min-cluster-size", min_cluster_size);
-     top = bdrv_insert_node(source, opts, flags, errp);
-     if (!top) {
-diff --git a/block/copy-before-write.h b/block/copy-before-write.h
-index 01af0cd3c4..dc6cafe7fa 100644
---- a/block/copy-before-write.h
-+++ b/block/copy-before-write.h
-@@ -40,6 +40,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
-                                   BlockDriverState *target,
-                                   const char *filter_node_name,
-                                   bool discard_source,
-+                                  int64_t min_cluster_size,
-                                   BlockCopyState **bcs,
-                                   Error **errp);
- void bdrv_cbw_drop(BlockDriverState *bs);
-diff --git a/blockdev.c b/blockdev.c
-index ce3fef924c..5ae1dde73c 100644
---- a/blockdev.c
-+++ b/blockdev.c
-@@ -2729,6 +2729,9 @@ static BlockJob *do_backup_common(BackupCommon *backup,
-         if (backup->x_perf->has_max_chunk) {
-             perf.max_chunk = backup->x_perf->max_chunk;
-         }
-+        if (backup->x_perf->has_min_cluster_size) {
-+            perf.min_cluster_size = backup->x_perf->min_cluster_size;
-+        }
-     }
-     if ((backup->sync == MIRROR_SYNC_MODE_BITMAP) ||
-diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 33e7e3c090..58fd637e86 100644
---- a/qapi/block-core.json
-+++ b/qapi/block-core.json
-@@ -1757,11 +1757,16 @@
- #     it should not be less than job cluster size which is calculated
- #     as maximum of target image cluster size and 64k.  Default 0.
- #
-+# @min-cluster-size: Minimum size of blocks used by copy-before-write
-+#     and background copy operations.  Has to be a power of 2.  No
-+#     effect if smaller than the maximum of the target's cluster size
-+#     and 64 KiB.  Default 0. (Since 8.1)
-+#
- # Since: 6.0
- ##
- { 'struct': 'BackupPerf',
--  'data': { '*use-copy-range': 'bool',
--            '*max-workers': 'int', '*max-chunk': 'int64' } }
-+  'data': { '*use-copy-range': 'bool', '*max-workers': 'int',
-+            '*max-chunk': 'int64', '*min-cluster-size': 'uint32' } }
- ##
- # @BackupCommon:
diff --git a/debian/patches/pve/0052-PVE-backup-add-fleecing-option.patch b/debian/patches/pve/0052-PVE-backup-add-fleecing-option.patch
deleted file mode 100644 (file)
index a4d78cd..0000000
+++ /dev/null
@@ -1,335 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Fiona Ebner <f.ebner@proxmox.com>
-Date: Thu, 11 Apr 2024 11:29:28 +0200
-Subject: [PATCH] PVE backup: add fleecing option
-
-When a fleecing option is given, it is expected that each device has
-a corresponding "-fleecing" block device already attached, except for
-EFI disk and TPM state, where fleecing is never used.
-
-The following graph was adapted from [0] which also contains more
-details about fleecing.
-
-[guest]
-   |
-   | root
-   v                 file
-[copy-before-write]<------[snapshot-access]
-   |           |
-   | file      | target
-   v           v
-[source] [fleecing]
-
-For fleecing, a copy-before-write filter is inserted on top of the
-source node, as well as a snapshot-access node pointing to the filter
-node which allows to read the consistent state of the image at the
-time it was inserted. New guest writes are passed through the
-copy-before-write filter which will first copy over old data to the
-fleecing image in case that old data is still needed by the
-snapshot-access node.
-
-The backup process will sequentially read from the snapshot access,
-which has a bitmap and knows whether to read from the original image
-or the fleecing image to get the "snapshot" state, i.e. data from the
-source image at the time when the copy-before-write filter was
-inserted. After reading, the copied sections are discarded from the
-fleecing image to reduce space usage.
-
-All of this can be restricted by an initial dirty bitmap to parts of
-the source image that are required for an incremental backup.
-
-For discard to work, it is necessary that the fleecing image does not
-have a larger cluster size than the backup job granularity. Since
-querying that size does not always work, e.g. for RBD with krbd, the
-cluster size will not be reported, a minimum of 4 MiB is used. A job
-with PBS target already has at least this granularity, so it's just
-relevant for other targets. I.e. edge cases where this minimum is not
-enough should be very rare in practice. If ever necessary in the
-future, can still add a passed-in value for the backup QMP command to
-override.
-
-Additionally, the cbw-timeout and on-cbw-error=break-snapshot options
-are set when installing the copy-before-write filter and
-snapshot-access. When an error or timeout occurs, the problematic (and
-each further) snapshot operation will fail and thus cancel the backup
-instead of breaking the guest write.
-
-Note that job_id cannot be inferred from the snapshot-access bs because
-it has no parent, so just pass the one from the original bs.
-
-[0]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg876056.html
-
-Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- block/monitor/block-hmp-cmds.c |   1 +
- pve-backup.c                   | 143 ++++++++++++++++++++++++++++++++-
- qapi/block-core.json           |   8 +-
- 3 files changed, 148 insertions(+), 4 deletions(-)
-
-diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
-index 6efe28cef5..ca29cc4281 100644
---- a/block/monitor/block-hmp-cmds.c
-+++ b/block/monitor/block-hmp-cmds.c
-@@ -1064,6 +1064,7 @@ void coroutine_fn hmp_backup(Monitor *mon, const QDict *qdict)
-         NULL, NULL,
-         devlist, qdict_haskey(qdict, "speed"), speed,
-         false, 0, // BackupPerf max-workers
-+        false, false, // fleecing
-         &error);
-     hmp_handle_error(mon, error);
-diff --git a/pve-backup.c b/pve-backup.c
-index e6b17b797e..00aaff6509 100644
---- a/pve-backup.c
-+++ b/pve-backup.c
-@@ -7,8 +7,10 @@
- #include "sysemu/blockdev.h"
- #include "block/block_int-global-state.h"
- #include "block/blockjob.h"
-+#include "block/copy-before-write.h"
- #include "block/dirty-bitmap.h"
- #include "qapi/qapi-commands-block.h"
-+#include "qapi/qmp/qdict.h"
- #include "qapi/qmp/qerror.h"
- #include "qemu/cutils.h"
-@@ -80,8 +82,15 @@ static void pvebackup_init(void)
- // initialize PVEBackupState at startup
- opts_init(pvebackup_init);
-+typedef struct PVEBackupFleecingInfo {
-+    BlockDriverState *bs;
-+    BlockDriverState *cbw;
-+    BlockDriverState *snapshot_access;
-+} PVEBackupFleecingInfo;
-+
- typedef struct PVEBackupDevInfo {
-     BlockDriverState *bs;
-+    PVEBackupFleecingInfo fleecing;
-     size_t size;
-     uint64_t block_size;
-     uint8_t dev_id;
-@@ -361,6 +370,25 @@ static void pvebackup_complete_cb(void *opaque, int ret)
-     PVEBackupDevInfo *di = opaque;
-     di->completed_ret = ret;
-+    /*
-+     * Handle block-graph specific cleanup (for fleecing) outside of the coroutine, because the work
-+     * won't be done as a coroutine anyways:
-+     * - For snapshot_access, allows doing bdrv_unref() directly. Doing it via bdrv_co_unref() would
-+     *   just spawn a BH calling bdrv_unref().
-+     * - For cbw, draining would need to spawn a BH.
-+     *
-+     * Note that the AioContext lock is already acquired by our caller, i.e.
-+     * job_finalize_single_locked()
-+     */
-+    if (di->fleecing.snapshot_access) {
-+        bdrv_unref(di->fleecing.snapshot_access);
-+        di->fleecing.snapshot_access = NULL;
-+    }
-+    if (di->fleecing.cbw) {
-+        bdrv_cbw_drop(di->fleecing.cbw);
-+        di->fleecing.cbw = NULL;
-+    }
-+
-     /*
-      * Schedule stream cleanup in async coroutine. close_image and finish might
-      * take a while, so we can't block on them here. This way it also doesn't
-@@ -521,9 +549,82 @@ static void create_backup_jobs_bh(void *opaque) {
-         bdrv_drained_begin(di->bs);
-+        BackupPerf perf = (BackupPerf){ .max_workers = backup_state.perf.max_workers };
-+
-+        BlockDriverState *source_bs = di->bs;
-+        bool discard_source = false;
-+        const char *job_id = bdrv_get_device_name(di->bs);
-+        if (di->fleecing.bs) {
-+            QDict *cbw_opts = qdict_new();
-+            qdict_put_str(cbw_opts, "driver", "copy-before-write");
-+            qdict_put_str(cbw_opts, "file", bdrv_get_node_name(di->bs));
-+            qdict_put_str(cbw_opts, "target", bdrv_get_node_name(di->fleecing.bs));
-+
-+            if (di->bitmap) {
-+                /*
-+                 * Only guest writes to parts relevant for the backup need to be intercepted with
-+                 * old data being copied to the fleecing image.
-+                 */
-+                qdict_put_str(cbw_opts, "bitmap.node", bdrv_get_node_name(di->bs));
-+                qdict_put_str(cbw_opts, "bitmap.name", bdrv_dirty_bitmap_name(di->bitmap));
-+            }
-+            /*
-+             * Fleecing storage is supposed to be fast and it's better to break backup than guest
-+             * writes. Certain guest drivers like VirtIO-win have 60 seconds timeout by default, so
-+             * abort a bit before that.
-+             */
-+            qdict_put_str(cbw_opts, "on-cbw-error", "break-snapshot");
-+            qdict_put_int(cbw_opts, "cbw-timeout", 45);
-+
-+            di->fleecing.cbw = bdrv_insert_node(di->bs, cbw_opts, BDRV_O_RDWR, &local_err);
-+
-+            if (!di->fleecing.cbw) {
-+                error_setg(errp, "appending cbw node for fleecing failed: %s",
-+                           local_err ? error_get_pretty(local_err) : "unknown error");
-+                break;
-+            }
-+
-+            QDict *snapshot_access_opts = qdict_new();
-+            qdict_put_str(snapshot_access_opts, "driver", "snapshot-access");
-+            qdict_put_str(snapshot_access_opts, "file", bdrv_get_node_name(di->fleecing.cbw));
-+
-+            /*
-+             * Holding the AioContext lock here would cause a deadlock, because bdrv_open_driver()
-+             * will aquire it a second time. But it's allowed to be held exactly once when polling
-+             * and that happens when the bdrv_refresh_total_sectors() call is made there.
-+             */
-+            aio_context_release(aio_context);
-+            di->fleecing.snapshot_access =
-+                bdrv_open(NULL, NULL, snapshot_access_opts, BDRV_O_RDWR | BDRV_O_UNMAP, &local_err);
-+            aio_context_acquire(aio_context);
-+            if (!di->fleecing.snapshot_access) {
-+                error_setg(errp, "setting up snapshot access for fleecing failed: %s",
-+                           local_err ? error_get_pretty(local_err) : "unknown error");
-+                break;
-+            }
-+            source_bs = di->fleecing.snapshot_access;
-+            discard_source = true;
-+
-+            /*
-+             * bdrv_get_info() just retuns 0 (= doesn't matter) for RBD when using krbd. But discard
-+             * on the fleecing image won't work if the backup job's granularity is less than the RBD
-+             * object size (default 4 MiB), so it does matter. Always use at least 4 MiB. With a PBS
-+             * target, the backup job granularity would already be at least this much.
-+             */
-+            perf.min_cluster_size = 4 * 1024 * 1024;
-+            /*
-+             * For discard to work, cluster size for the backup job must be at least the same as for
-+             * the fleecing image.
-+             */
-+            BlockDriverInfo bdi;
-+            if (bdrv_get_info(di->fleecing.bs, &bdi) >= 0) {
-+                perf.min_cluster_size = MAX(perf.min_cluster_size, bdi.cluster_size);
-+            }
-+        }
-+
-         BlockJob *job = backup_job_create(
--            NULL, di->bs, di->target, backup_state.speed, sync_mode, di->bitmap,
--            bitmap_mode, false, NULL, &backup_state.perf, BLOCKDEV_ON_ERROR_REPORT,
-+            job_id, source_bs, di->target, backup_state.speed, sync_mode, di->bitmap,
-+            bitmap_mode, false, discard_source, NULL, &perf, BLOCKDEV_ON_ERROR_REPORT,
-             BLOCKDEV_ON_ERROR_REPORT, JOB_DEFAULT, pvebackup_complete_cb, di, backup_state.txn,
-             &local_err);
-@@ -581,6 +682,14 @@ static void create_backup_jobs_bh(void *opaque) {
-     aio_co_enter(data->ctx, data->co);
- }
-+/*
-+ * EFI disk and TPM state are small and it's just not worth setting up fleecing for them.
-+ */
-+static bool device_uses_fleecing(const char *device_id)
-+{
-+    return strncmp(device_id, "drive-efidisk", 13) && strncmp(device_id, "drive-tpmstate", 14);
-+}
-+
- /*
-  * Returns a list of device infos, which needs to be freed by the caller. In
-  * case of an error, errp will be set, but the returned value might still be a
-@@ -588,6 +697,7 @@ static void create_backup_jobs_bh(void *opaque) {
-  */
- static GList coroutine_fn *get_device_info(
-     const char *devlist,
-+    bool fleecing,
-     Error **errp)
- {
-     gchar **devs = NULL;
-@@ -611,6 +721,31 @@ static GList coroutine_fn *get_device_info(
-             }
-             PVEBackupDevInfo *di = g_new0(PVEBackupDevInfo, 1);
-             di->bs = bs;
-+
-+            if (fleecing && device_uses_fleecing(*d)) {
-+                g_autofree gchar *fleecing_devid = g_strconcat(*d, "-fleecing", NULL);
-+                BlockBackend *fleecing_blk = blk_by_name(fleecing_devid);
-+                if (!fleecing_blk) {
-+                    error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
-+                              "Device '%s' not found", fleecing_devid);
-+                    goto err;
-+                }
-+                BlockDriverState *fleecing_bs = blk_bs(fleecing_blk);
-+                if (!bdrv_co_is_inserted(fleecing_bs)) {
-+                    error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, fleecing_devid);
-+                    goto err;
-+                }
-+                /*
-+                 * Fleecing image needs to be the same size to act as a cbw target.
-+                 */
-+                if (bs->total_sectors != fleecing_bs->total_sectors) {
-+                    error_setg(errp, "Size mismatch for '%s' - sector count %ld != %ld",
-+                               fleecing_devid, fleecing_bs->total_sectors, bs->total_sectors);
-+                    goto err;
-+                }
-+                di->fleecing.bs = fleecing_bs;
-+            }
-+
-             di_list = g_list_append(di_list, di);
-             d++;
-         }
-@@ -660,6 +795,7 @@ UuidInfo coroutine_fn *qmp_backup(
-     const char *devlist,
-     bool has_speed, int64_t speed,
-     bool has_max_workers, int64_t max_workers,
-+    bool has_fleecing, bool fleecing,
-     Error **errp)
- {
-     assert(qemu_in_coroutine());
-@@ -687,7 +823,7 @@ UuidInfo coroutine_fn *qmp_backup(
-     /* Todo: try to auto-detect format based on file name */
-     format = has_format ? format : BACKUP_FORMAT_VMA;
--    di_list = get_device_info(devlist, &local_err);
-+    di_list = get_device_info(devlist, has_fleecing && fleecing, &local_err);
-     if (local_err) {
-         error_propagate(errp, local_err);
-         goto err;
-@@ -1086,5 +1222,6 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
-     ret->query_bitmap_info = true;
-     ret->pbs_masterkey = true;
-     ret->backup_max_workers = true;
-+    ret->backup_fleecing = true;
-     return ret;
- }
-diff --git a/qapi/block-core.json b/qapi/block-core.json
-index 58fd637e86..0bc5f42677 100644
---- a/qapi/block-core.json
-+++ b/qapi/block-core.json
-@@ -933,6 +933,10 @@
- #
- # @max-workers: see @BackupPerf for details. Default 16.
- #
-+# @fleecing: perform a backup with fleecing. For each device in @devlist, a
-+#            corresponing '-fleecing' device with the same size already needs to
-+#            be present.
-+#
- # Returns: the uuid of the backup job
- #
- ##
-@@ -953,7 +957,8 @@
-                                     '*firewall-file': 'str',
-                                     '*devlist': 'str',
-                                     '*speed': 'int',
--                                    '*max-workers': 'int' },
-+                                    '*max-workers': 'int',
-+                                    '*fleecing': 'bool' },
-   'returns': 'UuidInfo', 'coroutine': true }
- ##
-@@ -1009,6 +1014,7 @@
-             'pbs-dirty-bitmap-migration': 'bool',
-             'pbs-masterkey': 'bool',
-             'pbs-library-version': 'str',
-+            'backup-fleecing': 'bool',
-             'backup-max-workers': 'bool' } }
- ##
index 1680bca4eddbe2960c379bea53ae5ab256ff7b3b..b97881e0e695b43a39ea878422091795d43b2775 100644 (file)
@@ -1,16 +1,8 @@
 extra/0001-monitor-qmp-fix-race-with-clients-disconnecting-earl.patch
 extra/0002-scsi-megasas-Internal-cdbs-have-16-byte-length.patch
 extra/0003-ide-avoid-potential-deadlock-when-draining-during-tr.patch
-extra/0004-migration-block-dirty-bitmap-fix-loading-bitmap-when.patch
-extra/0005-Revert-Revert-graph-lock-Disable-locking-for-now.patch
-extra/0006-migration-states-workaround-snapshot-performance-reg.patch
-extra/0007-Revert-x86-acpi-workaround-Windows-not-handling-name.patch
-extra/0008-target-i386-the-sgx_epc_get_section-stub-is-reachabl.patch
-extra/0009-ui-clipboard-mark-type-as-not-available-when-there-i.patch
-extra/0010-virtio-scsi-Attach-event-vq-notifier-with-no_poll.patch
-extra/0011-virtio-Re-enable-notifications-after-drain.patch
-extra/0012-qemu_init-increase-NOFILE-soft-limit-on-POSIX.patch
-extra/0013-virtio-blk-avoid-using-ioeventfd-state-in-irqfd-cond.patch
+extra/0004-Revert-x86-acpi-workaround-Windows-not-handling-name.patch
+extra/0005-block-copy-before-write-use-uint64_t-for-timeout-in-.patch
 bitmap-mirror/0001-drive-mirror-add-support-for-sync-bitmap-mode-never.patch
 bitmap-mirror/0002-drive-mirror-add-support-for-conditional-and-always-.patch
 bitmap-mirror/0003-mirror-add-check-for-bitmap-mode-without-bitmap.patch
@@ -54,18 +46,17 @@ pve/0034-PVE-Migrate-dirty-bitmap-state-via-savevm.patch
 pve/0035-migration-block-dirty-bitmap-migrate-other-bitmaps-e.patch
 pve/0036-PVE-fall-back-to-open-iscsi-initiatorname.patch
 pve/0037-PVE-block-stream-increase-chunk-size.patch
-pve/0038-block-io-accept-NULL-qiov-in-bdrv_pad_request.patch
-pve/0039-block-add-alloc-track-driver.patch
-pve/0040-Revert-block-rbd-workaround-for-ceph-issue-53784.patch
-pve/0041-Revert-block-rbd-fix-handling-of-holes-in-.bdrv_co_b.patch
-pve/0042-Revert-block-rbd-implement-bdrv_co_block_status.patch
-pve/0043-alloc-track-fix-deadlock-during-drop.patch
-pve/0044-migration-for-snapshots-hold-the-BQL-during-setup-ca.patch
-pve/0045-savevm-async-don-t-hold-BQL-during-setup.patch
-pve/0046-block-copy-before-write-fix-permission.patch
-pve/0047-block-copy-before-write-support-unligned-snapshot-di.patch
-pve/0048-block-copy-before-write-create-block_copy-bitmap-in-.patch
-pve/0049-qapi-blockdev-backup-add-discard-source-parameter.patch
-pve/0050-copy-before-write-allow-specifying-minimum-cluster-s.patch
-pve/0051-backup-add-minimum-cluster-size-to-performance-optio.patch
-pve/0052-PVE-backup-add-fleecing-option.patch
+pve/0038-block-add-alloc-track-driver.patch
+pve/0039-Revert-block-rbd-workaround-for-ceph-issue-53784.patch
+pve/0040-Revert-block-rbd-fix-handling-of-holes-in-.bdrv_co_b.patch
+pve/0041-Revert-block-rbd-implement-bdrv_co_block_status.patch
+pve/0042-alloc-track-error-out-when-auto-remove-is-not-set.patch
+pve/0043-alloc-track-avoid-seemingly-superfluous-child-permis.patch
+pve/0044-block-copy-before-write-fix-permission.patch
+pve/0045-block-copy-before-write-support-unligned-snapshot-di.patch
+pve/0046-block-copy-before-write-create-block_copy-bitmap-in-.patch
+pve/0047-qapi-blockdev-backup-add-discard-source-parameter.patch
+pve/0048-copy-before-write-allow-specifying-minimum-cluster-s.patch
+pve/0049-backup-add-minimum-cluster-size-to-performance-optio.patch
+pve/0050-PVE-backup-add-fleecing-option.patch
+pve/0051-PVE-backup-improve-error-when-copy-before-write-fail.patch
index 79c962eeb50ac6ff10dc6292f8981efe0dd6def3..3492c991a8d38778b955b87534a7c811fc314f1b 100644 (file)
@@ -1 +1,2 @@
 source-is-missing [roms/SLOF/*.oco]
+source-is-missing [linux-user/*/vdso-*.so]
diff --git a/qemu b/qemu
index 20a1b341a0af1fef84cec9e521d33da0e8d9ecf3..c25df57ae8f9fe1c72eee2dab37d76d904ac382e 160000 (submodule)
--- a/qemu
+++ b/qemu
@@ -1 +1 @@
-Subproject commit 20a1b341a0af1fef84cec9e521d33da0e8d9ecf3
+Subproject commit c25df57ae8f9fe1c72eee2dab37d76d904ac382e