debian/patches/pve/0052-PVE-backup-add-fleecing-option.patch

   1 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
   2 From: Fiona Ebner <f.ebner@proxmox.com>
   3 Date: Thu, 11 Apr 2024 11:29:28 +0200
   4 Subject: [PATCH] PVE backup: add fleecing option
   5
   6 When a fleecing option is given, it is expected that each device has
   7 a corresponding "-fleecing" block device already attached, except for
   8 EFI disk and TPM state, where fleecing is never used.
   9
  10 The following graph was adapted from [0] which also contains more
  11 details about fleecing.
  12
  13 [guest]
  14    |
  15    | root
  16    v                 file
  17 [copy-before-write]<------[snapshot-access]
  18    |           |
  19    | file      | target
  20    v           v
  21 [source] [fleecing]
  22
  23 For fleecing, a copy-before-write filter is inserted on top of the
  24 source node, as well as a snapshot-access node pointing to the filter
  25 node which allows reading the consistent state of the image at the
  26 time it was inserted. New guest writes are passed through the
  27 copy-before-write filter which will first copy over old data to the
  28 fleecing image in case that old data is still needed by the
  29 snapshot-access node.
  30
  31 The backup process will sequentially read from the snapshot access,
  32 which has a bitmap and knows whether to read from the original image
  33 or the fleecing image to get the "snapshot" state, i.e. data from the
  34 source image at the time when the copy-before-write filter was
  35 inserted. After reading, the copied sections are discarded from the
  36 fleecing image to reduce space usage.
  37
  38 All of this can be restricted by an initial dirty bitmap to parts of
  39 the source image that are required for an incremental backup.
  40
  41 For discard to work, it is necessary that the fleecing image does not
  42 have a larger cluster size than the backup job granularity. Querying
  43 that size does not always work (e.g. for RBD with krbd, the cluster
  44 size is not reported), so a minimum of 4 MiB is used. A job with a
  45 PBS target already has at least this granularity, so this is only
  46 relevant for other targets, i.e. edge cases where this minimum is not
  47 enough should be very rare in practice. If ever necessary in the
  48 future, a passed-in value for the backup QMP command can still be
  49 added as an override.
  50
  51 Additionally, the cbw-timeout and on-cbw-error=break-snapshot options
  52 are set when installing the copy-before-write filter and
  53 snapshot-access. When an error or timeout occurs, the problematic (and
  54 each further) snapshot operation will fail and thus cancel the backup
  55 instead of breaking the guest write.
  56
  57 Note that job_id cannot be inferred from the snapshot-access bs because
  58 it has no parent, so just pass the one from the original bs.
  59
  60 [0]: https://www.mail-archive.com/qemu-devel@nongnu.org/msg876056.html
  61
  62 Signed-off-by: Fiona Ebner <f.ebner@proxmox.com>
  63 Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  64 ---
  65  block/monitor/block-hmp-cmds.c |   1 +
  66  pve-backup.c                   | 143 ++++++++++++++++++++++++++++++++-
  67  qapi/block-core.json           |   8 +-
  68  3 files changed, 148 insertions(+), 4 deletions(-)
  69
  70 diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
  71 index 6efe28cef5..ca29cc4281 100644
  72 --- a/block/monitor/block-hmp-cmds.c
  73 +++ b/block/monitor/block-hmp-cmds.c
  74 @@ -1064,6 +1064,7 @@ void coroutine_fn hmp_backup(Monitor *mon, const QDict *qdict)
  75          NULL, NULL,
  76          devlist, qdict_haskey(qdict, "speed"), speed,
  77          false, 0, // BackupPerf max-workers
  78 +        false, false, // fleecing
  79          &error);
  80
  81      hmp_handle_error(mon, error);
  82 diff --git a/pve-backup.c b/pve-backup.c
  83 index e6b17b797e..00aaff6509 100644
  84 --- a/pve-backup.c
  85 +++ b/pve-backup.c
  86 @@ -7,8 +7,10 @@
  87  #include "sysemu/blockdev.h"
  88  #include "block/block_int-global-state.h"
  89  #include "block/blockjob.h"
  90 +#include "block/copy-before-write.h"
  91  #include "block/dirty-bitmap.h"
  92  #include "qapi/qapi-commands-block.h"
  93 +#include "qapi/qmp/qdict.h"
  94  #include "qapi/qmp/qerror.h"
  95  #include "qemu/cutils.h"
  96
  97 @@ -80,8 +82,15 @@ static void pvebackup_init(void)
  98  // initialize PVEBackupState at startup
  99  opts_init(pvebackup_init);
 100
 101 +typedef struct PVEBackupFleecingInfo {
 102 +    BlockDriverState *bs;
 103 +    BlockDriverState *cbw;
 104 +    BlockDriverState *snapshot_access;
 105 +} PVEBackupFleecingInfo;
 106 +
 107  typedef struct PVEBackupDevInfo {
 108      BlockDriverState *bs;
 109 +    PVEBackupFleecingInfo fleecing;
 110      size_t size;
 111      uint64_t block_size;
 112      uint8_t dev_id;
 113 @@ -361,6 +370,25 @@ static void pvebackup_complete_cb(void *opaque, int ret)
 114      PVEBackupDevInfo *di = opaque;
 115      di->completed_ret = ret;
 116
 117 +    /*
 118 +     * Handle block-graph specific cleanup (for fleecing) outside of the coroutine, because the work
 119 +     * won't be done as a coroutine anyways:
 120 +     * - For snapshot_access, allows doing bdrv_unref() directly. Doing it via bdrv_co_unref() would
 121 +     *   just spawn a BH calling bdrv_unref().
 122 +     * - For cbw, draining would need to spawn a BH.
 123 +     *
 124 +     * Note that the AioContext lock is already acquired by our caller, i.e.
 125 +     * job_finalize_single_locked()
 126 +     */
 127 +    if (di->fleecing.snapshot_access) {
 128 +        bdrv_unref(di->fleecing.snapshot_access);
 129 +        di->fleecing.snapshot_access = NULL;
 130 +    }
 131 +    if (di->fleecing.cbw) {
 132 +        bdrv_cbw_drop(di->fleecing.cbw);
 133 +        di->fleecing.cbw = NULL;
 134 +    }
 135 +
 136      /*
 137       * Schedule stream cleanup in async coroutine. close_image and finish might
 138       * take a while, so we can't block on them here. This way it also doesn't
 139 @@ -521,9 +549,82 @@ static void create_backup_jobs_bh(void *opaque) {
 140
 141          bdrv_drained_begin(di->bs);
 142
 143 +        BackupPerf perf = (BackupPerf){ .max_workers = backup_state.perf.max_workers };
 144 +
 145 +        BlockDriverState *source_bs = di->bs;
 146 +        bool discard_source = false;
 147 +        const char *job_id = bdrv_get_device_name(di->bs);
 148 +        if (di->fleecing.bs) {
 149 +            QDict *cbw_opts = qdict_new();
 150 +            qdict_put_str(cbw_opts, "driver", "copy-before-write");
 151 +            qdict_put_str(cbw_opts, "file", bdrv_get_node_name(di->bs));
 152 +            qdict_put_str(cbw_opts, "target", bdrv_get_node_name(di->fleecing.bs));
 153 +
 154 +            if (di->bitmap) {
 155 +                /*
 156 +                 * Only guest writes to parts relevant for the backup need to be intercepted with
 157 +                 * old data being copied to the fleecing image.
 158 +                 */
 159 +                qdict_put_str(cbw_opts, "bitmap.node", bdrv_get_node_name(di->bs));
 160 +                qdict_put_str(cbw_opts, "bitmap.name", bdrv_dirty_bitmap_name(di->bitmap));
 161 +            }
 162 +            /*
 163 +             * Fleecing storage is supposed to be fast and it's better to break backup than guest
 164 +             * writes. Certain guest drivers like VirtIO-win have 60 seconds timeout by default, so
 165 +             * abort a bit before that.
 166 +             */
 167 +            qdict_put_str(cbw_opts, "on-cbw-error", "break-snapshot");
 168 +            qdict_put_int(cbw_opts, "cbw-timeout", 45);
 169 +
 170 +            di->fleecing.cbw = bdrv_insert_node(di->bs, cbw_opts, BDRV_O_RDWR, &local_err);
 171 +
 172 +            if (!di->fleecing.cbw) {
 173 +                error_setg(errp, "appending cbw node for fleecing failed: %s",
 174 +                           local_err ? error_get_pretty(local_err) : "unknown error");
 175 +                break;
 176 +            }
 177 +
 178 +            QDict *snapshot_access_opts = qdict_new();
 179 +            qdict_put_str(snapshot_access_opts, "driver", "snapshot-access");
 180 +            qdict_put_str(snapshot_access_opts, "file", bdrv_get_node_name(di->fleecing.cbw));
 181 +
 182 +            /*
 183 +             * Holding the AioContext lock here would cause a deadlock, because bdrv_open_driver()
 184 +             * will acquire it a second time. But it's allowed to be held exactly once when polling
 185 +             * and that happens when the bdrv_refresh_total_sectors() call is made there.
 186 +             */
 187 +            aio_context_release(aio_context);
 188 +            di->fleecing.snapshot_access =
 189 +                bdrv_open(NULL, NULL, snapshot_access_opts, BDRV_O_RDWR | BDRV_O_UNMAP, &local_err);
 190 +            aio_context_acquire(aio_context);
 191 +            if (!di->fleecing.snapshot_access) {
 192 +                error_setg(errp, "setting up snapshot access for fleecing failed: %s",
 193 +                           local_err ? error_get_pretty(local_err) : "unknown error");
 194 +                break;
 195 +            }
 196 +            source_bs = di->fleecing.snapshot_access;
 197 +            discard_source = true;
 198 +
 199 +            /*
 200 +             * bdrv_get_info() just returns 0 (= doesn't matter) for RBD when using krbd. But discard
 201 +             * on the fleecing image won't work if the backup job's granularity is less than the RBD
 202 +             * object size (default 4 MiB), so it does matter. Always use at least 4 MiB. With a PBS
 203 +             * target, the backup job granularity would already be at least this much.
 204 +             */
 205 +            perf.min_cluster_size = 4 * 1024 * 1024;
 206 +            /*
 207 +             * For discard to work, cluster size for the backup job must be at least the same as for
 208 +             * the fleecing image.
 209 +             */
 210 +            BlockDriverInfo bdi;
 211 +            if (bdrv_get_info(di->fleecing.bs, &bdi) >= 0) {
 212 +                perf.min_cluster_size = MAX(perf.min_cluster_size, bdi.cluster_size);
 213 +            }
 214 +        }
 215 +
 216          BlockJob *job = backup_job_create(
 217 -            NULL, di->bs, di->target, backup_state.speed, sync_mode, di->bitmap,
 218 -            bitmap_mode, false, NULL, &backup_state.perf, BLOCKDEV_ON_ERROR_REPORT,
 219 +            job_id, source_bs, di->target, backup_state.speed, sync_mode, di->bitmap,
 220 +            bitmap_mode, false, discard_source, NULL, &perf, BLOCKDEV_ON_ERROR_REPORT,
 221              BLOCKDEV_ON_ERROR_REPORT, JOB_DEFAULT, pvebackup_complete_cb, di, backup_state.txn,
 222              &local_err);
 223
 224 @@ -581,6 +682,14 @@ static void create_backup_jobs_bh(void *opaque) {
 225      aio_co_enter(data->ctx, data->co);
 226  }
 227
 228 +/*
 229 + * EFI disk and TPM state are small and it's just not worth setting up fleecing for them.
 230 + */
 231 +static bool device_uses_fleecing(const char *device_id)
 232 +{
 233 +    return strncmp(device_id, "drive-efidisk", 13) && strncmp(device_id, "drive-tpmstate", 14);
 234 +}
 235 +
 236  /*
 237   * Returns a list of device infos, which needs to be freed by the caller. In
 238   * case of an error, errp will be set, but the returned value might still be a
 239 @@ -588,6 +697,7 @@ static void create_backup_jobs_bh(void *opaque) {
 240   */
 241  static GList coroutine_fn *get_device_info(
 242      const char *devlist,
 243 +    bool fleecing,
 244      Error **errp)
 245  {
 246      gchar **devs = NULL;
 247 @@ -611,6 +721,31 @@ static GList coroutine_fn *get_device_info(
 248              }
 249              PVEBackupDevInfo *di = g_new0(PVEBackupDevInfo, 1);
 250              di->bs = bs;
 251 +
 252 +            if (fleecing && device_uses_fleecing(*d)) {
 253 +                g_autofree gchar *fleecing_devid = g_strconcat(*d, "-fleecing", NULL);
 254 +                BlockBackend *fleecing_blk = blk_by_name(fleecing_devid);
 255 +                if (!fleecing_blk) {
 256 +                    error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
 257 +                              "Device '%s' not found", fleecing_devid);
 258 +                    goto err;
 259 +                }
 260 +                BlockDriverState *fleecing_bs = blk_bs(fleecing_blk);
 261 +                if (!bdrv_co_is_inserted(fleecing_bs)) {
 262 +                    error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, fleecing_devid);
 263 +                    goto err;
 264 +                }
 265 +                /*
 266 +                 * Fleecing image needs to be the same size to act as a cbw target.
 267 +                 */
 268 +                if (bs->total_sectors != fleecing_bs->total_sectors) {
 269 +                    error_setg(errp, "Size mismatch for '%s' - sector count %ld != %ld",
 270 +                               fleecing_devid, fleecing_bs->total_sectors, bs->total_sectors);
 271 +                    goto err;
 272 +                }
 273 +                di->fleecing.bs = fleecing_bs;
 274 +            }
 275 +
 276              di_list = g_list_append(di_list, di);
 277              d++;
 278          }
 279 @@ -660,6 +795,7 @@ UuidInfo coroutine_fn *qmp_backup(
 280      const char *devlist,
 281      bool has_speed, int64_t speed,
 282      bool has_max_workers, int64_t max_workers,
 283 +    bool has_fleecing, bool fleecing,
 284      Error **errp)
 285  {
 286      assert(qemu_in_coroutine());
 287 @@ -687,7 +823,7 @@ UuidInfo coroutine_fn *qmp_backup(
 288      /* Todo: try to auto-detect format based on file name */
 289      format = has_format ? format : BACKUP_FORMAT_VMA;
 290
 291 -    di_list = get_device_info(devlist, &local_err);
 292 +    di_list = get_device_info(devlist, has_fleecing && fleecing, &local_err);
 293      if (local_err) {
 294          error_propagate(errp, local_err);
 295          goto err;
 296 @@ -1086,5 +1222,6 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
 297      ret->query_bitmap_info = true;
 298      ret->pbs_masterkey = true;
 299      ret->backup_max_workers = true;
 300 +    ret->backup_fleecing = true;
 301      return ret;
 302  }
 303 diff --git a/qapi/block-core.json b/qapi/block-core.json
 304 index 58fd637e86..0bc5f42677 100644
 305 --- a/qapi/block-core.json
 306 +++ b/qapi/block-core.json
 307 @@ -933,6 +933,10 @@
 308  #
 309  # @max-workers: see @BackupPerf for details. Default 16.
 310  #
 311 +# @fleecing: perform a backup with fleecing. For each device in @devlist, a
 312 +#            corresponding '-fleecing' device with the same size already needs to
 313 +#            be present (except for EFI disk and TPM state).
 314 +#
 315  # Returns: the uuid of the backup job
 316  #
 317  ##
 318 @@ -953,7 +957,8 @@
 319                                      '*firewall-file': 'str',
 320                                      '*devlist': 'str',
 321                                      '*speed': 'int',
 322 -                                    '*max-workers': 'int' },
 323 +                                    '*max-workers': 'int',
 324 +                                    '*fleecing': 'bool' },
 325    'returns': 'UuidInfo', 'coroutine': true }
 326
 327  ##
 328 @@ -1009,6 +1014,7 @@
 329              'pbs-dirty-bitmap-migration': 'bool',
 330              'pbs-masterkey': 'bool',
 331              'pbs-library-version': 'str',
 332 +            'backup-fleecing': 'bool',
 333              'backup-max-workers': 'bool' } }
 334
 335  ##