]> git.proxmox.com Git - ceph.git/blob - ceph/src/librbd/migration/QCOWFormat.cc
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / librbd / migration / QCOWFormat.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "librbd/migration/QCOWFormat.h"
5 #include "common/Clock.h"
6 #include "common/dout.h"
7 #include "common/errno.h"
8 #include "include/intarith.h"
9 #include "librbd/AsioEngine.h"
10 #include "librbd/ImageCtx.h"
11 #include "librbd/ImageState.h"
12 #include "librbd/Utils.h"
13 #include "librbd/io/AioCompletion.h"
14 #include "librbd/io/ReadResult.h"
15 #include "librbd/migration/SnapshotInterface.h"
16 #include "librbd/migration/SourceSpecBuilder.h"
17 #include "librbd/migration/StreamInterface.h"
18 #include "librbd/migration/Utils.h"
19 #include <boost/asio/dispatch.hpp>
20 #include <boost/asio/post.hpp>
21 #include <deque>
22 #include <tuple>
23 #include <unordered_map>
24 #include <vector>
25
26 #define dout_subsys ceph_subsys_rbd
27
28 namespace librbd {
29 namespace migration {
30
31 #undef dout_prefix
32 #define dout_prefix *_dout << "librbd::migration::QCOWFormat: " \
33 << __func__ << ": "
34
35 namespace qcow_format {
36
// Describes one (sub)cluster-sized piece of an image I/O request:
// * cluster_offset/intra_cluster_offset locate the data within the QCOW file
//   (cluster_offset starts at 0 and is filled in later via an L2 lookup)
// * image_offset locates the data within the logical image
// * buffer_offset locates the data within the caller's result buffer
struct ClusterExtent {
  uint64_t cluster_offset;        // byte offset of the cluster in the QCOW file
  uint64_t cluster_length;        // length of this piece (<= cluster size)
  uint64_t intra_cluster_offset;  // starting offset within the cluster
  uint64_t image_offset;          // corresponding logical image offset
  uint64_t buffer_offset;         // offset into the caller's buffer

  ClusterExtent(uint64_t cluster_offset, uint64_t cluster_length,
                uint64_t intra_cluster_offset, uint64_t image_offset,
                uint64_t buffer_offset)
    : cluster_offset(cluster_offset), cluster_length(cluster_length),
      intra_cluster_offset(intra_cluster_offset), image_offset(image_offset),
      buffer_offset(buffer_offset) {
  }
};

// ordered collection of per-cluster pieces for a single request
typedef std::vector<ClusterExtent> ClusterExtents;
54
55 void LookupTable::init() {
56 if (cluster_offsets == nullptr) {
57 cluster_offsets = reinterpret_cast<uint64_t*>(bl.c_str());
58 }
59 }
60
61 void LookupTable::decode() {
62 init();
63
64 // L2 tables are selectively byte-swapped on demand if only requesting a
65 // single cluster offset
66 if (decoded) {
67 return;
68 }
69
70 // translate the lookup table (big-endian -> CPU endianess)
71 for (auto idx = 0UL; idx < size; ++idx) {
72 cluster_offsets[idx] = be64toh(cluster_offsets[idx]);
73 }
74
75 decoded = true;
76 }
77
78 void populate_cluster_extents(CephContext* cct, uint64_t cluster_size,
79 const io::Extents& image_extents,
80 ClusterExtents* cluster_extents) {
81 uint64_t buffer_offset = 0;
82 for (auto [image_offset, image_length] : image_extents) {
83 while (image_length > 0) {
84 auto intra_cluster_offset = image_offset & (cluster_size - 1);
85 auto intra_cluster_length = cluster_size - intra_cluster_offset;
86 auto cluster_length = std::min(image_length, intra_cluster_length);
87
88 ldout(cct, 20) << "image_offset=" << image_offset << ", "
89 << "image_length=" << image_length << ", "
90 << "cluster_length=" << cluster_length << dendl;
91
92
93 cluster_extents->emplace_back(0, cluster_length, intra_cluster_offset,
94 image_offset, buffer_offset);
95
96 image_offset += cluster_length;
97 image_length -= cluster_length;
98 buffer_offset += cluster_length;
99 }
100 }
101 }
102
103 } // namespace qcow_format
104
105 using namespace qcow_format;
106
// A single cluster's worth of data read from the source stream.  The offset
// is immutable for the lifetime of the object; the bufferlist is filled in
// by ClusterCache::read_cluster().
template <typename I>
struct QCOWFormat<I>::Cluster {
  // byte offset of the cluster within the QCOW file (may still carry flag
  // bits such as QCOW_OFLAG_COMPRESSED -- see ClusterCache::read_cluster)
  const uint64_t cluster_offset;

  // raw cluster data read from the stream
  bufferlist cluster_data_bl;

  Cluster(uint64_t cluster_offset) : cluster_offset(cluster_offset) {
  }
};
115
116 #undef dout_prefix
117 #define dout_prefix *_dout << "librbd::migration::QCOWFormat::ClusterCache: " \
118 << this << " " << __func__ << ": "
119
// Single-entry (most-recently-used) cluster cache.  Concurrent requests for
// the same cluster are coalesced: only one stream read is issued per cluster
// and all pending callers are completed from its result.  All internal state
// is mutated only from handlers dispatched through m_strand.
template <typename I>
class QCOWFormat<I>::ClusterCache {
public:
  ClusterCache(QCOWFormat* qcow_format)
    : qcow_format(qcow_format),
      m_strand(*qcow_format->m_image_ctx->asio_engine) {
  }

  // Asynchronously copy 'cluster_length' bytes starting at
  // 'intra_cluster_offset' within the cluster at 'cluster_offset' into *bl,
  // then complete on_finish (0 on success, negative errno on failure).
  void get_cluster(uint64_t cluster_offset, uint64_t cluster_length,
                   uint64_t intra_cluster_offset, bufferlist* bl,
                   Context* on_finish) {
    auto cct = qcow_format->m_image_ctx->cct;
    ldout(cct, 20) << "cluster_offset=" << cluster_offset << dendl;

    // cache state machine runs in a single strand thread
    boost::asio::dispatch(
      m_strand,
      [this, cluster_offset, cluster_length, intra_cluster_offset, bl,
       on_finish]() {
        execute_get_cluster(cluster_offset, cluster_length,
                            intra_cluster_offset, bl, on_finish);
      });
  }

private:
  // (intra_cluster_offset, cluster_length, out-bl, completion context)
  typedef std::tuple<uint64_t, uint64_t, bufferlist*, Context*> Completion;
  typedef std::list<Completion> Completions;

  QCOWFormat* qcow_format;
  boost::asio::io_context::strand m_strand;

  // most-recently read cluster (only updated for uncompressed clusters)
  std::shared_ptr<Cluster> cluster;
  // per-cluster-offset list of callers waiting on an in-flight read
  std::unordered_map<uint64_t, Completions> cluster_completions;

  // strand-only: satisfy from the MRU cluster or register the caller and
  // kick off a read for the first request against a given cluster
  void execute_get_cluster(uint64_t cluster_offset, uint64_t cluster_length,
                           uint64_t intra_cluster_offset, bufferlist* bl,
                           Context* on_finish) {
    auto cct = qcow_format->m_image_ctx->cct;
    ldout(cct, 20) << "cluster_offset=" << cluster_offset << dendl;

    if (cluster && cluster->cluster_offset == cluster_offset) {
      // most-recent cluster matches
      bl->substr_of(cluster->cluster_data_bl, intra_cluster_offset,
                    cluster_length);
      // complete off-strand on the general asio engine
      boost::asio::post(*qcow_format->m_image_ctx->asio_engine,
                        [on_finish]() { on_finish->complete(0); });
      return;
    }

    // record callback for cluster
    bool new_request = (cluster_completions.count(cluster_offset) == 0);
    cluster_completions[cluster_offset].emplace_back(
      intra_cluster_offset, cluster_length, bl, on_finish);
    if (new_request) {
      // start the new read request
      read_cluster(std::make_shared<Cluster>(cluster_offset));
    }
  }

  // Issue the stream read for the given cluster.  For compressed clusters
  // the actual file offset/length are encoded within the cluster offset
  // itself and are unpacked here.
  void read_cluster(std::shared_ptr<Cluster> cluster) {
    auto cct = qcow_format->m_image_ctx->cct;

    uint64_t stream_offset = cluster->cluster_offset;
    uint64_t stream_length = qcow_format->m_cluster_size;
    if ((cluster->cluster_offset & QCOW_OFLAG_COMPRESSED) != 0) {
      // compressed clusters encode the compressed length in the lower bits
      stream_offset = cluster->cluster_offset &
                      qcow_format->m_cluster_offset_mask;
      stream_length = (cluster->cluster_offset >>
                       (63 - qcow_format->m_cluster_bits)) &
                      (qcow_format->m_cluster_size - 1);
    }

    ldout(cct, 20) << "cluster_offset=" << cluster->cluster_offset << ", "
                   << "stream_offset=" << stream_offset << ", "
                   << "stream_length=" << stream_length << dendl;

    // read the cluster into the cache entry; re-enter the strand on completion
    auto ctx = new LambdaContext([this, cluster](int r) {
      boost::asio::post(m_strand, [this, cluster, r]() {
        handle_read_cluster(r, cluster); }); });
    qcow_format->m_stream->read({{stream_offset, stream_length}},
                                &cluster->cluster_data_bl, ctx);
  }

  // strand-only: drain all waiters recorded for this cluster, copying out
  // their requested sub-ranges (on success) and completing their contexts
  void handle_read_cluster(int r, std::shared_ptr<Cluster> cluster) {
    auto cct = qcow_format->m_image_ctx->cct;
    ldout(cct, 20) << "r=" << r << ", "
                   << "cluster_offset=" << cluster->cluster_offset << dendl;

    // claim (and clear) the waiter list before completing anything
    auto completions = std::move(cluster_completions[cluster->cluster_offset]);
    cluster_completions.erase(cluster->cluster_offset);

    if (r < 0) {
      lderr(cct) << "failed to read cluster offset " << cluster->cluster_offset
                 << ": " << cpp_strerror(r) << dendl;
    } else {
      if ((cluster->cluster_offset & QCOW_OFLAG_COMPRESSED) != 0) {
        bufferlist compressed_bl{std::move(cluster->cluster_data_bl)};
        cluster->cluster_data_bl.clear();

        // TODO
        lderr(cct) << "support for compressed clusters is not available"
                   << dendl;
        r = -EINVAL;
      } else {
        // cache the MRU cluster in case of sequential IO
        this->cluster = cluster;
      }
    }

    // complete the IO back to caller
    boost::asio::post(*qcow_format->m_image_ctx->asio_engine,
                      [r, cluster, completions=std::move(completions)]() {
                        for (auto completion : completions) {
                          if (r >= 0) {
                            // copy the requested sub-range into the caller's
                            // bufferlist
                            std::get<2>(completion)->substr_of(
                              cluster->cluster_data_bl,
                              std::get<0>(completion),
                              std::get<1>(completion));
                          }
                          std::get<3>(completion)->complete(r);
                        }
                      });
  }
};
246
247 #undef dout_prefix
248 #define dout_prefix *_dout << "librbd::migration::QCOWFormat::L2TableCache: " \
249 << this << " " << __func__ << ": "
250
// Fixed-size cache of L2 lookup tables, indexed by their file offset.
// Requests are queued and processed one at a time on m_strand; eviction
// picks the least-used (then oldest) entry that is not currently loading.
// Callers may request either a full (decoded) L2 table or a single cluster
// offset out of an L2 table (which avoids decoding the whole table).
template <typename I>
class QCOWFormat<I>::L2TableCache {
public:
  L2TableCache(QCOWFormat* qcow_format)
    : qcow_format(qcow_format),
      m_strand(*qcow_format->m_image_ctx->asio_engine),
      l2_cache_entries(QCOW_L2_CACHE_SIZE) {
  }

  // Asynchronously fetch the full, byte-swapped L2 table located at
  // l2_table_offset; *l2_table is populated before on_finish completes.
  void get_l2_table(const LookupTable* l1_table, uint64_t l2_table_offset,
                    std::shared_ptr<const LookupTable>* l2_table,
                    Context* on_finish) {
    auto cct = qcow_format->m_image_ctx->cct;
    ldout(cct, 20) << "l2_table_offset=" << l2_table_offset << dendl;

    // cache state machine runs in a single strand thread
    Request request{l1_table, l2_table_offset, l2_table, on_finish};
    boost::asio::dispatch(
      m_strand, [this, request=std::move(request)]() {
        requests.push_back(std::move(request));
      });
    dispatch_request();
  }

  // Asynchronously resolve the cluster offset backing image_offset via the
  // L1/L2 tables.  Completes with -ERANGE for an out-of-bounds L1 index and
  // -ENOENT (with *cluster_offset = 0) when no L2 table is allocated.
  void get_cluster_offset(const LookupTable* l1_table,
                          uint64_t image_offset, uint64_t* cluster_offset,
                          Context* on_finish) {
    auto cct = qcow_format->m_image_ctx->cct;
    uint32_t l1_table_index = image_offset >> qcow_format->m_l1_shift;
    // clamp the index so the speculative read below stays in-bounds; the
    // real bounds check (and error) follows
    uint64_t l2_table_offset = l1_table->cluster_offsets[std::min<uint32_t>(
                                 l1_table_index, l1_table->size - 1)] &
                               qcow_format->m_cluster_mask;
    uint32_t l2_table_index = (image_offset >> qcow_format->m_cluster_bits) &
                              (qcow_format->m_l2_size - 1);
    ldout(cct, 20) << "image_offset=" << image_offset << ", "
                   << "l1_table_index=" << l1_table_index << ", "
                   << "l2_table_offset=" << l2_table_offset << ", "
                   << "l2_table_index=" << l2_table_index << dendl;

    if (l1_table_index >= l1_table->size) {
      lderr(cct) << "L1 index " << l1_table_index << " out-of-bounds" << dendl;
      on_finish->complete(-ERANGE);
      return;
    } else if (l2_table_offset == 0) {
      // L2 table has not been allocated for specified offset
      ldout(cct, 20) << "image_offset=" << image_offset << ", "
                     << "cluster_offset=DNE" << dendl;
      *cluster_offset = 0;
      on_finish->complete(-ENOENT);
      return;
    }

    // cache state machine runs in a single strand thread
    Request request{l1_table, l2_table_offset, l2_table_index, cluster_offset,
                    on_finish};
    boost::asio::dispatch(
      m_strand, [this, request=std::move(request)]() {
        requests.push_back(std::move(request));
      });
    dispatch_request();
  }

private:
  QCOWFormat* qcow_format;

  boost::asio::io_context::strand m_strand;

  // A queued cache lookup -- exactly one of cluster_offset (single-entry
  // lookup) or l2_table (full-table fetch) is non-null, selected by which
  // constructor was used.
  struct Request {
    const LookupTable* l1_table;

    uint64_t l2_table_offset;

    // get_cluster_offset request
    uint32_t l2_table_index;
    uint64_t* cluster_offset = nullptr;

    // get_l2_table request
    std::shared_ptr<const LookupTable>* l2_table;

    Context* on_finish;

    Request(const LookupTable* l1_table, uint64_t l2_table_offset,
            uint32_t l2_table_index, uint64_t* cluster_offset,
            Context* on_finish)
      : l1_table(l1_table), l2_table_offset(l2_table_offset),
        l2_table_index(l2_table_index), cluster_offset(cluster_offset),
        on_finish(on_finish) {
    }
    Request(const LookupTable* l1_table, uint64_t l2_table_offset,
            std::shared_ptr<const LookupTable>* l2_table, Context* on_finish)
      : l1_table(l1_table), l2_table_offset(l2_table_offset),
        l2_table(l2_table), on_finish(on_finish) {
    }
  };

  typedef std::deque<Request> Requests;

  // One cache slot: the cached table plus bookkeeping for eviction
  // (usage count + timestamp), load state, and any deferred load error.
  struct L2Cache {
    uint64_t l2_offset = 0;                 // file offset of the cached table
    std::shared_ptr<LookupTable> l2_table;  // table data (big-endian until decoded)

    utime_t timestamp;                      // when the slot was (re)loaded
    uint32_t count = 0;                     // usage counter for eviction
    bool in_flight = false;                 // stream read outstanding

    int ret_val = 0;                        // deferred error from the load
  };
  std::vector<L2Cache> l2_cache_entries;

  // pending lookups, serviced FIFO
  Requests requests;

  // re-enter the state machine on the strand
  void dispatch_request() {
    boost::asio::dispatch(m_strand, [this]() { execute_request(); });
  }

  // strand-only: service the front request if its L2 table is available;
  // otherwise leave it queued until the in-flight load completes
  void execute_request() {
    auto cct = qcow_format->m_image_ctx->cct;
    if (requests.empty()) {
      return;
    }

    auto request = requests.front();
    ldout(cct, 20) << "l2_table_offset=" << request.l2_table_offset << dendl;

    std::shared_ptr<LookupTable> l2_table;
    int r = l2_table_lookup(request.l2_table_offset, &l2_table);
    if (r < 0) {
      lderr(cct) << "failed to load L2 table: l2_table_offset="
                 << request.l2_table_offset << ": "
                 << cpp_strerror(r) << dendl;
    } else if (l2_table == nullptr) {
      // table not in cache -- will restart once its loaded
      return;
    } else if (request.cluster_offset != nullptr) {
      // single-entry lookup: swap just this entry if the table as a whole
      // hasn't been decoded yet
      auto cluster_offset = l2_table->cluster_offsets[request.l2_table_index];
      if (!l2_table->decoded) {
        // table hasn't been byte-swapped
        cluster_offset = be64toh(cluster_offset);
      }

      *request.cluster_offset = cluster_offset & qcow_format->m_cluster_mask;
      if (*request.cluster_offset == QCOW_OFLAG_ZERO) {
        ldout(cct, 20) << "l2_table_offset=" << request.l2_table_offset << ", "
                       << "l2_table_index=" << request.l2_table_index << ", "
                       << "cluster_offset=zeroed" << dendl;
      } else {
        ldout(cct, 20) << "l2_table_offset=" << request.l2_table_offset << ", "
                       << "l2_table_index=" << request.l2_table_index << ", "
                       << "cluster_offset=" << *request.cluster_offset
                       << dendl;
      }
    } else if (request.l2_table != nullptr) {
      // ensure it's in the correct byte-order
      l2_table->decode();
      *request.l2_table = l2_table;
    } else {
      ceph_assert(false);
    }

    // complete the L2 cache request
    boost::asio::post(*qcow_format->m_image_ctx->asio_engine,
                      [r, ctx=request.on_finish]() { ctx->complete(r); });
    requests.pop_front();

    // process next request (if any)
    dispatch_request();
  }

  // strand-only: find (or start loading) the L2 table at l2_offset.
  // Returns 0 with *l2_table unset when the table is still loading or the
  // cache is busy; returns a negative errno if a prior load failed.
  int l2_table_lookup(uint64_t l2_offset,
                      std::shared_ptr<LookupTable>* l2_table) {
    auto cct = qcow_format->m_image_ctx->cct;

    l2_table->reset();

    // find a match in the existing cache
    for (auto idx = 0U; idx < l2_cache_entries.size(); ++idx) {
      auto& l2_cache = l2_cache_entries[idx];
      if (l2_cache.l2_offset == l2_offset) {
        if (l2_cache.in_flight) {
          ldout(cct, 20) << "l2_offset=" << l2_offset << ", "
                         << "index=" << idx << " (in-flight)" << dendl;
          return 0;
        }

        if (l2_cache.ret_val < 0) {
          ldout(cct, 20) << "l2_offset=" << l2_offset << ", "
                         << "index=" << idx << " (error): "
                         << cpp_strerror(l2_cache.ret_val) << dendl;
          int r = l2_cache.ret_val;
          // surface the error once, then free the slot for a retry
          l2_cache = L2Cache{};

          return r;
        }

        ++l2_cache.count;
        if (l2_cache.count == std::numeric_limits<uint32_t>::max()) {
          // avoid counter saturation by halving everyone's usage count
          for (auto& entry : l2_cache_entries) {
            entry.count >>= 1;
          }
        }

        ldout(cct, 20) << "l2_offset=" << l2_offset << ", " << "index=" << idx
                       << dendl;
        *l2_table = l2_cache.l2_table;
        return 0;
      }
    }

    // find the least used entry
    int32_t min_idx = -1;
    uint32_t min_count = std::numeric_limits<uint32_t>::max();
    utime_t min_timestamp;
    for (uint32_t idx = 0U; idx < l2_cache_entries.size(); ++idx) {
      auto& l2_cache = l2_cache_entries[idx];
      if (l2_cache.in_flight) {
        continue;
      }

      // age all candidate entries as part of the scan
      if (l2_cache.count > 0) {
        --l2_cache.count;
      }

      if (l2_cache.count <= min_count) {
        // ties broken by oldest timestamp
        if (min_idx == -1 || l2_cache.timestamp < min_timestamp) {
          min_timestamp = l2_cache.timestamp;
          min_count = l2_cache.count;
          min_idx = idx;
        }
      }
    }

    if (min_idx == -1) {
      // no space in the cache due to in-flight requests
      ldout(cct, 20) << "l2_offset=" << l2_offset << ", "
                     << "index=DNE (cache busy)" << dendl;
      return 0;
    }

    ldout(cct, 20) << "l2_offset=" << l2_offset << ", "
                   << "index=" << min_idx << " (loading)" << dendl;
    auto& l2_cache = l2_cache_entries[min_idx];
    l2_cache.l2_table = std::make_shared<LookupTable>(qcow_format->m_l2_size);
    l2_cache.l2_offset = l2_offset;
    l2_cache.timestamp = ceph_clock_now();
    l2_cache.count = 1;
    l2_cache.in_flight = true;

    // read the L2 table into the L2 cache entry; re-enter the strand on
    // completion
    auto ctx = new LambdaContext([this, index=min_idx, l2_offset](int r) {
      boost::asio::post(m_strand, [this, index, l2_offset, r]() {
        handle_l2_table_lookup(r, index, l2_offset); }); });
    qcow_format->m_stream->read(
      {{l2_offset, qcow_format->m_l2_size * sizeof(uint64_t)}},
      &l2_cache.l2_table->bl, ctx);
    return 0;
  }

  // strand-only: finalize a slot load (recording any error) and restart the
  // request state machine so waiting lookups can proceed
  void handle_l2_table_lookup(int r, uint32_t index, uint64_t l2_offset) {
    auto cct = qcow_format->m_image_ctx->cct;
    ldout(cct, 20) << "r=" << r << ", "
                   << "l2_offset=" << l2_offset << ", "
                   << "index=" << index << dendl;

    auto& l2_cache = l2_cache_entries[index];
    ceph_assert(l2_cache.in_flight);
    l2_cache.in_flight = false;

    if (r < 0) {
      lderr(cct) << "failed to load L2 table: "
                 << "l2_offset=" << l2_cache.l2_offset << ": "
                 << cpp_strerror(r) << dendl;
      l2_cache.ret_val = r;
    } else {
      // keep the L2 table in big-endian byte-order until the full table
      // is requested
      l2_cache.l2_table->init();
    }

    // restart the state machine
    dispatch_request();
  }

};
534
535 #undef dout_prefix
536 #define dout_prefix *_dout << "librbd::migration::QCOWFormat::ReadRequest: " \
537 << this << " " << __func__ << ": "
538
// Self-deleting state machine servicing one image read: resolve each
// per-cluster extent's file offset via the L2 cache, then fetch the data
// (or synthesize zeroes / report holes) through the cluster cache.
// NOTE: the object deletes itself in read_clusters()/on failure, so no
// member access is allowed after the per-cluster callbacks are armed.
template <typename I>
class QCOWFormat<I>::ReadRequest {
public:
  ReadRequest(QCOWFormat* qcow_format, io::AioCompletion* aio_comp,
              const LookupTable* l1_table, io::Extents&& image_extents)
    : qcow_format(qcow_format), aio_comp(aio_comp), l1_table(l1_table),
      image_extents(std::move(image_extents)) {
  }

  void send() {
    get_cluster_offsets();
  }

private:
  QCOWFormat* qcow_format;
  io::AioCompletion* aio_comp;

  const LookupTable* l1_table;
  io::Extents image_extents;

  size_t image_extents_idx = 0;
  uint32_t image_extent_offset = 0;

  // per-cluster pieces of the request; cluster_offset fields are filled in
  // by the L2 lookups below
  ClusterExtents cluster_extents;

  // Fan out one L2 cluster-offset lookup per cluster extent, gathering the
  // results into handle_get_cluster_offsets().
  void get_cluster_offsets() {
    auto cct = qcow_format->m_image_ctx->cct;
    populate_cluster_extents(cct, qcow_format->m_cluster_size, image_extents,
                             &cluster_extents);

    ldout(cct, 20) << dendl;
    auto ctx = new LambdaContext([this](int r) {
      handle_get_cluster_offsets(r); });
    auto gather_ctx = new C_Gather(cct, ctx);

    for (auto& cluster_extent : cluster_extents) {
      // capturing cluster_extent by reference is safe: cluster_extents is
      // not resized after this point and outlives the sub-contexts
      auto sub_ctx = new LambdaContext(
        [this, &cluster_extent, on_finish=gather_ctx->new_sub()](int r) {
          handle_get_cluster_offset(r, cluster_extent, on_finish); });
      qcow_format->m_l2_table_cache->get_cluster_offset(
        l1_table, cluster_extent.image_offset,
        &cluster_extent.cluster_offset, sub_ctx);
    }

    gather_ctx->activate();
  }

  // Per-extent lookup callback: a missing mapping (-ENOENT) is not an error
  // for reads -- it is handled as a hole later.
  void handle_get_cluster_offset(int r, const ClusterExtent& cluster_extent,
                                 Context* on_finish) {
    auto cct = qcow_format->m_image_ctx->cct;
    ldout(cct, 20) << "r=" << r << ", "
                   << "image_offset=" << cluster_extent.image_offset << ", "
                   << "cluster_offset=" << cluster_extent.cluster_offset
                   << dendl;

    if (r == -ENOENT) {
      ldout(cct, 20) << "image offset DNE in QCOW image" << dendl;
      r = 0;
    } else if (r < 0) {
      lderr(cct) << "failed to map image offset " << cluster_extent.image_offset
                 << ": " << cpp_strerror(r) << dendl;
    }

    on_finish->complete(r);
  }

  // All lookups complete: fail the AIO on error, otherwise issue the reads.
  void handle_get_cluster_offsets(int r) {
    auto cct = qcow_format->m_image_ctx->cct;
    ldout(cct, 20) << "r=" << r << dendl;

    if (r < 0) {
      lderr(cct) << "failed to retrieve cluster extents: " << cpp_strerror(r)
                 << dendl;
      aio_comp->fail(r);
      delete this;
      return;
    }

    read_clusters();
  }

  // Dispatch one sub-read per cluster extent, then delete this object --
  // the remaining callbacks capture everything they need by value.
  void read_clusters() {
    auto cct = qcow_format->m_image_ctx->cct;
    ldout(cct, 20) << dendl;

    aio_comp->set_request_count(cluster_extents.size());
    for (auto& cluster_extent : cluster_extents) {
      auto read_ctx = new io::ReadResult::C_ImageReadRequest(
        aio_comp, cluster_extent.buffer_offset,
        {{cluster_extent.image_offset, cluster_extent.cluster_length}});
      // holes are expected; don't treat -ENOENT as a request failure
      read_ctx->ignore_enoent = true;

      auto log_ctx = new LambdaContext(
        [this, cct=qcow_format->m_image_ctx->cct,
         image_offset=cluster_extent.image_offset,
         image_length=cluster_extent.cluster_length, ctx=read_ctx](int r) {
          handle_read_cluster(cct, r, image_offset, image_length, ctx);
        });

      if (cluster_extent.cluster_offset == 0) {
        // QCOW header is at offset 0, implies cluster DNE
        log_ctx->complete(-ENOENT);
      } else if (cluster_extent.cluster_offset == QCOW_OFLAG_ZERO) {
        // explicitly zeroed section
        read_ctx->bl.append_zero(cluster_extent.cluster_length);
        log_ctx->complete(0);
      } else {
        // request the (sub)cluster from the cluster cache
        qcow_format->m_cluster_cache->get_cluster(
          cluster_extent.cluster_offset, cluster_extent.cluster_length,
          cluster_extent.intra_cluster_offset, &read_ctx->bl, log_ctx);
      }
    }

    delete this;
  }

  // Logging shim invoked after this object may already be destroyed; it
  // must not touch any member state.
  void handle_read_cluster(CephContext* cct, int r, uint64_t image_offset,
                           uint64_t image_length, Context* on_finish) const {
    // NOTE: treat as static function, expect object has been deleted

    ldout(cct, 20) << "r=" << r << ", "
                   << "image_offset=" << image_offset << ", "
                   << "image_length=" << image_length << dendl;

    if (r != -ENOENT && r < 0) {
      lderr(cct) << "failed to read image extent " << image_offset << "~"
                 << image_length << ": " << cpp_strerror(r) << dendl;
    }

    on_finish->complete(r);
  }
};
672
673 #undef dout_prefix
674 #define dout_prefix *_dout << "librbd::migration::QCOWFormat::" \
675 << "ListSnapsRequest: " << this << " " \
676 << __func__ << ": "
677
// Self-deleting state machine that computes, for one L1 slot, the sparse
// delta between successive snapshots: it walks snap_id_to_l1_table in
// order, loads each snapshot's L2 table for l1_table_index, and records
// cluster extents whose mapping changed relative to the previous snapshot.
template <typename I>
class QCOWFormat<I>::ListSnapsRequest {
public:
  ListSnapsRequest(
    QCOWFormat* qcow_format, uint32_t l1_table_index,
    ClusterExtents&& cluster_extents,
    const std::map<uint64_t, const LookupTable*>& snap_id_to_l1_table,
    io::SnapshotDelta* snapshot_delta, Context* on_finish)
    : qcow_format(qcow_format), l1_table_index(l1_table_index),
      cluster_extents(std::move(cluster_extents)),
      snap_id_to_l1_table(snap_id_to_l1_table), snapshot_delta(snapshot_delta),
      on_finish(on_finish) {
  }

  void send() {
    get_l2_table();
  }

private:
  QCOWFormat* qcow_format;
  uint32_t l1_table_index;
  ClusterExtents cluster_extents;
  // remaining snapshots to process; entries are consumed front-to-back
  std::map<uint64_t, const LookupTable*> snap_id_to_l1_table;
  io::SnapshotDelta* snapshot_delta;
  Context* on_finish;

  // L2 tables for the previous and current snapshot being compared
  std::shared_ptr<const LookupTable> previous_l2_table;
  std::shared_ptr<const LookupTable> l2_table;

  // Pop the next snapshot and request its L2 table for l1_table_index;
  // finishes the request once all snapshots are processed.
  void get_l2_table() {
    auto cct = qcow_format->m_image_ctx->cct;
    if (snap_id_to_l1_table.empty()) {
      finish(0);
      return;
    }

    auto it = snap_id_to_l1_table.begin();
    auto [snap_id, l1_table] = *it;
    snap_id_to_l1_table.erase(it);

    // current table becomes the comparison baseline for the next snapshot
    previous_l2_table = l2_table;
    l2_table.reset();

    auto ctx = new LambdaContext([this, snap_id = snap_id](int r) {
      boost::asio::post(qcow_format->m_strand, [this, snap_id, r]() {
        handle_get_l2_table(r, snap_id);
      });
    });

    if (l1_table_index >= l1_table->size ||
        l1_table->cluster_offsets[l1_table_index] == 0) {
      // no L2 table for this slot in this snapshot -- treat as deallocated
      ldout(cct, 20) << "l1_table_index=" << l1_table_index << ", "
                     << "snap_id=" << snap_id << ": DNE" << dendl;
      ctx->complete(-ENOENT);
      return;
    }

    uint64_t l2_table_offset = l1_table->cluster_offsets[l1_table_index] &
                               qcow_format->m_cluster_mask;

    ldout(cct, 20) << "l1_table_index=" << l1_table_index << ", "
                   << "snap_id=" << snap_id << ", "
                   << "l2_table_offset=" << l2_table_offset << dendl;
    qcow_format->m_l2_table_cache->get_l2_table(l1_table, l2_table_offset,
                                                &l2_table, ctx);
  }

  // Compare the current snapshot's L2 mappings against the previous
  // snapshot's and record the changed extents.  Runs on the strand.
  void handle_get_l2_table(int r, uint64_t snap_id) {
    ceph_assert(qcow_format->m_strand.running_in_this_thread());

    auto cct = qcow_format->m_image_ctx->cct;
    ldout(cct, 20) << "r=" << r << ", "
                   << "snap_id=" << snap_id << dendl;

    if (r == -ENOENT) {
      // DNE is not fatal: an empty table means everything is deallocated
      l2_table.reset();
    } else if (r < 0) {
      lderr(cct) << "failed to retrieve L2 table for snapshot " << snap_id
                 << ": " << cpp_strerror(r) << dendl;
      finish(r);
      return;
    }

    // compare the cluster offsets at each requested L2 offset between
    // the previous snapshot's L2 table and the current L2 table.
    auto& sparse_extents = (*snapshot_delta)[{snap_id, snap_id}];
    for (auto& cluster_extent : cluster_extents) {
      uint32_t l2_table_index =
        (cluster_extent.image_offset >> qcow_format->m_cluster_bits) &
        (qcow_format->m_l2_size - 1);

      // nullopt indicates "no mapping" (table absent or index out of range)
      std::optional<uint64_t> cluster_offset;
      if (l2_table && l2_table_index < l2_table->size) {
        cluster_offset = l2_table->cluster_offsets[l2_table_index] &
                         qcow_format->m_cluster_offset_mask;
      }

      std::optional<uint64_t> prev_cluster_offset;
      if (previous_l2_table && l2_table_index < previous_l2_table->size) {
        prev_cluster_offset =
          previous_l2_table->cluster_offsets[l2_table_index] &
          qcow_format->m_cluster_offset_mask;
      }

      ldout(cct, 20) << "l1_table_index=" << l1_table_index << ", "
                     << "snap_id=" << snap_id << ", "
                     << "image_offset=" << cluster_extent.image_offset << ", "
                     << "l2_table_index=" << l2_table_index << ", "
                     << "cluster_offset=" << cluster_offset << ", "
                     << "prev_cluster_offset=" << prev_cluster_offset << dendl;

      auto state = io::SPARSE_EXTENT_STATE_DATA;
      if (cluster_offset == prev_cluster_offset) {
        // unchanged between snapshots -- not part of the delta
        continue;
      } else if ((prev_cluster_offset && !cluster_offset) ||
                 *cluster_offset == QCOW_OFLAG_ZERO) {
        // explicitly zeroed or deallocated
        state = io::SPARSE_EXTENT_STATE_ZEROED;
      }

      sparse_extents.insert(
        cluster_extent.image_offset, cluster_extent.cluster_length,
        {state, cluster_extent.cluster_length});
    }

    ldout(cct, 20) << "l1_table_index=" << l1_table_index << ", "
                   << "snap_id=" << snap_id << ", "
                   << "sparse_extents=" << sparse_extents << dendl;

    // continue processing the L2 table at this index for all snapshots
    boost::asio::post(*qcow_format->m_image_ctx->asio_engine,
                      [this]() { get_l2_table(); });
  }


  void finish(int r) {
    auto cct = qcow_format->m_image_ctx->cct;
    ldout(cct, 20) << "r=" << r << dendl;

    on_finish->complete(r);
    delete this;
  }
};
821
822 #undef dout_prefix
823 #define dout_prefix *_dout << "librbd::migration::QCOWFormat: " << this \
824 << " " << __func__ << ": "
825
// Construct the format handler; members are wired up here but no I/O is
// performed until open() is invoked.
template <typename I>
QCOWFormat<I>::QCOWFormat(
    I* image_ctx, const json_spirit::mObject& json_object,
    const SourceSpecBuilder<I>* source_spec_builder)
  : m_image_ctx(image_ctx), m_json_object(json_object),
    m_source_spec_builder(source_spec_builder),
    m_strand(*image_ctx->asio_engine) {
}
834
835 template <typename I>
836 void QCOWFormat<I>::open(Context* on_finish) {
837 auto cct = m_image_ctx->cct;
838 ldout(cct, 10) << dendl;
839
840 int r = m_source_spec_builder->build_stream(m_json_object, &m_stream);
841 if (r < 0) {
842 lderr(cct) << "failed to build migration stream handler" << cpp_strerror(r)
843 << dendl;
844 on_finish->complete(r);
845 return;
846 }
847
848 auto ctx = new LambdaContext([this, on_finish](int r) {
849 handle_open(r, on_finish); });
850 m_stream->open(ctx);
851 }
852
853 template <typename I>
854 void QCOWFormat<I>::handle_open(int r, Context* on_finish) {
855 auto cct = m_image_ctx->cct;
856 ldout(cct, 10) << "r=" << r << dendl;
857
858 if (r < 0) {
859 lderr(cct) << "failed to open QCOW image: " << cpp_strerror(r)
860 << dendl;
861 on_finish->complete(r);
862 return;
863 }
864
865 probe(on_finish);
866 }
867
868 template <typename I>
869 void QCOWFormat<I>::probe(Context* on_finish) {
870 auto cct = m_image_ctx->cct;
871 ldout(cct, 10) << dendl;
872
873 auto ctx = new LambdaContext([this, on_finish](int r) {
874 handle_probe(r, on_finish); });
875 m_bl.clear();
876 m_stream->read({{0, 8}}, &m_bl, ctx);
877 }
878
879 template <typename I>
880 void QCOWFormat<I>::handle_probe(int r, Context* on_finish) {
881 auto cct = m_image_ctx->cct;
882 ldout(cct, 10) << "r=" << r << dendl;
883
884 if (r < 0) {
885 lderr(cct) << "failed to probe QCOW image: " << cpp_strerror(r)
886 << dendl;
887 on_finish->complete(r);
888 return;
889 }
890
891 auto header_probe = *reinterpret_cast<QCowHeaderProbe*>(
892 m_bl.c_str());
893 header_probe.magic = be32toh(header_probe.magic);
894 header_probe.version = be32toh(header_probe.version);
895
896 if (header_probe.magic != QCOW_MAGIC) {
897 lderr(cct) << "invalid QCOW header magic" << dendl;
898 on_finish->complete(-EINVAL);
899 return;
900 }
901
902 m_bl.clear();
903 if (header_probe.version == 1) {
904 #ifdef WITH_RBD_MIGRATION_FORMAT_QCOW_V1
905 read_v1_header(on_finish);
906 #else // WITH_RBD_MIGRATION_FORMAT_QCOW_V1
907 lderr(cct) << "QCOW is not supported" << dendl;
908 on_finish->complete(-ENOTSUP);
909 #endif // WITH_RBD_MIGRATION_FORMAT_QCOW_V1
910 return;
911 } else if (header_probe.version >= 2 && header_probe.version <= 3) {
912 read_v2_header(on_finish);
913 return;
914 } else {
915 lderr(cct) << "invalid QCOW header version " << header_probe.version
916 << dendl;
917 on_finish->complete(-EINVAL);
918 return;
919 }
920 }
921
922 #ifdef WITH_RBD_MIGRATION_FORMAT_QCOW_V1
923
924 template <typename I>
925 void QCOWFormat<I>::read_v1_header(Context* on_finish) {
926 auto cct = m_image_ctx->cct;
927 ldout(cct, 10) << dendl;
928
929 auto ctx = new LambdaContext([this, on_finish](int r) {
930 handle_read_v1_header(r, on_finish); });
931 m_bl.clear();
932 m_stream->read({{0, sizeof(QCowHeaderV1)}}, &m_bl, ctx);
933 }
934
// Parse and validate the QCOW v1 header, derive the cluster/L1/L2 geometry
// fields and allocate the cluster + L2 caches before loading the L1 table.
// Any validation failure completes on_finish with -EINVAL.
template <typename I>
void QCOWFormat<I>::handle_read_v1_header(int r, Context* on_finish) {
  auto cct = m_image_ctx->cct;
  ldout(cct, 10) << "r=" << r << dendl;

  if (r < 0) {
    lderr(cct) << "failed to read QCOW header: " << cpp_strerror(r) << dendl;
    on_finish->complete(r);
    return;
  }

  auto header = *reinterpret_cast<QCowHeaderV1*>(m_bl.c_str());

  // byte-swap important fields
  header.magic = be32toh(header.magic);
  header.version = be32toh(header.version);
  header.backing_file_offset = be64toh(header.backing_file_offset);
  header.backing_file_size = be32toh(header.backing_file_size);
  header.size = be64toh(header.size);
  header.crypt_method = be32toh(header.crypt_method);
  header.l1_table_offset = be64toh(header.l1_table_offset);

  if (header.magic != QCOW_MAGIC || header.version != 1) {
    // honestly shouldn't happen since we've already validated it
    lderr(cct) << "header is not QCOW" << dendl;
    on_finish->complete(-EINVAL);
    return;
  }

  if (header.cluster_bits < QCOW_MIN_CLUSTER_BITS ||
      header.cluster_bits > QCOW_MAX_CLUSTER_BITS) {
    lderr(cct) << "invalid cluster bits: " << header.cluster_bits << dendl;
    on_finish->complete(-EINVAL);
    return;
  }

  if (header.l2_bits < (QCOW_MIN_CLUSTER_BITS - 3) ||
      header.l2_bits > (QCOW_MAX_CLUSTER_BITS - 3)) {
    lderr(cct) << "invalid L2 bits: " << header.l2_bits << dendl;
    on_finish->complete(-EINVAL);
    return;
  }

  if (header.crypt_method != QCOW_CRYPT_NONE) {
    lderr(cct) << "invalid or unsupported encryption method" << dendl;
    on_finish->complete(-EINVAL);
    return;
  }

  m_size = header.size;
  // image size must be sector (512-byte) aligned
  if (p2roundup(m_size, static_cast<uint64_t>(512)) != m_size) {
    lderr(cct) << "image size is not a multiple of block size" << dendl;
    on_finish->complete(-EINVAL);
    return;
  }

  m_backing_file_offset = header.backing_file_offset;
  m_backing_file_size = header.backing_file_size;

  // derived geometry: cluster size/masks from cluster_bits
  m_cluster_bits = header.cluster_bits;
  m_cluster_size = 1UL << header.cluster_bits;
  m_cluster_offset_mask = (1ULL << (63 - header.cluster_bits)) - 1;
  m_cluster_mask = ~QCOW_OFLAG_COMPRESSED;

  m_l2_bits = header.l2_bits;
  m_l2_size = (1UL << m_l2_bits);

  // each L1 entry covers 2^l1_shift bytes of the image
  m_l1_shift = m_cluster_bits + m_l2_bits;
  m_l1_table.size = (m_size + (1LL << m_l1_shift) - 1) >> m_l1_shift;
  m_l1_table_offset = header.l1_table_offset;
  // reject sizes that would overflow the L1 math or the table allocation
  if (m_size > (std::numeric_limits<uint64_t>::max() - (1ULL << m_l1_shift)) ||
      m_l1_table.size >
        (std::numeric_limits<int32_t>::max() / sizeof(uint64_t))) {
    lderr(cct) << "image size too big: " << m_size << dendl;
    on_finish->complete(-EINVAL);
    return;
  }

  ldout(cct, 15) << "size=" << m_size << ", "
                 << "cluster_bits=" << m_cluster_bits << ", "
                 << "l2_bits=" << m_l2_bits << dendl;

  // allocate memory for L1 table and L2 + cluster caches
  m_l2_table_cache = std::make_unique<L2TableCache>(this);
  m_cluster_cache = std::make_unique<ClusterCache>(this);

  read_l1_table(on_finish);
}
1023
1024 #endif // WITH_RBD_MIGRATION_FORMAT_QCOW_V1
1025
1026 template <typename I>
1027 void QCOWFormat<I>::read_v2_header(Context* on_finish) {
1028 auto cct = m_image_ctx->cct;
1029 ldout(cct, 10) << dendl;
1030
1031 auto ctx = new LambdaContext([this, on_finish](int r) {
1032 handle_read_v2_header(r, on_finish); });
1033 m_bl.clear();
1034 m_stream->read({{0, sizeof(QCowHeader)}}, &m_bl, ctx);
1035 }
1036
1037 template <typename I>
1038 void QCOWFormat<I>::handle_read_v2_header(int r, Context* on_finish) {
1039 auto cct = m_image_ctx->cct;
1040 ldout(cct, 10) << "r=" << r << dendl;
1041
1042 if (r < 0) {
1043 lderr(cct) << "failed to read QCOW2 header: " << cpp_strerror(r) << dendl;
1044 on_finish->complete(r);
1045 return;
1046 }
1047
1048 auto header = *reinterpret_cast<QCowHeader*>(m_bl.c_str());
1049
1050 // byte-swap important fields
1051 header.magic = be32toh(header.magic);
1052 header.version = be32toh(header.version);
1053 header.backing_file_offset = be64toh(header.backing_file_offset);
1054 header.backing_file_size = be32toh(header.backing_file_size);
1055 header.cluster_bits = be32toh(header.cluster_bits);
1056 header.size = be64toh(header.size);
1057 header.crypt_method = be32toh(header.crypt_method);
1058 header.l1_size = be32toh(header.l1_size);
1059 header.l1_table_offset = be64toh(header.l1_table_offset);
1060 header.nb_snapshots = be32toh(header.nb_snapshots);
1061 header.snapshots_offset = be64toh(header.snapshots_offset);
1062
1063 if (header.version == 2) {
1064 // valid only for version >= 3
1065 header.incompatible_features = 0;
1066 header.compatible_features = 0;
1067 header.autoclear_features = 0;
1068 header.header_length = 72;
1069 header.compression_type = 0;
1070 } else {
1071 header.incompatible_features = be64toh(header.incompatible_features);
1072 header.compatible_features = be64toh(header.compatible_features);
1073 header.autoclear_features = be64toh(header.autoclear_features);
1074 header.header_length = be32toh(header.header_length);
1075 }
1076
1077 if (header.magic != QCOW_MAGIC || header.version < 2 || header.version > 3) {
1078 // honestly shouldn't happen since we've already validated it
1079 lderr(cct) << "header is not QCOW2" << dendl;
1080 on_finish->complete(-EINVAL);
1081 return;
1082 }
1083
1084 if (header.cluster_bits < QCOW_MIN_CLUSTER_BITS ||
1085 header.cluster_bits > QCOW_MAX_CLUSTER_BITS) {
1086 lderr(cct) << "invalid cluster bits: " << header.cluster_bits << dendl;
1087 on_finish->complete(-EINVAL);
1088 return;
1089 }
1090
1091 if (header.crypt_method != QCOW_CRYPT_NONE) {
1092 lderr(cct) << "invalid or unsupported encryption method" << dendl;
1093 on_finish->complete(-EINVAL);
1094 return;
1095 }
1096
1097 m_size = header.size;
1098 if (p2roundup(m_size, static_cast<uint64_t>(512)) != m_size) {
1099 lderr(cct) << "image size is not a multiple of block size" << dendl;
1100 on_finish->complete(-EINVAL);
1101 return;
1102 }
1103
1104 if (header.header_length <= offsetof(QCowHeader, compression_type)) {
1105 header.compression_type = 0;
1106 }
1107
1108 if ((header.compression_type != 0) ||
1109 ((header.incompatible_features & QCOW2_INCOMPAT_COMPRESSION) != 0)) {
1110 lderr(cct) << "invalid or unsupported compression type" << dendl;
1111 on_finish->complete(-EINVAL);
1112 return;
1113 }
1114
1115 if ((header.incompatible_features & QCOW2_INCOMPAT_DATA_FILE) != 0) {
1116 lderr(cct) << "external data file feature not supported" << dendl;
1117 on_finish->complete(-ENOTSUP);
1118 }
1119
1120 if ((header.incompatible_features & QCOW2_INCOMPAT_EXTL2) != 0) {
1121 lderr(cct) << "extended L2 table feature not supported" << dendl;
1122 on_finish->complete(-ENOTSUP);
1123 return;
1124 }
1125
1126 header.incompatible_features &= ~QCOW2_INCOMPAT_MASK;
1127 if (header.incompatible_features != 0) {
1128 lderr(cct) << "unknown incompatible feature enabled" << dendl;
1129 on_finish->complete(-EINVAL);
1130 return;
1131 }
1132
1133 m_backing_file_offset = header.backing_file_offset;
1134 m_backing_file_size = header.backing_file_size;
1135
1136 m_cluster_bits = header.cluster_bits;
1137 m_cluster_size = 1UL << header.cluster_bits;
1138 m_cluster_offset_mask = (1ULL << (63 - header.cluster_bits)) - 1;
1139 m_cluster_mask = ~(QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_COPIED);
1140
1141 // L2 table is fixed a (1) cluster block to hold 8-byte (3 bit) offsets
1142 m_l2_bits = m_cluster_bits - 3;
1143 m_l2_size = (1UL << m_l2_bits);
1144
1145 m_l1_shift = m_cluster_bits + m_l2_bits;
1146 m_l1_table.size = (m_size + (1LL << m_l1_shift) - 1) >> m_l1_shift;
1147 m_l1_table_offset = header.l1_table_offset;
1148 if (m_size > (std::numeric_limits<uint64_t>::max() - (1ULL << m_l1_shift)) ||
1149 m_l1_table.size >
1150 (std::numeric_limits<int32_t>::max() / sizeof(uint64_t))) {
1151 lderr(cct) << "image size too big: " << m_size << dendl;
1152 on_finish->complete(-EINVAL);
1153 return;
1154 } else if (m_l1_table.size > header.l1_size) {
1155 lderr(cct) << "invalid L1 table size in header (" << header.l1_size
1156 << " < " << m_l1_table.size << ")" << dendl;
1157 on_finish->complete(-EINVAL);
1158 return;
1159 }
1160
1161 m_snapshot_count = header.nb_snapshots;
1162 m_snapshots_offset = header.snapshots_offset;
1163
1164 ldout(cct, 15) << "size=" << m_size << ", "
1165 << "cluster_bits=" << m_cluster_bits << ", "
1166 << "l1_table_offset=" << m_l1_table_offset << ", "
1167 << "snapshot_count=" << m_snapshot_count << ", "
1168 << "snapshots_offset=" << m_snapshots_offset << dendl;
1169
1170 // allocate memory for L1 table and L2 + cluster caches
1171 m_l2_table_cache = std::make_unique<L2TableCache>(this);
1172 m_cluster_cache = std::make_unique<ClusterCache>(this);
1173
1174 read_snapshot(on_finish);
1175 }
1176
1177 template <typename I>
1178 void QCOWFormat<I>::read_snapshot(Context* on_finish) {
1179 if (m_snapshots_offset == 0 || m_snapshots.size() == m_snapshot_count) {
1180 read_l1_table(on_finish);
1181 return;
1182 }
1183
1184 // header is always aligned on 8 byte boundary
1185 m_snapshots_offset = p2roundup(m_snapshots_offset, static_cast<uint64_t>(8));
1186
1187 auto cct = m_image_ctx->cct;
1188 ldout(cct, 10) << "snap_id=" << (m_snapshots.size() + 1) << ", "
1189 << "offset=" << m_snapshots_offset << dendl;
1190
1191 auto ctx = new LambdaContext([this, on_finish](int r) {
1192 handle_read_snapshot(r, on_finish); });
1193 m_bl.clear();
1194 m_stream->read({{m_snapshots_offset, sizeof(QCowSnapshotHeader)}}, &m_bl,
1195 ctx);
1196 }
1197
// Parse one on-disk snapshot header, record its metadata, and chain into
// reading the variable-length extra data that follows the fixed header.
template <typename I>
void QCOWFormat<I>::handle_read_snapshot(int r, Context* on_finish) {
  auto cct = m_image_ctx->cct;
  ldout(cct, 10) << "r=" << r << ", "
                 << "index=" << m_snapshots.size() << dendl;

  if (r < 0) {
    lderr(cct) << "failed to read QCOW2 snapshot header: " << cpp_strerror(r)
               << dendl;
    on_finish->complete(r);
    return;
  }

  // advance past the fixed-size header; the id/name/extra payload follows
  m_snapshots_offset += m_bl.length();
  auto header = *reinterpret_cast<QCowSnapshotHeader*>(m_bl.c_str());

  // snapshot ids handed to librbd are 1-based ordinals (size()+1 inserts
  // the next id into the map)
  auto& snapshot = m_snapshots[m_snapshots.size() + 1];
  // resize() records the on-disk string lengths; the actual id/name bytes
  // are filled in by handle_read_snapshot_extra()
  snapshot.id.resize(be16toh(header.id_str_size));
  snapshot.name.resize(be16toh(header.name_size));
  snapshot.l1_table_offset = be64toh(header.l1_table_offset);
  snapshot.l1_table.size = be32toh(header.l1_size);
  snapshot.timestamp.sec_ref() = be32toh(header.date_sec);
  snapshot.timestamp.nsec_ref() = be32toh(header.date_nsec);
  snapshot.extra_data_size = be32toh(header.extra_data_size);

  ldout(cct, 10) << "snap_id=" << m_snapshots.size() << ", "
                 << "id_str_len=" << snapshot.id.size() << ", "
                 << "name_str_len=" << snapshot.name.size() << ", "
                 << "l1_table_offset=" << snapshot.l1_table_offset << ", "
                 << "l1_size=" << snapshot.l1_table.size << ", "
                 << "extra_data_size=" << snapshot.extra_data_size << dendl;

  read_snapshot_extra(on_finish);
}
1232
1233 template <typename I>
1234 void QCOWFormat<I>::read_snapshot_extra(Context* on_finish) {
1235 ceph_assert(!m_snapshots.empty());
1236 auto& snapshot = m_snapshots.rbegin()->second;
1237
1238 uint32_t length = snapshot.extra_data_size +
1239 snapshot.id.size() +
1240 snapshot.name.size();
1241 if (length == 0) {
1242 uuid_d uuid_gen;
1243 uuid_gen.generate_random();
1244 snapshot.name = uuid_gen.to_string();
1245
1246 read_snapshot(on_finish);
1247 return;
1248 }
1249
1250 auto cct = m_image_ctx->cct;
1251 ldout(cct, 10) << "snap_id=" << m_snapshots.size() << ", "
1252 << "offset=" << m_snapshots_offset << ", "
1253 << "length=" << length << dendl;
1254
1255 auto offset = m_snapshots_offset;
1256 m_snapshots_offset += length;
1257
1258 auto ctx = new LambdaContext([this, on_finish](int r) {
1259 handle_read_snapshot_extra(r, on_finish); });
1260 m_bl.clear();
1261 m_stream->read({{offset, length}}, &m_bl, ctx);
1262 }
1263
// Decode the variable-length payload that trails a snapshot header:
// [extra data][id string][name string]. Populates the snapshot's size,
// id, and name, then loops back to read_snapshot() for the next entry.
template <typename I>
void QCOWFormat<I>::handle_read_snapshot_extra(int r, Context* on_finish) {
  ceph_assert(!m_snapshots.empty());
  auto& snapshot = m_snapshots.rbegin()->second;

  auto cct = m_image_ctx->cct;
  ldout(cct, 10) << "r=" << r << ", "
                 << "snap_id=" << m_snapshots.size() << dendl;

  if (r < 0) {
    lderr(cct) << "failed to read QCOW2 snapshot header extra: "
               << cpp_strerror(r) << dendl;
    on_finish->complete(r);
    return;
  }

  // only trust the extra data's disk_size field if the on-disk blob is
  // large enough to actually contain it; otherwise the snapshot inherits
  // the current image size
  if (snapshot.extra_data_size >=
        offsetof(QCowSnapshotExtraData, disk_size) + sizeof(uint64_t)) {
    auto extra = reinterpret_cast<const QCowSnapshotExtraData*>(m_bl.c_str());
    snapshot.size = be64toh(extra->disk_size);
  } else {
    snapshot.size = m_size;
  }

  // walk past the extra blob to the id/name strings (lengths were recorded
  // via resize() when the fixed header was parsed)
  auto data = reinterpret_cast<const char*>(m_bl.c_str());
  data += snapshot.extra_data_size;

  if (!snapshot.id.empty()) {
    snapshot.id = std::string(data, snapshot.id.size());
    data += snapshot.id.size();
  }

  if (!snapshot.name.empty()) {
    snapshot.name = std::string(data, snapshot.name.size());
    data += snapshot.name.size();
  } else {
    // unnamed snapshot: synthesize a unique name so it can be referenced
    uuid_d uuid_gen;
    uuid_gen.generate_random();
    snapshot.name = uuid_gen.to_string();
  }

  ldout(cct, 10) << "snap_id=" << m_snapshots.size() << ", "
                 << "name=" << snapshot.name << ", "
                 << "size=" << snapshot.size << dendl;
  read_snapshot_l1_table(on_finish);
}
1310
1311 template <typename I>
1312 void QCOWFormat<I>::read_snapshot_l1_table(Context* on_finish) {
1313 ceph_assert(!m_snapshots.empty());
1314 auto& snapshot = m_snapshots.rbegin()->second;
1315
1316 auto cct = m_image_ctx->cct;
1317 ldout(cct, 10) << "snap_id=" << m_snapshots.size() << ", "
1318 << "l1_table_offset=" << snapshot.l1_table_offset
1319 << dendl;
1320
1321 auto ctx = new LambdaContext([this, on_finish](int r) {
1322 handle_read_snapshot_l1_table(r, on_finish); });
1323 m_stream->read({{snapshot.l1_table_offset,
1324 snapshot.l1_table.size * sizeof(uint64_t)}},
1325 &snapshot.l1_table.bl, ctx);
1326 }
1327
1328 template <typename I>
1329 void QCOWFormat<I>::handle_read_snapshot_l1_table(int r, Context* on_finish) {
1330 ceph_assert(!m_snapshots.empty());
1331 auto& snapshot = m_snapshots.rbegin()->second;
1332
1333 auto cct = m_image_ctx->cct;
1334 ldout(cct, 10) << "r=" << r << ", "
1335 << "snap_id=" << m_snapshots.size() << dendl;
1336
1337 if (r < 0) {
1338 lderr(cct) << "failed to read snapshot L1 table: " << cpp_strerror(r)
1339 << dendl;
1340 on_finish->complete(r);
1341 return;
1342 }
1343
1344 snapshot.l1_table.decode();
1345 read_snapshot(on_finish);
1346 }
1347
1348 template <typename I>
1349 void QCOWFormat<I>::read_l1_table(Context* on_finish) {
1350 auto cct = m_image_ctx->cct;
1351 ldout(cct, 10) << dendl;
1352
1353 auto ctx = new LambdaContext([this, on_finish](int r) {
1354 handle_read_l1_table(r, on_finish); });
1355 m_stream->read({{m_l1_table_offset,
1356 m_l1_table.size * sizeof(uint64_t)}},
1357 &m_l1_table.bl, ctx);
1358 }
1359
1360 template <typename I>
1361 void QCOWFormat<I>::handle_read_l1_table(int r, Context* on_finish) {
1362 auto cct = m_image_ctx->cct;
1363 ldout(cct, 10) << "r=" << r << dendl;
1364
1365 if (r < 0) {
1366 lderr(cct) << "failed to read L1 table: " << cpp_strerror(r) << dendl;
1367 on_finish->complete(r);
1368 return;
1369 }
1370
1371 m_l1_table.decode();
1372 read_backing_file(on_finish);
1373 }
1374
1375 template <typename I>
1376 void QCOWFormat<I>::read_backing_file(Context* on_finish) {
1377 if (m_backing_file_offset == 0 || m_backing_file_size == 0) {
1378 // all data is within the specified file
1379 on_finish->complete(0);
1380 return;
1381 }
1382
1383 auto cct = m_image_ctx->cct;
1384 ldout(cct, 10) << dendl;
1385
1386 // TODO add support for backing files
1387 on_finish->complete(-ENOTSUP);
1388 }
1389
1390 template <typename I>
1391 void QCOWFormat<I>::close(Context* on_finish) {
1392 auto cct = m_image_ctx->cct;
1393 ldout(cct, 10) << dendl;
1394
1395 m_stream->close(on_finish);
1396 }
1397
1398 template <typename I>
1399 void QCOWFormat<I>::get_snapshots(SnapInfos* snap_infos, Context* on_finish) {
1400 auto cct = m_image_ctx->cct;
1401 ldout(cct, 10) << dendl;
1402
1403 snap_infos->clear();
1404 for (auto& [snap_id, snapshot] : m_snapshots) {
1405 SnapInfo snap_info(snapshot.name, cls::rbd::UserSnapshotNamespace{},
1406 snapshot.size, {}, 0, 0, snapshot.timestamp);
1407 snap_infos->emplace(snap_id, snap_info);
1408 }
1409
1410 on_finish->complete(0);
1411 }
1412
1413 template <typename I>
1414 void QCOWFormat<I>::get_image_size(uint64_t snap_id, uint64_t* size,
1415 Context* on_finish) {
1416 auto cct = m_image_ctx->cct;
1417 ldout(cct, 10) << "snap_id=" << snap_id << dendl;
1418
1419 if (snap_id == CEPH_NOSNAP) {
1420 *size = m_size;
1421 } else {
1422 auto snapshot_it = m_snapshots.find(snap_id);
1423 if (snapshot_it == m_snapshots.end()) {
1424 on_finish->complete(-ENOENT);
1425 return;
1426 }
1427
1428 auto& snapshot = snapshot_it->second;
1429 *size = snapshot.size;
1430 }
1431
1432 on_finish->complete(0);
1433 }
1434
1435 template <typename I>
1436 bool QCOWFormat<I>::read(
1437 io::AioCompletion* aio_comp, uint64_t snap_id, io::Extents&& image_extents,
1438 io::ReadResult&& read_result, int op_flags, int read_flags,
1439 const ZTracer::Trace &parent_trace) {
1440 auto cct = m_image_ctx->cct;
1441 ldout(cct, 20) << "snap_id=" << snap_id << ", "
1442 << "image_extents=" << image_extents << dendl;
1443
1444 const LookupTable* l1_table = nullptr;
1445 if (snap_id == CEPH_NOSNAP) {
1446 l1_table = &m_l1_table;
1447 } else {
1448 auto snapshot_it = m_snapshots.find(snap_id);
1449 if (snapshot_it == m_snapshots.end()) {
1450 aio_comp->fail(-ENOENT);
1451 return true;
1452 }
1453
1454 auto& snapshot = snapshot_it->second;
1455 l1_table = &snapshot.l1_table;
1456 }
1457
1458 aio_comp->read_result = std::move(read_result);
1459 aio_comp->read_result.set_image_extents(image_extents);
1460
1461 auto read_request = new ReadRequest(this, aio_comp, l1_table,
1462 std::move(image_extents));
1463 read_request->send();
1464
1465 return true;
1466 }
1467
// Compute per-snapshot sparse deltas for the given image extents: split the
// request into cluster-aligned pieces, group them by L1 table index, and fan
// out one ListSnapsRequest per L1 index across all snapshot L1 tables plus
// the HEAD table. Results are merged in handle_list_snaps().
template <typename I>
void QCOWFormat<I>::list_snaps(io::Extents&& image_extents,
                               io::SnapIds&& snap_ids, int list_snaps_flags,
                               io::SnapshotDelta* snapshot_delta,
                               const ZTracer::Trace &parent_trace,
                               Context* on_finish) {
  auto cct = m_image_ctx->cct;
  ldout(cct, 20) << "image_extents=" << image_extents << dendl;

  ClusterExtents cluster_extents;
  populate_cluster_extents(cct, m_cluster_size, image_extents,
                           &cluster_extents);

  // map L1 table indexes to cluster extents
  std::map<uint64_t, ClusterExtents> l1_cluster_extents;
  for (auto& cluster_extent : cluster_extents) {
    uint32_t l1_table_index = cluster_extent.image_offset >> m_l1_shift;
    auto& l1_cluster_extent = l1_cluster_extents[l1_table_index];
    // over-reserve (total extent count) to avoid reallocation while appending
    l1_cluster_extent.reserve(cluster_extents.size());
    l1_cluster_extent.push_back(cluster_extent);
  }

  // every revision participates: each snapshot's L1 table plus HEAD
  std::map<uint64_t, const LookupTable*> snap_id_to_l1_table;
  for (auto& [snap_id, snapshot] : m_snapshots) {
    snap_id_to_l1_table[snap_id] = &snapshot.l1_table;
  }
  snap_id_to_l1_table[CEPH_NOSNAP] = &m_l1_table;

  // wrap the caller's context so post-processing (zeroing shrunk snapshots,
  // merging deltas) runs after all sub-requests finish; image_extents is
  // captured by copy since the sub-requests consume the grouped copies
  on_finish = new LambdaContext([this, image_extents,
                                 snap_ids=std::move(snap_ids),
                                 snapshot_delta, on_finish](int r) mutable {
    handle_list_snaps(r, std::move(image_extents), std::move(snap_ids),
                      snapshot_delta, on_finish);
  });

  auto gather_ctx = new C_Gather(cct, on_finish);

  // one self-owning request per L1 table index; each reports via a sub-context
  for (auto& [l1_table_index, cluster_extents] : l1_cluster_extents) {
    auto list_snaps_request = new ListSnapsRequest(
      this, l1_table_index, std::move(cluster_extents), snap_id_to_l1_table,
      snapshot_delta, gather_ctx->new_sub());
    list_snaps_request->send();
  }

  gather_ctx->activate();
}
1514
1515 template <typename I>
1516 void QCOWFormat<I>::handle_list_snaps(int r, io::Extents&& image_extents,
1517 io::SnapIds&& snap_ids,
1518 io::SnapshotDelta* snapshot_delta,
1519 Context* on_finish) {
1520 auto cct = m_image_ctx->cct;
1521 ldout(cct, 20) << "r=" << r << ", "
1522 << "snapshot_delta=" << *snapshot_delta << dendl;
1523
1524 std::optional<uint64_t> previous_size = std::nullopt;
1525 for (auto& [snap_id, snapshot] : m_snapshots) {
1526 auto sparse_extents = &(*snapshot_delta)[{snap_id, snap_id}];
1527 util::zero_shrunk_snapshot(cct, image_extents, snap_id, snapshot.size,
1528 &previous_size, sparse_extents);
1529 }
1530
1531 auto sparse_extents = &(*snapshot_delta)[{CEPH_NOSNAP, CEPH_NOSNAP}];
1532 util::zero_shrunk_snapshot(cct, image_extents, CEPH_NOSNAP, m_size,
1533 &previous_size, sparse_extents);
1534
1535 util::merge_snapshot_delta(snap_ids, snapshot_delta);
1536 on_finish->complete(r);
1537 }
1538
1539 } // namespace migration
1540 } // namespace librbd
1541
1542 template class librbd::migration::QCOWFormat<librbd::ImageCtx>;