ceph/src/os/bluestore/BlueFS.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 #include <chrono>
   4 #include "boost/algorithm/string.hpp"
   5 #include "bluestore_common.h"
   6 #include "BlueFS.h"
   7
   8 #include "common/debug.h"
   9 #include "common/errno.h"
  10 #include "common/perf_counters.h"
  11 #include "Allocator.h"
  12 #include "include/ceph_assert.h"
  13 #include "common/admin_socket.h"
  14
  15 #define dout_context cct
  16 #define dout_subsys ceph_subsys_bluefs
  17 #undef dout_prefix
  18 #define dout_prefix *_dout << "bluefs "
  19 using TOPNSPC::common::cmd_getval;
  20
  21 using std::byte;
  22 using std::list;
  23 using std::make_pair;
  24 using std::map;
  25 using std::ostream;
  26 using std::pair;
  27 using std::set;
  28 using std::string;
  29 using std::to_string;
  30 using std::vector;
  31 using std::chrono::duration;
  32 using std::chrono::seconds;
  33
  34 using ceph::bufferlist;
  35 using ceph::decode;
  36 using ceph::encode;
  37 using ceph::Formatter;
  38
  39
  40 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
  41 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
  42 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer);
  43 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
  44                               bluefs_file_reader_buffer, bluefs_file_reader);
  45 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader);
  46 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
  47
  48 static void wal_discard_cb(void *priv, void* priv2) {
  49   BlueFS *bluefs = static_cast<BlueFS*>(priv);
  50   interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
  51   bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
  52 }
  53
  54 static void db_discard_cb(void *priv, void* priv2) {
  55   BlueFS *bluefs = static_cast<BlueFS*>(priv);
  56   interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
  57   bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
  58 }
  59
  60 static void slow_discard_cb(void *priv, void* priv2) {
  61   BlueFS *bluefs = static_cast<BlueFS*>(priv);
  62   interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
  63   bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
  64 }
  65
  66 class BlueFS::SocketHook : public AdminSocketHook {
  67   BlueFS* bluefs;
  68 public:
  69   static BlueFS::SocketHook* create(BlueFS* bluefs)
  70   {
  71     BlueFS::SocketHook* hook = nullptr;
  72     AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
  73     if (admin_socket) {
  74       hook = new BlueFS::SocketHook(bluefs);
  75       int r = admin_socket->register_command("bluestore bluefs device info "
  76                                              "name=alloc_size,type=CephInt,req=false",
  77                                              hook,
  78                                              "Shows space report for bluefs devices. "
  79                                              "This also includes an estimation for space "
  80                                              "available to bluefs at main device. "
  81                                              "alloc_size, if set, specifies the custom bluefs "
  82                                              "allocation unit size for the estimation above.");
  83       if (r != 0) {
  84         ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
  85         delete hook;
  86         hook = nullptr;
  87       } else {
  88         r = admin_socket->register_command("bluefs stats",
  89                                            hook,
  90                                            "Dump internal statistics for bluefs."
  91                                            "");
  92         ceph_assert(r == 0);
  93         r = admin_socket->register_command("bluefs files list", hook,
  94                                            "print files in bluefs");
  95         ceph_assert(r == 0);
  96         r = admin_socket->register_command("bluefs debug_inject_read_zeros", hook,
  97                                            "Injects 8K zeros into next BlueFS read. Debug only.");
  98         ceph_assert(r == 0);
  99       }
 100     }
 101     return hook;
 102   }
 103
 104   ~SocketHook() {
 105     AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
 106     admin_socket->unregister_commands(this);
 107   }
 108 private:
 109   SocketHook(BlueFS* bluefs) :
 110     bluefs(bluefs) {}
 111   int call(std::string_view command, const cmdmap_t& cmdmap,
 112            const bufferlist&,
 113            Formatter *f,
 114            std::ostream& errss,
 115            bufferlist& out) override {
 116     if (command == "bluestore bluefs device info") {
 117       int64_t alloc_size = 0;
 118       cmd_getval(cmdmap, "alloc_size", alloc_size);
 119       if ((alloc_size & (alloc_size - 1)) != 0) {
 120         errss << "Invalid allocation size:'" << alloc_size << std::endl;
 121         return -EINVAL;
 122       }
 123       if (alloc_size == 0)
 124         alloc_size = bluefs->cct->_conf->bluefs_shared_alloc_size;
 125       f->open_object_section("bluefs_device_info");
 126       for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
 127         if (bluefs->bdev[dev]) {
 128           f->open_object_section("dev");
 129           f->dump_string("device", bluefs->get_device_name(dev));
 130           ceph_assert(bluefs->alloc[dev]);
 131           auto total = bluefs->get_total(dev);
 132           auto free = bluefs->get_free(dev);
 133           auto used = bluefs->get_used(dev);
 134
 135           f->dump_int("total", total);
 136           f->dump_int("free", free);
 137           f->dump_int("bluefs_used", used);
 138           if (bluefs->is_shared_alloc(dev)) {
 139             size_t avail = bluefs->probe_alloc_avail(dev, alloc_size);
 140             f->dump_int("bluefs max available", avail);
 141           }
 142           f->close_section();
 143         }
 144       }
 145
 146       f->close_section();
 147     } else if (command == "bluefs stats") {
 148       std::stringstream ss;
 149       bluefs->dump_block_extents(ss);
 150       bluefs->dump_volume_selector(ss);
 151       out.append(ss);
 152     } else if (command == "bluefs files list") {
 153       const char* devnames[3] = {"wal","db","slow"};
 154       std::lock_guard l(bluefs->nodes.lock);
 155       f->open_array_section("files");
 156       for (auto &d : bluefs->nodes.dir_map) {
 157         std::string dir = d.first;
 158         for (auto &r : d.second->file_map) {
 159           f->open_object_section("file");
 160           f->dump_string("name", (dir + "/" + r.first).c_str());
 161           std::vector<size_t> sizes;
 162           sizes.resize(bluefs->bdev.size());
 163           for(auto& i : r.second->fnode.extents) {
 164             sizes[i.bdev] += i.length;
 165           }
 166           for (size_t i = 0; i < sizes.size(); i++) {
 167             if (sizes[i]>0) {
 168               if (i < sizeof(devnames) / sizeof(*devnames))
 169                 f->dump_int(devnames[i], sizes[i]);
 170               else
 171                 f->dump_int(("dev-"+to_string(i)).c_str(), sizes[i]);
 172             }
 173           }
 174           f->close_section();
 175         }
 176       }
 177       f->close_section();
 178       f->flush(out);
 179     } else if (command == "bluefs debug_inject_read_zeros") {
 180       bluefs->inject_read_zeros++;
 181     } else {
 182       errss << "Invalid command" << std::endl;
 183       return -ENOSYS;
 184     }
 185     return 0;
 186   }
 187 };
 188
 189 BlueFS::BlueFS(CephContext* cct)
 190   : cct(cct),
 191     bdev(MAX_BDEV),
 192     ioc(MAX_BDEV),
 193     block_reserved(MAX_BDEV),
 194     alloc(MAX_BDEV),
 195     alloc_size(MAX_BDEV, 0)
 196 {
 197   dirty.pending_release.resize(MAX_BDEV);
 198   discard_cb[BDEV_WAL] = wal_discard_cb;
 199   discard_cb[BDEV_DB] = db_discard_cb;
 200   discard_cb[BDEV_SLOW] = slow_discard_cb;
 201   asok_hook = SocketHook::create(this);
 202 }
 203
 204 BlueFS::~BlueFS()
 205 {
 206   delete asok_hook;
 207   for (auto p : ioc) {
 208     if (p)
 209       p->aio_wait();
 210   }
 211   for (auto p : bdev) {
 212     if (p) {
 213       p->close();
 214       delete p;
 215     }
 216   }
 217   for (auto p : ioc) {
 218     delete p;
 219   }
 220 }
 221
 222 void BlueFS::_init_logger()
 223 {
 224   PerfCountersBuilder b(cct, "bluefs",
 225                         l_bluefs_first, l_bluefs_last);
 226   b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
 227             "Total bytes (main db device)",
 228             "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
 229   b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
 230             "Used bytes (main db device)",
 231             "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
 232   b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
 233             "Total bytes (wal device)",
 234             "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
 235   b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
 236             "Used bytes (wal device)",
 237             "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
 238   b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
 239             "Total bytes (slow device)",
 240             "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
 241   b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
 242             "Used bytes (slow device)",
 243             "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
 244   b.add_u64(l_bluefs_num_files, "num_files", "File count",
 245             "f", PerfCountersBuilder::PRIO_USEFUL);
 246   b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
 247             "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
 248   b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
 249                     "Compactions of the metadata log");
 250   b.add_u64_counter(l_bluefs_log_write_count, "log_write_count",
 251                     "Write op count to the metadata log");
 252   b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
 253                     "Bytes written to the metadata log",
 254                     "j",
 255                     PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
 256   b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
 257                     "Files written to WAL");
 258   b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
 259                     "Files written to SSTs");
 260   b.add_u64_counter(l_bluefs_write_count_wal, "write_count_wal",
 261                     "Write op count to WAL");
 262   b.add_u64_counter(l_bluefs_write_count_sst, "write_count_sst",
 263                     "Write op count to SSTs");
 264   b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
 265                     "Bytes written to WAL",
 266                     "walb",
 267                     PerfCountersBuilder::PRIO_CRITICAL);
 268   b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
 269                     "Bytes written to SSTs",
 270                     "sstb",
 271                     PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
 272   b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
 273                     "Bytes written to WAL/SSTs at slow device",
 274                     "slwb",
 275                     PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
 276   b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
 277                     "Maximum bytes allocated from WAL",
 278                     "mxwb",
 279                     PerfCountersBuilder::PRIO_INTERESTING,
 280                     unit_t(UNIT_BYTES));
 281   b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
 282                     "Maximum bytes allocated from DB",
 283                     "mxdb",
 284                     PerfCountersBuilder::PRIO_INTERESTING,
 285                     unit_t(UNIT_BYTES));
 286   b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
 287                     "Maximum bytes allocated from SLOW",
 288                     "mxwb",
 289                     PerfCountersBuilder::PRIO_INTERESTING,
 290                     unit_t(UNIT_BYTES));
 291   b.add_u64_counter(l_bluefs_main_alloc_unit, "alloc_unit_main",
 292                     "Allocation unit size (in bytes) for primary/shared device",
 293                     "aumb",
 294                     PerfCountersBuilder::PRIO_CRITICAL,
 295                     unit_t(UNIT_BYTES));
 296   b.add_u64_counter(l_bluefs_db_alloc_unit, "alloc_unit_db",
 297                     "Allocation unit size (in bytes) for standalone DB device",
 298                     "audb",
 299                     PerfCountersBuilder::PRIO_CRITICAL,
 300                     unit_t(UNIT_BYTES));
 301   b.add_u64_counter(l_bluefs_wal_alloc_unit, "alloc_unit_wal",
 302                     "Allocation unit size (in bytes) for standalone WAL device",
 303                     "auwb",
 304                     PerfCountersBuilder::PRIO_CRITICAL,
 305                     unit_t(UNIT_BYTES));
 306   b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
 307                     "random read requests processed",
 308                     NULL,
 309                     PerfCountersBuilder::PRIO_USEFUL);
 310   b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
 311                     "Bytes requested in random read mode",
 312                     NULL,
 313                     PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
 314   b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
 315                     "random reads requests going to disk",
 316                     NULL,
 317                     PerfCountersBuilder::PRIO_USEFUL);
 318   b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
 319                     "Bytes read from disk in random read mode",
 320                     "rrb",
 321                     PerfCountersBuilder::PRIO_INTERESTING,
 322                     unit_t(UNIT_BYTES));
 323   b.add_u64_counter(l_bluefs_read_random_disk_bytes_wal, "read_random_disk_bytes_wal",
 324                     "random reads requests going to WAL disk",
 325                     NULL,
 326                     PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
 327   b.add_u64_counter(l_bluefs_read_random_disk_bytes_db, "read_random_disk_bytes_db",
 328                     "random reads requests going to DB disk",
 329                     NULL,
 330                     PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
 331   b.add_u64_counter(l_bluefs_read_random_disk_bytes_slow, "read_random_disk_bytes_slow",
 332                     "random reads requests going to main disk",
 333                     "rrsb",
 334                     PerfCountersBuilder::PRIO_INTERESTING,
 335                     unit_t(UNIT_BYTES));
 336   b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
 337                     "random read requests processed using prefetch buffer",
 338                     NULL,
 339                     PerfCountersBuilder::PRIO_USEFUL);
 340   b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
 341                     "Bytes read from prefetch buffer in random read mode",
 342                     NULL,
 343                     PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
 344   b.add_u64_counter(l_bluefs_read_count, "read_count",
 345                     "buffered read requests processed",
 346                     NULL,
 347                     PerfCountersBuilder::PRIO_USEFUL);
 348   b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
 349                     "Bytes requested in buffered read mode",
 350                     NULL,
 351                     PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
 352   b.add_u64_counter(l_bluefs_read_disk_count, "read_disk_count",
 353                     "buffered reads requests going to disk",
 354                     NULL,
 355                     PerfCountersBuilder::PRIO_USEFUL);
 356   b.add_u64_counter(l_bluefs_read_disk_bytes, "read_disk_bytes",
 357                     "Bytes read in buffered mode from disk",
 358                     "rb",
 359                     PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
 360   b.add_u64_counter(l_bluefs_read_disk_bytes_wal, "read_disk_bytes_wal",
 361                     "reads requests going to WAL disk",
 362                     NULL,
 363                     PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
 364   b.add_u64_counter(l_bluefs_read_disk_bytes_db, "read_disk_bytes_db",
 365                     "reads requests going to DB disk",
 366                     NULL,
 367                     PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
 368   b.add_u64_counter(l_bluefs_read_disk_bytes_slow, "read_disk_bytes_slow",
 369                     "reads requests going to main disk",
 370                     "rsb",
 371                     PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
 372   b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
 373                     "prefetch read requests processed",
 374                      NULL,
 375                     PerfCountersBuilder::PRIO_USEFUL);
 376   b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
 377                     "Bytes requested in prefetch read mode",
 378                      NULL,
 379                     PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
 380   b.add_u64_counter(l_bluefs_write_count, "write_count",
 381                     "Write requests processed");
 382   b.add_u64_counter(l_bluefs_write_disk_count, "write_disk_count",
 383                     "Write requests sent to disk");
 384   b.add_u64_counter(l_bluefs_write_bytes, "write_bytes",
 385                     "Bytes written", NULL,
 386                     PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
 387  b.add_time_avg     (l_bluefs_compaction_lat, "compact_lat",
 388                     "Average bluefs log compaction latency",
 389                     "c__t",
 390                     PerfCountersBuilder::PRIO_INTERESTING);
 391  b.add_time_avg     (l_bluefs_compaction_lock_lat, "compact_lock_lat",
 392                     "Average lock duration while compacting bluefs log",
 393                     "c_lt",
 394                     PerfCountersBuilder::PRIO_INTERESTING);
 395   b.add_u64_counter(l_bluefs_alloc_shared_dev_fallbacks, "alloc_slow_fallback",
 396                     "Amount of allocations that required fallback to "
 397                     " slow/shared device",
 398                      "asdf",
 399                     PerfCountersBuilder::PRIO_USEFUL);
 400   b.add_u64_counter(l_bluefs_alloc_shared_size_fallbacks, "alloc_slow_size_fallback",
 401                     "Amount of allocations that required fallback to shared device's "
 402                     "regular unit size",
 403                      "assf",
 404                     PerfCountersBuilder::PRIO_USEFUL);
 405   b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate",
 406             "How many times bluefs read found page with all 0s");
 407   b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors",
 408             "How many times bluefs read found transient page with all 0s");
 409
 410   logger = b.create_perf_counters();
 411   cct->get_perfcounters_collection()->add(logger);
 412 }
 413
 414 void BlueFS::_shutdown_logger()
 415 {
 416   cct->get_perfcounters_collection()->remove(logger);
 417   delete logger;
 418 }
 419
 420 void BlueFS::_update_logger_stats()
 421 {
 422   if (alloc[BDEV_WAL]) {
 423     logger->set(l_bluefs_wal_total_bytes, _get_total(BDEV_WAL));
 424     logger->set(l_bluefs_wal_used_bytes, _get_used(BDEV_WAL));
 425   }
 426   if (alloc[BDEV_DB]) {
 427     logger->set(l_bluefs_db_total_bytes, _get_total(BDEV_DB));
 428     logger->set(l_bluefs_db_used_bytes, _get_used(BDEV_DB));
 429   }
 430   if (alloc[BDEV_SLOW]) {
 431     logger->set(l_bluefs_slow_total_bytes, _get_total(BDEV_SLOW));
 432     logger->set(l_bluefs_slow_used_bytes, _get_used(BDEV_SLOW));
 433   }
 434 }
 435
 436 int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
 437                              uint64_t reserved,
 438                              bluefs_shared_alloc_context_t* _shared_alloc)
 439 {
 440   dout(10) << __func__ << " bdev " << id << " path " << path << " "
 441            << reserved << dendl;
 442   ceph_assert(id < bdev.size());
 443   ceph_assert(bdev[id] == NULL);
 444   BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
 445                                        discard_cb[id], static_cast<void*>(this));
 446   block_reserved[id] = reserved;
 447   if (_shared_alloc) {
 448     b->set_no_exclusive_lock();
 449   }
 450   int r = b->open(path);
 451   if (r < 0) {
 452     delete b;
 453     return r;
 454   }
 455   if (trim) {
 456     interval_set<uint64_t> whole_device;
 457     whole_device.insert(0, b->get_size());
 458     b->try_discard(whole_device, false);
 459   }
 460
 461   dout(1) << __func__ << " bdev " << id << " path " << path
 462           << " size " << byte_u_t(b->get_size()) << dendl;
 463   bdev[id] = b;
 464   ioc[id] = new IOContext(cct, NULL);
 465   if (_shared_alloc) {
 466     ceph_assert(!shared_alloc);
 467     shared_alloc = _shared_alloc;
 468     alloc[id] = shared_alloc->a;
 469     shared_alloc_id = id;
 470   }
 471   return 0;
 472 }
 473
 474 bool BlueFS::bdev_support_label(unsigned id)
 475 {
 476   ceph_assert(id < bdev.size());
 477   ceph_assert(bdev[id]);
 478   return bdev[id]->supported_bdev_label();
 479 }
 480
 481 uint64_t BlueFS::get_block_device_size(unsigned id) const
 482 {
 483   if (id < bdev.size() && bdev[id])
 484     return bdev[id]->get_size();
 485   return 0;
 486 }
 487
 488 void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
 489 {
 490   dout(10) << __func__ << " bdev " << id << dendl;
 491   ceph_assert(alloc[id]);
 492   alloc[id]->release(to_release);
 493   if (is_shared_alloc(id)) {
 494     shared_alloc->bluefs_used -= to_release.size();
 495   }
 496 }
 497
 498 uint64_t BlueFS::get_used()
 499 {
 500   uint64_t used = 0;
 501   for (unsigned id = 0; id < MAX_BDEV; ++id) {
 502     used += _get_used(id);
 503   }
 504   return used;
 505 }
 506
 507 uint64_t BlueFS::_get_used(unsigned id) const
 508 {
 509   uint64_t used = 0;
 510   if (!alloc[id])
 511      return 0;
 512
 513   if (is_shared_alloc(id)) {
 514     used = shared_alloc->bluefs_used;
 515   } else {
 516     used = _get_total(id) - alloc[id]->get_free();
 517   }
 518   return used;
 519 }
 520
 521 uint64_t BlueFS::get_used(unsigned id)
 522 {
 523   ceph_assert(id < alloc.size());
 524   ceph_assert(alloc[id]);
 525   return _get_used(id);
 526 }
 527
 528 uint64_t BlueFS::_get_total(unsigned id) const
 529 {
 530   ceph_assert(id < bdev.size());
 531   ceph_assert(id < block_reserved.size());
 532   return get_block_device_size(id) - block_reserved[id];
 533 }
 534
 535 uint64_t BlueFS::get_total(unsigned id)
 536 {
 537   return _get_total(id);
 538 }
 539
 540 uint64_t BlueFS::get_free(unsigned id)
 541 {
 542   ceph_assert(id < alloc.size());
 543   return alloc[id]->get_free();
 544 }
 545
 546 void BlueFS::dump_perf_counters(Formatter *f)
 547 {
 548   f->open_object_section("bluefs_perf_counters");
 549   logger->dump_formatted(f, false, false);
 550   f->close_section();
 551 }
 552
 553 void BlueFS::dump_block_extents(ostream& out)
 554 {
 555   for (unsigned i = 0; i < MAX_BDEV; ++i) {
 556     if (!bdev[i]) {
 557       continue;
 558     }
 559     auto total = get_total(i);
 560     auto free = get_free(i);
 561
 562     out << i << " : device size 0x" << std::hex << total
 563         << " : using 0x" << total - free
 564         << std::dec << "(" << byte_u_t(total - free) << ")";
 565     out << "\n";
 566   }
 567 }
 568
 569 void BlueFS::foreach_block_extents(
 570   unsigned id,
 571   std::function<void(uint64_t, uint32_t)> fn)
 572 {
 573   std::lock_guard nl(nodes.lock);
 574   dout(10) << __func__ << " bdev " << id << dendl;
 575   ceph_assert(id < alloc.size());
 576   for (auto& p : nodes.file_map) {
 577     for (auto& q : p.second->fnode.extents) {
 578       if (q.bdev == id) {
 579         fn(q.offset, q.length);
 580       }
 581     }
 582   }
 583 }
 584
 585 int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
 586 {
 587   dout(1) << __func__
 588           << " osd_uuid " << osd_uuid
 589           << dendl;
 590
 591   // set volume selector if not provided before/outside
 592   if (vselector == nullptr) {
 593     vselector.reset(
 594       new OriginalVolumeSelector(
 595         get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
 596         get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
 597         get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
 598   }
 599
 600   _init_logger();
 601   _init_alloc();
 602
 603   super.version = 0;
 604   super.block_size = bdev[BDEV_DB]->get_block_size();
 605   super.osd_uuid = osd_uuid;
 606   super.uuid.generate_random();
 607   dout(1) << __func__ << " uuid " << super.uuid << dendl;
 608
 609   // init log
 610   FileRef log_file = ceph::make_ref<File>();
 611   log_file->fnode.ino = 1;
 612   log_file->vselector_hint = vselector->get_hint_for_log();
 613   int r = _allocate(
 614     vselector->select_prefer_bdev(log_file->vselector_hint),
 615     cct->_conf->bluefs_max_log_runway,
 616     0,
 617     &log_file->fnode);
 618   vselector->add_usage(log_file->vselector_hint, log_file->fnode);
 619   ceph_assert(r == 0);
 620   log.writer = _create_writer(log_file);
 621
 622   // initial txn
 623   ceph_assert(log.seq_live == 1);
 624   log.t.seq = 1;
 625   log.t.op_init();
 626   _flush_and_sync_log_LD();
 627
 628   // write supers
 629   super.log_fnode = log_file->fnode;
 630   super.memorized_layout = layout;
 631   _write_super(BDEV_DB);
 632   _flush_bdev();
 633
 634   // clean up
 635   super = bluefs_super_t();
 636   _close_writer(log.writer);
 637   log.writer = NULL;
 638   vselector.reset(nullptr);
 639   _stop_alloc();
 640   _shutdown_logger();
 641   if (shared_alloc) {
 642     ceph_assert(shared_alloc->need_init);
 643     shared_alloc->need_init = false;
 644   }
 645
 646   dout(10) << __func__ << " success" << dendl;
 647   return 0;
 648 }
 649
 650 void BlueFS::_init_alloc()
 651 {
 652   dout(20) << __func__ << dendl;
 653
 654   size_t wal_alloc_size = 0;
 655   if (bdev[BDEV_WAL]) {
 656     wal_alloc_size = cct->_conf->bluefs_alloc_size;
 657     alloc_size[BDEV_WAL] = wal_alloc_size;
 658   }
 659   logger->set(l_bluefs_wal_alloc_unit, wal_alloc_size);
 660
 661
 662   uint64_t shared_alloc_size = cct->_conf->bluefs_shared_alloc_size;
 663   if (shared_alloc && shared_alloc->a) {
 664     uint64_t unit = shared_alloc->a->get_block_size();
 665     shared_alloc_size = std::max(
 666       unit,
 667       shared_alloc_size);
 668     ceph_assert(0 == p2phase(shared_alloc_size, unit));
 669   }
 670   if (bdev[BDEV_SLOW]) {
 671     alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
 672     alloc_size[BDEV_SLOW] = shared_alloc_size;
 673   } else {
 674     alloc_size[BDEV_DB] = shared_alloc_size;
 675     alloc_size[BDEV_SLOW] = 0;
 676   }
 677   logger->set(l_bluefs_db_alloc_unit, alloc_size[BDEV_DB]);
 678   logger->set(l_bluefs_main_alloc_unit, alloc_size[BDEV_SLOW]);
 679   // new wal and db devices are never shared
 680   if (bdev[BDEV_NEWWAL]) {
 681     alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
 682   }
 683   if (bdev[BDEV_NEWDB]) {
 684     alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
 685   }
 686
 687   for (unsigned id = 0; id < bdev.size(); ++id) {
 688     if (!bdev[id]) {
 689       continue;
 690     }
 691     ceph_assert(bdev[id]->get_size());
 692     if (is_shared_alloc(id)) {
 693       dout(1) << __func__ << " shared, id " << id << std::hex
 694               << ", capacity 0x" << bdev[id]->get_size()
 695               << ", block size 0x" << alloc_size[id]
 696               << std::dec << dendl;
 697     } else {
 698       ceph_assert(alloc_size[id]);
 699       std::string name = "bluefs-";
 700       const char* devnames[] = { "wal","db","slow" };
 701       if (id <= BDEV_SLOW)
 702         name += devnames[id];
 703       else
 704         name += to_string(uintptr_t(this));
 705       dout(1) << __func__ << " new, id " << id << std::hex
 706               << ", allocator name " << name
 707               << ", allocator type " << cct->_conf->bluefs_allocator
 708               << ", capacity 0x" << bdev[id]->get_size()
 709               << ", block size 0x" << alloc_size[id]
 710               << std::dec << dendl;
 711       alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
 712                                     bdev[id]->get_size(),
 713                                     alloc_size[id],
 714                                     0, 0,
 715                                     name);
 716       alloc[id]->init_add_free(
 717         block_reserved[id],
 718         _get_total(id));
 719     }
 720   }
 721 }
 722
 723 void BlueFS::_stop_alloc()
 724 {
 725   dout(20) << __func__ << dendl;
 726   for (auto p : bdev) {
 727     if (p)
 728       p->discard_drain();
 729   }
 730
 731   for (size_t i = 0; i < alloc.size(); ++i) {
 732     if (alloc[i] && !is_shared_alloc(i)) {
 733       alloc[i]->shutdown();
 734       delete alloc[i];
 735       alloc[i] = nullptr;
 736     }
 737   }
 738 }
 739
 740 int BlueFS::_read_and_check(uint8_t ndev, uint64_t off, uint64_t len,
 741                             ceph::buffer::list *pbl, IOContext *ioc, bool buffered)
 742 {
 743   dout(10) << __func__ << " dev " << int(ndev)
 744            << ": 0x" << std::hex << off << "~" << len << std::dec
 745            << (buffered ? " buffered" : "")
 746            << dendl;
 747   int r;
 748   bufferlist bl;
 749   r = _bdev_read(ndev, off, len, &bl, ioc, buffered);
 750   if (r != 0) {
 751     return r;
 752   }
 753   uint64_t block_size = bdev[ndev]->get_block_size();
 754   if (inject_read_zeros) {
 755     if (len >= block_size * 2) {
 756       derr << __func__ << " injecting error, zeros at "
 757            << int(ndev) << ": 0x" << std::hex << (off + len / 2)
 758            << "~" << (block_size * 2) << std::dec << dendl;
 759       //use beginning, replace 8K in the middle with zeros, use tail
 760       bufferlist temp;
 761       bl.splice(0, len / 2 - block_size, &temp);
 762       temp.append(buffer::create(block_size * 2, 0));
 763       bl.splice(block_size * 2, len / 2 - block_size, &temp);
 764       bl = temp;
 765       inject_read_zeros--;
 766     }
 767   }
 768   //make a check if there is a block with all 0
 769   uint64_t to_check_len = len;
 770   uint64_t skip = p2nphase(off, block_size);
 771   if (skip >= to_check_len) {
 772     return r;
 773   }
 774   auto it = bl.begin(skip);
 775   to_check_len -= skip;
 776   bool all_zeros = false;
 777   while (all_zeros == false && to_check_len >= block_size) {
 778     // checking 0s step
 779     unsigned block_left = block_size;
 780     unsigned avail;
 781     const char* data;
 782     all_zeros = true;
 783     while (all_zeros && block_left > 0) {
 784       avail = it.get_ptr_and_advance(block_left, &data);
 785       block_left -= avail;
 786       all_zeros = mem_is_zero(data, avail);
 787     }
 788     // skipping step
 789     while (block_left > 0) {
 790       avail = it.get_ptr_and_advance(block_left, &data);
 791       block_left -= avail;
 792     }
 793     to_check_len -= block_size;
 794   }
 795   if (all_zeros) {
 796     logger->inc(l_bluefs_read_zeros_candidate, 1);
 797     bufferlist bl_reread;
 798     r = _bdev_read(ndev, off, len, &bl_reread, ioc, buffered);
 799     if (r != 0) {
 800       return r;
 801     }
 802     // check if both read gave the same
 803     if (!bl.contents_equal(bl_reread)) {
 804       // report problems to log, but continue, maybe it will be good now...
 805       derr << __func__ << " initial read of " << int(ndev)
 806            << ": 0x" << std::hex << off << "~" << len
 807            << std::dec << ": different then re-read " << dendl;
 808       logger->inc(l_bluefs_read_zeros_errors, 1);
 809     }
 810     // use second read will be better if is different
 811     pbl->append(bl_reread);
 812   } else {
 813     pbl->append(bl);
 814   }
 815   return r;
 816 }
 817
 818 int BlueFS::_read_random_and_check(
 819   uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered)
 820 {
 821   dout(10) << __func__ << " dev " << int(ndev)
 822            << ": 0x" << std::hex << off << "~" << len << std::dec
 823            << (buffered ? " buffered" : "")
 824            << dendl;
 825   int r;
 826   r = _bdev_read_random(ndev, off, len, buf, buffered);
 827   if (r != 0) {
 828     return r;
 829   }
 830   uint64_t block_size = bdev[ndev]->get_block_size();
 831   if (inject_read_zeros) {
 832     if (len >= block_size * 2) {
 833       derr << __func__ << " injecting error, zeros at "
 834            << int(ndev) << ": 0x" << std::hex << (off + len / 2)
 835            << "~" << (block_size * 2) << std::dec << dendl;
 836       //zero middle 8K
 837       memset(buf + len / 2 - block_size, 0, block_size * 2);
 838       inject_read_zeros--;
 839     }
 840   }
 841   //make a check if there is a block with all 0
 842   uint64_t to_check_len = len;
 843   const char* data = buf;
 844   uint64_t skip = p2nphase(off, block_size);
 845   if (skip >= to_check_len) {
 846     return r;
 847   }
 848   to_check_len -= skip;
 849   data += skip;
 850
 851   bool all_zeros = false;
 852   while (all_zeros == false && to_check_len >= block_size) {
 853     if (mem_is_zero(data, block_size)) {
 854       // at least one block is all zeros
 855       all_zeros = true;
 856       break;
 857     }
 858     data += block_size;
 859     to_check_len -= block_size;
 860   }
 861   if (all_zeros) {
 862     logger->inc(l_bluefs_read_zeros_candidate, 1);
 863     std::unique_ptr<char[]> data_reread(new char[len]);
 864     r = _bdev_read_random(ndev, off, len, &data_reread[0], buffered);
 865     if (r != 0) {
 866       return r;
 867     }
 868     // check if both read gave the same
 869     if (memcmp(buf, &data_reread[0], len) != 0) {
 870       derr << __func__ << " initial read of " << int(ndev)
 871            << ": 0x" << std::hex << off << "~" << len
 872            << std::dec << ": different then re-read " << dendl;
 873       logger->inc(l_bluefs_read_zeros_errors, 1);
 874       // second read is probably better
 875       memcpy(buf, &data_reread[0], len);
 876     }
 877   }
 878   return r;
 879 }
 880
 881 int BlueFS::_bdev_read(uint8_t ndev, uint64_t off, uint64_t len,
 882   ceph::buffer::list* pbl, IOContext* ioc, bool buffered)
 883 {
 884   int cnt = 0;
 885   switch (ndev) {
 886     case BDEV_WAL: cnt = l_bluefs_read_disk_bytes_wal; break;
 887     case BDEV_DB: cnt = l_bluefs_read_disk_bytes_db; break;
 888     case BDEV_SLOW: cnt = l_bluefs_read_disk_bytes_slow; break;
 889
 890   }
 891   if (cnt) {
 892     logger->inc(cnt, len);
 893   }
 894   return bdev[ndev]->read(off, len, pbl, ioc, buffered);
 895 }
 896
 897 int BlueFS::_bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len,
 898   char* buf, bool buffered)
 899 {
 900   int cnt = 0;
 901   switch (ndev) {
 902     case BDEV_WAL: cnt = l_bluefs_read_random_disk_bytes_wal; break;
 903     case BDEV_DB: cnt = l_bluefs_read_random_disk_bytes_db; break;
 904     case BDEV_SLOW: cnt = l_bluefs_read_random_disk_bytes_slow; break;
 905   }
 906   if (cnt) {
 907     logger->inc(cnt, len);
 908   }
 909   return bdev[ndev]->read_random(off, len, buf, buffered);
 910 }
 911
 912 int BlueFS::mount()
 913 {
 914   dout(1) << __func__ << dendl;
 915
 916   _init_logger();
 917   int r = _open_super();
 918   if (r < 0) {
 919     derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
 920     goto out;
 921   }
 922
 923   // set volume selector if not provided before/outside
 924   if (vselector == nullptr) {
 925     vselector.reset(
 926       new OriginalVolumeSelector(
 927         get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
 928         get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
 929         get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
 930   }
 931
 932   _init_alloc();
 933
 934   r = _replay(false, false);
 935   if (r < 0) {
 936     derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
 937     _stop_alloc();
 938     goto out;
 939   }
 940
 941   // init freelist
 942   for (auto& p : nodes.file_map) {
 943     dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
 944     for (auto& q : p.second->fnode.extents) {
 945       bool is_shared = is_shared_alloc(q.bdev);
 946       ceph_assert(!is_shared || (is_shared && shared_alloc));
 947       if (is_shared && shared_alloc->need_init && shared_alloc->a) {
 948         shared_alloc->bluefs_used += q.length;
 949         alloc[q.bdev]->init_rm_free(q.offset, q.length);
 950       } else if (!is_shared) {
 951         alloc[q.bdev]->init_rm_free(q.offset, q.length);
 952       }
 953     }
 954   }
 955   if (shared_alloc) {
 956     shared_alloc->need_init = false;
 957     dout(1) << __func__ << " shared_bdev_used = "
 958             << shared_alloc->bluefs_used << dendl;
 959   } else {
 960     dout(1) << __func__ << " shared bdev not used"
 961             << dendl;
 962   }
 963
 964   // set up the log for future writes
 965   log.writer = _create_writer(_get_file(1));
 966   ceph_assert(log.writer->file->fnode.ino == 1);
 967   log.writer->pos = log.writer->file->fnode.size;
 968   log.writer->file->fnode.reset_delta();
 969   dout(10) << __func__ << " log write pos set to 0x"
 970            << std::hex << log.writer->pos << std::dec
 971            << dendl;
 972   // update log size
 973   logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
 974   return 0;
 975
 976  out:
 977   super = bluefs_super_t();
 978   return r;
 979 }
 980
 981 int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const
 982 {
 983   if (super.memorized_layout) {
 984     if (layout == *super.memorized_layout) {
 985       dout(10) << __func__ << " bluefs layout verified positively" << dendl;
 986     } else {
 987       derr << __func__ << " memorized layout doesn't fit current one" << dendl;
 988       return -EIO;
 989     }
 990   } else {
 991     dout(10) << __func__ << " no memorized_layout in bluefs superblock"
 992              << dendl;
 993   }
 994
 995   return 0;
 996 }
 997
 998 void BlueFS::umount(bool avoid_compact)
 999 {
1000   dout(1) << __func__ << dendl;
1001
1002   sync_metadata(avoid_compact);
1003   if (cct->_conf->bluefs_check_volume_selector_on_umount) {
1004     _check_vselector_LNF();
1005   }
1006   _close_writer(log.writer);
1007   log.writer = NULL;
1008   log.t.clear();
1009
1010   vselector.reset(nullptr);
1011   _stop_alloc();
1012   nodes.file_map.clear();
1013   nodes.dir_map.clear();
1014   super = bluefs_super_t();
1015   _shutdown_logger();
1016 }
1017
1018 int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
1019 {
1020   dout(1) << __func__ << dendl;
1021
1022   if(id == BDEV_NEWDB) {
1023     int new_log_dev_cur = BDEV_WAL;
1024     int new_log_dev_next = BDEV_WAL;
1025     if (!bdev[BDEV_WAL]) {
1026       new_log_dev_cur = BDEV_NEWDB;
1027       new_log_dev_next = BDEV_DB;
1028     }
1029     _rewrite_log_and_layout_sync_LNF_LD(false,
1030       BDEV_NEWDB,
1031       new_log_dev_cur,
1032       new_log_dev_next,
1033       RENAME_DB2SLOW,
1034       layout);
1035   } else if(id == BDEV_NEWWAL) {
1036     _rewrite_log_and_layout_sync_LNF_LD(false,
1037       BDEV_DB,
1038       BDEV_NEWWAL,
1039       BDEV_WAL,
1040       REMOVE_WAL,
1041       layout);
1042   } else {
1043     assert(false);
1044   }
1045   return 0;
1046 }
1047
1048 void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
1049 {
1050   if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
1051     bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
1052   if (bdev[BDEV_WAL])
1053     bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
1054 }
1055
1056 void BlueFS::get_devices(set<string> *ls)
1057 {
1058   for (unsigned i = 0; i < MAX_BDEV; ++i) {
1059     if (bdev[i]) {
1060       bdev[i]->get_devices(ls);
1061     }
1062   }
1063 }
1064
1065 int BlueFS::fsck()
1066 {
1067   dout(1) << __func__ << dendl;
1068   // hrm, i think we check everything on mount...
1069   return 0;
1070 }
1071
1072 int BlueFS::_write_super(int dev)
1073 {
1074   ++super.version;
1075   // build superblock
1076   bufferlist bl;
1077   encode(super, bl);
1078   uint32_t crc = bl.crc32c(-1);
1079   encode(crc, bl);
1080   dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
1081   dout(10) << __func__ << " superblock " << super.version << dendl;
1082   dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
1083   ceph_assert_always(bl.length() <= get_super_length());
1084   bl.append_zero(get_super_length() - bl.length());
1085
1086   bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
1087   dout(20) << __func__ << " v " << super.version
1088            << " crc 0x" << std::hex << crc
1089            << " offset 0x" << get_super_offset() << std::dec
1090            << dendl;
1091   return 0;
1092 }
1093
1094 int BlueFS::_open_super()
1095 {
1096   dout(10) << __func__ << dendl;
1097
1098   bufferlist bl;
1099   uint32_t expected_crc, crc;
1100   int r;
1101
1102   // always the second block
1103   r = _bdev_read(BDEV_DB, get_super_offset(), get_super_length(),
1104                  &bl, ioc[BDEV_DB], false);
1105   if (r < 0)
1106     return r;
1107
1108   auto p = bl.cbegin();
1109   decode(super, p);
1110   {
1111     bufferlist t;
1112     t.substr_of(bl, 0, p.get_off());
1113     crc = t.crc32c(-1);
1114   }
1115   decode(expected_crc, p);
1116   if (crc != expected_crc) {
1117     derr << __func__ << " bad crc on superblock, expected 0x"
1118          << std::hex << expected_crc << " != actual 0x" << crc << std::dec
1119          << dendl;
1120     return -EIO;
1121   }
1122   dout(10) << __func__ << " superblock " << super.version << dendl;
1123   dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
1124   return 0;
1125 }
1126
1127 int BlueFS::_check_allocations(const bluefs_fnode_t& fnode,
1128   boost::dynamic_bitset<uint64_t>* used_blocks,
1129   bool is_alloc, //true when allocating, false when deallocating
1130   const char* op_name)
1131 {
1132   auto& fnode_extents = fnode.extents;
1133   for (auto e : fnode_extents) {
1134     auto id = e.bdev;
1135     bool fail = false;
1136     ceph_assert(id < MAX_BDEV);
1137     ceph_assert(bdev[id]);
1138     // let's use minimal allocation unit we can have
1139     auto alloc_unit = bdev[id]->get_block_size();
1140
1141     if (int r = _verify_alloc_granularity(id, e.offset, e.length,
1142                                           alloc_unit,
1143                                           op_name); r < 0) {
1144       return r;
1145     }
1146
1147     apply_for_bitset_range(e.offset, e.length, alloc_unit, used_blocks[id],
1148       [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1149         if (is_alloc == bs.test(pos)) {
1150           fail = true;
1151         } else {
1152           bs.flip(pos);
1153         }
1154       }
1155     );
1156     if (fail) {
1157       derr << __func__ << " " << op_name << " invalid extent " << int(e.bdev)
1158         << ": 0x" << std::hex << e.offset << "~" << e.length << std::dec
1159         << (is_alloc == true ?
1160             ": duplicate reference, ino " : ": double free, ino ")
1161         << fnode.ino << dendl;
1162       return -EFAULT;
1163     }
1164   }
1165   return 0;
1166 }
1167
1168 int BlueFS::_verify_alloc_granularity(
1169   __u8 id, uint64_t offset, uint64_t length, uint64_t alloc_unit, const char *op)
1170 {
1171   if ((offset & (alloc_unit - 1)) ||
1172       (length & (alloc_unit - 1))) {
1173     derr << __func__ << " " << op << " of " << (int)id
1174          << ":0x" << std::hex << offset << "~" << length << std::dec
1175          << " does not align to alloc_size 0x"
1176          << std::hex << alloc_unit << std::dec << dendl;
1177     return -EFAULT;
1178   }
1179   return 0;
1180 }
1181
1182 int BlueFS::_replay(bool noop, bool to_stdout)
1183 {
1184   dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
1185   ino_last = 1;  // by the log
1186   uint64_t log_seq = 0;
1187
1188   FileRef log_file;
1189   log_file = _get_file(1);
1190
1191   log_file->fnode = super.log_fnode;
1192   if (!noop) {
1193     log_file->vselector_hint =
1194       vselector->get_hint_for_log();
1195   }
1196   dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
1197   if (unlikely(to_stdout)) {
1198     std::cout << " log_fnode " << super.log_fnode << std::endl;
1199   }
1200
1201   FileReader *log_reader = new FileReader(
1202     log_file, cct->_conf->bluefs_max_prefetch,
1203     false,  // !random
1204     true);  // ignore eof
1205
1206   bool seen_recs = false;
1207
1208   boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
1209
1210   if (!noop) {
1211     if (cct->_conf->bluefs_log_replay_check_allocations) {
1212       for (size_t i = 0; i < MAX_BDEV; ++i) {
1213         if (bdev[i] != nullptr) {
1214           // let's use minimal allocation unit we can have
1215           auto au = bdev[i]->get_block_size();
1216           //hmm... on 32TB/4K drive this would take 1GB RAM!!!
1217           used_blocks[i].resize(round_up_to(bdev[i]->get_size(), au) / au);
1218         }
1219       }
1220       // check initial log layout
1221       int r = _check_allocations(log_file->fnode,
1222                                  used_blocks, true, "Log from super");
1223       if (r < 0) {
1224         return r;
1225       }
1226     }
1227   }
1228
1229   while (true) {
1230     ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
1231     uint64_t pos = log_reader->buf.pos;
1232     uint64_t read_pos = pos;
1233     bufferlist bl;
1234     {
1235       int r = _read(log_reader, read_pos, super.block_size,
1236                     &bl, NULL);
1237       if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) {
1238         r += _do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl);
1239       }
1240       assert(r == (int)super.block_size);
1241       read_pos += r;
1242     }
1243     uint64_t more = 0;
1244     uint64_t seq;
1245     uuid_d uuid;
1246     {
1247       auto p = bl.cbegin();
1248       __u8 a, b;
1249       uint32_t len;
1250       decode(a, p);
1251       decode(b, p);
1252       decode(len, p);
1253       decode(uuid, p);
1254       decode(seq, p);
1255       if (len + 6 > bl.length()) {
1256         more = round_up_to(len + 6 - bl.length(), super.block_size);
1257       }
1258     }
1259     if (uuid != super.uuid) {
1260       if (seen_recs) {
1261         dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1262                  << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1263                  << dendl;
1264       } else {
1265         derr << __func__ << " 0x" << std::hex << pos << std::dec
1266                  << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1267                  << ", block dump: \n";
1268         bufferlist t;
1269         t.substr_of(bl, 0, super.block_size);
1270         t.hexdump(*_dout);
1271         *_dout << dendl;
1272       }
1273       break;
1274     }
1275     if (seq != log_seq + 1) {
1276       if (seen_recs) {
1277         dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1278                  << ": stop: seq " << seq << " != expected " << log_seq + 1
1279                  << dendl;;
1280       } else {
1281         derr << __func__ << " 0x" << std::hex << pos << std::dec
1282              << ": stop: seq " << seq << " != expected " << log_seq + 1
1283              << dendl;;
1284       }
1285       break;
1286     }
1287     if (more) {
1288       dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
1289                << " more bytes" << dendl;
1290       bufferlist t;
1291       int r = _read(log_reader, read_pos, more, &t, NULL);
1292       if (r < (int)more) {
1293         dout(10) << __func__ << " 0x" << std::hex << pos
1294                  << ": stop: len is 0x" << bl.length() + more << std::dec
1295                  << ", which is past eof" << dendl;
1296         if (cct->_conf->bluefs_replay_recovery) {
1297           //try to search for more data
1298           r += _do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t);
1299           if (r < (int)more) {
1300             //in normal mode we must read r==more, for recovery it is too strict
1301             break;
1302           }
1303         }
1304       }
1305       ceph_assert(r == (int)more);
1306       bl.claim_append(t);
1307       read_pos += r;
1308     }
1309     bluefs_transaction_t t;
1310     try {
1311       auto p = bl.cbegin();
1312       decode(t, p);
1313       seen_recs = true;
1314     }
1315     catch (ceph::buffer::error& e) {
1316       // Multi-block transactions might be incomplete due to unexpected
1317       // power off. Hence let's treat that as a regular stop condition.
1318       if (seen_recs && more) {
1319         dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1320                  << ": stop: failed to decode: " << e.what()
1321                  << dendl;
1322       } else {
1323         derr << __func__ << " 0x" << std::hex << pos << std::dec
1324              << ": stop: failed to decode: " << e.what()
1325              << dendl;
1326         delete log_reader;
1327         return -EIO;
1328       }
1329       break;
1330     }
1331     ceph_assert(seq == t.seq);
1332     dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1333              << ": " << t << dendl;
1334     if (unlikely(to_stdout)) {
1335       std::cout << " 0x" << std::hex << pos << std::dec
1336                 << ": " << t << std::endl;
1337     }
1338
1339     auto p = t.op_bl.cbegin();
1340     auto pos0 = pos;
1341     while (!p.end()) {
1342       pos = pos0 + p.get_off();
1343       __u8 op;
1344       decode(op, p);
1345       switch (op) {
1346
1347       case bluefs_transaction_t::OP_INIT:
1348         dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1349                  << ":  op_init" << dendl;
1350         if (unlikely(to_stdout)) {
1351           std::cout << " 0x" << std::hex << pos << std::dec
1352                     << ":  op_init" << std::endl;
1353         }
1354
1355         ceph_assert(t.seq == 1);
1356         break;
1357
1358       case bluefs_transaction_t::OP_JUMP:
1359         {
1360           uint64_t next_seq;
1361           uint64_t offset;
1362           decode(next_seq, p);
1363           decode(offset, p);
1364           dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1365                    << ":  op_jump seq " << next_seq
1366                    << " offset 0x" << std::hex << offset << std::dec << dendl;
1367           if (unlikely(to_stdout)) {
1368             std::cout << " 0x" << std::hex << pos << std::dec
1369                       << ":  op_jump seq " << next_seq
1370                       << " offset 0x" << std::hex << offset << std::dec
1371                       << std::endl;
1372           }
1373
1374           ceph_assert(next_seq > log_seq);
1375           log_seq = next_seq - 1; // we will increment it below
1376           uint64_t skip = offset - read_pos;
1377           if (skip) {
1378             bufferlist junk;
1379             int r = _read(log_reader, read_pos, skip, &junk,
1380                           NULL);
1381             if (r != (int)skip) {
1382               dout(10) << __func__ << " 0x" << std::hex << read_pos
1383                        << ": stop: failed to skip to " << offset
1384                        << std::dec << dendl;
1385               ceph_abort_msg("problem with op_jump");
1386             }
1387           }
1388         }
1389         break;
1390
1391       case bluefs_transaction_t::OP_JUMP_SEQ:
1392         {
1393           uint64_t next_seq;
1394           decode(next_seq, p);
1395           dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1396                    << ":  op_jump_seq " << next_seq << dendl;
1397           if (unlikely(to_stdout)) {
1398             std::cout << " 0x" << std::hex << pos << std::dec
1399                       << ":  op_jump_seq " << next_seq << std::endl;
1400           }
1401
1402           ceph_assert(next_seq > log_seq);
1403           log_seq = next_seq - 1; // we will increment it below
1404         }
1405         break;
1406
1407       case bluefs_transaction_t::OP_ALLOC_ADD:
1408         // LEGACY, do nothing but read params
1409         {
1410           __u8 id;
1411           uint64_t offset, length;
1412           decode(id, p);
1413           decode(offset, p);
1414           decode(length, p);
1415         }
1416         break;
1417
1418       case bluefs_transaction_t::OP_ALLOC_RM:
1419         // LEGACY, do nothing but read params
1420         {
1421           __u8 id;
1422           uint64_t offset, length;
1423           decode(id, p);
1424           decode(offset, p);
1425           decode(length, p);
1426         }
1427         break;
1428
1429       case bluefs_transaction_t::OP_DIR_LINK:
1430         {
1431           string dirname, filename;
1432           uint64_t ino;
1433           decode(dirname, p);
1434           decode(filename, p);
1435           decode(ino, p);
1436           dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1437                    << ":  op_dir_link " << " " << dirname << "/" << filename
1438                    << " to " << ino
1439                    << dendl;
1440           if (unlikely(to_stdout)) {
1441             std::cout << " 0x" << std::hex << pos << std::dec
1442                       << ":  op_dir_link " << " " << dirname << "/" << filename
1443                       << " to " << ino
1444                       << std::endl;
1445           }
1446
1447           if (!noop) {
1448             FileRef file = _get_file(ino);
1449             ceph_assert(file->fnode.ino);
1450             map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1451             ceph_assert(q != nodes.dir_map.end());
1452             map<string,FileRef>::iterator r = q->second->file_map.find(filename);
1453             ceph_assert(r == q->second->file_map.end());
1454
1455             vselector->sub_usage(file->vselector_hint, file->fnode);
1456             file->vselector_hint =
1457               vselector->get_hint_by_dir(dirname);
1458             vselector->add_usage(file->vselector_hint, file->fnode);
1459
1460             q->second->file_map[filename] = file;
1461             ++file->refs;
1462           }
1463         }
1464         break;
1465
1466       case bluefs_transaction_t::OP_DIR_UNLINK:
1467         {
1468           string dirname, filename;
1469           decode(dirname, p);
1470           decode(filename, p);
1471           dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1472                    << ":  op_dir_unlink " << " " << dirname << "/" << filename
1473                    << dendl;
1474           if (unlikely(to_stdout)) {
1475             std::cout << " 0x" << std::hex << pos << std::dec
1476                       << ":  op_dir_unlink " << " " << dirname << "/" << filename
1477                       << std::endl;
1478           }
1479
1480           if (!noop) {
1481             map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1482             ceph_assert(q != nodes.dir_map.end());
1483             map<string,FileRef>::iterator r = q->second->file_map.find(filename);
1484             ceph_assert(r != q->second->file_map.end());
1485             ceph_assert(r->second->refs > 0);
1486             --r->second->refs;
1487             q->second->file_map.erase(r);
1488           }
1489         }
1490         break;
1491
1492       case bluefs_transaction_t::OP_DIR_CREATE:
1493         {
1494           string dirname;
1495           decode(dirname, p);
1496           dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1497                    << ":  op_dir_create " << dirname << dendl;
1498           if (unlikely(to_stdout)) {
1499             std::cout << " 0x" << std::hex << pos << std::dec
1500                       << ":  op_dir_create " << dirname << std::endl;
1501           }
1502
1503           if (!noop) {
1504             map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1505             ceph_assert(q == nodes.dir_map.end());
1506             nodes.dir_map[dirname] = ceph::make_ref<Dir>();
1507           }
1508         }
1509         break;
1510
1511       case bluefs_transaction_t::OP_DIR_REMOVE:
1512         {
1513           string dirname;
1514           decode(dirname, p);
1515           dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1516                    << ":  op_dir_remove " << dirname << dendl;
1517           if (unlikely(to_stdout)) {
1518             std::cout << " 0x" << std::hex << pos << std::dec
1519                       << ":  op_dir_remove " << dirname << std::endl;
1520           }
1521
1522           if (!noop) {
1523             map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1524             ceph_assert(q != nodes.dir_map.end());
1525             ceph_assert(q->second->file_map.empty());
1526             nodes.dir_map.erase(q);
1527           }
1528         }
1529         break;
1530
1531       case bluefs_transaction_t::OP_FILE_UPDATE:
1532         {
1533           bluefs_fnode_t fnode;
1534           decode(fnode, p);
1535           dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1536                    << ":  op_file_update " << " " << fnode << " " << dendl;
1537           if (unlikely(to_stdout)) {
1538             std::cout << " 0x" << std::hex << pos << std::dec
1539                       << ":  op_file_update " << " " << fnode << std::endl;
1540           }
1541           if (!noop) {
1542             FileRef f = _get_file(fnode.ino);
1543             if (cct->_conf->bluefs_log_replay_check_allocations) {
1544               int r = _check_allocations(f->fnode,
1545                 used_blocks, false, "OP_FILE_UPDATE");
1546               if (r < 0) {
1547                 return r;
1548               }
1549             }
1550             if (fnode.ino != 1) {
1551               vselector->sub_usage(f->vselector_hint, f->fnode);
1552             }
1553             f->fnode = fnode;
1554             if (fnode.ino != 1) {
1555               vselector->add_usage(f->vselector_hint, f->fnode);
1556             }
1557
1558             if (fnode.ino > ino_last) {
1559               ino_last = fnode.ino;
1560             }
1561             if (cct->_conf->bluefs_log_replay_check_allocations) {
1562               int r = _check_allocations(f->fnode,
1563                 used_blocks, true, "OP_FILE_UPDATE");
1564               if (r < 0) {
1565                 return r;
1566               }
1567             }
1568           } else if (noop && fnode.ino == 1) {
1569             FileRef f = _get_file(fnode.ino);
1570             f->fnode = fnode;
1571           }
1572         }
1573         break;
1574       case bluefs_transaction_t::OP_FILE_UPDATE_INC:
1575         {
1576           bluefs_fnode_delta_t delta;
1577           decode(delta, p);
1578           dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1579             << ":  op_file_update_inc " << " " << delta << " " << dendl;
1580           if (unlikely(to_stdout)) {
1581             std::cout << " 0x" << std::hex << pos << std::dec
1582               << ":  op_file_update_inc " << " " << delta << std::endl;
1583           }
1584           if (!noop) {
1585             FileRef f = _get_file(delta.ino);
1586             bluefs_fnode_t& fnode = f->fnode;
1587             if (delta.offset != fnode.allocated) {
1588               derr << __func__ << " invalid op_file_update_inc, new extents miss end of file"
1589                    << " fnode=" << fnode
1590                    << " delta=" << delta
1591                    << dendl;
1592               ceph_assert(delta.offset == fnode.allocated);
1593             }
1594             if (cct->_conf->bluefs_log_replay_check_allocations) {
1595               int r = _check_allocations(fnode,
1596                 used_blocks, false, "OP_FILE_UPDATE_INC");
1597               if (r < 0) {
1598                 return r;
1599               }
1600             }
1601
1602             fnode.ino = delta.ino;
1603             fnode.mtime = delta.mtime;
1604             if (fnode.ino != 1) {
1605               vselector->sub_usage(f->vselector_hint, fnode);
1606             }
1607             fnode.size = delta.size;
1608             fnode.claim_extents(delta.extents);
1609             dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1610                      << ":  op_file_update_inc produced " << " " << fnode << " " << dendl;
1611
1612             if (fnode.ino != 1) {
1613               vselector->add_usage(f->vselector_hint, fnode);
1614             }
1615
1616             if (fnode.ino > ino_last) {
1617               ino_last = fnode.ino;
1618             }
1619             if (cct->_conf->bluefs_log_replay_check_allocations) {
1620               int r = _check_allocations(f->fnode,
1621                 used_blocks, true, "OP_FILE_UPDATE_INC");
1622               if (r < 0) {
1623                 return r;
1624               }
1625             }
1626           } else if (noop && delta.ino == 1) {
1627             // we need to track bluefs log, even in noop mode
1628             FileRef f = _get_file(1);
1629             bluefs_fnode_t& fnode = f->fnode;
1630             fnode.ino = delta.ino;
1631             fnode.mtime = delta.mtime;
1632             fnode.size = delta.size;
1633             fnode.claim_extents(delta.extents);
1634           }
1635         }
1636       break;
1637
1638       case bluefs_transaction_t::OP_FILE_REMOVE:
1639         {
1640           uint64_t ino;
1641           decode(ino, p);
1642           dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1643                    << ":  op_file_remove " << ino << dendl;
1644           if (unlikely(to_stdout)) {
1645             std::cout << " 0x" << std::hex << pos << std::dec
1646                       << ":  op_file_remove " << ino << std::endl;
1647           }
1648
1649           if (!noop) {
1650             auto p = nodes.file_map.find(ino);
1651             ceph_assert(p != nodes.file_map.end());
1652             vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
1653             if (cct->_conf->bluefs_log_replay_check_allocations) {
1654               int r = _check_allocations(p->second->fnode,
1655                 used_blocks, false, "OP_FILE_REMOVE");
1656               if (r < 0) {
1657                 return r;
1658               }
1659             }
1660             nodes.file_map.erase(p);
1661           }
1662         }
1663         break;
1664
1665       default:
1666         derr << __func__ << " 0x" << std::hex << pos << std::dec
1667              << ": stop: unrecognized op " << (int)op << dendl;
1668         delete log_reader;
1669         return -EIO;
1670       }
1671     }
1672     ceph_assert(p.end());
1673
1674     // we successfully replayed the transaction; bump the seq and log size
1675     ++log_seq;
1676     log_file->fnode.size = log_reader->buf.pos;
1677   }
1678   if (!noop) {
1679     vselector->add_usage(log_file->vselector_hint, log_file->fnode);
1680     log.seq_live = log_seq + 1;
1681     dirty.seq_live = log_seq + 1;
1682     log.t.seq = log.seq_live;
1683     dirty.seq_stable = log_seq;
1684   }
1685
1686   dout(10) << __func__ << " log file size was 0x"
1687            << std::hex << log_file->fnode.size << std::dec << dendl;
1688   if (unlikely(to_stdout)) {
1689     std::cout << " log file size was 0x"
1690               << std::hex << log_file->fnode.size << std::dec << std::endl;
1691   }
1692
1693   delete log_reader;
1694
1695   if (!noop) {
1696     // verify file link counts are all >0
1697     for (auto& p : nodes.file_map) {
1698       if (p.second->refs == 0 &&
1699           p.second->fnode.ino > 1) {
1700         derr << __func__ << " file with link count 0: " << p.second->fnode
1701              << dendl;
1702         return -EIO;
1703       }
1704     }
1705   }
1706   // reflect file count in logger
1707   logger->set(l_bluefs_num_files, nodes.file_map.size());
1708
1709   dout(10) << __func__ << " done" << dendl;
1710   return 0;
1711 }
1712
1713 int BlueFS::log_dump()
1714 {
1715   // only dump log file's content
1716   ceph_assert(log.writer == nullptr && "cannot log_dump on mounted BlueFS");
1717   _init_logger();
1718   int r = _open_super();
1719   if (r < 0) {
1720     derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
1721     return r;
1722   }
1723   r = _replay(true, true);
1724   if (r < 0) {
1725     derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
1726   }
1727   _shutdown_logger();
1728   super = bluefs_super_t();
1729   return r;
1730 }
1731
1732 int BlueFS::device_migrate_to_existing(
1733   CephContext *cct,
1734   const set<int>& devs_source,
1735   int dev_target,
1736   const bluefs_layout_t& layout)
1737 {
1738   vector<byte> buf;
1739   bool buffered = cct->_conf->bluefs_buffered_io;
1740
1741   dout(10) << __func__ << " devs_source " << devs_source
1742            << " dev_target " << dev_target << dendl;
1743   assert(dev_target < (int)MAX_BDEV);
1744
1745   int flags = 0;
1746   flags |= devs_source.count(BDEV_DB) ?
1747     (REMOVE_DB | RENAME_SLOW2DB) : 0;
1748   flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1749   int dev_target_new = dev_target;
1750
1751   // Slow device without separate DB one is addressed via BDEV_DB
1752   // Hence need renaming.
1753   if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
1754     dev_target_new = BDEV_DB;
1755     dout(0) << __func__ << " super to be written to " << dev_target << dendl;
1756   }
1757
1758   for (auto& [ino, file_ref] : nodes.file_map) {
1759     //do not copy log
1760     if (ino == 1) {
1761       continue;
1762     }
1763     dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
1764
1765     vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode);
1766
1767     bool rewrite = std::any_of(
1768       file_ref->fnode.extents.begin(),
1769       file_ref->fnode.extents.end(),
1770       [=](auto& ext) {
1771         return ext.bdev != dev_target && devs_source.count(ext.bdev);
1772       });
1773     if (rewrite) {
1774       dout(10) << __func__ << "  migrating" << dendl;
1775       bluefs_fnode_t old_fnode;
1776       old_fnode.swap_extents(file_ref->fnode);
1777       auto& old_fnode_extents = old_fnode.extents;
1778       // read entire file
1779       bufferlist bl;
1780       for (const auto &old_ext : old_fnode_extents) {
1781         buf.resize(old_ext.length);
1782         int r = _bdev_read_random(old_ext.bdev,
1783           old_ext.offset,
1784           old_ext.length,
1785           (char*)&buf.at(0),
1786           buffered);
1787         if (r != 0) {
1788           derr << __func__ << " failed to read 0x" << std::hex
1789                << old_ext.offset << "~" << old_ext.length << std::dec
1790                << " from " << (int)dev_target << dendl;
1791           return -EIO;
1792         }
1793         bl.append((char*)&buf[0], old_ext.length);
1794       }
1795
1796       // write entire file
1797       auto l = _allocate(dev_target, bl.length(), 0,
1798         &file_ref->fnode, 0, false);
1799       if (l < 0) {
1800         derr << __func__ << " unable to allocate len 0x" << std::hex
1801              << bl.length() << std::dec << " from " << (int)dev_target
1802              << ": " << cpp_strerror(l) << dendl;
1803         return -ENOSPC;
1804       }
1805
1806       uint64_t off = 0;
1807       for (auto& i : file_ref->fnode.extents) {
1808         bufferlist cur;
1809         uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1810         ceph_assert(cur_len > 0);
1811         cur.substr_of(bl, off, cur_len);
1812         int r = bdev[dev_target]->write(i.offset, cur, buffered);
1813         ceph_assert(r == 0);
1814         off += cur_len;
1815       }
1816
1817       // release old extents
1818       for (const auto &old_ext : old_fnode_extents) {
1819         PExtentVector to_release;
1820         to_release.emplace_back(old_ext.offset, old_ext.length);
1821         alloc[old_ext.bdev]->release(to_release);
1822         if (is_shared_alloc(old_ext.bdev)) {
1823           shared_alloc->bluefs_used -= to_release.size();
1824         }
1825       }
1826
1827       // update fnode
1828       for (auto& i : file_ref->fnode.extents) {
1829         i.bdev = dev_target_new;
1830       }
1831     } else {
1832       for (auto& ext : file_ref->fnode.extents) {
1833         if (dev_target != dev_target_new && ext.bdev == dev_target) {
1834           dout(20) << __func__ << "  " << " ... adjusting extent 0x"
1835                    << std::hex << ext.offset << std::dec
1836                    << " bdev " << dev_target << " -> " << dev_target_new
1837                    << dendl;
1838           ext.bdev = dev_target_new;
1839         }
1840       }
1841     }
1842     vselector->add_usage(file_ref->vselector_hint, file_ref->fnode);
1843   }
1844   // new logging device in the current naming scheme
1845   int new_log_dev_cur = bdev[BDEV_WAL] ?
1846     BDEV_WAL :
1847     bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
1848
1849   // new logging device in new naming scheme
1850   int new_log_dev_next = new_log_dev_cur;
1851
1852   if (devs_source.count(new_log_dev_cur)) {
1853     // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1854     new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
1855       BDEV_DB :
1856       BDEV_WAL;
1857
1858     dout(0) << __func__ << " log moved from " << new_log_dev_cur
1859       << " to " << new_log_dev_next << dendl;
1860
1861     new_log_dev_cur =
1862       (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
1863         BDEV_SLOW :
1864         new_log_dev_next;
1865   }
1866
1867   _rewrite_log_and_layout_sync_LNF_LD(
1868     false,
1869     (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
1870     new_log_dev_cur,
1871     new_log_dev_next,
1872     flags,
1873     layout);
1874   return 0;
1875 }
1876
1877 int BlueFS::device_migrate_to_new(
1878   CephContext *cct,
1879   const set<int>& devs_source,
1880   int dev_target,
1881   const bluefs_layout_t& layout)
1882 {
1883   vector<byte> buf;
1884   bool buffered = cct->_conf->bluefs_buffered_io;
1885
1886   dout(10) << __func__ << " devs_source " << devs_source
1887            << " dev_target " << dev_target << dendl;
1888   assert(dev_target == (int)BDEV_NEWDB || dev_target == (int)BDEV_NEWWAL);
1889
1890   int flags = 0;
1891
1892   flags |= devs_source.count(BDEV_DB) ?
1893     (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
1894     0;
1895   flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1896   int dev_target_new = dev_target; //FIXME: remove, makes no sense
1897
1898   for (auto& [ino, file_ref] : nodes.file_map) {
1899     //do not copy log
1900     if (ino == 1) {
1901       continue;
1902     }
1903     dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
1904
1905     vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode);
1906
1907     bool rewrite = std::any_of(
1908       file_ref->fnode.extents.begin(),
1909       file_ref->fnode.extents.end(),
1910       [=](auto& ext) {
1911         return ext.bdev != dev_target && devs_source.count(ext.bdev);
1912       });
1913     if (rewrite) {
1914       dout(10) << __func__ << "  migrating" << dendl;
1915       bluefs_fnode_t old_fnode;
1916       old_fnode.swap_extents(file_ref->fnode);
1917       auto& old_fnode_extents = old_fnode.extents;
1918       // read entire file
1919       bufferlist bl;
1920       for (const auto &old_ext : old_fnode_extents) {
1921         buf.resize(old_ext.length);
1922         int r = _bdev_read_random(old_ext.bdev,
1923           old_ext.offset,
1924           old_ext.length,
1925           (char*)&buf.at(0),
1926           buffered);
1927         if (r != 0) {
1928           derr << __func__ << " failed to read 0x" << std::hex
1929                << old_ext.offset << "~" << old_ext.length << std::dec
1930                << " from " << (int)dev_target << dendl;
1931           return -EIO;
1932         }
1933         bl.append((char*)&buf[0], old_ext.length);
1934       }
1935
1936       // write entire file
1937       auto l = _allocate(dev_target, bl.length(), 0,
1938         &file_ref->fnode, 0, false);
1939       if (l < 0) {
1940         derr << __func__ << " unable to allocate len 0x" << std::hex
1941              << bl.length() << std::dec << " from " << (int)dev_target
1942              << ": " << cpp_strerror(l) << dendl;
1943         return -ENOSPC;
1944       }
1945
1946       uint64_t off = 0;
1947       for (auto& i : file_ref->fnode.extents) {
1948         bufferlist cur;
1949         uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1950         ceph_assert(cur_len > 0);
1951         cur.substr_of(bl, off, cur_len);
1952         int r = bdev[dev_target]->write(i.offset, cur, buffered);
1953         ceph_assert(r == 0);
1954         off += cur_len;
1955       }
1956
1957       // release old extents
1958       for (const auto &old_ext : old_fnode_extents) {
1959         PExtentVector to_release;
1960         to_release.emplace_back(old_ext.offset, old_ext.length);
1961         alloc[old_ext.bdev]->release(to_release);
1962         if (is_shared_alloc(old_ext.bdev)) {
1963           shared_alloc->bluefs_used -= to_release.size();
1964         }
1965       }
1966
1967       // update fnode
1968       for (auto& i : file_ref->fnode.extents) {
1969         i.bdev = dev_target_new;
1970       }
1971     }
1972   }
1973   // new logging device in the current naming scheme
1974   int new_log_dev_cur =
1975     bdev[BDEV_NEWWAL] ?
1976       BDEV_NEWWAL :
1977       bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
1978         BDEV_WAL :
1979         bdev[BDEV_NEWDB] ?
1980           BDEV_NEWDB :
1981           bdev[BDEV_DB] && !(flags & REMOVE_DB)?
1982             BDEV_DB :
1983             BDEV_SLOW;
1984
1985   // new logging device in new naming scheme
1986   int new_log_dev_next =
1987     new_log_dev_cur == BDEV_NEWWAL ?
1988       BDEV_WAL :
1989       new_log_dev_cur == BDEV_NEWDB ?
1990         BDEV_DB :
1991         new_log_dev_cur;
1992
1993   int super_dev =
1994     dev_target == BDEV_NEWDB ?
1995       BDEV_NEWDB :
1996       bdev[BDEV_DB] ?
1997         BDEV_DB :
1998         BDEV_SLOW;
1999
2000   _rewrite_log_and_layout_sync_LNF_LD(
2001     false,
2002     super_dev,
2003     new_log_dev_cur,
2004     new_log_dev_next,
2005     flags,
2006     layout);
2007   return 0;
2008 }
2009
2010 BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
2011 {
2012   auto p = nodes.file_map.find(ino);
2013   if (p == nodes.file_map.end()) {
2014     FileRef f = ceph::make_ref<File>();
2015     nodes.file_map[ino] = f;
2016     // track files count in logger
2017     logger->set(l_bluefs_num_files, nodes.file_map.size());
2018     dout(30) << __func__ << " ino " << ino << " = " << f
2019              << " (new)" << dendl;
2020     return f;
2021   } else {
2022     dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
2023     return p->second;
2024   }
2025 }
2026
2027
2028 /**
2029 To modify fnode both FileWriter::lock and File::lock must be obtained.
2030 The special case is when we modify bluefs log (ino 1) or
2031 we are compacting log (ino 0).
2032
2033 In any case it is enough to hold File::lock to be sure fnode will not be modified.
2034 */
2035 struct lock_fnode_print {
2036   BlueFS::FileRef file;
2037   lock_fnode_print(BlueFS::FileRef file) : file(file) {};
2038 };
2039 std::ostream& operator<<(std::ostream& out, const lock_fnode_print& to_lock) {
2040   std::lock_guard l(to_lock.file->lock);
2041   out << to_lock.file->fnode;
2042   return out;
2043 }
2044
2045 void BlueFS::_drop_link_D(FileRef file)
2046 {
2047   dout(20) << __func__ << " had refs " << file->refs
2048            << " on " << lock_fnode_print(file) << dendl;
2049   ceph_assert(file->refs > 0);
2050   ceph_assert(ceph_mutex_is_locked(log.lock));
2051   ceph_assert(ceph_mutex_is_locked(nodes.lock));
2052
2053   --file->refs;
2054   if (file->refs == 0) {
2055     dout(20) << __func__ << " destroying " << file->fnode << dendl;
2056     ceph_assert(file->num_reading.load() == 0);
2057     vselector->sub_usage(file->vselector_hint, file->fnode);
2058     log.t.op_file_remove(file->fnode.ino);
2059     nodes.file_map.erase(file->fnode.ino);
2060     logger->set(l_bluefs_num_files, nodes.file_map.size());
2061     file->deleted = true;
2062
2063     std::lock_guard dl(dirty.lock);
2064     for (auto& r : file->fnode.extents) {
2065       dirty.pending_release[r.bdev].insert(r.offset, r.length);
2066     }
2067     if (file->dirty_seq > dirty.seq_stable) {
2068       // retract request to serialize changes
2069       ceph_assert(dirty.files.count(file->dirty_seq));
2070       auto it = dirty.files[file->dirty_seq].iterator_to(*file);
2071       dirty.files[file->dirty_seq].erase(it);
2072       file->dirty_seq = dirty.seq_stable;
2073     }
2074   }
2075 }
2076
2077 int64_t BlueFS::_read_random(
2078   FileReader *h,         ///< [in] read from here
2079   uint64_t off,          ///< [in] offset
2080   uint64_t len,          ///< [in] this many bytes
2081   char *out)             ///< [out] copy it here
2082 {
2083   auto* buf = &h->buf;
2084
2085   int64_t ret = 0;
2086   dout(10) << __func__ << " h " << h
2087            << " 0x" << std::hex << off << "~" << len << std::dec
2088            << " from " << lock_fnode_print(h->file) << dendl;
2089
2090   ++h->file->num_reading;
2091
2092   if (!h->ignore_eof &&
2093       off + len > h->file->fnode.size) {
2094     if (off > h->file->fnode.size)
2095       len = 0;
2096     else
2097       len = h->file->fnode.size - off;
2098     dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
2099              << std::hex << len << std::dec << dendl;
2100   }
2101   logger->inc(l_bluefs_read_random_count, 1);
2102   logger->inc(l_bluefs_read_random_bytes, len);
2103
2104   std::shared_lock s_lock(h->lock);
2105   buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
2106   while (len > 0) {
2107     if (off < buf->bl_off || off >= buf->get_buf_end()) {
2108       s_lock.unlock();
2109       uint64_t x_off = 0;
2110       auto p = h->file->fnode.seek(off, &x_off);
2111       ceph_assert(p != h->file->fnode.extents.end());
2112       uint64_t l = std::min(p->length - x_off, len);
2113       //hard cap to 1GB
2114       l = std::min(l, uint64_t(1) << 30);
2115       dout(20) << __func__ << " read random 0x"
2116                << std::hex << x_off << "~" << l << std::dec
2117                << " of " << *p << dendl;
2118       int r;
2119       if (!cct->_conf->bluefs_check_for_zeros) {
2120         r = _bdev_read_random(p->bdev, p->offset + x_off, l, out,
2121                               cct->_conf->bluefs_buffered_io);
2122       } else {
2123         r = _read_random_and_check(p->bdev, p->offset + x_off, l, out,
2124                         cct->_conf->bluefs_buffered_io);
2125       }
2126       ceph_assert(r == 0);
2127       off += l;
2128       len -= l;
2129       ret += l;
2130       out += l;
2131
2132       logger->inc(l_bluefs_read_random_disk_count, 1);
2133       logger->inc(l_bluefs_read_random_disk_bytes, l);
2134       if (len > 0) {
2135         s_lock.lock();
2136       }
2137     } else {
2138       auto left = buf->get_buf_remaining(off);
2139       int64_t r = std::min(len, left);
2140       logger->inc(l_bluefs_read_random_buffer_count, 1);
2141       logger->inc(l_bluefs_read_random_buffer_bytes, r);
2142       dout(20) << __func__ << " left 0x" << std::hex << left
2143               << " 0x" << off << "~" << len << std::dec
2144               << dendl;
2145
2146       auto p = buf->bl.begin();
2147       p.seek(off - buf->bl_off);
2148       p.copy(r, out);
2149       out += r;
2150
2151       dout(30) << __func__ << " result chunk (0x"
2152                << std::hex << r << std::dec << " bytes):\n";
2153       bufferlist t;
2154       t.substr_of(buf->bl, off - buf->bl_off, r);
2155       t.hexdump(*_dout);
2156       *_dout << dendl;
2157
2158       off += r;
2159       len -= r;
2160       ret += r;
2161       buf->pos += r;
2162     }
2163   }
2164   dout(20) << __func__ << std::hex
2165            << " got 0x" << ret
2166            << std::dec  << dendl;
2167   --h->file->num_reading;
2168   return ret;
2169 }
2170
2171 int64_t BlueFS::_read(
2172   FileReader *h,         ///< [in] read from here
2173   uint64_t off,          ///< [in] offset
2174   size_t len,            ///< [in] this many bytes
2175   bufferlist *outbl,     ///< [out] optional: reference the result here
2176   char *out)             ///< [out] optional: or copy it here
2177 {
2178   FileReaderBuffer *buf = &(h->buf);
2179
2180   bool prefetch = !outbl && !out;
2181   dout(10) << __func__ << " h " << h
2182            << " 0x" << std::hex << off << "~" << len << std::dec
2183            << " from " << lock_fnode_print(h->file)
2184            << (prefetch ? " prefetch" : "")
2185            << dendl;
2186
2187   ++h->file->num_reading;
2188
2189   if (!h->ignore_eof &&
2190       off + len > h->file->fnode.size) {
2191     if (off > h->file->fnode.size)
2192       len = 0;
2193     else
2194       len = h->file->fnode.size - off;
2195     dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
2196              << std::hex << len << std::dec << dendl;
2197   }
2198   logger->inc(l_bluefs_read_count, 1);
2199   logger->inc(l_bluefs_read_bytes, len);
2200   if (prefetch) {
2201     logger->inc(l_bluefs_read_prefetch_count, 1);
2202     logger->inc(l_bluefs_read_prefetch_bytes, len);
2203   }
2204
2205   if (outbl)
2206     outbl->clear();
2207
2208   int64_t ret = 0;
2209   std::shared_lock s_lock(h->lock);
2210   while (len > 0) {
2211     size_t left;
2212     if (off < buf->bl_off || off >= buf->get_buf_end()) {
2213       s_lock.unlock();
2214       std::unique_lock u_lock(h->lock);
2215       buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
2216       if (off < buf->bl_off || off >= buf->get_buf_end()) {
2217         // if precondition hasn't changed during locking upgrade.
2218         buf->bl.clear();
2219         buf->bl_off = off & super.block_mask();
2220         uint64_t x_off = 0;
2221         auto p = h->file->fnode.seek(buf->bl_off, &x_off);
2222         if (p == h->file->fnode.extents.end()) {
2223           dout(5) << __func__ << " reading less then required "
2224                   << ret << "<" << ret + len << dendl;
2225           break;
2226         }
2227
2228         uint64_t want = round_up_to(len + (off & ~super.block_mask()),
2229                                     super.block_size);
2230         want = std::max(want, buf->max_prefetch);
2231         uint64_t l = std::min(p->length - x_off, want);
2232         //hard cap to 1GB
2233         l = std::min(l, uint64_t(1) << 30);
2234         uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
2235         if (!h->ignore_eof &&
2236             buf->bl_off + l > eof_offset) {
2237           l = eof_offset - buf->bl_off;
2238         }
2239         dout(20) << __func__ << " fetching 0x"
2240                  << std::hex << x_off << "~" << l << std::dec
2241                  << " of " << *p << dendl;
2242         int r;
2243         // when reading BlueFS log (only happens on startup) use non-buffered io
2244         // it makes it in sync with logic in _flush_range()
2245         bool use_buffered_io = h->file->fnode.ino == 1 ? false : cct->_conf->bluefs_buffered_io;
2246         if (!cct->_conf->bluefs_check_for_zeros) {
2247           r = _bdev_read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
2248                          use_buffered_io);
2249         } else {
2250           r = _read_and_check(
2251             p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
2252             use_buffered_io);
2253         }
2254         logger->inc(l_bluefs_read_disk_count, 1);
2255         logger->inc(l_bluefs_read_disk_bytes, l);
2256
2257         ceph_assert(r == 0);
2258       }
2259       u_lock.unlock();
2260       s_lock.lock();
2261       // we should recheck if buffer is valid after lock downgrade
2262       continue;
2263     }
2264     left = buf->get_buf_remaining(off);
2265     dout(20) << __func__ << " left 0x" << std::hex << left
2266              << " len 0x" << len << std::dec << dendl;
2267
2268     int64_t r = std::min(len, left);
2269     if (outbl) {
2270       bufferlist t;
2271       t.substr_of(buf->bl, off - buf->bl_off, r);
2272       outbl->claim_append(t);
2273     }
2274     if (out) {
2275       auto p = buf->bl.begin();
2276       p.seek(off - buf->bl_off);
2277       p.copy(r, out);
2278       out += r;
2279     }
2280
2281     dout(30) << __func__ << " result chunk (0x"
2282              << std::hex << r << std::dec << " bytes):\n";
2283     bufferlist t;
2284     t.substr_of(buf->bl, off - buf->bl_off, r);
2285     t.hexdump(*_dout);
2286     *_dout << dendl;
2287
2288     off += r;
2289     len -= r;
2290     ret += r;
2291     buf->pos += r;
2292   }
2293
2294   dout(20) << __func__ << std::hex
2295            << " got 0x" << ret
2296            << std::dec  << dendl;
2297   ceph_assert(!outbl || (int)outbl->length() == ret);
2298   --h->file->num_reading;
2299   return ret;
2300 }
2301
2302 void BlueFS::invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
2303 {
2304   std::lock_guard l(f->lock);
2305   dout(10) << __func__ << " file " << f->fnode
2306            << " 0x" << std::hex << offset << "~" << length << std::dec
2307            << dendl;
2308   if (offset & ~super.block_mask()) {
2309     offset &= super.block_mask();
2310     length = round_up_to(length, super.block_size);
2311   }
2312   uint64_t x_off = 0;
2313   auto p = f->fnode.seek(offset, &x_off);
2314   while (length > 0 && p != f->fnode.extents.end()) {
2315     uint64_t x_len = std::min(p->length - x_off, length);
2316     bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
2317     dout(20) << __func__  << " 0x" << std::hex << x_off << "~" << x_len
2318              << std:: dec << " of " << *p << dendl;
2319     offset += x_len;
2320     length -= x_len;
2321   }
2322 }
2323
2324
2325 uint64_t BlueFS::_estimate_transaction_size(bluefs_transaction_t* t)
2326 {
2327   uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
2328                                      std::max(alloc_size[BDEV_DB],
2329                                               alloc_size[BDEV_SLOW]));
2330
2331   // conservative estimate for final encoded size
2332   return round_up_to(t->op_bl.length() + super.block_size * 2, max_alloc_size);
2333 }
2334
2335 uint64_t BlueFS::_make_initial_transaction(uint64_t start_seq,
2336                                            bluefs_fnode_t& fnode,
2337                                            uint64_t expected_final_size,
2338                                            bufferlist* out)
2339 {
2340   bluefs_transaction_t t0;
2341   t0.seq = start_seq;
2342   t0.uuid = super.uuid;
2343   t0.op_init();
2344   t0.op_file_update_inc(fnode);
2345   t0.op_jump(start_seq, expected_final_size); // this is a fixed size op,
2346                                               // hence it's valid with fake
2347                                               // params for overall txc size
2348                                               // estimation
2349   if (!out) {
2350     return _estimate_transaction_size(&t0);
2351   }
2352
2353   ceph_assert(expected_final_size > 0);
2354   out->reserve(expected_final_size);
2355   encode(t0, *out);
2356   // make sure we're not wrong aboth the size
2357   ceph_assert(out->length() <= expected_final_size);
2358   _pad_bl(*out, expected_final_size);
2359   return expected_final_size;
2360 }
2361
2362 uint64_t BlueFS::_estimate_log_size_N()
2363 {
2364   std::lock_guard nl(nodes.lock);
2365   int avg_dir_size = 40;  // fixme
2366   int avg_file_size = 12;
2367   uint64_t size = 4096 * 2;
2368   size += nodes.file_map.size() * (1 + sizeof(bluefs_fnode_t));
2369   size += nodes.dir_map.size() + (1 + avg_dir_size);
2370   size += nodes.file_map.size() * (1 + avg_dir_size + avg_file_size);
2371   return round_up_to(size, super.block_size);
2372 }
2373
2374 void BlueFS::compact_log()/*_LNF_LD_NF_D*/
2375 {
2376   if (!cct->_conf->bluefs_replay_recovery_disable_compact) {
2377     if (cct->_conf->bluefs_compact_log_sync) {
2378       _compact_log_sync_LNF_LD();
2379     } else {
2380       _compact_log_async_LD_LNF_D();
2381     }
2382   }
2383 }
2384
2385 bool BlueFS::_should_start_compact_log_L_N()
2386 {
2387   if (log_is_compacting.load() == true) {
2388     // compaction is already running
2389     return false;
2390   }
2391   uint64_t current;
2392   {
2393     std::lock_guard ll(log.lock);
2394     current = log.writer->file->fnode.size;
2395   }
2396   uint64_t expected = _estimate_log_size_N();
2397   float ratio = (float)current / (float)expected;
2398   dout(10) << __func__ << " current 0x" << std::hex << current
2399            << " expected " << expected << std::dec
2400            << " ratio " << ratio
2401            << dendl;
2402   if (current < cct->_conf->bluefs_log_compact_min_size ||
2403       ratio < cct->_conf->bluefs_log_compact_min_ratio) {
2404     return false;
2405   }
2406   return true;
2407 }
2408
2409 void BlueFS::_compact_log_dump_metadata_NF(uint64_t start_seq,
2410                                         bluefs_transaction_t *t,
2411                                         int bdev_update_flags,
2412                                         uint64_t capture_before_seq)
2413 {
2414   dout(20) << __func__ << dendl;
2415   t->seq = start_seq;
2416   t->uuid = super.uuid;
2417
2418   std::lock_guard nl(nodes.lock);
2419
2420   for (auto& [ino, file_ref] : nodes.file_map) {
2421     if (ino == 1)
2422       continue;
2423     ceph_assert(ino > 1);
2424     std::lock_guard fl(file_ref->lock);
2425     if (bdev_update_flags) {
2426       for(auto& e : file_ref->fnode.extents) {
2427         auto bdev = e.bdev;
2428         auto bdev_new = bdev;
2429         ceph_assert(!((bdev_update_flags & REMOVE_WAL) && bdev == BDEV_WAL));
2430         if ((bdev_update_flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
2431           bdev_new = BDEV_DB;
2432         }
2433         if ((bdev_update_flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
2434           bdev_new = BDEV_SLOW;
2435         }
2436         if (bdev == BDEV_NEWDB) {
2437           // REMOVE_DB xor RENAME_DB
2438           ceph_assert(!(bdev_update_flags & REMOVE_DB) != !(bdev_update_flags & RENAME_DB2SLOW));
2439           ceph_assert(!(bdev_update_flags & RENAME_SLOW2DB));
2440           bdev_new = BDEV_DB;
2441         }
2442         if (bdev == BDEV_NEWWAL) {
2443           ceph_assert(bdev_update_flags & REMOVE_WAL);
2444           bdev_new = BDEV_WAL;
2445         }
2446         e.bdev = bdev_new;
2447       }
2448     }
2449     if (capture_before_seq == 0 || file_ref->dirty_seq < capture_before_seq) {
2450       dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
2451     } else {
2452       dout(20) << __func__ << " op_file_update just modified, dirty_seq="
2453                << file_ref->dirty_seq << " " << file_ref->fnode << dendl;
2454     }
2455     t->op_file_update(file_ref->fnode);
2456   }
2457   for (auto& [path, dir_ref] : nodes.dir_map) {
2458     dout(20) << __func__ << " op_dir_create " << path << dendl;
2459     t->op_dir_create(path);
2460     for (auto& [fname, file_ref] : dir_ref->file_map) {
2461       dout(20) << __func__ << " op_dir_link " << path << "/" << fname
2462                << " to " << file_ref->fnode.ino << dendl;
2463       t->op_dir_link(path, fname, file_ref->fnode.ino);
2464     }
2465   }
2466 }
2467
2468 void BlueFS::_compact_log_sync_LNF_LD()
2469 {
2470   dout(10) << __func__ << dendl;
2471   uint8_t prefer_bdev;
2472   {
2473     std::lock_guard ll(log.lock);
2474     prefer_bdev =
2475       vselector->select_prefer_bdev(log.writer->file->vselector_hint);
2476   }
2477   _rewrite_log_and_layout_sync_LNF_LD(true,
2478     BDEV_DB,
2479     prefer_bdev,
2480     prefer_bdev,
2481     0,
2482     super.memorized_layout);
2483   logger->inc(l_bluefs_log_compactions);
2484 }
2485
2486 /*
2487  * SYNC LOG COMPACTION
2488  *
2489  * 0. Lock the log completely through the whole procedure
2490  *
2491  * 1. Build new log. It will include log's starter and compacted metadata
2492  *    body. Jump op appended to the starter will link the pieces together.
2493  *
2494  * 2. Write out new log's content
2495  *
2496  * 3. Write out new superblock. This includes relevant device layout update.
2497  *
2498  * 4. Finalization. Old space release.
2499  */
2500
2501 void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback,
2502                                          int super_dev,
2503                                          int log_dev,
2504                                          int log_dev_new,
2505                                          int flags,
2506                                          std::optional<bluefs_layout_t> layout)
2507 {
2508   // we substitute log_dev with log_dev_new for new allocations below
2509   // and permitting fallback allocations prevents such a substitution
2510   ceph_assert((permit_dev_fallback && log_dev == log_dev_new) ||
2511               !permit_dev_fallback);
2512
2513   dout(10) << __func__ << " super_dev:" << super_dev
2514                        << " log_dev:" << log_dev
2515                        << " log_dev_new:" << log_dev_new
2516                        << " flags:" << flags
2517                        << " seq:" << log.seq_live
2518                        << dendl;
2519   utime_t mtime = ceph_clock_now();
2520   uint64_t starter_seq = 1;
2521
2522   // Part 0.
2523   // Lock the log totally till the end of the procedure
2524   std::lock_guard ll(log.lock);
2525   auto t0 = mono_clock::now();
2526
2527   File *log_file = log.writer->file.get();
2528   bluefs_fnode_t fnode_tail;
2529   // log.t.seq is always set to current live seq
2530   ceph_assert(log.t.seq == log.seq_live);
2531   // Capturing entire state. Dump anything that has been stored there.
2532   log.t.clear();
2533   log.t.seq = log.seq_live;
2534   // From now on, no changes to log.t are permitted until we finish rewriting log.
2535   // Can allow dirty to remain dirty - log.seq_live will not change.
2536
2537   //
2538   // Part 1.
2539   // Build new log starter and compacted metadata body
2540   // 1.1. Build full compacted meta transaction.
2541   //      Encode a bluefs transaction that dumps all of the in-memory fnodes
2542   //      and names.
2543   //      This might be pretty large and its allocation map can exceed
2544   //      superblock size. Hence instead we'll need log starter part which
2545   //      goes to superblock and refers that new meta through op_update_inc.
2546   // 1.2.  Allocate space for the above transaction
2547   //       using its size estimation.
2548   // 1.3.  Allocate the space required for the starter part of the new log.
2549   //       It should be small enough to fit into superblock.
2550   // 1.4   Building new log persistent fnode representation which will
2551   //       finally land to disk.
2552   //       Depending on input parameters we might need to perform device ids
2553   //       rename - runtime and persistent replicas should be different when we
2554   //       are in the device migration process.
2555   // 1.5   Store starter fnode to run-time superblock, to be written out later.
2556   //       It doesn't contain compacted meta to fit relevant alocation map into
2557   //       superblock.
2558   // 1.6   Proceed building new log persistent fnode representation.
2559   //       Will add log tail with compacted meta extents from 1.1.
2560   //       Device rename applied as well
2561   //
2562   // 1.7.  Encode new log fnode starter,
2563   //       It will include op_init, new log's op_update_inc
2564   //       and jump to the compacted meta transaction beginning.
2565   //       Superblock will reference this starter part
2566   //
2567   // 1.8.  Encode compacted meta transaction,
2568   //       extend the transaction with a jump to proper sequence no
2569   //
2570
2571
2572   // 1.1 Build full compacted meta transaction
2573   bluefs_transaction_t compacted_meta_t;
2574   _compact_log_dump_metadata_NF(starter_seq + 1, &compacted_meta_t, flags, 0);
2575
2576   // 1.2 Allocate the space required for the compacted meta transaction
2577   uint64_t compacted_meta_need =
2578     _estimate_transaction_size(&compacted_meta_t) +
2579       cct->_conf->bluefs_max_log_runway;
2580
2581   dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need << dendl;
2582
2583   int r = _allocate(log_dev, compacted_meta_need, 0, &fnode_tail, 0,
2584     permit_dev_fallback);
2585   ceph_assert(r == 0);
2586
2587
2588   // 1.3 Allocate the space required for the starter part of the new log.
2589   // estimate new log fnode size to be referenced from superblock
2590   // hence use dummy fnode and jump parameters
2591   uint64_t starter_need = _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr);
2592
2593   bluefs_fnode_t fnode_starter(log_file->fnode.ino, 0, mtime);
2594   r = _allocate(log_dev, starter_need, 0, &fnode_starter, 0,
2595     permit_dev_fallback);
2596   ceph_assert(r == 0);
2597
2598   // 1.4 Building starter fnode
2599   bluefs_fnode_t fnode_persistent(fnode_starter.ino, 0, mtime);
2600   for (auto p : fnode_starter.extents) {
2601     // rename device if needed - this is possible when fallback allocations
2602     // are prohibited only. Which means every extent is targeted to the same
2603     // device and we can unconditionally update them.
2604     if (log_dev != log_dev_new) {
2605       dout(10) << __func__ << " renaming log extents to "
2606                << log_dev_new << dendl;
2607       p.bdev = log_dev_new;
2608     }
2609     fnode_persistent.append_extent(p);
2610   }
2611
2612   // 1.5 Store starter fnode to run-time superblock, to be written out later
2613   super.log_fnode = fnode_persistent;
2614
2615   // 1.6 Proceed building new log persistent fnode representation
2616   // we'll build incremental update starting from this point
2617   fnode_persistent.reset_delta();
2618   for (auto p : fnode_tail.extents) {
2619     // rename device if needed - this is possible when fallback allocations
2620     // are prohibited only. Which means every extent is targeted to the same
2621     // device and we can unconditionally update them.
2622     if (log_dev != log_dev_new) {
2623       dout(10) << __func__ << " renaming log extents to "
2624                << log_dev_new << dendl;
2625       p.bdev = log_dev_new;
2626     }
2627     fnode_persistent.append_extent(p);
2628   }
2629
2630   // 1.7 Encode new log fnode
2631   // This will flush incremental part of fnode_persistent only.
2632   bufferlist starter_bl;
2633   _make_initial_transaction(starter_seq, fnode_persistent, starter_need, &starter_bl);
2634
2635   // 1.8 Encode compacted meta transaction
2636   dout(20) << __func__ << " op_jump_seq " << log.seq_live << dendl;
2637   // hopefully "compact_meta_need" estimation provides enough extra space
2638   // for this op, assert below if not
2639   compacted_meta_t.op_jump_seq(log.seq_live);
2640
2641   bufferlist compacted_meta_bl;
2642   encode(compacted_meta_t, compacted_meta_bl);
2643   _pad_bl(compacted_meta_bl);
2644   ceph_assert(compacted_meta_bl.length() <= compacted_meta_need);
2645
2646   //
2647   // Part 2
2648   // Write out new log's content
2649   // 2.1. Build the full runtime new log's fnode
2650   //
2651   // 2.2. Write out new log's
2652   //
2653   // 2.3. Do flush and wait for completion through flush_bdev()
2654   //
2655   // 2.4. Finalize log update
2656   //      Update all sequence numbers
2657   //
2658
2659   // 2.1 Build the full runtime new log's fnode
2660   bluefs_fnode_t old_log_fnode;
2661   old_log_fnode.swap(fnode_starter);
2662   old_log_fnode.clone_extents(fnode_tail);
2663   old_log_fnode.reset_delta();
2664   log_file->fnode.swap(old_log_fnode);
2665
2666   // 2.2 Write out new log's content
2667   // Get rid off old writer
2668   _close_writer(log.writer);
2669   // Make new log writer and stage new log's content writing
2670   log.writer = _create_writer(log_file);
2671   log.writer->append(starter_bl);
2672   log.writer->append(compacted_meta_bl);
2673
2674   // 2.3 Do flush and wait for completion through flush_bdev()
2675   _flush_special(log.writer);
2676 #ifdef HAVE_LIBAIO
2677   if (!cct->_conf->bluefs_sync_write) {
2678     list<aio_t> completed_ios;
2679     _claim_completed_aios(log.writer, &completed_ios);
2680     _wait_for_aio(log.writer);
2681     completed_ios.clear();
2682   }
2683 #endif
2684   _flush_bdev();
2685
2686   // 2.4 Finalize log update
2687   ++log.seq_live;
2688   dirty.seq_live = log.seq_live;
2689   log.t.seq = log.seq_live;
2690   vselector->sub_usage(log_file->vselector_hint, old_log_fnode);
2691   vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2692
2693   // Part 3.
2694   // Write out new superblock to reflect all the changes.
2695   //
2696
2697   super.memorized_layout = layout;
2698   _write_super(super_dev);
2699   _flush_bdev();
2700
2701   // we're mostly done
2702   dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2703   logger->inc(l_bluefs_log_compactions);
2704
2705   // Part 4
2706   // Finalization. Release old space.
2707   //
2708   {
2709     dout(10) << __func__
2710              << " release old log extents " << old_log_fnode.extents
2711              << dendl;
2712     std::lock_guard dl(dirty.lock);
2713     for (auto& r : old_log_fnode.extents) {
2714       dirty.pending_release[r.bdev].insert(r.offset, r.length);
2715     }
2716   }
2717   logger->tinc(l_bluefs_compaction_lock_lat, mono_clock::now() - t0);
2718 }
2719
2720 /*
2721  * ASYNC LOG COMPACTION
2722  *
2723  * 0. Lock the log and forbid its extension. The former covers just
2724  *    a part of the below procedure while the latter spans over it
2725  *    completely.
2726  * 1. Allocate a new extent to continue the log, and then log an event
2727  *    that jumps the log write position to the new extent.  At this point, the
2728  *    old extent(s) won't be written to, and reflect everything to compact.
2729  *    New events will be written to the new region that we'll keep.
2730  *    The latter will finally become new log tail on compaction completion.
2731  *
2732  * 2. Build new log. It will include log's starter, compacted metadata
2733  *    body and the above tail. Jump ops appended to the starter and meta body
2734  *    will link the pieces togather. Log's lock is releases in the mid of the
2735  *    process to permit parallel access to it.
2736  *
2737  * 3. Write out new log's content.
2738  *
2739  * 4. Write out new superblock to reflect all the changes.
2740  *
2741  * 5. Apply new log fnode, log is locked for a while.
2742  *
2743  * 6. Finalization. Clean up, old space release and total unlocking.
2744  */
2745
2746 void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
2747 {
2748   dout(10) << __func__ << dendl;
2749   utime_t mtime = ceph_clock_now();
2750   uint64_t starter_seq = 1;
2751   uint64_t old_log_jump_to = 0;
2752
2753   // Part 0.
2754   // Lock the log and forbid its expansion and other compactions
2755
2756   // only one compaction allowed at one time
2757   bool old_is_comp = std::atomic_exchange(&log_is_compacting, true);
2758   if (old_is_comp) {
2759     dout(10) << __func__ << " ongoing" <<dendl;
2760     return;
2761   }
2762   // lock log's run-time structures for a while
2763   log.lock.lock();
2764   auto t0 = mono_clock::now();
2765
2766   // Part 1.
2767   // Prepare current log for jumping into it.
2768   // 1. Allocate extent
2769   // 2. Update op to log
2770   // 3. Jump op to log
2771   // During that, no one else can write to log, otherwise we risk jumping backwards.
2772   // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.
2773
2774   //signal _maybe_extend_log that expansion of log is temporary inacceptable
2775   bool old_forbidden = atomic_exchange(&log_forbidden_to_expand, true);
2776   ceph_assert(old_forbidden == false);
2777
2778   //
2779   // Part 1.
2780   // Prepare current log for jumping into it.
2781   // 1.1. Allocate extent
2782   // 1.2. Save log's fnode extents and add new extents
2783   // 1.3. Update op to log
2784   // 1.4. Jump op to log
2785   // During that, no one else can write to log, otherwise we risk jumping backwards.
2786   // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.
2787
2788   // 1.1 allocate new log extents and store them at fnode_tail
2789   File *log_file = log.writer->file.get();
2790   old_log_jump_to = log_file->fnode.get_allocated();
2791   bluefs_fnode_t fnode_tail;
2792   uint64_t runway = log_file->fnode.get_allocated() - log.writer->get_effective_write_pos();
2793   dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
2794            << " need 0x" << cct->_conf->bluefs_max_log_runway << std::dec << dendl;
2795   int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
2796                     cct->_conf->bluefs_max_log_runway,
2797                     0,
2798                     &fnode_tail);
2799   ceph_assert(r == 0);
2800
2801   // 1.2 save log's fnode extents and add new extents
2802   bluefs_fnode_t old_log_fnode(log_file->fnode);
2803   log_file->fnode.clone_extents(fnode_tail);
2804   //adjust usage as flush below will need it
2805   vselector->sub_usage(log_file->vselector_hint, old_log_fnode);
2806   vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2807   dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2808
2809   // 1.3 update the log file change and log a jump to the offset where we want to
2810   // write the new entries
2811   log.t.op_file_update_inc(log_file->fnode);
2812
2813   // 1.4 jump to new position should mean next seq
2814   log.t.op_jump(log.seq_live + 1, old_log_jump_to);
2815   uint64_t seq_now = log.seq_live;
2816   // we need to flush all bdev because we will be streaming all dirty files to log
2817   // TODO - think - if _flush_and_sync_log_jump will not add dirty files nor release pending allocations
2818   // then flush_bdev() will not be necessary
2819   _flush_bdev();
2820   _flush_and_sync_log_jump_D(old_log_jump_to, runway);
2821
2822   //
2823   // Part 2.
2824   // Build new log starter and compacted metadata body
2825   // 2.1.  Build full compacted meta transaction.
2826   //       While still holding the lock, encode a bluefs transaction
2827   //       that dumps all of the in-memory fnodes and names.
2828   //       This might be pretty large and its allocation map can exceed
2829   //       superblock size. Hence instead we'll need log starter part which
2830   //       goes to superblock and refers that new meta through op_update_inc.
2831   // 2.2.  After releasing the lock allocate space for the above transaction
2832   //       using its size estimation.
2833   //       Then build tailing list of extents which consists of these
2834   //       newly allocated extents followed by ones from Part 1.
2835   // 2.3.  Allocate the space required for the starter part of the new log.
2836   //       It should be small enough to fit into superblock.
2837   //       Effectively we start building new log fnode here.
2838   // 2.4.  Store starter fnode to run-time superblock, to be written out later
2839   // 2.5.  Finalize new log's fnode building
2840   //       This will include log's starter and tailing extents built at 2.2
2841   // 2.6.  Encode new log fnode starter,
2842   //       It will include op_init, new log's op_update_inc
2843   //       and jump to the compacted meta transaction beginning.
2844   //       Superblock will reference this starter part
2845   // 2.7.  Encode compacted meta transaction,
2846   //       extend the transaction with a jump to the log tail from 1.1 before
2847   //       encoding.
2848   //
2849
2850   // 2.1 Build full compacted meta transaction
2851   bluefs_transaction_t compacted_meta_t;
2852   _compact_log_dump_metadata_NF(starter_seq + 1, &compacted_meta_t, 0, seq_now);
2853
2854   // now state is captured to compacted_meta_t,
2855   // current log can be used to write to,
2856   //ops in log will be continuation of captured state
2857   logger->tinc(l_bluefs_compaction_lock_lat, mono_clock::now() - t0);
2858   log.lock.unlock();
2859
2860   // 2.2 Allocate the space required for the compacted meta transaction
2861   uint64_t compacted_meta_need = _estimate_transaction_size(&compacted_meta_t);
2862   dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need
2863            << dendl;
2864   {
2865     bluefs_fnode_t fnode_pre_tail;
2866     // do allocate
2867     r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
2868                   compacted_meta_need,
2869                   0,
2870                   &fnode_pre_tail);
2871     ceph_assert(r == 0);
2872     // build trailing list of extents in fnode_tail,
2873     // this will include newly allocated extents for compacted meta
2874     // and aux extents allocated at step 1.1
2875     fnode_pre_tail.claim_extents(fnode_tail.extents);
2876     fnode_tail.swap_extents(fnode_pre_tail);
2877   }
2878
2879   // 2.3 Allocate the space required for the starter part of the new log.
2880   // Start building New log fnode
2881   FileRef new_log = nullptr;
2882   new_log = ceph::make_ref<File>();
2883   new_log->fnode.ino = log_file->fnode.ino;
2884   new_log->fnode.mtime = mtime;
2885   // Estimate the required space
2886   uint64_t starter_need =
2887     _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr);
2888   // and now allocate and store at new_log_fnode
2889   r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
2890                 starter_need,
2891                 0,
2892                 &new_log->fnode);
2893   ceph_assert(r == 0);
2894
2895   // 2.4 Store starter fnode to run-time superblock, to be written out later
2896   super.log_fnode = new_log->fnode;
2897
2898   // 2.5 Finalize new log's fnode building
2899   // start collecting new log fnode updates (to make op_update_inc later)
2900   // since this point. This will include compacted meta from 2.2 and aux
2901   // extents from 1.1.
2902   new_log->fnode.reset_delta();
2903   new_log->fnode.claim_extents(fnode_tail.extents);
2904
2905   // 2.6 Encode new log fnode
2906   bufferlist starter_bl;
2907   _make_initial_transaction(starter_seq, new_log->fnode, starter_need,
2908     &starter_bl);
2909
2910   // 2.7 Encode compacted meta transaction,
2911   dout(20) << __func__
2912            << " new_log jump seq " << seq_now
2913            << std::hex << " offset 0x" << starter_need + compacted_meta_need
2914            << std::dec << dendl;
2915   // Extent compacted_meta transaction with a just to new log tail.
2916   // Hopefully "compact_meta_need" estimation provides enough extra space
2917   // for this new jump, assert below if not
2918   compacted_meta_t.op_jump(seq_now, starter_need + compacted_meta_need);
2919   // Now do encodeing and padding
2920   bufferlist compacted_meta_bl;
2921   compacted_meta_bl.reserve(compacted_meta_need);
2922   encode(compacted_meta_t, compacted_meta_bl);
2923   ceph_assert(compacted_meta_bl.length() <= compacted_meta_need);
2924   _pad_bl(compacted_meta_bl, compacted_meta_need);
2925
2926   //
2927   // Part 3.
2928   // Write out new log's content
2929   // 3.1 Stage new log's content writing
2930   // 3.2 Do flush and wait for completion through flush_bdev()
2931   //
2932
2933   // 3.1 Stage new log's content writing
2934   // Make new log writer and append bufferlists to write out.
2935   FileWriter *new_log_writer = _create_writer(new_log);
2936   // And append all new log's bufferlists to write out.
2937   new_log_writer->append(starter_bl);
2938   new_log_writer->append(compacted_meta_bl);
2939
2940   // 3.2. flush and wait
2941   _flush_special(new_log_writer);
2942   _flush_bdev(new_log_writer, false); // do not check log.lock is locked
2943
2944   // Part 4.
2945   // Write out new superblock to reflect all the changes.
2946   //
2947
2948   _write_super(BDEV_DB);
2949   _flush_bdev();
2950
2951   // Part 5.
2952   // Apply new log fnode
2953   //
2954
2955   // we need to acquire log's lock back at this point
2956   log.lock.lock();
2957   // Reconstruct actual log object from the new one.
2958   vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
2959   log_file->fnode.size =
2960     log.writer->pos - old_log_jump_to + starter_need + compacted_meta_need;
2961   log_file->fnode.mtime = std::max(mtime, log_file->fnode.mtime);
2962   log_file->fnode.swap_extents(new_log->fnode);
2963   // update log's writer
2964   log.writer->pos = log.writer->file->fnode.size;
2965   vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2966   // and unlock
2967   log.lock.unlock();
2968
2969   // we're mostly done
2970   dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2971   logger->inc(l_bluefs_log_compactions);
2972
2973   //Part 6.
2974   // Finalization
2975   // 6.1 Permit log's extension, forbidden at step 0.
2976   //
2977   // 6.2 Release the new log writer
2978   //
2979   // 6.3 Release old space
2980   //
2981   // 6.4. Enable other compactions
2982   //
2983
2984   // 6.1 Permit log's extension, forbidden at step 0.
2985   old_forbidden = atomic_exchange(&log_forbidden_to_expand, false);
2986   ceph_assert(old_forbidden == true);
2987   //to wake up if someone was in need of expanding log
2988   log_cond.notify_all();
2989
2990   // 6.2 Release the new log writer
2991   _close_writer(new_log_writer);
2992   new_log_writer = nullptr;
2993   new_log = nullptr;
2994
2995   // 6.3 Release old space
2996   {
2997     dout(10) << __func__
2998              << " release old log extents " << old_log_fnode.extents
2999              << dendl;
3000     std::lock_guard dl(dirty.lock);
3001     for (auto& r : old_log_fnode.extents) {
3002       dirty.pending_release[r.bdev].insert(r.offset, r.length);
3003     }
3004   }
3005
3006   // 6.4. Enable other compactions
3007   old_is_comp = atomic_exchange(&log_is_compacting, false);
3008   ceph_assert(old_is_comp);
3009 }
3010
3011 void BlueFS::_pad_bl(bufferlist& bl, uint64_t pad_size)
3012 {
3013   pad_size = std::max(pad_size, uint64_t(super.block_size));
3014   uint64_t partial = bl.length() % pad_size;
3015   if (partial) {
3016     dout(10) << __func__ << " padding with 0x" << std::hex
3017              << pad_size - partial << " zeros" << std::dec << dendl;
3018     bl.append_zero(pad_size - partial);
3019   }
3020 }
3021
3022
3023 // Returns log seq that was live before advance.
3024 uint64_t BlueFS::_log_advance_seq()
3025 {
3026   ceph_assert(ceph_mutex_is_locked(dirty.lock));
3027   ceph_assert(ceph_mutex_is_locked(log.lock));
3028   //acquire new seq
3029   // this will became seq_stable once we write
3030   ceph_assert(dirty.seq_stable < dirty.seq_live);
3031   ceph_assert(log.t.seq == log.seq_live);
3032   uint64_t seq = log.seq_live;
3033   log.t.uuid = super.uuid;
3034
3035   ++dirty.seq_live;
3036   ++log.seq_live;
3037   ceph_assert(dirty.seq_live == log.seq_live);
3038   return seq;
3039 }
3040
3041
3042 // Adds to log.t file modifications mentioned in `dirty.files`.
3043 // Note: some bluefs ops may have already been stored in log.t transaction.
3044 void BlueFS::_consume_dirty(uint64_t seq)
3045 {
3046   ceph_assert(ceph_mutex_is_locked(dirty.lock));
3047   ceph_assert(ceph_mutex_is_locked(log.lock));
3048
3049   // log dirty files
3050   // we just incremented log_seq. It is now illegal to add to dirty.files[log_seq]
3051   auto lsi = dirty.files.find(seq);
3052   if (lsi != dirty.files.end()) {
3053     dout(20) << __func__ << " " << lsi->second.size() << " dirty.files" << dendl;
3054     for (auto &f : lsi->second) {
3055       // fnode here is protected indirectly
3056       // the only path that adds to dirty.files goes from _fsync()
3057       // _fsync() is executed under writer lock,
3058       // and does not exit until syncing log is done
3059       dout(20) << __func__ << "   op_file_update_inc " << f.fnode << dendl;
3060       log.t.op_file_update_inc(f.fnode);
3061     }
3062   }
3063 }
3064
3065 // Extends log if its free space is smaller then bluefs_min_log_runway.
3066 // Returns space available *BEFORE* adding new space. Signed for additional <0 detection.
3067 int64_t BlueFS::_maybe_extend_log()
3068 {
3069   ceph_assert(ceph_mutex_is_locked(log.lock));
3070   // allocate some more space (before we run out)?
3071   // BTW: this triggers `flush()` in the `page_aligned_appender` of `log.writer`.
3072   int64_t runway = log.writer->file->fnode.get_allocated() -
3073     log.writer->get_effective_write_pos();
3074   if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
3075     dout(10) << __func__ << " allocating more log runway (0x"
3076              << std::hex << runway << std::dec  << " remaining)" << dendl;
3077     /*
3078      * Usually, when we are low on space in log, we just allocate new extent,
3079      * put update op(log) to log and we are fine.
3080      * Problem - it interferes with log compaction:
3081      * New log produced in compaction will include - as last op - jump into some offset (anchor) of current log.
3082      * It is assumed that log region (anchor - end) will contain all changes made by bluefs since
3083      * full state capture into new log.
3084      * Putting log update into (anchor - end) region is illegal, because any update there must be compatible with
3085      * both logs, but old log is different then new log.
3086      *
3087      * Possible solutions:
3088      * - stall extending log until we finish compacting and switch log (CURRENT)
3089      * - re-run compaction with more runway for old log
3090      * - add OP_FILE_ADDEXT that adds extent; will be compatible with both logs
3091      */
3092     if (log_forbidden_to_expand.load() == true) {
3093       return -EWOULDBLOCK;
3094     }
3095     vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
3096     int r = _allocate(
3097       vselector->select_prefer_bdev(log.writer->file->vselector_hint),
3098       cct->_conf->bluefs_max_log_runway,
3099       0,
3100       &log.writer->file->fnode);
3101     ceph_assert(r == 0);
3102     vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
3103     log.t.op_file_update_inc(log.writer->file->fnode);
3104   }
3105   return runway;
3106 }
3107
3108 void BlueFS::_flush_and_sync_log_core(int64_t runway)
3109 {
3110   ceph_assert(ceph_mutex_is_locked(log.lock));
3111   dout(10) << __func__ << " " << log.t << dendl;
3112
3113   bufferlist bl;
3114   bl.reserve(super.block_size);
3115   encode(log.t, bl);
3116   // pad to block boundary
3117   size_t realign = super.block_size - (bl.length() % super.block_size);
3118   if (realign && realign != super.block_size)
3119     bl.append_zero(realign);
3120
3121   logger->inc(l_bluefs_log_write_count, 1);
3122   logger->inc(l_bluefs_logged_bytes, bl.length());
3123
3124   if (true) {
3125     ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss
3126                                         // transaction will not fit extents before growth -> data loss on _replay
3127   }
3128
3129   log.writer->append(bl);
3130
3131   // prepare log for new transactions
3132   log.t.clear();
3133   log.t.seq = log.seq_live;
3134
3135   uint64_t new_data = _flush_special(log.writer);
3136   vselector->add_usage(log.writer->file->vselector_hint, new_data);
3137 }
3138
3139 // Clears dirty.files up to (including) seq_stable.
3140 void BlueFS::_clear_dirty_set_stable_D(uint64_t seq)
3141 {
3142   std::lock_guard dl(dirty.lock);
3143
3144   // clean dirty files
3145   if (seq > dirty.seq_stable) {
3146     dirty.seq_stable = seq;
3147     dout(20) << __func__ << " seq_stable " << dirty.seq_stable << dendl;
3148
3149     // undirty all files that were already streamed to log
3150     auto p = dirty.files.begin();
3151     while (p != dirty.files.end()) {
3152       if (p->first > dirty.seq_stable) {
3153         dout(20) << __func__ << " done cleaning up dirty files" << dendl;
3154         break;
3155       }
3156
3157       auto l = p->second.begin();
3158       while (l != p->second.end()) {
3159         File *file = &*l;
3160         ceph_assert(file->dirty_seq <= dirty.seq_stable);
3161         dout(20) << __func__ << " cleaned file " << file->fnode.ino << dendl;
3162         file->dirty_seq = dirty.seq_stable;
3163         p->second.erase(l++);
3164       }
3165
3166       ceph_assert(p->second.empty());
3167       dirty.files.erase(p++);
3168     }
3169   } else {
3170     dout(20) << __func__ << " seq_stable " << dirty.seq_stable
3171              << " already >= out seq " << seq
3172              << ", we lost a race against another log flush, done" << dendl;
3173   }
3174 }
3175
3176 void BlueFS::_release_pending_allocations(vector<interval_set<uint64_t>>& to_release)
3177 {
3178   for (unsigned i = 0; i < to_release.size(); ++i) {
3179     if (to_release[i].empty()) {
3180         continue;
3181     }
3182     /* OK, now we have the guarantee alloc[i] won't be null. */
3183
3184     bool discard_queued = bdev[i]->try_discard(to_release[i]);
3185     if (!discard_queued) {
3186       alloc[i]->release(to_release[i]);
3187       if (is_shared_alloc(i)) {
3188         shared_alloc->bluefs_used -= to_release[i].size();
3189       }
3190     }
3191   }
3192 }
3193
3194 int BlueFS::_flush_and_sync_log_LD(uint64_t want_seq)
3195 {
3196   int64_t available_runway;
3197   do {
3198     log.lock.lock();
3199     dirty.lock.lock();
3200     if (want_seq && want_seq <= dirty.seq_stable) {
3201       dout(10) << __func__ << " want_seq " << want_seq << " <= seq_stable "
3202                << dirty.seq_stable << ", done" << dendl;
3203       dirty.lock.unlock();
3204       log.lock.unlock();
3205       return 0;
3206     }
3207
3208     available_runway = _maybe_extend_log();
3209     if (available_runway == -EWOULDBLOCK) {
3210       // we are in need of adding runway, but we are during log-switch from compaction
3211       dirty.lock.unlock();
3212       //instead log.lock.unlock() do move ownership
3213       std::unique_lock<ceph::mutex> ll(log.lock, std::adopt_lock);
3214       while (log_forbidden_to_expand.load()) {
3215         log_cond.wait(ll);
3216       }
3217     } else {
3218       ceph_assert(available_runway >= 0);
3219     }
3220   } while (available_runway < 0);
3221
3222   ceph_assert(want_seq == 0 || want_seq <= dirty.seq_live); // illegal to request seq that was not created yet
3223   uint64_t seq =_log_advance_seq();
3224   _consume_dirty(seq);
3225   vector<interval_set<uint64_t>> to_release(dirty.pending_release.size());
3226   to_release.swap(dirty.pending_release);
3227   dirty.lock.unlock();
3228
3229   _flush_and_sync_log_core(available_runway);
3230   _flush_bdev(log.writer);
3231   logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
3232   //now log.lock is no longer needed
3233   log.lock.unlock();
3234
3235   _clear_dirty_set_stable_D(seq);
3236   _release_pending_allocations(to_release);
3237
3238   _update_logger_stats();
3239   return 0;
3240 }
3241
3242 // Flushes log and immediately adjusts log_writer pos.
3243 int BlueFS::_flush_and_sync_log_jump_D(uint64_t jump_to,
3244                                      int64_t available_runway)
3245 {
3246   ceph_assert(ceph_mutex_is_locked(log.lock));
3247
3248   ceph_assert(jump_to);
3249   // we synchronize writing to log, by lock to log.lock
3250
3251   dirty.lock.lock();
3252   uint64_t seq =_log_advance_seq();
3253   _consume_dirty(seq);
3254   vector<interval_set<uint64_t>> to_release(dirty.pending_release.size());
3255   to_release.swap(dirty.pending_release);
3256   dirty.lock.unlock();
3257   _flush_and_sync_log_core(available_runway);
3258
3259   dout(10) << __func__ << " jumping log offset from 0x" << std::hex
3260            << log.writer->pos << " -> 0x" << jump_to << std::dec << dendl;
3261   log.writer->pos = jump_to;
3262   vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size);
3263   log.writer->file->fnode.size = jump_to;
3264   vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size);
3265
3266   _flush_bdev(log.writer);
3267
3268   _clear_dirty_set_stable_D(seq);
3269   _release_pending_allocations(to_release);
3270
3271   logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
3272   _update_logger_stats();
3273   return 0;
3274 }
3275
3276 ceph::bufferlist BlueFS::FileWriter::flush_buffer(
3277   CephContext* const cct,
3278   const bool partial,
3279   const unsigned length,
3280   const bluefs_super_t& super)
3281 {
3282   ceph_assert(ceph_mutex_is_locked(this->lock) || file->fnode.ino <= 1);
3283   ceph::bufferlist bl;
3284   if (partial) {
3285     tail_block.splice(0, tail_block.length(), &bl);
3286   }
3287   const auto remaining_len = length - bl.length();
3288   buffer.splice(0, remaining_len, &bl);
3289   if (buffer.length()) {
3290     dout(20) << " leaving 0x" << std::hex << buffer.length() << std::dec
3291              << " unflushed" << dendl;
3292   }
3293   if (const unsigned tail = bl.length() & ~super.block_mask(); tail) {
3294     const auto padding_len = super.block_size - tail;
3295     dout(20) << __func__ << " caching tail of 0x"
3296              << std::hex << tail
3297              << " and padding block with 0x" << padding_len
3298              << " buffer.length() " << buffer.length()
3299              << std::dec << dendl;
3300     // We need to go through the `buffer_appender` to get a chance to
3301     // preserve in-memory contiguity and not mess with the alignment.
3302     // Otherwise a costly rebuild could happen in e.g. `KernelDevice`.
3303     buffer_appender.append_zero(padding_len);
3304     buffer.splice(buffer.length() - padding_len, padding_len, &bl);
3305     // Deep copy the tail here. This allows to avoid costlier copy on
3306     // bufferlist rebuild in e.g. `KernelDevice` and minimizes number
3307     // of memory allocations.
3308     // The alternative approach would be to place the entire tail and
3309     // padding on a dedicated, 4 KB long memory chunk. This shouldn't
3310     // trigger the rebuild while still being less expensive.
3311     buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail);
3312     buffer.splice(buffer.length() - tail, tail, &tail_block);
3313   } else {
3314     tail_block.clear();
3315   }
3316   return bl;
3317 }
3318
3319 int BlueFS::_signal_dirty_to_log_D(FileWriter *h)
3320 {
3321   ceph_assert(ceph_mutex_is_locked(h->lock));
3322   std::lock_guard dl(dirty.lock);
3323   if (h->file->deleted) {
3324     dout(10) << __func__ << "  deleted, no-op" << dendl;
3325     return 0;
3326   }
3327
3328   h->file->fnode.mtime = ceph_clock_now();
3329   ceph_assert(h->file->fnode.ino >= 1);
3330   if (h->file->dirty_seq <= dirty.seq_stable) {
3331     h->file->dirty_seq = dirty.seq_live;
3332     dirty.files[h->file->dirty_seq].push_back(*h->file);
3333     dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
3334              << " (was clean)" << dendl;
3335   } else {
3336     if (h->file->dirty_seq != dirty.seq_live) {
3337       // need re-dirty, erase from list first
3338       ceph_assert(dirty.files.count(h->file->dirty_seq));
3339       auto it = dirty.files[h->file->dirty_seq].iterator_to(*h->file);
3340       dirty.files[h->file->dirty_seq].erase(it);
3341       h->file->dirty_seq = dirty.seq_live;
3342       dirty.files[h->file->dirty_seq].push_back(*h->file);
3343       dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
3344                << " (was " << h->file->dirty_seq << ")" << dendl;
3345     } else {
3346       dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
3347                << " (unchanged, do nothing) " << dendl;
3348     }
3349   }
3350   return 0;
3351 }
3352
3353 void BlueFS::flush_range(FileWriter *h, uint64_t offset, uint64_t length)/*_WF*/
3354 {
3355   _maybe_check_vselector_LNF();
3356   std::unique_lock hl(h->lock);
3357   _flush_range_F(h, offset, length);
3358 }
3359
3360 int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length)
3361 {
3362   ceph_assert(ceph_mutex_is_locked(h->lock));
3363   ceph_assert(h->file->num_readers.load() == 0);
3364   ceph_assert(h->file->fnode.ino > 1);
3365
3366   dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
3367            << " 0x" << offset << "~" << length << std::dec
3368            << " to " << h->file->fnode << dendl;
3369   if (h->file->deleted) {
3370     dout(10) << __func__ << "  deleted, no-op" << dendl;
3371     return 0;
3372   }
3373
3374   bool buffered = cct->_conf->bluefs_buffered_io;
3375
3376   if (offset + length <= h->pos)
3377     return 0;
3378   if (offset < h->pos) {
3379     length -= h->pos - offset;
3380     offset = h->pos;
3381     dout(10) << " still need 0x"
3382              << std::hex << offset << "~" << length << std::dec
3383              << dendl;
3384   }
3385   std::lock_guard file_lock(h->file->lock);
3386   ceph_assert(offset <= h->file->fnode.size);
3387
3388   uint64_t allocated = h->file->fnode.get_allocated();
3389   vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
3390   // do not bother to dirty the file if we are overwriting
3391   // previously allocated extents.
3392   if (allocated < offset + length) {
3393     // we should never run out of log space here; see the min runway check
3394     // in _flush_and_sync_log.
3395     int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
3396                       offset + length - allocated,
3397                       0,
3398                       &h->file->fnode);
3399     if (r < 0) {
3400       derr << __func__ << " allocated: 0x" << std::hex << allocated
3401            << " offset: 0x" << offset << " length: 0x" << length << std::dec
3402            << dendl;
3403       vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
3404       ceph_abort_msg("bluefs enospc");
3405       return r;
3406     }
3407     h->file->is_dirty = true;
3408   }
3409   if (h->file->fnode.size < offset + length) {
3410     h->file->fnode.size = offset + length;
3411     h->file->is_dirty = true;
3412   }
3413
3414   dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl;
3415   int res = _flush_data(h, offset, length, buffered);
3416   vselector->add_usage(h->file->vselector_hint, h->file->fnode);
3417   return res;
3418 }
3419
3420 int BlueFS::_flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered)
3421 {
3422   if (h->file->fnode.ino > 1) {
3423     ceph_assert(ceph_mutex_is_locked(h->lock));
3424     ceph_assert(ceph_mutex_is_locked(h->file->lock));
3425   }
3426   uint64_t x_off = 0;
3427   auto p = h->file->fnode.seek(offset, &x_off);
3428   ceph_assert(p != h->file->fnode.extents.end());
3429   dout(20) << __func__ << " in " << *p << " x_off 0x"
3430            << std::hex << x_off << std::dec << dendl;
3431
3432   unsigned partial = x_off & ~super.block_mask();
3433   if (partial) {
3434     dout(20) << __func__ << " using partial tail 0x"
3435              << std::hex << partial << std::dec << dendl;
3436     x_off -= partial;
3437     offset -= partial;
3438     length += partial;
3439     dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
3440     for (auto p : h->iocv) {
3441       if (p) {
3442         p->aio_wait();
3443       }
3444     }
3445   }
3446
3447   auto bl = h->flush_buffer(cct, partial, length, super);
3448   ceph_assert(bl.length() >= length);
3449   h->pos = offset + length;
3450   length = bl.length();
3451
3452   logger->inc(l_bluefs_write_count, 1);
3453   logger->inc(l_bluefs_write_bytes, length);
3454
3455   switch (h->writer_type) {
3456   case WRITER_WAL:
3457     logger->inc(l_bluefs_write_count_wal, 1);
3458     logger->inc(l_bluefs_bytes_written_wal, length);
3459     break;
3460   case WRITER_SST:
3461     logger->inc(l_bluefs_write_count_sst, 1);
3462     logger->inc(l_bluefs_bytes_written_sst, length);
3463     break;
3464   }
3465
3466   dout(30) << "dump:\n";
3467   bl.hexdump(*_dout);
3468   *_dout << dendl;
3469
3470   uint64_t bloff = 0;
3471   uint64_t bytes_written_slow = 0;
3472   while (length > 0) {
3473     logger->inc(l_bluefs_write_disk_count, 1);
3474
3475     uint64_t x_len = std::min(p->length - x_off, length);
3476     bufferlist t;
3477     t.substr_of(bl, bloff, x_len);
3478     if (cct->_conf->bluefs_sync_write) {
3479       bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
3480     } else {
3481       bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
3482     }
3483     h->dirty_devs[p->bdev] = true;
3484     if (p->bdev == BDEV_SLOW) {
3485       bytes_written_slow += t.length();
3486     }
3487
3488     bloff += x_len;
3489     length -= x_len;
3490     ++p;
3491     x_off = 0;
3492   }
3493   if (bytes_written_slow) {
3494     logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
3495   }
3496   for (unsigned i = 0; i < MAX_BDEV; ++i) {
3497     if (bdev[i]) {
3498       if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
3499         bdev[i]->aio_submit(h->iocv[i]);
3500       }
3501     }
3502   }
3503   dout(20) << __func__ << " h " << h << " pos now 0x"
3504            << std::hex << h->pos << std::dec << dendl;
3505   return 0;
3506 }
3507
3508 #ifdef HAVE_LIBAIO
3509 // we need to retire old completed aios so they don't stick around in
3510 // memory indefinitely (along with their bufferlist refs).
3511 void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
3512 {
3513   for (auto p : h->iocv) {
3514     if (p) {
3515       ls->splice(ls->end(), p->running_aios);
3516     }
3517   }
3518   dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
3519 }
3520
3521 void BlueFS::_wait_for_aio(FileWriter *h)
3522 {
3523   // NOTE: this is safe to call without a lock, as long as our reference is
3524   // stable.
3525   utime_t start;
3526   lgeneric_subdout(cct, bluefs, 10) << __func__;
3527   start = ceph_clock_now();
3528   *_dout << " " << h << dendl;
3529   for (auto p : h->iocv) {
3530     if (p) {
3531       p->aio_wait();
3532     }
3533   }
3534   dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
3535 }
3536 #endif
3537
3538 void BlueFS::append_try_flush(FileWriter *h, const char* buf, size_t len)/*_WF_LNF_NF_LD_D*/
3539 {
3540   bool flushed_sum = false;
3541   {
3542     std::unique_lock hl(h->lock);
3543     size_t max_size = 1ull << 30; // cap to 1GB
3544     while (len > 0) {
3545       bool need_flush = true;
3546       auto l0 = h->get_buffer_length();
3547       if (l0 < max_size) {
3548         size_t l = std::min(len, max_size - l0);
3549         h->append(buf, l);
3550         buf += l;
3551         len -= l;
3552         need_flush = h->get_buffer_length() >= cct->_conf->bluefs_min_flush_size;
3553       }
3554       if (need_flush) {
3555         bool flushed = false;
3556         int r = _flush_F(h, true, &flushed);
3557         ceph_assert(r == 0);
3558         flushed_sum |= flushed;
3559         // make sure we've made any progress with flush hence the
3560         // loop doesn't iterate forever
3561         ceph_assert(h->get_buffer_length() < max_size);
3562       }
3563     }
3564   }
3565   if (flushed_sum) {
3566     _maybe_compact_log_LNF_NF_LD_D();
3567   }
3568 }
3569
3570 void BlueFS::flush(FileWriter *h, bool force)/*_WF_LNF_NF_LD_D*/
3571 {
3572   bool flushed = false;
3573   int r;
3574   {
3575     std::unique_lock hl(h->lock);
3576     r = _flush_F(h, force, &flushed);
3577     ceph_assert(r == 0);
3578   }
3579   if (r == 0 && flushed) {
3580     _maybe_compact_log_LNF_NF_LD_D();
3581   }
3582 }
3583
3584 int BlueFS::_flush_F(FileWriter *h, bool force, bool *flushed)
3585 {
3586   ceph_assert(ceph_mutex_is_locked(h->lock));
3587   uint64_t length = h->get_buffer_length();
3588   uint64_t offset = h->pos;
3589   if (flushed) {
3590     *flushed = false;
3591   }
3592   if (!force &&
3593       length < cct->_conf->bluefs_min_flush_size) {
3594     dout(10) << __func__ << " " << h << " ignoring, length " << length
3595              << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
3596              << dendl;
3597     return 0;
3598   }
3599   if (length == 0) {
3600     dout(10) << __func__ << " " << h << " no dirty data on "
3601              << h->file->fnode << dendl;
3602     return 0;
3603   }
3604   dout(10) << __func__ << " " << h << " 0x"
3605            << std::hex << offset << "~" << length << std::dec
3606            << " to " << h->file->fnode << dendl;
3607   ceph_assert(h->pos <= h->file->fnode.size);
3608   int r = _flush_range_F(h, offset, length);
3609   if (flushed) {
3610     *flushed = true;
3611   }
3612   return r;
3613 }
3614
3615 // Flush for bluefs special files.
3616 // Does not add extents to h.
3617 // Does not mark h as dirty.
3618 // we do not need to dirty the log file (or it's compacting
3619 // replacement) when the file size changes because replay is
3620 // smart enough to discover it on its own.
3621 uint64_t BlueFS::_flush_special(FileWriter *h)
3622 {
3623   ceph_assert(h->file->fnode.ino <= 1);
3624   uint64_t length = h->get_buffer_length();
3625   uint64_t offset = h->pos;
3626   uint64_t new_data = 0;
3627   ceph_assert(length + offset <= h->file->fnode.get_allocated());
3628   if (h->file->fnode.size < offset + length) {
3629     new_data = offset + length - h->file->fnode.size;
3630     h->file->fnode.size = offset + length;
3631   }
3632   _flush_data(h, offset, length, false);
3633   return new_data;
3634 }
3635
3636 int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
3637 {
3638   std::lock_guard hl(h->lock);
3639   dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
3640            << " file " << h->file->fnode << dendl;
3641   if (h->file->deleted) {
3642     dout(10) << __func__ << "  deleted, no-op" << dendl;
3643     return 0;
3644   }
3645
3646   // we never truncate internal log files
3647   ceph_assert(h->file->fnode.ino > 1);
3648
3649   // truncate off unflushed data?
3650   if (h->pos < offset &&
3651       h->pos + h->get_buffer_length() > offset) {
3652     dout(20) << __func__ << " tossing out last " << offset - h->pos
3653              << " unflushed bytes" << dendl;
3654     ceph_abort_msg("actually this shouldn't happen");
3655   }
3656   if (h->get_buffer_length()) {
3657     int r = _flush_F(h, true);
3658     if (r < 0)
3659       return r;
3660   }
3661   if (offset == h->file->fnode.size) {
3662     return 0;  // no-op!
3663   }
3664   if (offset > h->file->fnode.size) {
3665     ceph_abort_msg("truncate up not supported");
3666   }
3667   ceph_assert(h->file->fnode.size >= offset);
3668   _flush_bdev(h);
3669
3670   std::lock_guard ll(log.lock);
3671   vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
3672   h->file->fnode.size = offset;
3673   h->file->is_dirty = true;
3674   vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
3675   log.t.op_file_update_inc(h->file->fnode);
3676   return 0;
3677 }
3678
3679 int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/
3680 {
3681   _maybe_check_vselector_LNF();
3682   std::unique_lock hl(h->lock);
3683   uint64_t old_dirty_seq = 0;
3684   {
3685     dout(10) << __func__ << " " << h << " " << h->file->fnode
3686              << " dirty " << h->file->is_dirty << dendl;
3687     int r = _flush_F(h, true);
3688     if (r < 0)
3689       return r;
3690     _flush_bdev(h);
3691     if (h->file->is_dirty) {
3692       _signal_dirty_to_log_D(h);
3693       h->file->is_dirty = false;
3694     }
3695     {
3696       std::lock_guard dl(dirty.lock);
3697       if (dirty.seq_stable < h->file->dirty_seq) {
3698         old_dirty_seq = h->file->dirty_seq;
3699         dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
3700                  << ") on " << h->file->fnode << ", flushing log" << dendl;
3701       }
3702     }
3703   }
3704   if (old_dirty_seq) {
3705     _flush_and_sync_log_LD(old_dirty_seq);
3706   }
3707   _maybe_compact_log_LNF_NF_LD_D();
3708
3709   return 0;
3710 }
3711
3712 // be careful - either h->file->lock or log.lock must be taken
3713 void BlueFS::_flush_bdev(FileWriter *h, bool check_mutext_locked)
3714 {
3715   if (check_mutext_locked) {
3716     if (h->file->fnode.ino > 1) {
3717       ceph_assert(ceph_mutex_is_locked(h->lock));
3718     } else if (h->file->fnode.ino == 1) {
3719       ceph_assert(ceph_mutex_is_locked(log.lock));
3720     }
3721   }
3722   std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
3723   h->dirty_devs.fill(false);
3724 #ifdef HAVE_LIBAIO
3725   if (!cct->_conf->bluefs_sync_write) {
3726     list<aio_t> completed_ios;
3727     _claim_completed_aios(h, &completed_ios);
3728     _wait_for_aio(h);
3729     completed_ios.clear();
3730   }
3731 #endif
3732   _flush_bdev(flush_devs);
3733 }
3734
3735 void BlueFS::_flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
3736 {
3737   // NOTE: this is safe to call without a lock.
3738   dout(20) << __func__ << dendl;
3739   for (unsigned i = 0; i < MAX_BDEV; i++) {
3740     if (dirty_bdevs[i])
3741       bdev[i]->flush();
3742   }
3743 }
3744
3745 void BlueFS::_flush_bdev()
3746 {
3747   // NOTE: this is safe to call without a lock.
3748   dout(20) << __func__ << dendl;
3749   for (unsigned i = 0; i < MAX_BDEV; i++) {
3750     // alloc space from BDEV_SLOW is unexpected.
3751     // So most cases we don't alloc from BDEV_SLOW and so avoiding flush not-used device.
3752     if (bdev[i] && (i != BDEV_SLOW || _get_used(i))) {
3753       bdev[i]->flush();
3754     }
3755   }
3756 }
3757
3758 const char* BlueFS::get_device_name(unsigned id)
3759 {
3760   if (id >= MAX_BDEV) return "BDEV_INV";
3761   const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3762   return names[id];
3763 }
3764
3765 int BlueFS::_allocate(uint8_t id, uint64_t len,
3766                       uint64_t alloc_unit,
3767                       bluefs_fnode_t* node,
3768                       size_t alloc_attempts,
3769                       bool permit_dev_fallback)
3770 {
3771   dout(10) << __func__ << " len 0x" << std::hex << len
3772            << " au 0x" << alloc_unit
3773            << std::dec << " from " << (int)id
3774            << " cooldown " << cooldown_deadline
3775            << dendl;
3776   ceph_assert(id < alloc.size());
3777   int64_t alloc_len = 0;
3778   PExtentVector extents;
3779   uint64_t hint = 0;
3780   int64_t need = len;
3781   bool shared = is_shared_alloc(id);
3782   auto shared_unit = shared_alloc ? shared_alloc->alloc_unit : 0;
3783   bool was_cooldown = false;
3784   if (alloc[id]) {
3785     if (!alloc_unit) {
3786       alloc_unit = alloc_size[id];
3787     }
3788     // do not attempt shared_allocator with bluefs alloc unit
3789     // when cooling down, fallback to slow dev alloc unit.
3790     if (shared && alloc_unit != shared_unit) {
3791        if (duration_cast<seconds>(real_clock::now().time_since_epoch()).count() <
3792            cooldown_deadline) {
3793          logger->inc(l_bluefs_alloc_shared_size_fallbacks);
3794          alloc_unit = shared_unit;
3795          was_cooldown = true;
3796        } else if (cooldown_deadline.fetch_and(0)) {
3797          // we might get false cooldown_deadline reset at this point
3798          // but that's mostly harmless.
3799          dout(1) << __func__ << " shared allocation cooldown period elapsed"
3800                  << dendl;
3801        }
3802     }
3803     need = round_up_to(len, alloc_unit);
3804     if (!node->extents.empty() && node->extents.back().bdev == id) {
3805       hint = node->extents.back().end();
3806     }
3807     ++alloc_attempts;
3808     extents.reserve(4);  // 4 should be (more than) enough for most allocations
3809     alloc_len = alloc[id]->allocate(need, alloc_unit, hint, &extents);
3810   }
3811   if (alloc_len < 0 || alloc_len < need) {
3812     if (alloc[id]) {
3813       if (alloc_len > 0) {
3814         alloc[id]->release(extents);
3815       }
3816       if (!was_cooldown && shared) {
3817         auto delay_s = cct->_conf->bluefs_failed_shared_alloc_cooldown;
3818         cooldown_deadline = delay_s +
3819           duration_cast<seconds>(real_clock::now().time_since_epoch()).count();
3820         dout(1) << __func__ << " shared allocation cooldown set for "
3821                 << delay_s << "s"
3822                 << dendl;
3823       }
3824       dout(1) << __func__ << " unable to allocate 0x" << std::hex << need
3825               << " on bdev " << (int)id
3826               << ", allocator name " << alloc[id]->get_name()
3827               << ", allocator type " << alloc[id]->get_type()
3828               << ", capacity 0x" << alloc[id]->get_capacity()
3829               << ", block size 0x" << alloc[id]->get_block_size()
3830               << ", alloc unit 0x" << alloc_unit
3831               << ", free 0x" << alloc[id]->get_free()
3832               << ", fragmentation " << alloc[id]->get_fragmentation()
3833               << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
3834               << std::dec << dendl;
3835     } else {
3836       dout(20) << __func__ << " alloc-id not set on index="<< (int)id
3837                << " unable to allocate 0x" << std::hex << need
3838                << " on bdev " << (int)id << std::dec << dendl;
3839     }
3840     if (alloc[id] && shared && alloc_unit != shared_unit) {
3841       alloc_unit = shared_unit;
3842       dout(20) << __func__ << " fallback to bdev "
3843                << (int)id
3844                << " with alloc unit 0x" << std::hex << alloc_unit
3845                << std::dec << dendl;
3846       logger->inc(l_bluefs_alloc_shared_size_fallbacks);
3847       return _allocate(id,
3848                        len,
3849                        alloc_unit,
3850                        node,
3851                        alloc_attempts,
3852                        permit_dev_fallback);
3853     } else if (permit_dev_fallback && id != BDEV_SLOW && alloc[id + 1]) {
3854       dout(20) << __func__ << " fallback to bdev "
3855                << (int)id + 1
3856                << dendl;
3857       if (alloc_attempts > 0 && is_shared_alloc(id + 1)) {
3858         logger->inc(l_bluefs_alloc_shared_dev_fallbacks);
3859       }
3860       return _allocate(id + 1,
3861                        len,
3862                        0, // back to default alloc unit
3863                        node,
3864                        alloc_attempts,
3865                        permit_dev_fallback);
3866     } else {
3867       derr << __func__ << " allocation failed, needed 0x" << std::hex << need
3868            << dendl;
3869     }
3870     return -ENOSPC;
3871   } else {
3872     uint64_t used = _get_used(id);
3873     if (max_bytes[id] < used) {
3874       logger->set(max_bytes_pcounters[id], used);
3875       max_bytes[id] = used;
3876     }
3877     if (shared) {
3878       shared_alloc->bluefs_used += alloc_len;
3879     }
3880   }
3881
3882   for (auto& p : extents) {
3883     node->append_extent(bluefs_extent_t(id, p.offset, p.length));
3884   }
3885
3886   return 0;
3887 }
3888
3889 int BlueFS::preallocate(FileRef f, uint64_t off, uint64_t len)/*_LF*/
3890 {
3891   std::lock_guard ll(log.lock);
3892   std::lock_guard fl(f->lock);
3893   dout(10) << __func__ << " file " << f->fnode << " 0x"
3894            << std::hex << off << "~" << len << std::dec << dendl;
3895   if (f->deleted) {
3896     dout(10) << __func__ << "  deleted, no-op" << dendl;
3897     return 0;
3898   }
3899   ceph_assert(f->fnode.ino > 1);
3900   uint64_t allocated = f->fnode.get_allocated();
3901   if (off + len > allocated) {
3902     uint64_t want = off + len - allocated;
3903
3904     vselector->sub_usage(f->vselector_hint, f->fnode);
3905     int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
3906       want,
3907       0,
3908       &f->fnode);
3909     vselector->add_usage(f->vselector_hint, f->fnode);
3910     if (r < 0)
3911       return r;
3912
3913     log.t.op_file_update_inc(f->fnode);
3914   }
3915   return 0;
3916 }
3917
3918 void BlueFS::sync_metadata(bool avoid_compact)/*_LNF_NF_LD_D*/
3919 {
3920   bool can_skip_flush;
3921   {
3922     std::lock_guard ll(log.lock);
3923     std::lock_guard dl(dirty.lock);
3924     can_skip_flush = log.t.empty() && dirty.files.empty();
3925   }
3926   if (can_skip_flush) {
3927     dout(10) << __func__ << " - no pending log events" << dendl;
3928   } else {
3929     utime_t start;
3930     lgeneric_subdout(cct, bluefs, 10) << __func__;
3931     start = ceph_clock_now();
3932     *_dout <<  dendl;
3933     _flush_bdev(); // FIXME?
3934     _flush_and_sync_log_LD();
3935     dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
3936   }
3937
3938   if (!avoid_compact) {
3939     _maybe_compact_log_LNF_NF_LD_D();
3940   }
3941 }
3942
3943 void BlueFS::_maybe_compact_log_LNF_NF_LD_D()
3944 {
3945   if (!cct->_conf->bluefs_replay_recovery_disable_compact &&
3946       _should_start_compact_log_L_N()) {
3947     auto t0 = mono_clock::now();
3948     if (cct->_conf->bluefs_compact_log_sync) {
3949       _compact_log_sync_LNF_LD();
3950     } else {
3951       _compact_log_async_LD_LNF_D();
3952     }
3953     logger->tinc(l_bluefs_compaction_lat, mono_clock::now() - t0);
3954   }
3955 }
3956
3957 int BlueFS::open_for_write(
3958   std::string_view dirname,
3959   std::string_view filename,
3960   FileWriter **h,
3961   bool overwrite)/*_LND*/
3962 {
3963   _maybe_check_vselector_LNF();
3964   FileRef file;
3965   bool create = false;
3966   bool truncate = false;
3967   mempool::bluefs::vector<bluefs_extent_t> pending_release_extents;
3968   {
3969   std::lock_guard ll(log.lock);
3970   std::lock_guard nl(nodes.lock);
3971   dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3972   map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
3973   DirRef dir;
3974   if (p == nodes.dir_map.end()) {
3975     // implicitly create the dir
3976     dout(20) << __func__ << "  dir " << dirname
3977              << " does not exist" << dendl;
3978     return -ENOENT;
3979   } else {
3980     dir = p->second;
3981   }
3982
3983   map<string,FileRef>::iterator q = dir->file_map.find(filename);
3984   if (q == dir->file_map.end()) {
3985     if (overwrite) {
3986       dout(20) << __func__ << " dir " << dirname << " (" << dir
3987                << ") file " << filename
3988                << " does not exist" << dendl;
3989       return -ENOENT;
3990     }
3991     file = ceph::make_ref<File>();
3992     file->fnode.ino = ++ino_last;
3993     nodes.file_map[ino_last] = file;
3994     dir->file_map[string{filename}] = file;
3995     ++file->refs;
3996     create = true;
3997     logger->set(l_bluefs_num_files, nodes.file_map.size());
3998   } else {
3999     // overwrite existing file?
4000     file = q->second;
4001     if (overwrite) {
4002       dout(20) << __func__ << " dir " << dirname << " (" << dir
4003                << ") file " << filename
4004                << " already exists, overwrite in place" << dendl;
4005     } else {
4006       dout(20) << __func__ << " dir " << dirname << " (" << dir
4007                << ") file " << filename
4008                << " already exists, truncate + overwrite" << dendl;
4009       vselector->sub_usage(file->vselector_hint, file->fnode);
4010       file->fnode.size = 0;
4011       pending_release_extents.swap(file->fnode.extents);
4012       truncate = true;
4013
4014       file->fnode.clear_extents();
4015     }
4016   }
4017   ceph_assert(file->fnode.ino > 1);
4018
4019   file->fnode.mtime = ceph_clock_now();
4020   file->vselector_hint = vselector->get_hint_by_dir(dirname);
4021   if (create || truncate) {
4022     vselector->add_usage(file->vselector_hint, file->fnode); // update file count
4023   }
4024
4025   dout(20) << __func__ << " mapping " << dirname << "/" << filename
4026            << " vsel_hint " << file->vselector_hint
4027            << dendl;
4028
4029   log.t.op_file_update(file->fnode);
4030   if (create)
4031     log.t.op_dir_link(dirname, filename, file->fnode.ino);
4032
4033   std::lock_guard dl(dirty.lock);
4034   for (auto& p : pending_release_extents) {
4035     dirty.pending_release[p.bdev].insert(p.offset, p.length);
4036   }
4037   }
4038   *h = _create_writer(file);
4039
4040   if (boost::algorithm::ends_with(filename, ".log")) {
4041     (*h)->writer_type = BlueFS::WRITER_WAL;
4042     if (logger && !overwrite) {
4043       logger->inc(l_bluefs_files_written_wal);
4044     }
4045   } else if (boost::algorithm::ends_with(filename, ".sst")) {
4046     (*h)->writer_type = BlueFS::WRITER_SST;
4047     if (logger) {
4048       logger->inc(l_bluefs_files_written_sst);
4049     }
4050   }
4051
4052   dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
4053   return 0;
4054 }
4055
4056 BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
4057 {
4058   FileWriter *w = new FileWriter(f);
4059   for (unsigned i = 0; i < MAX_BDEV; ++i) {
4060     if (bdev[i]) {
4061       w->iocv[i] = new IOContext(cct, NULL);
4062     }
4063   }
4064   return w;
4065 }
4066
4067 void BlueFS::_drain_writer(FileWriter *h)
4068 {
4069   dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
4070   //h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
4071   for (unsigned i=0; i<MAX_BDEV; ++i) {
4072     if (bdev[i]) {
4073       if (h->iocv[i]) {
4074         h->iocv[i]->aio_wait();
4075         delete h->iocv[i];
4076       }
4077     }
4078   }
4079   // sanity
4080   if (h->file->fnode.size >= (1ull << 30)) {
4081     dout(10) << __func__ << " file is unexpectedly large:" << h->file->fnode << dendl;
4082   }
4083 }
4084
4085 void BlueFS::_close_writer(FileWriter *h)
4086 {
4087   _drain_writer(h);
4088   delete h;
4089 }
4090 void BlueFS::close_writer(FileWriter *h)
4091 {
4092   {
4093     std::lock_guard l(h->lock);
4094     _drain_writer(h);
4095   }
4096   delete h;
4097 }
4098
4099 uint64_t BlueFS::debug_get_dirty_seq(FileWriter *h)
4100 {
4101   std::lock_guard l(h->lock);
4102   return h->file->dirty_seq;
4103 }
4104
4105 bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev)
4106 {
4107   std::lock_guard l(h->lock);
4108   return h->dirty_devs[dev];
4109 }
4110
4111 int BlueFS::open_for_read(
4112   std::string_view dirname,
4113   std::string_view filename,
4114   FileReader **h,
4115   bool random)/*_N*/
4116 {
4117   _maybe_check_vselector_LNF();
4118   std::lock_guard nl(nodes.lock);
4119   dout(10) << __func__ << " " << dirname << "/" << filename
4120            << (random ? " (random)":" (sequential)") << dendl;
4121   map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4122   if (p == nodes.dir_map.end()) {
4123     dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4124     return -ENOENT;
4125   }
4126   DirRef dir = p->second;
4127
4128   map<string,FileRef>::iterator q = dir->file_map.find(filename);
4129   if (q == dir->file_map.end()) {
4130     dout(20) << __func__ << " dir " << dirname << " (" << dir
4131              << ") file " << filename
4132              << " not found" << dendl;
4133     return -ENOENT;
4134   }
4135   File *file = q->second.get();
4136
4137   *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
4138                       random, false);
4139   dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
4140   return 0;
4141 }
4142
4143 int BlueFS::rename(
4144   std::string_view old_dirname, std::string_view old_filename,
4145   std::string_view new_dirname, std::string_view new_filename)/*_LND*/
4146 {
4147   std::lock_guard ll(log.lock);
4148   std::lock_guard nl(nodes.lock);
4149   dout(10) << __func__ << " " << old_dirname << "/" << old_filename
4150            << " -> " << new_dirname << "/" << new_filename << dendl;
4151   map<string,DirRef>::iterator p = nodes.dir_map.find(old_dirname);
4152   if (p == nodes.dir_map.end()) {
4153     dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
4154     return -ENOENT;
4155   }
4156   DirRef old_dir = p->second;
4157   map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
4158   if (q == old_dir->file_map.end()) {
4159     dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
4160              << ") file " << old_filename
4161              << " not found" << dendl;
4162     return -ENOENT;
4163   }
4164   FileRef file = q->second;
4165
4166   p = nodes.dir_map.find(new_dirname);
4167   if (p == nodes.dir_map.end()) {
4168     dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
4169     return -ENOENT;
4170   }
4171   DirRef new_dir = p->second;
4172   q = new_dir->file_map.find(new_filename);
4173   if (q != new_dir->file_map.end()) {
4174     dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
4175              << ") file " << new_filename
4176              << " already exists, unlinking" << dendl;
4177     ceph_assert(q->second != file);
4178     log.t.op_dir_unlink(new_dirname, new_filename);
4179     _drop_link_D(q->second);
4180   }
4181
4182   dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
4183            << " " << file->fnode << dendl;
4184
4185   new_dir->file_map[string{new_filename}] = file;
4186   old_dir->file_map.erase(string{old_filename});
4187
4188   log.t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
4189   log.t.op_dir_unlink(old_dirname, old_filename);
4190   return 0;
4191 }
4192
4193 int BlueFS::mkdir(std::string_view dirname)/*_LN*/
4194 {
4195   std::lock_guard ll(log.lock);
4196   std::lock_guard nl(nodes.lock);
4197   dout(10) << __func__ << " " << dirname << dendl;
4198   map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4199   if (p != nodes.dir_map.end()) {
4200     dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
4201     return -EEXIST;
4202   }
4203   nodes.dir_map[string{dirname}] = ceph::make_ref<Dir>();
4204   log.t.op_dir_create(dirname);
4205   return 0;
4206 }
4207
4208 int BlueFS::rmdir(std::string_view dirname)/*_LN*/
4209 {
4210   std::lock_guard ll(log.lock);
4211   std::lock_guard nl(nodes.lock);
4212   dout(10) << __func__ << " " << dirname << dendl;
4213   auto p = nodes.dir_map.find(dirname);
4214   if (p == nodes.dir_map.end()) {
4215     dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
4216     return -ENOENT;
4217   }
4218   DirRef dir = p->second;
4219   if (!dir->file_map.empty()) {
4220     dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
4221     return -ENOTEMPTY;
4222   }
4223   nodes.dir_map.erase(string{dirname});
4224   log.t.op_dir_remove(dirname);
4225   return 0;
4226 }
4227
4228 bool BlueFS::dir_exists(std::string_view dirname)/*_N*/
4229 {
4230   std::lock_guard nl(nodes.lock);
4231   map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4232   bool exists = p != nodes.dir_map.end();
4233   dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
4234   return exists;
4235 }
4236
4237 int BlueFS::stat(std::string_view dirname, std::string_view filename,
4238                  uint64_t *size, utime_t *mtime)/*_N*/
4239 {
4240   std::lock_guard nl(nodes.lock);
4241   dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
4242   map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4243   if (p == nodes.dir_map.end()) {
4244     dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4245     return -ENOENT;
4246   }
4247   DirRef dir = p->second;
4248   map<string,FileRef>::iterator q = dir->file_map.find(filename);
4249   if (q == dir->file_map.end()) {
4250     dout(20) << __func__ << " dir " << dirname << " (" << dir
4251              << ") file " << filename
4252              << " not found" << dendl;
4253     return -ENOENT;
4254   }
4255   File *file = q->second.get();
4256   dout(10) << __func__ << " " << dirname << "/" << filename
4257            << " " << file->fnode << dendl;
4258   if (size)
4259     *size = file->fnode.size;
4260   if (mtime)
4261     *mtime = file->fnode.mtime;
4262   return 0;
4263 }
4264
4265 int BlueFS::lock_file(std::string_view dirname, std::string_view filename,
4266                       FileLock **plock)/*_LN*/
4267 {
4268   std::lock_guard ll(log.lock);
4269   std::lock_guard nl(nodes.lock);
4270   dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
4271   map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4272   if (p == nodes.dir_map.end()) {
4273     dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4274     return -ENOENT;
4275   }
4276   DirRef dir = p->second;
4277   auto q = dir->file_map.find(filename);
4278   FileRef file;
4279   if (q == dir->file_map.end()) {
4280     dout(20) << __func__ << " dir " << dirname << " (" << dir
4281              << ") file " << filename
4282              << " not found, creating" << dendl;
4283     file = ceph::make_ref<File>();
4284     file->fnode.ino = ++ino_last;
4285     file->fnode.mtime = ceph_clock_now();
4286     nodes.file_map[ino_last] = file;
4287     dir->file_map[string{filename}] = file;
4288     logger->set(l_bluefs_num_files, nodes.file_map.size());
4289     ++file->refs;
4290     log.t.op_file_update(file->fnode);
4291     log.t.op_dir_link(dirname, filename, file->fnode.ino);
4292   } else {
4293     file = q->second;
4294     if (file->locked) {
4295       dout(10) << __func__ << " already locked" << dendl;
4296       return -ENOLCK;
4297     }
4298   }
4299   file->locked = true;
4300   *plock = new FileLock(file);
4301   dout(10) << __func__ << " locked " << file->fnode
4302            << " with " << *plock << dendl;
4303   return 0;
4304 }
4305
4306 int BlueFS::unlock_file(FileLock *fl)/*_N*/
4307 {
4308   std::lock_guard nl(nodes.lock);
4309   dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
4310   ceph_assert(fl->file->locked);
4311   fl->file->locked = false;
4312   delete fl;
4313   return 0;
4314 }
4315
4316 int BlueFS::readdir(std::string_view dirname, vector<string> *ls)/*_N*/
4317 {
4318   // dirname may contain a trailing /
4319   if (!dirname.empty() && dirname.back() == '/') {
4320     dirname.remove_suffix(1);
4321   }
4322   std::lock_guard nl(nodes.lock);
4323   dout(10) << __func__ << " " << dirname << dendl;
4324   if (dirname.empty()) {
4325     // list dirs
4326     ls->reserve(nodes.dir_map.size() + 2);
4327     for (auto& q : nodes.dir_map) {
4328       ls->push_back(q.first);
4329     }
4330   } else {
4331     // list files in dir
4332     map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4333     if (p == nodes.dir_map.end()) {
4334       dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4335       return -ENOENT;
4336     }
4337     DirRef dir = p->second;
4338     ls->reserve(dir->file_map.size() + 2);
4339     for (auto& q : dir->file_map) {
4340       ls->push_back(q.first);
4341     }
4342   }
4343   ls->push_back(".");
4344   ls->push_back("..");
4345   return 0;
4346 }
4347
4348 int BlueFS::unlink(std::string_view dirname, std::string_view filename)/*_LND*/
4349 {
4350   std::lock_guard ll(log.lock);
4351   std::lock_guard nl(nodes.lock);
4352   dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
4353   map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4354   if (p == nodes.dir_map.end()) {
4355     dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4356     return -ENOENT;
4357   }
4358   DirRef dir = p->second;
4359   map<string,FileRef>::iterator q = dir->file_map.find(filename);
4360   if (q == dir->file_map.end()) {
4361     dout(20) << __func__ << " file " << dirname << "/" << filename
4362              << " not found" << dendl;
4363     return -ENOENT;
4364   }
4365   FileRef file = q->second;
4366   if (file->locked) {
4367     dout(20) << __func__ << " file " << dirname << "/" << filename
4368              << " is locked" << dendl;
4369     return -EBUSY;
4370   }
4371   dir->file_map.erase(string{filename});
4372   log.t.op_dir_unlink(dirname, filename);
4373   _drop_link_D(file);
4374   return 0;
4375 }
4376
4377 bool BlueFS::wal_is_rotational()
4378 {
4379   if (bdev[BDEV_WAL]) {
4380     return bdev[BDEV_WAL]->is_rotational();
4381   } else if (bdev[BDEV_DB]) {
4382     return bdev[BDEV_DB]->is_rotational();
4383   }
4384   return bdev[BDEV_SLOW]->is_rotational();
4385 }
4386
4387 bool BlueFS::db_is_rotational()
4388 {
4389   if (bdev[BDEV_DB]) {
4390     return bdev[BDEV_DB]->is_rotational();
4391   }
4392   return bdev[BDEV_SLOW]->is_rotational();
4393 }
4394
4395 /*
4396   Algorithm.
4397   do_replay_recovery_read is used when bluefs log abruptly ends, but it seems that more data should be there.
4398   Idea is to search disk for definiton of extents that will be accompanied with bluefs log in future,
4399   and try if using it will produce healthy bluefs transaction.
4400   We encode already known bluefs log extents and search disk for these bytes.
4401   When we find it, we decode following bytes as extent.
4402   We read that whole extent and then check if merged with existing log part gives a proper bluefs transaction.
4403  */
4404 int BlueFS::_do_replay_recovery_read(FileReader *log_reader,
4405                                     size_t replay_pos,
4406                                     size_t read_offset,
4407                                     size_t read_len,
4408                                     bufferlist* bl) {
4409   dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos <<
4410     " needs 0x" << read_offset << "~" << read_len << std::dec << dendl;
4411
4412   bluefs_fnode_t& log_fnode = log_reader->file->fnode;
4413   bufferlist bin_extents;
4414   ::encode(log_fnode.extents, bin_extents);
4415   dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl;
4416
4417   // cannot process if too small to effectively search
4418   ceph_assert(bin_extents.length() >= 32);
4419   bufferlist last_32;
4420   last_32.substr_of(bin_extents, bin_extents.length() - 32, 32);
4421
4422   //read fixed part from replay_pos to end of bluefs_log extents
4423   bufferlist fixed;
4424   uint64_t e_off = 0;
4425   auto e = log_fnode.seek(replay_pos, &e_off);
4426   ceph_assert(e != log_fnode.extents.end());
4427   int r = _bdev_read(e->bdev, e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev],
4428                      cct->_conf->bluefs_buffered_io);
4429   ceph_assert(r == 0);
4430   //capture dev of last good extent
4431   uint8_t last_e_dev = e->bdev;
4432   uint64_t last_e_off = e->offset;
4433   ++e;
4434   while (e != log_fnode.extents.end()) {
4435     r = _bdev_read(e->bdev, e->offset, e->length, &fixed, ioc[e->bdev],
4436                    cct->_conf->bluefs_buffered_io);
4437     ceph_assert(r == 0);
4438     last_e_dev = e->bdev;
4439     ++e;
4440   }
4441   ceph_assert(replay_pos + fixed.length() == read_offset);
4442
4443   dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl;
4444
4445   struct compare {
4446     bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const {
4447       if (a.bdev < b.bdev) return true;
4448       if (a.offset < b.offset) return true;
4449       return a.length < b.length;
4450     }
4451   };
4452   std::set<bluefs_extent_t, compare> extents_rejected;
4453   for (int dcnt = 0; dcnt < 3; dcnt++) {
4454     uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV;
4455     if (bdev[dev] == nullptr) continue;
4456     dout(2) << __func__ << " processing " << get_device_name(dev) << dendl;
4457     interval_set<uint64_t> disk_regions;
4458     disk_regions.insert(0, bdev[dev]->get_size());
4459     for (auto f : nodes.file_map) {
4460       auto& e = f.second->fnode.extents;
4461       for (auto& p : e) {
4462         if (p.bdev == dev) {
4463           disk_regions.erase(p.offset, p.length);
4464         }
4465       }
4466     }
4467     size_t disk_regions_count = disk_regions.num_intervals();
4468     dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl;
4469
4470     auto reg = disk_regions.lower_bound(last_e_off);
4471     //for all except first, start from beginning
4472     last_e_off = 0;
4473     if (reg == disk_regions.end()) {
4474       reg = disk_regions.begin();
4475     }
4476     const uint64_t chunk_size = 4 * 1024 * 1024;
4477     const uint64_t page_size = 4096;
4478     const uint64_t max_extent_size = 16;
4479     uint64_t overlay_size = last_32.length() + max_extent_size;
4480     for (size_t i = 0; i < disk_regions_count; reg++, i++) {
4481       if (reg == disk_regions.end()) {
4482         reg = disk_regions.begin();
4483       }
4484       uint64_t pos = reg.get_start();
4485       uint64_t len = reg.get_len();
4486
4487       std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]};
4488       char* raw_data = raw_data_p.get();
4489       memset(raw_data, 0, page_size);
4490
4491       while (len > last_32.length()) {
4492         uint64_t chunk_len = len > chunk_size ? chunk_size : len;
4493         dout(5) << __func__ << " read "
4494                 << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len
4495                 << std::dec << dendl;
4496         r = _bdev_read_random(dev, pos, chunk_len,
4497           raw_data + page_size, cct->_conf->bluefs_buffered_io);
4498         ceph_assert(r == 0);
4499
4500         //search for fixed_last_32
4501         char* chunk_b = raw_data + page_size;
4502         char* chunk_e = chunk_b + chunk_len;
4503
4504         char* search_b = chunk_b - overlay_size;
4505         char* search_e = chunk_e;
4506
4507         for (char* sp = search_b; ; sp += last_32.length()) {
4508           sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length());
4509           if (sp == nullptr) {
4510             break;
4511           }
4512
4513           char* n = sp + last_32.length();
4514           dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl;
4515           bufferlist test;
4516           test.append(n, std::min<size_t>(max_extent_size, chunk_e - n));
4517           bluefs_extent_t ne;
4518           try {
4519             bufferlist::const_iterator p = test.begin();
4520             ::decode(ne, p);
4521           } catch (buffer::error& e) {
4522             continue;
4523           }
4524           if (extents_rejected.count(ne) != 0) {
4525             dout(5) << __func__ << " extent " << ne << " already refected" <<dendl;
4526             continue;
4527           }
4528           //insert as rejected already. if we succeed, it wouldn't make difference.
4529           extents_rejected.insert(ne);
4530
4531           if (ne.bdev >= MAX_BDEV ||
4532               bdev[ne.bdev] == nullptr ||
4533               ne.length > 16 * 1024 * 1024 ||
4534               (ne.length & 4095) != 0 ||
4535               ne.offset + ne.length > bdev[ne.bdev]->get_size() ||
4536               (ne.offset & 4095) != 0) {
4537             dout(5) << __func__ << " refusing extent " << ne << dendl;
4538             continue;
4539           }
4540           dout(5) << __func__ << " checking extent " << ne << dendl;
4541
4542           //read candidate extent - whole
4543           bufferlist candidate;
4544           candidate.append(fixed);
4545           r = _bdev_read(ne.bdev, ne.offset, ne.length, &candidate, ioc[ne.bdev],
4546                          cct->_conf->bluefs_buffered_io);
4547           ceph_assert(r == 0);
4548
4549           //check if transaction & crc is ok
4550           bluefs_transaction_t t;
4551           try {
4552             bufferlist::const_iterator p = candidate.begin();
4553             ::decode(t, p);
4554           }
4555           catch (buffer::error& e) {
4556             dout(5) << __func__ << " failed match" << dendl;
4557             continue;
4558           }
4559
4560           //success, it seems a probable candidate
4561           uint64_t l = std::min<uint64_t>(ne.length, read_len);
4562           //trim to required size
4563           bufferlist requested_read;
4564           requested_read.substr_of(candidate, fixed.length(), l);
4565           bl->append(requested_read);
4566           dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl;
4567           log_fnode.append_extent(ne);
4568           log_fnode.recalc_allocated();
4569           log_reader->buf.pos += l;
4570           return l;
4571         }
4572         //save overlay for next search
4573         memcpy(search_b, chunk_e - overlay_size, overlay_size);
4574         pos += chunk_len;
4575         len -= chunk_len;
4576       }
4577     }
4578   }
4579   return 0;
4580 }
4581
4582 void BlueFS::_check_vselector_LNF() {
4583   BlueFSVolumeSelector* vs = vselector->clone_empty();
4584   if (!vs) {
4585     return;
4586   }
4587   std::lock_guard ll(log.lock);
4588   std::lock_guard nl(nodes.lock);
4589   // Checking vselector is under log, nodes and file(s) locks,
4590   // so any modification of vselector must be under at least one of those locks.
4591   for (auto& f : nodes.file_map) {
4592     f.second->lock.lock();
4593     vs->add_usage(f.second->vselector_hint, f.second->fnode);
4594   }
4595   bool res = vselector->compare(vs);
4596   if (!res) {
4597     dout(0) << "Current:";
4598     vselector->dump(*_dout);
4599     *_dout << dendl;
4600     dout(0) << "Expected:";
4601     vs->dump(*_dout);
4602     *_dout << dendl;
4603   }
4604   ceph_assert(res);
4605   for (auto& f : nodes.file_map) {
4606     f.second->lock.unlock();
4607   }
4608   delete vs;
4609 }
4610
4611 size_t BlueFS::probe_alloc_avail(int dev, uint64_t alloc_size)
4612 {
4613   size_t total = 0;
4614   auto iterated_allocation = [&](size_t off, size_t len) {
4615     //only count in size that is alloc_size aligned
4616     size_t dist_to_alignment;
4617     size_t offset_in_block = off & (alloc_size - 1);
4618     if (offset_in_block == 0)
4619       dist_to_alignment = 0;
4620     else
4621       dist_to_alignment = alloc_size - offset_in_block;
4622     if (dist_to_alignment >= len)
4623       return;
4624     len -= dist_to_alignment;
4625     total += p2align(len, alloc_size);
4626   };
4627   if (alloc[dev]) {
4628     alloc[dev]->foreach(iterated_allocation);
4629   }
4630   return total;
4631 }
4632 // ===============================================
4633 // OriginalVolumeSelector
4634
4635 void* OriginalVolumeSelector::get_hint_for_log() const {
4636   return reinterpret_cast<void*>(BlueFS::BDEV_WAL);
4637 }
4638 void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
4639   uint8_t res = BlueFS::BDEV_DB;
4640   if (dirname.length() > 5) {
4641     // the "db.slow" and "db.wal" directory names are hard-coded at
4642     // match up with bluestore.  the slow device is always the second
4643     // one (when a dedicated block.db device is present and used at
4644     // bdev 0).  the wal device is always last.
4645     if (boost::algorithm::ends_with(dirname, ".slow") && slow_total) {
4646       res = BlueFS::BDEV_SLOW;
4647     } else if (boost::algorithm::ends_with(dirname, ".wal") && wal_total) {
4648       res = BlueFS::BDEV_WAL;
4649     }
4650   }
4651   return reinterpret_cast<void*>(res);
4652 }
4653
4654 uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
4655 {
4656   return (uint8_t)(reinterpret_cast<uint64_t>(hint));
4657 }
4658
4659 void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
4660 {
4661   res.emplace_back(base, db_total);
4662   res.emplace_back(base + ".slow",
4663     slow_total ? slow_total : db_total); // use fake non-zero value if needed to
4664                                          // avoid RocksDB complains
4665 }
4666
4667 #undef dout_prefix
4668 #define dout_prefix *_dout << "OriginalVolumeSelector: "
4669
4670 void OriginalVolumeSelector::dump(ostream& sout) {
4671   sout<< "wal_total:" << wal_total
4672     << ", db_total:" << db_total
4673     << ", slow_total:" << slow_total
4674     << std::endl;
4675 }
4676
4677 // ===============================================
4678 // FitToFastVolumeSelector
4679
4680 void FitToFastVolumeSelector::get_paths(const std::string& base, paths& res) const {
4681   res.emplace_back(base, 1);  // size of the last db_path has no effect
4682 }