]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueFS.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "boost/algorithm/string.hpp"
9f95a23c 5#include "bluestore_common.h"
7c673cae
FG
6#include "BlueFS.h"
7
8#include "common/debug.h"
9#include "common/errno.h"
10#include "common/perf_counters.h"
7c673cae 11#include "Allocator.h"
11fdf7f2 12#include "include/ceph_assert.h"
eafe8130 13#include "common/admin_socket.h"
7c673cae
FG
14
15#define dout_context cct
16#define dout_subsys ceph_subsys_bluefs
17#undef dout_prefix
18#define dout_prefix *_dout << "bluefs "
9f95a23c 19using TOPNSPC::common::cmd_getval;
f67539c2
TL
20
21using std::byte;
22using std::list;
23using std::make_pair;
24using std::map;
25using std::ostream;
26using std::pair;
27using std::set;
28using std::string;
29using std::to_string;
30using std::vector;
31
32using ceph::bufferlist;
33using ceph::decode;
34using ceph::encode;
35using ceph::Formatter;
36
37
7c673cae
FG
38MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
39MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
f91f0fd5 40MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer);
7c673cae 41MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
f91f0fd5
TL
42 bluefs_file_reader_buffer, bluefs_file_reader);
43MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader);
7c673cae
FG
44MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
45
11fdf7f2
TL
46static void wal_discard_cb(void *priv, void* priv2) {
47 BlueFS *bluefs = static_cast<BlueFS*>(priv);
48 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
49 bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
50}
51
52static void db_discard_cb(void *priv, void* priv2) {
53 BlueFS *bluefs = static_cast<BlueFS*>(priv);
54 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
55 bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
56}
57
58static void slow_discard_cb(void *priv, void* priv2) {
59 BlueFS *bluefs = static_cast<BlueFS*>(priv);
60 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
61 bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
62}
7c673cae 63
eafe8130
TL
64class BlueFS::SocketHook : public AdminSocketHook {
65 BlueFS* bluefs;
66public:
67 static BlueFS::SocketHook* create(BlueFS* bluefs)
68 {
69 BlueFS::SocketHook* hook = nullptr;
70 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
71 if (admin_socket) {
72 hook = new BlueFS::SocketHook(bluefs);
f67539c2 73 int r = admin_socket->register_command("bluestore bluefs device info "
eafe8130
TL
74 "name=alloc_size,type=CephInt,req=false",
75 hook,
f67539c2
TL
76 "Shows space report for bluefs devices. "
77 "This also includes an estimation for space "
78 "available to bluefs at main device. "
79 "alloc_size, if set, specifies the custom bluefs "
80 "allocation unit size for the estimation above.");
eafe8130
TL
81 if (r != 0) {
82 ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
83 delete hook;
84 hook = nullptr;
9f95a23c 85 } else {
f6b5b4d7 86 r = admin_socket->register_command("bluefs stats",
9f95a23c
TL
87 hook,
88 "Dump internal statistics for bluefs."
89 "");
90 ceph_assert(r == 0);
f67539c2
TL
91 r = admin_socket->register_command("bluefs files list", hook,
92 "print files in bluefs");
93 ceph_assert(r == 0);
cd265ab1
TL
94 r = admin_socket->register_command("bluefs debug_inject_read_zeros", hook,
95 "Injects 8K zeros into next BlueFS read. Debug only.");
96 ceph_assert(r == 0);
eafe8130
TL
97 }
98 }
99 return hook;
100 }
101
102 ~SocketHook() {
103 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
9f95a23c 104 admin_socket->unregister_commands(this);
eafe8130
TL
105 }
106private:
107 SocketHook(BlueFS* bluefs) :
108 bluefs(bluefs) {}
9f95a23c
TL
109 int call(std::string_view command, const cmdmap_t& cmdmap,
110 Formatter *f,
111 std::ostream& errss,
112 bufferlist& out) override {
f67539c2 113 if (command == "bluestore bluefs device info") {
9f95a23c
TL
114 int64_t alloc_size = 0;
115 cmd_getval(cmdmap, "alloc_size", alloc_size);
116 if ((alloc_size & (alloc_size - 1)) != 0) {
117 errss << "Invalid allocation size:'" << alloc_size << std::endl;
118 return -EINVAL;
119 }
120 if (alloc_size == 0)
f67539c2
TL
121 alloc_size = bluefs->cct->_conf->bluefs_shared_alloc_size;
122 f->open_object_section("bluefs_device_info");
9f95a23c
TL
123 for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
124 if (bluefs->bdev[dev]) {
125 f->open_object_section("dev");
126 f->dump_string("device", bluefs->get_device_name(dev));
127 ceph_assert(bluefs->alloc[dev]);
f67539c2
TL
128 auto total = bluefs->get_total(dev);
129 auto free = bluefs->get_free(dev);
130 auto used = bluefs->get_used(dev);
131
132 f->dump_int("total", total);
133 f->dump_int("free", free);
134 f->dump_int("bluefs_used", used);
135 if (bluefs->is_shared_alloc(dev)) {
136 size_t avail = bluefs->probe_alloc_avail(dev, alloc_size);
137 f->dump_int("bluefs max available", avail);
138 }
139 f->close_section();
140 }
eafe8130 141 }
f67539c2 142
9f95a23c
TL
143 f->close_section();
144 } else if (command == "bluefs stats") {
145 std::stringstream ss;
146 bluefs->dump_block_extents(ss);
147 bluefs->dump_volume_selector(ss);
eafe8130 148 out.append(ss);
f67539c2
TL
149 } else if (command == "bluefs files list") {
150 const char* devnames[3] = {"wal","db","slow"};
20effc67 151 std::lock_guard l(bluefs->nodes.lock);
f67539c2 152 f->open_array_section("files");
20effc67 153 for (auto &d : bluefs->nodes.dir_map) {
f67539c2
TL
154 std::string dir = d.first;
155 for (auto &r : d.second->file_map) {
156 f->open_object_section("file");
157 f->dump_string("name", (dir + "/" + r.first).c_str());
158 std::vector<size_t> sizes;
159 sizes.resize(bluefs->bdev.size());
160 for(auto& i : r.second->fnode.extents) {
161 sizes[i.bdev] += i.length;
162 }
163 for (size_t i = 0; i < sizes.size(); i++) {
164 if (sizes[i]>0) {
165 if (i < sizeof(devnames) / sizeof(*devnames))
166 f->dump_int(devnames[i], sizes[i]);
167 else
168 f->dump_int(("dev-"+to_string(i)).c_str(), sizes[i]);
169 }
170 }
171 f->close_section();
172 }
173 }
174 f->close_section();
175 f->flush(out);
cd265ab1
TL
176 } else if (command == "bluefs debug_inject_read_zeros") {
177 bluefs->inject_read_zeros++;
9f95a23c
TL
178 } else {
179 errss << "Invalid command" << std::endl;
180 return -ENOSYS;
eafe8130 181 }
9f95a23c
TL
182 return 0;
183 }
eafe8130
TL
184};
185
7c673cae
FG
186BlueFS::BlueFS(CephContext* cct)
187 : cct(cct),
188 bdev(MAX_BDEV),
189 ioc(MAX_BDEV),
f67539c2
TL
190 block_reserved(MAX_BDEV),
191 alloc(MAX_BDEV),
20effc67 192 alloc_size(MAX_BDEV, 0)
7c673cae 193{
20effc67 194 dirty.pending_release.resize(MAX_BDEV);
11fdf7f2
TL
195 discard_cb[BDEV_WAL] = wal_discard_cb;
196 discard_cb[BDEV_DB] = db_discard_cb;
197 discard_cb[BDEV_SLOW] = slow_discard_cb;
eafe8130 198 asok_hook = SocketHook::create(this);
7c673cae
FG
199}
200
201BlueFS::~BlueFS()
202{
eafe8130 203 delete asok_hook;
7c673cae
FG
204 for (auto p : ioc) {
205 if (p)
206 p->aio_wait();
207 }
208 for (auto p : bdev) {
209 if (p) {
210 p->close();
211 delete p;
212 }
213 }
214 for (auto p : ioc) {
215 delete p;
216 }
217}
218
219void BlueFS::_init_logger()
220{
221 PerfCountersBuilder b(cct, "bluefs",
222 l_bluefs_first, l_bluefs_last);
7c673cae
FG
223 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
224 "Total bytes (main db device)",
11fdf7f2 225 "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
226 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
227 "Used bytes (main db device)",
11fdf7f2 228 "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
229 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
230 "Total bytes (wal device)",
11fdf7f2 231 "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
232 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
233 "Used bytes (wal device)",
11fdf7f2 234 "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
235 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
236 "Total bytes (slow device)",
11fdf7f2 237 "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
238 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
239 "Used bytes (slow device)",
11fdf7f2 240 "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
241 b.add_u64(l_bluefs_num_files, "num_files", "File count",
242 "f", PerfCountersBuilder::PRIO_USEFUL);
243 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
11fdf7f2 244 "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
7c673cae
FG
245 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
246 "Compactions of the metadata log");
247 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
20effc67
TL
248 "Bytes written to the metadata log",
249 "j",
11fdf7f2 250 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
7c673cae
FG
251 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
252 "Files written to WAL");
253 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
254 "Files written to SSTs");
255 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
20effc67
TL
256 "Bytes written to WAL",
257 "walb",
7c673cae
FG
258 PerfCountersBuilder::PRIO_CRITICAL);
259 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
20effc67
TL
260 "Bytes written to SSTs",
261 "sstb",
11fdf7f2
TL
262 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
263 b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
20effc67
TL
264 "Bytes written to WAL/SSTs at slow device",
265 "slwb",
266 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
11fdf7f2 267 b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
20effc67
TL
268 "Maximum bytes allocated from WAL",
269 "mxwb",
270 PerfCountersBuilder::PRIO_INTERESTING,
271 unit_t(UNIT_BYTES));
11fdf7f2 272 b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
20effc67
TL
273 "Maximum bytes allocated from DB",
274 "mxdb",
275 PerfCountersBuilder::PRIO_INTERESTING,
276 unit_t(UNIT_BYTES));
11fdf7f2 277 b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
20effc67
TL
278 "Maximum bytes allocated from SLOW",
279 "mxwb",
280 PerfCountersBuilder::PRIO_INTERESTING,
281 unit_t(UNIT_BYTES));
282 b.add_u64_counter(l_bluefs_main_alloc_unit, "alloc_unit_main",
283 "Allocation unit size (in bytes) for primary/shared device",
284 "aumb",
285 PerfCountersBuilder::PRIO_CRITICAL,
286 unit_t(UNIT_BYTES));
287 b.add_u64_counter(l_bluefs_db_alloc_unit, "alloc_unit_db",
288 "Allocation unit size (in bytes) for standalone DB device",
289 "audb",
290 PerfCountersBuilder::PRIO_CRITICAL,
291 unit_t(UNIT_BYTES));
292 b.add_u64_counter(l_bluefs_wal_alloc_unit, "alloc_unit_wal",
293 "Allocation unit size (in bytes) for standalone WAL device",
294 "auwb",
295 PerfCountersBuilder::PRIO_CRITICAL,
296 unit_t(UNIT_BYTES));
494da23a 297 b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
20effc67
TL
298 "random read requests processed",
299 NULL,
300 PerfCountersBuilder::PRIO_USEFUL);
494da23a 301 b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
20effc67
TL
302 "Bytes requested in random read mode",
303 NULL,
494da23a
TL
304 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
305 b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
20effc67
TL
306 "random reads requests going to disk",
307 NULL,
308 PerfCountersBuilder::PRIO_USEFUL);
494da23a 309 b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
20effc67
TL
310 "Bytes read from disk in random read mode",
311 "rrb",
312 PerfCountersBuilder::PRIO_INTERESTING,
313 unit_t(UNIT_BYTES));
314 b.add_u64_counter(l_bluefs_read_random_disk_bytes_wal, "read_random_disk_bytes_wal",
315 "random reads requests going to WAL disk",
316 NULL,
317 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
318 b.add_u64_counter(l_bluefs_read_random_disk_bytes_db, "read_random_disk_bytes_db",
319 "random reads requests going to DB disk",
320 NULL,
494da23a 321 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
20effc67
TL
322 b.add_u64_counter(l_bluefs_read_random_disk_bytes_slow, "read_random_disk_bytes_slow",
323 "random reads requests going to main disk",
324 "rrsb",
325 PerfCountersBuilder::PRIO_INTERESTING,
326 unit_t(UNIT_BYTES));
494da23a 327 b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
20effc67
TL
328 "random read requests processed using prefetch buffer",
329 NULL,
330 PerfCountersBuilder::PRIO_USEFUL);
494da23a 331 b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
20effc67
TL
332 "Bytes read from prefetch buffer in random read mode",
333 NULL,
494da23a 334 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
494da23a 335 b.add_u64_counter(l_bluefs_read_count, "read_count",
20effc67
TL
336 "buffered read requests processed",
337 NULL,
338 PerfCountersBuilder::PRIO_USEFUL);
494da23a 339 b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
20effc67
TL
340 "Bytes requested in buffered read mode",
341 NULL,
494da23a 342 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
20effc67
TL
343 b.add_u64_counter(l_bluefs_read_disk_count, "read_disk_count",
344 "buffered reads requests going to disk",
345 NULL,
346 PerfCountersBuilder::PRIO_USEFUL);
347 b.add_u64_counter(l_bluefs_read_disk_bytes, "read_disk_bytes",
348 "Bytes read in buffered mode from disk",
349 "rb",
350 PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
351 b.add_u64_counter(l_bluefs_read_disk_bytes_wal, "read_disk_bytes_wal",
352 "reads requests going to WAL disk",
353 NULL,
354 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
355 b.add_u64_counter(l_bluefs_read_disk_bytes_db, "read_disk_bytes_db",
356 "reads requests going to DB disk",
357 NULL,
358 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
359 b.add_u64_counter(l_bluefs_read_disk_bytes_slow, "read_disk_bytes_slow",
360 "reads requests going to main disk",
361 "rsb",
362 PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
494da23a 363 b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
20effc67
TL
364 "prefetch read requests processed",
365 NULL,
366 PerfCountersBuilder::PRIO_USEFUL);
494da23a 367 b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
20effc67
TL
368 "Bytes requested in prefetch read mode",
369 NULL,
494da23a 370 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
cd265ab1
TL
371 b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate",
372 "How many times bluefs read found page with all 0s");
373 b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors",
374 "How many times bluefs read found transient page with all 0s");
494da23a 375
7c673cae
FG
376 logger = b.create_perf_counters();
377 cct->get_perfcounters_collection()->add(logger);
378}
379
380void BlueFS::_shutdown_logger()
381{
382 cct->get_perfcounters_collection()->remove(logger);
383 delete logger;
384}
385
386void BlueFS::_update_logger_stats()
387{
7c673cae 388 if (alloc[BDEV_WAL]) {
f67539c2
TL
389 logger->set(l_bluefs_wal_total_bytes, _get_total(BDEV_WAL));
390 logger->set(l_bluefs_wal_used_bytes, _get_used(BDEV_WAL));
7c673cae
FG
391 }
392 if (alloc[BDEV_DB]) {
f67539c2
TL
393 logger->set(l_bluefs_db_total_bytes, _get_total(BDEV_DB));
394 logger->set(l_bluefs_db_used_bytes, _get_used(BDEV_DB));
7c673cae
FG
395 }
396 if (alloc[BDEV_SLOW]) {
f67539c2
TL
397 logger->set(l_bluefs_slow_total_bytes, _get_total(BDEV_SLOW));
398 logger->set(l_bluefs_slow_used_bytes, _get_used(BDEV_SLOW));
7c673cae
FG
399 }
400}
401
11fdf7f2 402int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
f67539c2
TL
403 uint64_t reserved,
404 bluefs_shared_alloc_context_t* _shared_alloc)
7c673cae 405{
f67539c2
TL
406 dout(10) << __func__ << " bdev " << id << " path " << path << " "
407 << reserved << dendl;
11fdf7f2
TL
408 ceph_assert(id < bdev.size());
409 ceph_assert(bdev[id] == NULL);
410 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
411 discard_cb[id], static_cast<void*>(this));
f67539c2
TL
412 block_reserved[id] = reserved;
413 if (_shared_alloc) {
11fdf7f2
TL
414 b->set_no_exclusive_lock();
415 }
7c673cae
FG
416 int r = b->open(path);
417 if (r < 0) {
418 delete b;
419 return r;
420 }
11fdf7f2
TL
421 if (trim) {
422 b->discard(0, b->get_size());
423 }
424
7c673cae 425 dout(1) << __func__ << " bdev " << id << " path " << path
1adf2230 426 << " size " << byte_u_t(b->get_size()) << dendl;
7c673cae
FG
427 bdev[id] = b;
428 ioc[id] = new IOContext(cct, NULL);
f67539c2
TL
429 if (_shared_alloc) {
430 ceph_assert(!shared_alloc);
431 shared_alloc = _shared_alloc;
432 alloc[id] = shared_alloc->a;
433 shared_alloc_id = id;
434 }
7c673cae
FG
435 return 0;
436}
437
438bool BlueFS::bdev_support_label(unsigned id)
439{
11fdf7f2
TL
440 ceph_assert(id < bdev.size());
441 ceph_assert(bdev[id]);
7c673cae
FG
442 return bdev[id]->supported_bdev_label();
443}
444
f67539c2 445uint64_t BlueFS::get_block_device_size(unsigned id) const
7c673cae
FG
446{
447 if (id < bdev.size() && bdev[id])
448 return bdev[id]->get_size();
449 return 0;
450}
451
f67539c2 452void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
7c673cae 453{
f67539c2
TL
454 dout(10) << __func__ << " bdev " << id << dendl;
455 ceph_assert(alloc[id]);
456 alloc[id]->release(to_release);
457 if (is_shared_alloc(id)) {
458 shared_alloc->bluefs_used -= to_release.size();
7c673cae 459 }
7c673cae
FG
460}
461
f67539c2 462uint64_t BlueFS::get_used()
7c673cae 463{
f67539c2
TL
464 uint64_t used = 0;
465 for (unsigned id = 0; id < MAX_BDEV; ++id) {
466 used += _get_used(id);
7c673cae 467 }
f67539c2
TL
468 return used;
469}
7c673cae 470
f67539c2
TL
471uint64_t BlueFS::_get_used(unsigned id) const
472{
473 uint64_t used = 0;
474 if (!alloc[id])
475 return 0;
9f95a23c 476
f67539c2
TL
477 if (is_shared_alloc(id)) {
478 used = shared_alloc->bluefs_used;
479 } else {
480 used = _get_total(id) - alloc[id]->get_free();
9f95a23c 481 }
f67539c2 482 return used;
7c673cae
FG
483}
484
f67539c2 485uint64_t BlueFS::get_used(unsigned id)
7c673cae 486{
f67539c2 487 ceph_assert(id < alloc.size());
11fdf7f2 488 ceph_assert(alloc[id]);
f67539c2 489 return _get_used(id);
11fdf7f2
TL
490}
491
f67539c2 492uint64_t BlueFS::_get_total(unsigned id) const
11fdf7f2 493{
f67539c2
TL
494 ceph_assert(id < bdev.size());
495 ceph_assert(id < block_reserved.size());
496 return get_block_device_size(id) - block_reserved[id];
7c673cae
FG
497}
498
499uint64_t BlueFS::get_total(unsigned id)
500{
f67539c2 501 return _get_total(id);
7c673cae
FG
502}
503
504uint64_t BlueFS::get_free(unsigned id)
505{
11fdf7f2 506 ceph_assert(id < alloc.size());
7c673cae
FG
507 return alloc[id]->get_free();
508}
509
510void BlueFS::dump_perf_counters(Formatter *f)
511{
512 f->open_object_section("bluefs_perf_counters");
513 logger->dump_formatted(f,0);
514 f->close_section();
515}
516
3efd9988
FG
517void BlueFS::dump_block_extents(ostream& out)
518{
519 for (unsigned i = 0; i < MAX_BDEV; ++i) {
520 if (!bdev[i]) {
521 continue;
522 }
f67539c2 523 auto total = get_total(i);
11fdf7f2 524 auto free = get_free(i);
1911f103 525
f67539c2
TL
526 out << i << " : device size 0x" << std::hex << total
527 << " : using 0x" << total - free
528 << std::dec << "(" << byte_u_t(total - free) << ")";
1911f103 529 out << "\n";
3efd9988
FG
530 }
531}
7c673cae 532
7c673cae
FG
533int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
534{
20effc67 535 std::lock_guard nl(nodes.lock);
7c673cae 536 dout(10) << __func__ << " bdev " << id << dendl;
f67539c2 537 ceph_assert(id < alloc.size());
20effc67 538 for (auto& p : nodes.file_map) {
f67539c2
TL
539 for (auto& q : p.second->fnode.extents) {
540 if (q.bdev == id) {
541 extents->insert(q.offset, q.length);
542 }
543 }
544 }
7c673cae
FG
545 return 0;
546}
547
9f95a23c 548int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
7c673cae 549{
7c673cae
FG
550 dout(1) << __func__
551 << " osd_uuid " << osd_uuid
552 << dendl;
553
9f95a23c
TL
554 // set volume selector if not provided before/outside
555 if (vselector == nullptr) {
556 vselector.reset(
557 new OriginalVolumeSelector(
558 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
559 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
560 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
561 }
562
7c673cae 563 _init_logger();
20effc67 564 _init_alloc();
7c673cae
FG
565
566 super.version = 1;
567 super.block_size = bdev[BDEV_DB]->get_block_size();
568 super.osd_uuid = osd_uuid;
569 super.uuid.generate_random();
570 dout(1) << __func__ << " uuid " << super.uuid << dendl;
571
572 // init log
9f95a23c 573 FileRef log_file = ceph::make_ref<File>();
7c673cae 574 log_file->fnode.ino = 1;
f6b5b4d7 575 log_file->vselector_hint = vselector->get_hint_for_log();
7c673cae 576 int r = _allocate(
9f95a23c 577 vselector->select_prefer_bdev(log_file->vselector_hint),
7c673cae 578 cct->_conf->bluefs_max_log_runway,
94b18763 579 &log_file->fnode);
9f95a23c 580 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
11fdf7f2 581 ceph_assert(r == 0);
20effc67 582 log.writer = _create_writer(log_file);
7c673cae
FG
583
584 // initial txn
20effc67
TL
585 ceph_assert(log.seq_live == 1);
586 log.t.seq = 1;
587 log.t.op_init();
588 _flush_and_sync_log_LD();
7c673cae
FG
589
590 // write supers
591 super.log_fnode = log_file->fnode;
9f95a23c 592 super.memorized_layout = layout;
11fdf7f2 593 _write_super(BDEV_DB);
20effc67 594 _flush_bdev();
7c673cae
FG
595
596 // clean up
597 super = bluefs_super_t();
20effc67
TL
598 _close_writer(log.writer);
599 log.writer = NULL;
9f95a23c 600 vselector.reset(nullptr);
7c673cae
FG
601 _stop_alloc();
602 _shutdown_logger();
f67539c2
TL
603 if (shared_alloc) {
604 ceph_assert(shared_alloc->need_init);
605 shared_alloc->need_init = false;
606 }
7c673cae
FG
607
608 dout(10) << __func__ << " success" << dendl;
609 return 0;
610}
611
612void BlueFS::_init_alloc()
613{
614 dout(20) << __func__ << dendl;
eafe8130 615
20effc67 616 size_t wal_alloc_size = 0;
eafe8130 617 if (bdev[BDEV_WAL]) {
20effc67
TL
618 wal_alloc_size = cct->_conf->bluefs_alloc_size;
619 alloc_size[BDEV_WAL] = wal_alloc_size;
eafe8130 620 }
20effc67
TL
621 logger->set(l_bluefs_wal_alloc_unit, wal_alloc_size);
622
eafe8130
TL
623 if (bdev[BDEV_SLOW]) {
624 alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
625 alloc_size[BDEV_SLOW] = cct->_conf->bluefs_shared_alloc_size;
20effc67
TL
626 logger->set(l_bluefs_db_alloc_unit, cct->_conf->bluefs_alloc_size);
627 logger->set(l_bluefs_main_alloc_unit, cct->_conf->bluefs_shared_alloc_size);
eafe8130
TL
628 } else {
629 alloc_size[BDEV_DB] = cct->_conf->bluefs_shared_alloc_size;
20effc67
TL
630 logger->set(l_bluefs_main_alloc_unit, 0);
631 logger->set(l_bluefs_db_alloc_unit, cct->_conf->bluefs_shared_alloc_size);
eafe8130
TL
632 }
633 // new wal and db devices are never shared
634 if (bdev[BDEV_NEWWAL]) {
635 alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
636 }
637 if (bdev[BDEV_NEWDB]) {
638 alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
639 }
640
7c673cae
FG
641 for (unsigned id = 0; id < bdev.size(); ++id) {
642 if (!bdev[id]) {
643 continue;
644 }
11fdf7f2 645 ceph_assert(bdev[id]->get_size());
eafe8130 646 ceph_assert(alloc_size[id]);
f67539c2
TL
647 if (is_shared_alloc(id)) {
648 dout(1) << __func__ << " shared, id " << id << std::hex
649 << ", capacity 0x" << bdev[id]->get_size()
650 << ", block size 0x" << alloc_size[id]
651 << std::dec << dendl;
652 } else {
653 std::string name = "bluefs-";
654 const char* devnames[] = { "wal","db","slow" };
655 if (id <= BDEV_SLOW)
656 name += devnames[id];
657 else
658 name += to_string(uintptr_t(this));
659 dout(1) << __func__ << " new, id " << id << std::hex
660 << ", allocator name " << name
661 << ", allocator type " << cct->_conf->bluefs_allocator
662 << ", capacity 0x" << bdev[id]->get_size()
663 << ", block size 0x" << alloc_size[id]
664 << std::dec << dendl;
665 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
666 bdev[id]->get_size(),
20effc67
TL
667 alloc_size[id],
668 0, 0,
669 name);
f67539c2
TL
670 alloc[id]->init_add_free(
671 block_reserved[id],
672 _get_total(id));
7c673cae
FG
673 }
674 }
675}
676
677void BlueFS::_stop_alloc()
678{
679 dout(20) << __func__ << dendl;
11fdf7f2
TL
680 for (auto p : bdev) {
681 if (p)
682 p->discard_drain();
683 }
684
f67539c2
TL
685 for (size_t i = 0; i < alloc.size(); ++i) {
686 if (alloc[i] && !is_shared_alloc(i)) {
687 alloc[i]->shutdown();
688 delete alloc[i];
689 alloc[i] = nullptr;
7c673cae
FG
690 }
691 }
7c673cae
FG
692}
693
20effc67
TL
694int BlueFS::_read_and_check(uint8_t ndev, uint64_t off, uint64_t len,
695 ceph::buffer::list *pbl, IOContext *ioc, bool buffered)
cd265ab1
TL
696{
697 dout(10) << __func__ << " dev " << int(ndev)
698 << ": 0x" << std::hex << off << "~" << len << std::dec
699 << (buffered ? " buffered" : "")
700 << dendl;
701 int r;
702 bufferlist bl;
20effc67 703 r = _bdev_read(ndev, off, len, &bl, ioc, buffered);
cd265ab1
TL
704 if (r != 0) {
705 return r;
706 }
707 uint64_t block_size = bdev[ndev]->get_block_size();
708 if (inject_read_zeros) {
709 if (len >= block_size * 2) {
710 derr << __func__ << " injecting error, zeros at "
711 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
712 << "~" << (block_size * 2) << std::dec << dendl;
713 //use beginning, replace 8K in the middle with zeros, use tail
714 bufferlist temp;
715 bl.splice(0, len / 2 - block_size, &temp);
f67539c2 716 temp.append(buffer::create(block_size * 2, 0));
cd265ab1
TL
717 bl.splice(block_size * 2, len / 2 - block_size, &temp);
718 bl = temp;
719 inject_read_zeros--;
720 }
721 }
722 //make a check if there is a block with all 0
723 uint64_t to_check_len = len;
724 uint64_t skip = p2nphase(off, block_size);
725 if (skip >= to_check_len) {
726 return r;
727 }
728 auto it = bl.begin(skip);
729 to_check_len -= skip;
730 bool all_zeros = false;
731 while (all_zeros == false && to_check_len >= block_size) {
732 // checking 0s step
733 unsigned block_left = block_size;
734 unsigned avail;
735 const char* data;
736 all_zeros = true;
737 while (all_zeros && block_left > 0) {
738 avail = it.get_ptr_and_advance(block_left, &data);
739 block_left -= avail;
740 all_zeros = mem_is_zero(data, avail);
741 }
742 // skipping step
743 while (block_left > 0) {
744 avail = it.get_ptr_and_advance(block_left, &data);
745 block_left -= avail;
746 }
747 to_check_len -= block_size;
748 }
749 if (all_zeros) {
750 logger->inc(l_bluefs_read_zeros_candidate, 1);
751 bufferlist bl_reread;
20effc67 752 r = _bdev_read(ndev, off, len, &bl_reread, ioc, buffered);
cd265ab1
TL
753 if (r != 0) {
754 return r;
755 }
756 // check if both read gave the same
757 if (!bl.contents_equal(bl_reread)) {
758 // report problems to log, but continue, maybe it will be good now...
759 derr << __func__ << " initial read of " << int(ndev)
760 << ": 0x" << std::hex << off << "~" << len
761 << std::dec << ": different then re-read " << dendl;
762 logger->inc(l_bluefs_read_zeros_errors, 1);
763 }
764 // use second read will be better if is different
765 pbl->append(bl_reread);
766 } else {
767 pbl->append(bl);
768 }
769 return r;
770}
771
20effc67
TL
772int BlueFS::_read_random_and_check(
773 uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered)
cd265ab1
TL
774{
775 dout(10) << __func__ << " dev " << int(ndev)
776 << ": 0x" << std::hex << off << "~" << len << std::dec
777 << (buffered ? " buffered" : "")
778 << dendl;
779 int r;
20effc67 780 r = _bdev_read_random(ndev, off, len, buf, buffered);
cd265ab1
TL
781 if (r != 0) {
782 return r;
783 }
784 uint64_t block_size = bdev[ndev]->get_block_size();
785 if (inject_read_zeros) {
786 if (len >= block_size * 2) {
787 derr << __func__ << " injecting error, zeros at "
788 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
789 << "~" << (block_size * 2) << std::dec << dendl;
790 //zero middle 8K
791 memset(buf + len / 2 - block_size, 0, block_size * 2);
792 inject_read_zeros--;
793 }
794 }
795 //make a check if there is a block with all 0
796 uint64_t to_check_len = len;
797 const char* data = buf;
798 uint64_t skip = p2nphase(off, block_size);
799 if (skip >= to_check_len) {
800 return r;
801 }
802 to_check_len -= skip;
803 data += skip;
804
805 bool all_zeros = false;
806 while (all_zeros == false && to_check_len >= block_size) {
807 if (mem_is_zero(data, block_size)) {
808 // at least one block is all zeros
809 all_zeros = true;
810 break;
811 }
812 data += block_size;
813 to_check_len -= block_size;
814 }
815 if (all_zeros) {
816 logger->inc(l_bluefs_read_zeros_candidate, 1);
817 std::unique_ptr<char[]> data_reread(new char[len]);
20effc67 818 r = _bdev_read_random(ndev, off, len, &data_reread[0], buffered);
cd265ab1
TL
819 if (r != 0) {
820 return r;
821 }
822 // check if both read gave the same
823 if (memcmp(buf, &data_reread[0], len) != 0) {
824 derr << __func__ << " initial read of " << int(ndev)
825 << ": 0x" << std::hex << off << "~" << len
826 << std::dec << ": different then re-read " << dendl;
827 logger->inc(l_bluefs_read_zeros_errors, 1);
828 // second read is probably better
829 memcpy(buf, &data_reread[0], len);
830 }
831 }
832 return r;
833}
834
20effc67
TL
835int BlueFS::_bdev_read(uint8_t ndev, uint64_t off, uint64_t len,
836 ceph::buffer::list* pbl, IOContext* ioc, bool buffered)
837{
838 int cnt = 0;
839 switch (ndev) {
840 case BDEV_WAL: cnt = l_bluefs_read_disk_bytes_wal; break;
841 case BDEV_DB: cnt = l_bluefs_read_disk_bytes_db; break;
842 case BDEV_SLOW: cnt = l_bluefs_read_disk_bytes_slow; break;
843
844 }
845 if (cnt) {
846 logger->inc(cnt, len);
847 }
848 return bdev[ndev]->read(off, len, pbl, ioc, buffered);
849}
850
851int BlueFS::_bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len,
852 char* buf, bool buffered)
853{
854 int cnt = 0;
855 switch (ndev) {
856 case BDEV_WAL: cnt = l_bluefs_read_random_disk_bytes_wal; break;
857 case BDEV_DB: cnt = l_bluefs_read_random_disk_bytes_db; break;
858 case BDEV_SLOW: cnt = l_bluefs_read_random_disk_bytes_slow; break;
859 }
860 if (cnt) {
861 logger->inc(cnt, len);
862 }
863 return bdev[ndev]->read_random(off, len, buf, buffered);
864}
865
7c673cae
FG
866int BlueFS::mount()
867{
868 dout(1) << __func__ << dendl;
869
20effc67 870 _init_logger();
7c673cae
FG
871 int r = _open_super();
872 if (r < 0) {
873 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
874 goto out;
875 }
876
9f95a23c
TL
877 // set volume selector if not provided before/outside
878 if (vselector == nullptr) {
879 vselector.reset(
880 new OriginalVolumeSelector(
881 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
882 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
883 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
884 }
885
7c673cae
FG
886 _init_alloc();
887
11fdf7f2 888 r = _replay(false, false);
7c673cae
FG
889 if (r < 0) {
890 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
891 _stop_alloc();
892 goto out;
893 }
894
895 // init freelist
20effc67 896 for (auto& p : nodes.file_map) {
7c673cae
FG
897 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
898 for (auto& q : p.second->fnode.extents) {
f67539c2
TL
899 bool is_shared = is_shared_alloc(q.bdev);
900 ceph_assert(!is_shared || (is_shared && shared_alloc));
901 if (is_shared && shared_alloc->need_init && shared_alloc->a) {
902 shared_alloc->bluefs_used += q.length;
903 alloc[q.bdev]->init_rm_free(q.offset, q.length);
904 } else if (!is_shared) {
905 alloc[q.bdev]->init_rm_free(q.offset, q.length);
906 }
7c673cae
FG
907 }
908 }
f67539c2
TL
909 if (shared_alloc) {
910 shared_alloc->need_init = false;
911 dout(1) << __func__ << " shared_bdev_used = "
912 << shared_alloc->bluefs_used << dendl;
913 } else {
914 dout(1) << __func__ << " shared bdev not used"
915 << dendl;
916 }
7c673cae
FG
917
918 // set up the log for future writes
20effc67
TL
919 log.writer = _create_writer(_get_file(1));
920 ceph_assert(log.writer->file->fnode.ino == 1);
921 log.writer->pos = log.writer->file->fnode.size;
922 log.writer->file->fnode.reset_delta();
7c673cae 923 dout(10) << __func__ << " log write pos set to 0x"
20effc67 924 << std::hex << log.writer->pos << std::dec
7c673cae 925 << dendl;
20effc67
TL
926 // update log size
927 logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
7c673cae
FG
928 return 0;
929
930 out:
931 super = bluefs_super_t();
932 return r;
933}
934
9f95a23c
TL
935int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const
936{
937 if (super.memorized_layout) {
938 if (layout == *super.memorized_layout) {
939 dout(10) << __func__ << " bluefs layout verified positively" << dendl;
940 } else {
941 derr << __func__ << " memorized layout doesn't fit current one" << dendl;
942 return -EIO;
943 }
944 } else {
945 dout(10) << __func__ << " no memorized_layout in bluefs superblock"
946 << dendl;
947 }
948
949 return 0;
950}
951
1911f103 952void BlueFS::umount(bool avoid_compact)
7c673cae
FG
953{
954 dout(1) << __func__ << dendl;
955
1911f103 956 sync_metadata(avoid_compact);
20effc67
TL
957 if (cct->_conf->bluefs_check_volume_selector_on_umount) {
958 _check_vselector_LNF();
959 }
960 _close_writer(log.writer);
961 log.writer = NULL;
962 log.t.clear();
7c673cae 963
9f95a23c 964 vselector.reset(nullptr);
7c673cae 965 _stop_alloc();
20effc67
TL
966 nodes.file_map.clear();
967 nodes.dir_map.clear();
7c673cae 968 super = bluefs_super_t();
7c673cae
FG
969 _shutdown_logger();
970}
971
9f95a23c 972int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
7c673cae 973{
11fdf7f2
TL
974 dout(1) << __func__ << dendl;
975
976 if(id == BDEV_NEWDB) {
977 int new_log_dev_cur = BDEV_WAL;
978 int new_log_dev_next = BDEV_WAL;
979 if (!bdev[BDEV_WAL]) {
980 new_log_dev_cur = BDEV_NEWDB;
981 new_log_dev_next = BDEV_DB;
982 }
20effc67 983 _rewrite_log_and_layout_sync_LNF_LD(false,
11fdf7f2
TL
984 BDEV_NEWDB,
985 new_log_dev_cur,
986 new_log_dev_next,
9f95a23c
TL
987 RENAME_DB2SLOW,
988 layout);
11fdf7f2
TL
989 //}
990 } else if(id == BDEV_NEWWAL) {
20effc67 991 _rewrite_log_and_layout_sync_LNF_LD(false,
9f95a23c
TL
992 BDEV_DB,
993 BDEV_NEWWAL,
994 BDEV_WAL,
995 REMOVE_WAL,
996 layout);
11fdf7f2
TL
997 } else {
998 assert(false);
999 }
1000 return 0;
1001}
1002
1003void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
1004{
1005 if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
7c673cae
FG
1006 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
1007 if (bdev[BDEV_WAL])
1008 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
11fdf7f2
TL
1009}
1010
1011void BlueFS::get_devices(set<string> *ls)
1012{
1013 for (unsigned i = 0; i < MAX_BDEV; ++i) {
1014 if (bdev[i]) {
1015 bdev[i]->get_devices(ls);
1016 }
1017 }
7c673cae
FG
1018}
1019
1020int BlueFS::fsck()
1021{
7c673cae
FG
1022 dout(1) << __func__ << dendl;
1023 // hrm, i think we check everything on mount...
1024 return 0;
1025}
1026
11fdf7f2 1027int BlueFS::_write_super(int dev)
7c673cae
FG
1028{
1029 // build superblock
1030 bufferlist bl;
11fdf7f2 1031 encode(super, bl);
7c673cae 1032 uint32_t crc = bl.crc32c(-1);
11fdf7f2 1033 encode(crc, bl);
7c673cae
FG
1034 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
1035 dout(10) << __func__ << " superblock " << super.version << dendl;
1036 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
9f95a23c 1037 ceph_assert_always(bl.length() <= get_super_length());
7c673cae
FG
1038 bl.append_zero(get_super_length() - bl.length());
1039
11fdf7f2 1040 bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
7c673cae
FG
1041 dout(20) << __func__ << " v " << super.version
1042 << " crc 0x" << std::hex << crc
1043 << " offset 0x" << get_super_offset() << std::dec
1044 << dendl;
1045 return 0;
1046}
1047
1048int BlueFS::_open_super()
1049{
1050 dout(10) << __func__ << dendl;
1051
1052 bufferlist bl;
1053 uint32_t expected_crc, crc;
1054 int r;
1055
1056 // always the second block
20effc67
TL
1057 r = _bdev_read(BDEV_DB, get_super_offset(), get_super_length(),
1058 &bl, ioc[BDEV_DB], false);
7c673cae
FG
1059 if (r < 0)
1060 return r;
1061
11fdf7f2
TL
1062 auto p = bl.cbegin();
1063 decode(super, p);
7c673cae
FG
1064 {
1065 bufferlist t;
1066 t.substr_of(bl, 0, p.get_off());
1067 crc = t.crc32c(-1);
1068 }
11fdf7f2 1069 decode(expected_crc, p);
7c673cae
FG
1070 if (crc != expected_crc) {
1071 derr << __func__ << " bad crc on superblock, expected 0x"
1072 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
1073 << dendl;
1074 return -EIO;
1075 }
1076 dout(10) << __func__ << " superblock " << super.version << dendl;
1077 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
1078 return 0;
1079}
1080
20effc67
TL
1081int BlueFS::_check_allocations(const bluefs_fnode_t& fnode,
1082 boost::dynamic_bitset<uint64_t>* used_blocks,
1083 bool is_alloc, //true when allocating, false when deallocating
1084 const char* op_name)
9f95a23c
TL
1085{
1086 auto& fnode_extents = fnode.extents;
1087 for (auto e : fnode_extents) {
1088 auto id = e.bdev;
1089 bool fail = false;
20effc67
TL
1090 ceph_assert(id < MAX_BDEV);
1091 if (int r = _verify_alloc_granularity(id, e.offset, e.length,
1092 op_name); r < 0) {
1093 return r;
1094 }
9f95a23c
TL
1095
1096 apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
1097 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
20effc67
TL
1098 if (is_alloc == bs.test(pos)) {
1099 fail = true;
1100 } else {
1101 bs.flip(pos);
1102 }
9f95a23c
TL
1103 }
1104 );
1105 if (fail) {
20effc67
TL
1106 derr << __func__ << " " << op_name << " invalid extent " << int(e.bdev)
1107 << ": 0x" << std::hex << e.offset << "~" << e.length << std::dec
1108 << (is_alloc == true ?
1109 ": duplicate reference, ino " : ": double free, ino ")
1110 << fnode.ino << dendl;
9f95a23c
TL
1111 return -EFAULT;
1112 }
1113 }
1114 return 0;
1115}
1116
9f95a23c
TL
1117int BlueFS::_verify_alloc_granularity(
1118 __u8 id, uint64_t offset, uint64_t length, const char *op)
1119{
1120 if ((offset & (alloc_size[id] - 1)) ||
1121 (length & (alloc_size[id] - 1))) {
1122 derr << __func__ << " " << op << " of " << (int)id
1123 << ":0x" << std::hex << offset << "~" << length << std::dec
1124 << " does not align to alloc_size 0x"
1125 << std::hex << alloc_size[id] << std::dec << dendl;
1126 // be helpful
1127 auto need = alloc_size[id];
1128 while (need && ((offset & (need - 1)) ||
1129 (length & (need - 1)))) {
1130 need >>= 1;
1131 }
1132 if (need) {
1133 const char *which;
1134 if (id == BDEV_SLOW ||
1135 (id == BDEV_DB && !bdev[BDEV_SLOW])) {
1136 which = "bluefs_shared_alloc_size";
1137 } else {
1138 which = "bluefs_alloc_size";
1139 }
1140 derr << "work-around by setting " << which << " = " << need
1141 << " for this OSD" << dendl;
1142 }
1143 return -EFAULT;
1144 }
1145 return 0;
1146}
1147
11fdf7f2 1148int BlueFS::_replay(bool noop, bool to_stdout)
7c673cae
FG
1149{
1150 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
1151 ino_last = 1; // by the log
20effc67 1152 uint64_t log_seq = 0;
7c673cae
FG
1153
1154 FileRef log_file;
11fdf7f2 1155 log_file = _get_file(1);
9f95a23c 1156
f67539c2 1157 log_file->fnode = super.log_fnode;
11fdf7f2 1158 if (!noop) {
9f95a23c 1159 log_file->vselector_hint =
f6b5b4d7 1160 vselector->get_hint_for_log();
7c673cae 1161 } else {
11fdf7f2
TL
1162 // do not use fnode from superblock in 'noop' mode - log_file's one should
1163 // be fine and up-to-date
1164 ceph_assert(log_file->fnode.ino == 1);
1165 ceph_assert(log_file->fnode.extents.size() != 0);
7c673cae 1166 }
7c673cae 1167 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
11fdf7f2
TL
1168 if (unlikely(to_stdout)) {
1169 std::cout << " log_fnode " << super.log_fnode << std::endl;
1170 }
7c673cae
FG
1171
1172 FileReader *log_reader = new FileReader(
1173 log_file, cct->_conf->bluefs_max_prefetch,
1174 false, // !random
1175 true); // ignore eof
9f95a23c
TL
1176
1177 bool seen_recs = false;
1178
1179 boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
9f95a23c 1180
f67539c2
TL
1181 if (!noop) {
1182 if (cct->_conf->bluefs_log_replay_check_allocations) {
1183 for (size_t i = 0; i < MAX_BDEV; ++i) {
1184 if (alloc_size[i] != 0 && bdev[i] != nullptr) {
1185 used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
1186 }
9f95a23c 1187 }
20effc67
TL
1188 // check initial log layout
1189 int r = _check_allocations(log_file->fnode,
1190 used_blocks, true, "Log from super");
1191 if (r < 0) {
1192 return r;
1193 }
9f95a23c
TL
1194 }
1195 }
1196
7c673cae 1197 while (true) {
11fdf7f2 1198 ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
7c673cae
FG
1199 uint64_t pos = log_reader->buf.pos;
1200 uint64_t read_pos = pos;
1201 bufferlist bl;
1202 {
f67539c2 1203 int r = _read(log_reader, read_pos, super.block_size,
7c673cae 1204 &bl, NULL);
f6b5b4d7 1205 if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) {
20effc67 1206 r += _do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl);
f6b5b4d7
TL
1207 }
1208 assert(r == (int)super.block_size);
7c673cae
FG
1209 read_pos += r;
1210 }
1211 uint64_t more = 0;
1212 uint64_t seq;
1213 uuid_d uuid;
1214 {
11fdf7f2 1215 auto p = bl.cbegin();
7c673cae
FG
1216 __u8 a, b;
1217 uint32_t len;
11fdf7f2
TL
1218 decode(a, p);
1219 decode(b, p);
1220 decode(len, p);
1221 decode(uuid, p);
1222 decode(seq, p);
7c673cae 1223 if (len + 6 > bl.length()) {
11fdf7f2 1224 more = round_up_to(len + 6 - bl.length(), super.block_size);
7c673cae
FG
1225 }
1226 }
1227 if (uuid != super.uuid) {
9f95a23c
TL
1228 if (seen_recs) {
1229 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1230 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1231 << dendl;
1232 } else {
1233 derr << __func__ << " 0x" << std::hex << pos << std::dec
1234 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1235 << ", block dump: \n";
1236 bufferlist t;
1237 t.substr_of(bl, 0, super.block_size);
1238 t.hexdump(*_dout);
1239 *_dout << dendl;
1240 }
7c673cae
FG
1241 break;
1242 }
1243 if (seq != log_seq + 1) {
9f95a23c
TL
1244 if (seen_recs) {
1245 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1246 << ": stop: seq " << seq << " != expected " << log_seq + 1
1247 << dendl;;
1248 } else {
1249 derr << __func__ << " 0x" << std::hex << pos << std::dec
1250 << ": stop: seq " << seq << " != expected " << log_seq + 1
1251 << dendl;;
1252 }
7c673cae
FG
1253 break;
1254 }
1255 if (more) {
1256 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
1257 << " more bytes" << dendl;
1258 bufferlist t;
f67539c2 1259 int r = _read(log_reader, read_pos, more, &t, NULL);
7c673cae 1260 if (r < (int)more) {
f6b5b4d7
TL
1261 dout(10) << __func__ << " 0x" << std::hex << pos
1262 << ": stop: len is 0x" << bl.length() + more << std::dec
1263 << ", which is past eof" << dendl;
1264 if (cct->_conf->bluefs_replay_recovery) {
1265 //try to search for more data
20effc67 1266 r += _do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t);
f6b5b4d7
TL
1267 if (r < (int)more) {
1268 //in normal mode we must read r==more, for recovery it is too strict
1269 break;
1270 }
1271 }
7c673cae 1272 }
11fdf7f2 1273 ceph_assert(r == (int)more);
7c673cae
FG
1274 bl.claim_append(t);
1275 read_pos += r;
1276 }
1277 bluefs_transaction_t t;
1278 try {
11fdf7f2
TL
1279 auto p = bl.cbegin();
1280 decode(t, p);
522d829b 1281 seen_recs = true;
7c673cae 1282 }
f67539c2 1283 catch (ceph::buffer::error& e) {
522d829b
TL
1284 // Multi-block transactions might be incomplete due to unexpected
1285 // power off. Hence let's treat that as a regular stop condition.
1286 if (seen_recs && more) {
1287 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1288 << ": stop: failed to decode: " << e.what()
1289 << dendl;
1290 } else {
1291 derr << __func__ << " 0x" << std::hex << pos << std::dec
1292 << ": stop: failed to decode: " << e.what()
1293 << dendl;
1294 delete log_reader;
1295 return -EIO;
1296 }
1297 break;
7c673cae 1298 }
11fdf7f2 1299 ceph_assert(seq == t.seq);
7c673cae
FG
1300 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1301 << ": " << t << dendl;
11fdf7f2
TL
1302 if (unlikely(to_stdout)) {
1303 std::cout << " 0x" << std::hex << pos << std::dec
1304 << ": " << t << std::endl;
1305 }
7c673cae 1306
11fdf7f2 1307 auto p = t.op_bl.cbegin();
7c673cae
FG
1308 while (!p.end()) {
1309 __u8 op;
11fdf7f2 1310 decode(op, p);
7c673cae
FG
1311 switch (op) {
1312
1313 case bluefs_transaction_t::OP_INIT:
1314 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1315 << ": op_init" << dendl;
11fdf7f2
TL
1316 if (unlikely(to_stdout)) {
1317 std::cout << " 0x" << std::hex << pos << std::dec
1318 << ": op_init" << std::endl;
1319 }
1320
1321 ceph_assert(t.seq == 1);
7c673cae
FG
1322 break;
1323
1324 case bluefs_transaction_t::OP_JUMP:
1325 {
1326 uint64_t next_seq;
1327 uint64_t offset;
11fdf7f2
TL
1328 decode(next_seq, p);
1329 decode(offset, p);
7c673cae
FG
1330 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1331 << ": op_jump seq " << next_seq
1332 << " offset 0x" << std::hex << offset << std::dec << dendl;
11fdf7f2
TL
1333 if (unlikely(to_stdout)) {
1334 std::cout << " 0x" << std::hex << pos << std::dec
1335 << ": op_jump seq " << next_seq
1336 << " offset 0x" << std::hex << offset << std::dec
1337 << std::endl;
1338 }
1339
20effc67 1340 ceph_assert(next_seq > log_seq);
7c673cae
FG
1341 log_seq = next_seq - 1; // we will increment it below
1342 uint64_t skip = offset - read_pos;
1343 if (skip) {
1344 bufferlist junk;
f67539c2 1345 int r = _read(log_reader, read_pos, skip, &junk,
7c673cae
FG
1346 NULL);
1347 if (r != (int)skip) {
1348 dout(10) << __func__ << " 0x" << std::hex << read_pos
1349 << ": stop: failed to skip to " << offset
1350 << std::dec << dendl;
11fdf7f2 1351 ceph_abort_msg("problem with op_jump");
7c673cae
FG
1352 }
1353 }
1354 }
1355 break;
1356
1357 case bluefs_transaction_t::OP_JUMP_SEQ:
1358 {
1359 uint64_t next_seq;
11fdf7f2 1360 decode(next_seq, p);
7c673cae
FG
1361 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1362 << ": op_jump_seq " << next_seq << dendl;
11fdf7f2
TL
1363 if (unlikely(to_stdout)) {
1364 std::cout << " 0x" << std::hex << pos << std::dec
1365 << ": op_jump_seq " << next_seq << std::endl;
1366 }
1367
20effc67 1368 ceph_assert(next_seq > log_seq);
7c673cae
FG
1369 log_seq = next_seq - 1; // we will increment it below
1370 }
1371 break;
1372
1373 case bluefs_transaction_t::OP_ALLOC_ADD:
f67539c2 1374 // LEGACY, do nothing but read params
7c673cae 1375 {
f67539c2
TL
1376 __u8 id;
1377 uint64_t offset, length;
1378 decode(id, p);
1379 decode(offset, p);
1380 decode(length, p);
1381 }
7c673cae
FG
1382 break;
1383
1384 case bluefs_transaction_t::OP_ALLOC_RM:
f67539c2 1385 // LEGACY, do nothing but read params
7c673cae 1386 {
f67539c2
TL
1387 __u8 id;
1388 uint64_t offset, length;
1389 decode(id, p);
1390 decode(offset, p);
1391 decode(length, p);
1392 }
1393 break;
7c673cae
FG
1394
1395 case bluefs_transaction_t::OP_DIR_LINK:
1396 {
1397 string dirname, filename;
1398 uint64_t ino;
11fdf7f2
TL
1399 decode(dirname, p);
1400 decode(filename, p);
1401 decode(ino, p);
7c673cae
FG
1402 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1403 << ": op_dir_link " << " " << dirname << "/" << filename
1404 << " to " << ino
1405 << dendl;
11fdf7f2
TL
1406 if (unlikely(to_stdout)) {
1407 std::cout << " 0x" << std::hex << pos << std::dec
1408 << ": op_dir_link " << " " << dirname << "/" << filename
1409 << " to " << ino
1410 << std::endl;
1411 }
1412
7c673cae
FG
1413 if (!noop) {
1414 FileRef file = _get_file(ino);
11fdf7f2 1415 ceph_assert(file->fnode.ino);
20effc67
TL
1416 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1417 ceph_assert(q != nodes.dir_map.end());
7c673cae 1418 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2 1419 ceph_assert(r == q->second->file_map.end());
9f95a23c
TL
1420
1421 vselector->sub_usage(file->vselector_hint, file->fnode);
1422 file->vselector_hint =
1423 vselector->get_hint_by_dir(dirname);
1424 vselector->add_usage(file->vselector_hint, file->fnode);
1425
7c673cae
FG
1426 q->second->file_map[filename] = file;
1427 ++file->refs;
1428 }
1429 }
1430 break;
1431
1432 case bluefs_transaction_t::OP_DIR_UNLINK:
1433 {
1434 string dirname, filename;
11fdf7f2
TL
1435 decode(dirname, p);
1436 decode(filename, p);
7c673cae
FG
1437 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1438 << ": op_dir_unlink " << " " << dirname << "/" << filename
1439 << dendl;
11fdf7f2
TL
1440 if (unlikely(to_stdout)) {
1441 std::cout << " 0x" << std::hex << pos << std::dec
1442 << ": op_dir_unlink " << " " << dirname << "/" << filename
1443 << std::endl;
1444 }
1445
7c673cae 1446 if (!noop) {
20effc67
TL
1447 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1448 ceph_assert(q != nodes.dir_map.end());
7c673cae 1449 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2
TL
1450 ceph_assert(r != q->second->file_map.end());
1451 ceph_assert(r->second->refs > 0);
7c673cae
FG
1452 --r->second->refs;
1453 q->second->file_map.erase(r);
1454 }
1455 }
1456 break;
1457
1458 case bluefs_transaction_t::OP_DIR_CREATE:
1459 {
1460 string dirname;
11fdf7f2 1461 decode(dirname, p);
7c673cae
FG
1462 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1463 << ": op_dir_create " << dirname << dendl;
11fdf7f2
TL
1464 if (unlikely(to_stdout)) {
1465 std::cout << " 0x" << std::hex << pos << std::dec
1466 << ": op_dir_create " << dirname << std::endl;
1467 }
1468
7c673cae 1469 if (!noop) {
20effc67
TL
1470 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1471 ceph_assert(q == nodes.dir_map.end());
1472 nodes.dir_map[dirname] = ceph::make_ref<Dir>();
7c673cae
FG
1473 }
1474 }
1475 break;
1476
1477 case bluefs_transaction_t::OP_DIR_REMOVE:
1478 {
1479 string dirname;
11fdf7f2 1480 decode(dirname, p);
7c673cae
FG
1481 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1482 << ": op_dir_remove " << dirname << dendl;
11fdf7f2
TL
1483 if (unlikely(to_stdout)) {
1484 std::cout << " 0x" << std::hex << pos << std::dec
1485 << ": op_dir_remove " << dirname << std::endl;
1486 }
1487
7c673cae 1488 if (!noop) {
20effc67
TL
1489 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1490 ceph_assert(q != nodes.dir_map.end());
11fdf7f2 1491 ceph_assert(q->second->file_map.empty());
20effc67 1492 nodes.dir_map.erase(q);
7c673cae
FG
1493 }
1494 }
1495 break;
1496
1497 case bluefs_transaction_t::OP_FILE_UPDATE:
1498 {
1499 bluefs_fnode_t fnode;
11fdf7f2 1500 decode(fnode, p);
7c673cae 1501 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
9f95a23c 1502 << ": op_file_update " << " " << fnode << " " << dendl;
11fdf7f2
TL
1503 if (unlikely(to_stdout)) {
1504 std::cout << " 0x" << std::hex << pos << std::dec
1505 << ": op_file_update " << " " << fnode << std::endl;
1506 }
9f95a23c 1507 if (!noop) {
7c673cae 1508 FileRef f = _get_file(fnode.ino);
20effc67
TL
1509 if (cct->_conf->bluefs_log_replay_check_allocations) {
1510 int r = _check_allocations(f->fnode,
1511 used_blocks, false, "OP_FILE_UPDATE");
1512 if (r < 0) {
1513 return r;
9f95a23c
TL
1514 }
1515 }
9f95a23c
TL
1516 if (fnode.ino != 1) {
1517 vselector->sub_usage(f->vselector_hint, f->fnode);
1518 }
1519 f->fnode = fnode;
1520 if (fnode.ino != 1) {
1521 vselector->add_usage(f->vselector_hint, f->fnode);
1522 }
1523
7c673cae
FG
1524 if (fnode.ino > ino_last) {
1525 ino_last = fnode.ino;
1526 }
9f95a23c 1527 if (cct->_conf->bluefs_log_replay_check_allocations) {
20effc67
TL
1528 int r = _check_allocations(f->fnode,
1529 used_blocks, true, "OP_FILE_UPDATE");
9f95a23c
TL
1530 if (r < 0) {
1531 return r;
1532 }
1533 }
522d829b
TL
1534 } else if (noop && fnode.ino == 1) {
1535 FileRef f = _get_file(fnode.ino);
1536 f->fnode = fnode;
7c673cae 1537 }
9f95a23c 1538 }
7c673cae 1539 break;
20effc67
TL
1540 case bluefs_transaction_t::OP_FILE_UPDATE_INC:
1541 {
1542 bluefs_fnode_delta_t delta;
1543 decode(delta, p);
1544 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1545 << ": op_file_update_inc " << " " << delta << " " << dendl;
1546 if (unlikely(to_stdout)) {
1547 std::cout << " 0x" << std::hex << pos << std::dec
1548 << ": op_file_update_inc " << " " << delta << std::endl;
1549 }
1550 if (!noop) {
1551 FileRef f = _get_file(delta.ino);
1552 bluefs_fnode_t& fnode = f->fnode;
1553 if (delta.offset != fnode.allocated) {
1554 derr << __func__ << " invalid op_file_update_inc, new extents miss end of file"
1555 << " fnode=" << fnode
1556 << " delta=" << delta
1557 << dendl;
1558 ceph_assert(delta.offset == fnode.allocated);
1559 }
1560 if (cct->_conf->bluefs_log_replay_check_allocations) {
1561 int r = _check_allocations(fnode,
1562 used_blocks, false, "OP_FILE_UPDATE_INC");
1563 if (r < 0) {
1564 return r;
1565 }
1566 }
1567
1568 fnode.ino = delta.ino;
1569 fnode.mtime = delta.mtime;
1570 if (fnode.ino != 1) {
1571 vselector->sub_usage(f->vselector_hint, fnode);
1572 }
1573 fnode.size = delta.size;
1574 fnode.claim_extents(delta.extents);
1575 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1576 << ": op_file_update_inc produced " << " " << fnode << " " << dendl;
1577
1578 if (fnode.ino != 1) {
1579 vselector->add_usage(f->vselector_hint, fnode);
1580 }
1581
1582 if (fnode.ino > ino_last) {
1583 ino_last = fnode.ino;
1584 }
1585 if (cct->_conf->bluefs_log_replay_check_allocations) {
1586 int r = _check_allocations(f->fnode,
1587 used_blocks, true, "OP_FILE_UPDATE_INC");
1588 if (r < 0) {
1589 return r;
1590 }
1591 }
1592 } else if (noop && delta.ino == 1) {
1593 // we need to track bluefs log, even in noop mode
1594 FileRef f = _get_file(1);
1595 bluefs_fnode_t& fnode = f->fnode;
1596 fnode.ino = delta.ino;
1597 fnode.mtime = delta.mtime;
1598 fnode.size = delta.size;
1599 fnode.claim_extents(delta.extents);
1600 }
1601 }
1602 break;
7c673cae
FG
1603
1604 case bluefs_transaction_t::OP_FILE_REMOVE:
1605 {
1606 uint64_t ino;
11fdf7f2 1607 decode(ino, p);
7c673cae
FG
1608 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1609 << ": op_file_remove " << ino << dendl;
11fdf7f2
TL
1610 if (unlikely(to_stdout)) {
1611 std::cout << " 0x" << std::hex << pos << std::dec
1612 << ": op_file_remove " << ino << std::endl;
1613 }
1614
9f95a23c 1615 if (!noop) {
20effc67
TL
1616 auto p = nodes.file_map.find(ino);
1617 ceph_assert(p != nodes.file_map.end());
9f95a23c
TL
1618 vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
1619 if (cct->_conf->bluefs_log_replay_check_allocations) {
20effc67
TL
1620 int r = _check_allocations(p->second->fnode,
1621 used_blocks, false, "OP_FILE_REMOVE");
1622 if (r < 0) {
1623 return r;
9f95a23c
TL
1624 }
1625 }
20effc67 1626 nodes.file_map.erase(p);
9f95a23c
TL
1627 }
1628 }
7c673cae
FG
1629 break;
1630
1631 default:
1632 derr << __func__ << " 0x" << std::hex << pos << std::dec
1633 << ": stop: unrecognized op " << (int)op << dendl;
1634 delete log_reader;
1635 return -EIO;
1636 }
1637 }
11fdf7f2 1638 ceph_assert(p.end());
7c673cae
FG
1639
1640 // we successfully replayed the transaction; bump the seq and log size
1641 ++log_seq;
1642 log_file->fnode.size = log_reader->buf.pos;
1643 }
f67539c2
TL
1644 if (!noop) {
1645 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
20effc67
TL
1646 log.seq_live = log_seq + 1;
1647 dirty.seq_live = log_seq + 1;
1648 log.t.seq = log.seq_live;
1649 dirty.seq_stable = log_seq;
9f95a23c 1650 }
7c673cae
FG
1651
1652 dout(10) << __func__ << " log file size was 0x"
1653 << std::hex << log_file->fnode.size << std::dec << dendl;
11fdf7f2
TL
1654 if (unlikely(to_stdout)) {
1655 std::cout << " log file size was 0x"
1656 << std::hex << log_file->fnode.size << std::dec << std::endl;
1657 }
1658
7c673cae
FG
1659 delete log_reader;
1660
1661 if (!noop) {
1662 // verify file link counts are all >0
20effc67 1663 for (auto& p : nodes.file_map) {
7c673cae
FG
1664 if (p.second->refs == 0 &&
1665 p.second->fnode.ino > 1) {
1666 derr << __func__ << " file with link count 0: " << p.second->fnode
1667 << dendl;
1668 return -EIO;
1669 }
1670 }
1671 }
20effc67
TL
1672 // reflect file count in logger
1673 logger->set(l_bluefs_num_files, nodes.file_map.size());
7c673cae
FG
1674
1675 dout(10) << __func__ << " done" << dendl;
1676 return 0;
1677}
1678
11fdf7f2
TL
1679int BlueFS::log_dump()
1680{
1681 // only dump log file's content
20effc67
TL
1682 ceph_assert(log.writer == nullptr && "cannot log_dump on mounted BlueFS");
1683 _init_logger();
f67539c2 1684 int r = _open_super();
11fdf7f2 1685 if (r < 0) {
f67539c2 1686 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
11fdf7f2
TL
1687 return r;
1688 }
f67539c2
TL
1689 r = _replay(true, true);
1690 if (r < 0) {
1691 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
1692 }
1693 _shutdown_logger();
1694 super = bluefs_super_t();
1695 return r;
11fdf7f2
TL
1696}
1697
1698int BlueFS::device_migrate_to_existing(
1699 CephContext *cct,
1700 const set<int>& devs_source,
9f95a23c
TL
1701 int dev_target,
1702 const bluefs_layout_t& layout)
11fdf7f2
TL
1703{
1704 vector<byte> buf;
1705 bool buffered = cct->_conf->bluefs_buffered_io;
1706
eafe8130
TL
1707 dout(10) << __func__ << " devs_source " << devs_source
1708 << " dev_target " << dev_target << dendl;
11fdf7f2
TL
1709 assert(dev_target < (int)MAX_BDEV);
1710
1711 int flags = 0;
1712 flags |= devs_source.count(BDEV_DB) ?
1713 (REMOVE_DB | RENAME_SLOW2DB) : 0;
1714 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1715 int dev_target_new = dev_target;
1716
1717 // Slow device without separate DB one is addressed via BDEV_DB
1718 // Hence need renaming.
1719 if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
1720 dev_target_new = BDEV_DB;
1721 dout(0) << __func__ << " super to be written to " << dev_target << dendl;
1722 }
1723
20effc67 1724 for (auto& [ino, file_ref] : nodes.file_map) {
11fdf7f2 1725 //do not copy log
9f95a23c 1726 if (file_ref->fnode.ino == 1) {
11fdf7f2
TL
1727 continue;
1728 }
9f95a23c 1729 dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
eafe8130 1730
9f95a23c 1731 auto& fnode_extents = file_ref->fnode.extents;
20effc67 1732 vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode);
11fdf7f2 1733
9f95a23c
TL
1734 bool rewrite = std::any_of(
1735 fnode_extents.begin(),
1736 fnode_extents.end(),
1737 [=](auto& ext) {
1738 return ext.bdev != dev_target && devs_source.count(ext.bdev);
1739 });
eafe8130
TL
1740 if (rewrite) {
1741 dout(10) << __func__ << " migrating" << dendl;
1742
1743 // read entire file
1744 bufferlist bl;
1745 for (auto old_ext : fnode_extents) {
1746 buf.resize(old_ext.length);
20effc67 1747 int r = _bdev_read_random(old_ext.bdev,
eafe8130
TL
1748 old_ext.offset,
1749 old_ext.length,
1750 (char*)&buf.at(0),
1751 buffered);
1752 if (r != 0) {
1753 derr << __func__ << " failed to read 0x" << std::hex
1754 << old_ext.offset << "~" << old_ext.length << std::dec
1755 << " from " << (int)dev_target << dendl;
1756 return -EIO;
1757 }
1758 bl.append((char*)&buf[0], old_ext.length);
1759 }
11fdf7f2 1760
eafe8130
TL
1761 // write entire file
1762 PExtentVector extents;
1763 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1764 if (l < 0) {
1765 derr << __func__ << " unable to allocate len 0x" << std::hex
1766 << bl.length() << std::dec << " from " << (int)dev_target
1767 << ": " << cpp_strerror(l) << dendl;
1768 return -ENOSPC;
1769 }
11fdf7f2 1770
eafe8130
TL
1771 uint64_t off = 0;
1772 for (auto& i : extents) {
1773 bufferlist cur;
1774 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1775 ceph_assert(cur_len > 0);
1776 cur.substr_of(bl, off, cur_len);
1777 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1778 ceph_assert(r == 0);
1779 off += cur_len;
1780 }
1781
1782 // release old extents
1783 for (auto old_ext : fnode_extents) {
1784 PExtentVector to_release;
1785 to_release.emplace_back(old_ext.offset, old_ext.length);
1786 alloc[old_ext.bdev]->release(to_release);
f67539c2
TL
1787 if (is_shared_alloc(old_ext.bdev)) {
1788 shared_alloc->bluefs_used -= to_release.size();
1789 }
eafe8130
TL
1790 }
1791
1792 // update fnode
1793 fnode_extents.clear();
1794 for (auto& i : extents) {
1795 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
1796 }
1797 } else {
9f95a23c
TL
1798 for (auto& ext : fnode_extents) {
1799 if (dev_target != dev_target_new && ext.bdev == dev_target) {
eafe8130 1800 dout(20) << __func__ << " " << " ... adjusting extent 0x"
9f95a23c 1801 << std::hex << ext.offset << std::dec
eafe8130
TL
1802 << " bdev " << dev_target << " -> " << dev_target_new
1803 << dendl;
9f95a23c 1804 ext.bdev = dev_target_new;
11fdf7f2 1805 }
11fdf7f2
TL
1806 }
1807 }
20effc67 1808 vselector->add_usage(file_ref->vselector_hint, file_ref->fnode);
11fdf7f2
TL
1809 }
1810 // new logging device in the current naming scheme
1811 int new_log_dev_cur = bdev[BDEV_WAL] ?
1812 BDEV_WAL :
1813 bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
1814
1815 // new logging device in new naming scheme
1816 int new_log_dev_next = new_log_dev_cur;
1817
1818 if (devs_source.count(new_log_dev_cur)) {
1819 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1820 new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
1821 BDEV_DB :
1822 BDEV_WAL;
1823
1824 dout(0) << __func__ << " log moved from " << new_log_dev_cur
1825 << " to " << new_log_dev_next << dendl;
1826
1827 new_log_dev_cur =
1828 (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
1829 BDEV_SLOW :
1830 new_log_dev_next;
1831 }
1832
20effc67 1833 _rewrite_log_and_layout_sync_LNF_LD(
11fdf7f2
TL
1834 false,
1835 (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
1836 new_log_dev_cur,
1837 new_log_dev_next,
9f95a23c
TL
1838 flags,
1839 layout);
11fdf7f2
TL
1840 return 0;
1841}
1842
1843int BlueFS::device_migrate_to_new(
1844 CephContext *cct,
1845 const set<int>& devs_source,
9f95a23c
TL
1846 int dev_target,
1847 const bluefs_layout_t& layout)
11fdf7f2
TL
1848{
1849 vector<byte> buf;
1850 bool buffered = cct->_conf->bluefs_buffered_io;
1851
eafe8130
TL
1852 dout(10) << __func__ << " devs_source " << devs_source
1853 << " dev_target " << dev_target << dendl;
20effc67 1854 assert(dev_target == (int)BDEV_NEWDB || dev_target == (int)BDEV_NEWWAL);
11fdf7f2
TL
1855
1856 int flags = 0;
1857
1858 flags |= devs_source.count(BDEV_DB) ?
1859 (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
1860 0;
1861 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
9f95a23c 1862 int dev_target_new = dev_target; //FIXME: remove, makes no sense
11fdf7f2 1863
20effc67 1864 for (auto& p : nodes.file_map) {
11fdf7f2
TL
1865 //do not copy log
1866 if (p.second->fnode.ino == 1) {
1867 continue;
1868 }
eafe8130
TL
1869 dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
1870
11fdf7f2
TL
1871 auto& fnode_extents = p.second->fnode.extents;
1872
eafe8130 1873 bool rewrite = false;
11fdf7f2 1874 for (auto ext_it = fnode_extents.begin();
eafe8130
TL
1875 ext_it != p.second->fnode.extents.end();
1876 ++ext_it) {
11fdf7f2 1877 if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
eafe8130
TL
1878 rewrite = true;
1879 break;
1880 }
1881 }
1882 if (rewrite) {
1883 dout(10) << __func__ << " migrating" << dendl;
1884
1885 // read entire file
1886 bufferlist bl;
1887 for (auto old_ext : fnode_extents) {
1888 buf.resize(old_ext.length);
20effc67 1889 int r = _bdev_read_random(old_ext.bdev,
eafe8130
TL
1890 old_ext.offset,
1891 old_ext.length,
1892 (char*)&buf.at(0),
1893 buffered);
1894 if (r != 0) {
1895 derr << __func__ << " failed to read 0x" << std::hex
1896 << old_ext.offset << "~" << old_ext.length << std::dec
1897 << " from " << (int)dev_target << dendl;
1898 return -EIO;
11fdf7f2 1899 }
eafe8130
TL
1900 bl.append((char*)&buf[0], old_ext.length);
1901 }
1902
1903 // write entire file
1904 PExtentVector extents;
1905 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1906 if (l < 0) {
1907 derr << __func__ << " unable to allocate len 0x" << std::hex
1908 << bl.length() << std::dec << " from " << (int)dev_target
1909 << ": " << cpp_strerror(l) << dendl;
1910 return -ENOSPC;
1911 }
1912
1913 uint64_t off = 0;
1914 for (auto& i : extents) {
1915 bufferlist cur;
1916 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1917 ceph_assert(cur_len > 0);
1918 cur.substr_of(bl, off, cur_len);
1919 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1920 ceph_assert(r == 0);
1921 off += cur_len;
1922 }
1923
1924 // release old extents
1925 for (auto old_ext : fnode_extents) {
1926 PExtentVector to_release;
1927 to_release.emplace_back(old_ext.offset, old_ext.length);
1928 alloc[old_ext.bdev]->release(to_release);
f67539c2
TL
1929 if (is_shared_alloc(old_ext.bdev)) {
1930 shared_alloc->bluefs_used -= to_release.size();
1931 }
eafe8130
TL
1932 }
1933
1934 // update fnode
1935 fnode_extents.clear();
1936 for (auto& i : extents) {
1937 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
11fdf7f2
TL
1938 }
1939 }
11fdf7f2
TL
1940 }
1941 // new logging device in the current naming scheme
1942 int new_log_dev_cur =
1943 bdev[BDEV_NEWWAL] ?
1944 BDEV_NEWWAL :
1945 bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
1946 BDEV_WAL :
1947 bdev[BDEV_NEWDB] ?
1948 BDEV_NEWDB :
1949 bdev[BDEV_DB] && !(flags & REMOVE_DB)?
1950 BDEV_DB :
1951 BDEV_SLOW;
1952
1953 // new logging device in new naming scheme
1954 int new_log_dev_next =
1955 new_log_dev_cur == BDEV_NEWWAL ?
1956 BDEV_WAL :
1957 new_log_dev_cur == BDEV_NEWDB ?
1958 BDEV_DB :
1959 new_log_dev_cur;
1960
1961 int super_dev =
1962 dev_target == BDEV_NEWDB ?
1963 BDEV_NEWDB :
1964 bdev[BDEV_DB] ?
1965 BDEV_DB :
1966 BDEV_SLOW;
1967
20effc67 1968 _rewrite_log_and_layout_sync_LNF_LD(
11fdf7f2
TL
1969 false,
1970 super_dev,
1971 new_log_dev_cur,
1972 new_log_dev_next,
9f95a23c
TL
1973 flags,
1974 layout);
11fdf7f2
TL
1975 return 0;
1976}
1977
7c673cae
FG
1978BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
1979{
20effc67
TL
1980 auto p = nodes.file_map.find(ino);
1981 if (p == nodes.file_map.end()) {
9f95a23c 1982 FileRef f = ceph::make_ref<File>();
20effc67
TL
1983 nodes.file_map[ino] = f;
1984 // track files count in logger
1985 logger->set(l_bluefs_num_files, nodes.file_map.size());
7c673cae
FG
1986 dout(30) << __func__ << " ino " << ino << " = " << f
1987 << " (new)" << dendl;
1988 return f;
1989 } else {
1990 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
1991 return p->second;
1992 }
1993}
1994
20effc67
TL
1995
1996/**
1997To modify fnode both FileWriter::lock and File::lock must be obtained.
1998The special case is when we modify bluefs log (ino 1) or
1999we are compacting log (ino 0).
2000
2001In any case it is enough to hold File::lock to be sure fnode will not be modified.
2002*/
2003struct lock_fnode_print {
2004 BlueFS::FileRef file;
2005 lock_fnode_print(BlueFS::FileRef file) : file(file) {};
2006};
2007std::ostream& operator<<(std::ostream& out, const lock_fnode_print& to_lock) {
2008 std::lock_guard l(to_lock.file->lock);
2009 out << to_lock.file->fnode;
2010 return out;
2011}
2012
2013void BlueFS::_drop_link_D(FileRef file)
7c673cae
FG
2014{
2015 dout(20) << __func__ << " had refs " << file->refs
20effc67 2016 << " on " << lock_fnode_print(file) << dendl;
11fdf7f2 2017 ceph_assert(file->refs > 0);
20effc67
TL
2018 ceph_assert(ceph_mutex_is_locked(log.lock));
2019 ceph_assert(ceph_mutex_is_locked(nodes.lock));
2020
7c673cae
FG
2021 --file->refs;
2022 if (file->refs == 0) {
2023 dout(20) << __func__ << " destroying " << file->fnode << dendl;
11fdf7f2 2024 ceph_assert(file->num_reading.load() == 0);
9f95a23c 2025 vselector->sub_usage(file->vselector_hint, file->fnode);
20effc67
TL
2026 log.t.op_file_remove(file->fnode.ino);
2027 nodes.file_map.erase(file->fnode.ino);
2028 logger->set(l_bluefs_num_files, nodes.file_map.size());
7c673cae 2029 file->deleted = true;
94b18763 2030
20effc67
TL
2031 std::lock_guard dl(dirty.lock);
2032 for (auto& r : file->fnode.extents) {
2033 dirty.pending_release[r.bdev].insert(r.offset, r.length);
2034 }
2035 if (file->dirty_seq > dirty.seq_stable) {
2036 // retract request to serialize changes
2037 ceph_assert(dirty.files.count(file->dirty_seq));
2038 auto it = dirty.files[file->dirty_seq].iterator_to(*file);
2039 dirty.files[file->dirty_seq].erase(it);
2040 file->dirty_seq = dirty.seq_stable;
7c673cae
FG
2041 }
2042 }
2043}
2044
/**
 * Read up to `len` bytes at file offset `off` into `out`.
 *
 * Ranges currently held in the reader's buffer (h->buf) are copied from it;
 * every other range is read straight from the block device into `out` and
 * is not added to the buffer. Unless h->ignore_eof is set, the request is
 * clipped to the file size. A failed device read trips an assert.
 *
 * @return number of bytes placed in `out`
 */
adb31ebb 2045int64_t BlueFS::_read_random(
7c673cae
FG
2046 FileReader *h, ///< [in] read from here
2047 uint64_t off, ///< [in] offset
9f95a23c 2048 uint64_t len, ///< [in] this many bytes
f67539c2 2049 char *out) ///< [out] copy it here
7c673cae 2050{
494da23a
TL
2051 auto* buf = &h->buf;
2052
adb31ebb 2053 int64_t ret = 0;
7c673cae
FG
2054 dout(10) << __func__ << " h " << h
2055 << " 0x" << std::hex << off << "~" << len << std::dec
20effc67 2056 << " from " << lock_fnode_print(h->file) << dendl;
7c673cae
FG
2057
// count this call as an active reader of the file; undone before returning
2058 ++h->file->num_reading;
2059
// clip the request at EOF unless the handle asks to ignore it
2060 if (!h->ignore_eof &&
2061 off + len > h->file->fnode.size) {
2062 if (off > h->file->fnode.size)
2063 len = 0;
2064 else
2065 len = h->file->fnode.size - off;
2066 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
2067 << std::hex << len << std::dec << dendl;
2068 }
494da23a
TL
2069 logger->inc(l_bluefs_read_random_count, 1);
2070 logger->inc(l_bluefs_read_random_bytes, len);
7c673cae 2071
// shared lock on the handle while the buffer is examined
494da23a 2072 std::shared_lock s_lock(h->lock);
f91f0fd5 2073 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
7c673cae 2074 while (len > 0) {
494da23a
TL
2075 if (off < buf->bl_off || off >= buf->get_buf_end()) {
// offset is outside the buffer: drop the lock and read from the device
// directly into the caller's memory
2076 s_lock.unlock();
2077 uint64_t x_off = 0;
2078 auto p = h->file->fnode.seek(off, &x_off);
f6b5b4d7 2079 ceph_assert(p != h->file->fnode.extents.end());
// one device read never crosses the end of the current extent
9f95a23c 2080 uint64_t l = std::min(p->length - x_off, len);
adb31ebb
TL
2081 //hard cap to 1GB
2082 l = std::min(l, uint64_t(1) << 30);
494da23a
TL
2083 dout(20) << __func__ << " read random 0x"
2084 << std::hex << x_off << "~" << l << std::dec
2085 << " of " << *p << dendl;
cd265ab1
TL
2086 int r;
2087 if (!cct->_conf->bluefs_check_for_zeros) {
20effc67
TL
2088 r = _bdev_read_random(p->bdev, p->offset + x_off, l, out,
2089 cct->_conf->bluefs_buffered_io);
cd265ab1 2090 } else {
20effc67 2091 r = _read_random_and_check(p->bdev, p->offset + x_off, l, out,
cd265ab1
TL
2092 cct->_conf->bluefs_buffered_io);
2093 }
494da23a
TL
2094 ceph_assert(r == 0);
2095 off += l;
2096 len -= l;
2097 ret += l;
2098 out += l;
2099
2100 logger->inc(l_bluefs_read_random_disk_count, 1);
2101 logger->inc(l_bluefs_read_random_disk_bytes, l);
// retake the lock only when another iteration will look at the buffer
2102 if (len > 0) {
2103 s_lock.lock();
2104 }
2105 } else {
// offset is inside the buffer: copy as much as the buffer can supply
2106 auto left = buf->get_buf_remaining(off);
adb31ebb 2107 int64_t r = std::min(len, left);
494da23a
TL
2108 logger->inc(l_bluefs_read_random_buffer_count, 1);
2109 logger->inc(l_bluefs_read_random_buffer_bytes, r);
2110 dout(20) << __func__ << " left 0x" << std::hex << left
2111 << " 0x" << off << "~" << len << std::dec
2112 << dendl;
2113
f67539c2
TL
2114 auto p = buf->bl.begin();
2115 p.seek(off - buf->bl_off);
2116 p.copy(r, out);
2117 out += r;
7c673cae 2118
494da23a
TL
2119 dout(30) << __func__ << " result chunk (0x"
2120 << std::hex << r << std::dec << " bytes):\n";
2121 bufferlist t;
2122 t.substr_of(buf->bl, off - buf->bl_off, r);
2123 t.hexdump(*_dout);
2124 *_dout << dendl;
2125
2126 off += r;
2127 len -= r;
2128 ret += r;
2129 buf->pos += r;
2130 }
2131 }
7c673cae
FG
2132 dout(20) << __func__ << " got " << ret << dendl;
2133 --h->file->num_reading;
2134 return ret;
2135}
2136
/**
 * Buffered read of up to `len` bytes at file offset `off`.
 *
 * Data is always served from the reader's buffer (h->buf); on a miss the
 * buffer is refilled from the block device, starting at the block that
 * contains `off`. The result is appended to `outbl` and/or copied to `out`,
 * whichever are non-null. With both null the call only fills the buffer
 * (prefetch). Unless h->ignore_eof is set, the request is clipped to the
 * file size. A failed device read trips an assert.
 *
 * @return number of bytes delivered; may be less than requested when the
 *         file has no extent covering the offset
 */
adb31ebb 2137int64_t BlueFS::_read(
7c673cae 2138 FileReader *h, ///< [in] read from here
7c673cae
FG
2139 uint64_t off, ///< [in] offset
2140 size_t len, ///< [in] this many bytes
2141 bufferlist *outbl, ///< [out] optional: reference the result here
2142 char *out) ///< [out] optional: or copy it here
2143{
f67539c2
TL
2144 FileReaderBuffer *buf = &(h->buf);
2145
// no destination given: the call is only meant to warm the buffer
494da23a 2146 bool prefetch = !outbl && !out;
7c673cae
FG
2147 dout(10) << __func__ << " h " << h
2148 << " 0x" << std::hex << off << "~" << len << std::dec
20effc67 2149 << " from " << lock_fnode_print(h->file)
494da23a
TL
2150 << (prefetch ? " prefetch" : "")
2151 << dendl;
7c673cae
FG
2152
// count this call as an active reader of the file; undone before returning
2153 ++h->file->num_reading;
2154
// clip the request at EOF unless the handle asks to ignore it
2155 if (!h->ignore_eof &&
2156 off + len > h->file->fnode.size) {
2157 if (off > h->file->fnode.size)
2158 len = 0;
2159 else
2160 len = h->file->fnode.size - off;
2161 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
2162 << std::hex << len << std::dec << dendl;
2163 }
494da23a
TL
2164 logger->inc(l_bluefs_read_count, 1);
2165 logger->inc(l_bluefs_read_bytes, len);
2166 if (prefetch) {
2167 logger->inc(l_bluefs_read_prefetch_count, 1);
2168 logger->inc(l_bluefs_read_prefetch_bytes, len);
2169 }
2170
7c673cae
FG
2171 if (outbl)
2172 outbl->clear();
2173
adb31ebb 2174 int64_t ret = 0;
494da23a 2175 std::shared_lock s_lock(h->lock);
7c673cae
FG
2176 while (len > 0) {
2177 size_t left;
2178 if (off < buf->bl_off || off >= buf->get_buf_end()) {
// buffer miss: swap the shared lock for an exclusive one to refill
494da23a
TL
2179 s_lock.unlock();
2180 std::unique_lock u_lock(h->lock);
f91f0fd5 2181 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
494da23a
TL
2182 if (off < buf->bl_off || off >= buf->get_buf_end()) {
2183 // if precondition hasn't changed during locking upgrade.
2184 buf->bl.clear();
// the buffer always starts on a block boundary
2185 buf->bl_off = off & super.block_mask();
2186 uint64_t x_off = 0;
2187 auto p = h->file->fnode.seek(buf->bl_off, &x_off);
f6b5b4d7
TL
2188 if (p == h->file->fnode.extents.end()) {
// no extent covers this offset: return what has been read so far
2189 dout(5) << __func__ << " reading less then required "
2190 << ret << "<" << ret + len << dendl;
2191 break;
2192 }
2193
// fetch whole blocks covering the request, at least max_prefetch bytes,
// but never past the end of the current extent
494da23a
TL
2194 uint64_t want = round_up_to(len + (off & ~super.block_mask()),
2195 super.block_size);
2196 want = std::max(want, buf->max_prefetch);
2197 uint64_t l = std::min(p->length - x_off, want);
adb31ebb
TL
2198 //hard cap to 1GB
2199 l = std::min(l, uint64_t(1) << 30);
// do not fetch beyond the block that contains EOF
494da23a
TL
2200 uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
2201 if (!h->ignore_eof &&
2202 buf->bl_off + l > eof_offset) {
2203 l = eof_offset - buf->bl_off;
2204 }
2205 dout(20) << __func__ << " fetching 0x"
2206 << std::hex << x_off << "~" << l << std::dec
2207 << " of " << *p << dendl;
cd265ab1
TL
2208 int r;
2209 if (!cct->_conf->bluefs_check_for_zeros) {
20effc67
TL
2210 r = _bdev_read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
2211 cct->_conf->bluefs_buffered_io);
cd265ab1 2212 } else {
20effc67
TL
2213 r = _read_and_check(
2214 p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
2215 cct->_conf->bluefs_buffered_io);
cd265ab1 2216 }
20effc67
TL
2217 logger->inc(l_bluefs_read_disk_count, 1);
2218 logger->inc(l_bluefs_read_disk_bytes, l);
2219
494da23a 2220 ceph_assert(r == 0);
7c673cae 2221 }
494da23a
TL
2222 u_lock.unlock();
2223 s_lock.lock();
2224 // we should recheck if buffer is valid after lock downgrade
2225 continue;
7c673cae
FG
2226 }
// buffer hit: hand out as much as the buffer holds from `off`
2227 left = buf->get_buf_remaining(off);
2228 dout(20) << __func__ << " left 0x" << std::hex << left
2229 << " len 0x" << len << std::dec << dendl;
2230
adb31ebb 2231 int64_t r = std::min(len, left);
7c673cae
FG
2232 if (outbl) {
2233 bufferlist t;
2234 t.substr_of(buf->bl, off - buf->bl_off, r);
2235 outbl->claim_append(t);
2236 }
2237 if (out) {
f67539c2
TL
2238 auto p = buf->bl.begin();
2239 p.seek(off - buf->bl_off);
2240 p.copy(r, out);
7c673cae
FG
2241 out += r;
2242 }
2243
2244 dout(30) << __func__ << " result chunk (0x"
2245 << std::hex << r << std::dec << " bytes):\n";
2246 bufferlist t;
2247 t.substr_of(buf->bl, off - buf->bl_off, r);
2248 t.hexdump(*_dout);
2249 *_dout << dendl;
2250
2251 off += r;
2252 len -= r;
2253 ret += r;
2254 buf->pos += r;
2255 }
f67539c2 2256
7c673cae 2257 dout(20) << __func__ << " got " << ret << dendl;
11fdf7f2 2258 ceph_assert(!outbl || (int)outbl->length() == ret);
7c673cae
FG
2259 --h->file->num_reading;
2260 return ret;
2261}
2262
20effc67 2263void BlueFS::invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
7c673cae 2264{
20effc67 2265 std::lock_guard l(f->lock);
7c673cae
FG
2266 dout(10) << __func__ << " file " << f->fnode
2267 << " 0x" << std::hex << offset << "~" << length << std::dec
2268 << dendl;
2269 if (offset & ~super.block_mask()) {
2270 offset &= super.block_mask();
11fdf7f2 2271 length = round_up_to(length, super.block_size);
7c673cae
FG
2272 }
2273 uint64_t x_off = 0;
2274 auto p = f->fnode.seek(offset, &x_off);
2275 while (length > 0 && p != f->fnode.extents.end()) {
11fdf7f2 2276 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
2277 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
2278 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
2279 << std:: dec << " of " << *p << dendl;
2280 offset += x_len;
2281 length -= x_len;
2282 }
2283}
2284
20effc67 2285uint64_t BlueFS::_estimate_log_size_N()
7c673cae 2286{
20effc67 2287 std::lock_guard nl(nodes.lock);
7c673cae
FG
2288 int avg_dir_size = 40; // fixme
2289 int avg_file_size = 12;
2290 uint64_t size = 4096 * 2;
20effc67
TL
2291 size += nodes.file_map.size() * (1 + sizeof(bluefs_fnode_t));
2292 size += nodes.dir_map.size() + (1 + avg_dir_size);
2293 size += nodes.file_map.size() * (1 + avg_dir_size + avg_file_size);
11fdf7f2 2294 return round_up_to(size, super.block_size);
7c673cae
FG
2295}
2296
20effc67 2297void BlueFS::compact_log()/*_LNF_LD_NF_D*/
7c673cae 2298{
f6b5b4d7
TL
2299 if (!cct->_conf->bluefs_replay_recovery_disable_compact) {
2300 if (cct->_conf->bluefs_compact_log_sync) {
20effc67 2301 _compact_log_sync_LNF_LD();
f6b5b4d7 2302 } else {
20effc67 2303 _compact_log_async_LD_LNF_D();
f6b5b4d7 2304 }
7c673cae
FG
2305 }
2306}
2307
20effc67 2308bool BlueFS::_should_start_compact_log_L_N()
7c673cae 2309{
20effc67
TL
2310 if (log_is_compacting.load() == true) {
2311 // compaction is already running
2312 return false;
2313 }
2314 uint64_t current;
2315 {
2316 std::lock_guard ll(log.lock);
2317 current = log.writer->file->fnode.size;
2318 }
2319 uint64_t expected = _estimate_log_size_N();
7c673cae
FG
2320 float ratio = (float)current / (float)expected;
2321 dout(10) << __func__ << " current 0x" << std::hex << current
2322 << " expected " << expected << std::dec
2323 << " ratio " << ratio
7c673cae 2324 << dendl;
20effc67 2325 if (current < cct->_conf->bluefs_log_compact_min_size ||
7c673cae
FG
2326 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
2327 return false;
2328 }
2329 return true;
2330}
2331
20effc67 2332void BlueFS::_compact_log_dump_metadata_NF(bluefs_transaction_t *t,
11fdf7f2 2333 int flags)
7c673cae 2334{
20effc67
TL
2335 std::lock_guard nl(nodes.lock);
2336
7c673cae
FG
2337 t->seq = 1;
2338 t->uuid = super.uuid;
2339 dout(20) << __func__ << " op_init" << dendl;
2340
2341 t->op_init();
20effc67 2342 for (auto& [ino, file_ref] : nodes.file_map) {
9f95a23c 2343 if (ino == 1)
7c673cae 2344 continue;
9f95a23c 2345 ceph_assert(ino > 1);
20effc67 2346 std::lock_guard fl(file_ref->lock);
9f95a23c 2347 for(auto& e : file_ref->fnode.extents) {
11fdf7f2
TL
2348 auto bdev = e.bdev;
2349 auto bdev_new = bdev;
2350 ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
2351 if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
2352 bdev_new = BDEV_DB;
2353 }
2354 if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
2355 bdev_new = BDEV_SLOW;
2356 }
2357 if (bdev == BDEV_NEWDB) {
2358 // REMOVE_DB xor RENAME_DB
2359 ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
2360 ceph_assert(!(flags & RENAME_SLOW2DB));
2361 bdev_new = BDEV_DB;
2362 }
2363 if (bdev == BDEV_NEWWAL) {
2364 ceph_assert(flags & REMOVE_WAL);
2365 bdev_new = BDEV_WAL;
2366 }
2367 e.bdev = bdev_new;
2368 }
9f95a23c
TL
2369 dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
2370 t->op_file_update(file_ref->fnode);
7c673cae 2371 }
20effc67
TL
2372 for (auto& [path, dir_ref] : nodes.dir_map) {
2373 dout(20) << __func__ << " op_dir_create " << path << dendl;
2374 t->op_dir_create(path);
2375 for (auto& [fname, file_ref] : dir_ref->file_map) {
2376 dout(20) << __func__ << " op_dir_link " << path << "/" << fname
2377 << " to " << file_ref->fnode.ino << dendl;
2378 t->op_dir_link(path, fname, file_ref->fnode.ino);
2379 }
2380 }
2381}
2382/* Streams to t files modified before *capture_before_seq* and all dirs */
2383void BlueFS::_compact_log_async_dump_metadata_NF(bluefs_transaction_t *t,
2384 uint64_t capture_before_seq)
2385{
2386 std::lock_guard nl(nodes.lock);
2387
2388 t->seq = 1;
2389 t->uuid = super.uuid;
2390 dout(20) << __func__ << " op_init" << dendl;
2391
2392 t->op_init();
2393 for (auto& [ino, file_ref] : nodes.file_map) {
2394 if (ino == 1)
2395 continue;
2396 ceph_assert(ino > 1);
2397 std::lock_guard fl(file_ref->lock);
2398 if (file_ref->dirty_seq < capture_before_seq) {
2399 dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
2400 } else {
2401 dout(20) << __func__ << " op_file_update just modified, dirty_seq="
2402 << file_ref->dirty_seq << " " << file_ref->fnode << dendl;
2403 }
2404 t->op_file_update(file_ref->fnode);
2405 }
2406 for (auto& [path, dir_ref] : nodes.dir_map) {
9f95a23c
TL
2407 dout(20) << __func__ << " op_dir_create " << path << dendl;
2408 t->op_dir_create(path);
2409 for (auto& [fname, file_ref] : dir_ref->file_map) {
2410 dout(20) << __func__ << " op_dir_link " << path << "/" << fname
2411 << " to " << file_ref->fnode.ino << dendl;
2412 t->op_dir_link(path, fname, file_ref->fnode.ino);
7c673cae
FG
2413 }
2414 }
2415}
2416
20effc67 2417void BlueFS::_compact_log_sync_LNF_LD()
7c673cae
FG
2418{
2419 dout(10) << __func__ << dendl;
20effc67
TL
2420 uint8_t prefer_bdev;
2421 {
2422 std::lock_guard ll(log.lock);
2423 prefer_bdev =
2424 vselector->select_prefer_bdev(log.writer->file->vselector_hint);
2425 }
2426 _rewrite_log_and_layout_sync_LNF_LD(true,
11fdf7f2 2427 BDEV_DB,
9f95a23c
TL
2428 prefer_bdev,
2429 prefer_bdev,
2430 0,
2431 super.memorized_layout);
11fdf7f2
TL
2432 logger->inc(l_bluefs_log_compactions);
2433}
2434
20effc67
TL
/**
 * Synchronously replace the whole log with a freshly encoded dump of the
 * in-memory metadata and point the superblock at it.
 *
 * log.lock is held for the entire call; dirty.lock is taken at the end to
 * queue the old log extents for release.
 *
 * @param allocate_with_fallback  true: allocate the new log with _allocate();
 *                                false: with _allocate_without_fallback()
 * @param super_dev    device id passed to _write_super()
 * @param log_dev      device the new log extents are allocated on
 * @param log_dev_new  device id recorded for the log extents in the
 *                     superblock (differs from log_dev when renaming)
 * @param flags        forwarded to _compact_log_dump_metadata_NF()
 * @param layout       stored into super.memorized_layout
 */
2435void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
2436 int super_dev,
2437 int log_dev,
2438 int log_dev_new,
2439 int flags,
2440 std::optional<bluefs_layout_t> layout)
11fdf7f2 2441{
20effc67
TL
2442 std::lock_guard ll(log.lock);
2443
2444 File *log_file = log.writer->file.get();
7c673cae 2445
20effc67
TL
2446 // log.t.seq is always set to current live seq
2447 ceph_assert(log.t.seq == log.seq_live);
2448 // Capturing entire state. Dump anything that has been stored there.
2449 log.t.clear();
2450 log.t.seq = log.seq_live;
2451 // From now on, no changes to log.t are permitted until we finish rewriting log.
2452 // Can allow dirty to remain dirty - log.seq_live will not change.
7c673cae 2453
11fdf7f2
TL
2454 dout(20) << __func__ << " super_dev:" << super_dev
2455 << " log_dev:" << log_dev
2456 << " log_dev_new:" << log_dev_new
2457 << " flags:" << flags
2458 << dendl;
// build the replacement log content: full metadata dump followed by a jump
// to the current live sequence
7c673cae 2459 bluefs_transaction_t t;
20effc67 2460 _compact_log_dump_metadata_NF(&t, flags);
7c673cae 2461
20effc67
TL
2462 dout(20) << __func__ << " op_jump_seq " << log.seq_live << dendl;
2463 t.op_jump_seq(log.seq_live);
7c673cae
FG
2464
2465 bufferlist bl;
11fdf7f2 2466 encode(t, bl);
7c673cae
FG
2467 _pad_bl(bl);
2468
// space for the dump plus the configured runway for future appends
2469 uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
2470 dout(20) << __func__ << " need " << need << dendl;
2471
494da23a 2472 bluefs_fnode_t old_fnode;
11fdf7f2 2473 int r;
20effc67 2474 vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
// move the current log extents into old_fnode; they are queued for release
// at the end, after the new superblock has been written
494da23a 2475 log_file->fnode.swap_extents(old_fnode);
11fdf7f2
TL
2476 if (allocate_with_fallback) {
2477 r = _allocate(log_dev, need, &log_file->fnode);
2478 ceph_assert(r == 0);
2479 } else {
2480 PExtentVector extents;
2481 r = _allocate_without_fallback(log_dev,
2482 need,
2483 &extents);
2484 ceph_assert(r == 0);
2485 for (auto& p : extents) {
2486 log_file->fnode.append_extent(
2487 bluefs_extent_t(log_dev, p.offset, p.length));
2488 }
7c673cae
FG
2489 }
2490
20effc67 2491 _close_writer(log.writer);
7c673cae 2492
20effc67
TL
2493 // we will write it to super
2494 log_file->fnode.reset_delta();
7c673cae 2495 log_file->fnode.size = bl.length();
9f95a23c 2496
// fresh writer on the new extents; write the dump and flush it
20effc67
TL
2497 log.writer = _create_writer(log_file);
2498 log.writer->append(bl);
2499 _flush_special(log.writer);
2500 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
11fdf7f2
TL
2501#ifdef HAVE_LIBAIO
2502 if (!cct->_conf->bluefs_sync_write) {
2503 list<aio_t> completed_ios;
20effc67
TL
2504 _claim_completed_aios(log.writer, &completed_ios);
2505 _wait_for_aio(log.writer);
11fdf7f2
TL
2506 completed_ios.clear();
2507 }
2508#endif
20effc67 2509 _flush_bdev();
224ce89b 2510
9f95a23c 2511 super.memorized_layout = layout;
7c673cae 2512 super.log_fnode = log_file->fnode;
11fdf7f2
TL
2513 // rename device if needed
2514 if (log_dev != log_dev_new) {
2515 dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
2516 for (auto& p : super.log_fnode.extents) {
2517 p.bdev = log_dev_new;
2518 }
2519 }
2520 dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;
2521
7c673cae 2522 ++super.version;
11fdf7f2 2523 _write_super(super_dev);
20effc67 2524 _flush_bdev();
7c673cae 2525
494da23a 2526 dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
20effc67 2527 std::lock_guard dl(dirty.lock);
494da23a 2528 for (auto& r : old_fnode.extents) {
20effc67 2529 dirty.pending_release[r.bdev].insert(r.offset, r.length);
7c673cae 2530 }
7c673cae
FG
2531}
2532
2533/*
2534 * 1. Allocate a new extent to continue the log, and then log an event
2535 * that jumps the log write position to the new extent. At this point, the
2536 * old extent(s) won't be written to, and reflect everything to compact.
2537 * New events will be written to the new region that we'll keep.
2538 *
2539 * 2. While still holding the lock, encode a bufferlist that dumps all of the
2540 * in-memory fnodes and names. This will become the new beginning of the
2541 * log. The last event will jump to the log continuation extent from #1.
2542 *
2543 * 3. Queue a write to a new extent for the new beginning of the log.
2544 *
2545 * 4. Drop lock and wait
2546 *
2547 * 5. Retake the lock.
2548 *
2549 * 6. Update the log_fnode to splice in the new beginning.
2550 *
2551 * 7. Write the new superblock.
2552 *
2553 * 8. Release the old log space. Clean up.
2554 */
20effc67
TL
2555
/**
 * Asynchronous log compaction.
 *
 * A new log head holding a dump of the current metadata is written to fresh
 * space while the existing log keeps accepting appends in a newly allocated
 * continuation region; the superblock is then switched to the new head
 * followed by that continuation, and the old log space is queued for
 * release. Only one compaction runs at a time (log_is_compacting); a second
 * caller returns immediately. log.lock is taken and released by hand twice,
 * and log expansion is forbidden (log_forbidden_to_expand) in between.
 */
2556void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
7c673cae
FG
2557{
2558 dout(10) << __func__ << dendl;
20effc67
TL
2559 // only one compaction allowed at one time
2560 bool old_is_comp = std::atomic_exchange(&log_is_compacting, true);
2561 if (old_is_comp) {
2562 dout(10) << __func__ << " ongoing" <<dendl;
2563 return;
2564 }
2565
// locked by hand: released after the metadata dump and retaken for the swap
2566 log.lock.lock();
2567 File *log_file = log.writer->file.get();
2568 FileWriter *new_log_writer = nullptr;
2569 FileRef new_log = nullptr;
2570 uint64_t new_log_jump_to = 0;
2571 uint64_t old_log_jump_to = 0;
7c673cae 2572
9f95a23c 2573 new_log = ceph::make_ref<File>();
20effc67 2574 new_log->fnode.ino = 0; // we use _flush_special to avoid log of the fnode
181888fb 2575
20effc67
TL
2576 // Part 1.
2577 // Prepare current log for jumping into it.
2578 // 1. Allocate extent
2579 // 2. Update op to log
2580 // 3. Jump op to log
2581 // During that, no one else can write to log, otherwise we risk jumping backwards.
2582 // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.
2583
2584 //signal _maybe_extend_log that expansion of log is temporarily unacceptable
2585 bool old_forbidden = atomic_exchange(&log_forbidden_to_expand, true);
2586 ceph_assert(old_forbidden == false);
3efd9988 2587
9f95a23c
TL
2588 vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
2589
20effc67 2590 // 1.1 allocate new log space and jump to it.
// everything allocated so far is "old"; new entries start right after it
7c673cae 2591 old_log_jump_to = log_file->fnode.get_allocated();
20effc67 2592 uint64_t runway = log_file->fnode.get_allocated() - log.writer->get_effective_write_pos();
7c673cae 2593 dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
11fdf7f2 2594 << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
9f95a23c
TL
2595 int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
2596 cct->_conf->bluefs_max_log_runway,
2597 &log_file->fnode);
11fdf7f2 2598 ceph_assert(r == 0);
9f95a23c
TL
2599 //adjust usage as flush below will need it
2600 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
7c673cae
FG
2601 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2602
2603 // update the log file change and log a jump to the offset where we want to
2604 // write the new entries
20effc67
TL
2605 log.t.op_file_update(log_file->fnode);
2606 // jump to new position should mean next seq
2607 log.t.op_jump(log.seq_live + 1, old_log_jump_to);
// remember the sequence the captured state corresponds to
2608 uint64_t seq_now = log.seq_live;
2609 // we need to flush all bdev because we will be streaming all dirty files to log
2610 // TODO - think - if _flush_and_sync_log_jump will not add dirty files nor release pending allocations
2611 // then flush_bdev() will not be necessary
2612 _flush_bdev();
2613 _flush_and_sync_log_jump_D(old_log_jump_to, runway);
2614
2615 // out of jump section
7c673cae
FG
2616
2617 // 2. prepare compacted log
2618 bluefs_transaction_t t;
20effc67
TL
2619 _compact_log_async_dump_metadata_NF(&t, seq_now);
2620
2621 // now state is captured to bufferlist
2622 // log can be used to write to, ops in log will be continuation of captured state
2623 log.lock.unlock();
7c673cae 2624
eafe8130
TL
2625 uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
2626 std::max(alloc_size[BDEV_DB],
2627 alloc_size[BDEV_SLOW]));
2628
7c673cae 2629 // conservative estimate for final encoded size
11fdf7f2 2630 new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
eafe8130 2631 max_alloc_size);
20effc67
TL
2632 //newly constructed log head will jump to what we had before
2633 t.op_jump(seq_now, new_log_jump_to);
7c673cae 2634
11fdf7f2 2635 // allocate
9f95a23c 2636 //FIXME: check if we want DB here?
11fdf7f2
TL
2637 r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
2638 &new_log->fnode);
2639 ceph_assert(r == 0);
2640
7c673cae 2641 bufferlist bl;
11fdf7f2 2642 encode(t, bl);
7c673cae
FG
2643 _pad_bl(bl);
2644
2645 dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
2646 << std::dec << dendl;
2647
7c673cae 2648 new_log_writer = _create_writer(new_log);
7c673cae 2649
20effc67 2650 new_log_writer->append(bl);
7c673cae 2651 // 3. flush
20effc67 2652 _flush_special(new_log_writer);
7c673cae
FG
2653
2654 // 4. wait
20effc67 2655 _flush_bdev(new_log_writer);
11fdf7f2 2656 // 5. update our log fnode
20effc67
TL
2657 // we need to append to new_log the extents that were allocated in step 1.1
2658 // we do it by inverse logic - we drop 'old_log_jump_to' bytes and keep rest
2659 // todo - maybe improve _allocate so we will give clear set of new allocations
// `processed` is the logical offset of the current extent within the old log
2660 uint64_t processed = 0;
7c673cae 2661 mempool::bluefs::vector<bluefs_extent_t> old_extents;
20effc67
TL
2662 for (auto& e : log_file->fnode.extents) {
2663 if (processed + e.length <= old_log_jump_to) {
2664 // drop whole extent
7c673cae 2665 dout(10) << __func__ << " remove old log extent " << e << dendl;
20effc67 2666 old_extents.push_back(e);
7c673cae 2667 } else {
20effc67
TL
2668 // keep, but how much?
2669 if (processed < old_log_jump_to) {
2670 ceph_assert(processed + e.length > old_log_jump_to);
2671 ceph_assert(old_log_jump_to - processed <= std::numeric_limits<uint32_t>::max());
2672 uint32_t cut_at = uint32_t(old_log_jump_to - processed);
2673 // need to cut, first half gets dropped
2674 bluefs_extent_t retire(e.bdev, e.offset, cut_at);
2675 old_extents.push_back(retire);
2676 // second half goes to new log
2677 bluefs_extent_t keep(e.bdev, e.offset + cut_at, e.length - cut_at);
2678 new_log->fnode.append_extent(keep);
2679 dout(10) << __func__ << " kept " << keep << " removed " << retire << dendl;
2680 } else {
2681 // take entire extent
2682 ceph_assert(processed >= old_log_jump_to);
2683 new_log->fnode.append_extent(e);
2684 dout(10) << __func__ << " kept " << e << dendl;
2685 }
7c673cae 2686 }
20effc67 2687 processed += e.length;
94b18763 2688 }
20effc67
TL
2689 // we will write it to super
2690 new_log->fnode.reset_delta();
7c673cae 2691
20effc67
TL
2692 // 6. write the super block to reflect the changes
2693 dout(10) << __func__ << " writing super" << dendl;
2694 new_log->fnode.ino = log_file->fnode.ino;
2695 new_log->fnode.size = 0;
2696 new_log->fnode.mtime = ceph_clock_now();
2697 super.log_fnode = new_log->fnode;
2698 ++super.version;
2699 _write_super(BDEV_DB);
2700 _flush_bdev();
2701
2702 log.lock.lock();
2703 // swapping log_file and new_log
9f95a23c
TL
2704 vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
2705
7c673cae 2706 // clear the extents from old log file, they are added to new log
94b18763 2707 log_file->fnode.clear_extents();
7c673cae 2708 // swap the log files. New log file is the log file now.
94b18763
FG
2709 new_log->fnode.swap_extents(log_file->fnode);
2710
// rebase the write position: what used to start at old_log_jump_to now
// starts at new_log_jump_to
20effc67
TL
2711 log.writer->pos = log.writer->file->fnode.size =
2712 log.writer->pos - old_log_jump_to + new_log_jump_to;
7c673cae 2713
9f95a23c
TL
2714 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2715
20effc67 2716 log.lock.unlock();
7c673cae 2717
20effc67
TL
2718 old_forbidden = atomic_exchange(&log_forbidden_to_expand, false);
2719 ceph_assert(old_forbidden == true);
2720 //to wake up if someone was in need of expanding log
2721 log_cond.notify_all();
7c673cae 2722
11fdf7f2 2723 // 7. release old space
7c673cae 2724 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
20effc67
TL
2725 {
2726 std::lock_guard dl(dirty.lock);
2727 for (auto& r : old_extents) {
2728 dirty.pending_release[r.bdev].insert(r.offset, r.length);
2729 }
7c673cae
FG
2730 }
2731
2732 // delete the new log, remove from the dirty files list
2733 _close_writer(new_log_writer);
7c673cae
FG
2734 new_log_writer = nullptr;
2735 new_log = nullptr;
2736 log_cond.notify_all();
2737
2738 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2739 logger->inc(l_bluefs_log_compactions);
20effc67
TL
2740
// allow the next compaction to start
2741 old_is_comp = atomic_exchange(&log_is_compacting, false);
2742 ceph_assert(old_is_comp);
7c673cae
FG
2743}
2744
2745void BlueFS::_pad_bl(bufferlist& bl)
2746{
2747 uint64_t partial = bl.length() % super.block_size;
2748 if (partial) {
2749 dout(10) << __func__ << " padding with 0x" << std::hex
2750 << super.block_size - partial << " zeros" << std::dec << dendl;
2751 bl.append_zero(super.block_size - partial);
2752 }
2753}
2754
7c673cae 2755
20effc67
TL
2756// Returns log seq that was live before advance.
2757uint64_t BlueFS::_log_advance_seq()
7c673cae 2758{
20effc67
TL
2759 ceph_assert(ceph_mutex_is_locked(dirty.lock));
2760 ceph_assert(ceph_mutex_is_locked(log.lock));
2761 //acquire new seq
2762 // this will became seq_stable once we write
2763 ceph_assert(dirty.seq_stable < dirty.seq_live);
2764 ceph_assert(log.t.seq == log.seq_live);
2765 uint64_t seq = log.seq_live;
2766 log.t.uuid = super.uuid;
2767
2768 ++dirty.seq_live;
2769 ++log.seq_live;
2770 ceph_assert(dirty.seq_live == log.seq_live);
2771 return seq;
2772}
7c673cae 2773
a8e16298 2774
20effc67
TL
2775// Adds to log.t file modifications mentioned in `dirty.files`.
2776// Note: some bluefs ops may have already been stored in log.t transaction.
2777void BlueFS::_consume_dirty(uint64_t seq)
2778{
2779 ceph_assert(ceph_mutex_is_locked(dirty.lock));
2780 ceph_assert(ceph_mutex_is_locked(log.lock));
7c673cae
FG
2781
2782 // log dirty files
20effc67
TL
2783 // we just incremented log_seq. It is now illegal to add to dirty.files[log_seq]
2784 auto lsi = dirty.files.find(seq);
2785 if (lsi != dirty.files.end()) {
2786 dout(20) << __func__ << " " << lsi->second.size() << " dirty.files" << dendl;
7c673cae 2787 for (auto &f : lsi->second) {
20effc67
TL
2788 // fnode here is protected indirectly
2789 // the only path that adds to dirty.files goes from _fsync()
2790 // _fsync() is executed under writer lock,
2791 // and does not exit until syncing log is done
2792 dout(20) << __func__ << " op_file_update_inc " << f.fnode << dendl;
2793 log.t.op_file_update_inc(f.fnode);
7c673cae
FG
2794 }
2795 }
20effc67 2796}
7c673cae 2797
20effc67
TL
2798// Extends log if its free space is smaller then bluefs_min_log_runway.
2799// Returns space available *BEFORE* adding new space. Signed for additional <0 detection.
2800int64_t BlueFS::_maybe_extend_log()
2801{
2802 ceph_assert(ceph_mutex_is_locked(log.lock));
7c673cae 2803 // allocate some more space (before we run out)?
20effc67
TL
2804 // BTW: this triggers `flush()` in the `page_aligned_appender` of `log.writer`.
2805 int64_t runway = log.writer->file->fnode.get_allocated() -
2806 log.writer->get_effective_write_pos();
7c673cae
FG
2807 if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
2808 dout(10) << __func__ << " allocating more log runway (0x"
2809 << std::hex << runway << std::dec << " remaining)" << dendl;
20effc67
TL
2810 /*
2811 * Usually, when we are low on space in log, we just allocate new extent,
2812 * put update op(log) to log and we are fine.
2813 * Problem - it interferes with log compaction:
2814 * New log produced in compaction will include - as last op - jump into some offset (anchor) of current log.
2815 * It is assumed that log region (anchor - end) will contain all changes made by bluefs since
2816 * full state capture into new log.
2817 * Putting log update into (anchor - end) region is illegal, because any update there must be compatible with
2818 * both logs, but old log is different then new log.
2819 *
2820 * Possible solutions:
2821 * - stall extending log until we finish compacting and switch log (CURRENT)
2822 * - re-run compaction with more runway for old log
2823 * - add OP_FILE_ADDEXT that adds extent; will be compatible with both logs
2824 */
2825 if (log_forbidden_to_expand.load() == true) {
2826 return -EWOULDBLOCK;
7c673cae 2827 }
20effc67 2828 vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
9f95a23c 2829 int r = _allocate(
20effc67 2830 vselector->select_prefer_bdev(log.writer->file->vselector_hint),
9f95a23c 2831 cct->_conf->bluefs_max_log_runway,
20effc67 2832 &log.writer->file->fnode);
11fdf7f2 2833 ceph_assert(r == 0);
20effc67
TL
2834 vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
2835 log.t.op_file_update_inc(log.writer->file->fnode);
7c673cae 2836 }
20effc67
TL
2837 return runway;
2838}
2839
2840void BlueFS::_flush_and_sync_log_core(int64_t runway)
2841{
2842 ceph_assert(ceph_mutex_is_locked(log.lock));
2843 dout(10) << __func__ << " " << log.t << dendl;
7c673cae
FG
2844
2845 bufferlist bl;
11fdf7f2 2846 bl.reserve(super.block_size);
20effc67 2847 encode(log.t, bl);
7c673cae 2848 // pad to block boundary
11fdf7f2
TL
2849 size_t realign = super.block_size - (bl.length() % super.block_size);
2850 if (realign && realign != super.block_size)
2851 bl.append_zero(realign);
2852
7c673cae
FG
2853 logger->inc(l_bluefs_logged_bytes, bl.length());
2854
20effc67 2855 if (true) {
f6b5b4d7 2856 ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss
20effc67 2857 // transaction will not fit extents before growth -> data loss on _replay
f6b5b4d7
TL
2858 }
2859
20effc67 2860 log.writer->append(bl);
7c673cae 2861
20effc67
TL
2862 // prepare log for new transactions
2863 log.t.clear();
2864 log.t.seq = log.seq_live;
7c673cae 2865
20effc67
TL
2866 uint64_t new_data = _flush_special(log.writer);
2867 vselector->add_usage(log.writer->file->vselector_hint, new_data);
2868}
7c673cae 2869
20effc67
TL
2870// Clears dirty.files up to (including) seq_stable.
2871void BlueFS::_clear_dirty_set_stable_D(uint64_t seq)
2872{
2873 std::lock_guard dl(dirty.lock);
7c673cae
FG
2874
2875 // clean dirty files
20effc67
TL
2876 if (seq > dirty.seq_stable) {
2877 dirty.seq_stable = seq;
2878 dout(20) << __func__ << " seq_stable " << dirty.seq_stable << dendl;
2879
2880 // undirty all files that were already streamed to log
2881 auto p = dirty.files.begin();
2882 while (p != dirty.files.end()) {
2883 if (p->first > dirty.seq_stable) {
7c673cae
FG
2884 dout(20) << __func__ << " done cleaning up dirty files" << dendl;
2885 break;
2886 }
2887
2888 auto l = p->second.begin();
2889 while (l != p->second.end()) {
2890 File *file = &*l;
20effc67
TL
2891 ceph_assert(file->dirty_seq <= dirty.seq_stable);
2892 dout(20) << __func__ << " cleaned file " << file->fnode.ino << dendl;
2893 file->dirty_seq = dirty.seq_stable;
7c673cae
FG
2894 p->second.erase(l++);
2895 }
2896
11fdf7f2 2897 ceph_assert(p->second.empty());
20effc67 2898 dirty.files.erase(p++);
7c673cae
FG
2899 }
2900 } else {
20effc67 2901 dout(20) << __func__ << " seq_stable " << dirty.seq_stable
7c673cae
FG
2902 << " already >= out seq " << seq
2903 << ", we lost a race against another log flush, done" << dendl;
2904 }
20effc67 2905}
a8e16298 2906
20effc67
TL
2907void BlueFS::_release_pending_allocations(vector<interval_set<uint64_t>>& to_release)
2908{
a8e16298
TL
2909 for (unsigned i = 0; i < to_release.size(); ++i) {
2910 if (!to_release[i].empty()) {
2911 /* OK, now we have the guarantee alloc[i] won't be null. */
11fdf7f2
TL
2912 int r = 0;
2913 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
2914 r = bdev[i]->queue_discard(to_release[i]);
2915 if (r == 0)
2916 continue;
2917 } else if (cct->_conf->bdev_enable_discard) {
2918 for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
2919 bdev[i]->discard(p.get_start(), p.get_len());
2920 }
2921 }
a8e16298 2922 alloc[i]->release(to_release[i]);
f67539c2
TL
2923 if (is_shared_alloc(i)) {
2924 shared_alloc->bluefs_used -= to_release[i].size();
2925 }
a8e16298
TL
2926 }
2927 }
20effc67
TL
2928}
2929
2930int BlueFS::_flush_and_sync_log_LD(uint64_t want_seq)
2931{
2932 int64_t available_runway;
2933 do {
2934 log.lock.lock();
2935 dirty.lock.lock();
2936 if (want_seq && want_seq <= dirty.seq_stable) {
2937 dout(10) << __func__ << " want_seq " << want_seq << " <= seq_stable "
2938 << dirty.seq_stable << ", done" << dendl;
2939 dirty.lock.unlock();
2940 log.lock.unlock();
2941 return 0;
2942 }
2943
2944 available_runway = _maybe_extend_log();
2945 if (available_runway == -EWOULDBLOCK) {
2946 // we are in need of adding runway, but we are during log-switch from compaction
2947 dirty.lock.unlock();
2948 //instead log.lock.unlock() do move ownership
2949 std::unique_lock<ceph::mutex> ll(log.lock, std::adopt_lock);
2950 while (log_forbidden_to_expand.load()) {
2951 log_cond.wait(ll);
2952 }
2953 } else {
2954 ceph_assert(available_runway >= 0);
2955 }
2956 } while (available_runway < 0);
2957
2958 ceph_assert(want_seq == 0 || want_seq <= dirty.seq_live); // illegal to request seq that was not created yet
2959 uint64_t seq =_log_advance_seq();
2960 _consume_dirty(seq);
2961 vector<interval_set<uint64_t>> to_release(dirty.pending_release.size());
2962 to_release.swap(dirty.pending_release);
2963 dirty.lock.unlock();
2964
2965 _flush_and_sync_log_core(available_runway);
2966 _flush_bdev(log.writer);
2967 logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
2968 //now log.lock is no longer needed
2969 log.lock.unlock();
2970
2971 _clear_dirty_set_stable_D(seq);
2972 _release_pending_allocations(to_release);
a8e16298 2973
7c673cae 2974 _update_logger_stats();
20effc67
TL
2975 return 0;
2976}
2977
2978// Flushes log and immediately adjusts log_writer pos.
2979int BlueFS::_flush_and_sync_log_jump_D(uint64_t jump_to,
2980 int64_t available_runway)
2981{
2982 ceph_assert(ceph_mutex_is_locked(log.lock));
2983
2984 ceph_assert(jump_to);
2985 // we synchronize writing to log, by lock to log.lock
2986
2987 dirty.lock.lock();
2988 uint64_t seq =_log_advance_seq();
2989 _consume_dirty(seq);
2990 vector<interval_set<uint64_t>> to_release(dirty.pending_release.size());
2991 to_release.swap(dirty.pending_release);
2992 dirty.lock.unlock();
2993 _flush_and_sync_log_core(available_runway);
7c673cae 2994
20effc67
TL
2995 dout(10) << __func__ << " jumping log offset from 0x" << std::hex
2996 << log.writer->pos << " -> 0x" << jump_to << std::dec << dendl;
2997 log.writer->pos = jump_to;
2998 vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size);
2999 log.writer->file->fnode.size = jump_to;
3000 vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size);
3001
3002 _flush_bdev(log.writer);
3003
3004 _clear_dirty_set_stable_D(seq);
3005 _release_pending_allocations(to_release);
3006
3007 logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
3008 _update_logger_stats();
7c673cae
FG
3009 return 0;
3010}
3011
f67539c2
TL
3012ceph::bufferlist BlueFS::FileWriter::flush_buffer(
3013 CephContext* const cct,
3014 const bool partial,
3015 const unsigned length,
3016 const bluefs_super_t& super)
3017{
20effc67 3018 ceph_assert(ceph_mutex_is_locked(this->lock) || file->fnode.ino <= 1);
f67539c2
TL
3019 ceph::bufferlist bl;
3020 if (partial) {
3021 tail_block.splice(0, tail_block.length(), &bl);
3022 }
3023 const auto remaining_len = length - bl.length();
3024 buffer.splice(0, remaining_len, &bl);
3025 if (buffer.length()) {
3026 dout(20) << " leaving 0x" << std::hex << buffer.length() << std::dec
3027 << " unflushed" << dendl;
3028 }
3029 if (const unsigned tail = bl.length() & ~super.block_mask(); tail) {
3030 const auto padding_len = super.block_size - tail;
3031 dout(20) << __func__ << " caching tail of 0x"
3032 << std::hex << tail
3033 << " and padding block with 0x" << padding_len
3034 << " buffer.length() " << buffer.length()
3035 << std::dec << dendl;
3036 // We need to go through the `buffer_appender` to get a chance to
3037 // preserve in-memory contiguity and not mess with the alignment.
3038 // Otherwise a costly rebuild could happen in e.g. `KernelDevice`.
3039 buffer_appender.append_zero(padding_len);
3040 buffer.splice(buffer.length() - padding_len, padding_len, &bl);
3041 // Deep copy the tail here. This allows to avoid costlier copy on
3042 // bufferlist rebuild in e.g. `KernelDevice` and minimizes number
3043 // of memory allocations.
3044 // The alternative approach would be to place the entire tail and
3045 // padding on a dedicated, 4 KB long memory chunk. This shouldn't
3046 // trigger the rebuild while still being less expensive.
3047 buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail);
3048 buffer.splice(buffer.length() - tail, tail, &tail_block);
3049 } else {
3050 tail_block.clear();
3051 }
3052 return bl;
3053}
3054
20effc67 3055int BlueFS::_signal_dirty_to_log_D(FileWriter *h)
522d829b 3056{
20effc67
TL
3057 ceph_assert(ceph_mutex_is_locked(h->lock));
3058 std::lock_guard dl(dirty.lock);
522d829b
TL
3059 h->file->fnode.mtime = ceph_clock_now();
3060 ceph_assert(h->file->fnode.ino >= 1);
20effc67
TL
3061 if (h->file->dirty_seq <= dirty.seq_stable) {
3062 h->file->dirty_seq = dirty.seq_live;
3063 dirty.files[h->file->dirty_seq].push_back(*h->file);
3064 dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
522d829b
TL
3065 << " (was clean)" << dendl;
3066 } else {
20effc67 3067 if (h->file->dirty_seq != dirty.seq_live) {
522d829b 3068 // need re-dirty, erase from list first
20effc67
TL
3069 ceph_assert(dirty.files.count(h->file->dirty_seq));
3070 auto it = dirty.files[h->file->dirty_seq].iterator_to(*h->file);
3071 dirty.files[h->file->dirty_seq].erase(it);
3072 h->file->dirty_seq = dirty.seq_live;
3073 dirty.files[h->file->dirty_seq].push_back(*h->file);
3074 dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
522d829b
TL
3075 << " (was " << h->file->dirty_seq << ")" << dendl;
3076 } else {
20effc67 3077 dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
522d829b
TL
3078 << " (unchanged, do nothing) " << dendl;
3079 }
3080 }
3081 return 0;
3082}
3083
20effc67 3084void BlueFS::flush_range(FileWriter *h, uint64_t offset, uint64_t length)/*_WF*/
7c673cae 3085{
20effc67
TL
3086 _maybe_check_vselector_LNF();
3087 std::unique_lock hl(h->lock);
3088 _flush_range_F(h, offset, length);
3089}
3090
3091int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length)
3092{
3093 ceph_assert(ceph_mutex_is_locked(h->lock));
3094 ceph_assert(h->file->num_readers.load() == 0);
3095 ceph_assert(h->file->fnode.ino > 1);
3096
7c673cae
FG
3097 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
3098 << " 0x" << offset << "~" << length << std::dec
3099 << " to " << h->file->fnode << dendl;
f67539c2
TL
3100 if (h->file->deleted) {
3101 dout(10) << __func__ << " deleted, no-op" << dendl;
3102 return 0;
3103 }
7c673cae 3104
20effc67 3105 bool buffered = cct->_conf->bluefs_buffered_io;
7c673cae
FG
3106
3107 if (offset + length <= h->pos)
3108 return 0;
3109 if (offset < h->pos) {
3110 length -= h->pos - offset;
3111 offset = h->pos;
3112 dout(10) << " still need 0x"
3113 << std::hex << offset << "~" << length << std::dec
3114 << dendl;
3115 }
20effc67 3116 std::lock_guard file_lock(h->file->lock);
11fdf7f2 3117 ceph_assert(offset <= h->file->fnode.size);
7c673cae
FG
3118
3119 uint64_t allocated = h->file->fnode.get_allocated();
9f95a23c 3120 vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
7c673cae
FG
3121 // do not bother to dirty the file if we are overwriting
3122 // previously allocated extents.
7c673cae
FG
3123 if (allocated < offset + length) {
3124 // we should never run out of log space here; see the min runway check
3125 // in _flush_and_sync_log.
9f95a23c 3126 int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
7c673cae 3127 offset + length - allocated,
94b18763 3128 &h->file->fnode);
7c673cae
FG
3129 if (r < 0) {
3130 derr << __func__ << " allocated: 0x" << std::hex << allocated
3131 << " offset: 0x" << offset << " length: 0x" << length << std::dec
3132 << dendl;
9f95a23c 3133 vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
11fdf7f2 3134 ceph_abort_msg("bluefs enospc");
7c673cae
FG
3135 return r;
3136 }
522d829b 3137 h->file->is_dirty = true;
7c673cae
FG
3138 }
3139 if (h->file->fnode.size < offset + length) {
3140 h->file->fnode.size = offset + length;
20effc67 3141 h->file->is_dirty = true;
7c673cae 3142 }
20effc67 3143
522d829b 3144 dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl;
20effc67
TL
3145 int res = _flush_data(h, offset, length, buffered);
3146 vselector->add_usage(h->file->vselector_hint, h->file->fnode);
3147 return res;
3148}
7c673cae 3149
20effc67
TL
3150int BlueFS::_flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered)
3151{
3152 if (h->file->fnode.ino > 1) {
3153 ceph_assert(ceph_mutex_is_locked(h->lock));
3154 ceph_assert(ceph_mutex_is_locked(h->file->lock));
3155 }
7c673cae
FG
3156 uint64_t x_off = 0;
3157 auto p = h->file->fnode.seek(offset, &x_off);
11fdf7f2 3158 ceph_assert(p != h->file->fnode.extents.end());
7c673cae
FG
3159 dout(20) << __func__ << " in " << *p << " x_off 0x"
3160 << std::hex << x_off << std::dec << dendl;
3161
3162 unsigned partial = x_off & ~super.block_mask();
7c673cae
FG
3163 if (partial) {
3164 dout(20) << __func__ << " using partial tail 0x"
3165 << std::hex << partial << std::dec << dendl;
7c673cae
FG
3166 x_off -= partial;
3167 offset -= partial;
3168 length += partial;
3169 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
3170 for (auto p : h->iocv) {
3171 if (p) {
3172 p->aio_wait();
3173 }
3174 }
3175 }
7c673cae 3176
f67539c2
TL
3177 auto bl = h->flush_buffer(cct, partial, length, super);
3178 ceph_assert(bl.length() >= length);
9f95a23c 3179 h->pos = offset + length;
f67539c2 3180 length = bl.length();
9f95a23c 3181
7c673cae
FG
3182 switch (h->writer_type) {
3183 case WRITER_WAL:
3184 logger->inc(l_bluefs_bytes_written_wal, length);
3185 break;
3186 case WRITER_SST:
3187 logger->inc(l_bluefs_bytes_written_sst, length);
3188 break;
3189 }
3190
3191 dout(30) << "dump:\n";
3192 bl.hexdump(*_dout);
3193 *_dout << dendl;
3194
7c673cae 3195 uint64_t bloff = 0;
11fdf7f2 3196 uint64_t bytes_written_slow = 0;
7c673cae 3197 while (length > 0) {
11fdf7f2 3198 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
3199 bufferlist t;
3200 t.substr_of(bl, bloff, x_len);
7c673cae 3201 if (cct->_conf->bluefs_sync_write) {
11fdf7f2 3202 bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
7c673cae 3203 } else {
11fdf7f2
TL
3204 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
3205 }
3206 h->dirty_devs[p->bdev] = true;
3207 if (p->bdev == BDEV_SLOW) {
3208 bytes_written_slow += t.length();
7c673cae 3209 }
11fdf7f2 3210
7c673cae
FG
3211 bloff += x_len;
3212 length -= x_len;
3213 ++p;
3214 x_off = 0;
3215 }
f67539c2
TL
3216 if (bytes_written_slow) {
3217 logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
3218 }
7c673cae
FG
3219 for (unsigned i = 0; i < MAX_BDEV; ++i) {
3220 if (bdev[i]) {
11fdf7f2 3221 if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
7c673cae
FG
3222 bdev[i]->aio_submit(h->iocv[i]);
3223 }
3224 }
3225 }
3226 dout(20) << __func__ << " h " << h << " pos now 0x"
3227 << std::hex << h->pos << std::dec << dendl;
3228 return 0;
3229}
3230
11fdf7f2 3231#ifdef HAVE_LIBAIO
7c673cae
FG
3232// we need to retire old completed aios so they don't stick around in
3233// memory indefinitely (along with their bufferlist refs).
3234void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
3235{
3236 for (auto p : h->iocv) {
3237 if (p) {
3238 ls->splice(ls->end(), p->running_aios);
3239 }
3240 }
3241 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
3242}
3243
20effc67 3244void BlueFS::_wait_for_aio(FileWriter *h)
7c673cae
FG
3245{
3246 // NOTE: this is safe to call without a lock, as long as our reference is
3247 // stable.
f67539c2
TL
3248 utime_t start;
3249 lgeneric_subdout(cct, bluefs, 10) << __func__;
3250 start = ceph_clock_now();
3251 *_dout << " " << h << dendl;
7c673cae
FG
3252 for (auto p : h->iocv) {
3253 if (p) {
3254 p->aio_wait();
3255 }
3256 }
11fdf7f2 3257 dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 3258}
11fdf7f2 3259#endif
7c673cae 3260
20effc67
TL
3261void BlueFS::append_try_flush(FileWriter *h, const char* buf, size_t len)/*_WF_LNF_NF_LD_D*/
3262{
3263 bool flushed_sum = false;
3264 {
3265 std::unique_lock hl(h->lock);
3266 size_t max_size = 1ull << 30; // cap to 1GB
3267 while (len > 0) {
3268 bool need_flush = true;
3269 auto l0 = h->get_buffer_length();
3270 if (l0 < max_size) {
3271 size_t l = std::min(len, max_size - l0);
3272 h->append(buf, l);
3273 buf += l;
3274 len -= l;
3275 need_flush = h->get_buffer_length() >= cct->_conf->bluefs_min_flush_size;
3276 }
3277 if (need_flush) {
3278 bool flushed = false;
3279 int r = _flush_F(h, true, &flushed);
3280 ceph_assert(r == 0);
3281 flushed_sum |= flushed;
3282 // make sure we've made any progress with flush hence the
3283 // loop doesn't iterate forever
3284 ceph_assert(h->get_buffer_length() < max_size);
3285 }
3286 }
3287 }
3288 if (flushed_sum) {
3289 _maybe_compact_log_LNF_NF_LD_D();
3290 }
3291}
3292
3293void BlueFS::flush(FileWriter *h, bool force)/*_WF_LNF_NF_LD_D*/
f6b5b4d7
TL
3294{
3295 bool flushed = false;
20effc67
TL
3296 int r;
3297 {
3298 std::unique_lock hl(h->lock);
3299 r = _flush_F(h, force, &flushed);
3300 ceph_assert(r == 0);
3301 }
f6b5b4d7 3302 if (r == 0 && flushed) {
20effc67 3303 _maybe_compact_log_LNF_NF_LD_D();
f6b5b4d7 3304 }
f6b5b4d7
TL
3305}
3306
20effc67 3307int BlueFS::_flush_F(FileWriter *h, bool force, bool *flushed)
7c673cae 3308{
20effc67 3309 ceph_assert(ceph_mutex_is_locked(h->lock));
f67539c2 3310 uint64_t length = h->get_buffer_length();
7c673cae 3311 uint64_t offset = h->pos;
f6b5b4d7
TL
3312 if (flushed) {
3313 *flushed = false;
3314 }
7c673cae
FG
3315 if (!force &&
3316 length < cct->_conf->bluefs_min_flush_size) {
3317 dout(10) << __func__ << " " << h << " ignoring, length " << length
3318 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
3319 << dendl;
3320 return 0;
3321 }
3322 if (length == 0) {
3323 dout(10) << __func__ << " " << h << " no dirty data on "
3324 << h->file->fnode << dendl;
3325 return 0;
3326 }
3327 dout(10) << __func__ << " " << h << " 0x"
3328 << std::hex << offset << "~" << length << std::dec
3329 << " to " << h->file->fnode << dendl;
11fdf7f2 3330 ceph_assert(h->pos <= h->file->fnode.size);
20effc67 3331 int r = _flush_range_F(h, offset, length);
f6b5b4d7
TL
3332 if (flushed) {
3333 *flushed = true;
3334 }
3335 return r;
7c673cae
FG
3336}
3337
20effc67
TL
3338// Flush for bluefs special files.
3339// Does not add extents to h.
3340// Does not mark h as dirty.
3341// we do not need to dirty the log file (or it's compacting
3342// replacement) when the file size changes because replay is
3343// smart enough to discover it on its own.
3344uint64_t BlueFS::_flush_special(FileWriter *h)
3345{
3346 ceph_assert(h->file->fnode.ino <= 1);
3347 uint64_t length = h->get_buffer_length();
3348 uint64_t offset = h->pos;
3349 uint64_t new_data = 0;
3350 ceph_assert(length + offset <= h->file->fnode.get_allocated());
3351 if (h->file->fnode.size < offset + length) {
3352 new_data = offset + length - h->file->fnode.size;
3353 h->file->fnode.size = offset + length;
3354 }
3355 _flush_data(h, offset, length, false);
3356 return new_data;
3357}
3358
3359int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
7c673cae 3360{
20effc67 3361 std::lock_guard hl(h->lock);
7c673cae
FG
3362 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
3363 << " file " << h->file->fnode << dendl;
3364 if (h->file->deleted) {
3365 dout(10) << __func__ << " deleted, no-op" << dendl;
3366 return 0;
3367 }
3368
3369 // we never truncate internal log files
11fdf7f2 3370 ceph_assert(h->file->fnode.ino > 1);
7c673cae 3371
7c673cae
FG
3372 // truncate off unflushed data?
3373 if (h->pos < offset &&
f67539c2 3374 h->pos + h->get_buffer_length() > offset) {
7c673cae
FG
3375 dout(20) << __func__ << " tossing out last " << offset - h->pos
3376 << " unflushed bytes" << dendl;
11fdf7f2 3377 ceph_abort_msg("actually this shouldn't happen");
7c673cae 3378 }
f67539c2 3379 if (h->get_buffer_length()) {
20effc67 3380 int r = _flush_F(h, true);
7c673cae
FG
3381 if (r < 0)
3382 return r;
3383 }
3384 if (offset == h->file->fnode.size) {
3385 return 0; // no-op!
3386 }
3387 if (offset > h->file->fnode.size) {
11fdf7f2 3388 ceph_abort_msg("truncate up not supported");
7c673cae 3389 }
11fdf7f2 3390 ceph_assert(h->file->fnode.size >= offset);
20effc67
TL
3391 _flush_bdev(h);
3392
3393 std::lock_guard ll(log.lock);
9f95a23c 3394 vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
7c673cae 3395 h->file->fnode.size = offset;
9f95a23c 3396 vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
20effc67 3397 log.t.op_file_update_inc(h->file->fnode);
7c673cae
FG
3398 return 0;
3399}
3400
20effc67 3401int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/
7c673cae 3402{
20effc67
TL
3403 _maybe_check_vselector_LNF();
3404 std::unique_lock hl(h->lock);
3405 uint64_t old_dirty_seq = 0;
3406 {
3407 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
3408 int r = _flush_F(h, true);
3409 if (r < 0)
3410 return r;
3411 _flush_bdev(h);
3412 if (h->file->is_dirty) {
3413 _signal_dirty_to_log_D(h);
3414 h->file->is_dirty = false;
3415 }
3416 {
3417 std::lock_guard dl(dirty.lock);
3418 if (dirty.seq_stable < h->file->dirty_seq) {
3419 old_dirty_seq = h->file->dirty_seq;
3420 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
3421 << ") on " << h->file->fnode << ", flushing log" << dendl;
3422 }
3423 }
522d829b 3424 }
7c673cae 3425 if (old_dirty_seq) {
20effc67 3426 _flush_and_sync_log_LD(old_dirty_seq);
7c673cae 3427 }
20effc67
TL
3428 _maybe_compact_log_LNF_NF_LD_D();
3429
7c673cae
FG
3430 return 0;
3431}
3432
20effc67
TL
3433// be careful - either h->file->lock or log.lock must be taken
3434void BlueFS::_flush_bdev(FileWriter *h)
7c673cae 3435{
20effc67
TL
3436 if (h->file->fnode.ino > 1) {
3437 ceph_assert(ceph_mutex_is_locked(h->lock));
3438 } else if (h->file->fnode.ino == 1) {
3439 ceph_assert(ceph_mutex_is_locked(log.lock));
3440 }
11fdf7f2
TL
3441 std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
3442 h->dirty_devs.fill(false);
3443#ifdef HAVE_LIBAIO
7c673cae
FG
3444 if (!cct->_conf->bluefs_sync_write) {
3445 list<aio_t> completed_ios;
3446 _claim_completed_aios(h, &completed_ios);
20effc67 3447 _wait_for_aio(h);
7c673cae 3448 completed_ios.clear();
7c673cae 3449 }
20effc67
TL
3450#endif
3451 _flush_bdev(flush_devs);
7c673cae
FG
3452}
3453
20effc67 3454void BlueFS::_flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
11fdf7f2
TL
3455{
3456 // NOTE: this is safe to call without a lock.
3457 dout(20) << __func__ << dendl;
3458 for (unsigned i = 0; i < MAX_BDEV; i++) {
3459 if (dirty_bdevs[i])
3460 bdev[i]->flush();
3461 }
3462}
3463
20effc67 3464void BlueFS::_flush_bdev()
7c673cae
FG
3465{
3466 // NOTE: this is safe to call without a lock.
3467 dout(20) << __func__ << dendl;
f67539c2
TL
3468 for (unsigned i = 0; i < MAX_BDEV; i++) {
3469 // alloc space from BDEV_SLOW is unexpected.
3470 // So most cases we don't alloc from BDEV_SLOW and so avoiding flush not-used device.
3471 if (bdev[i] && (i != BDEV_SLOW || _get_used(i))) {
3472 bdev[i]->flush();
3473 }
7c673cae
FG
3474 }
3475}
3476
eafe8130
TL
3477const char* BlueFS::get_device_name(unsigned id)
3478{
3479 if (id >= MAX_BDEV) return "BDEV_INV";
3480 const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3481 return names[id];
3482}
3483
11fdf7f2
TL
3484int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
3485 PExtentVector* extents)
3486{
3487 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3488 << " from " << (int)id << dendl;
3489 assert(id < alloc.size());
11fdf7f2
TL
3490 if (!alloc[id]) {
3491 return -ENOENT;
3492 }
3493 extents->reserve(4); // 4 should be (more than) enough for most allocations
f67539c2
TL
3494 int64_t need = round_up_to(len, alloc_size[id]);
3495 int64_t alloc_len = alloc[id]->allocate(need, alloc_size[id], 0, extents);
3496 if (alloc_len < 0 || alloc_len < need) {
eafe8130 3497 if (alloc_len > 0) {
11fdf7f2
TL
3498 alloc[id]->release(*extents);
3499 }
f67539c2
TL
3500 derr << __func__ << " unable to allocate 0x" << std::hex << need
3501 << " on bdev " << (int)id
3502 << ", allocator name " << alloc[id]->get_name()
3503 << ", allocator type " << alloc[id]->get_type()
3504 << ", capacity 0x" << alloc[id]->get_capacity()
3505 << ", block size 0x" << alloc[id]->get_block_size()
20effc67 3506 << ", alloc size 0x" << alloc_size[id]
f67539c2
TL
3507 << ", free 0x" << alloc[id]->get_free()
3508 << ", fragmentation " << alloc[id]->get_fragmentation()
3509 << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
3510 << std::dec << dendl;
3511 alloc[id]->dump();
11fdf7f2
TL
3512 return -ENOSPC;
3513 }
f67539c2
TL
3514 if (is_shared_alloc(id)) {
3515 shared_alloc->bluefs_used += alloc_len;
3516 }
11fdf7f2
TL
3517
3518 return 0;
3519}
3520
7c673cae 3521int BlueFS::_allocate(uint8_t id, uint64_t len,
94b18763 3522 bluefs_fnode_t* node)
7c673cae
FG
3523{
3524 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3525 << " from " << (int)id << dendl;
11fdf7f2 3526 ceph_assert(id < alloc.size());
b32b8144 3527 int64_t alloc_len = 0;
a8e16298 3528 PExtentVector extents;
11fdf7f2 3529 uint64_t hint = 0;
f67539c2 3530 int64_t need = len;
7c673cae 3531 if (alloc[id]) {
f67539c2 3532 need = round_up_to(len, alloc_size[id]);
94b18763
FG
3533 if (!node->extents.empty() && node->extents.back().bdev == id) {
3534 hint = node->extents.back().end();
11fdf7f2 3535 }
b32b8144 3536 extents.reserve(4); // 4 should be (more than) enough for most allocations
f67539c2 3537 alloc_len = alloc[id]->allocate(need, alloc_size[id], hint, &extents);
b32b8144 3538 }
f67539c2
TL
3539 if (alloc_len < 0 || alloc_len < need) {
3540 if (alloc[id]) {
3541 if (alloc_len > 0) {
3542 alloc[id]->release(extents);
3543 }
3544 dout(1) << __func__ << " unable to allocate 0x" << std::hex << need
3545 << " on bdev " << (int)id
3546 << ", allocator name " << alloc[id]->get_name()
3547 << ", allocator type " << alloc[id]->get_type()
3548 << ", capacity 0x" << alloc[id]->get_capacity()
3549 << ", block size 0x" << alloc[id]->get_block_size()
20effc67 3550 << ", alloc size 0x" << alloc_size[id]
f67539c2
TL
3551 << ", free 0x" << alloc[id]->get_free()
3552 << ", fragmentation " << alloc[id]->get_fragmentation()
3553 << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
3554 << std::dec << dendl;
20effc67
TL
3555 } else {
3556 dout(20) << __func__ << " alloc-id not set on index="<< (int)id << " unable to allocate 0x" << std::hex << need
3557 << " on bdev " << (int)id << std::dec << dendl;
b32b8144 3558 }
7c673cae 3559 if (id != BDEV_SLOW) {
f67539c2 3560 dout(20) << __func__ << " fallback to bdev "
20effc67 3561 << (int)id + 1
f67539c2 3562 << dendl;
94b18763 3563 return _allocate(id + 1, len, node);
11fdf7f2 3564 } else {
f67539c2
TL
3565 derr << __func__ << " allocation failed, needed 0x" << std::hex << need
3566 << dendl;
11fdf7f2 3567 }
f67539c2 3568 return -ENOSPC;
11fdf7f2 3569 } else {
f67539c2
TL
3570 uint64_t used = _get_used(id);
3571 if (max_bytes[id] < used) {
3572 logger->set(max_bytes_pcounters[id], used);
3573 max_bytes[id] = used;
3574 }
3575 if (is_shared_alloc(id)) {
3576 shared_alloc->bluefs_used += alloc_len;
11fdf7f2 3577 }
7c673cae
FG
3578 }
3579
3580 for (auto& p : extents) {
94b18763 3581 node->append_extent(bluefs_extent_t(id, p.offset, p.length));
7c673cae
FG
3582 }
3583
3584 return 0;
3585}
3586
20effc67 3587int BlueFS::preallocate(FileRef f, uint64_t off, uint64_t len)/*_LF*/
7c673cae 3588{
20effc67
TL
3589 std::lock_guard ll(log.lock);
3590 std::lock_guard fl(f->lock);
7c673cae
FG
3591 dout(10) << __func__ << " file " << f->fnode << " 0x"
3592 << std::hex << off << "~" << len << std::dec << dendl;
3593 if (f->deleted) {
3594 dout(10) << __func__ << " deleted, no-op" << dendl;
3595 return 0;
3596 }
11fdf7f2 3597 ceph_assert(f->fnode.ino > 1);
7c673cae
FG
3598 uint64_t allocated = f->fnode.get_allocated();
3599 if (off + len > allocated) {
3600 uint64_t want = off + len - allocated;
9f95a23c 3601
20effc67 3602 vselector->sub_usage(f->vselector_hint, f->fnode);
9f95a23c
TL
3603 int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
3604 want,
3605 &f->fnode);
3606 vselector->add_usage(f->vselector_hint, f->fnode);
7c673cae
FG
3607 if (r < 0)
3608 return r;
20effc67
TL
3609
3610 log.t.op_file_update_inc(f->fnode);
7c673cae
FG
3611 }
3612 return 0;
3613}
3614
20effc67 3615void BlueFS::sync_metadata(bool avoid_compact)/*_LNF_NF_LD_D*/
7c673cae 3616{
20effc67
TL
3617 bool can_skip_flush;
3618 {
3619 std::lock_guard ll(log.lock);
3620 std::lock_guard dl(dirty.lock);
3621 can_skip_flush = log.t.empty() && dirty.files.empty();
3622 }
3623 if (can_skip_flush) {
7c673cae 3624 dout(10) << __func__ << " - no pending log events" << dendl;
11fdf7f2 3625 } else {
f67539c2
TL
3626 utime_t start;
3627 lgeneric_subdout(cct, bluefs, 10) << __func__;
3628 start = ceph_clock_now();
3629 *_dout << dendl;
20effc67
TL
3630 _flush_bdev(); // FIXME?
3631 _flush_and_sync_log_LD();
11fdf7f2 3632 dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 3633 }
7c673cae 3634
f6b5b4d7 3635 if (!avoid_compact) {
20effc67 3636 _maybe_compact_log_LNF_NF_LD_D();
f6b5b4d7
TL
3637 }
3638}
3639
20effc67 3640void BlueFS::_maybe_compact_log_LNF_NF_LD_D()
f6b5b4d7
TL
3641{
3642 if (!cct->_conf->bluefs_replay_recovery_disable_compact &&
20effc67 3643 _should_start_compact_log_L_N()) {
7c673cae 3644 if (cct->_conf->bluefs_compact_log_sync) {
20effc67 3645 _compact_log_sync_LNF_LD();
7c673cae 3646 } else {
20effc67 3647 _compact_log_async_LD_LNF_D();
7c673cae
FG
3648 }
3649 }
7c673cae
FG
3650}
3651
3652int BlueFS::open_for_write(
b3b6e05e
TL
3653 std::string_view dirname,
3654 std::string_view filename,
7c673cae 3655 FileWriter **h,
20effc67 3656 bool overwrite)/*_N_LD*/
7c673cae 3657{
20effc67
TL
3658 _maybe_check_vselector_LNF();
3659 FileRef file;
3660 bool create = false;
3661 bool truncate = false;
3662 mempool::bluefs::vector<bluefs_extent_t> pending_release_extents;
3663 {
3664 std::unique_lock nl(nodes.lock);
7c673cae 3665 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
20effc67 3666 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
7c673cae 3667 DirRef dir;
20effc67 3668 if (p == nodes.dir_map.end()) {
7c673cae
FG
3669 // implicitly create the dir
3670 dout(20) << __func__ << " dir " << dirname
3671 << " does not exist" << dendl;
3672 return -ENOENT;
3673 } else {
3674 dir = p->second;
3675 }
3676
7c673cae
FG
3677 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3678 if (q == dir->file_map.end()) {
3679 if (overwrite) {
3680 dout(20) << __func__ << " dir " << dirname << " (" << dir
3681 << ") file " << filename
3682 << " does not exist" << dendl;
3683 return -ENOENT;
3684 }
9f95a23c 3685 file = ceph::make_ref<File>();
7c673cae 3686 file->fnode.ino = ++ino_last;
20effc67 3687 nodes.file_map[ino_last] = file;
b3b6e05e 3688 dir->file_map[string{filename}] = file;
7c673cae
FG
3689 ++file->refs;
3690 create = true;
20effc67 3691 logger->set(l_bluefs_num_files, nodes.file_map.size());
7c673cae
FG
3692 } else {
3693 // overwrite existing file?
3694 file = q->second;
3695 if (overwrite) {
3696 dout(20) << __func__ << " dir " << dirname << " (" << dir
3697 << ") file " << filename
3698 << " already exists, overwrite in place" << dendl;
3699 } else {
3700 dout(20) << __func__ << " dir " << dirname << " (" << dir
3701 << ") file " << filename
3702 << " already exists, truncate + overwrite" << dendl;
9f95a23c 3703 vselector->sub_usage(file->vselector_hint, file->fnode);
7c673cae 3704 file->fnode.size = 0;
20effc67 3705 pending_release_extents.swap(file->fnode.extents);
f6b5b4d7 3706 truncate = true;
94b18763
FG
3707
3708 file->fnode.clear_extents();
7c673cae
FG
3709 }
3710 }
11fdf7f2 3711 ceph_assert(file->fnode.ino > 1);
7c673cae
FG
3712
3713 file->fnode.mtime = ceph_clock_now();
9f95a23c 3714 file->vselector_hint = vselector->get_hint_by_dir(dirname);
f6b5b4d7
TL
3715 if (create || truncate) {
3716 vselector->add_usage(file->vselector_hint, file->fnode); // update file count
3717 }
9f95a23c 3718
7c673cae 3719 dout(20) << __func__ << " mapping " << dirname << "/" << filename
9f95a23c
TL
3720 << " vsel_hint " << file->vselector_hint
3721 << dendl;
20effc67
TL
3722 }
3723 {
3724 std::lock_guard ll(log.lock);
3725 log.t.op_file_update(file->fnode);
3726 if (create)
3727 log.t.op_dir_link(dirname, filename, file->fnode.ino);
3728
3729 std::lock_guard dl(dirty.lock);
3730 for (auto& p : pending_release_extents) {
3731 dirty.pending_release[p.bdev].insert(p.offset, p.length);
3732 }
3733 }
7c673cae
FG
3734 *h = _create_writer(file);
3735
3736 if (boost::algorithm::ends_with(filename, ".log")) {
3737 (*h)->writer_type = BlueFS::WRITER_WAL;
3738 if (logger && !overwrite) {
3739 logger->inc(l_bluefs_files_written_wal);
3740 }
3741 } else if (boost::algorithm::ends_with(filename, ".sst")) {
3742 (*h)->writer_type = BlueFS::WRITER_SST;
3743 if (logger) {
3744 logger->inc(l_bluefs_files_written_sst);
3745 }
3746 }
3747
3748 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3749 return 0;
3750}
3751
3752BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
3753{
3754 FileWriter *w = new FileWriter(f);
3755 for (unsigned i = 0; i < MAX_BDEV; ++i) {
3756 if (bdev[i]) {
3757 w->iocv[i] = new IOContext(cct, NULL);
7c673cae
FG
3758 }
3759 }
3760 return w;
3761}
3762
20effc67 3763void BlueFS::_drain_writer(FileWriter *h)
7c673cae
FG
3764{
3765 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
f67539c2 3766 //h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
7c673cae
FG
3767 for (unsigned i=0; i<MAX_BDEV; ++i) {
3768 if (bdev[i]) {
11fdf7f2
TL
3769 if (h->iocv[i]) {
3770 h->iocv[i]->aio_wait();
20effc67 3771 delete h->iocv[i];
11fdf7f2 3772 }
7c673cae
FG
3773 }
3774 }
522d829b
TL
3775 // sanity
3776 if (h->file->fnode.size >= (1ull << 30)) {
3777 dout(10) << __func__ << " file is unexpectedly large:" << h->file->fnode << dendl;
3778 }
20effc67
TL
3779}
3780
3781void BlueFS::_close_writer(FileWriter *h)
3782{
3783 _drain_writer(h);
3784 delete h;
3785}
3786void BlueFS::close_writer(FileWriter *h)
3787{
3788 {
3789 std::lock_guard l(h->lock);
3790 _drain_writer(h);
3791 }
7c673cae
FG
3792 delete h;
3793}
3794
522d829b
TL
3795uint64_t BlueFS::debug_get_dirty_seq(FileWriter *h)
3796{
20effc67 3797 std::lock_guard l(h->lock);
522d829b
TL
3798 return h->file->dirty_seq;
3799}
3800
3801bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev)
3802{
20effc67 3803 std::lock_guard l(h->lock);
522d829b
TL
3804 return h->dirty_devs[dev];
3805}
3806
7c673cae 3807int BlueFS::open_for_read(
b3b6e05e
TL
3808 std::string_view dirname,
3809 std::string_view filename,
7c673cae 3810 FileReader **h,
20effc67 3811 bool random)/*_N*/
7c673cae 3812{
20effc67
TL
3813 _maybe_check_vselector_LNF();
3814 std::lock_guard nl(nodes.lock);
7c673cae
FG
3815 dout(10) << __func__ << " " << dirname << "/" << filename
3816 << (random ? " (random)":" (sequential)") << dendl;
20effc67
TL
3817 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
3818 if (p == nodes.dir_map.end()) {
7c673cae
FG
3819 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3820 return -ENOENT;
3821 }
3822 DirRef dir = p->second;
3823
3824 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3825 if (q == dir->file_map.end()) {
3826 dout(20) << __func__ << " dir " << dirname << " (" << dir
3827 << ") file " << filename
3828 << " not found" << dendl;
3829 return -ENOENT;
3830 }
3831 File *file = q->second.get();
3832
3833 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
3834 random, false);
3835 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3836 return 0;
3837}
3838
3839int BlueFS::rename(
b3b6e05e 3840 std::string_view old_dirname, std::string_view old_filename,
20effc67 3841 std::string_view new_dirname, std::string_view new_filename)/*_LND*/
7c673cae 3842{
20effc67
TL
3843 std::lock_guard ll(log.lock);
3844 std::lock_guard nl(nodes.lock);
7c673cae
FG
3845 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
3846 << " -> " << new_dirname << "/" << new_filename << dendl;
20effc67
TL
3847 map<string,DirRef>::iterator p = nodes.dir_map.find(old_dirname);
3848 if (p == nodes.dir_map.end()) {
7c673cae
FG
3849 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
3850 return -ENOENT;
3851 }
3852 DirRef old_dir = p->second;
3853 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
3854 if (q == old_dir->file_map.end()) {
3855 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
3856 << ") file " << old_filename
3857 << " not found" << dendl;
3858 return -ENOENT;
3859 }
3860 FileRef file = q->second;
3861
20effc67
TL
3862 p = nodes.dir_map.find(new_dirname);
3863 if (p == nodes.dir_map.end()) {
7c673cae
FG
3864 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
3865 return -ENOENT;
3866 }
3867 DirRef new_dir = p->second;
3868 q = new_dir->file_map.find(new_filename);
3869 if (q != new_dir->file_map.end()) {
3870 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
3871 << ") file " << new_filename
3872 << " already exists, unlinking" << dendl;
11fdf7f2 3873 ceph_assert(q->second != file);
20effc67
TL
3874 log.t.op_dir_unlink(new_dirname, new_filename);
3875 _drop_link_D(q->second);
7c673cae
FG
3876 }
3877
3878 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
3879 << " " << file->fnode << dendl;
3880
b3b6e05e
TL
3881 new_dir->file_map[string{new_filename}] = file;
3882 old_dir->file_map.erase(string{old_filename});
7c673cae 3883
20effc67
TL
3884 log.t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
3885 log.t.op_dir_unlink(old_dirname, old_filename);
7c673cae
FG
3886 return 0;
3887}
3888
20effc67 3889int BlueFS::mkdir(std::string_view dirname)/*_LN*/
7c673cae 3890{
20effc67
TL
3891 std::lock_guard ll(log.lock);
3892 std::lock_guard nl(nodes.lock);
7c673cae 3893 dout(10) << __func__ << " " << dirname << dendl;
20effc67
TL
3894 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
3895 if (p != nodes.dir_map.end()) {
7c673cae
FG
3896 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
3897 return -EEXIST;
3898 }
20effc67
TL
3899 nodes.dir_map[string{dirname}] = ceph::make_ref<Dir>();
3900 log.t.op_dir_create(dirname);
7c673cae
FG
3901 return 0;
3902}
3903
20effc67 3904int BlueFS::rmdir(std::string_view dirname)/*_LN*/
7c673cae 3905{
20effc67
TL
3906 std::lock_guard ll(log.lock);
3907 std::lock_guard nl(nodes.lock);
7c673cae 3908 dout(10) << __func__ << " " << dirname << dendl;
20effc67
TL
3909 auto p = nodes.dir_map.find(dirname);
3910 if (p == nodes.dir_map.end()) {
7c673cae
FG
3911 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
3912 return -ENOENT;
3913 }
3914 DirRef dir = p->second;
3915 if (!dir->file_map.empty()) {
3916 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
3917 return -ENOTEMPTY;
3918 }
20effc67
TL
3919 nodes.dir_map.erase(string{dirname});
3920 log.t.op_dir_remove(dirname);
7c673cae
FG
3921 return 0;
3922}
3923
20effc67 3924bool BlueFS::dir_exists(std::string_view dirname)/*_N*/
7c673cae 3925{
20effc67
TL
3926 std::lock_guard nl(nodes.lock);
3927 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
3928 bool exists = p != nodes.dir_map.end();
7c673cae
FG
3929 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
3930 return exists;
3931}
3932
b3b6e05e 3933int BlueFS::stat(std::string_view dirname, std::string_view filename,
20effc67 3934 uint64_t *size, utime_t *mtime)/*_N*/
7c673cae 3935{
20effc67 3936 std::lock_guard nl(nodes.lock);
7c673cae 3937 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
20effc67
TL
3938 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
3939 if (p == nodes.dir_map.end()) {
7c673cae
FG
3940 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3941 return -ENOENT;
3942 }
3943 DirRef dir = p->second;
3944 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3945 if (q == dir->file_map.end()) {
3946 dout(20) << __func__ << " dir " << dirname << " (" << dir
3947 << ") file " << filename
3948 << " not found" << dendl;
3949 return -ENOENT;
3950 }
3951 File *file = q->second.get();
3952 dout(10) << __func__ << " " << dirname << "/" << filename
3953 << " " << file->fnode << dendl;
3954 if (size)
3955 *size = file->fnode.size;
3956 if (mtime)
3957 *mtime = file->fnode.mtime;
3958 return 0;
3959}
3960
b3b6e05e 3961int BlueFS::lock_file(std::string_view dirname, std::string_view filename,
20effc67 3962 FileLock **plock)/*_LN*/
7c673cae 3963{
20effc67
TL
3964 std::lock_guard ll(log.lock);
3965 std::lock_guard nl(nodes.lock);
7c673cae 3966 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
20effc67
TL
3967 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
3968 if (p == nodes.dir_map.end()) {
7c673cae
FG
3969 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3970 return -ENOENT;
3971 }
3972 DirRef dir = p->second;
b3b6e05e 3973 auto q = dir->file_map.find(filename);
9f95a23c 3974 FileRef file;
7c673cae
FG
3975 if (q == dir->file_map.end()) {
3976 dout(20) << __func__ << " dir " << dirname << " (" << dir
3977 << ") file " << filename
3978 << " not found, creating" << dendl;
9f95a23c 3979 file = ceph::make_ref<File>();
7c673cae
FG
3980 file->fnode.ino = ++ino_last;
3981 file->fnode.mtime = ceph_clock_now();
20effc67 3982 nodes.file_map[ino_last] = file;
b3b6e05e 3983 dir->file_map[string{filename}] = file;
20effc67 3984 logger->set(l_bluefs_num_files, nodes.file_map.size());
7c673cae 3985 ++file->refs;
20effc67
TL
3986 log.t.op_file_update(file->fnode);
3987 log.t.op_dir_link(dirname, filename, file->fnode.ino);
7c673cae 3988 } else {
9f95a23c 3989 file = q->second;
7c673cae
FG
3990 if (file->locked) {
3991 dout(10) << __func__ << " already locked" << dendl;
11fdf7f2 3992 return -ENOLCK;
7c673cae
FG
3993 }
3994 }
3995 file->locked = true;
3996 *plock = new FileLock(file);
3997 dout(10) << __func__ << " locked " << file->fnode
3998 << " with " << *plock << dendl;
3999 return 0;
4000}
4001
20effc67 4002int BlueFS::unlock_file(FileLock *fl)/*_N*/
7c673cae 4003{
20effc67 4004 std::lock_guard nl(nodes.lock);
7c673cae 4005 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
11fdf7f2 4006 ceph_assert(fl->file->locked);
7c673cae
FG
4007 fl->file->locked = false;
4008 delete fl;
4009 return 0;
4010}
4011
20effc67 4012int BlueFS::readdir(std::string_view dirname, vector<string> *ls)/*_N*/
7c673cae 4013{
b3b6e05e
TL
4014 // dirname may contain a trailing /
4015 if (!dirname.empty() && dirname.back() == '/') {
4016 dirname.remove_suffix(1);
4017 }
20effc67 4018 std::lock_guard nl(nodes.lock);
7c673cae
FG
4019 dout(10) << __func__ << " " << dirname << dendl;
4020 if (dirname.empty()) {
4021 // list dirs
20effc67
TL
4022 ls->reserve(nodes.dir_map.size() + 2);
4023 for (auto& q : nodes.dir_map) {
7c673cae
FG
4024 ls->push_back(q.first);
4025 }
4026 } else {
4027 // list files in dir
20effc67
TL
4028 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4029 if (p == nodes.dir_map.end()) {
7c673cae
FG
4030 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4031 return -ENOENT;
4032 }
4033 DirRef dir = p->second;
4034 ls->reserve(dir->file_map.size() + 2);
4035 for (auto& q : dir->file_map) {
4036 ls->push_back(q.first);
4037 }
4038 }
4039 ls->push_back(".");
4040 ls->push_back("..");
4041 return 0;
4042}
4043
20effc67 4044int BlueFS::unlink(std::string_view dirname, std::string_view filename)/*_LND*/
7c673cae 4045{
20effc67
TL
4046 std::lock_guard ll(log.lock);
4047 std::lock_guard nl(nodes.lock);
7c673cae 4048 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
20effc67
TL
4049 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4050 if (p == nodes.dir_map.end()) {
7c673cae
FG
4051 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4052 return -ENOENT;
4053 }
4054 DirRef dir = p->second;
4055 map<string,FileRef>::iterator q = dir->file_map.find(filename);
4056 if (q == dir->file_map.end()) {
4057 dout(20) << __func__ << " file " << dirname << "/" << filename
4058 << " not found" << dendl;
4059 return -ENOENT;
4060 }
4061 FileRef file = q->second;
4062 if (file->locked) {
4063 dout(20) << __func__ << " file " << dirname << "/" << filename
4064 << " is locked" << dendl;
4065 return -EBUSY;
4066 }
b3b6e05e 4067 dir->file_map.erase(string{filename});
20effc67
TL
4068 log.t.op_dir_unlink(dirname, filename);
4069 _drop_link_D(file);
7c673cae
FG
4070 return 0;
4071}
d2e6a577
FG
4072
4073bool BlueFS::wal_is_rotational()
4074{
94b18763
FG
4075 if (bdev[BDEV_WAL]) {
4076 return bdev[BDEV_WAL]->is_rotational();
4077 } else if (bdev[BDEV_DB]) {
4078 return bdev[BDEV_DB]->is_rotational();
4079 }
4080 return bdev[BDEV_SLOW]->is_rotational();
d2e6a577 4081}
9f95a23c 4082
f6b5b4d7
TL
4083/*
4084 Algorithm.
4085 do_replay_recovery_read is used when bluefs log abruptly ends, but it seems that more data should be there.
4086 Idea is to search disk for definiton of extents that will be accompanied with bluefs log in future,
4087 and try if using it will produce healthy bluefs transaction.
4088 We encode already known bluefs log extents and search disk for these bytes.
4089 When we find it, we decode following bytes as extent.
4090 We read that whole extent and then check if merged with existing log part gives a proper bluefs transaction.
4091 */
20effc67 4092int BlueFS::_do_replay_recovery_read(FileReader *log_reader,
f6b5b4d7
TL
4093 size_t replay_pos,
4094 size_t read_offset,
4095 size_t read_len,
4096 bufferlist* bl) {
4097 dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos <<
4098 " needs 0x" << read_offset << "~" << read_len << std::dec << dendl;
4099
4100 bluefs_fnode_t& log_fnode = log_reader->file->fnode;
4101 bufferlist bin_extents;
f67539c2 4102 ::encode(log_fnode.extents, bin_extents);
f6b5b4d7
TL
4103 dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl;
4104
4105 // cannot process if too small to effectively search
4106 ceph_assert(bin_extents.length() >= 32);
4107 bufferlist last_32;
4108 last_32.substr_of(bin_extents, bin_extents.length() - 32, 32);
4109
4110 //read fixed part from replay_pos to end of bluefs_log extents
4111 bufferlist fixed;
4112 uint64_t e_off = 0;
4113 auto e = log_fnode.seek(replay_pos, &e_off);
4114 ceph_assert(e != log_fnode.extents.end());
20effc67
TL
4115 int r = _bdev_read(e->bdev, e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev],
4116 cct->_conf->bluefs_buffered_io);
f6b5b4d7
TL
4117 ceph_assert(r == 0);
4118 //capture dev of last good extent
4119 uint8_t last_e_dev = e->bdev;
4120 uint64_t last_e_off = e->offset;
4121 ++e;
4122 while (e != log_fnode.extents.end()) {
20effc67
TL
4123 r = _bdev_read(e->bdev, e->offset, e->length, &fixed, ioc[e->bdev],
4124 cct->_conf->bluefs_buffered_io);
f6b5b4d7
TL
4125 ceph_assert(r == 0);
4126 last_e_dev = e->bdev;
4127 ++e;
4128 }
4129 ceph_assert(replay_pos + fixed.length() == read_offset);
4130
4131 dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl;
4132
4133 struct compare {
4134 bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const {
4135 if (a.bdev < b.bdev) return true;
4136 if (a.offset < b.offset) return true;
4137 return a.length < b.length;
4138 }
4139 };
4140 std::set<bluefs_extent_t, compare> extents_rejected;
4141 for (int dcnt = 0; dcnt < 3; dcnt++) {
4142 uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV;
4143 if (bdev[dev] == nullptr) continue;
4144 dout(2) << __func__ << " processing " << get_device_name(dev) << dendl;
4145 interval_set<uint64_t> disk_regions;
4146 disk_regions.insert(0, bdev[dev]->get_size());
20effc67 4147 for (auto f : nodes.file_map) {
f6b5b4d7
TL
4148 auto& e = f.second->fnode.extents;
4149 for (auto& p : e) {
4150 if (p.bdev == dev) {
4151 disk_regions.erase(p.offset, p.length);
4152 }
4153 }
4154 }
4155 size_t disk_regions_count = disk_regions.num_intervals();
4156 dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl;
4157
4158 auto reg = disk_regions.lower_bound(last_e_off);
4159 //for all except first, start from beginning
4160 last_e_off = 0;
4161 if (reg == disk_regions.end()) {
4162 reg = disk_regions.begin();
4163 }
4164 const uint64_t chunk_size = 4 * 1024 * 1024;
4165 const uint64_t page_size = 4096;
4166 const uint64_t max_extent_size = 16;
4167 uint64_t overlay_size = last_32.length() + max_extent_size;
4168 for (size_t i = 0; i < disk_regions_count; reg++, i++) {
4169 if (reg == disk_regions.end()) {
4170 reg = disk_regions.begin();
4171 }
4172 uint64_t pos = reg.get_start();
4173 uint64_t len = reg.get_len();
4174
4175 std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]};
4176 char* raw_data = raw_data_p.get();
4177 memset(raw_data, 0, page_size);
4178
4179 while (len > last_32.length()) {
4180 uint64_t chunk_len = len > chunk_size ? chunk_size : len;
4181 dout(5) << __func__ << " read "
20effc67
TL
4182 << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len
4183 << std::dec << dendl;
4184 r = _bdev_read_random(dev, pos, chunk_len,
4185 raw_data + page_size, cct->_conf->bluefs_buffered_io);
f6b5b4d7
TL
4186 ceph_assert(r == 0);
4187
4188 //search for fixed_last_32
4189 char* chunk_b = raw_data + page_size;
4190 char* chunk_e = chunk_b + chunk_len;
4191
4192 char* search_b = chunk_b - overlay_size;
4193 char* search_e = chunk_e;
4194
4195 for (char* sp = search_b; ; sp += last_32.length()) {
4196 sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length());
4197 if (sp == nullptr) {
4198 break;
4199 }
4200
4201 char* n = sp + last_32.length();
4202 dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl;
4203 bufferlist test;
4204 test.append(n, std::min<size_t>(max_extent_size, chunk_e - n));
4205 bluefs_extent_t ne;
4206 try {
4207 bufferlist::const_iterator p = test.begin();
f67539c2 4208 ::decode(ne, p);
f6b5b4d7
TL
4209 } catch (buffer::error& e) {
4210 continue;
4211 }
4212 if (extents_rejected.count(ne) != 0) {
4213 dout(5) << __func__ << " extent " << ne << " already refected" <<dendl;
4214 continue;
4215 }
4216 //insert as rejected already. if we succeed, it wouldn't make difference.
4217 extents_rejected.insert(ne);
4218
4219 if (ne.bdev >= MAX_BDEV ||
4220 bdev[ne.bdev] == nullptr ||
4221 ne.length > 16 * 1024 * 1024 ||
4222 (ne.length & 4095) != 0 ||
4223 ne.offset + ne.length > bdev[ne.bdev]->get_size() ||
4224 (ne.offset & 4095) != 0) {
4225 dout(5) << __func__ << " refusing extent " << ne << dendl;
4226 continue;
4227 }
4228 dout(5) << __func__ << " checking extent " << ne << dendl;
4229
4230 //read candidate extent - whole
4231 bufferlist candidate;
4232 candidate.append(fixed);
20effc67
TL
4233 r = _bdev_read(ne.bdev, ne.offset, ne.length, &candidate, ioc[ne.bdev],
4234 cct->_conf->bluefs_buffered_io);
f6b5b4d7
TL
4235 ceph_assert(r == 0);
4236
4237 //check if transaction & crc is ok
4238 bluefs_transaction_t t;
4239 try {
f67539c2
TL
4240 bufferlist::const_iterator p = candidate.begin();
4241 ::decode(t, p);
f6b5b4d7
TL
4242 }
4243 catch (buffer::error& e) {
4244 dout(5) << __func__ << " failed match" << dendl;
4245 continue;
4246 }
4247
4248 //success, it seems a probable candidate
4249 uint64_t l = std::min<uint64_t>(ne.length, read_len);
4250 //trim to required size
4251 bufferlist requested_read;
4252 requested_read.substr_of(candidate, fixed.length(), l);
4253 bl->append(requested_read);
4254 dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl;
4255 log_fnode.append_extent(ne);
4256 log_fnode.recalc_allocated();
4257 log_reader->buf.pos += l;
4258 return l;
4259 }
4260 //save overlay for next search
4261 memcpy(search_b, chunk_e - overlay_size, overlay_size);
4262 pos += chunk_len;
4263 len -= chunk_len;
4264 }
4265 }
4266 }
4267 return 0;
4268}
4269
20effc67
TL
4270void BlueFS::_check_vselector_LNF() {
4271 BlueFSVolumeSelector* vs = vselector->clone_empty();
4272 if (!vs) {
4273 return;
4274 }
4275 std::lock_guard ll(log.lock);
4276 std::lock_guard nl(nodes.lock);
4277 // Checking vselector is under log, nodes and file(s) locks,
4278 // so any modification of vselector must be under at least one of those locks.
4279 for (auto& f : nodes.file_map) {
4280 f.second->lock.lock();
4281 vs->add_usage(f.second->vselector_hint, f.second->fnode);
4282 }
4283 bool res = vselector->compare(vs);
4284 if (!res) {
4285 dout(0) << "Current:";
4286 vselector->dump(*_dout);
4287 *_dout << dendl;
4288 dout(0) << "Expected:";
4289 vs->dump(*_dout);
4290 *_dout << dendl;
4291 }
4292 ceph_assert(res);
4293 for (auto& f : nodes.file_map) {
4294 f.second->lock.unlock();
4295 }
4296 delete vs;
4297}
4298
f67539c2 4299size_t BlueFS::probe_alloc_avail(int dev, uint64_t alloc_size)
9f95a23c 4300{
f67539c2
TL
4301 size_t total = 0;
4302 auto iterated_allocation = [&](size_t off, size_t len) {
4303 //only count in size that is alloc_size aligned
4304 size_t dist_to_alignment;
4305 size_t offset_in_block = off & (alloc_size - 1);
4306 if (offset_in_block == 0)
4307 dist_to_alignment = 0;
4308 else
4309 dist_to_alignment = alloc_size - offset_in_block;
4310 if (dist_to_alignment >= len)
4311 return;
4312 len -= dist_to_alignment;
4313 total += p2align(len, alloc_size);
4314 };
4315 if (alloc[dev]) {
4316 alloc[dev]->dump(iterated_allocation);
9f95a23c 4317 }
f67539c2 4318 return total;
9f95a23c 4319}
9f95a23c
TL
4320// ===============================================
4321// OriginalVolumeSelector
4322
f6b5b4d7
TL
4323void* OriginalVolumeSelector::get_hint_for_log() const {
4324 return reinterpret_cast<void*>(BlueFS::BDEV_WAL);
9f95a23c 4325}
b3b6e05e 4326void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
9f95a23c
TL
4327 uint8_t res = BlueFS::BDEV_DB;
4328 if (dirname.length() > 5) {
4329 // the "db.slow" and "db.wal" directory names are hard-coded at
4330 // match up with bluestore. the slow device is always the second
4331 // one (when a dedicated block.db device is present and used at
4332 // bdev 0). the wal device is always last.
a4b75251 4333 if (boost::algorithm::ends_with(dirname, ".slow") && slow_total) {
9f95a23c 4334 res = BlueFS::BDEV_SLOW;
a4b75251 4335 } else if (boost::algorithm::ends_with(dirname, ".wal") && wal_total) {
9f95a23c
TL
4336 res = BlueFS::BDEV_WAL;
4337 }
4338 }
4339 return reinterpret_cast<void*>(res);
4340}
4341
4342uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
4343{
4344 return (uint8_t)(reinterpret_cast<uint64_t>(hint));
4345}
4346
4347void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
4348{
4349 res.emplace_back(base, db_total);
522d829b
TL
4350 res.emplace_back(base + ".slow",
4351 slow_total ? slow_total : db_total); // use fake non-zero value if needed to
4352 // avoid RocksDB complains
9f95a23c
TL
4353}
4354
4355#undef dout_prefix
4356#define dout_prefix *_dout << "OriginalVolumeSelector: "
4357
4358void OriginalVolumeSelector::dump(ostream& sout) {
4359 sout<< "wal_total:" << wal_total
4360 << ", db_total:" << db_total
4361 << ", slow_total:" << slow_total
4362 << std::endl;
4363}
f67539c2
TL
4364
4365// ===============================================
4366// FitToFastVolumeSelector
4367
4368void FitToFastVolumeSelector::get_paths(const std::string& base, paths& res) const {
4369 res.emplace_back(base, 1); // size of the last db_path has no effect
4370}