]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/bluestore/BlueFS.cc
70fa142b717743a6eda67f907ae69279feae6b40
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "boost/algorithm/string.hpp"
5 #include "bluestore_common.h"
6 #include "BlueFS.h"
7
8 #include "common/debug.h"
9 #include "common/errno.h"
10 #include "common/perf_counters.h"
11 #include "Allocator.h"
12 #include "include/ceph_assert.h"
13 #include "common/admin_socket.h"
14
15 #define dout_context cct
16 #define dout_subsys ceph_subsys_bluefs
17 #undef dout_prefix
18 #define dout_prefix *_dout << "bluefs "
19 using TOPNSPC::common::cmd_getval;
20
21 using std::byte;
22 using std::list;
23 using std::make_pair;
24 using std::map;
25 using std::ostream;
26 using std::pair;
27 using std::set;
28 using std::string;
29 using std::to_string;
30 using std::vector;
31
32 using ceph::bufferlist;
33 using ceph::decode;
34 using ceph::encode;
35 using ceph::Formatter;
36
37
38 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
39 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
40 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer);
41 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
42 bluefs_file_reader_buffer, bluefs_file_reader);
43 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader);
44 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
45
46 static void wal_discard_cb(void *priv, void* priv2) {
47 BlueFS *bluefs = static_cast<BlueFS*>(priv);
48 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
49 bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
50 }
51
52 static void db_discard_cb(void *priv, void* priv2) {
53 BlueFS *bluefs = static_cast<BlueFS*>(priv);
54 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
55 bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
56 }
57
58 static void slow_discard_cb(void *priv, void* priv2) {
59 BlueFS *bluefs = static_cast<BlueFS*>(priv);
60 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
61 bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
62 }
63
64 class BlueFS::SocketHook : public AdminSocketHook {
65 BlueFS* bluefs;
66 public:
67 static BlueFS::SocketHook* create(BlueFS* bluefs)
68 {
69 BlueFS::SocketHook* hook = nullptr;
70 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
71 if (admin_socket) {
72 hook = new BlueFS::SocketHook(bluefs);
73 int r = admin_socket->register_command("bluestore bluefs device info "
74 "name=alloc_size,type=CephInt,req=false",
75 hook,
76 "Shows space report for bluefs devices. "
77 "This also includes an estimation for space "
78 "available to bluefs at main device. "
79 "alloc_size, if set, specifies the custom bluefs "
80 "allocation unit size for the estimation above.");
81 if (r != 0) {
82 ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
83 delete hook;
84 hook = nullptr;
85 } else {
86 r = admin_socket->register_command("bluefs stats",
87 hook,
88 "Dump internal statistics for bluefs."
89 "");
90 ceph_assert(r == 0);
91 r = admin_socket->register_command("bluefs files list", hook,
92 "print files in bluefs");
93 ceph_assert(r == 0);
94 r = admin_socket->register_command("bluefs debug_inject_read_zeros", hook,
95 "Injects 8K zeros into next BlueFS read. Debug only.");
96 ceph_assert(r == 0);
97 }
98 }
99 return hook;
100 }
101
102 ~SocketHook() {
103 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
104 admin_socket->unregister_commands(this);
105 }
106 private:
107 SocketHook(BlueFS* bluefs) :
108 bluefs(bluefs) {}
109 int call(std::string_view command, const cmdmap_t& cmdmap,
110 Formatter *f,
111 std::ostream& errss,
112 bufferlist& out) override {
113 if (command == "bluestore bluefs device info") {
114 int64_t alloc_size = 0;
115 cmd_getval(cmdmap, "alloc_size", alloc_size);
116 if ((alloc_size & (alloc_size - 1)) != 0) {
117 errss << "Invalid allocation size:'" << alloc_size << std::endl;
118 return -EINVAL;
119 }
120 if (alloc_size == 0)
121 alloc_size = bluefs->cct->_conf->bluefs_shared_alloc_size;
122 f->open_object_section("bluefs_device_info");
123 for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
124 if (bluefs->bdev[dev]) {
125 f->open_object_section("dev");
126 f->dump_string("device", bluefs->get_device_name(dev));
127 ceph_assert(bluefs->alloc[dev]);
128 auto total = bluefs->get_total(dev);
129 auto free = bluefs->get_free(dev);
130 auto used = bluefs->get_used(dev);
131
132 f->dump_int("total", total);
133 f->dump_int("free", free);
134 f->dump_int("bluefs_used", used);
135 if (bluefs->is_shared_alloc(dev)) {
136 size_t avail = bluefs->probe_alloc_avail(dev, alloc_size);
137 f->dump_int("bluefs max available", avail);
138 }
139 f->close_section();
140 }
141 }
142
143 f->close_section();
144 } else if (command == "bluefs stats") {
145 std::stringstream ss;
146 bluefs->dump_block_extents(ss);
147 bluefs->dump_volume_selector(ss);
148 out.append(ss);
149 } else if (command == "bluefs files list") {
150 const char* devnames[3] = {"wal","db","slow"};
151 std::lock_guard l(bluefs->nodes.lock);
152 f->open_array_section("files");
153 for (auto &d : bluefs->nodes.dir_map) {
154 std::string dir = d.first;
155 for (auto &r : d.second->file_map) {
156 f->open_object_section("file");
157 f->dump_string("name", (dir + "/" + r.first).c_str());
158 std::vector<size_t> sizes;
159 sizes.resize(bluefs->bdev.size());
160 for(auto& i : r.second->fnode.extents) {
161 sizes[i.bdev] += i.length;
162 }
163 for (size_t i = 0; i < sizes.size(); i++) {
164 if (sizes[i]>0) {
165 if (i < sizeof(devnames) / sizeof(*devnames))
166 f->dump_int(devnames[i], sizes[i]);
167 else
168 f->dump_int(("dev-"+to_string(i)).c_str(), sizes[i]);
169 }
170 }
171 f->close_section();
172 }
173 }
174 f->close_section();
175 f->flush(out);
176 } else if (command == "bluefs debug_inject_read_zeros") {
177 bluefs->inject_read_zeros++;
178 } else {
179 errss << "Invalid command" << std::endl;
180 return -ENOSYS;
181 }
182 return 0;
183 }
184 };
185
186 BlueFS::BlueFS(CephContext* cct)
187 : cct(cct),
188 bdev(MAX_BDEV),
189 ioc(MAX_BDEV),
190 block_reserved(MAX_BDEV),
191 alloc(MAX_BDEV),
192 alloc_size(MAX_BDEV, 0)
193 {
194 dirty.pending_release.resize(MAX_BDEV);
195 discard_cb[BDEV_WAL] = wal_discard_cb;
196 discard_cb[BDEV_DB] = db_discard_cb;
197 discard_cb[BDEV_SLOW] = slow_discard_cb;
198 asok_hook = SocketHook::create(this);
199 }
200
201 BlueFS::~BlueFS()
202 {
203 delete asok_hook;
204 for (auto p : ioc) {
205 if (p)
206 p->aio_wait();
207 }
208 for (auto p : bdev) {
209 if (p) {
210 p->close();
211 delete p;
212 }
213 }
214 for (auto p : ioc) {
215 delete p;
216 }
217 }
218
219 void BlueFS::_init_logger()
220 {
221 PerfCountersBuilder b(cct, "bluefs",
222 l_bluefs_first, l_bluefs_last);
223 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
224 "Total bytes (main db device)",
225 "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
226 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
227 "Used bytes (main db device)",
228 "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
229 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
230 "Total bytes (wal device)",
231 "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
232 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
233 "Used bytes (wal device)",
234 "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
235 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
236 "Total bytes (slow device)",
237 "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
238 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
239 "Used bytes (slow device)",
240 "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
241 b.add_u64(l_bluefs_num_files, "num_files", "File count",
242 "f", PerfCountersBuilder::PRIO_USEFUL);
243 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
244 "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
245 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
246 "Compactions of the metadata log");
247 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
248 "Bytes written to the metadata log",
249 "j",
250 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
251 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
252 "Files written to WAL");
253 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
254 "Files written to SSTs");
255 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
256 "Bytes written to WAL",
257 "walb",
258 PerfCountersBuilder::PRIO_CRITICAL);
259 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
260 "Bytes written to SSTs",
261 "sstb",
262 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
263 b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
264 "Bytes written to WAL/SSTs at slow device",
265 "slwb",
266 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
267 b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
268 "Maximum bytes allocated from WAL",
269 "mxwb",
270 PerfCountersBuilder::PRIO_INTERESTING,
271 unit_t(UNIT_BYTES));
272 b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
273 "Maximum bytes allocated from DB",
274 "mxdb",
275 PerfCountersBuilder::PRIO_INTERESTING,
276 unit_t(UNIT_BYTES));
277 b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
278 "Maximum bytes allocated from SLOW",
279 "mxwb",
280 PerfCountersBuilder::PRIO_INTERESTING,
281 unit_t(UNIT_BYTES));
282 b.add_u64_counter(l_bluefs_main_alloc_unit, "alloc_unit_main",
283 "Allocation unit size (in bytes) for primary/shared device",
284 "aumb",
285 PerfCountersBuilder::PRIO_CRITICAL,
286 unit_t(UNIT_BYTES));
287 b.add_u64_counter(l_bluefs_db_alloc_unit, "alloc_unit_db",
288 "Allocation unit size (in bytes) for standalone DB device",
289 "audb",
290 PerfCountersBuilder::PRIO_CRITICAL,
291 unit_t(UNIT_BYTES));
292 b.add_u64_counter(l_bluefs_wal_alloc_unit, "alloc_unit_wal",
293 "Allocation unit size (in bytes) for standalone WAL device",
294 "auwb",
295 PerfCountersBuilder::PRIO_CRITICAL,
296 unit_t(UNIT_BYTES));
297 b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
298 "random read requests processed",
299 NULL,
300 PerfCountersBuilder::PRIO_USEFUL);
301 b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
302 "Bytes requested in random read mode",
303 NULL,
304 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
305 b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
306 "random reads requests going to disk",
307 NULL,
308 PerfCountersBuilder::PRIO_USEFUL);
309 b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
310 "Bytes read from disk in random read mode",
311 "rrb",
312 PerfCountersBuilder::PRIO_INTERESTING,
313 unit_t(UNIT_BYTES));
314 b.add_u64_counter(l_bluefs_read_random_disk_bytes_wal, "read_random_disk_bytes_wal",
315 "random reads requests going to WAL disk",
316 NULL,
317 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
318 b.add_u64_counter(l_bluefs_read_random_disk_bytes_db, "read_random_disk_bytes_db",
319 "random reads requests going to DB disk",
320 NULL,
321 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
322 b.add_u64_counter(l_bluefs_read_random_disk_bytes_slow, "read_random_disk_bytes_slow",
323 "random reads requests going to main disk",
324 "rrsb",
325 PerfCountersBuilder::PRIO_INTERESTING,
326 unit_t(UNIT_BYTES));
327 b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
328 "random read requests processed using prefetch buffer",
329 NULL,
330 PerfCountersBuilder::PRIO_USEFUL);
331 b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
332 "Bytes read from prefetch buffer in random read mode",
333 NULL,
334 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
335 b.add_u64_counter(l_bluefs_read_count, "read_count",
336 "buffered read requests processed",
337 NULL,
338 PerfCountersBuilder::PRIO_USEFUL);
339 b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
340 "Bytes requested in buffered read mode",
341 NULL,
342 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
343 b.add_u64_counter(l_bluefs_read_disk_count, "read_disk_count",
344 "buffered reads requests going to disk",
345 NULL,
346 PerfCountersBuilder::PRIO_USEFUL);
347 b.add_u64_counter(l_bluefs_read_disk_bytes, "read_disk_bytes",
348 "Bytes read in buffered mode from disk",
349 "rb",
350 PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
351 b.add_u64_counter(l_bluefs_read_disk_bytes_wal, "read_disk_bytes_wal",
352 "reads requests going to WAL disk",
353 NULL,
354 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
355 b.add_u64_counter(l_bluefs_read_disk_bytes_db, "read_disk_bytes_db",
356 "reads requests going to DB disk",
357 NULL,
358 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
359 b.add_u64_counter(l_bluefs_read_disk_bytes_slow, "read_disk_bytes_slow",
360 "reads requests going to main disk",
361 "rsb",
362 PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
363 b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
364 "prefetch read requests processed",
365 NULL,
366 PerfCountersBuilder::PRIO_USEFUL);
367 b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
368 "Bytes requested in prefetch read mode",
369 NULL,
370 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
371 b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate",
372 "How many times bluefs read found page with all 0s");
373 b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors",
374 "How many times bluefs read found transient page with all 0s");
375
376 logger = b.create_perf_counters();
377 cct->get_perfcounters_collection()->add(logger);
378 }
379
380 void BlueFS::_shutdown_logger()
381 {
382 cct->get_perfcounters_collection()->remove(logger);
383 delete logger;
384 }
385
386 void BlueFS::_update_logger_stats()
387 {
388 if (alloc[BDEV_WAL]) {
389 logger->set(l_bluefs_wal_total_bytes, _get_total(BDEV_WAL));
390 logger->set(l_bluefs_wal_used_bytes, _get_used(BDEV_WAL));
391 }
392 if (alloc[BDEV_DB]) {
393 logger->set(l_bluefs_db_total_bytes, _get_total(BDEV_DB));
394 logger->set(l_bluefs_db_used_bytes, _get_used(BDEV_DB));
395 }
396 if (alloc[BDEV_SLOW]) {
397 logger->set(l_bluefs_slow_total_bytes, _get_total(BDEV_SLOW));
398 logger->set(l_bluefs_slow_used_bytes, _get_used(BDEV_SLOW));
399 }
400 }
401
402 int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
403 uint64_t reserved,
404 bluefs_shared_alloc_context_t* _shared_alloc)
405 {
406 dout(10) << __func__ << " bdev " << id << " path " << path << " "
407 << reserved << dendl;
408 ceph_assert(id < bdev.size());
409 ceph_assert(bdev[id] == NULL);
410 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
411 discard_cb[id], static_cast<void*>(this));
412 block_reserved[id] = reserved;
413 if (_shared_alloc) {
414 b->set_no_exclusive_lock();
415 }
416 int r = b->open(path);
417 if (r < 0) {
418 delete b;
419 return r;
420 }
421 if (trim) {
422 b->discard(0, b->get_size());
423 }
424
425 dout(1) << __func__ << " bdev " << id << " path " << path
426 << " size " << byte_u_t(b->get_size()) << dendl;
427 bdev[id] = b;
428 ioc[id] = new IOContext(cct, NULL);
429 if (_shared_alloc) {
430 ceph_assert(!shared_alloc);
431 shared_alloc = _shared_alloc;
432 alloc[id] = shared_alloc->a;
433 shared_alloc_id = id;
434 }
435 return 0;
436 }
437
438 bool BlueFS::bdev_support_label(unsigned id)
439 {
440 ceph_assert(id < bdev.size());
441 ceph_assert(bdev[id]);
442 return bdev[id]->supported_bdev_label();
443 }
444
445 uint64_t BlueFS::get_block_device_size(unsigned id) const
446 {
447 if (id < bdev.size() && bdev[id])
448 return bdev[id]->get_size();
449 return 0;
450 }
451
452 void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
453 {
454 dout(10) << __func__ << " bdev " << id << dendl;
455 ceph_assert(alloc[id]);
456 alloc[id]->release(to_release);
457 if (is_shared_alloc(id)) {
458 shared_alloc->bluefs_used -= to_release.size();
459 }
460 }
461
462 uint64_t BlueFS::get_used()
463 {
464 uint64_t used = 0;
465 for (unsigned id = 0; id < MAX_BDEV; ++id) {
466 used += _get_used(id);
467 }
468 return used;
469 }
470
471 uint64_t BlueFS::_get_used(unsigned id) const
472 {
473 uint64_t used = 0;
474 if (!alloc[id])
475 return 0;
476
477 if (is_shared_alloc(id)) {
478 used = shared_alloc->bluefs_used;
479 } else {
480 used = _get_total(id) - alloc[id]->get_free();
481 }
482 return used;
483 }
484
485 uint64_t BlueFS::get_used(unsigned id)
486 {
487 ceph_assert(id < alloc.size());
488 ceph_assert(alloc[id]);
489 return _get_used(id);
490 }
491
492 uint64_t BlueFS::_get_total(unsigned id) const
493 {
494 ceph_assert(id < bdev.size());
495 ceph_assert(id < block_reserved.size());
496 return get_block_device_size(id) - block_reserved[id];
497 }
498
499 uint64_t BlueFS::get_total(unsigned id)
500 {
501 return _get_total(id);
502 }
503
504 uint64_t BlueFS::get_free(unsigned id)
505 {
506 ceph_assert(id < alloc.size());
507 return alloc[id]->get_free();
508 }
509
510 void BlueFS::dump_perf_counters(Formatter *f)
511 {
512 f->open_object_section("bluefs_perf_counters");
513 logger->dump_formatted(f,0);
514 f->close_section();
515 }
516
517 void BlueFS::dump_block_extents(ostream& out)
518 {
519 for (unsigned i = 0; i < MAX_BDEV; ++i) {
520 if (!bdev[i]) {
521 continue;
522 }
523 auto total = get_total(i);
524 auto free = get_free(i);
525
526 out << i << " : device size 0x" << std::hex << total
527 << " : using 0x" << total - free
528 << std::dec << "(" << byte_u_t(total - free) << ")";
529 out << "\n";
530 }
531 }
532
533 int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
534 {
535 std::lock_guard nl(nodes.lock);
536 dout(10) << __func__ << " bdev " << id << dendl;
537 ceph_assert(id < alloc.size());
538 for (auto& p : nodes.file_map) {
539 for (auto& q : p.second->fnode.extents) {
540 if (q.bdev == id) {
541 extents->insert(q.offset, q.length);
542 }
543 }
544 }
545 return 0;
546 }
547
548 int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
549 {
550 dout(1) << __func__
551 << " osd_uuid " << osd_uuid
552 << dendl;
553
554 // set volume selector if not provided before/outside
555 if (vselector == nullptr) {
556 vselector.reset(
557 new OriginalVolumeSelector(
558 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
559 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
560 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
561 }
562
563 _init_logger();
564 _init_alloc();
565
566 super.version = 1;
567 super.block_size = bdev[BDEV_DB]->get_block_size();
568 super.osd_uuid = osd_uuid;
569 super.uuid.generate_random();
570 dout(1) << __func__ << " uuid " << super.uuid << dendl;
571
572 // init log
573 FileRef log_file = ceph::make_ref<File>();
574 log_file->fnode.ino = 1;
575 log_file->vselector_hint = vselector->get_hint_for_log();
576 int r = _allocate(
577 vselector->select_prefer_bdev(log_file->vselector_hint),
578 cct->_conf->bluefs_max_log_runway,
579 &log_file->fnode);
580 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
581 ceph_assert(r == 0);
582 log.writer = _create_writer(log_file);
583
584 // initial txn
585 ceph_assert(log.seq_live == 1);
586 log.t.seq = 1;
587 log.t.op_init();
588 _flush_and_sync_log_LD();
589
590 // write supers
591 super.log_fnode = log_file->fnode;
592 super.memorized_layout = layout;
593 _write_super(BDEV_DB);
594 _flush_bdev();
595
596 // clean up
597 super = bluefs_super_t();
598 _close_writer(log.writer);
599 log.writer = NULL;
600 vselector.reset(nullptr);
601 _stop_alloc();
602 _shutdown_logger();
603 if (shared_alloc) {
604 ceph_assert(shared_alloc->need_init);
605 shared_alloc->need_init = false;
606 }
607
608 dout(10) << __func__ << " success" << dendl;
609 return 0;
610 }
611
612 void BlueFS::_init_alloc()
613 {
614 dout(20) << __func__ << dendl;
615
616 size_t wal_alloc_size = 0;
617 if (bdev[BDEV_WAL]) {
618 wal_alloc_size = cct->_conf->bluefs_alloc_size;
619 alloc_size[BDEV_WAL] = wal_alloc_size;
620 }
621 logger->set(l_bluefs_wal_alloc_unit, wal_alloc_size);
622
623 if (bdev[BDEV_SLOW]) {
624 alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
625 alloc_size[BDEV_SLOW] = cct->_conf->bluefs_shared_alloc_size;
626 logger->set(l_bluefs_db_alloc_unit, cct->_conf->bluefs_alloc_size);
627 logger->set(l_bluefs_main_alloc_unit, cct->_conf->bluefs_shared_alloc_size);
628 } else {
629 alloc_size[BDEV_DB] = cct->_conf->bluefs_shared_alloc_size;
630 logger->set(l_bluefs_main_alloc_unit, 0);
631 logger->set(l_bluefs_db_alloc_unit, cct->_conf->bluefs_shared_alloc_size);
632 }
633 // new wal and db devices are never shared
634 if (bdev[BDEV_NEWWAL]) {
635 alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
636 }
637 if (bdev[BDEV_NEWDB]) {
638 alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
639 }
640
641 for (unsigned id = 0; id < bdev.size(); ++id) {
642 if (!bdev[id]) {
643 continue;
644 }
645 ceph_assert(bdev[id]->get_size());
646 ceph_assert(alloc_size[id]);
647 if (is_shared_alloc(id)) {
648 dout(1) << __func__ << " shared, id " << id << std::hex
649 << ", capacity 0x" << bdev[id]->get_size()
650 << ", block size 0x" << alloc_size[id]
651 << std::dec << dendl;
652 } else {
653 std::string name = "bluefs-";
654 const char* devnames[] = { "wal","db","slow" };
655 if (id <= BDEV_SLOW)
656 name += devnames[id];
657 else
658 name += to_string(uintptr_t(this));
659 dout(1) << __func__ << " new, id " << id << std::hex
660 << ", allocator name " << name
661 << ", allocator type " << cct->_conf->bluefs_allocator
662 << ", capacity 0x" << bdev[id]->get_size()
663 << ", block size 0x" << alloc_size[id]
664 << std::dec << dendl;
665 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
666 bdev[id]->get_size(),
667 alloc_size[id],
668 0, 0,
669 name);
670 alloc[id]->init_add_free(
671 block_reserved[id],
672 _get_total(id));
673 }
674 }
675 }
676
677 void BlueFS::_stop_alloc()
678 {
679 dout(20) << __func__ << dendl;
680 for (auto p : bdev) {
681 if (p)
682 p->discard_drain();
683 }
684
685 for (size_t i = 0; i < alloc.size(); ++i) {
686 if (alloc[i] && !is_shared_alloc(i)) {
687 alloc[i]->shutdown();
688 delete alloc[i];
689 alloc[i] = nullptr;
690 }
691 }
692 }
693
694 int BlueFS::_read_and_check(uint8_t ndev, uint64_t off, uint64_t len,
695 ceph::buffer::list *pbl, IOContext *ioc, bool buffered)
696 {
697 dout(10) << __func__ << " dev " << int(ndev)
698 << ": 0x" << std::hex << off << "~" << len << std::dec
699 << (buffered ? " buffered" : "")
700 << dendl;
701 int r;
702 bufferlist bl;
703 r = _bdev_read(ndev, off, len, &bl, ioc, buffered);
704 if (r != 0) {
705 return r;
706 }
707 uint64_t block_size = bdev[ndev]->get_block_size();
708 if (inject_read_zeros) {
709 if (len >= block_size * 2) {
710 derr << __func__ << " injecting error, zeros at "
711 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
712 << "~" << (block_size * 2) << std::dec << dendl;
713 //use beginning, replace 8K in the middle with zeros, use tail
714 bufferlist temp;
715 bl.splice(0, len / 2 - block_size, &temp);
716 temp.append(buffer::create(block_size * 2, 0));
717 bl.splice(block_size * 2, len / 2 - block_size, &temp);
718 bl = temp;
719 inject_read_zeros--;
720 }
721 }
722 //make a check if there is a block with all 0
723 uint64_t to_check_len = len;
724 uint64_t skip = p2nphase(off, block_size);
725 if (skip >= to_check_len) {
726 return r;
727 }
728 auto it = bl.begin(skip);
729 to_check_len -= skip;
730 bool all_zeros = false;
731 while (all_zeros == false && to_check_len >= block_size) {
732 // checking 0s step
733 unsigned block_left = block_size;
734 unsigned avail;
735 const char* data;
736 all_zeros = true;
737 while (all_zeros && block_left > 0) {
738 avail = it.get_ptr_and_advance(block_left, &data);
739 block_left -= avail;
740 all_zeros = mem_is_zero(data, avail);
741 }
742 // skipping step
743 while (block_left > 0) {
744 avail = it.get_ptr_and_advance(block_left, &data);
745 block_left -= avail;
746 }
747 to_check_len -= block_size;
748 }
749 if (all_zeros) {
750 logger->inc(l_bluefs_read_zeros_candidate, 1);
751 bufferlist bl_reread;
752 r = _bdev_read(ndev, off, len, &bl_reread, ioc, buffered);
753 if (r != 0) {
754 return r;
755 }
756 // check if both read gave the same
757 if (!bl.contents_equal(bl_reread)) {
758 // report problems to log, but continue, maybe it will be good now...
759 derr << __func__ << " initial read of " << int(ndev)
760 << ": 0x" << std::hex << off << "~" << len
761 << std::dec << ": different then re-read " << dendl;
762 logger->inc(l_bluefs_read_zeros_errors, 1);
763 }
764 // use second read will be better if is different
765 pbl->append(bl_reread);
766 } else {
767 pbl->append(bl);
768 }
769 return r;
770 }
771
772 int BlueFS::_read_random_and_check(
773 uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered)
774 {
775 dout(10) << __func__ << " dev " << int(ndev)
776 << ": 0x" << std::hex << off << "~" << len << std::dec
777 << (buffered ? " buffered" : "")
778 << dendl;
779 int r;
780 r = _bdev_read_random(ndev, off, len, buf, buffered);
781 if (r != 0) {
782 return r;
783 }
784 uint64_t block_size = bdev[ndev]->get_block_size();
785 if (inject_read_zeros) {
786 if (len >= block_size * 2) {
787 derr << __func__ << " injecting error, zeros at "
788 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
789 << "~" << (block_size * 2) << std::dec << dendl;
790 //zero middle 8K
791 memset(buf + len / 2 - block_size, 0, block_size * 2);
792 inject_read_zeros--;
793 }
794 }
795 //make a check if there is a block with all 0
796 uint64_t to_check_len = len;
797 const char* data = buf;
798 uint64_t skip = p2nphase(off, block_size);
799 if (skip >= to_check_len) {
800 return r;
801 }
802 to_check_len -= skip;
803 data += skip;
804
805 bool all_zeros = false;
806 while (all_zeros == false && to_check_len >= block_size) {
807 if (mem_is_zero(data, block_size)) {
808 // at least one block is all zeros
809 all_zeros = true;
810 break;
811 }
812 data += block_size;
813 to_check_len -= block_size;
814 }
815 if (all_zeros) {
816 logger->inc(l_bluefs_read_zeros_candidate, 1);
817 std::unique_ptr<char[]> data_reread(new char[len]);
818 r = _bdev_read_random(ndev, off, len, &data_reread[0], buffered);
819 if (r != 0) {
820 return r;
821 }
822 // check if both read gave the same
823 if (memcmp(buf, &data_reread[0], len) != 0) {
824 derr << __func__ << " initial read of " << int(ndev)
825 << ": 0x" << std::hex << off << "~" << len
826 << std::dec << ": different then re-read " << dendl;
827 logger->inc(l_bluefs_read_zeros_errors, 1);
828 // second read is probably better
829 memcpy(buf, &data_reread[0], len);
830 }
831 }
832 return r;
833 }
834
835 int BlueFS::_bdev_read(uint8_t ndev, uint64_t off, uint64_t len,
836 ceph::buffer::list* pbl, IOContext* ioc, bool buffered)
837 {
838 int cnt = 0;
839 switch (ndev) {
840 case BDEV_WAL: cnt = l_bluefs_read_disk_bytes_wal; break;
841 case BDEV_DB: cnt = l_bluefs_read_disk_bytes_db; break;
842 case BDEV_SLOW: cnt = l_bluefs_read_disk_bytes_slow; break;
843
844 }
845 if (cnt) {
846 logger->inc(cnt, len);
847 }
848 return bdev[ndev]->read(off, len, pbl, ioc, buffered);
849 }
850
851 int BlueFS::_bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len,
852 char* buf, bool buffered)
853 {
854 int cnt = 0;
855 switch (ndev) {
856 case BDEV_WAL: cnt = l_bluefs_read_random_disk_bytes_wal; break;
857 case BDEV_DB: cnt = l_bluefs_read_random_disk_bytes_db; break;
858 case BDEV_SLOW: cnt = l_bluefs_read_random_disk_bytes_slow; break;
859 }
860 if (cnt) {
861 logger->inc(cnt, len);
862 }
863 return bdev[ndev]->read_random(off, len, buf, buffered);
864 }
865
866 int BlueFS::mount()
867 {
868 dout(1) << __func__ << dendl;
869
870 _init_logger();
871 int r = _open_super();
872 if (r < 0) {
873 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
874 goto out;
875 }
876
877 // set volume selector if not provided before/outside
878 if (vselector == nullptr) {
879 vselector.reset(
880 new OriginalVolumeSelector(
881 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
882 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
883 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
884 }
885
886 _init_alloc();
887
888 r = _replay(false, false);
889 if (r < 0) {
890 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
891 _stop_alloc();
892 goto out;
893 }
894
895 // init freelist
896 for (auto& p : nodes.file_map) {
897 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
898 for (auto& q : p.second->fnode.extents) {
899 bool is_shared = is_shared_alloc(q.bdev);
900 ceph_assert(!is_shared || (is_shared && shared_alloc));
901 if (is_shared && shared_alloc->need_init && shared_alloc->a) {
902 shared_alloc->bluefs_used += q.length;
903 alloc[q.bdev]->init_rm_free(q.offset, q.length);
904 } else if (!is_shared) {
905 alloc[q.bdev]->init_rm_free(q.offset, q.length);
906 }
907 }
908 }
909 if (shared_alloc) {
910 shared_alloc->need_init = false;
911 dout(1) << __func__ << " shared_bdev_used = "
912 << shared_alloc->bluefs_used << dendl;
913 } else {
914 dout(1) << __func__ << " shared bdev not used"
915 << dendl;
916 }
917
918 // set up the log for future writes
919 log.writer = _create_writer(_get_file(1));
920 ceph_assert(log.writer->file->fnode.ino == 1);
921 log.writer->pos = log.writer->file->fnode.size;
922 log.writer->file->fnode.reset_delta();
923 dout(10) << __func__ << " log write pos set to 0x"
924 << std::hex << log.writer->pos << std::dec
925 << dendl;
926 // update log size
927 logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
928 return 0;
929
930 out:
931 super = bluefs_super_t();
932 return r;
933 }
934
935 int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const
936 {
937 if (super.memorized_layout) {
938 if (layout == *super.memorized_layout) {
939 dout(10) << __func__ << " bluefs layout verified positively" << dendl;
940 } else {
941 derr << __func__ << " memorized layout doesn't fit current one" << dendl;
942 return -EIO;
943 }
944 } else {
945 dout(10) << __func__ << " no memorized_layout in bluefs superblock"
946 << dendl;
947 }
948
949 return 0;
950 }
951
952 void BlueFS::umount(bool avoid_compact)
953 {
954 dout(1) << __func__ << dendl;
955
956 sync_metadata(avoid_compact);
957 if (cct->_conf->bluefs_check_volume_selector_on_umount) {
958 _check_vselector_LNF();
959 }
960 _close_writer(log.writer);
961 log.writer = NULL;
962 log.t.clear();
963
964 vselector.reset(nullptr);
965 _stop_alloc();
966 nodes.file_map.clear();
967 nodes.dir_map.clear();
968 super = bluefs_super_t();
969 _shutdown_logger();
970 }
971
972 int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
973 {
974 dout(1) << __func__ << dendl;
975
976 if(id == BDEV_NEWDB) {
977 int new_log_dev_cur = BDEV_WAL;
978 int new_log_dev_next = BDEV_WAL;
979 if (!bdev[BDEV_WAL]) {
980 new_log_dev_cur = BDEV_NEWDB;
981 new_log_dev_next = BDEV_DB;
982 }
983 _rewrite_log_and_layout_sync_LNF_LD(false,
984 BDEV_NEWDB,
985 new_log_dev_cur,
986 new_log_dev_next,
987 RENAME_DB2SLOW,
988 layout);
989 //}
990 } else if(id == BDEV_NEWWAL) {
991 _rewrite_log_and_layout_sync_LNF_LD(false,
992 BDEV_DB,
993 BDEV_NEWWAL,
994 BDEV_WAL,
995 REMOVE_WAL,
996 layout);
997 } else {
998 assert(false);
999 }
1000 return 0;
1001 }
1002
1003 void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
1004 {
1005 if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
1006 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
1007 if (bdev[BDEV_WAL])
1008 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
1009 }
1010
1011 void BlueFS::get_devices(set<string> *ls)
1012 {
1013 for (unsigned i = 0; i < MAX_BDEV; ++i) {
1014 if (bdev[i]) {
1015 bdev[i]->get_devices(ls);
1016 }
1017 }
1018 }
1019
1020 int BlueFS::fsck()
1021 {
1022 dout(1) << __func__ << dendl;
1023 // hrm, i think we check everything on mount...
1024 return 0;
1025 }
1026
1027 int BlueFS::_write_super(int dev)
1028 {
1029 // build superblock
1030 bufferlist bl;
1031 encode(super, bl);
1032 uint32_t crc = bl.crc32c(-1);
1033 encode(crc, bl);
1034 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
1035 dout(10) << __func__ << " superblock " << super.version << dendl;
1036 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
1037 ceph_assert_always(bl.length() <= get_super_length());
1038 bl.append_zero(get_super_length() - bl.length());
1039
1040 bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
1041 dout(20) << __func__ << " v " << super.version
1042 << " crc 0x" << std::hex << crc
1043 << " offset 0x" << get_super_offset() << std::dec
1044 << dendl;
1045 return 0;
1046 }
1047
1048 int BlueFS::_open_super()
1049 {
1050 dout(10) << __func__ << dendl;
1051
1052 bufferlist bl;
1053 uint32_t expected_crc, crc;
1054 int r;
1055
1056 // always the second block
1057 r = _bdev_read(BDEV_DB, get_super_offset(), get_super_length(),
1058 &bl, ioc[BDEV_DB], false);
1059 if (r < 0)
1060 return r;
1061
1062 auto p = bl.cbegin();
1063 decode(super, p);
1064 {
1065 bufferlist t;
1066 t.substr_of(bl, 0, p.get_off());
1067 crc = t.crc32c(-1);
1068 }
1069 decode(expected_crc, p);
1070 if (crc != expected_crc) {
1071 derr << __func__ << " bad crc on superblock, expected 0x"
1072 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
1073 << dendl;
1074 return -EIO;
1075 }
1076 dout(10) << __func__ << " superblock " << super.version << dendl;
1077 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
1078 return 0;
1079 }
1080
1081 int BlueFS::_check_allocations(const bluefs_fnode_t& fnode,
1082 boost::dynamic_bitset<uint64_t>* used_blocks,
1083 bool is_alloc, //true when allocating, false when deallocating
1084 const char* op_name)
1085 {
1086 auto& fnode_extents = fnode.extents;
1087 for (auto e : fnode_extents) {
1088 auto id = e.bdev;
1089 bool fail = false;
1090 ceph_assert(id < MAX_BDEV);
1091 if (int r = _verify_alloc_granularity(id, e.offset, e.length,
1092 op_name); r < 0) {
1093 return r;
1094 }
1095
1096 apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
1097 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1098 if (is_alloc == bs.test(pos)) {
1099 fail = true;
1100 } else {
1101 bs.flip(pos);
1102 }
1103 }
1104 );
1105 if (fail) {
1106 derr << __func__ << " " << op_name << " invalid extent " << int(e.bdev)
1107 << ": 0x" << std::hex << e.offset << "~" << e.length << std::dec
1108 << (is_alloc == true ?
1109 ": duplicate reference, ino " : ": double free, ino ")
1110 << fnode.ino << dendl;
1111 return -EFAULT;
1112 }
1113 }
1114 return 0;
1115 }
1116
1117 int BlueFS::_verify_alloc_granularity(
1118 __u8 id, uint64_t offset, uint64_t length, const char *op)
1119 {
1120 if ((offset & (alloc_size[id] - 1)) ||
1121 (length & (alloc_size[id] - 1))) {
1122 derr << __func__ << " " << op << " of " << (int)id
1123 << ":0x" << std::hex << offset << "~" << length << std::dec
1124 << " does not align to alloc_size 0x"
1125 << std::hex << alloc_size[id] << std::dec << dendl;
1126 // be helpful
1127 auto need = alloc_size[id];
1128 while (need && ((offset & (need - 1)) ||
1129 (length & (need - 1)))) {
1130 need >>= 1;
1131 }
1132 if (need) {
1133 const char *which;
1134 if (id == BDEV_SLOW ||
1135 (id == BDEV_DB && !bdev[BDEV_SLOW])) {
1136 which = "bluefs_shared_alloc_size";
1137 } else {
1138 which = "bluefs_alloc_size";
1139 }
1140 derr << "work-around by setting " << which << " = " << need
1141 << " for this OSD" << dendl;
1142 }
1143 return -EFAULT;
1144 }
1145 return 0;
1146 }
1147
1148 int BlueFS::_replay(bool noop, bool to_stdout)
1149 {
1150 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
1151 ino_last = 1; // by the log
1152 uint64_t log_seq = 0;
1153
1154 FileRef log_file;
1155 log_file = _get_file(1);
1156
1157 log_file->fnode = super.log_fnode;
1158 if (!noop) {
1159 log_file->vselector_hint =
1160 vselector->get_hint_for_log();
1161 } else {
1162 // do not use fnode from superblock in 'noop' mode - log_file's one should
1163 // be fine and up-to-date
1164 ceph_assert(log_file->fnode.ino == 1);
1165 ceph_assert(log_file->fnode.extents.size() != 0);
1166 }
1167 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
1168 if (unlikely(to_stdout)) {
1169 std::cout << " log_fnode " << super.log_fnode << std::endl;
1170 }
1171
1172 FileReader *log_reader = new FileReader(
1173 log_file, cct->_conf->bluefs_max_prefetch,
1174 false, // !random
1175 true); // ignore eof
1176
1177 bool seen_recs = false;
1178
1179 boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
1180
1181 if (!noop) {
1182 if (cct->_conf->bluefs_log_replay_check_allocations) {
1183 for (size_t i = 0; i < MAX_BDEV; ++i) {
1184 if (alloc_size[i] != 0 && bdev[i] != nullptr) {
1185 used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
1186 }
1187 }
1188 // check initial log layout
1189 int r = _check_allocations(log_file->fnode,
1190 used_blocks, true, "Log from super");
1191 if (r < 0) {
1192 return r;
1193 }
1194 }
1195 }
1196
1197 while (true) {
1198 ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
1199 uint64_t pos = log_reader->buf.pos;
1200 uint64_t read_pos = pos;
1201 bufferlist bl;
1202 {
1203 int r = _read(log_reader, read_pos, super.block_size,
1204 &bl, NULL);
1205 if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) {
1206 r += _do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl);
1207 }
1208 assert(r == (int)super.block_size);
1209 read_pos += r;
1210 }
1211 uint64_t more = 0;
1212 uint64_t seq;
1213 uuid_d uuid;
1214 {
1215 auto p = bl.cbegin();
1216 __u8 a, b;
1217 uint32_t len;
1218 decode(a, p);
1219 decode(b, p);
1220 decode(len, p);
1221 decode(uuid, p);
1222 decode(seq, p);
1223 if (len + 6 > bl.length()) {
1224 more = round_up_to(len + 6 - bl.length(), super.block_size);
1225 }
1226 }
1227 if (uuid != super.uuid) {
1228 if (seen_recs) {
1229 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1230 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1231 << dendl;
1232 } else {
1233 derr << __func__ << " 0x" << std::hex << pos << std::dec
1234 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1235 << ", block dump: \n";
1236 bufferlist t;
1237 t.substr_of(bl, 0, super.block_size);
1238 t.hexdump(*_dout);
1239 *_dout << dendl;
1240 }
1241 break;
1242 }
1243 if (seq != log_seq + 1) {
1244 if (seen_recs) {
1245 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1246 << ": stop: seq " << seq << " != expected " << log_seq + 1
1247 << dendl;;
1248 } else {
1249 derr << __func__ << " 0x" << std::hex << pos << std::dec
1250 << ": stop: seq " << seq << " != expected " << log_seq + 1
1251 << dendl;;
1252 }
1253 break;
1254 }
1255 if (more) {
1256 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
1257 << " more bytes" << dendl;
1258 bufferlist t;
1259 int r = _read(log_reader, read_pos, more, &t, NULL);
1260 if (r < (int)more) {
1261 dout(10) << __func__ << " 0x" << std::hex << pos
1262 << ": stop: len is 0x" << bl.length() + more << std::dec
1263 << ", which is past eof" << dendl;
1264 if (cct->_conf->bluefs_replay_recovery) {
1265 //try to search for more data
1266 r += _do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t);
1267 if (r < (int)more) {
1268 //in normal mode we must read r==more, for recovery it is too strict
1269 break;
1270 }
1271 }
1272 }
1273 ceph_assert(r == (int)more);
1274 bl.claim_append(t);
1275 read_pos += r;
1276 }
1277 bluefs_transaction_t t;
1278 try {
1279 auto p = bl.cbegin();
1280 decode(t, p);
1281 seen_recs = true;
1282 }
1283 catch (ceph::buffer::error& e) {
1284 // Multi-block transactions might be incomplete due to unexpected
1285 // power off. Hence let's treat that as a regular stop condition.
1286 if (seen_recs && more) {
1287 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1288 << ": stop: failed to decode: " << e.what()
1289 << dendl;
1290 } else {
1291 derr << __func__ << " 0x" << std::hex << pos << std::dec
1292 << ": stop: failed to decode: " << e.what()
1293 << dendl;
1294 delete log_reader;
1295 return -EIO;
1296 }
1297 break;
1298 }
1299 ceph_assert(seq == t.seq);
1300 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1301 << ": " << t << dendl;
1302 if (unlikely(to_stdout)) {
1303 std::cout << " 0x" << std::hex << pos << std::dec
1304 << ": " << t << std::endl;
1305 }
1306
1307 auto p = t.op_bl.cbegin();
1308 while (!p.end()) {
1309 __u8 op;
1310 decode(op, p);
1311 switch (op) {
1312
1313 case bluefs_transaction_t::OP_INIT:
1314 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1315 << ": op_init" << dendl;
1316 if (unlikely(to_stdout)) {
1317 std::cout << " 0x" << std::hex << pos << std::dec
1318 << ": op_init" << std::endl;
1319 }
1320
1321 ceph_assert(t.seq == 1);
1322 break;
1323
1324 case bluefs_transaction_t::OP_JUMP:
1325 {
1326 uint64_t next_seq;
1327 uint64_t offset;
1328 decode(next_seq, p);
1329 decode(offset, p);
1330 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1331 << ": op_jump seq " << next_seq
1332 << " offset 0x" << std::hex << offset << std::dec << dendl;
1333 if (unlikely(to_stdout)) {
1334 std::cout << " 0x" << std::hex << pos << std::dec
1335 << ": op_jump seq " << next_seq
1336 << " offset 0x" << std::hex << offset << std::dec
1337 << std::endl;
1338 }
1339
1340 ceph_assert(next_seq > log_seq);
1341 log_seq = next_seq - 1; // we will increment it below
1342 uint64_t skip = offset - read_pos;
1343 if (skip) {
1344 bufferlist junk;
1345 int r = _read(log_reader, read_pos, skip, &junk,
1346 NULL);
1347 if (r != (int)skip) {
1348 dout(10) << __func__ << " 0x" << std::hex << read_pos
1349 << ": stop: failed to skip to " << offset
1350 << std::dec << dendl;
1351 ceph_abort_msg("problem with op_jump");
1352 }
1353 }
1354 }
1355 break;
1356
1357 case bluefs_transaction_t::OP_JUMP_SEQ:
1358 {
1359 uint64_t next_seq;
1360 decode(next_seq, p);
1361 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1362 << ": op_jump_seq " << next_seq << dendl;
1363 if (unlikely(to_stdout)) {
1364 std::cout << " 0x" << std::hex << pos << std::dec
1365 << ": op_jump_seq " << next_seq << std::endl;
1366 }
1367
1368 ceph_assert(next_seq > log_seq);
1369 log_seq = next_seq - 1; // we will increment it below
1370 }
1371 break;
1372
1373 case bluefs_transaction_t::OP_ALLOC_ADD:
1374 // LEGACY, do nothing but read params
1375 {
1376 __u8 id;
1377 uint64_t offset, length;
1378 decode(id, p);
1379 decode(offset, p);
1380 decode(length, p);
1381 }
1382 break;
1383
1384 case bluefs_transaction_t::OP_ALLOC_RM:
1385 // LEGACY, do nothing but read params
1386 {
1387 __u8 id;
1388 uint64_t offset, length;
1389 decode(id, p);
1390 decode(offset, p);
1391 decode(length, p);
1392 }
1393 break;
1394
1395 case bluefs_transaction_t::OP_DIR_LINK:
1396 {
1397 string dirname, filename;
1398 uint64_t ino;
1399 decode(dirname, p);
1400 decode(filename, p);
1401 decode(ino, p);
1402 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1403 << ": op_dir_link " << " " << dirname << "/" << filename
1404 << " to " << ino
1405 << dendl;
1406 if (unlikely(to_stdout)) {
1407 std::cout << " 0x" << std::hex << pos << std::dec
1408 << ": op_dir_link " << " " << dirname << "/" << filename
1409 << " to " << ino
1410 << std::endl;
1411 }
1412
1413 if (!noop) {
1414 FileRef file = _get_file(ino);
1415 ceph_assert(file->fnode.ino);
1416 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1417 ceph_assert(q != nodes.dir_map.end());
1418 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
1419 ceph_assert(r == q->second->file_map.end());
1420
1421 vselector->sub_usage(file->vselector_hint, file->fnode);
1422 file->vselector_hint =
1423 vselector->get_hint_by_dir(dirname);
1424 vselector->add_usage(file->vselector_hint, file->fnode);
1425
1426 q->second->file_map[filename] = file;
1427 ++file->refs;
1428 }
1429 }
1430 break;
1431
1432 case bluefs_transaction_t::OP_DIR_UNLINK:
1433 {
1434 string dirname, filename;
1435 decode(dirname, p);
1436 decode(filename, p);
1437 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1438 << ": op_dir_unlink " << " " << dirname << "/" << filename
1439 << dendl;
1440 if (unlikely(to_stdout)) {
1441 std::cout << " 0x" << std::hex << pos << std::dec
1442 << ": op_dir_unlink " << " " << dirname << "/" << filename
1443 << std::endl;
1444 }
1445
1446 if (!noop) {
1447 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1448 ceph_assert(q != nodes.dir_map.end());
1449 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
1450 ceph_assert(r != q->second->file_map.end());
1451 ceph_assert(r->second->refs > 0);
1452 --r->second->refs;
1453 q->second->file_map.erase(r);
1454 }
1455 }
1456 break;
1457
1458 case bluefs_transaction_t::OP_DIR_CREATE:
1459 {
1460 string dirname;
1461 decode(dirname, p);
1462 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1463 << ": op_dir_create " << dirname << dendl;
1464 if (unlikely(to_stdout)) {
1465 std::cout << " 0x" << std::hex << pos << std::dec
1466 << ": op_dir_create " << dirname << std::endl;
1467 }
1468
1469 if (!noop) {
1470 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1471 ceph_assert(q == nodes.dir_map.end());
1472 nodes.dir_map[dirname] = ceph::make_ref<Dir>();
1473 }
1474 }
1475 break;
1476
1477 case bluefs_transaction_t::OP_DIR_REMOVE:
1478 {
1479 string dirname;
1480 decode(dirname, p);
1481 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1482 << ": op_dir_remove " << dirname << dendl;
1483 if (unlikely(to_stdout)) {
1484 std::cout << " 0x" << std::hex << pos << std::dec
1485 << ": op_dir_remove " << dirname << std::endl;
1486 }
1487
1488 if (!noop) {
1489 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1490 ceph_assert(q != nodes.dir_map.end());
1491 ceph_assert(q->second->file_map.empty());
1492 nodes.dir_map.erase(q);
1493 }
1494 }
1495 break;
1496
1497 case bluefs_transaction_t::OP_FILE_UPDATE:
1498 {
1499 bluefs_fnode_t fnode;
1500 decode(fnode, p);
1501 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1502 << ": op_file_update " << " " << fnode << " " << dendl;
1503 if (unlikely(to_stdout)) {
1504 std::cout << " 0x" << std::hex << pos << std::dec
1505 << ": op_file_update " << " " << fnode << std::endl;
1506 }
1507 if (!noop) {
1508 FileRef f = _get_file(fnode.ino);
1509 if (cct->_conf->bluefs_log_replay_check_allocations) {
1510 int r = _check_allocations(f->fnode,
1511 used_blocks, false, "OP_FILE_UPDATE");
1512 if (r < 0) {
1513 return r;
1514 }
1515 }
1516 if (fnode.ino != 1) {
1517 vselector->sub_usage(f->vselector_hint, f->fnode);
1518 }
1519 f->fnode = fnode;
1520 if (fnode.ino != 1) {
1521 vselector->add_usage(f->vselector_hint, f->fnode);
1522 }
1523
1524 if (fnode.ino > ino_last) {
1525 ino_last = fnode.ino;
1526 }
1527 if (cct->_conf->bluefs_log_replay_check_allocations) {
1528 int r = _check_allocations(f->fnode,
1529 used_blocks, true, "OP_FILE_UPDATE");
1530 if (r < 0) {
1531 return r;
1532 }
1533 }
1534 } else if (noop && fnode.ino == 1) {
1535 FileRef f = _get_file(fnode.ino);
1536 f->fnode = fnode;
1537 }
1538 }
1539 break;
1540 case bluefs_transaction_t::OP_FILE_UPDATE_INC:
1541 {
1542 bluefs_fnode_delta_t delta;
1543 decode(delta, p);
1544 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1545 << ": op_file_update_inc " << " " << delta << " " << dendl;
1546 if (unlikely(to_stdout)) {
1547 std::cout << " 0x" << std::hex << pos << std::dec
1548 << ": op_file_update_inc " << " " << delta << std::endl;
1549 }
1550 if (!noop) {
1551 FileRef f = _get_file(delta.ino);
1552 bluefs_fnode_t& fnode = f->fnode;
1553 if (delta.offset != fnode.allocated) {
1554 derr << __func__ << " invalid op_file_update_inc, new extents miss end of file"
1555 << " fnode=" << fnode
1556 << " delta=" << delta
1557 << dendl;
1558 ceph_assert(delta.offset == fnode.allocated);
1559 }
1560 if (cct->_conf->bluefs_log_replay_check_allocations) {
1561 int r = _check_allocations(fnode,
1562 used_blocks, false, "OP_FILE_UPDATE_INC");
1563 if (r < 0) {
1564 return r;
1565 }
1566 }
1567
1568 fnode.ino = delta.ino;
1569 fnode.mtime = delta.mtime;
1570 if (fnode.ino != 1) {
1571 vselector->sub_usage(f->vselector_hint, fnode);
1572 }
1573 fnode.size = delta.size;
1574 fnode.claim_extents(delta.extents);
1575 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1576 << ": op_file_update_inc produced " << " " << fnode << " " << dendl;
1577
1578 if (fnode.ino != 1) {
1579 vselector->add_usage(f->vselector_hint, fnode);
1580 }
1581
1582 if (fnode.ino > ino_last) {
1583 ino_last = fnode.ino;
1584 }
1585 if (cct->_conf->bluefs_log_replay_check_allocations) {
1586 int r = _check_allocations(f->fnode,
1587 used_blocks, true, "OP_FILE_UPDATE_INC");
1588 if (r < 0) {
1589 return r;
1590 }
1591 }
1592 } else if (noop && delta.ino == 1) {
1593 // we need to track bluefs log, even in noop mode
1594 FileRef f = _get_file(1);
1595 bluefs_fnode_t& fnode = f->fnode;
1596 fnode.ino = delta.ino;
1597 fnode.mtime = delta.mtime;
1598 fnode.size = delta.size;
1599 fnode.claim_extents(delta.extents);
1600 }
1601 }
1602 break;
1603
1604 case bluefs_transaction_t::OP_FILE_REMOVE:
1605 {
1606 uint64_t ino;
1607 decode(ino, p);
1608 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1609 << ": op_file_remove " << ino << dendl;
1610 if (unlikely(to_stdout)) {
1611 std::cout << " 0x" << std::hex << pos << std::dec
1612 << ": op_file_remove " << ino << std::endl;
1613 }
1614
1615 if (!noop) {
1616 auto p = nodes.file_map.find(ino);
1617 ceph_assert(p != nodes.file_map.end());
1618 vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
1619 if (cct->_conf->bluefs_log_replay_check_allocations) {
1620 int r = _check_allocations(p->second->fnode,
1621 used_blocks, false, "OP_FILE_REMOVE");
1622 if (r < 0) {
1623 return r;
1624 }
1625 }
1626 nodes.file_map.erase(p);
1627 }
1628 }
1629 break;
1630
1631 default:
1632 derr << __func__ << " 0x" << std::hex << pos << std::dec
1633 << ": stop: unrecognized op " << (int)op << dendl;
1634 delete log_reader;
1635 return -EIO;
1636 }
1637 }
1638 ceph_assert(p.end());
1639
1640 // we successfully replayed the transaction; bump the seq and log size
1641 ++log_seq;
1642 log_file->fnode.size = log_reader->buf.pos;
1643 }
1644 if (!noop) {
1645 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
1646 log.seq_live = log_seq + 1;
1647 dirty.seq_live = log_seq + 1;
1648 log.t.seq = log.seq_live;
1649 dirty.seq_stable = log_seq;
1650 }
1651
1652 dout(10) << __func__ << " log file size was 0x"
1653 << std::hex << log_file->fnode.size << std::dec << dendl;
1654 if (unlikely(to_stdout)) {
1655 std::cout << " log file size was 0x"
1656 << std::hex << log_file->fnode.size << std::dec << std::endl;
1657 }
1658
1659 delete log_reader;
1660
1661 if (!noop) {
1662 // verify file link counts are all >0
1663 for (auto& p : nodes.file_map) {
1664 if (p.second->refs == 0 &&
1665 p.second->fnode.ino > 1) {
1666 derr << __func__ << " file with link count 0: " << p.second->fnode
1667 << dendl;
1668 return -EIO;
1669 }
1670 }
1671 }
1672 // reflect file count in logger
1673 logger->set(l_bluefs_num_files, nodes.file_map.size());
1674
1675 dout(10) << __func__ << " done" << dendl;
1676 return 0;
1677 }
1678
1679 int BlueFS::log_dump()
1680 {
1681 // only dump log file's content
1682 ceph_assert(log.writer == nullptr && "cannot log_dump on mounted BlueFS");
1683 _init_logger();
1684 int r = _open_super();
1685 if (r < 0) {
1686 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
1687 return r;
1688 }
1689 r = _replay(true, true);
1690 if (r < 0) {
1691 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
1692 }
1693 _shutdown_logger();
1694 super = bluefs_super_t();
1695 return r;
1696 }
1697
1698 int BlueFS::device_migrate_to_existing(
1699 CephContext *cct,
1700 const set<int>& devs_source,
1701 int dev_target,
1702 const bluefs_layout_t& layout)
1703 {
1704 vector<byte> buf;
1705 bool buffered = cct->_conf->bluefs_buffered_io;
1706
1707 dout(10) << __func__ << " devs_source " << devs_source
1708 << " dev_target " << dev_target << dendl;
1709 assert(dev_target < (int)MAX_BDEV);
1710
1711 int flags = 0;
1712 flags |= devs_source.count(BDEV_DB) ?
1713 (REMOVE_DB | RENAME_SLOW2DB) : 0;
1714 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1715 int dev_target_new = dev_target;
1716
1717 // Slow device without separate DB one is addressed via BDEV_DB
1718 // Hence need renaming.
1719 if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
1720 dev_target_new = BDEV_DB;
1721 dout(0) << __func__ << " super to be written to " << dev_target << dendl;
1722 }
1723
1724 for (auto& [ino, file_ref] : nodes.file_map) {
1725 //do not copy log
1726 if (file_ref->fnode.ino == 1) {
1727 continue;
1728 }
1729 dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
1730
1731 auto& fnode_extents = file_ref->fnode.extents;
1732 vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode);
1733
1734 bool rewrite = std::any_of(
1735 fnode_extents.begin(),
1736 fnode_extents.end(),
1737 [=](auto& ext) {
1738 return ext.bdev != dev_target && devs_source.count(ext.bdev);
1739 });
1740 if (rewrite) {
1741 dout(10) << __func__ << " migrating" << dendl;
1742
1743 // read entire file
1744 bufferlist bl;
1745 for (auto old_ext : fnode_extents) {
1746 buf.resize(old_ext.length);
1747 int r = _bdev_read_random(old_ext.bdev,
1748 old_ext.offset,
1749 old_ext.length,
1750 (char*)&buf.at(0),
1751 buffered);
1752 if (r != 0) {
1753 derr << __func__ << " failed to read 0x" << std::hex
1754 << old_ext.offset << "~" << old_ext.length << std::dec
1755 << " from " << (int)dev_target << dendl;
1756 return -EIO;
1757 }
1758 bl.append((char*)&buf[0], old_ext.length);
1759 }
1760
1761 // write entire file
1762 PExtentVector extents;
1763 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1764 if (l < 0) {
1765 derr << __func__ << " unable to allocate len 0x" << std::hex
1766 << bl.length() << std::dec << " from " << (int)dev_target
1767 << ": " << cpp_strerror(l) << dendl;
1768 return -ENOSPC;
1769 }
1770
1771 uint64_t off = 0;
1772 for (auto& i : extents) {
1773 bufferlist cur;
1774 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1775 ceph_assert(cur_len > 0);
1776 cur.substr_of(bl, off, cur_len);
1777 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1778 ceph_assert(r == 0);
1779 off += cur_len;
1780 }
1781
1782 // release old extents
1783 for (auto old_ext : fnode_extents) {
1784 PExtentVector to_release;
1785 to_release.emplace_back(old_ext.offset, old_ext.length);
1786 alloc[old_ext.bdev]->release(to_release);
1787 if (is_shared_alloc(old_ext.bdev)) {
1788 shared_alloc->bluefs_used -= to_release.size();
1789 }
1790 }
1791
1792 // update fnode
1793 fnode_extents.clear();
1794 for (auto& i : extents) {
1795 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
1796 }
1797 } else {
1798 for (auto& ext : fnode_extents) {
1799 if (dev_target != dev_target_new && ext.bdev == dev_target) {
1800 dout(20) << __func__ << " " << " ... adjusting extent 0x"
1801 << std::hex << ext.offset << std::dec
1802 << " bdev " << dev_target << " -> " << dev_target_new
1803 << dendl;
1804 ext.bdev = dev_target_new;
1805 }
1806 }
1807 }
1808 vselector->add_usage(file_ref->vselector_hint, file_ref->fnode);
1809 }
1810 // new logging device in the current naming scheme
1811 int new_log_dev_cur = bdev[BDEV_WAL] ?
1812 BDEV_WAL :
1813 bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
1814
1815 // new logging device in new naming scheme
1816 int new_log_dev_next = new_log_dev_cur;
1817
1818 if (devs_source.count(new_log_dev_cur)) {
1819 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1820 new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
1821 BDEV_DB :
1822 BDEV_WAL;
1823
1824 dout(0) << __func__ << " log moved from " << new_log_dev_cur
1825 << " to " << new_log_dev_next << dendl;
1826
1827 new_log_dev_cur =
1828 (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
1829 BDEV_SLOW :
1830 new_log_dev_next;
1831 }
1832
1833 _rewrite_log_and_layout_sync_LNF_LD(
1834 false,
1835 (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
1836 new_log_dev_cur,
1837 new_log_dev_next,
1838 flags,
1839 layout);
1840 return 0;
1841 }
1842
1843 int BlueFS::device_migrate_to_new(
1844 CephContext *cct,
1845 const set<int>& devs_source,
1846 int dev_target,
1847 const bluefs_layout_t& layout)
1848 {
1849 vector<byte> buf;
1850 bool buffered = cct->_conf->bluefs_buffered_io;
1851
1852 dout(10) << __func__ << " devs_source " << devs_source
1853 << " dev_target " << dev_target << dendl;
1854 assert(dev_target == (int)BDEV_NEWDB || dev_target == (int)BDEV_NEWWAL);
1855
1856 int flags = 0;
1857
1858 flags |= devs_source.count(BDEV_DB) ?
1859 (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
1860 0;
1861 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1862 int dev_target_new = dev_target; //FIXME: remove, makes no sense
1863
1864 for (auto& p : nodes.file_map) {
1865 //do not copy log
1866 if (p.second->fnode.ino == 1) {
1867 continue;
1868 }
1869 dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
1870
1871 auto& fnode_extents = p.second->fnode.extents;
1872
1873 bool rewrite = false;
1874 for (auto ext_it = fnode_extents.begin();
1875 ext_it != p.second->fnode.extents.end();
1876 ++ext_it) {
1877 if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
1878 rewrite = true;
1879 break;
1880 }
1881 }
1882 if (rewrite) {
1883 dout(10) << __func__ << " migrating" << dendl;
1884
1885 // read entire file
1886 bufferlist bl;
1887 for (auto old_ext : fnode_extents) {
1888 buf.resize(old_ext.length);
1889 int r = _bdev_read_random(old_ext.bdev,
1890 old_ext.offset,
1891 old_ext.length,
1892 (char*)&buf.at(0),
1893 buffered);
1894 if (r != 0) {
1895 derr << __func__ << " failed to read 0x" << std::hex
1896 << old_ext.offset << "~" << old_ext.length << std::dec
1897 << " from " << (int)dev_target << dendl;
1898 return -EIO;
1899 }
1900 bl.append((char*)&buf[0], old_ext.length);
1901 }
1902
1903 // write entire file
1904 PExtentVector extents;
1905 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1906 if (l < 0) {
1907 derr << __func__ << " unable to allocate len 0x" << std::hex
1908 << bl.length() << std::dec << " from " << (int)dev_target
1909 << ": " << cpp_strerror(l) << dendl;
1910 return -ENOSPC;
1911 }
1912
1913 uint64_t off = 0;
1914 for (auto& i : extents) {
1915 bufferlist cur;
1916 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1917 ceph_assert(cur_len > 0);
1918 cur.substr_of(bl, off, cur_len);
1919 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1920 ceph_assert(r == 0);
1921 off += cur_len;
1922 }
1923
1924 // release old extents
1925 for (auto old_ext : fnode_extents) {
1926 PExtentVector to_release;
1927 to_release.emplace_back(old_ext.offset, old_ext.length);
1928 alloc[old_ext.bdev]->release(to_release);
1929 if (is_shared_alloc(old_ext.bdev)) {
1930 shared_alloc->bluefs_used -= to_release.size();
1931 }
1932 }
1933
1934 // update fnode
1935 fnode_extents.clear();
1936 for (auto& i : extents) {
1937 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
1938 }
1939 }
1940 }
1941 // new logging device in the current naming scheme
1942 int new_log_dev_cur =
1943 bdev[BDEV_NEWWAL] ?
1944 BDEV_NEWWAL :
1945 bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
1946 BDEV_WAL :
1947 bdev[BDEV_NEWDB] ?
1948 BDEV_NEWDB :
1949 bdev[BDEV_DB] && !(flags & REMOVE_DB)?
1950 BDEV_DB :
1951 BDEV_SLOW;
1952
1953 // new logging device in new naming scheme
1954 int new_log_dev_next =
1955 new_log_dev_cur == BDEV_NEWWAL ?
1956 BDEV_WAL :
1957 new_log_dev_cur == BDEV_NEWDB ?
1958 BDEV_DB :
1959 new_log_dev_cur;
1960
1961 int super_dev =
1962 dev_target == BDEV_NEWDB ?
1963 BDEV_NEWDB :
1964 bdev[BDEV_DB] ?
1965 BDEV_DB :
1966 BDEV_SLOW;
1967
1968 _rewrite_log_and_layout_sync_LNF_LD(
1969 false,
1970 super_dev,
1971 new_log_dev_cur,
1972 new_log_dev_next,
1973 flags,
1974 layout);
1975 return 0;
1976 }
1977
1978 BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
1979 {
1980 auto p = nodes.file_map.find(ino);
1981 if (p == nodes.file_map.end()) {
1982 FileRef f = ceph::make_ref<File>();
1983 nodes.file_map[ino] = f;
1984 // track files count in logger
1985 logger->set(l_bluefs_num_files, nodes.file_map.size());
1986 dout(30) << __func__ << " ino " << ino << " = " << f
1987 << " (new)" << dendl;
1988 return f;
1989 } else {
1990 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
1991 return p->second;
1992 }
1993 }
1994
1995
1996 /**
1997 To modify fnode both FileWriter::lock and File::lock must be obtained.
1998 The special case is when we modify bluefs log (ino 1) or
1999 we are compacting log (ino 0).
2000
2001 In any case it is enough to hold File::lock to be sure fnode will not be modified.
2002 */
2003 struct lock_fnode_print {
2004 BlueFS::FileRef file;
2005 lock_fnode_print(BlueFS::FileRef file) : file(file) {};
2006 };
2007 std::ostream& operator<<(std::ostream& out, const lock_fnode_print& to_lock) {
2008 std::lock_guard l(to_lock.file->lock);
2009 out << to_lock.file->fnode;
2010 return out;
2011 }
2012
2013 void BlueFS::_drop_link_D(FileRef file)
2014 {
2015 dout(20) << __func__ << " had refs " << file->refs
2016 << " on " << lock_fnode_print(file) << dendl;
2017 ceph_assert(file->refs > 0);
2018 ceph_assert(ceph_mutex_is_locked(log.lock));
2019 ceph_assert(ceph_mutex_is_locked(nodes.lock));
2020
2021 --file->refs;
2022 if (file->refs == 0) {
2023 dout(20) << __func__ << " destroying " << file->fnode << dendl;
2024 ceph_assert(file->num_reading.load() == 0);
2025 vselector->sub_usage(file->vselector_hint, file->fnode);
2026 log.t.op_file_remove(file->fnode.ino);
2027 nodes.file_map.erase(file->fnode.ino);
2028 logger->set(l_bluefs_num_files, nodes.file_map.size());
2029 file->deleted = true;
2030
2031 std::lock_guard dl(dirty.lock);
2032 for (auto& r : file->fnode.extents) {
2033 dirty.pending_release[r.bdev].insert(r.offset, r.length);
2034 }
2035 if (file->dirty_seq > dirty.seq_stable) {
2036 // retract request to serialize changes
2037 ceph_assert(dirty.files.count(file->dirty_seq));
2038 auto it = dirty.files[file->dirty_seq].iterator_to(*file);
2039 dirty.files[file->dirty_seq].erase(it);
2040 file->dirty_seq = dirty.seq_stable;
2041 }
2042 }
2043 }
2044
2045 int64_t BlueFS::_read_random(
2046 FileReader *h, ///< [in] read from here
2047 uint64_t off, ///< [in] offset
2048 uint64_t len, ///< [in] this many bytes
2049 char *out) ///< [out] copy it here
2050 {
2051 auto* buf = &h->buf;
2052
2053 int64_t ret = 0;
2054 dout(10) << __func__ << " h " << h
2055 << " 0x" << std::hex << off << "~" << len << std::dec
2056 << " from " << lock_fnode_print(h->file) << dendl;
2057
2058 ++h->file->num_reading;
2059
2060 if (!h->ignore_eof &&
2061 off + len > h->file->fnode.size) {
2062 if (off > h->file->fnode.size)
2063 len = 0;
2064 else
2065 len = h->file->fnode.size - off;
2066 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
2067 << std::hex << len << std::dec << dendl;
2068 }
2069 logger->inc(l_bluefs_read_random_count, 1);
2070 logger->inc(l_bluefs_read_random_bytes, len);
2071
2072 std::shared_lock s_lock(h->lock);
2073 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
2074 while (len > 0) {
2075 if (off < buf->bl_off || off >= buf->get_buf_end()) {
2076 s_lock.unlock();
2077 uint64_t x_off = 0;
2078 auto p = h->file->fnode.seek(off, &x_off);
2079 ceph_assert(p != h->file->fnode.extents.end());
2080 uint64_t l = std::min(p->length - x_off, len);
2081 //hard cap to 1GB
2082 l = std::min(l, uint64_t(1) << 30);
2083 dout(20) << __func__ << " read random 0x"
2084 << std::hex << x_off << "~" << l << std::dec
2085 << " of " << *p << dendl;
2086 int r;
2087 if (!cct->_conf->bluefs_check_for_zeros) {
2088 r = _bdev_read_random(p->bdev, p->offset + x_off, l, out,
2089 cct->_conf->bluefs_buffered_io);
2090 } else {
2091 r = _read_random_and_check(p->bdev, p->offset + x_off, l, out,
2092 cct->_conf->bluefs_buffered_io);
2093 }
2094 ceph_assert(r == 0);
2095 off += l;
2096 len -= l;
2097 ret += l;
2098 out += l;
2099
2100 logger->inc(l_bluefs_read_random_disk_count, 1);
2101 logger->inc(l_bluefs_read_random_disk_bytes, l);
2102 if (len > 0) {
2103 s_lock.lock();
2104 }
2105 } else {
2106 auto left = buf->get_buf_remaining(off);
2107 int64_t r = std::min(len, left);
2108 logger->inc(l_bluefs_read_random_buffer_count, 1);
2109 logger->inc(l_bluefs_read_random_buffer_bytes, r);
2110 dout(20) << __func__ << " left 0x" << std::hex << left
2111 << " 0x" << off << "~" << len << std::dec
2112 << dendl;
2113
2114 auto p = buf->bl.begin();
2115 p.seek(off - buf->bl_off);
2116 p.copy(r, out);
2117 out += r;
2118
2119 dout(30) << __func__ << " result chunk (0x"
2120 << std::hex << r << std::dec << " bytes):\n";
2121 bufferlist t;
2122 t.substr_of(buf->bl, off - buf->bl_off, r);
2123 t.hexdump(*_dout);
2124 *_dout << dendl;
2125
2126 off += r;
2127 len -= r;
2128 ret += r;
2129 buf->pos += r;
2130 }
2131 }
2132 dout(20) << __func__ << " got " << ret << dendl;
2133 --h->file->num_reading;
2134 return ret;
2135 }
2136
/**
 * Buffered read of up to `len` bytes at file offset `off`.
 *
 * Data is always served from the reader's buffer (h->buf). When `off` is
 * outside the buffered range, the buffer is dropped and refilled from the
 * block device, starting at the block-aligned offset containing `off`.
 * The result is appended to `outbl` and/or copied to `out`; when both are
 * null the call only fills the buffer and is counted as a prefetch.
 *
 * Unless h->ignore_eof is set, the request is first clipped to the file size.
 *
 * Returns the number of bytes delivered. This may be less than requested if
 * no extent covers the position being fetched.
 */
int64_t BlueFS::_read(
  FileReader *h,         ///< [in] read from here
  uint64_t off,          ///< [in] offset
  size_t len,            ///< [in] this many bytes
  bufferlist *outbl,     ///< [out] optional: reference the result here
  char *out)             ///< [out] optional: or copy it here
{
  FileReaderBuffer *buf = &(h->buf);

  // no destination at all: caller only wants the buffer populated
  bool prefetch = !outbl && !out;
  dout(10) << __func__ << " h " << h
           << " 0x" << std::hex << off << "~" << len << std::dec
           << " from " << lock_fnode_print(h->file)
           << (prefetch ? " prefetch" : "")
           << dendl;

  // balanced by the decrement just before returning
  ++h->file->num_reading;

  if (!h->ignore_eof &&
      off + len > h->file->fnode.size) {
    // clip the request so it does not extend past the end of the file
    if (off > h->file->fnode.size)
      len = 0;
    else
      len = h->file->fnode.size - off;
    dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
             << std::hex << len << std::dec << dendl;
  }
  logger->inc(l_bluefs_read_count, 1);
  logger->inc(l_bluefs_read_bytes, len);
  if (prefetch) {
    logger->inc(l_bluefs_read_prefetch_count, 1);
    logger->inc(l_bluefs_read_prefetch_bytes, len);
  }

  if (outbl)
    outbl->clear();

  int64_t ret = 0;
  // the buffer is consumed under the shared lock and refilled under the
  // exclusive one
  std::shared_lock s_lock(h->lock);
  while (len > 0) {
    size_t left;
    if (off < buf->bl_off || off >= buf->get_buf_end()) {
      // `off` is not buffered: swap the shared lock for an exclusive one
      s_lock.unlock();
      std::unique_lock u_lock(h->lock);
      buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
      if (off < buf->bl_off || off >= buf->get_buf_end()) {
        // if precondition hasn't changed during locking upgrade.
        buf->bl.clear();
        // the buffer always starts on a block boundary
        buf->bl_off = off & super.block_mask();
        uint64_t x_off = 0;
        auto p = h->file->fnode.seek(buf->bl_off, &x_off);
        if (p == h->file->fnode.extents.end()) {
          // nothing allocated at this position: return what we have so far
          dout(5) << __func__ << " reading less then required "
                  << ret << "<" << ret + len << dendl;
          break;
        }

        // bytes needed from the block start to satisfy the request,
        // rounded up to whole blocks ...
        uint64_t want = round_up_to(len + (off & ~super.block_mask()),
                                    super.block_size);
        // ... but never less than the reader's prefetch size
        want = std::max(want, buf->max_prefetch);
        // a single device read cannot cross the end of the extent
        uint64_t l = std::min(p->length - x_off, want);
        //hard cap to 1GB
        l = std::min(l, uint64_t(1) << 30);
        // do not fetch past the block that contains EOF
        uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
        if (!h->ignore_eof &&
            buf->bl_off + l > eof_offset) {
          l = eof_offset - buf->bl_off;
        }
        dout(20) << __func__ << " fetching 0x"
                 << std::hex << x_off << "~" << l << std::dec
                 << " of " << *p << dendl;
        int r;
        if (!cct->_conf->bluefs_check_for_zeros) {
          r = _bdev_read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
                         cct->_conf->bluefs_buffered_io);
        } else {
          r = _read_and_check(
            p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
            cct->_conf->bluefs_buffered_io);
        }
        logger->inc(l_bluefs_read_disk_count, 1);
        logger->inc(l_bluefs_read_disk_bytes, l);

        ceph_assert(r == 0);
      }
      u_lock.unlock();
      s_lock.lock();
      // we should recheck if buffer is valid after lock downgrade
      continue;
    }
    // `off` is inside the buffer: hand out as much as is available
    left = buf->get_buf_remaining(off);
    dout(20) << __func__ << " left 0x" << std::hex << left
             << " len 0x" << len << std::dec << dendl;

    int64_t r = std::min(len, left);
    if (outbl) {
      bufferlist t;
      t.substr_of(buf->bl, off - buf->bl_off, r);
      outbl->claim_append(t);
    }
    if (out) {
      auto p = buf->bl.begin();
      p.seek(off - buf->bl_off);
      p.copy(r, out);
      out += r;
    }

    // the statements up to `dendl` belong to this dout(30) message and only
    // run when that debug level is gathered
    dout(30) << __func__ << " result chunk (0x"
             << std::hex << r << std::dec << " bytes):\n";
    bufferlist t;
    t.substr_of(buf->bl, off - buf->bl_off, r);
    t.hexdump(*_dout);
    *_dout << dendl;

    off += r;
    len -= r;
    ret += r;
    buf->pos += r;
  }

  dout(20) << __func__ << " got " << ret << dendl;
  ceph_assert(!outbl || (int)outbl->length() == ret);
  --h->file->num_reading;
  return ret;
}
2262
2263 void BlueFS::invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
2264 {
2265 std::lock_guard l(f->lock);
2266 dout(10) << __func__ << " file " << f->fnode
2267 << " 0x" << std::hex << offset << "~" << length << std::dec
2268 << dendl;
2269 if (offset & ~super.block_mask()) {
2270 offset &= super.block_mask();
2271 length = round_up_to(length, super.block_size);
2272 }
2273 uint64_t x_off = 0;
2274 auto p = f->fnode.seek(offset, &x_off);
2275 while (length > 0 && p != f->fnode.extents.end()) {
2276 uint64_t x_len = std::min(p->length - x_off, length);
2277 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
2278 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
2279 << std:: dec << " of " << *p << dendl;
2280 offset += x_len;
2281 length -= x_len;
2282 }
2283 }
2284
2285 uint64_t BlueFS::_estimate_log_size_N()
2286 {
2287 std::lock_guard nl(nodes.lock);
2288 int avg_dir_size = 40; // fixme
2289 int avg_file_size = 12;
2290 uint64_t size = 4096 * 2;
2291 size += nodes.file_map.size() * (1 + sizeof(bluefs_fnode_t));
2292 size += nodes.dir_map.size() + (1 + avg_dir_size);
2293 size += nodes.file_map.size() * (1 + avg_dir_size + avg_file_size);
2294 return round_up_to(size, super.block_size);
2295 }
2296
2297 void BlueFS::compact_log()/*_LNF_LD_NF_D*/
2298 {
2299 if (!cct->_conf->bluefs_replay_recovery_disable_compact) {
2300 if (cct->_conf->bluefs_compact_log_sync) {
2301 _compact_log_sync_LNF_LD();
2302 } else {
2303 _compact_log_async_LD_LNF_D();
2304 }
2305 }
2306 }
2307
2308 bool BlueFS::_should_start_compact_log_L_N()
2309 {
2310 if (log_is_compacting.load() == true) {
2311 // compaction is already running
2312 return false;
2313 }
2314 uint64_t current;
2315 {
2316 std::lock_guard ll(log.lock);
2317 current = log.writer->file->fnode.size;
2318 }
2319 uint64_t expected = _estimate_log_size_N();
2320 float ratio = (float)current / (float)expected;
2321 dout(10) << __func__ << " current 0x" << std::hex << current
2322 << " expected " << expected << std::dec
2323 << " ratio " << ratio
2324 << dendl;
2325 if (current < cct->_conf->bluefs_log_compact_min_size ||
2326 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
2327 return false;
2328 }
2329 return true;
2330 }
2331
2332 void BlueFS::_compact_log_dump_metadata_NF(bluefs_transaction_t *t,
2333 int flags)
2334 {
2335 std::lock_guard nl(nodes.lock);
2336
2337 t->seq = 1;
2338 t->uuid = super.uuid;
2339 dout(20) << __func__ << " op_init" << dendl;
2340
2341 t->op_init();
2342 for (auto& [ino, file_ref] : nodes.file_map) {
2343 if (ino == 1)
2344 continue;
2345 ceph_assert(ino > 1);
2346 std::lock_guard fl(file_ref->lock);
2347 for(auto& e : file_ref->fnode.extents) {
2348 auto bdev = e.bdev;
2349 auto bdev_new = bdev;
2350 ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
2351 if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
2352 bdev_new = BDEV_DB;
2353 }
2354 if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
2355 bdev_new = BDEV_SLOW;
2356 }
2357 if (bdev == BDEV_NEWDB) {
2358 // REMOVE_DB xor RENAME_DB
2359 ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
2360 ceph_assert(!(flags & RENAME_SLOW2DB));
2361 bdev_new = BDEV_DB;
2362 }
2363 if (bdev == BDEV_NEWWAL) {
2364 ceph_assert(flags & REMOVE_WAL);
2365 bdev_new = BDEV_WAL;
2366 }
2367 e.bdev = bdev_new;
2368 }
2369 dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
2370 t->op_file_update(file_ref->fnode);
2371 }
2372 for (auto& [path, dir_ref] : nodes.dir_map) {
2373 dout(20) << __func__ << " op_dir_create " << path << dendl;
2374 t->op_dir_create(path);
2375 for (auto& [fname, file_ref] : dir_ref->file_map) {
2376 dout(20) << __func__ << " op_dir_link " << path << "/" << fname
2377 << " to " << file_ref->fnode.ino << dendl;
2378 t->op_dir_link(path, fname, file_ref->fnode.ino);
2379 }
2380 }
2381 }
2382 /* Streams to t files modified before *capture_before_seq* and all dirs */
2383 void BlueFS::_compact_log_async_dump_metadata_NF(bluefs_transaction_t *t,
2384 uint64_t capture_before_seq)
2385 {
2386 std::lock_guard nl(nodes.lock);
2387
2388 t->seq = 1;
2389 t->uuid = super.uuid;
2390 dout(20) << __func__ << " op_init" << dendl;
2391
2392 t->op_init();
2393 for (auto& [ino, file_ref] : nodes.file_map) {
2394 if (ino == 1)
2395 continue;
2396 ceph_assert(ino > 1);
2397 std::lock_guard fl(file_ref->lock);
2398 if (file_ref->dirty_seq < capture_before_seq) {
2399 dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
2400 } else {
2401 dout(20) << __func__ << " op_file_update just modified, dirty_seq="
2402 << file_ref->dirty_seq << " " << file_ref->fnode << dendl;
2403 }
2404 t->op_file_update(file_ref->fnode);
2405 }
2406 for (auto& [path, dir_ref] : nodes.dir_map) {
2407 dout(20) << __func__ << " op_dir_create " << path << dendl;
2408 t->op_dir_create(path);
2409 for (auto& [fname, file_ref] : dir_ref->file_map) {
2410 dout(20) << __func__ << " op_dir_link " << path << "/" << fname
2411 << " to " << file_ref->fnode.ino << dendl;
2412 t->op_dir_link(path, fname, file_ref->fnode.ino);
2413 }
2414 }
2415 }
2416
2417 void BlueFS::_compact_log_sync_LNF_LD()
2418 {
2419 dout(10) << __func__ << dendl;
2420 uint8_t prefer_bdev;
2421 {
2422 std::lock_guard ll(log.lock);
2423 prefer_bdev =
2424 vselector->select_prefer_bdev(log.writer->file->vselector_hint);
2425 }
2426 _rewrite_log_and_layout_sync_LNF_LD(true,
2427 BDEV_DB,
2428 prefer_bdev,
2429 prefer_bdev,
2430 0,
2431 super.memorized_layout);
2432 logger->inc(l_bluefs_log_compactions);
2433 }
2434
/**
 * Rewrite the whole log from the in-memory metadata and switch the
 * superblock over to it, all while holding log.lock.
 *
 * @param allocate_with_fallback  true: allocate the new log with _allocate();
 *                                false: use _allocate_without_fallback() and
 *                                tag the extents with log_dev
 * @param super_dev    device the superblock is written to
 * @param log_dev      device the new log space is allocated on
 * @param log_dev_new  device id recorded for the log extents in the
 *                     superblock copy of the fnode (differs from log_dev
 *                     when the device is being renamed)
 * @param flags        passed to _compact_log_dump_metadata_NF()
 * @param layout       stored as super.memorized_layout
 *
 * The extents of the previous log are queued in dirty.pending_release after
 * the new superblock has been written and flushed.
 */
void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
                                                 int super_dev,
                                                 int log_dev,
                                                 int log_dev_new,
                                                 int flags,
                                                 std::optional<bluefs_layout_t> layout)
{
  std::lock_guard ll(log.lock);

  File *log_file = log.writer->file.get();

  // log.t.seq is always set to current live seq
  ceph_assert(log.t.seq == log.seq_live);
  // Capturing entire state. Dump anything that has been stored there.
  log.t.clear();
  log.t.seq = log.seq_live;
  // From now on, no changes to log.t are permitted until we finish rewriting log.
  // Can allow dirty to remain dirty - log.seq_live will not change.

  dout(20) << __func__ << " super_dev:" << super_dev
           << " log_dev:" << log_dev
           << " log_dev_new:" << log_dev_new
           << " flags:" << flags
           << dendl;
  // build a transaction that recreates all files and dirs from scratch
  bluefs_transaction_t t;
  _compact_log_dump_metadata_NF(&t, flags);

  dout(20) << __func__ << " op_jump_seq " << log.seq_live << dendl;
  t.op_jump_seq(log.seq_live);

  bufferlist bl;
  encode(t, bl);
  _pad_bl(bl);

  // room for the dump itself plus runway for future log entries
  uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
  dout(20) << __func__ << " need " << need << dendl;

  bluefs_fnode_t old_fnode;
  int r;
  vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
  // move the current extents aside; they are released at the very end
  log_file->fnode.swap_extents(old_fnode);
  if (allocate_with_fallback) {
    r = _allocate(log_dev, need, &log_file->fnode);
    ceph_assert(r == 0);
  } else {
    PExtentVector extents;
    r = _allocate_without_fallback(log_dev,
                                   need,
                                   &extents);
    ceph_assert(r == 0);
    for (auto& p : extents) {
      log_file->fnode.append_extent(
        bluefs_extent_t(log_dev, p.offset, p.length));
    }
  }

  _close_writer(log.writer);

  // we will write it to super
  log_file->fnode.reset_delta();
  log_file->fnode.size = bl.length();

  // write the dump through a fresh writer on the new extents
  log.writer = _create_writer(log_file);
  log.writer->append(bl);
  _flush_special(log.writer);
  vselector->add_usage(log_file->vselector_hint, log_file->fnode);
#ifdef HAVE_LIBAIO
  if (!cct->_conf->bluefs_sync_write) {
    list<aio_t> completed_ios;
    _claim_completed_aios(log.writer, &completed_ios);
    _wait_for_aio(log.writer);
    completed_ios.clear();
  }
#endif
  _flush_bdev();
  // the dump ended with op_jump_seq(seq_live); what follows uses the next seq
  ++log.seq_live;
  dirty.seq_live = log.seq_live;
  log.t.seq = log.seq_live;

  super.memorized_layout = layout;
  super.log_fnode = log_file->fnode;
  // rename device if needed
  if (log_dev != log_dev_new) {
    dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
    for (auto& p : super.log_fnode.extents) {
      p.bdev = log_dev_new;
    }
  }
  dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;

  ++super.version;
  _write_super(super_dev);
  _flush_bdev();

  // the superblock now references the new log; the old space can go
  dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
  std::lock_guard dl(dirty.lock);
  for (auto& r : old_fnode.extents) {
    dirty.pending_release[r.bdev].insert(r.offset, r.length);
  }
}
2535
/*
 * Asynchronous log compaction.
 *
 * 1. Allocate a new extent to continue the log, and then log an event
 * that jumps the log write position to the new extent. At this point, the
 * old extent(s) won't be written to, and reflect everything to compact.
 * New events will be written to the new region that we'll keep.
 *
 * 2. While still holding the lock, encode a bufferlist that dumps all of the
 * in-memory fnodes and names. This will become the new beginning of the
 * log. The last event will jump to the log continuation extent from #1.
 *
 * 3. Queue a write to a new extent for the new beginning of the log.
 *
 * 4. Drop lock and wait
 *
 * 5. Retake the lock.
 *
 * 6. Update the log_fnode to splice in the new beginning.
 *
 * 7. Write the new superblock.
 *
 * 8. Release the old log space. Clean up.
 *
 * Note: the step numbers used in the comments inside the function body do
 * not map one-to-one onto the list above.
 */

void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
{
  dout(10) << __func__ << dendl;
  // only one compaction allowed at one time
  bool old_is_comp = std::atomic_exchange(&log_is_compacting, true);
  if (old_is_comp) {
    dout(10) << __func__ << " ongoing" <<dendl;
    return;
  }

  // log.lock is managed by hand here: it is released while the new log
  // head is written and taken again for the final swap
  log.lock.lock();
  File *log_file = log.writer->file.get();
  FileWriter *new_log_writer = nullptr;
  FileRef new_log = nullptr;
  uint64_t new_log_jump_to = 0;
  uint64_t old_log_jump_to = 0;

  new_log = ceph::make_ref<File>();
  new_log->fnode.ino = 0;   // we use _flush_special to avoid log of the fnode

  // Part 1.
  // Prepare current log for jumping into it.
  // 1. Allocate extent
  // 2. Update op to log
  // 3. Jump op to log
  // During that, no one else can write to log, otherwise we risk jumping backwards.
  // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.

  // signal _maybe_extend_log that expansion of the log is temporarily unacceptable
  bool old_forbidden = atomic_exchange(&log_forbidden_to_expand, true);
  ceph_assert(old_forbidden == false);

  vselector->sub_usage(log_file->vselector_hint, log_file->fnode);

  // 1.1 allocate new log space and jump to it.
  // the jump target is the end of what is allocated right now, i.e. the
  // start of the space allocated just below
  old_log_jump_to = log_file->fnode.get_allocated();
  // space left in the current allocation; the flush below must fit in it
  uint64_t runway = log_file->fnode.get_allocated() - log.writer->get_effective_write_pos();
  dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
           << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
  int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
                    cct->_conf->bluefs_max_log_runway,
                    &log_file->fnode);
  ceph_assert(r == 0);
  //adjust usage as flush below will need it
  vselector->add_usage(log_file->vselector_hint, log_file->fnode);
  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;

  // update the log file change and log a jump to the offset where we want to
  // write the new entries
  log.t.op_file_update(log_file->fnode);
  // jump to new position should mean next seq
  log.t.op_jump(log.seq_live + 1, old_log_jump_to);
  // remember the seq before the flush below advances it
  uint64_t seq_now = log.seq_live;
  // we need to flush all bdev because we will be streaming all dirty files to log
  // TODO - think - if _flush_and_sync_log_jump will not add dirty files nor release pending allocations
  // then flush_bdev() will not be necessary
  _flush_bdev();
  _flush_and_sync_log_jump_D(old_log_jump_to, runway);

  // out of jump section

  // 2. prepare compacted log
  bluefs_transaction_t t;
  _compact_log_async_dump_metadata_NF(&t, seq_now);

  // now state is captured to bufferlist
  // log can be used to write to, ops in log will be continuation of captured state
  log.lock.unlock();

  uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
                                     std::max(alloc_size[BDEV_DB],
                                              alloc_size[BDEV_SLOW]));

  // conservative estimate for final encoded size
  new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
                                max_alloc_size);
  //newly constructed log head will jump to what we had before
  t.op_jump(seq_now, new_log_jump_to);

  // allocate
  //FIXME: check if we want DB here?
  r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
                &new_log->fnode);
  ceph_assert(r == 0);

  bufferlist bl;
  encode(t, bl);
  _pad_bl(bl);

  dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
           << std::dec << dendl;

  new_log_writer = _create_writer(new_log);

  new_log_writer->append(bl);
  // 3. flush
  _flush_special(new_log_writer);

  // 4. wait
  _flush_bdev(new_log_writer);
  // 5. update our log fnode
  // we need to append to new_log the extents that were allocated in step 1.1
  // we do it by inverse logic - we drop 'old_log_jump_to' bytes and keep rest
  // todo - maybe improve _allocate so we will give clear set of new allocations
  uint64_t processed = 0;  // bytes of the old log file walked so far
  mempool::bluefs::vector<bluefs_extent_t> old_extents;
  for (auto& e : log_file->fnode.extents) {
    if (processed + e.length <= old_log_jump_to) {
      // drop whole extent
      dout(10) << __func__ << " remove old log extent " << e << dendl;
      old_extents.push_back(e);
    } else {
      // keep, but how much?
      if (processed < old_log_jump_to) {
        ceph_assert(processed + e.length > old_log_jump_to);
        ceph_assert(old_log_jump_to - processed <= std::numeric_limits<uint32_t>::max());
        uint32_t cut_at = uint32_t(old_log_jump_to - processed);
        // need to cut, first half gets dropped
        bluefs_extent_t retire(e.bdev, e.offset, cut_at);
        old_extents.push_back(retire);
        // second half goes to new log
        bluefs_extent_t keep(e.bdev, e.offset + cut_at, e.length - cut_at);
        new_log->fnode.append_extent(keep);
        dout(10) << __func__ << " kept " << keep << " removed " << retire << dendl;
      } else {
        // take entire extent
        ceph_assert(processed >= old_log_jump_to);
        new_log->fnode.append_extent(e);
        dout(10) << __func__ << " kept " << e << dendl;
      }
    }
    processed += e.length;
  }
  // we will write it to super
  new_log->fnode.reset_delta();

  // 6. write the super block to reflect the changes
  dout(10) << __func__ << " writing super" << dendl;
  new_log->fnode.ino = log_file->fnode.ino;
  new_log->fnode.size = 0;
  new_log->fnode.mtime = ceph_clock_now();
  super.log_fnode = new_log->fnode;
  ++super.version;
  _write_super(BDEV_DB);
  _flush_bdev();

  log.lock.lock();
  // swapping log_file and new_log
  vselector->sub_usage(log_file->vselector_hint, log_file->fnode);

  // clear the extents from old log file, they are added to new log
  log_file->fnode.clear_extents();
  // swap the log files. New log file is the log file now.
  new_log->fnode.swap_extents(log_file->fnode);

  // rebase the write position: the first old_log_jump_to bytes of the old
  // log were replaced by new_log_jump_to bytes of new log head
  log.writer->pos = log.writer->file->fnode.size =
    log.writer->pos - old_log_jump_to + new_log_jump_to;

  vselector->add_usage(log_file->vselector_hint, log_file->fnode);

  log.lock.unlock();

  old_forbidden = atomic_exchange(&log_forbidden_to_expand, false);
  ceph_assert(old_forbidden == true);
  //to wake up if someone was in need of expanding log
  log_cond.notify_all();

  // 7. release old space
  dout(10) << __func__ << " release old log extents " << old_extents << dendl;
  {
    std::lock_guard dl(dirty.lock);
    for (auto& r : old_extents) {
      dirty.pending_release[r.bdev].insert(r.offset, r.length);
    }
  }

  // delete the new log, remove from the dirty files list
  _close_writer(new_log_writer);
  new_log_writer = nullptr;
  new_log = nullptr;
  log_cond.notify_all();

  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
  logger->inc(l_bluefs_log_compactions);

  // allow the next compaction to start
  old_is_comp = atomic_exchange(&log_is_compacting, false);
  ceph_assert(old_is_comp);
}
2747
2748 void BlueFS::_pad_bl(bufferlist& bl)
2749 {
2750 uint64_t partial = bl.length() % super.block_size;
2751 if (partial) {
2752 dout(10) << __func__ << " padding with 0x" << std::hex
2753 << super.block_size - partial << " zeros" << std::dec << dendl;
2754 bl.append_zero(super.block_size - partial);
2755 }
2756 }
2757
2758
2759 // Returns log seq that was live before advance.
2760 uint64_t BlueFS::_log_advance_seq()
2761 {
2762 ceph_assert(ceph_mutex_is_locked(dirty.lock));
2763 ceph_assert(ceph_mutex_is_locked(log.lock));
2764 //acquire new seq
2765 // this will became seq_stable once we write
2766 ceph_assert(dirty.seq_stable < dirty.seq_live);
2767 ceph_assert(log.t.seq == log.seq_live);
2768 uint64_t seq = log.seq_live;
2769 log.t.uuid = super.uuid;
2770
2771 ++dirty.seq_live;
2772 ++log.seq_live;
2773 ceph_assert(dirty.seq_live == log.seq_live);
2774 return seq;
2775 }
2776
2777
2778 // Adds to log.t file modifications mentioned in `dirty.files`.
2779 // Note: some bluefs ops may have already been stored in log.t transaction.
2780 void BlueFS::_consume_dirty(uint64_t seq)
2781 {
2782 ceph_assert(ceph_mutex_is_locked(dirty.lock));
2783 ceph_assert(ceph_mutex_is_locked(log.lock));
2784
2785 // log dirty files
2786 // we just incremented log_seq. It is now illegal to add to dirty.files[log_seq]
2787 auto lsi = dirty.files.find(seq);
2788 if (lsi != dirty.files.end()) {
2789 dout(20) << __func__ << " " << lsi->second.size() << " dirty.files" << dendl;
2790 for (auto &f : lsi->second) {
2791 // fnode here is protected indirectly
2792 // the only path that adds to dirty.files goes from _fsync()
2793 // _fsync() is executed under writer lock,
2794 // and does not exit until syncing log is done
2795 dout(20) << __func__ << " op_file_update_inc " << f.fnode << dendl;
2796 log.t.op_file_update_inc(f.fnode);
2797 }
2798 }
2799 }
2800
2801 // Extends log if its free space is smaller then bluefs_min_log_runway.
2802 // Returns space available *BEFORE* adding new space. Signed for additional <0 detection.
2803 int64_t BlueFS::_maybe_extend_log()
2804 {
2805 ceph_assert(ceph_mutex_is_locked(log.lock));
2806 // allocate some more space (before we run out)?
2807 // BTW: this triggers `flush()` in the `page_aligned_appender` of `log.writer`.
2808 int64_t runway = log.writer->file->fnode.get_allocated() -
2809 log.writer->get_effective_write_pos();
2810 if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
2811 dout(10) << __func__ << " allocating more log runway (0x"
2812 << std::hex << runway << std::dec << " remaining)" << dendl;
2813 /*
2814 * Usually, when we are low on space in log, we just allocate new extent,
2815 * put update op(log) to log and we are fine.
2816 * Problem - it interferes with log compaction:
2817 * New log produced in compaction will include - as last op - jump into some offset (anchor) of current log.
2818 * It is assumed that log region (anchor - end) will contain all changes made by bluefs since
2819 * full state capture into new log.
2820 * Putting log update into (anchor - end) region is illegal, because any update there must be compatible with
2821 * both logs, but old log is different then new log.
2822 *
2823 * Possible solutions:
2824 * - stall extending log until we finish compacting and switch log (CURRENT)
2825 * - re-run compaction with more runway for old log
2826 * - add OP_FILE_ADDEXT that adds extent; will be compatible with both logs
2827 */
2828 if (log_forbidden_to_expand.load() == true) {
2829 return -EWOULDBLOCK;
2830 }
2831 vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
2832 int r = _allocate(
2833 vselector->select_prefer_bdev(log.writer->file->vselector_hint),
2834 cct->_conf->bluefs_max_log_runway,
2835 &log.writer->file->fnode);
2836 ceph_assert(r == 0);
2837 vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
2838 log.t.op_file_update_inc(log.writer->file->fnode);
2839 }
2840 return runway;
2841 }
2842
2843 void BlueFS::_flush_and_sync_log_core(int64_t runway)
2844 {
2845 ceph_assert(ceph_mutex_is_locked(log.lock));
2846 dout(10) << __func__ << " " << log.t << dendl;
2847
2848 bufferlist bl;
2849 bl.reserve(super.block_size);
2850 encode(log.t, bl);
2851 // pad to block boundary
2852 size_t realign = super.block_size - (bl.length() % super.block_size);
2853 if (realign && realign != super.block_size)
2854 bl.append_zero(realign);
2855
2856 logger->inc(l_bluefs_logged_bytes, bl.length());
2857
2858 if (true) {
2859 ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss
2860 // transaction will not fit extents before growth -> data loss on _replay
2861 }
2862
2863 log.writer->append(bl);
2864
2865 // prepare log for new transactions
2866 log.t.clear();
2867 log.t.seq = log.seq_live;
2868
2869 uint64_t new_data = _flush_special(log.writer);
2870 vselector->add_usage(log.writer->file->vselector_hint, new_data);
2871 }
2872
2873 // Clears dirty.files up to (including) seq_stable.
2874 void BlueFS::_clear_dirty_set_stable_D(uint64_t seq)
2875 {
2876 std::lock_guard dl(dirty.lock);
2877
2878 // clean dirty files
2879 if (seq > dirty.seq_stable) {
2880 dirty.seq_stable = seq;
2881 dout(20) << __func__ << " seq_stable " << dirty.seq_stable << dendl;
2882
2883 // undirty all files that were already streamed to log
2884 auto p = dirty.files.begin();
2885 while (p != dirty.files.end()) {
2886 if (p->first > dirty.seq_stable) {
2887 dout(20) << __func__ << " done cleaning up dirty files" << dendl;
2888 break;
2889 }
2890
2891 auto l = p->second.begin();
2892 while (l != p->second.end()) {
2893 File *file = &*l;
2894 ceph_assert(file->dirty_seq <= dirty.seq_stable);
2895 dout(20) << __func__ << " cleaned file " << file->fnode.ino << dendl;
2896 file->dirty_seq = dirty.seq_stable;
2897 p->second.erase(l++);
2898 }
2899
2900 ceph_assert(p->second.empty());
2901 dirty.files.erase(p++);
2902 }
2903 } else {
2904 dout(20) << __func__ << " seq_stable " << dirty.seq_stable
2905 << " already >= out seq " << seq
2906 << ", we lost a race against another log flush, done" << dendl;
2907 }
2908 }
2909
2910 void BlueFS::_release_pending_allocations(vector<interval_set<uint64_t>>& to_release)
2911 {
2912 for (unsigned i = 0; i < to_release.size(); ++i) {
2913 if (!to_release[i].empty()) {
2914 /* OK, now we have the guarantee alloc[i] won't be null. */
2915 int r = 0;
2916 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
2917 r = bdev[i]->queue_discard(to_release[i]);
2918 if (r == 0)
2919 continue;
2920 } else if (cct->_conf->bdev_enable_discard) {
2921 for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
2922 bdev[i]->discard(p.get_start(), p.get_len());
2923 }
2924 }
2925 alloc[i]->release(to_release[i]);
2926 if (is_shared_alloc(i)) {
2927 shared_alloc->bluefs_used -= to_release[i].size();
2928 }
2929 }
2930 }
2931 }
2932
2933 int BlueFS::_flush_and_sync_log_LD(uint64_t want_seq)
2934 {
2935 int64_t available_runway;
2936 do {
2937 log.lock.lock();
2938 dirty.lock.lock();
2939 if (want_seq && want_seq <= dirty.seq_stable) {
2940 dout(10) << __func__ << " want_seq " << want_seq << " <= seq_stable "
2941 << dirty.seq_stable << ", done" << dendl;
2942 dirty.lock.unlock();
2943 log.lock.unlock();
2944 return 0;
2945 }
2946
2947 available_runway = _maybe_extend_log();
2948 if (available_runway == -EWOULDBLOCK) {
2949 // we are in need of adding runway, but we are during log-switch from compaction
2950 dirty.lock.unlock();
2951 //instead log.lock.unlock() do move ownership
2952 std::unique_lock<ceph::mutex> ll(log.lock, std::adopt_lock);
2953 while (log_forbidden_to_expand.load()) {
2954 log_cond.wait(ll);
2955 }
2956 } else {
2957 ceph_assert(available_runway >= 0);
2958 }
2959 } while (available_runway < 0);
2960
2961 ceph_assert(want_seq == 0 || want_seq <= dirty.seq_live); // illegal to request seq that was not created yet
2962 uint64_t seq =_log_advance_seq();
2963 _consume_dirty(seq);
2964 vector<interval_set<uint64_t>> to_release(dirty.pending_release.size());
2965 to_release.swap(dirty.pending_release);
2966 dirty.lock.unlock();
2967
2968 _flush_and_sync_log_core(available_runway);
2969 _flush_bdev(log.writer);
2970 logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
2971 //now log.lock is no longer needed
2972 log.lock.unlock();
2973
2974 _clear_dirty_set_stable_D(seq);
2975 _release_pending_allocations(to_release);
2976
2977 _update_logger_stats();
2978 return 0;
2979 }
2980
2981 // Flushes log and immediately adjusts log_writer pos.
2982 int BlueFS::_flush_and_sync_log_jump_D(uint64_t jump_to,
2983 int64_t available_runway)
2984 {
2985 ceph_assert(ceph_mutex_is_locked(log.lock));
2986
2987 ceph_assert(jump_to);
2988 // we synchronize writing to log, by lock to log.lock
2989
2990 dirty.lock.lock();
2991 uint64_t seq =_log_advance_seq();
2992 _consume_dirty(seq);
2993 vector<interval_set<uint64_t>> to_release(dirty.pending_release.size());
2994 to_release.swap(dirty.pending_release);
2995 dirty.lock.unlock();
2996 _flush_and_sync_log_core(available_runway);
2997
2998 dout(10) << __func__ << " jumping log offset from 0x" << std::hex
2999 << log.writer->pos << " -> 0x" << jump_to << std::dec << dendl;
3000 log.writer->pos = jump_to;
3001 vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size);
3002 log.writer->file->fnode.size = jump_to;
3003 vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size);
3004
3005 _flush_bdev(log.writer);
3006
3007 _clear_dirty_set_stable_D(seq);
3008 _release_pending_allocations(to_release);
3009
3010 logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
3011 _update_logger_stats();
3012 return 0;
3013 }
3014
3015 ceph::bufferlist BlueFS::FileWriter::flush_buffer(
3016 CephContext* const cct,
3017 const bool partial,
3018 const unsigned length,
3019 const bluefs_super_t& super)
3020 {
3021 ceph_assert(ceph_mutex_is_locked(this->lock) || file->fnode.ino <= 1);
3022 ceph::bufferlist bl;
3023 if (partial) {
3024 tail_block.splice(0, tail_block.length(), &bl);
3025 }
3026 const auto remaining_len = length - bl.length();
3027 buffer.splice(0, remaining_len, &bl);
3028 if (buffer.length()) {
3029 dout(20) << " leaving 0x" << std::hex << buffer.length() << std::dec
3030 << " unflushed" << dendl;
3031 }
3032 if (const unsigned tail = bl.length() & ~super.block_mask(); tail) {
3033 const auto padding_len = super.block_size - tail;
3034 dout(20) << __func__ << " caching tail of 0x"
3035 << std::hex << tail
3036 << " and padding block with 0x" << padding_len
3037 << " buffer.length() " << buffer.length()
3038 << std::dec << dendl;
3039 // We need to go through the `buffer_appender` to get a chance to
3040 // preserve in-memory contiguity and not mess with the alignment.
3041 // Otherwise a costly rebuild could happen in e.g. `KernelDevice`.
3042 buffer_appender.append_zero(padding_len);
3043 buffer.splice(buffer.length() - padding_len, padding_len, &bl);
3044 // Deep copy the tail here. This allows to avoid costlier copy on
3045 // bufferlist rebuild in e.g. `KernelDevice` and minimizes number
3046 // of memory allocations.
3047 // The alternative approach would be to place the entire tail and
3048 // padding on a dedicated, 4 KB long memory chunk. This shouldn't
3049 // trigger the rebuild while still being less expensive.
3050 buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail);
3051 buffer.splice(buffer.length() - tail, tail, &tail_block);
3052 } else {
3053 tail_block.clear();
3054 }
3055 return bl;
3056 }
3057
3058 int BlueFS::_signal_dirty_to_log_D(FileWriter *h)
3059 {
3060 ceph_assert(ceph_mutex_is_locked(h->lock));
3061 std::lock_guard dl(dirty.lock);
3062 h->file->fnode.mtime = ceph_clock_now();
3063 ceph_assert(h->file->fnode.ino >= 1);
3064 if (h->file->dirty_seq <= dirty.seq_stable) {
3065 h->file->dirty_seq = dirty.seq_live;
3066 dirty.files[h->file->dirty_seq].push_back(*h->file);
3067 dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
3068 << " (was clean)" << dendl;
3069 } else {
3070 if (h->file->dirty_seq != dirty.seq_live) {
3071 // need re-dirty, erase from list first
3072 ceph_assert(dirty.files.count(h->file->dirty_seq));
3073 auto it = dirty.files[h->file->dirty_seq].iterator_to(*h->file);
3074 dirty.files[h->file->dirty_seq].erase(it);
3075 h->file->dirty_seq = dirty.seq_live;
3076 dirty.files[h->file->dirty_seq].push_back(*h->file);
3077 dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
3078 << " (was " << h->file->dirty_seq << ")" << dendl;
3079 } else {
3080 dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
3081 << " (unchanged, do nothing) " << dendl;
3082 }
3083 }
3084 return 0;
3085 }
3086
3087 void BlueFS::flush_range(FileWriter *h, uint64_t offset, uint64_t length)/*_WF*/
3088 {
3089 _maybe_check_vselector_LNF();
3090 std::unique_lock hl(h->lock);
3091 _flush_range_F(h, offset, length);
3092 }
3093
3094 int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length)
3095 {
3096 ceph_assert(ceph_mutex_is_locked(h->lock));
3097 ceph_assert(h->file->num_readers.load() == 0);
3098 ceph_assert(h->file->fnode.ino > 1);
3099
3100 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
3101 << " 0x" << offset << "~" << length << std::dec
3102 << " to " << h->file->fnode << dendl;
3103 if (h->file->deleted) {
3104 dout(10) << __func__ << " deleted, no-op" << dendl;
3105 return 0;
3106 }
3107
3108 bool buffered = cct->_conf->bluefs_buffered_io;
3109
3110 if (offset + length <= h->pos)
3111 return 0;
3112 if (offset < h->pos) {
3113 length -= h->pos - offset;
3114 offset = h->pos;
3115 dout(10) << " still need 0x"
3116 << std::hex << offset << "~" << length << std::dec
3117 << dendl;
3118 }
3119 std::lock_guard file_lock(h->file->lock);
3120 ceph_assert(offset <= h->file->fnode.size);
3121
3122 uint64_t allocated = h->file->fnode.get_allocated();
3123 vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
3124 // do not bother to dirty the file if we are overwriting
3125 // previously allocated extents.
3126 if (allocated < offset + length) {
3127 // we should never run out of log space here; see the min runway check
3128 // in _flush_and_sync_log.
3129 int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
3130 offset + length - allocated,
3131 &h->file->fnode);
3132 if (r < 0) {
3133 derr << __func__ << " allocated: 0x" << std::hex << allocated
3134 << " offset: 0x" << offset << " length: 0x" << length << std::dec
3135 << dendl;
3136 vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
3137 ceph_abort_msg("bluefs enospc");
3138 return r;
3139 }
3140 h->file->is_dirty = true;
3141 }
3142 if (h->file->fnode.size < offset + length) {
3143 h->file->fnode.size = offset + length;
3144 h->file->is_dirty = true;
3145 }
3146
3147 dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl;
3148 int res = _flush_data(h, offset, length, buffered);
3149 vselector->add_usage(h->file->vselector_hint, h->file->fnode);
3150 return res;
3151 }
3152
3153 int BlueFS::_flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered)
3154 {
3155 if (h->file->fnode.ino > 1) {
3156 ceph_assert(ceph_mutex_is_locked(h->lock));
3157 ceph_assert(ceph_mutex_is_locked(h->file->lock));
3158 }
3159 uint64_t x_off = 0;
3160 auto p = h->file->fnode.seek(offset, &x_off);
3161 ceph_assert(p != h->file->fnode.extents.end());
3162 dout(20) << __func__ << " in " << *p << " x_off 0x"
3163 << std::hex << x_off << std::dec << dendl;
3164
3165 unsigned partial = x_off & ~super.block_mask();
3166 if (partial) {
3167 dout(20) << __func__ << " using partial tail 0x"
3168 << std::hex << partial << std::dec << dendl;
3169 x_off -= partial;
3170 offset -= partial;
3171 length += partial;
3172 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
3173 for (auto p : h->iocv) {
3174 if (p) {
3175 p->aio_wait();
3176 }
3177 }
3178 }
3179
3180 auto bl = h->flush_buffer(cct, partial, length, super);
3181 ceph_assert(bl.length() >= length);
3182 h->pos = offset + length;
3183 length = bl.length();
3184
3185 switch (h->writer_type) {
3186 case WRITER_WAL:
3187 logger->inc(l_bluefs_bytes_written_wal, length);
3188 break;
3189 case WRITER_SST:
3190 logger->inc(l_bluefs_bytes_written_sst, length);
3191 break;
3192 }
3193
3194 dout(30) << "dump:\n";
3195 bl.hexdump(*_dout);
3196 *_dout << dendl;
3197
3198 uint64_t bloff = 0;
3199 uint64_t bytes_written_slow = 0;
3200 while (length > 0) {
3201 uint64_t x_len = std::min(p->length - x_off, length);
3202 bufferlist t;
3203 t.substr_of(bl, bloff, x_len);
3204 if (cct->_conf->bluefs_sync_write) {
3205 bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
3206 } else {
3207 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
3208 }
3209 h->dirty_devs[p->bdev] = true;
3210 if (p->bdev == BDEV_SLOW) {
3211 bytes_written_slow += t.length();
3212 }
3213
3214 bloff += x_len;
3215 length -= x_len;
3216 ++p;
3217 x_off = 0;
3218 }
3219 if (bytes_written_slow) {
3220 logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
3221 }
3222 for (unsigned i = 0; i < MAX_BDEV; ++i) {
3223 if (bdev[i]) {
3224 if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
3225 bdev[i]->aio_submit(h->iocv[i]);
3226 }
3227 }
3228 }
3229 dout(20) << __func__ << " h " << h << " pos now 0x"
3230 << std::hex << h->pos << std::dec << dendl;
3231 return 0;
3232 }
3233
3234 #ifdef HAVE_LIBAIO
3235 // we need to retire old completed aios so they don't stick around in
3236 // memory indefinitely (along with their bufferlist refs).
3237 void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
3238 {
3239 for (auto p : h->iocv) {
3240 if (p) {
3241 ls->splice(ls->end(), p->running_aios);
3242 }
3243 }
3244 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
3245 }
3246
3247 void BlueFS::_wait_for_aio(FileWriter *h)
3248 {
3249 // NOTE: this is safe to call without a lock, as long as our reference is
3250 // stable.
3251 utime_t start;
3252 lgeneric_subdout(cct, bluefs, 10) << __func__;
3253 start = ceph_clock_now();
3254 *_dout << " " << h << dendl;
3255 for (auto p : h->iocv) {
3256 if (p) {
3257 p->aio_wait();
3258 }
3259 }
3260 dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
3261 }
3262 #endif
3263
3264 void BlueFS::append_try_flush(FileWriter *h, const char* buf, size_t len)/*_WF_LNF_NF_LD_D*/
3265 {
3266 bool flushed_sum = false;
3267 {
3268 std::unique_lock hl(h->lock);
3269 size_t max_size = 1ull << 30; // cap to 1GB
3270 while (len > 0) {
3271 bool need_flush = true;
3272 auto l0 = h->get_buffer_length();
3273 if (l0 < max_size) {
3274 size_t l = std::min(len, max_size - l0);
3275 h->append(buf, l);
3276 buf += l;
3277 len -= l;
3278 need_flush = h->get_buffer_length() >= cct->_conf->bluefs_min_flush_size;
3279 }
3280 if (need_flush) {
3281 bool flushed = false;
3282 int r = _flush_F(h, true, &flushed);
3283 ceph_assert(r == 0);
3284 flushed_sum |= flushed;
3285 // make sure we've made any progress with flush hence the
3286 // loop doesn't iterate forever
3287 ceph_assert(h->get_buffer_length() < max_size);
3288 }
3289 }
3290 }
3291 if (flushed_sum) {
3292 _maybe_compact_log_LNF_NF_LD_D();
3293 }
3294 }
3295
3296 void BlueFS::flush(FileWriter *h, bool force)/*_WF_LNF_NF_LD_D*/
3297 {
3298 bool flushed = false;
3299 int r;
3300 {
3301 std::unique_lock hl(h->lock);
3302 r = _flush_F(h, force, &flushed);
3303 ceph_assert(r == 0);
3304 }
3305 if (r == 0 && flushed) {
3306 _maybe_compact_log_LNF_NF_LD_D();
3307 }
3308 }
3309
3310 int BlueFS::_flush_F(FileWriter *h, bool force, bool *flushed)
3311 {
3312 ceph_assert(ceph_mutex_is_locked(h->lock));
3313 uint64_t length = h->get_buffer_length();
3314 uint64_t offset = h->pos;
3315 if (flushed) {
3316 *flushed = false;
3317 }
3318 if (!force &&
3319 length < cct->_conf->bluefs_min_flush_size) {
3320 dout(10) << __func__ << " " << h << " ignoring, length " << length
3321 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
3322 << dendl;
3323 return 0;
3324 }
3325 if (length == 0) {
3326 dout(10) << __func__ << " " << h << " no dirty data on "
3327 << h->file->fnode << dendl;
3328 return 0;
3329 }
3330 dout(10) << __func__ << " " << h << " 0x"
3331 << std::hex << offset << "~" << length << std::dec
3332 << " to " << h->file->fnode << dendl;
3333 ceph_assert(h->pos <= h->file->fnode.size);
3334 int r = _flush_range_F(h, offset, length);
3335 if (flushed) {
3336 *flushed = true;
3337 }
3338 return r;
3339 }
3340
3341 // Flush for bluefs special files.
3342 // Does not add extents to h.
3343 // Does not mark h as dirty.
3344 // we do not need to dirty the log file (or it's compacting
3345 // replacement) when the file size changes because replay is
3346 // smart enough to discover it on its own.
3347 uint64_t BlueFS::_flush_special(FileWriter *h)
3348 {
3349 ceph_assert(h->file->fnode.ino <= 1);
3350 uint64_t length = h->get_buffer_length();
3351 uint64_t offset = h->pos;
3352 uint64_t new_data = 0;
3353 ceph_assert(length + offset <= h->file->fnode.get_allocated());
3354 if (h->file->fnode.size < offset + length) {
3355 new_data = offset + length - h->file->fnode.size;
3356 h->file->fnode.size = offset + length;
3357 }
3358 _flush_data(h, offset, length, false);
3359 return new_data;
3360 }
3361
3362 int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
3363 {
3364 std::lock_guard hl(h->lock);
3365 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
3366 << " file " << h->file->fnode << dendl;
3367 if (h->file->deleted) {
3368 dout(10) << __func__ << " deleted, no-op" << dendl;
3369 return 0;
3370 }
3371
3372 // we never truncate internal log files
3373 ceph_assert(h->file->fnode.ino > 1);
3374
3375 // truncate off unflushed data?
3376 if (h->pos < offset &&
3377 h->pos + h->get_buffer_length() > offset) {
3378 dout(20) << __func__ << " tossing out last " << offset - h->pos
3379 << " unflushed bytes" << dendl;
3380 ceph_abort_msg("actually this shouldn't happen");
3381 }
3382 if (h->get_buffer_length()) {
3383 int r = _flush_F(h, true);
3384 if (r < 0)
3385 return r;
3386 }
3387 if (offset == h->file->fnode.size) {
3388 return 0; // no-op!
3389 }
3390 if (offset > h->file->fnode.size) {
3391 ceph_abort_msg("truncate up not supported");
3392 }
3393 ceph_assert(h->file->fnode.size >= offset);
3394 _flush_bdev(h);
3395
3396 std::lock_guard ll(log.lock);
3397 vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
3398 h->file->fnode.size = offset;
3399 vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
3400 log.t.op_file_update_inc(h->file->fnode);
3401 return 0;
3402 }
3403
3404 int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/
3405 {
3406 _maybe_check_vselector_LNF();
3407 std::unique_lock hl(h->lock);
3408 uint64_t old_dirty_seq = 0;
3409 {
3410 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
3411 int r = _flush_F(h, true);
3412 if (r < 0)
3413 return r;
3414 _flush_bdev(h);
3415 if (h->file->is_dirty) {
3416 _signal_dirty_to_log_D(h);
3417 h->file->is_dirty = false;
3418 }
3419 {
3420 std::lock_guard dl(dirty.lock);
3421 if (dirty.seq_stable < h->file->dirty_seq) {
3422 old_dirty_seq = h->file->dirty_seq;
3423 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
3424 << ") on " << h->file->fnode << ", flushing log" << dendl;
3425 }
3426 }
3427 }
3428 if (old_dirty_seq) {
3429 _flush_and_sync_log_LD(old_dirty_seq);
3430 }
3431 _maybe_compact_log_LNF_NF_LD_D();
3432
3433 return 0;
3434 }
3435
3436 // be careful - either h->file->lock or log.lock must be taken
3437 void BlueFS::_flush_bdev(FileWriter *h)
3438 {
3439 if (h->file->fnode.ino > 1) {
3440 ceph_assert(ceph_mutex_is_locked(h->lock));
3441 } else if (h->file->fnode.ino == 1) {
3442 ceph_assert(ceph_mutex_is_locked(log.lock));
3443 }
3444 std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
3445 h->dirty_devs.fill(false);
3446 #ifdef HAVE_LIBAIO
3447 if (!cct->_conf->bluefs_sync_write) {
3448 list<aio_t> completed_ios;
3449 _claim_completed_aios(h, &completed_ios);
3450 _wait_for_aio(h);
3451 completed_ios.clear();
3452 }
3453 #endif
3454 _flush_bdev(flush_devs);
3455 }
3456
3457 void BlueFS::_flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
3458 {
3459 // NOTE: this is safe to call without a lock.
3460 dout(20) << __func__ << dendl;
3461 for (unsigned i = 0; i < MAX_BDEV; i++) {
3462 if (dirty_bdevs[i])
3463 bdev[i]->flush();
3464 }
3465 }
3466
3467 void BlueFS::_flush_bdev()
3468 {
3469 // NOTE: this is safe to call without a lock.
3470 dout(20) << __func__ << dendl;
3471 for (unsigned i = 0; i < MAX_BDEV; i++) {
3472 // alloc space from BDEV_SLOW is unexpected.
3473 // So most cases we don't alloc from BDEV_SLOW and so avoiding flush not-used device.
3474 if (bdev[i] && (i != BDEV_SLOW || _get_used(i))) {
3475 bdev[i]->flush();
3476 }
3477 }
3478 }
3479
3480 const char* BlueFS::get_device_name(unsigned id)
3481 {
3482 if (id >= MAX_BDEV) return "BDEV_INV";
3483 const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3484 return names[id];
3485 }
3486
3487 int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
3488 PExtentVector* extents)
3489 {
3490 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3491 << " from " << (int)id << dendl;
3492 assert(id < alloc.size());
3493 if (!alloc[id]) {
3494 return -ENOENT;
3495 }
3496 extents->reserve(4); // 4 should be (more than) enough for most allocations
3497 int64_t need = round_up_to(len, alloc_size[id]);
3498 int64_t alloc_len = alloc[id]->allocate(need, alloc_size[id], 0, extents);
3499 if (alloc_len < 0 || alloc_len < need) {
3500 if (alloc_len > 0) {
3501 alloc[id]->release(*extents);
3502 }
3503 derr << __func__ << " unable to allocate 0x" << std::hex << need
3504 << " on bdev " << (int)id
3505 << ", allocator name " << alloc[id]->get_name()
3506 << ", allocator type " << alloc[id]->get_type()
3507 << ", capacity 0x" << alloc[id]->get_capacity()
3508 << ", block size 0x" << alloc[id]->get_block_size()
3509 << ", alloc size 0x" << alloc_size[id]
3510 << ", free 0x" << alloc[id]->get_free()
3511 << ", fragmentation " << alloc[id]->get_fragmentation()
3512 << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
3513 << std::dec << dendl;
3514 alloc[id]->dump();
3515 return -ENOSPC;
3516 }
3517 if (is_shared_alloc(id)) {
3518 shared_alloc->bluefs_used += alloc_len;
3519 }
3520
3521 return 0;
3522 }
3523
3524 int BlueFS::_allocate(uint8_t id, uint64_t len,
3525 bluefs_fnode_t* node)
3526 {
3527 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3528 << " from " << (int)id << dendl;
3529 ceph_assert(id < alloc.size());
3530 int64_t alloc_len = 0;
3531 PExtentVector extents;
3532 uint64_t hint = 0;
3533 int64_t need = len;
3534 if (alloc[id]) {
3535 need = round_up_to(len, alloc_size[id]);
3536 if (!node->extents.empty() && node->extents.back().bdev == id) {
3537 hint = node->extents.back().end();
3538 }
3539 extents.reserve(4); // 4 should be (more than) enough for most allocations
3540 alloc_len = alloc[id]->allocate(need, alloc_size[id], hint, &extents);
3541 }
3542 if (alloc_len < 0 || alloc_len < need) {
3543 if (alloc[id]) {
3544 if (alloc_len > 0) {
3545 alloc[id]->release(extents);
3546 }
3547 dout(1) << __func__ << " unable to allocate 0x" << std::hex << need
3548 << " on bdev " << (int)id
3549 << ", allocator name " << alloc[id]->get_name()
3550 << ", allocator type " << alloc[id]->get_type()
3551 << ", capacity 0x" << alloc[id]->get_capacity()
3552 << ", block size 0x" << alloc[id]->get_block_size()
3553 << ", alloc size 0x" << alloc_size[id]
3554 << ", free 0x" << alloc[id]->get_free()
3555 << ", fragmentation " << alloc[id]->get_fragmentation()
3556 << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
3557 << std::dec << dendl;
3558 } else {
3559 dout(20) << __func__ << " alloc-id not set on index="<< (int)id << " unable to allocate 0x" << std::hex << need
3560 << " on bdev " << (int)id << std::dec << dendl;
3561 }
3562 if (id != BDEV_SLOW) {
3563 dout(20) << __func__ << " fallback to bdev "
3564 << (int)id + 1
3565 << dendl;
3566 return _allocate(id + 1, len, node);
3567 } else {
3568 derr << __func__ << " allocation failed, needed 0x" << std::hex << need
3569 << dendl;
3570 }
3571 return -ENOSPC;
3572 } else {
3573 uint64_t used = _get_used(id);
3574 if (max_bytes[id] < used) {
3575 logger->set(max_bytes_pcounters[id], used);
3576 max_bytes[id] = used;
3577 }
3578 if (is_shared_alloc(id)) {
3579 shared_alloc->bluefs_used += alloc_len;
3580 }
3581 }
3582
3583 for (auto& p : extents) {
3584 node->append_extent(bluefs_extent_t(id, p.offset, p.length));
3585 }
3586
3587 return 0;
3588 }
3589
3590 int BlueFS::preallocate(FileRef f, uint64_t off, uint64_t len)/*_LF*/
3591 {
3592 std::lock_guard ll(log.lock);
3593 std::lock_guard fl(f->lock);
3594 dout(10) << __func__ << " file " << f->fnode << " 0x"
3595 << std::hex << off << "~" << len << std::dec << dendl;
3596 if (f->deleted) {
3597 dout(10) << __func__ << " deleted, no-op" << dendl;
3598 return 0;
3599 }
3600 ceph_assert(f->fnode.ino > 1);
3601 uint64_t allocated = f->fnode.get_allocated();
3602 if (off + len > allocated) {
3603 uint64_t want = off + len - allocated;
3604
3605 vselector->sub_usage(f->vselector_hint, f->fnode);
3606 int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
3607 want,
3608 &f->fnode);
3609 vselector->add_usage(f->vselector_hint, f->fnode);
3610 if (r < 0)
3611 return r;
3612
3613 log.t.op_file_update_inc(f->fnode);
3614 }
3615 return 0;
3616 }
3617
3618 void BlueFS::sync_metadata(bool avoid_compact)/*_LNF_NF_LD_D*/
3619 {
3620 bool can_skip_flush;
3621 {
3622 std::lock_guard ll(log.lock);
3623 std::lock_guard dl(dirty.lock);
3624 can_skip_flush = log.t.empty() && dirty.files.empty();
3625 }
3626 if (can_skip_flush) {
3627 dout(10) << __func__ << " - no pending log events" << dendl;
3628 } else {
3629 utime_t start;
3630 lgeneric_subdout(cct, bluefs, 10) << __func__;
3631 start = ceph_clock_now();
3632 *_dout << dendl;
3633 _flush_bdev(); // FIXME?
3634 _flush_and_sync_log_LD();
3635 dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
3636 }
3637
3638 if (!avoid_compact) {
3639 _maybe_compact_log_LNF_NF_LD_D();
3640 }
3641 }
3642
3643 void BlueFS::_maybe_compact_log_LNF_NF_LD_D()
3644 {
3645 if (!cct->_conf->bluefs_replay_recovery_disable_compact &&
3646 _should_start_compact_log_L_N()) {
3647 if (cct->_conf->bluefs_compact_log_sync) {
3648 _compact_log_sync_LNF_LD();
3649 } else {
3650 _compact_log_async_LD_LNF_D();
3651 }
3652 }
3653 }
3654
3655 int BlueFS::open_for_write(
3656 std::string_view dirname,
3657 std::string_view filename,
3658 FileWriter **h,
3659 bool overwrite)/*_N_LD*/
3660 {
3661 _maybe_check_vselector_LNF();
3662 FileRef file;
3663 bool create = false;
3664 bool truncate = false;
3665 mempool::bluefs::vector<bluefs_extent_t> pending_release_extents;
3666 {
3667 std::unique_lock nl(nodes.lock);
3668 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3669 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
3670 DirRef dir;
3671 if (p == nodes.dir_map.end()) {
3672 // implicitly create the dir
3673 dout(20) << __func__ << " dir " << dirname
3674 << " does not exist" << dendl;
3675 return -ENOENT;
3676 } else {
3677 dir = p->second;
3678 }
3679
3680 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3681 if (q == dir->file_map.end()) {
3682 if (overwrite) {
3683 dout(20) << __func__ << " dir " << dirname << " (" << dir
3684 << ") file " << filename
3685 << " does not exist" << dendl;
3686 return -ENOENT;
3687 }
3688 file = ceph::make_ref<File>();
3689 file->fnode.ino = ++ino_last;
3690 nodes.file_map[ino_last] = file;
3691 dir->file_map[string{filename}] = file;
3692 ++file->refs;
3693 create = true;
3694 logger->set(l_bluefs_num_files, nodes.file_map.size());
3695 } else {
3696 // overwrite existing file?
3697 file = q->second;
3698 if (overwrite) {
3699 dout(20) << __func__ << " dir " << dirname << " (" << dir
3700 << ") file " << filename
3701 << " already exists, overwrite in place" << dendl;
3702 } else {
3703 dout(20) << __func__ << " dir " << dirname << " (" << dir
3704 << ") file " << filename
3705 << " already exists, truncate + overwrite" << dendl;
3706 vselector->sub_usage(file->vselector_hint, file->fnode);
3707 file->fnode.size = 0;
3708 pending_release_extents.swap(file->fnode.extents);
3709 truncate = true;
3710
3711 file->fnode.clear_extents();
3712 }
3713 }
3714 ceph_assert(file->fnode.ino > 1);
3715
3716 file->fnode.mtime = ceph_clock_now();
3717 file->vselector_hint = vselector->get_hint_by_dir(dirname);
3718 if (create || truncate) {
3719 vselector->add_usage(file->vselector_hint, file->fnode); // update file count
3720 }
3721
3722 dout(20) << __func__ << " mapping " << dirname << "/" << filename
3723 << " vsel_hint " << file->vselector_hint
3724 << dendl;
3725 }
3726 {
3727 std::lock_guard ll(log.lock);
3728 log.t.op_file_update(file->fnode);
3729 if (create)
3730 log.t.op_dir_link(dirname, filename, file->fnode.ino);
3731
3732 std::lock_guard dl(dirty.lock);
3733 for (auto& p : pending_release_extents) {
3734 dirty.pending_release[p.bdev].insert(p.offset, p.length);
3735 }
3736 }
3737 *h = _create_writer(file);
3738
3739 if (boost::algorithm::ends_with(filename, ".log")) {
3740 (*h)->writer_type = BlueFS::WRITER_WAL;
3741 if (logger && !overwrite) {
3742 logger->inc(l_bluefs_files_written_wal);
3743 }
3744 } else if (boost::algorithm::ends_with(filename, ".sst")) {
3745 (*h)->writer_type = BlueFS::WRITER_SST;
3746 if (logger) {
3747 logger->inc(l_bluefs_files_written_sst);
3748 }
3749 }
3750
3751 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3752 return 0;
3753 }
3754
3755 BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
3756 {
3757 FileWriter *w = new FileWriter(f);
3758 for (unsigned i = 0; i < MAX_BDEV; ++i) {
3759 if (bdev[i]) {
3760 w->iocv[i] = new IOContext(cct, NULL);
3761 }
3762 }
3763 return w;
3764 }
3765
3766 void BlueFS::_drain_writer(FileWriter *h)
3767 {
3768 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
3769 //h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
3770 for (unsigned i=0; i<MAX_BDEV; ++i) {
3771 if (bdev[i]) {
3772 if (h->iocv[i]) {
3773 h->iocv[i]->aio_wait();
3774 delete h->iocv[i];
3775 }
3776 }
3777 }
3778 // sanity
3779 if (h->file->fnode.size >= (1ull << 30)) {
3780 dout(10) << __func__ << " file is unexpectedly large:" << h->file->fnode << dendl;
3781 }
3782 }
3783
3784 void BlueFS::_close_writer(FileWriter *h)
3785 {
3786 _drain_writer(h);
3787 delete h;
3788 }
3789 void BlueFS::close_writer(FileWriter *h)
3790 {
3791 {
3792 std::lock_guard l(h->lock);
3793 _drain_writer(h);
3794 }
3795 delete h;
3796 }
3797
3798 uint64_t BlueFS::debug_get_dirty_seq(FileWriter *h)
3799 {
3800 std::lock_guard l(h->lock);
3801 return h->file->dirty_seq;
3802 }
3803
3804 bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev)
3805 {
3806 std::lock_guard l(h->lock);
3807 return h->dirty_devs[dev];
3808 }
3809
3810 int BlueFS::open_for_read(
3811 std::string_view dirname,
3812 std::string_view filename,
3813 FileReader **h,
3814 bool random)/*_N*/
3815 {
3816 _maybe_check_vselector_LNF();
3817 std::lock_guard nl(nodes.lock);
3818 dout(10) << __func__ << " " << dirname << "/" << filename
3819 << (random ? " (random)":" (sequential)") << dendl;
3820 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
3821 if (p == nodes.dir_map.end()) {
3822 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3823 return -ENOENT;
3824 }
3825 DirRef dir = p->second;
3826
3827 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3828 if (q == dir->file_map.end()) {
3829 dout(20) << __func__ << " dir " << dirname << " (" << dir
3830 << ") file " << filename
3831 << " not found" << dendl;
3832 return -ENOENT;
3833 }
3834 File *file = q->second.get();
3835
3836 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
3837 random, false);
3838 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3839 return 0;
3840 }
3841
3842 int BlueFS::rename(
3843 std::string_view old_dirname, std::string_view old_filename,
3844 std::string_view new_dirname, std::string_view new_filename)/*_LND*/
3845 {
3846 std::lock_guard ll(log.lock);
3847 std::lock_guard nl(nodes.lock);
3848 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
3849 << " -> " << new_dirname << "/" << new_filename << dendl;
3850 map<string,DirRef>::iterator p = nodes.dir_map.find(old_dirname);
3851 if (p == nodes.dir_map.end()) {
3852 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
3853 return -ENOENT;
3854 }
3855 DirRef old_dir = p->second;
3856 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
3857 if (q == old_dir->file_map.end()) {
3858 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
3859 << ") file " << old_filename
3860 << " not found" << dendl;
3861 return -ENOENT;
3862 }
3863 FileRef file = q->second;
3864
3865 p = nodes.dir_map.find(new_dirname);
3866 if (p == nodes.dir_map.end()) {
3867 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
3868 return -ENOENT;
3869 }
3870 DirRef new_dir = p->second;
3871 q = new_dir->file_map.find(new_filename);
3872 if (q != new_dir->file_map.end()) {
3873 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
3874 << ") file " << new_filename
3875 << " already exists, unlinking" << dendl;
3876 ceph_assert(q->second != file);
3877 log.t.op_dir_unlink(new_dirname, new_filename);
3878 _drop_link_D(q->second);
3879 }
3880
3881 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
3882 << " " << file->fnode << dendl;
3883
3884 new_dir->file_map[string{new_filename}] = file;
3885 old_dir->file_map.erase(string{old_filename});
3886
3887 log.t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
3888 log.t.op_dir_unlink(old_dirname, old_filename);
3889 return 0;
3890 }
3891
3892 int BlueFS::mkdir(std::string_view dirname)/*_LN*/
3893 {
3894 std::lock_guard ll(log.lock);
3895 std::lock_guard nl(nodes.lock);
3896 dout(10) << __func__ << " " << dirname << dendl;
3897 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
3898 if (p != nodes.dir_map.end()) {
3899 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
3900 return -EEXIST;
3901 }
3902 nodes.dir_map[string{dirname}] = ceph::make_ref<Dir>();
3903 log.t.op_dir_create(dirname);
3904 return 0;
3905 }
3906
3907 int BlueFS::rmdir(std::string_view dirname)/*_LN*/
3908 {
3909 std::lock_guard ll(log.lock);
3910 std::lock_guard nl(nodes.lock);
3911 dout(10) << __func__ << " " << dirname << dendl;
3912 auto p = nodes.dir_map.find(dirname);
3913 if (p == nodes.dir_map.end()) {
3914 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
3915 return -ENOENT;
3916 }
3917 DirRef dir = p->second;
3918 if (!dir->file_map.empty()) {
3919 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
3920 return -ENOTEMPTY;
3921 }
3922 nodes.dir_map.erase(string{dirname});
3923 log.t.op_dir_remove(dirname);
3924 return 0;
3925 }
3926
3927 bool BlueFS::dir_exists(std::string_view dirname)/*_N*/
3928 {
3929 std::lock_guard nl(nodes.lock);
3930 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
3931 bool exists = p != nodes.dir_map.end();
3932 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
3933 return exists;
3934 }
3935
3936 int BlueFS::stat(std::string_view dirname, std::string_view filename,
3937 uint64_t *size, utime_t *mtime)/*_N*/
3938 {
3939 std::lock_guard nl(nodes.lock);
3940 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3941 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
3942 if (p == nodes.dir_map.end()) {
3943 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3944 return -ENOENT;
3945 }
3946 DirRef dir = p->second;
3947 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3948 if (q == dir->file_map.end()) {
3949 dout(20) << __func__ << " dir " << dirname << " (" << dir
3950 << ") file " << filename
3951 << " not found" << dendl;
3952 return -ENOENT;
3953 }
3954 File *file = q->second.get();
3955 dout(10) << __func__ << " " << dirname << "/" << filename
3956 << " " << file->fnode << dendl;
3957 if (size)
3958 *size = file->fnode.size;
3959 if (mtime)
3960 *mtime = file->fnode.mtime;
3961 return 0;
3962 }
3963
3964 int BlueFS::lock_file(std::string_view dirname, std::string_view filename,
3965 FileLock **plock)/*_LN*/
3966 {
3967 std::lock_guard ll(log.lock);
3968 std::lock_guard nl(nodes.lock);
3969 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3970 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
3971 if (p == nodes.dir_map.end()) {
3972 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3973 return -ENOENT;
3974 }
3975 DirRef dir = p->second;
3976 auto q = dir->file_map.find(filename);
3977 FileRef file;
3978 if (q == dir->file_map.end()) {
3979 dout(20) << __func__ << " dir " << dirname << " (" << dir
3980 << ") file " << filename
3981 << " not found, creating" << dendl;
3982 file = ceph::make_ref<File>();
3983 file->fnode.ino = ++ino_last;
3984 file->fnode.mtime = ceph_clock_now();
3985 nodes.file_map[ino_last] = file;
3986 dir->file_map[string{filename}] = file;
3987 logger->set(l_bluefs_num_files, nodes.file_map.size());
3988 ++file->refs;
3989 log.t.op_file_update(file->fnode);
3990 log.t.op_dir_link(dirname, filename, file->fnode.ino);
3991 } else {
3992 file = q->second;
3993 if (file->locked) {
3994 dout(10) << __func__ << " already locked" << dendl;
3995 return -ENOLCK;
3996 }
3997 }
3998 file->locked = true;
3999 *plock = new FileLock(file);
4000 dout(10) << __func__ << " locked " << file->fnode
4001 << " with " << *plock << dendl;
4002 return 0;
4003 }
4004
4005 int BlueFS::unlock_file(FileLock *fl)/*_N*/
4006 {
4007 std::lock_guard nl(nodes.lock);
4008 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
4009 ceph_assert(fl->file->locked);
4010 fl->file->locked = false;
4011 delete fl;
4012 return 0;
4013 }
4014
4015 int BlueFS::readdir(std::string_view dirname, vector<string> *ls)/*_N*/
4016 {
4017 // dirname may contain a trailing /
4018 if (!dirname.empty() && dirname.back() == '/') {
4019 dirname.remove_suffix(1);
4020 }
4021 std::lock_guard nl(nodes.lock);
4022 dout(10) << __func__ << " " << dirname << dendl;
4023 if (dirname.empty()) {
4024 // list dirs
4025 ls->reserve(nodes.dir_map.size() + 2);
4026 for (auto& q : nodes.dir_map) {
4027 ls->push_back(q.first);
4028 }
4029 } else {
4030 // list files in dir
4031 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4032 if (p == nodes.dir_map.end()) {
4033 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4034 return -ENOENT;
4035 }
4036 DirRef dir = p->second;
4037 ls->reserve(dir->file_map.size() + 2);
4038 for (auto& q : dir->file_map) {
4039 ls->push_back(q.first);
4040 }
4041 }
4042 ls->push_back(".");
4043 ls->push_back("..");
4044 return 0;
4045 }
4046
4047 int BlueFS::unlink(std::string_view dirname, std::string_view filename)/*_LND*/
4048 {
4049 std::lock_guard ll(log.lock);
4050 std::lock_guard nl(nodes.lock);
4051 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
4052 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4053 if (p == nodes.dir_map.end()) {
4054 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4055 return -ENOENT;
4056 }
4057 DirRef dir = p->second;
4058 map<string,FileRef>::iterator q = dir->file_map.find(filename);
4059 if (q == dir->file_map.end()) {
4060 dout(20) << __func__ << " file " << dirname << "/" << filename
4061 << " not found" << dendl;
4062 return -ENOENT;
4063 }
4064 FileRef file = q->second;
4065 if (file->locked) {
4066 dout(20) << __func__ << " file " << dirname << "/" << filename
4067 << " is locked" << dendl;
4068 return -EBUSY;
4069 }
4070 dir->file_map.erase(string{filename});
4071 log.t.op_dir_unlink(dirname, filename);
4072 _drop_link_D(file);
4073 return 0;
4074 }
4075
4076 bool BlueFS::wal_is_rotational()
4077 {
4078 if (bdev[BDEV_WAL]) {
4079 return bdev[BDEV_WAL]->is_rotational();
4080 } else if (bdev[BDEV_DB]) {
4081 return bdev[BDEV_DB]->is_rotational();
4082 }
4083 return bdev[BDEV_SLOW]->is_rotational();
4084 }
4085
4086 bool BlueFS::db_is_rotational()
4087 {
4088 if (bdev[BDEV_DB]) {
4089 return bdev[BDEV_DB]->is_rotational();
4090 }
4091 return bdev[BDEV_SLOW]->is_rotational();
4092 }
4093
4094 /*
4095 Algorithm.
4096 do_replay_recovery_read is used when bluefs log abruptly ends, but it seems that more data should be there.
4097 Idea is to search disk for definiton of extents that will be accompanied with bluefs log in future,
4098 and try if using it will produce healthy bluefs transaction.
4099 We encode already known bluefs log extents and search disk for these bytes.
4100 When we find it, we decode following bytes as extent.
4101 We read that whole extent and then check if merged with existing log part gives a proper bluefs transaction.
4102 */
4103 int BlueFS::_do_replay_recovery_read(FileReader *log_reader,
4104 size_t replay_pos,
4105 size_t read_offset,
4106 size_t read_len,
4107 bufferlist* bl) {
4108 dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos <<
4109 " needs 0x" << read_offset << "~" << read_len << std::dec << dendl;
4110
4111 bluefs_fnode_t& log_fnode = log_reader->file->fnode;
4112 bufferlist bin_extents;
4113 ::encode(log_fnode.extents, bin_extents);
4114 dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl;
4115
4116 // cannot process if too small to effectively search
4117 ceph_assert(bin_extents.length() >= 32);
4118 bufferlist last_32;
4119 last_32.substr_of(bin_extents, bin_extents.length() - 32, 32);
4120
4121 //read fixed part from replay_pos to end of bluefs_log extents
4122 bufferlist fixed;
4123 uint64_t e_off = 0;
4124 auto e = log_fnode.seek(replay_pos, &e_off);
4125 ceph_assert(e != log_fnode.extents.end());
4126 int r = _bdev_read(e->bdev, e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev],
4127 cct->_conf->bluefs_buffered_io);
4128 ceph_assert(r == 0);
4129 //capture dev of last good extent
4130 uint8_t last_e_dev = e->bdev;
4131 uint64_t last_e_off = e->offset;
4132 ++e;
4133 while (e != log_fnode.extents.end()) {
4134 r = _bdev_read(e->bdev, e->offset, e->length, &fixed, ioc[e->bdev],
4135 cct->_conf->bluefs_buffered_io);
4136 ceph_assert(r == 0);
4137 last_e_dev = e->bdev;
4138 ++e;
4139 }
4140 ceph_assert(replay_pos + fixed.length() == read_offset);
4141
4142 dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl;
4143
4144 struct compare {
4145 bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const {
4146 if (a.bdev < b.bdev) return true;
4147 if (a.offset < b.offset) return true;
4148 return a.length < b.length;
4149 }
4150 };
4151 std::set<bluefs_extent_t, compare> extents_rejected;
4152 for (int dcnt = 0; dcnt < 3; dcnt++) {
4153 uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV;
4154 if (bdev[dev] == nullptr) continue;
4155 dout(2) << __func__ << " processing " << get_device_name(dev) << dendl;
4156 interval_set<uint64_t> disk_regions;
4157 disk_regions.insert(0, bdev[dev]->get_size());
4158 for (auto f : nodes.file_map) {
4159 auto& e = f.second->fnode.extents;
4160 for (auto& p : e) {
4161 if (p.bdev == dev) {
4162 disk_regions.erase(p.offset, p.length);
4163 }
4164 }
4165 }
4166 size_t disk_regions_count = disk_regions.num_intervals();
4167 dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl;
4168
4169 auto reg = disk_regions.lower_bound(last_e_off);
4170 //for all except first, start from beginning
4171 last_e_off = 0;
4172 if (reg == disk_regions.end()) {
4173 reg = disk_regions.begin();
4174 }
4175 const uint64_t chunk_size = 4 * 1024 * 1024;
4176 const uint64_t page_size = 4096;
4177 const uint64_t max_extent_size = 16;
4178 uint64_t overlay_size = last_32.length() + max_extent_size;
4179 for (size_t i = 0; i < disk_regions_count; reg++, i++) {
4180 if (reg == disk_regions.end()) {
4181 reg = disk_regions.begin();
4182 }
4183 uint64_t pos = reg.get_start();
4184 uint64_t len = reg.get_len();
4185
4186 std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]};
4187 char* raw_data = raw_data_p.get();
4188 memset(raw_data, 0, page_size);
4189
4190 while (len > last_32.length()) {
4191 uint64_t chunk_len = len > chunk_size ? chunk_size : len;
4192 dout(5) << __func__ << " read "
4193 << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len
4194 << std::dec << dendl;
4195 r = _bdev_read_random(dev, pos, chunk_len,
4196 raw_data + page_size, cct->_conf->bluefs_buffered_io);
4197 ceph_assert(r == 0);
4198
4199 //search for fixed_last_32
4200 char* chunk_b = raw_data + page_size;
4201 char* chunk_e = chunk_b + chunk_len;
4202
4203 char* search_b = chunk_b - overlay_size;
4204 char* search_e = chunk_e;
4205
4206 for (char* sp = search_b; ; sp += last_32.length()) {
4207 sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length());
4208 if (sp == nullptr) {
4209 break;
4210 }
4211
4212 char* n = sp + last_32.length();
4213 dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl;
4214 bufferlist test;
4215 test.append(n, std::min<size_t>(max_extent_size, chunk_e - n));
4216 bluefs_extent_t ne;
4217 try {
4218 bufferlist::const_iterator p = test.begin();
4219 ::decode(ne, p);
4220 } catch (buffer::error& e) {
4221 continue;
4222 }
4223 if (extents_rejected.count(ne) != 0) {
4224 dout(5) << __func__ << " extent " << ne << " already refected" <<dendl;
4225 continue;
4226 }
4227 //insert as rejected already. if we succeed, it wouldn't make difference.
4228 extents_rejected.insert(ne);
4229
4230 if (ne.bdev >= MAX_BDEV ||
4231 bdev[ne.bdev] == nullptr ||
4232 ne.length > 16 * 1024 * 1024 ||
4233 (ne.length & 4095) != 0 ||
4234 ne.offset + ne.length > bdev[ne.bdev]->get_size() ||
4235 (ne.offset & 4095) != 0) {
4236 dout(5) << __func__ << " refusing extent " << ne << dendl;
4237 continue;
4238 }
4239 dout(5) << __func__ << " checking extent " << ne << dendl;
4240
4241 //read candidate extent - whole
4242 bufferlist candidate;
4243 candidate.append(fixed);
4244 r = _bdev_read(ne.bdev, ne.offset, ne.length, &candidate, ioc[ne.bdev],
4245 cct->_conf->bluefs_buffered_io);
4246 ceph_assert(r == 0);
4247
4248 //check if transaction & crc is ok
4249 bluefs_transaction_t t;
4250 try {
4251 bufferlist::const_iterator p = candidate.begin();
4252 ::decode(t, p);
4253 }
4254 catch (buffer::error& e) {
4255 dout(5) << __func__ << " failed match" << dendl;
4256 continue;
4257 }
4258
4259 //success, it seems a probable candidate
4260 uint64_t l = std::min<uint64_t>(ne.length, read_len);
4261 //trim to required size
4262 bufferlist requested_read;
4263 requested_read.substr_of(candidate, fixed.length(), l);
4264 bl->append(requested_read);
4265 dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl;
4266 log_fnode.append_extent(ne);
4267 log_fnode.recalc_allocated();
4268 log_reader->buf.pos += l;
4269 return l;
4270 }
4271 //save overlay for next search
4272 memcpy(search_b, chunk_e - overlay_size, overlay_size);
4273 pos += chunk_len;
4274 len -= chunk_len;
4275 }
4276 }
4277 }
4278 return 0;
4279 }
4280
4281 void BlueFS::_check_vselector_LNF() {
4282 BlueFSVolumeSelector* vs = vselector->clone_empty();
4283 if (!vs) {
4284 return;
4285 }
4286 std::lock_guard ll(log.lock);
4287 std::lock_guard nl(nodes.lock);
4288 // Checking vselector is under log, nodes and file(s) locks,
4289 // so any modification of vselector must be under at least one of those locks.
4290 for (auto& f : nodes.file_map) {
4291 f.second->lock.lock();
4292 vs->add_usage(f.second->vselector_hint, f.second->fnode);
4293 }
4294 bool res = vselector->compare(vs);
4295 if (!res) {
4296 dout(0) << "Current:";
4297 vselector->dump(*_dout);
4298 *_dout << dendl;
4299 dout(0) << "Expected:";
4300 vs->dump(*_dout);
4301 *_dout << dendl;
4302 }
4303 ceph_assert(res);
4304 for (auto& f : nodes.file_map) {
4305 f.second->lock.unlock();
4306 }
4307 delete vs;
4308 }
4309
4310 size_t BlueFS::probe_alloc_avail(int dev, uint64_t alloc_size)
4311 {
4312 size_t total = 0;
4313 auto iterated_allocation = [&](size_t off, size_t len) {
4314 //only count in size that is alloc_size aligned
4315 size_t dist_to_alignment;
4316 size_t offset_in_block = off & (alloc_size - 1);
4317 if (offset_in_block == 0)
4318 dist_to_alignment = 0;
4319 else
4320 dist_to_alignment = alloc_size - offset_in_block;
4321 if (dist_to_alignment >= len)
4322 return;
4323 len -= dist_to_alignment;
4324 total += p2align(len, alloc_size);
4325 };
4326 if (alloc[dev]) {
4327 alloc[dev]->dump(iterated_allocation);
4328 }
4329 return total;
4330 }
4331 // ===============================================
4332 // OriginalVolumeSelector
4333
4334 void* OriginalVolumeSelector::get_hint_for_log() const {
4335 return reinterpret_cast<void*>(BlueFS::BDEV_WAL);
4336 }
4337 void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
4338 uint8_t res = BlueFS::BDEV_DB;
4339 if (dirname.length() > 5) {
4340 // the "db.slow" and "db.wal" directory names are hard-coded at
4341 // match up with bluestore. the slow device is always the second
4342 // one (when a dedicated block.db device is present and used at
4343 // bdev 0). the wal device is always last.
4344 if (boost::algorithm::ends_with(dirname, ".slow") && slow_total) {
4345 res = BlueFS::BDEV_SLOW;
4346 } else if (boost::algorithm::ends_with(dirname, ".wal") && wal_total) {
4347 res = BlueFS::BDEV_WAL;
4348 }
4349 }
4350 return reinterpret_cast<void*>(res);
4351 }
4352
4353 uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
4354 {
4355 return (uint8_t)(reinterpret_cast<uint64_t>(hint));
4356 }
4357
4358 void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
4359 {
4360 res.emplace_back(base, db_total);
4361 res.emplace_back(base + ".slow",
4362 slow_total ? slow_total : db_total); // use fake non-zero value if needed to
4363 // avoid RocksDB complains
4364 }
4365
4366 #undef dout_prefix
4367 #define dout_prefix *_dout << "OriginalVolumeSelector: "
4368
4369 void OriginalVolumeSelector::dump(ostream& sout) {
4370 sout<< "wal_total:" << wal_total
4371 << ", db_total:" << db_total
4372 << ", slow_total:" << slow_total
4373 << std::endl;
4374 }
4375
4376 // ===============================================
4377 // FitToFastVolumeSelector
4378
4379 void FitToFastVolumeSelector::get_paths(const std::string& base, paths& res) const {
4380 res.emplace_back(base, 1); // size of the last db_path has no effect
4381 }