]> git.proxmox.com Git - ceph.git/blob - ceph/src/os/bluestore/BlueFS.cc
871cd0e55e4ccd2c98c29ac88a9bc2e0a3e56fa1
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "boost/algorithm/string.hpp"
5 #include "bluestore_common.h"
6 #include "BlueFS.h"
7
8 #include "common/debug.h"
9 #include "common/errno.h"
10 #include "common/perf_counters.h"
11 #include "BlockDevice.h"
12 #include "Allocator.h"
13 #include "include/ceph_assert.h"
14 #include "common/admin_socket.h"
15
16 #define dout_context cct
17 #define dout_subsys ceph_subsys_bluefs
18 #undef dout_prefix
19 #define dout_prefix *_dout << "bluefs "
20 using TOPNSPC::common::cmd_getval;
21 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
22 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
23 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer);
24 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
25 bluefs_file_reader_buffer, bluefs_file_reader);
26 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader);
27 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
28
29 static void wal_discard_cb(void *priv, void* priv2) {
30 BlueFS *bluefs = static_cast<BlueFS*>(priv);
31 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
32 bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
33 }
34
35 static void db_discard_cb(void *priv, void* priv2) {
36 BlueFS *bluefs = static_cast<BlueFS*>(priv);
37 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
38 bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
39 }
40
41 static void slow_discard_cb(void *priv, void* priv2) {
42 BlueFS *bluefs = static_cast<BlueFS*>(priv);
43 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
44 bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
45 }
46
47 class BlueFS::SocketHook : public AdminSocketHook {
48 BlueFS* bluefs;
49 public:
50 static BlueFS::SocketHook* create(BlueFS* bluefs)
51 {
52 BlueFS::SocketHook* hook = nullptr;
53 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
54 if (admin_socket) {
55 hook = new BlueFS::SocketHook(bluefs);
56 int r = admin_socket->register_command("bluestore bluefs available "
57 "name=alloc_size,type=CephInt,req=false",
58 hook,
59 "Report available space for bluefs. "
60 "If alloc_size set, make simulation.");
61 if (r != 0) {
62 ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
63 delete hook;
64 hook = nullptr;
65 } else {
66 r = admin_socket->register_command("bluefs stats",
67 hook,
68 "Dump internal statistics for bluefs."
69 "");
70 ceph_assert(r == 0);
71 }
72 }
73 return hook;
74 }
75
76 ~SocketHook() {
77 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
78 admin_socket->unregister_commands(this);
79 }
80 private:
81 SocketHook(BlueFS* bluefs) :
82 bluefs(bluefs) {}
83 int call(std::string_view command, const cmdmap_t& cmdmap,
84 Formatter *f,
85 std::ostream& errss,
86 bufferlist& out) override {
87 if (command == "bluestore bluefs available") {
88 int64_t alloc_size = 0;
89 cmd_getval(cmdmap, "alloc_size", alloc_size);
90 if ((alloc_size & (alloc_size - 1)) != 0) {
91 errss << "Invalid allocation size:'" << alloc_size << std::endl;
92 return -EINVAL;
93 }
94 if (alloc_size == 0)
95 alloc_size = bluefs->cct->_conf->bluefs_alloc_size;
96 f->open_object_section("bluefs_available_space");
97 for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
98 if (bluefs->bdev[dev]) {
99 f->open_object_section("dev");
100 f->dump_string("device", bluefs->get_device_name(dev));
101 ceph_assert(bluefs->alloc[dev]);
102 f->dump_int("free", bluefs->alloc[dev]->get_free());
103 f->close_section();
104 }
105 }
106 size_t extra_space = 0;
107 if (bluefs->slow_dev_expander) {
108 extra_space = bluefs->slow_dev_expander->available_freespace(alloc_size);
109 }
110 f->dump_int("available_from_bluestore", extra_space);
111 f->close_section();
112 } else if (command == "bluefs stats") {
113 std::stringstream ss;
114 bluefs->dump_block_extents(ss);
115 bluefs->dump_volume_selector(ss);
116 out.append(ss);
117 } else {
118 errss << "Invalid command" << std::endl;
119 return -ENOSYS;
120 }
121 return 0;
122 }
123 };
124
// Construct a BlueFS instance: size the per-device vectors, wire up the
// per-device discard callbacks, and (if an admin socket exists) register
// the admin commands.
BlueFS::BlueFS(CephContext* cct)
  : cct(cct),
    bdev(MAX_BDEV),
    ioc(MAX_BDEV),
    block_all(MAX_BDEV)
{
  discard_cb[BDEV_WAL] = wal_discard_cb;
  discard_cb[BDEV_DB] = db_discard_cb;
  discard_cb[BDEV_SLOW] = slow_discard_cb;
  // may be nullptr when no admin socket is available
  asok_hook = SocketHook::create(this);
}
136
137 BlueFS::~BlueFS()
138 {
139 delete asok_hook;
140 for (auto p : ioc) {
141 if (p)
142 p->aio_wait();
143 }
144 for (auto p : bdev) {
145 if (p) {
146 p->close();
147 delete p;
148 }
149 }
150 for (auto p : ioc) {
151 delete p;
152 }
153 }
154
// Build and register the "bluefs" perf counter set.
void BlueFS::_init_logger()
{
  PerfCountersBuilder b(cct, "bluefs",
                        l_bluefs_first, l_bluefs_last);
  // space exchanged with bluestore
  b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes",
                    "Bytes gifted from BlueStore", NULL, 0, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes",
                    "Bytes reclaimed by BlueStore", NULL, 0, unit_t(UNIT_BYTES));
  // per-device capacity/usage gauges
  b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
            "Total bytes (main db device)",
            "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
            "Used bytes (main db device)",
            "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
            "Total bytes (wal device)",
            "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
            "Used bytes (wal device)",
            "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
            "Total bytes (slow device)",
            "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
            "Used bytes (slow device)",
            "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64(l_bluefs_num_files, "num_files", "File count",
            "f", PerfCountersBuilder::PRIO_USEFUL);
  // metadata log statistics
  b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
            "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
                    "Compactions of the metadata log");
  b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
                    "Bytes written to the metadata log", "j",
                    PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
  // write traffic, split by WAL / SST / slow-device spillover
  b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
                    "Files written to WAL");
  b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
                    "Files written to SSTs");
  b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
                    "Bytes written to WAL", "wal",
                    PerfCountersBuilder::PRIO_CRITICAL);
  b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
                    "Bytes written to SSTs", "sst",
                    PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
                    "Bytes written to WAL/SSTs at slow device", NULL,
                    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
                    "Maximum bytes allocated from WAL");
  b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
                    "Maximum bytes allocated from DB");
  b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
                    "Maximum bytes allocated from SLOW");

  // random-read statistics (total, disk-served, prefetch-buffer-served)
  b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
                    "random read requests processed");
  b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
                    "Bytes requested in random read mode", NULL,
                    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
                    "random reads requests going to disk");
  b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
                    "Bytes read from disk in random read mode", NULL,
                    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
  b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
                    "random read requests processed using prefetch buffer");
  b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
                    "Bytes read from prefetch buffer in random read mode", NULL,
                    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

  // buffered-read statistics
  b.add_u64_counter(l_bluefs_read_count, "read_count",
                    "buffered read requests processed");
  b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
                    "Bytes requested in buffered read mode", NULL,
                    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

  // prefetch-read statistics
  b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
                    "prefetch read requests processed");
  b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
                    "Bytes requested in prefetch read mode", NULL,
                    PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));

  logger = b.create_perf_counters();
  cct->get_perfcounters_collection()->add(logger);
}
241
// Unregister and destroy the perf counters created by _init_logger().
void BlueFS::_shutdown_logger()
{
  cct->get_perfcounters_collection()->remove(logger);
  delete logger;
}
247
248 void BlueFS::_update_logger_stats()
249 {
250 // we must be holding the lock
251 logger->set(l_bluefs_num_files, file_map.size());
252 logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
253
254 if (alloc[BDEV_WAL]) {
255 logger->set(l_bluefs_wal_total_bytes, block_all[BDEV_WAL].size());
256 logger->set(l_bluefs_wal_used_bytes,
257 block_all[BDEV_WAL].size() - alloc[BDEV_WAL]->get_free());
258 }
259 if (alloc[BDEV_DB]) {
260 logger->set(l_bluefs_db_total_bytes, block_all[BDEV_DB].size());
261 logger->set(l_bluefs_db_used_bytes,
262 block_all[BDEV_DB].size() - alloc[BDEV_DB]->get_free());
263 }
264 if (alloc[BDEV_SLOW]) {
265 logger->set(l_bluefs_slow_total_bytes, block_all[BDEV_SLOW].size());
266 logger->set(l_bluefs_slow_used_bytes,
267 block_all[BDEV_SLOW].size() - alloc[BDEV_SLOW]->get_free());
268 }
269 }
270
271 int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
272 bool shared_with_bluestore)
273 {
274 dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
275 ceph_assert(id < bdev.size());
276 ceph_assert(bdev[id] == NULL);
277 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
278 discard_cb[id], static_cast<void*>(this));
279 if (shared_with_bluestore) {
280 b->set_no_exclusive_lock();
281 }
282 int r = b->open(path);
283 if (r < 0) {
284 delete b;
285 return r;
286 }
287 if (trim) {
288 b->discard(0, b->get_size());
289 }
290
291 dout(1) << __func__ << " bdev " << id << " path " << path
292 << " size " << byte_u_t(b->get_size()) << dendl;
293 bdev[id] = b;
294 ioc[id] = new IOContext(cct, NULL);
295 return 0;
296 }
297
// Whether device |id| supports an on-disk bdev label. The device must exist.
bool BlueFS::bdev_support_label(unsigned id)
{
  ceph_assert(id < bdev.size());
  ceph_assert(bdev[id]);
  return bdev[id]->supported_bdev_label();
}
304
305 uint64_t BlueFS::get_block_device_size(unsigned id)
306 {
307 if (id < bdev.size() && bdev[id])
308 return bdev[id]->get_size();
309 return 0;
310 }
311
// Grant the extent [offset, offset+length) of bdev |id| to bluefs:
// record it in block_all, hand it to the allocator, and (unless |skip|)
// journal an op_alloc_add. The caller decides when the extent must not be
// re-journaled (skip=true).
void BlueFS::_add_block_extent(unsigned id, uint64_t offset, uint64_t length,
                               bool skip)
{
  dout(1) << __func__ << " bdev " << id
          << " 0x" << std::hex << offset << "~" << length << std::dec
          << " skip " << skip
          << dendl;

  ceph_assert(id < bdev.size());
  ceph_assert(bdev[id]);
  ceph_assert(bdev[id]->get_size() >= offset + length);
  block_all[id].insert(offset, length);

  // the allocator may not exist yet (e.g. before _init_alloc())
  if (id < alloc.size() && alloc[id]) {
    if (!skip)
      log_t.op_alloc_add(id, offset, length);

    alloc[id]->init_add_free(offset, length);
  }

  // logger is also created lazily
  if (logger)
    logger->inc(l_bluefs_gift_bytes, length);
  dout(10) << __func__ << " done" << dendl;
}
336
// Return up to |want| bytes of bdev |id| space back to bluestore, appending
// the surrendered extents to |extents|. Returns 0 on success or a negative
// allocator error.
int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
                           PExtentVector *extents)
{
  std::unique_lock l(lock);
  dout(1) << __func__ << " bdev " << id
          << " want 0x" << std::hex << want << std::dec << dendl;
  ceph_assert(id < alloc.size());
  ceph_assert(alloc[id]);
  int64_t got = 0;

  // First give back the sub-allocation-unit leftovers tracked in
  // block_unused_too_granular; these are not held by the allocator.
  // |granular| remembers them in case we must roll back below.
  interval_set<uint64_t> granular;
  while (want > 0 && !block_unused_too_granular[id].empty()) {
    auto p = block_unused_too_granular[id].begin();
    dout(20) << __func__ << " unused " << (int)id << ":"
             << std::hex << p.get_start() << "~" << p.get_len() << dendl;
    extents->push_back({p.get_start(), p.get_len()});
    granular.insert(p.get_start(), p.get_len());
    if (want >= p.get_len()) {
      want -= p.get_len();
    } else {
      want = 0;
    }
    got += p.get_len();
    block_unused_too_granular[id].erase(p);
  }

  if (want > 0) {
    got += alloc[id]->allocate(want, alloc_size[id], 0, extents);
    ceph_assert(got != 0);
    if (got < 0) {
      derr << __func__ << " failed to allocate space to return to bluestore"
           << dendl;
      alloc[id]->dump();
      // roll back: restore the granular pieces we peeled off above
      block_unused_too_granular[id].insert(granular);
      return got;
    }

    // stop owning the surrendered extents and journal the release
    for (auto& p : *extents) {
      block_all[id].erase(p.offset, p.length);
      log_t.op_alloc_rm(id, p.offset, p.length);
    }

    flush_bdev();
    int r = _flush_and_sync_log(l);
    ceph_assert(r == 0);
  }

  logger->inc(l_bluefs_reclaim_bytes, got);
  // NOTE(review): |want| was decremented above, so this prints the
  // remaining unsatisfied amount rather than the originally requested one.
  dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want
          << " got " << *extents << dendl;
  return 0;
}
389
// Discard completion: return the discarded ranges of bdev |id| to its
// allocator (called via the per-device discard callbacks).
void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
{
  dout(10) << __func__ << " bdev " << id << dendl;
  ceph_assert(alloc[id]);
  alloc[id]->release(to_release);
}
396
397 uint64_t BlueFS::get_used()
398 {
399 std::lock_guard l(lock);
400 uint64_t used = 0;
401 for (unsigned id = 0; id < MAX_BDEV; ++id) {
402 if (alloc[id]) {
403 used += block_all[id].size() - alloc[id]->get_free();
404 }
405 }
406 return used;
407 }
408
// Total bytes owned by bluefs on device |id|.
uint64_t BlueFS::get_total(unsigned id)
{
  std::lock_guard l(lock);
  ceph_assert(id < block_all.size());
  return block_all[id].size();
}
415
// Free bytes reported by the allocator of device |id|.
// NOTE(review): only the index is asserted — alloc[id] is dereferenced
// unchecked, so callers must only pass ids of present devices.
uint64_t BlueFS::get_free(unsigned id)
{
  std::lock_guard l(lock);
  ceph_assert(id < alloc.size());
  return alloc[id]->get_free();
}
422
// Dump the bluefs perf counters into |f| under "bluefs_perf_counters".
void BlueFS::dump_perf_counters(Formatter *f)
{
  f->open_object_section("bluefs_perf_counters");
  logger->dump_formatted(f,0);
  f->close_section();
}
429
// Print one line per present device: device size, owned extents, and usage;
// for the slow device additionally show how much space bluestore could
// still hand over.
void BlueFS::dump_block_extents(ostream& out)
{
  for (unsigned i = 0; i < MAX_BDEV; ++i) {
    if (!bdev[i]) {
      continue;
    }
    // get_total()/get_free() take the lock internally
    auto owned = get_total(i);
    auto free = get_free(i);

    out << i << " : device size 0x" << std::hex << bdev[i]->get_size()
        << " : own 0x" << block_all[i]
        << " = 0x" << owned
        << " : using 0x" << owned - free
        << std::dec << "(" << byte_u_t(owned - free) << ")";
    if (i == _get_slow_device_id()) {
      ceph_assert(slow_dev_expander);
      ceph_assert(alloc[i]);
      // 'free' is reused here for bluestore's gift-able space
      free = slow_dev_expander->available_freespace(alloc_size[i]);
      out << std::hex
          << " : bluestore has 0x" << free
          << std::dec << "(" << byte_u_t(free) << ") available";
    }
    out << "\n";
  }
}
455
456 void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
457 {
458 std::lock_guard l(lock);
459 usage->resize(bdev.size());
460 for (unsigned id = 0; id < bdev.size(); ++id) {
461 if (!bdev[id]) {
462 (*usage)[id] = make_pair(0, 0);
463 continue;
464 }
465 (*usage)[id].first = alloc[id]->get_free();
466 (*usage)[id].second = block_all[id].size();
467 uint64_t used =
468 (block_all[id].size() - (*usage)[id].first) * 100 / block_all[id].size();
469 dout(10) << __func__ << " bdev " << id
470 << " free " << (*usage)[id].first
471 << " (" << byte_u_t((*usage)[id].first) << ")"
472 << " / " << (*usage)[id].second
473 << " (" << byte_u_t((*usage)[id].second) << ")"
474 << ", used " << used << "%"
475 << dendl;
476 }
477 }
478
479 int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
480 {
481 std::lock_guard l(lock);
482 dout(10) << __func__ << " bdev " << id << dendl;
483 if (id >= block_all.size())
484 return -EINVAL;
485 *extents = block_all[id];
486 return 0;
487 }
488
// Format a fresh bluefs: allocate and journal an empty metadata log, write
// the superblock to the DB device, then tear all state back down (a
// subsequent mount() rebuilds it). Returns 0 on success.
int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
{
  std::unique_lock l(lock);
  dout(1) << __func__
          << " osd_uuid " << osd_uuid
          << dendl;

  // set volume selector if not provided before/outside
  if (vselector == nullptr) {
    vselector.reset(
      new OriginalVolumeSelector(
        get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
        get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
        get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
  }

  _init_alloc();
  _init_logger();

  super.version = 1;
  super.block_size = bdev[BDEV_DB]->get_block_size();
  super.osd_uuid = osd_uuid;
  super.uuid.generate_random();
  dout(1) << __func__ << " uuid " << super.uuid << dendl;

  // init log: ino 1 is reserved for the metadata log
  FileRef log_file = ceph::make_ref<File>();
  log_file->fnode.ino = 1;
  log_file->vselector_hint = vselector->get_hint_for_log();
  int r = _allocate(
    vselector->select_prefer_bdev(log_file->vselector_hint),
    cct->_conf->bluefs_max_log_runway,
    &log_file->fnode);
  vselector->add_usage(log_file->vselector_hint, log_file->fnode);
  ceph_assert(r == 0);
  log_writer = _create_writer(log_file);

  // initial txn: record every extent bluefs owns on each device
  log_t.op_init();
  for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
    interval_set<uint64_t>& p = block_all[bdev];
    if (p.empty())
      continue;
    for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
      dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
               << std::hex << q.get_start() << "~" << q.get_len() << std::dec
               << dendl;
      log_t.op_alloc_add(bdev, q.get_start(), q.get_len());
    }
  }
  _flush_and_sync_log(l);

  // write supers
  super.log_fnode = log_file->fnode;
  super.memorized_layout = layout;
  _write_super(BDEV_DB);
  flush_bdev();

  // clean up: mkfs leaves the instance unmounted
  super = bluefs_super_t();
  _close_writer(log_writer);
  log_writer = NULL;
  block_all.clear();
  vselector.reset(nullptr);
  _stop_alloc();
  _shutdown_logger();

  dout(10) << __func__ << " success" << dendl;
  return 0;
}
559
// Create one allocator per present device, pick each device's allocation
// unit from config, and seed the allocators with the extents bluefs
// already owns (block_all).
void BlueFS::_init_alloc()
{
  dout(20) << __func__ << dendl;
  alloc.resize(MAX_BDEV);
  alloc_size.resize(MAX_BDEV, 0);
  pending_release.resize(MAX_BDEV);
  block_unused_too_granular.resize(MAX_BDEV);

  if (bdev[BDEV_WAL]) {
    alloc_size[BDEV_WAL] = cct->_conf->bluefs_alloc_size;
  }
  // with a slow device present, DB is dedicated (bluefs_alloc_size) and
  // SLOW is the device shared with bluestore (bluefs_shared_alloc_size);
  // otherwise DB itself is the shared device
  if (bdev[BDEV_SLOW]) {
    alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
    alloc_size[BDEV_SLOW] = cct->_conf->bluefs_shared_alloc_size;
  } else {
    alloc_size[BDEV_DB] = cct->_conf->bluefs_shared_alloc_size;
  }
  // new wal and db devices are never shared
  if (bdev[BDEV_NEWWAL]) {
    alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
  }
  if (bdev[BDEV_NEWDB]) {
    alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
  }

  for (unsigned id = 0; id < bdev.size(); ++id) {
    if (!bdev[id]) {
      continue;
    }
    ceph_assert(bdev[id]->get_size());
    // allocator name: "bluefs-wal"/"bluefs-db"/"bluefs-slow", or a
    // per-instance unique suffix for the NEW* migration devices
    std::string name = "bluefs-";
    const char* devnames[] = {"wal","db","slow"};
    if (id <= BDEV_SLOW)
      name += devnames[id];
    else
      name += to_string(uintptr_t(this));
    ceph_assert(alloc_size[id]);
    dout(1) << __func__ << " id " << id
            << " alloc_size 0x" << std::hex << alloc_size[id]
            << " size 0x" << bdev[id]->get_size() << std::dec << dendl;
    alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
                                  bdev[id]->get_size(),
                                  alloc_size[id], name);
    interval_set<uint64_t>& p = block_all[id];
    for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
      alloc[id]->init_add_free(q.get_start(), q.get_len());
    }
  }
}
609
610 void BlueFS::_stop_alloc()
611 {
612 dout(20) << __func__ << dendl;
613 for (auto p : bdev) {
614 if (p)
615 p->discard_drain();
616 }
617
618 for (auto p : alloc) {
619 if (p != nullptr) {
620 p->shutdown();
621 delete p;
622 }
623 }
624 alloc.clear();
625 block_unused_too_granular.clear();
626 }
627
// Bring up a formatted bluefs: read the superblock, replay the metadata
// log to rebuild state, subtract every file's extents from the allocators,
// and reopen the log for appending. Returns 0 or a negative errno.
int BlueFS::mount()
{
  dout(1) << __func__ << dendl;

  int r = _open_super();
  if (r < 0) {
    derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
    goto out;
  }

  // set volume selector if not provided before/outside
  if (vselector == nullptr) {
    vselector.reset(
      new OriginalVolumeSelector(
        get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
        get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
        get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
  }

  block_all.clear();
  block_all.resize(MAX_BDEV);
  _init_alloc();
  _init_logger();

  r = _replay(false, false);
  if (r < 0) {
    derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
    _stop_alloc();
    goto out;
  }

  // init freelist: mark every file's extents as in-use
  for (auto& p : file_map) {
    dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
    for (auto& q : p.second->fnode.extents) {
      alloc[q.bdev]->init_rm_free(q.offset, q.length);
    }
  }

  // set up the log for future writes (ino 1 is the log)
  log_writer = _create_writer(_get_file(1));
  ceph_assert(log_writer->file->fnode.ino == 1);
  log_writer->pos = log_writer->file->fnode.size;
  dout(10) << __func__ << " log write pos set to 0x"
           << std::hex << log_writer->pos << std::dec
           << dendl;

  return 0;

 out:
  super = bluefs_super_t();
  return r;
}
681
682 int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const
683 {
684 if (super.memorized_layout) {
685 if (layout == *super.memorized_layout) {
686 dout(10) << __func__ << " bluefs layout verified positively" << dendl;
687 } else {
688 derr << __func__ << " memorized layout doesn't fit current one" << dendl;
689 return -EIO;
690 }
691 } else {
692 dout(10) << __func__ << " no memorized_layout in bluefs superblock"
693 << dendl;
694 }
695
696 return 0;
697 }
698
// Unmount: flush metadata (optionally skipping the final log compaction),
// then release all in-memory state. The reverse of mount().
void BlueFS::umount(bool avoid_compact)
{
  dout(1) << __func__ << dendl;

  // make sure everything is durable before tearing anything down
  sync_metadata(avoid_compact);

  _close_writer(log_writer);
  log_writer = NULL;

  vselector.reset(nullptr);
  _stop_alloc();
  file_map.clear();
  dir_map.clear();
  super = bluefs_super_t();
  log_t.clear();
  _shutdown_logger();
}
716
717 int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
718 {
719 dout(1) << __func__ << dendl;
720
721 if(id == BDEV_NEWDB) {
722 int new_log_dev_cur = BDEV_WAL;
723 int new_log_dev_next = BDEV_WAL;
724 if (!bdev[BDEV_WAL]) {
725 new_log_dev_cur = BDEV_NEWDB;
726 new_log_dev_next = BDEV_DB;
727 }
728 _rewrite_log_and_layout_sync(false,
729 BDEV_NEWDB,
730 new_log_dev_cur,
731 new_log_dev_next,
732 RENAME_DB2SLOW,
733 layout);
734 //}
735 } else if(id == BDEV_NEWWAL) {
736 _rewrite_log_and_layout_sync(false,
737 BDEV_DB,
738 BDEV_NEWWAL,
739 BDEV_WAL,
740 REMOVE_WAL,
741 layout);
742 } else {
743 assert(false);
744 }
745 return 0;
746 }
747
748 void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
749 {
750 if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
751 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
752 if (bdev[BDEV_WAL])
753 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
754 }
755
756 void BlueFS::get_devices(set<string> *ls)
757 {
758 for (unsigned i = 0; i < MAX_BDEV; ++i) {
759 if (bdev[i]) {
760 bdev[i]->get_devices(ls);
761 }
762 }
763 }
764
// Consistency check entry point; currently a no-op because the full check
// already happens during mount/replay. Always returns 0.
int BlueFS::fsck()
{
  std::lock_guard l(lock);
  dout(1) << __func__ << dendl;
  // hrm, i think we check everything on mount...
  return 0;
}
772
// Serialize the superblock (plus trailing crc32c) into a fixed-size,
// zero-padded block and write it to device |dev|. Always returns 0.
int BlueFS::_write_super(int dev)
{
  // build superblock
  bufferlist bl;
  encode(super, bl);
  // crc covers exactly the encoded superblock bytes
  uint32_t crc = bl.crc32c(-1);
  encode(crc, bl);
  dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
  dout(10) << __func__ << " superblock " << super.version << dendl;
  dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
  ceph_assert_always(bl.length() <= get_super_length());
  // pad to the fixed superblock slot size
  bl.append_zero(get_super_length() - bl.length());

  bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
  dout(20) << __func__ << " v " << super.version
           << " crc 0x" << std::hex << crc
           << " offset 0x" << get_super_offset() << std::dec
           << dendl;
  return 0;
}
793
// Read and validate the superblock from the DB device into |super|.
// Returns 0 on success, a negative errno on read failure, or -EIO when the
// stored crc does not match the recomputed one.
int BlueFS::_open_super()
{
  dout(10) << __func__ << dendl;

  bufferlist bl;
  uint32_t expected_crc, crc;
  int r;

  // always the second block
  r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
                          &bl, ioc[BDEV_DB], false);
  if (r < 0)
    return r;

  auto p = bl.cbegin();
  decode(super, p);
  {
    // recompute the crc over exactly the bytes consumed by decode(super)
    bufferlist t;
    t.substr_of(bl, 0, p.get_off());
    crc = t.crc32c(-1);
  }
  decode(expected_crc, p);
  if (crc != expected_crc) {
    derr << __func__ << " bad crc on superblock, expected 0x"
         << std::hex << expected_crc << " != actual 0x" << crc << std::dec
         << dendl;
    return -EIO;
  }
  dout(10) << __func__ << " superblock " << super.version << dendl;
  dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
  return 0;
}
826
// Replay-time sanity check: every extent of |fnode| must fall within the
// blocks bluefs owns (owned_blocks) and must not already be referenced by
// another file (used_blocks); the extent's blocks are marked used as we go.
// Returns 0, or -EFAULT on the first inconsistency.
int BlueFS::_check_new_allocations(const bluefs_fnode_t& fnode,
  size_t dev_count,
  boost::dynamic_bitset<uint64_t>* owned_blocks,
  boost::dynamic_bitset<uint64_t>* used_blocks)
{
  auto& fnode_extents = fnode.extents;
  for (auto e : fnode_extents) {
    auto id = e.bdev;
    bool fail = false;
    ceph_assert(id < dev_count);
    // pass 1: every covered allocation unit must be owned by bluefs
    apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id],
      [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
        if (!bs.test(pos)) {
          fail = true;
        }
      }
    );
    if (fail) {
      derr << __func__ << " invalid extent " << int(id)
           << ": 0x" << std::hex << e.offset << "~" << e.length
           << std::dec
           << ": wasn't given but allocated for ino " << fnode.ino
           << dendl;
      return -EFAULT;
    }

    // pass 2: no covered unit may already be in use; mark them used here
    apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
      [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
        if (bs.test(pos)) {
          fail = true;
        }
        bs.set(pos);
      }
    );
    if (fail) {
      derr << __func__ << " invalid extent " << int(e.bdev)
           << ": 0x" << std::hex << e.offset << "~" << e.length
           << std::dec << ": duplicate reference, ino " << fnode.ino
           << dendl;
      return -EFAULT;
    }
  }
  return 0;
}
871
// Trim the region [*offset, *offset+*length) to alloc_size[id] alignment.
// The unaligned head/tail pieces are tracked in
// block_unused_too_granular[id]: inserted when |alloc| is true
// (op_alloc_add), erased when false (op_alloc_rm). The aligned remainder is
// returned through the in/out parameters; *length may end up 0 when the
// whole region fits inside a single allocation unit. Always returns 0.
int BlueFS::_adjust_granularity(
  __u8 id, uint64_t *offset, uint64_t *length, bool alloc)
{
  const char *op = alloc ? "op_alloc_add" : "op_alloc_rm";
  auto oldo = *offset;
  auto oldl = *length;
  if (*offset & (alloc_size[id] - 1)) {
    // round the start up to the next allocation unit
    *offset &= ~(alloc_size[id] - 1);
    *offset += alloc_size[id];
    if (*length > *offset - oldo) {
      // track the unaligned head, shrink the region accordingly
      if (alloc) {
        block_unused_too_granular[id].insert(oldo, *offset - oldo);
      } else {
        block_unused_too_granular[id].erase(oldo, *offset - oldo);
      }
      *length -= (*offset - oldo);
    } else {
      // whole region is below one allocation unit — nothing aligned remains
      if (alloc) {
        block_unused_too_granular[id].insert(oldo, *length);
      } else {
        block_unused_too_granular[id].erase(oldo, *length);
      }
      *length = 0;
    }
  }
  if (*length & (alloc_size[id] - 1)) {
    // round the length down, tracking the unaligned tail
    *length &= ~(alloc_size[id] - 1);
    if (alloc) {
      block_unused_too_granular[id].insert(
        *offset + *length,
        oldo + oldl - *offset - *length);
    } else {
      block_unused_too_granular[id].erase(
        *offset + *length,
        oldo + oldl - *offset - *length);
    }
  }
  if (oldo != *offset || oldl != *length) {
    dout(10) << __func__ << " " << op << " "
             << (int)id << ":" << std::hex << oldo << "~" << oldl
             << " -> " << (int)id << ":" << *offset << "~" << *length << dendl;
  }
  return 0;
}
916
917 int BlueFS::_verify_alloc_granularity(
918 __u8 id, uint64_t offset, uint64_t length, const char *op)
919 {
920 if ((offset & (alloc_size[id] - 1)) ||
921 (length & (alloc_size[id] - 1))) {
922 derr << __func__ << " " << op << " of " << (int)id
923 << ":0x" << std::hex << offset << "~" << length << std::dec
924 << " does not align to alloc_size 0x"
925 << std::hex << alloc_size[id] << std::dec << dendl;
926 // be helpful
927 auto need = alloc_size[id];
928 while (need && ((offset & (need - 1)) ||
929 (length & (need - 1)))) {
930 need >>= 1;
931 }
932 if (need) {
933 const char *which;
934 if (id == BDEV_SLOW ||
935 (id == BDEV_DB && !bdev[BDEV_SLOW])) {
936 which = "bluefs_shared_alloc_size";
937 } else {
938 which = "bluefs_alloc_size";
939 }
940 derr << "work-around by setting " << which << " = " << need
941 << " for this OSD" << dendl;
942 }
943 return -EFAULT;
944 }
945 return 0;
946 }
947
948 int BlueFS::_replay(bool noop, bool to_stdout)
949 {
950 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
951 ino_last = 1; // by the log
952 log_seq = 0;
953
954 FileRef log_file;
955 log_file = _get_file(1);
956
957 // sanity check
958 for (auto& a : block_unused_too_granular) {
959 ceph_assert(a.empty());
960 }
961
962 if (!noop) {
963 log_file->fnode = super.log_fnode;
964 log_file->vselector_hint =
965 vselector->get_hint_for_log();
966 } else {
967 // do not use fnode from superblock in 'noop' mode - log_file's one should
968 // be fine and up-to-date
969 ceph_assert(log_file->fnode.ino == 1);
970 ceph_assert(log_file->fnode.extents.size() != 0);
971 }
972 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
973 if (unlikely(to_stdout)) {
974 std::cout << " log_fnode " << super.log_fnode << std::endl;
975 }
976
977 FileReader *log_reader = new FileReader(
978 log_file, cct->_conf->bluefs_max_prefetch,
979 false, // !random
980 true); // ignore eof
981
982 bool seen_recs = false;
983
984 boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
985 boost::dynamic_bitset<uint64_t> owned_blocks[MAX_BDEV];
986
987 if (cct->_conf->bluefs_log_replay_check_allocations) {
988 for (size_t i = 0; i < MAX_BDEV; ++i) {
989 if (alloc_size[i] != 0 && bdev[i] != nullptr) {
990 used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
991 owned_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
992 }
993 }
994 }
995
996 bool first_log_check = true;
997
998 while (true) {
999 ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
1000 uint64_t pos = log_reader->buf.pos;
1001 uint64_t read_pos = pos;
1002 bufferlist bl;
1003 {
1004 int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size,
1005 &bl, NULL);
1006 if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) {
1007 r += do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl);
1008 }
1009 assert(r == (int)super.block_size);
1010 read_pos += r;
1011 }
1012 uint64_t more = 0;
1013 uint64_t seq;
1014 uuid_d uuid;
1015 {
1016 auto p = bl.cbegin();
1017 __u8 a, b;
1018 uint32_t len;
1019 decode(a, p);
1020 decode(b, p);
1021 decode(len, p);
1022 decode(uuid, p);
1023 decode(seq, p);
1024 if (len + 6 > bl.length()) {
1025 more = round_up_to(len + 6 - bl.length(), super.block_size);
1026 }
1027 }
1028 if (uuid != super.uuid) {
1029 if (seen_recs) {
1030 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1031 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1032 << dendl;
1033 } else {
1034 derr << __func__ << " 0x" << std::hex << pos << std::dec
1035 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1036 << ", block dump: \n";
1037 bufferlist t;
1038 t.substr_of(bl, 0, super.block_size);
1039 t.hexdump(*_dout);
1040 *_dout << dendl;
1041 }
1042 break;
1043 }
1044 if (seq != log_seq + 1) {
1045 if (seen_recs) {
1046 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1047 << ": stop: seq " << seq << " != expected " << log_seq + 1
1048 << dendl;;
1049 } else {
1050 derr << __func__ << " 0x" << std::hex << pos << std::dec
1051 << ": stop: seq " << seq << " != expected " << log_seq + 1
1052 << dendl;;
1053 }
1054 break;
1055 }
1056 if (more) {
1057 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
1058 << " more bytes" << dendl;
1059 bufferlist t;
1060 int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL);
1061 if (r < (int)more) {
1062 dout(10) << __func__ << " 0x" << std::hex << pos
1063 << ": stop: len is 0x" << bl.length() + more << std::dec
1064 << ", which is past eof" << dendl;
1065 if (cct->_conf->bluefs_replay_recovery) {
1066 //try to search for more data
1067 r += do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t);
1068 if (r < (int)more) {
1069 //in normal mode we must read r==more, for recovery it is too strict
1070 break;
1071 }
1072 }
1073 }
1074 ceph_assert(r == (int)more);
1075 bl.claim_append(t);
1076 read_pos += r;
1077 }
1078 seen_recs = true;
1079 bluefs_transaction_t t;
1080 try {
1081 auto p = bl.cbegin();
1082 decode(t, p);
1083 }
1084 catch (buffer::error& e) {
1085 derr << __func__ << " 0x" << std::hex << pos << std::dec
1086 << ": stop: failed to decode: " << e.what()
1087 << dendl;
1088 delete log_reader;
1089 return -EIO;
1090 }
1091 ceph_assert(seq == t.seq);
1092 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1093 << ": " << t << dendl;
1094 if (unlikely(to_stdout)) {
1095 std::cout << " 0x" << std::hex << pos << std::dec
1096 << ": " << t << std::endl;
1097 }
1098
1099 auto p = t.op_bl.cbegin();
1100 while (!p.end()) {
1101 __u8 op;
1102 decode(op, p);
1103 switch (op) {
1104
1105 case bluefs_transaction_t::OP_INIT:
1106 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1107 << ": op_init" << dendl;
1108 if (unlikely(to_stdout)) {
1109 std::cout << " 0x" << std::hex << pos << std::dec
1110 << ": op_init" << std::endl;
1111 }
1112
1113 ceph_assert(t.seq == 1);
1114 break;
1115
1116 case bluefs_transaction_t::OP_JUMP:
1117 {
1118 uint64_t next_seq;
1119 uint64_t offset;
1120 decode(next_seq, p);
1121 decode(offset, p);
1122 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1123 << ": op_jump seq " << next_seq
1124 << " offset 0x" << std::hex << offset << std::dec << dendl;
1125 if (unlikely(to_stdout)) {
1126 std::cout << " 0x" << std::hex << pos << std::dec
1127 << ": op_jump seq " << next_seq
1128 << " offset 0x" << std::hex << offset << std::dec
1129 << std::endl;
1130 }
1131
1132 ceph_assert(next_seq >= log_seq);
1133 log_seq = next_seq - 1; // we will increment it below
1134 uint64_t skip = offset - read_pos;
1135 if (skip) {
1136 bufferlist junk;
1137 int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk,
1138 NULL);
1139 if (r != (int)skip) {
1140 dout(10) << __func__ << " 0x" << std::hex << read_pos
1141 << ": stop: failed to skip to " << offset
1142 << std::dec << dendl;
1143 ceph_abort_msg("problem with op_jump");
1144 }
1145 }
1146 }
1147 break;
1148
1149 case bluefs_transaction_t::OP_JUMP_SEQ:
1150 {
1151 uint64_t next_seq;
1152 decode(next_seq, p);
1153 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1154 << ": op_jump_seq " << next_seq << dendl;
1155 if (unlikely(to_stdout)) {
1156 std::cout << " 0x" << std::hex << pos << std::dec
1157 << ": op_jump_seq " << next_seq << std::endl;
1158 }
1159
1160 ceph_assert(next_seq >= log_seq);
1161 log_seq = next_seq - 1; // we will increment it below
1162 }
1163 break;
1164
1165 case bluefs_transaction_t::OP_ALLOC_ADD:
1166 {
1167 __u8 id;
1168 uint64_t offset, length;
1169 decode(id, p);
1170 decode(offset, p);
1171 decode(length, p);
1172 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1173 << ": op_alloc_add " << " " << (int)id
1174 << ":0x" << std::hex << offset << "~" << length << std::dec
1175 << dendl;
1176 if (unlikely(to_stdout)) {
1177 std::cout << " 0x" << std::hex << pos << std::dec
1178 << ": op_alloc_add " << " " << (int)id
1179 << ":0x" << std::hex << offset << "~" << length << std::dec
1180 << std::endl;
1181 }
1182 if (!noop) {
1183 block_all[id].insert(offset, length);
1184 _adjust_granularity(id, &offset, &length, true);
1185 if (length) {
1186 alloc[id]->init_add_free(offset, length);
1187 }
1188
1189 if (cct->_conf->bluefs_log_replay_check_allocations) {
1190 bool fail = false;
1191 apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id],
1192 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1193 if (bs.test(pos)) {
1194 fail = true;
1195 } else {
1196 bs.set(pos);
1197 }
1198 }
1199 );
1200 if (fail) {
1201 derr << __func__ << " invalid extent " << (int)id
1202 << ": 0x" << std::hex << offset << "~" << length
1203 << std::dec << ": already given" << dendl;
1204 return -EFAULT;
1205 }
1206 apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id],
1207 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1208 if (bs.test(pos)) {
1209 fail = true;
1210 }
1211 }
1212 );
1213 if (fail) {
1214 derr << __func__ << " invalid extent " << int(id)
1215 << ": 0x" << std::hex << offset << "~" << length
1216 << std::dec << ": already in use" << dendl;
1217 return -EFAULT;
1218 }
1219 }
1220 }
1221 }
1222 break;
1223
1224 case bluefs_transaction_t::OP_ALLOC_RM:
1225 {
1226 __u8 id;
1227 uint64_t offset, length;
1228 decode(id, p);
1229 decode(offset, p);
1230 decode(length, p);
1231 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1232 << ": op_alloc_rm " << " " << (int)id
1233 << ":0x" << std::hex << offset << "~" << length << std::dec
1234 << dendl;
1235 if (unlikely(to_stdout)) {
1236 std::cout << " 0x" << std::hex << pos << std::dec
1237 << ": op_alloc_rm " << " " << (int)id
1238 << ":0x" << std::hex << offset << "~" << length << std::dec
1239 << std::endl;
1240 }
1241 if (!noop) {
1242 block_all[id].erase(offset, length);
1243 _adjust_granularity(id, &offset, &length, false);
1244 if (length) {
1245 alloc[id]->init_rm_free(offset, length);
1246 }
1247 if (cct->_conf->bluefs_log_replay_check_allocations) {
1248 bool fail = false;
1249 apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id],
1250 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1251 if (!bs.test(pos)) {
1252 fail = true;
1253 } else {
1254 bs.reset(pos);
1255 }
1256 }
1257 );
1258 if (fail) {
1259 derr << __func__ << " invalid extent " << int(id)
1260 << ": 0x" << std::hex << offset << "~" << length
1261 << std::dec << ": wasn't given" << dendl;
1262 return -EFAULT;
1263 }
1264
1265 apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id],
1266 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1267 if (bs.test(pos)) {
1268 fail = true;
1269 }
1270 }
1271 );
1272 if (fail) {
1273 derr << __func__ << " invalid extent " << (int)id
1274 << ": 0x" << std::hex << offset << "~" << length
1275 << std::dec << ": still in use" << dendl;
1276 return -EFAULT;
1277 }
1278 }
1279 }
1280 }
1281 break;
1282
1283 case bluefs_transaction_t::OP_DIR_LINK:
1284 {
1285 string dirname, filename;
1286 uint64_t ino;
1287 decode(dirname, p);
1288 decode(filename, p);
1289 decode(ino, p);
1290 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1291 << ": op_dir_link " << " " << dirname << "/" << filename
1292 << " to " << ino
1293 << dendl;
1294 if (unlikely(to_stdout)) {
1295 std::cout << " 0x" << std::hex << pos << std::dec
1296 << ": op_dir_link " << " " << dirname << "/" << filename
1297 << " to " << ino
1298 << std::endl;
1299 }
1300
1301 if (!noop) {
1302 FileRef file = _get_file(ino);
1303 ceph_assert(file->fnode.ino);
1304 map<string,DirRef>::iterator q = dir_map.find(dirname);
1305 ceph_assert(q != dir_map.end());
1306 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
1307 ceph_assert(r == q->second->file_map.end());
1308
1309 vselector->sub_usage(file->vselector_hint, file->fnode);
1310 file->vselector_hint =
1311 vselector->get_hint_by_dir(dirname);
1312 vselector->add_usage(file->vselector_hint, file->fnode);
1313
1314 q->second->file_map[filename] = file;
1315 ++file->refs;
1316 }
1317 }
1318 break;
1319
1320 case bluefs_transaction_t::OP_DIR_UNLINK:
1321 {
1322 string dirname, filename;
1323 decode(dirname, p);
1324 decode(filename, p);
1325 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1326 << ": op_dir_unlink " << " " << dirname << "/" << filename
1327 << dendl;
1328 if (unlikely(to_stdout)) {
1329 std::cout << " 0x" << std::hex << pos << std::dec
1330 << ": op_dir_unlink " << " " << dirname << "/" << filename
1331 << std::endl;
1332 }
1333
1334 if (!noop) {
1335 map<string,DirRef>::iterator q = dir_map.find(dirname);
1336 ceph_assert(q != dir_map.end());
1337 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
1338 ceph_assert(r != q->second->file_map.end());
1339 ceph_assert(r->second->refs > 0);
1340 --r->second->refs;
1341 q->second->file_map.erase(r);
1342 }
1343 }
1344 break;
1345
1346 case bluefs_transaction_t::OP_DIR_CREATE:
1347 {
1348 string dirname;
1349 decode(dirname, p);
1350 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1351 << ": op_dir_create " << dirname << dendl;
1352 if (unlikely(to_stdout)) {
1353 std::cout << " 0x" << std::hex << pos << std::dec
1354 << ": op_dir_create " << dirname << std::endl;
1355 }
1356
1357 if (!noop) {
1358 map<string,DirRef>::iterator q = dir_map.find(dirname);
1359 ceph_assert(q == dir_map.end());
1360 dir_map[dirname] = ceph::make_ref<Dir>();
1361 }
1362 }
1363 break;
1364
1365 case bluefs_transaction_t::OP_DIR_REMOVE:
1366 {
1367 string dirname;
1368 decode(dirname, p);
1369 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1370 << ": op_dir_remove " << dirname << dendl;
1371 if (unlikely(to_stdout)) {
1372 std::cout << " 0x" << std::hex << pos << std::dec
1373 << ": op_dir_remove " << dirname << std::endl;
1374 }
1375
1376 if (!noop) {
1377 map<string,DirRef>::iterator q = dir_map.find(dirname);
1378 ceph_assert(q != dir_map.end());
1379 ceph_assert(q->second->file_map.empty());
1380 dir_map.erase(q);
1381 }
1382 }
1383 break;
1384
1385 case bluefs_transaction_t::OP_FILE_UPDATE:
1386 {
1387 bluefs_fnode_t fnode;
1388 decode(fnode, p);
1389 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1390 << ": op_file_update " << " " << fnode << " " << dendl;
1391 if (unlikely(to_stdout)) {
1392 std::cout << " 0x" << std::hex << pos << std::dec
1393 << ": op_file_update " << " " << fnode << std::endl;
1394 }
1395 if (!noop) {
1396 FileRef f = _get_file(fnode.ino);
1397 if (cct->_conf->bluefs_log_replay_check_allocations) {
1398 // check initial log layout
1399 if (first_log_check) {
1400 first_log_check = false;
1401 int r = _check_new_allocations(log_file->fnode,
1402 MAX_BDEV, owned_blocks, used_blocks);
1403 if (r < 0) {
1404 return r;
1405 }
1406 }
1407
1408 auto& fnode_extents = f->fnode.extents;
1409 for (auto e : fnode_extents) {
1410 auto id = e.bdev;
1411 if (int r = _verify_alloc_granularity(id, e.offset, e.length,
1412 "OP_FILE_UPDATE"); r < 0) {
1413 return r;
1414 }
1415 apply_for_bitset_range(e.offset, e.length, alloc_size[id],
1416 used_blocks[id],
1417 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1418 ceph_assert(bs.test(pos));
1419 bs.reset(pos);
1420 }
1421 );
1422 }
1423 }
1424
1425 if (fnode.ino != 1) {
1426 vselector->sub_usage(f->vselector_hint, f->fnode);
1427 }
1428 f->fnode = fnode;
1429 if (fnode.ino != 1) {
1430 vselector->add_usage(f->vselector_hint, f->fnode);
1431 }
1432
1433 if (fnode.ino > ino_last) {
1434 ino_last = fnode.ino;
1435 }
1436 if (cct->_conf->bluefs_log_replay_check_allocations) {
1437 int r = _check_new_allocations(f->fnode,
1438 MAX_BDEV, owned_blocks, used_blocks);
1439 if (r < 0) {
1440 return r;
1441 }
1442 }
1443 }
1444 }
1445 break;
1446
1447 case bluefs_transaction_t::OP_FILE_REMOVE:
1448 {
1449 uint64_t ino;
1450 decode(ino, p);
1451 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1452 << ": op_file_remove " << ino << dendl;
1453 if (unlikely(to_stdout)) {
1454 std::cout << " 0x" << std::hex << pos << std::dec
1455 << ": op_file_remove " << ino << std::endl;
1456 }
1457
1458 if (!noop) {
1459 auto p = file_map.find(ino);
1460 ceph_assert(p != file_map.end());
1461 vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
1462 if (cct->_conf->bluefs_log_replay_check_allocations) {
1463 auto& fnode_extents = p->second->fnode.extents;
1464 for (auto e : fnode_extents) {
1465 auto id = e.bdev;
1466 bool fail = false;
1467 apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id],
1468 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1469 if (!bs.test(pos)) {
1470 fail = true;
1471 }
1472 }
1473 );
1474 if (fail) {
1475 derr << __func__ << " invalid extent " << int(id)
1476 << ": 0x" << std::hex << e.offset << "~" << e.length
1477 << std::dec
1478 << ": wasn't given but is allocated for removed ino " << ino
1479 << dendl;
1480 return -EFAULT;
1481 }
1482
1483 apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
1484 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1485 if (!bs.test(pos)) {
1486 fail = true;
1487 }
1488 bs.reset(pos);
1489 }
1490 );
1491 if (fail) {
1492 derr << __func__ << " invalid extent " << int(id)
1493 << ": 0x" << std::hex << e.offset << "~" << e.length
1494 << std::dec
1495 << ": not in use but is allocated for removed ino " << ino
1496 << dendl;
1497 return -EFAULT;
1498 }
1499 }
1500 }
1501 file_map.erase(p);
1502 }
1503 }
1504 break;
1505
1506 default:
1507 derr << __func__ << " 0x" << std::hex << pos << std::dec
1508 << ": stop: unrecognized op " << (int)op << dendl;
1509 delete log_reader;
1510 return -EIO;
1511 }
1512 }
1513 ceph_assert(p.end());
1514
1515 // we successfully replayed the transaction; bump the seq and log size
1516 ++log_seq;
1517 log_file->fnode.size = log_reader->buf.pos;
1518 }
1519 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
1520
1521 if (!noop && first_log_check &&
1522 cct->_conf->bluefs_log_replay_check_allocations) {
1523 int r = _check_new_allocations(log_file->fnode,
1524 MAX_BDEV, owned_blocks, used_blocks);
1525 if (r < 0) {
1526 return r;
1527 }
1528 }
1529
1530 dout(10) << __func__ << " log file size was 0x"
1531 << std::hex << log_file->fnode.size << std::dec << dendl;
1532 if (unlikely(to_stdout)) {
1533 std::cout << " log file size was 0x"
1534 << std::hex << log_file->fnode.size << std::dec << std::endl;
1535 }
1536
1537 delete log_reader;
1538
1539 if (!noop) {
1540 // verify file link counts are all >0
1541 for (auto& p : file_map) {
1542 if (p.second->refs == 0 &&
1543 p.second->fnode.ino > 1) {
1544 derr << __func__ << " file with link count 0: " << p.second->fnode
1545 << dendl;
1546 return -EIO;
1547 }
1548 }
1549 }
1550
1551 for (unsigned id = 0; id < MAX_BDEV; ++id) {
1552 dout(10) << __func__ << " block_unused_too_granular " << id << ": "
1553 << block_unused_too_granular[id] << dendl;
1554 }
1555 dout(10) << __func__ << " done" << dendl;
1556 return 0;
1557 }
1558
1559 int BlueFS::log_dump()
1560 {
1561 // only dump log file's content
1562 int r = _replay(true, true);
1563 if (r < 0) {
1564 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
1565 return r;
1566 }
1567
1568 return 0;
1569 }
1570
1571 int BlueFS::device_migrate_to_existing(
1572 CephContext *cct,
1573 const set<int>& devs_source,
1574 int dev_target,
1575 const bluefs_layout_t& layout)
1576 {
1577 vector<byte> buf;
1578 bool buffered = cct->_conf->bluefs_buffered_io;
1579
1580 dout(10) << __func__ << " devs_source " << devs_source
1581 << " dev_target " << dev_target << dendl;
1582 assert(dev_target < (int)MAX_BDEV);
1583
1584 int flags = 0;
1585 flags |= devs_source.count(BDEV_DB) ?
1586 (REMOVE_DB | RENAME_SLOW2DB) : 0;
1587 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1588 int dev_target_new = dev_target;
1589
1590 // Slow device without separate DB one is addressed via BDEV_DB
1591 // Hence need renaming.
1592 if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
1593 dev_target_new = BDEV_DB;
1594 dout(0) << __func__ << " super to be written to " << dev_target << dendl;
1595 }
1596
1597 for (auto& [ino, file_ref] : file_map) {
1598 //do not copy log
1599 if (file_ref->fnode.ino == 1) {
1600 continue;
1601 }
1602 dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
1603
1604 auto& fnode_extents = file_ref->fnode.extents;
1605
1606 bool rewrite = std::any_of(
1607 fnode_extents.begin(),
1608 fnode_extents.end(),
1609 [=](auto& ext) {
1610 return ext.bdev != dev_target && devs_source.count(ext.bdev);
1611 });
1612 if (rewrite) {
1613 dout(10) << __func__ << " migrating" << dendl;
1614
1615 // read entire file
1616 bufferlist bl;
1617 for (auto old_ext : fnode_extents) {
1618 buf.resize(old_ext.length);
1619 int r = bdev[old_ext.bdev]->read_random(
1620 old_ext.offset,
1621 old_ext.length,
1622 (char*)&buf.at(0),
1623 buffered);
1624 if (r != 0) {
1625 derr << __func__ << " failed to read 0x" << std::hex
1626 << old_ext.offset << "~" << old_ext.length << std::dec
1627 << " from " << (int)dev_target << dendl;
1628 return -EIO;
1629 }
1630 bl.append((char*)&buf[0], old_ext.length);
1631 }
1632
1633 // write entire file
1634 PExtentVector extents;
1635 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1636 if (l < 0) {
1637 derr << __func__ << " unable to allocate len 0x" << std::hex
1638 << bl.length() << std::dec << " from " << (int)dev_target
1639 << ": " << cpp_strerror(l) << dendl;
1640 return -ENOSPC;
1641 }
1642
1643 uint64_t off = 0;
1644 for (auto& i : extents) {
1645 bufferlist cur;
1646 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1647 ceph_assert(cur_len > 0);
1648 cur.substr_of(bl, off, cur_len);
1649 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1650 ceph_assert(r == 0);
1651 off += cur_len;
1652 }
1653
1654 // release old extents
1655 for (auto old_ext : fnode_extents) {
1656 PExtentVector to_release;
1657 to_release.emplace_back(old_ext.offset, old_ext.length);
1658 alloc[old_ext.bdev]->release(to_release);
1659 }
1660
1661 // update fnode
1662 fnode_extents.clear();
1663 for (auto& i : extents) {
1664 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
1665 }
1666 } else {
1667 for (auto& ext : fnode_extents) {
1668 if (dev_target != dev_target_new && ext.bdev == dev_target) {
1669 dout(20) << __func__ << " " << " ... adjusting extent 0x"
1670 << std::hex << ext.offset << std::dec
1671 << " bdev " << dev_target << " -> " << dev_target_new
1672 << dendl;
1673 ext.bdev = dev_target_new;
1674 }
1675 }
1676 }
1677 }
1678 // new logging device in the current naming scheme
1679 int new_log_dev_cur = bdev[BDEV_WAL] ?
1680 BDEV_WAL :
1681 bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
1682
1683 // new logging device in new naming scheme
1684 int new_log_dev_next = new_log_dev_cur;
1685
1686 if (devs_source.count(new_log_dev_cur)) {
1687 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1688 new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
1689 BDEV_DB :
1690 BDEV_WAL;
1691
1692 dout(0) << __func__ << " log moved from " << new_log_dev_cur
1693 << " to " << new_log_dev_next << dendl;
1694
1695 new_log_dev_cur =
1696 (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
1697 BDEV_SLOW :
1698 new_log_dev_next;
1699 }
1700
1701 _rewrite_log_and_layout_sync(
1702 false,
1703 (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
1704 new_log_dev_cur,
1705 new_log_dev_next,
1706 flags,
1707 layout);
1708 return 0;
1709 }
1710
1711 int BlueFS::device_migrate_to_new(
1712 CephContext *cct,
1713 const set<int>& devs_source,
1714 int dev_target,
1715 const bluefs_layout_t& layout)
1716 {
1717 vector<byte> buf;
1718 bool buffered = cct->_conf->bluefs_buffered_io;
1719
1720 dout(10) << __func__ << " devs_source " << devs_source
1721 << " dev_target " << dev_target << dendl;
1722 assert(dev_target == (int)BDEV_NEWDB || (int)BDEV_NEWWAL);
1723
1724 int flags = 0;
1725
1726 flags |= devs_source.count(BDEV_DB) ?
1727 (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
1728 0;
1729 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1730 int dev_target_new = dev_target; //FIXME: remove, makes no sense
1731
1732 for (auto& p : file_map) {
1733 //do not copy log
1734 if (p.second->fnode.ino == 1) {
1735 continue;
1736 }
1737 dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
1738
1739 auto& fnode_extents = p.second->fnode.extents;
1740
1741 bool rewrite = false;
1742 for (auto ext_it = fnode_extents.begin();
1743 ext_it != p.second->fnode.extents.end();
1744 ++ext_it) {
1745 if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
1746 rewrite = true;
1747 break;
1748 }
1749 }
1750 if (rewrite) {
1751 dout(10) << __func__ << " migrating" << dendl;
1752
1753 // read entire file
1754 bufferlist bl;
1755 for (auto old_ext : fnode_extents) {
1756 buf.resize(old_ext.length);
1757 int r = bdev[old_ext.bdev]->read_random(
1758 old_ext.offset,
1759 old_ext.length,
1760 (char*)&buf.at(0),
1761 buffered);
1762 if (r != 0) {
1763 derr << __func__ << " failed to read 0x" << std::hex
1764 << old_ext.offset << "~" << old_ext.length << std::dec
1765 << " from " << (int)dev_target << dendl;
1766 return -EIO;
1767 }
1768 bl.append((char*)&buf[0], old_ext.length);
1769 }
1770
1771 // write entire file
1772 PExtentVector extents;
1773 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1774 if (l < 0) {
1775 derr << __func__ << " unable to allocate len 0x" << std::hex
1776 << bl.length() << std::dec << " from " << (int)dev_target
1777 << ": " << cpp_strerror(l) << dendl;
1778 return -ENOSPC;
1779 }
1780
1781 uint64_t off = 0;
1782 for (auto& i : extents) {
1783 bufferlist cur;
1784 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1785 ceph_assert(cur_len > 0);
1786 cur.substr_of(bl, off, cur_len);
1787 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1788 ceph_assert(r == 0);
1789 off += cur_len;
1790 }
1791
1792 // release old extents
1793 for (auto old_ext : fnode_extents) {
1794 PExtentVector to_release;
1795 to_release.emplace_back(old_ext.offset, old_ext.length);
1796 alloc[old_ext.bdev]->release(to_release);
1797 }
1798
1799 // update fnode
1800 fnode_extents.clear();
1801 for (auto& i : extents) {
1802 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
1803 }
1804 }
1805 }
1806 // new logging device in the current naming scheme
1807 int new_log_dev_cur =
1808 bdev[BDEV_NEWWAL] ?
1809 BDEV_NEWWAL :
1810 bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
1811 BDEV_WAL :
1812 bdev[BDEV_NEWDB] ?
1813 BDEV_NEWDB :
1814 bdev[BDEV_DB] && !(flags & REMOVE_DB)?
1815 BDEV_DB :
1816 BDEV_SLOW;
1817
1818 // new logging device in new naming scheme
1819 int new_log_dev_next =
1820 new_log_dev_cur == BDEV_NEWWAL ?
1821 BDEV_WAL :
1822 new_log_dev_cur == BDEV_NEWDB ?
1823 BDEV_DB :
1824 new_log_dev_cur;
1825
1826 int super_dev =
1827 dev_target == BDEV_NEWDB ?
1828 BDEV_NEWDB :
1829 bdev[BDEV_DB] ?
1830 BDEV_DB :
1831 BDEV_SLOW;
1832
1833 _rewrite_log_and_layout_sync(
1834 false,
1835 super_dev,
1836 new_log_dev_cur,
1837 new_log_dev_next,
1838 flags,
1839 layout);
1840 return 0;
1841 }
1842
1843 BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
1844 {
1845 auto p = file_map.find(ino);
1846 if (p == file_map.end()) {
1847 FileRef f = ceph::make_ref<File>();
1848 file_map[ino] = f;
1849 dout(30) << __func__ << " ino " << ino << " = " << f
1850 << " (new)" << dendl;
1851 return f;
1852 } else {
1853 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
1854 return p->second;
1855 }
1856 }
1857
void BlueFS::_drop_link(FileRef file)
{
  // Decrement the file's link count; when it reaches zero, tear the
  // file down: journal the removal, queue its extents for release, and
  // detach it from dirty-file tracking so later flushes skip it.
  dout(20) << __func__ << " had refs " << file->refs
	   << " on " << file->fnode << dendl;
  ceph_assert(file->refs > 0);
  --file->refs;
  if (file->refs == 0) {
    dout(20) << __func__ << " destroying " << file->fnode << dendl;
    // no reader may still be touching the file being destroyed
    ceph_assert(file->num_reading.load() == 0);
    vselector->sub_usage(file->vselector_hint, file->fnode);
    log_t.op_file_remove(file->fnode.ino);
    // extents are not freed immediately: they go to pending_release and
    // are returned to the allocators only after the log commits
    for (auto& r : file->fnode.extents) {
      pending_release[r.bdev].insert(r.offset, r.length);
    }
    file_map.erase(file->fnode.ino);
    file->deleted = true;

    if (file->dirty_seq) {
      // unhook from the dirty list it was queued under so sync code
      // never sees a deleted file
      ceph_assert(file->dirty_seq > log_seq_stable);
      ceph_assert(dirty_files.count(file->dirty_seq));
      auto it = dirty_files[file->dirty_seq].iterator_to(*file);
      dirty_files[file->dirty_seq].erase(it);
      file->dirty_seq = 0;
    }
  }
}
1884
int64_t BlueFS::_read_random(
  FileReader *h,  ///< [in] read from here
  uint64_t off,   ///< [in] offset
  uint64_t len,   ///< [in] this many bytes
  char *out)      ///< [out] optional: or copy it here
{
  // Random-access read: chunks already in the reader's prefetch buffer
  // are served from memory; everything else goes straight to disk
  // without populating the buffer. Returns the number of bytes read.
  auto* buf = &h->buf;

  int64_t ret = 0;
  dout(10) << __func__ << " h " << h
           << " 0x" << std::hex << off << "~" << len << std::dec
	   << " from " << h->file->fnode << dendl;

  ++h->file->num_reading;

  // clip reads extending past eof (unless this reader ignores eof)
  if (!h->ignore_eof &&
      off + len > h->file->fnode.size) {
    if (off > h->file->fnode.size)
      len = 0;
    else
      len = h->file->fnode.size - off;
    dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
	     << std::hex << len << std::dec << dendl;
  }
  logger->inc(l_bluefs_read_random_count, 1);
  logger->inc(l_bluefs_read_random_bytes, len);

  std::shared_lock s_lock(h->lock);
  buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
  while (len > 0) {
    if (off < buf->bl_off || off >= buf->get_buf_end()) {
      // buffer miss: read directly from disk; the shared buffer is not
      // consulted or updated, so the shared lock can be dropped
      s_lock.unlock();
      uint64_t x_off = 0;
      auto p = h->file->fnode.seek(off, &x_off);
      ceph_assert(p != h->file->fnode.extents.end());
      uint64_t l = std::min(p->length - x_off, len);
      //hard cap to 1GB
      l = std::min(l, uint64_t(1) << 30);
      dout(20) << __func__ << " read random 0x"
	       << std::hex << x_off << "~" << l << std::dec
	       << " of " << *p << dendl;
      int r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
					 cct->_conf->bluefs_buffered_io);
      ceph_assert(r == 0);
      off += l;
      len -= l;
      ret += l;
      out += l;

      logger->inc(l_bluefs_read_random_disk_count, 1);
      logger->inc(l_bluefs_read_random_disk_bytes, l);
      if (len > 0) {
	// more to read; re-take the shared lock before touching buf
	s_lock.lock();
      }
    } else {
      // buffer hit: copy the overlapping chunk out of the prefetch buffer
      auto left = buf->get_buf_remaining(off);
      int64_t r = std::min(len, left);
      logger->inc(l_bluefs_read_random_buffer_count, 1);
      logger->inc(l_bluefs_read_random_buffer_bytes, r);
      dout(20) << __func__ << " left 0x" << std::hex << left
	       << " 0x" << off << "~" << len << std::dec
	       << dendl;

      if (out) {
	// NOTE: h->bl is normally a contiguous buffer so c_str() is free.
	memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
	out += r;
      }

      dout(30) << __func__ << " result chunk (0x"
	       << std::hex << r << std::dec << " bytes):\n";
      bufferlist t;
      t.substr_of(buf->bl, off - buf->bl_off, r);
      t.hexdump(*_dout);
      *_dout << dendl;

      off += r;
      len -= r;
      ret += r;
      buf->pos += r;
    }
  }
  dout(20) << __func__ << " got " << ret << dendl;
  --h->file->num_reading;
  return ret;
}
1971
int64_t BlueFS::_read(
  FileReader *h,         ///< [in] read from here
  FileReaderBuffer *buf, ///< [in] reader state
  uint64_t off,          ///< [in] offset
  size_t len,            ///< [in] this many bytes
  bufferlist *outbl,     ///< [out] optional: reference the result here
  char *out)             ///< [out] optional: or copy it here
{
  // Sequential read through the reader's prefetch buffer. Returns the
  // number of bytes delivered (may be short at eof for readers that do
  // not ignore eof).
  // a read with no output destination is treated as a pure prefetch
  bool prefetch = !outbl && !out;
  dout(10) << __func__ << " h " << h
           << " 0x" << std::hex << off << "~" << len << std::dec
	   << " from " << h->file->fnode
	   << (prefetch ? " prefetch" : "")
	   << dendl;

  ++h->file->num_reading;

  // clip reads extending past eof (unless this reader ignores eof,
  // e.g. the log replay reader)
  if (!h->ignore_eof &&
      off + len > h->file->fnode.size) {
    if (off > h->file->fnode.size)
      len = 0;
    else
      len = h->file->fnode.size - off;
    dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
	     << std::hex << len << std::dec << dendl;
  }
  logger->inc(l_bluefs_read_count, 1);
  logger->inc(l_bluefs_read_bytes, len);
  if (prefetch) {
    logger->inc(l_bluefs_read_prefetch_count, 1);
    logger->inc(l_bluefs_read_prefetch_bytes, len);
  }

  if (outbl)
    outbl->clear();

  int64_t ret = 0;
  std::shared_lock s_lock(h->lock);
  while (len > 0) {
    size_t left;
    if (off < buf->bl_off || off >= buf->get_buf_end()) {
      // buffer miss: upgrade to an exclusive lock to (re)fill the buffer
      s_lock.unlock();
      std::unique_lock u_lock(h->lock);
      buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
      if (off < buf->bl_off || off >= buf->get_buf_end()) {
        // if precondition hasn't changed during locking upgrade.
        buf->bl.clear();
        buf->bl_off = off & super.block_mask();
        uint64_t x_off = 0;
        auto p = h->file->fnode.seek(buf->bl_off, &x_off);
	if (p == h->file->fnode.extents.end()) {
	  dout(5) << __func__ << " reading less then required "
		  << ret << "<" << ret + len << dendl;
	  break;
	}

        // fetch at least the caller's (block-aligned) range, but up to
        // max_prefetch when that is larger
        uint64_t want = round_up_to(len + (off & ~super.block_mask()),
				    super.block_size);
        want = std::max(want, buf->max_prefetch);
        uint64_t l = std::min(p->length - x_off, want);
        //hard cap to 1GB
        l = std::min(l, uint64_t(1) << 30);
        uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
        if (!h->ignore_eof &&
	    buf->bl_off + l > eof_offset) {
	  l = eof_offset - buf->bl_off;
        }
        dout(20) << __func__ << " fetching 0x"
                 << std::hex << x_off << "~" << l << std::dec
                 << " of " << *p << dendl;
        int r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
				    cct->_conf->bluefs_buffered_io);
        ceph_assert(r == 0);
      }
      u_lock.unlock();
      s_lock.lock();
      // we should recheck if buffer is valid after lock downgrade
      continue;
    }
    left = buf->get_buf_remaining(off);
    dout(20) << __func__ << " left 0x" << std::hex << left
             << " len 0x" << len << std::dec << dendl;

    // deliver the chunk that overlaps the buffer
    int64_t r = std::min(len, left);
    if (outbl) {
      bufferlist t;
      t.substr_of(buf->bl, off - buf->bl_off, r);
      outbl->claim_append(t);
    }
    if (out) {
      // NOTE: h->bl is normally a contiguous buffer so c_str() is free.
      memcpy(out, buf->bl.c_str() + off - buf->bl_off, r);
      out += r;
    }

    dout(30) << __func__ << " result chunk (0x"
             << std::hex << r << std::dec << " bytes):\n";
    bufferlist t;
    t.substr_of(buf->bl, off - buf->bl_off, r);
    t.hexdump(*_dout);
    *_dout << dendl;

    off += r;
    len -= r;
    ret += r;
    buf->pos += r;
  }
  dout(20) << __func__ << " got " << ret << dendl;
  ceph_assert(!outbl || (int)outbl->length() == ret);
  --h->file->num_reading;
  return ret;
}
2084
2085 void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
2086 {
2087 dout(10) << __func__ << " file " << f->fnode
2088 << " 0x" << std::hex << offset << "~" << length << std::dec
2089 << dendl;
2090 if (offset & ~super.block_mask()) {
2091 offset &= super.block_mask();
2092 length = round_up_to(length, super.block_size);
2093 }
2094 uint64_t x_off = 0;
2095 auto p = f->fnode.seek(offset, &x_off);
2096 while (length > 0 && p != f->fnode.extents.end()) {
2097 uint64_t x_len = std::min(p->length - x_off, length);
2098 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
2099 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
2100 << std:: dec << " of " << *p << dendl;
2101 offset += x_len;
2102 length -= x_len;
2103 }
2104 }
2105
2106 uint64_t BlueFS::_estimate_log_size()
2107 {
2108 int avg_dir_size = 40; // fixme
2109 int avg_file_size = 12;
2110 uint64_t size = 4096 * 2;
2111 size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
2112 for (auto& p : block_all)
2113 size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
2114 size += dir_map.size() + (1 + avg_dir_size);
2115 size += file_map.size() * (1 + avg_dir_size + avg_file_size);
2116 return round_up_to(size, super.block_size);
2117 }
2118
2119 void BlueFS::compact_log()
2120 {
2121 std::unique_lock<ceph::mutex> l(lock);
2122 if (!cct->_conf->bluefs_replay_recovery_disable_compact) {
2123 if (cct->_conf->bluefs_compact_log_sync) {
2124 _compact_log_sync();
2125 } else {
2126 _compact_log_async(l);
2127 }
2128 }
2129 }
2130
2131 bool BlueFS::_should_compact_log()
2132 {
2133 uint64_t current = log_writer->file->fnode.size;
2134 uint64_t expected = _estimate_log_size();
2135 float ratio = (float)current / (float)expected;
2136 dout(10) << __func__ << " current 0x" << std::hex << current
2137 << " expected " << expected << std::dec
2138 << " ratio " << ratio
2139 << (new_log ? " (async compaction in progress)" : "")
2140 << dendl;
2141 if (new_log ||
2142 current < cct->_conf->bluefs_log_compact_min_size ||
2143 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
2144 return false;
2145 }
2146 return true;
2147 }
2148
// Serialize the entire in-memory filesystem state (device allocations,
// fnodes, directory tree) into transaction 't' — this becomes the new,
// compacted log.  'flags' (REMOVE_*/RENAME_*) remap or drop device ids
// during device migration; 0 dumps the state as-is.
void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t,
					int flags)
{
  t->seq = 1;
  t->uuid = super.uuid;
  dout(20) << __func__ << " op_init" << dendl;

  t->op_init();
  // 1. re-declare every allocated interval on every device, applying
  //    any device renames/removals requested by 'flags'.
  for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
    interval_set<uint64_t>& p = block_all[bdev];
    for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
      auto bdev_new = bdev;
      if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) {
	continue;
      }
      if ((flags & REMOVE_DB) && bdev == BDEV_DB) {
	continue;
      }
      if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
	bdev_new = BDEV_DB;
      }
      if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
	bdev_new = BDEV_SLOW;
      }
      if (bdev == BDEV_NEWDB) {
	// REMOVE_DB xor RENAME_DB
	ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
	ceph_assert(!(flags & RENAME_SLOW2DB));
	bdev_new = BDEV_DB;
      }
      if (bdev == BDEV_NEWWAL) {
	ceph_assert(flags & REMOVE_WAL);
	bdev_new = BDEV_WAL;
      }
      dout(20) << __func__ << " op_alloc_add " << bdev_new << " 0x"
               << std::hex << q.get_start() << "~" << q.get_len() << std::dec
               << dendl;
      t->op_alloc_add(bdev_new, q.get_start(), q.get_len());
    }
  }
  // 2. dump every fnode except the log itself (ino 1), remapping each
  //    extent's device id the same way.  NOTE: this mutates the
  //    in-memory fnode extents (e.bdev), not just the transaction.
  for (auto& [ino, file_ref] : file_map) {
    if (ino == 1)
      continue;
    ceph_assert(ino > 1);

    for(auto& e : file_ref->fnode.extents) {
      auto bdev = e.bdev;
      auto bdev_new = bdev;
      ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
      if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
	bdev_new = BDEV_DB;
      }
      if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
	bdev_new = BDEV_SLOW;
      }
      if (bdev == BDEV_NEWDB) {
	// REMOVE_DB xor RENAME_DB
	ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
	ceph_assert(!(flags & RENAME_SLOW2DB));
	bdev_new = BDEV_DB;
      }
      if (bdev == BDEV_NEWWAL) {
	ceph_assert(flags & REMOVE_WAL);
	bdev_new = BDEV_WAL;
      }
      e.bdev = bdev_new;
    }
    dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
    t->op_file_update(file_ref->fnode);
  }
  // 3. recreate the directory tree and its file links.
  for (auto& [path, dir_ref] : dir_map) {
    dout(20) << __func__ << " op_dir_create " << path << dendl;
    t->op_dir_create(path);
    for (auto& [fname, file_ref] : dir_ref->file_map) {
      dout(20) << __func__ << " op_dir_link " << path << "/" << fname
	       << " to " << file_ref->fnode.ino << dendl;
      t->op_dir_link(path, fname, file_ref->fnode.ino);
    }
  }
}
2229
2230 void BlueFS::_compact_log_sync()
2231 {
2232 dout(10) << __func__ << dendl;
2233 auto prefer_bdev =
2234 vselector->select_prefer_bdev(log_writer->file->vselector_hint);
2235 _rewrite_log_and_layout_sync(true,
2236 BDEV_DB,
2237 prefer_bdev,
2238 prefer_bdev,
2239 0,
2240 super.memorized_layout);
2241 logger->inc(l_bluefs_log_compactions);
2242 }
2243
// Synchronously rewrite the entire bluefs log as one compacted
// transaction and persist a (possibly updated) device layout in the
// superblock.  Used by plain sync compaction and by the device
// add/remove/migrate paths.
//
// @param allocate_with_fallback  use _allocate() (may spill to another
//        device) vs _allocate_without_fallback() strictly on log_dev
// @param super_dev    device the superblock is written to
// @param log_dev      device the new log is allocated on
// @param log_dev_new  device id the log extents are *recorded* as in the
//        superblock (renamed after the write when it differs from log_dev)
// @param flags        REMOVE_*/RENAME_* remapping for the metadata dump
// @param layout       layout to memorize in the superblock
void BlueFS::_rewrite_log_and_layout_sync(bool allocate_with_fallback,
					  int super_dev,
					  int log_dev,
					  int log_dev_new,
					  int flags,
					  std::optional<bluefs_layout_t> layout)
{
  File *log_file = log_writer->file.get();

  // clear out log (be careful who calls us!!!)
  log_t.clear();

  dout(20) << __func__ << " super_dev:" << super_dev
                       << " log_dev:" << log_dev
                       << " log_dev_new:" << log_dev_new
		       << " flags:" << flags
		       << dendl;
  bluefs_transaction_t t;
  _compact_log_dump_metadata(&t, flags);

  dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
  t.op_jump_seq(log_seq);

  bufferlist bl;
  encode(t, bl);
  _pad_bl(bl);

  // reserve room to keep appending to the log after the compacted dump
  uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
  dout(20) << __func__ << " need " << need << dendl;

  // swap out the old extents and allocate fresh space for the new log
  bluefs_fnode_t old_fnode;
  int r;
  log_file->fnode.swap_extents(old_fnode);
  if (allocate_with_fallback) {
    r = _allocate(log_dev, need, &log_file->fnode);
    ceph_assert(r == 0);
  } else {
    PExtentVector extents;
    r = _allocate_without_fallback(log_dev,
			       need,
			       &extents);
    ceph_assert(r == 0);
    for (auto& p : extents) {
      log_file->fnode.append_extent(
	bluefs_extent_t(log_dev, p.offset, p.length));
    }
  }

  _close_writer(log_writer);

  log_file->fnode.size = bl.length();
  vselector->sub_usage(log_file->vselector_hint, old_fnode);
  vselector->add_usage(log_file->vselector_hint, log_file->fnode);

  // write the compacted log into the new extents and make it durable
  log_writer = _create_writer(log_file);
  log_writer->append(bl);
  r = _flush(log_writer, true);
  ceph_assert(r == 0);
#ifdef HAVE_LIBAIO
  if (!cct->_conf->bluefs_sync_write) {
    list<aio_t> completed_ios;
    _claim_completed_aios(log_writer, &completed_ios);
    wait_for_aio(log_writer);
    completed_ios.clear();
  }
#endif
  flush_bdev();

  super.memorized_layout = layout;
  super.log_fnode = log_file->fnode;
  // rename device if needed
  if (log_dev != log_dev_new) {
    dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
    for (auto& p : super.log_fnode.extents) {
      p.bdev = log_dev_new;
    }
  }
  dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;

  // writing the new superblock is the commit point for the new log
  ++super.version;
  _write_super(super_dev);
  flush_bdev();

  // only after the super is durable is it safe to reuse the old space
  dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
  for (auto& r : old_fnode.extents) {
    pending_release[r.bdev].insert(r.offset, r.length);
  }
}
2332
2333 /*
2334 * 1. Allocate a new extent to continue the log, and then log an event
2335 * that jumps the log write position to the new extent. At this point, the
2336 * old extent(s) won't be written to, and reflect everything to compact.
2337 * New events will be written to the new region that we'll keep.
2338 *
2339 * 2. While still holding the lock, encode a bufferlist that dumps all of the
2340 * in-memory fnodes and names. This will become the new beginning of the
2341 * log. The last event will jump to the log continuation extent from #1.
2342 *
 * 3. Queue a write to a new extent for the new beginning of the log.
2344 *
2345 * 4. Drop lock and wait
2346 *
2347 * 5. Retake the lock.
2348 *
2349 * 6. Update the log_fnode to splice in the new beginning.
2350 *
2351 * 7. Write the new superblock.
2352 *
2353 * 8. Release the old log space. Clean up.
2354 */
// Asynchronous log compaction; see the step-by-step description in the
// comment above.  Caller holds the bluefs lock via 'l'; the lock is
// dropped while waiting on I/O.
void BlueFS::_compact_log_async(std::unique_lock<ceph::mutex>& l)
{
  dout(10) << __func__ << dendl;
  File *log_file = log_writer->file.get();
  ceph_assert(!new_log);
  ceph_assert(!new_log_writer);

  // create a new log [writer] so that we know compaction is in progress
  // (see _should_compact_log)
  new_log = ceph::make_ref<File>();
  new_log->fnode.ino = 0;   // so that _flush_range won't try to log the fnode

  // 0. wait for any racing flushes to complete. (We do not want to block
  // in _flush_sync_log with jump_to set or else a racing thread might flush
  // our entries and our jump_to update won't be correct.)
  while (log_flushing) {
    dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
    log_cond.wait(l);
  }

  vselector->sub_usage(log_file->vselector_hint, log_file->fnode);

  // 1. allocate new log space and jump to it.
  old_log_jump_to = log_file->fnode.get_allocated();
  dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
           << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
  int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
		    cct->_conf->bluefs_max_log_runway,
                    &log_file->fnode);
  ceph_assert(r == 0);
  //adjust usage as flush below will need it
  vselector->add_usage(log_file->vselector_hint, log_file->fnode);
  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;

  // update the log file change and log a jump to the offset where we want to
  // write the new entries
  log_t.op_file_update(log_file->fnode);
  log_t.op_jump(log_seq, old_log_jump_to);

  flush_bdev();  // FIXME?

  _flush_and_sync_log(l, 0, old_log_jump_to);

  // 2. prepare compacted log
  bluefs_transaction_t t;
  //avoid record two times in log_t and _compact_log_dump_metadata.
  log_t.clear();
  _compact_log_dump_metadata(&t, 0);

  uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
				     std::max(alloc_size[BDEV_DB],
					      alloc_size[BDEV_SLOW]));

  // conservative estimate for final encoded size
  new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
                                max_alloc_size);
  t.op_jump(log_seq, new_log_jump_to);

  // allocate
  //FIXME: check if we want DB here?
  r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
                &new_log->fnode);
  ceph_assert(r == 0);

  // we might have some more ops in log_t due to _allocate call
  t.claim_ops(log_t);

  bufferlist bl;
  encode(t, bl);
  _pad_bl(bl);

  dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
	   << std::dec << dendl;

  new_log_writer = _create_writer(new_log);
  new_log_writer->append(bl);

  // 3. flush
  r = _flush(new_log_writer, true);
  ceph_assert(r == 0);

  // 4. wait
  _flush_bdev_safely(new_log_writer);

  // 5. update our log fnode
  // discard first old_log_jump_to extents

  dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
	   << " of " << log_file->fnode.extents << dendl;
  uint64_t discarded = 0;
  mempool::bluefs::vector<bluefs_extent_t> old_extents;
  while (discarded < old_log_jump_to) {
    ceph_assert(!log_file->fnode.extents.empty());
    bluefs_extent_t& e = log_file->fnode.extents.front();
    bluefs_extent_t temp = e;
    if (discarded + e.length <= old_log_jump_to) {
      // whole extent is below the jump point; drop it entirely
      dout(10) << __func__ << " remove old log extent " << e << dendl;
      discarded += e.length;
      log_file->fnode.pop_front_extent();
    } else {
      // only the head of this extent is discarded; keep the tail
      dout(10) << __func__ << " remove front of old log extent " << e << dendl;
      uint64_t drop = old_log_jump_to - discarded;
      temp.length = drop;
      e.offset += drop;
      e.length -= drop;
      discarded += drop;
      dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
    }
    old_extents.push_back(temp);
  }
  // splice the surviving runway extents after the compacted dump
  auto from = log_file->fnode.extents.begin();
  auto to = log_file->fnode.extents.end();
  while (from != to) {
    new_log->fnode.append_extent(*from);
    ++from;
  }

  vselector->sub_usage(log_file->vselector_hint, log_file->fnode);

  // clear the extents from old log file, they are added to new log
  log_file->fnode.clear_extents();
  // swap the log files. New log file is the log file now.
  new_log->fnode.swap_extents(log_file->fnode);

  // writer position shifts by the amount of log we compacted away
  log_writer->pos = log_writer->file->fnode.size =
    log_writer->pos - old_log_jump_to + new_log_jump_to;

  vselector->add_usage(log_file->vselector_hint, log_file->fnode);

  // 6. write the super block to reflect the changes
  dout(10) << __func__ << " writing super" << dendl;
  super.log_fnode = log_file->fnode;
  ++super.version;
  _write_super(BDEV_DB);

  // flush devices without holding the lock
  lock.unlock();
  flush_bdev();
  lock.lock();

  // 7. release old space
  dout(10) << __func__ << " release old log extents " << old_extents << dendl;
  for (auto& r : old_extents) {
    pending_release[r.bdev].insert(r.offset, r.length);
  }

  // delete the new log, remove from the dirty files list
  _close_writer(new_log_writer);
  if (new_log->dirty_seq) {
    ceph_assert(dirty_files.count(new_log->dirty_seq));
    auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
    dirty_files[new_log->dirty_seq].erase(it);
  }
  new_log_writer = nullptr;
  new_log = nullptr;
  log_cond.notify_all();

  dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
  logger->inc(l_bluefs_log_compactions);
}
2514
2515 void BlueFS::_pad_bl(bufferlist& bl)
2516 {
2517 uint64_t partial = bl.length() % super.block_size;
2518 if (partial) {
2519 dout(10) << __func__ << " padding with 0x" << std::hex
2520 << super.block_size - partial << " zeros" << std::dec << dendl;
2521 bl.append_zero(super.block_size - partial);
2522 }
2523 }
2524
2525
// Flush pending log_t entries (plus dirty-file fnode updates) to the log
// file and sync it.
// @param l         held bluefs lock; dropped while waiting on device I/O
// @param want_seq  if nonzero, return early once this seq is stable
// @param jump_to   if nonzero, rewind the log writer to this offset after
//                  flushing (used by async compaction); must not race
//                  with other flushes (asserted below)
int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
				uint64_t want_seq,
				uint64_t jump_to)
{
  while (log_flushing) {
    dout(10) << __func__ << " want_seq " << want_seq
	     << " log is currently flushing, waiting" << dendl;
    ceph_assert(!jump_to);
    log_cond.wait(l);
  }
  if (want_seq && want_seq <= log_seq_stable) {
    dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
	     << log_seq_stable << ", done" << dendl;
    ceph_assert(!jump_to);
    return 0;
  }
  if (log_t.empty() && dirty_files.empty()) {
    dout(10) << __func__ << " want_seq " << want_seq
	     << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
    ceph_assert(!jump_to);
    return 0;
  }

  // take ownership of the pending-release set; those extents become
  // reusable only once this log write is durable (see bottom)
  vector<interval_set<uint64_t>> to_release(pending_release.size());
  to_release.swap(pending_release);

  uint64_t seq = log_t.seq = ++log_seq;
  ceph_assert(want_seq == 0 || want_seq <= seq);
  log_t.uuid = super.uuid;

  // log dirty files
  auto lsi = dirty_files.find(seq);
  if (lsi != dirty_files.end()) {
    dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
    for (auto &f : lsi->second) {
      dout(20) << __func__ << " op_file_update " << f.fnode << dendl;
      log_t.op_file_update(f.fnode);
    }
  }

  dout(10) << __func__ << " " << log_t << dendl;
  ceph_assert(!log_t.empty());

  // allocate some more space (before we run out)?
  int64_t runway = log_writer->file->fnode.get_allocated() -
    log_writer->get_effective_write_pos();
  bool just_expanded_log = false;
  if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
    dout(10) << __func__ << " allocating more log runway (0x"
	     << std::hex << runway << std::dec << " remaining)" << dendl;
    // must not grow the log while async compaction is relocating it
    while (new_log_writer) {
      dout(10) << __func__ << " waiting for async compaction" << dendl;
      log_cond.wait(l);
    }
    vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
    int r = _allocate(
      vselector->select_prefer_bdev(log_writer->file->vselector_hint),
      cct->_conf->bluefs_max_log_runway,
      &log_writer->file->fnode);
    ceph_assert(r == 0);
    vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
    log_t.op_file_update(log_writer->file->fnode);
    just_expanded_log = true;
  }

  bufferlist bl;
  bl.reserve(super.block_size);
  encode(log_t, bl);
  // pad to block boundary
  size_t realign = super.block_size - (bl.length() % super.block_size);
  if (realign && realign != super.block_size)
    bl.append_zero(realign);

  logger->inc(l_bluefs_logged_bytes, bl.length());

  if (just_expanded_log) {
    ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss
  }

  log_writer->append(bl);

  log_t.clear();
  log_t.seq = 0;  // just so debug output is less confusing
  log_flushing = true;

  int r = _flush(log_writer, true);
  ceph_assert(r == 0);

  if (jump_to) {
    // async compaction asked us to rewind the write position
    dout(10) << __func__ << " jumping log offset from 0x" << std::hex
	     << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
    log_writer->pos = jump_to;
    vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
    log_writer->file->fnode.size = jump_to;
    vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
  }

  _flush_bdev_safely(log_writer);

  log_flushing = false;
  log_cond.notify_all();

  // clean dirty files
  if (seq > log_seq_stable) {
    log_seq_stable = seq;
    dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;

    // undirty every file whose last update is now durable
    auto p = dirty_files.begin();
    while (p != dirty_files.end()) {
      if (p->first > log_seq_stable) {
        dout(20) << __func__ << " done cleaning up dirty files" << dendl;
        break;
      }

      auto l = p->second.begin();
      while (l != p->second.end()) {
        File *file = &*l;
        ceph_assert(file->dirty_seq > 0);
        ceph_assert(file->dirty_seq <= log_seq_stable);
        dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
        file->dirty_seq = 0;
        p->second.erase(l++);
      }

      ceph_assert(p->second.empty());
      dirty_files.erase(p++);
    }
  } else {
    dout(20) << __func__ << " log_seq_stable " << log_seq_stable
             << " already >= out seq " << seq
             << ", we lost a race against another log flush, done" << dendl;
  }

  // released extents are durable now: discard if enabled (async with
  // sync fallback), then return them to the allocators
  for (unsigned i = 0; i < to_release.size(); ++i) {
    if (!to_release[i].empty()) {
      /* OK, now we have the guarantee alloc[i] won't be null. */
      int r = 0;
      if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
	r = bdev[i]->queue_discard(to_release[i]);
	if (r == 0)
	  continue;
      } else if (cct->_conf->bdev_enable_discard) {
	for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
	  bdev[i]->discard(p.get_start(), p.get_len());
	}
      }
      alloc[i]->release(to_release[i]);
    }
  }

  _update_logger_stats();

  return 0;
}
2680
// Write the byte range [offset, offset+length) of h's buffered data to
// disk, allocating new extents and dirtying the fnode as needed.
// Writes are block-aligned: a partial leading block is merged with the
// tail cached from the previous flush, and a partial trailing block is
// zero-padded (its real bytes saved in h->tail_block for next time).
int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
{
  dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
	   << " 0x" << offset << "~" << length << std::dec
	   << " to " << h->file->fnode << dendl;
  ceph_assert(!h->file->deleted);
  ceph_assert(h->file->num_readers.load() == 0);

  h->buffer_appender.flush();

  bool buffered;
  if (h->file->fnode.ino == 1)
    buffered = false;  // the log file (ino 1) is never buffered
  else
    buffered = cct->_conf->bluefs_buffered_io;

  if (offset + length <= h->pos)
    return 0;  // entirely flushed already
  if (offset < h->pos) {
    // skip the part earlier flushes already wrote
    length -= h->pos - offset;
    offset = h->pos;
    dout(10) << " still need 0x"
             << std::hex << offset << "~" << length << std::dec
             << dendl;
  }
  ceph_assert(offset <= h->file->fnode.size);

  uint64_t allocated = h->file->fnode.get_allocated();
  vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
  // do not bother to dirty the file if we are overwriting
  // previously allocated extents.
  bool must_dirty = false;
  if (allocated < offset + length) {
    // we should never run out of log space here; see the min runway check
    // in _flush_and_sync_log.
    ceph_assert(h->file->fnode.ino != 1);
    int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
		      offset + length - allocated,
		      &h->file->fnode);
    if (r < 0) {
      derr << __func__ << " allocated: 0x" << std::hex << allocated
           << " offset: 0x" << offset << " length: 0x" << length << std::dec
	   << dendl;
      vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
      ceph_abort_msg("bluefs enospc");
      return r;
    }
    must_dirty = true;
  }
  if (h->file->fnode.size < offset + length) {
    h->file->fnode.size = offset + length;
    if (h->file->fnode.ino > 1) {
      // we do not need to dirty the log file (or it's compacting
      // replacement) when the file size changes because replay is
      // smart enough to discover it on its own.
      must_dirty = true;
    }
  }
  if (must_dirty) {
    h->file->fnode.mtime = ceph_clock_now();
    ceph_assert(h->file->fnode.ino >= 1);
    if (h->file->dirty_seq == 0) {
      h->file->dirty_seq = log_seq + 1;
      dirty_files[h->file->dirty_seq].push_back(*h->file);
      dout(20) << __func__ << " dirty_seq = " << log_seq + 1
	       << " (was clean)" << dendl;
    } else {
      if (h->file->dirty_seq != log_seq + 1) {
        // need re-dirty, erase from list first
        ceph_assert(dirty_files.count(h->file->dirty_seq));
        auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
        dirty_files[h->file->dirty_seq].erase(it);
        h->file->dirty_seq = log_seq + 1;
        dirty_files[h->file->dirty_seq].push_back(*h->file);
        // NOTE(review): dirty_seq was reassigned just above, so the
        // "(was ...)" below prints the new value, not the old one.
        dout(20) << __func__ << " dirty_seq = " << log_seq + 1
                 << " (was " << h->file->dirty_seq << ")" << dendl;
      } else {
        dout(20) << __func__ << " dirty_seq = " << log_seq + 1
                 << " (unchanged, do nothing) " << dendl;
      }
    }
  }
  dout(20) << __func__ << " file now " << h->file->fnode << dendl;

  // locate the extent containing our starting offset
  uint64_t x_off = 0;
  auto p = h->file->fnode.seek(offset, &x_off);
  ceph_assert(p != h->file->fnode.extents.end());
  dout(20) << __func__ << " in " << *p << " x_off 0x"
           << std::hex << x_off << std::dec << dendl;

  // merge the cached partial tail block from the previous flush so the
  // device write starts block-aligned
  unsigned partial = x_off & ~super.block_mask();
  bufferlist bl;
  if (partial) {
    dout(20) << __func__ << " using partial tail 0x"
             << std::hex << partial << std::dec << dendl;
    ceph_assert(h->tail_block.length() == partial);
    bl.claim_append_piecewise(h->tail_block);
    x_off -= partial;
    offset -= partial;
    length += partial;
    dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
    for (auto p : h->iocv) {
      if (p) {
	p->aio_wait();
      }
    }
  }
  if (length == partial + h->buffer.length()) {
    /* in case of initial allocation and need to zero, limited flush is unacceptable */
    bl.claim_append_piecewise(h->buffer);
  } else {
    // flush only the first 'length' bytes; keep the rest buffered
    bufferlist t;
    h->buffer.splice(0, length, &t);
    bl.claim_append_piecewise(t);
    t.substr_of(h->buffer, length, h->buffer.length() - length);
    h->buffer.swap(t);
    dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec
	     << " unflushed" << dendl;
  }
  ceph_assert(bl.length() == length);

  h->pos = offset + length;

  // pad a partial final block with zeros, caching its real bytes in
  // tail_block so the next flush can rewrite the full block
  unsigned tail = bl.length() & ~super.block_mask();
  if (tail) {
    dout(20) << __func__ << " caching tail of 0x"
             << std::hex << tail
	     << " and padding block with 0x" << (super.block_size - tail)
	     << std::dec << dendl;
    h->tail_block.substr_of(bl, bl.length() - tail, tail);
    bl.append_zero(super.block_size - tail);
    length += super.block_size - tail;
  } else {
    h->tail_block.clear();
  }
  ceph_assert(bl.length() == length);

  switch (h->writer_type) {
  case WRITER_WAL:
    logger->inc(l_bluefs_bytes_written_wal, length);
    break;
  case WRITER_SST:
    logger->inc(l_bluefs_bytes_written_sst, length);
    break;
  }

  dout(30) << "dump:\n";
  bl.hexdump(*_dout);
  *_dout << dendl;

  // issue one (a)sync write per extent the range covers
  uint64_t bloff = 0;
  uint64_t bytes_written_slow = 0;
  while (length > 0) {
    uint64_t x_len = std::min(p->length - x_off, length);
    bufferlist t;
    t.substr_of(bl, bloff, x_len);
    if (cct->_conf->bluefs_sync_write) {
      bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
    } else {
      bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
    }
    h->dirty_devs[p->bdev] = true;
    if (p->bdev == BDEV_SLOW) {
      bytes_written_slow += t.length();
    }

    bloff += x_len;
    length -= x_len;
    ++p;
    x_off = 0;
  }
  logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
  // submit any queued aios, one batch per device
  for (unsigned i = 0; i < MAX_BDEV; ++i) {
    if (bdev[i]) {
      if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
        bdev[i]->aio_submit(h->iocv[i]);
      }
    }
  }
  vselector->add_usage(h->file->vselector_hint, h->file->fnode);
  dout(20) << __func__ << " h " << h << " pos now 0x"
           << std::hex << h->pos << std::dec << dendl;
  return 0;
}
2865
2866 #ifdef HAVE_LIBAIO
2867 // we need to retire old completed aios so they don't stick around in
2868 // memory indefinitely (along with their bufferlist refs).
2869 void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
2870 {
2871 for (auto p : h->iocv) {
2872 if (p) {
2873 ls->splice(ls->end(), p->running_aios);
2874 }
2875 }
2876 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
2877 }
2878
2879 void BlueFS::wait_for_aio(FileWriter *h)
2880 {
2881 // NOTE: this is safe to call without a lock, as long as our reference is
2882 // stable.
2883 dout(10) << __func__ << " " << h << dendl;
2884 utime_t start = ceph_clock_now();
2885 for (auto p : h->iocv) {
2886 if (p) {
2887 p->aio_wait();
2888 }
2889 }
2890 dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
2891 }
2892 #endif
2893
2894 int BlueFS::_flush(FileWriter *h, bool force, std::unique_lock<ceph::mutex>& l)
2895 {
2896 bool flushed = false;
2897 int r = _flush(h, force, &flushed);
2898 if (r == 0 && flushed) {
2899 _maybe_compact_log(l);
2900 }
2901 return r;
2902 }
2903
2904 int BlueFS::_flush(FileWriter *h, bool force, bool *flushed)
2905 {
2906 h->buffer_appender.flush();
2907 uint64_t length = h->buffer.length();
2908 uint64_t offset = h->pos;
2909 if (flushed) {
2910 *flushed = false;
2911 }
2912 if (!force &&
2913 length < cct->_conf->bluefs_min_flush_size) {
2914 dout(10) << __func__ << " " << h << " ignoring, length " << length
2915 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
2916 << dendl;
2917 return 0;
2918 }
2919 if (length == 0) {
2920 dout(10) << __func__ << " " << h << " no dirty data on "
2921 << h->file->fnode << dendl;
2922 return 0;
2923 }
2924 dout(10) << __func__ << " " << h << " 0x"
2925 << std::hex << offset << "~" << length << std::dec
2926 << " to " << h->file->fnode << dendl;
2927 ceph_assert(h->pos <= h->file->fnode.size);
2928 int r = _flush_range(h, offset, length);
2929 if (flushed) {
2930 *flushed = true;
2931 }
2932 return r;
2933 }
2934
// Truncate h's file down to 'offset' bytes.  Only shrinking is
// supported; remaining buffered data is flushed first and the fnode
// update is recorded in the log.
int BlueFS::_truncate(FileWriter *h, uint64_t offset)
{
  dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
           << " file " << h->file->fnode << dendl;
  if (h->file->deleted) {
    dout(10) << __func__ << " deleted, no-op" << dendl;
    return 0;
  }

  // we never truncate internal log files
  ceph_assert(h->file->fnode.ino > 1);

  h->buffer_appender.flush();

  // truncate off unflushed data?
  if (h->pos < offset &&
      h->pos + h->buffer.length() > offset) {
    bufferlist t;
    dout(20) << __func__ << " tossing out last " << offset - h->pos
	     << " unflushed bytes" << dendl;
    t.substr_of(h->buffer, 0, offset - h->pos);
    h->buffer.swap(t);
    // deliberately treated as unreachable in practice
    ceph_abort_msg("actually this shouldn't happen");
  }
  if (h->buffer.length()) {
    int r = _flush(h, true);
    if (r < 0)
      return r;
  }
  if (offset == h->file->fnode.size) {
    return 0;  // no-op!
  }
  if (offset > h->file->fnode.size) {
    ceph_abort_msg("truncate up not supported");
  }
  ceph_assert(h->file->fnode.size >= offset);
  // shrink the fnode and log the change
  vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
  h->file->fnode.size = offset;
  vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
  log_t.op_file_update(h->file->fnode);
  return 0;
}
2977
2978 int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l)
2979 {
2980 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
2981 int r = _flush(h, true);
2982 if (r < 0)
2983 return r;
2984 uint64_t old_dirty_seq = h->file->dirty_seq;
2985
2986 _flush_bdev_safely(h);
2987
2988 if (old_dirty_seq) {
2989 uint64_t s = log_seq;
2990 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
2991 << ") on " << h->file->fnode << ", flushing log" << dendl;
2992 _flush_and_sync_log(l, old_dirty_seq);
2993 ceph_assert(h->file->dirty_seq == 0 || // cleaned
2994 h->file->dirty_seq > s); // or redirtied by someone else
2995 }
2996 return 0;
2997 }
2998
// Flush the devices h has dirtied, dropping the bluefs lock while the
// (potentially slow) aio waits and device flushes are in flight.
void BlueFS::_flush_bdev_safely(FileWriter *h)
{
  // snapshot and reset the dirty-device set before releasing the lock
  std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
  h->dirty_devs.fill(false);
#ifdef HAVE_LIBAIO
  if (!cct->_conf->bluefs_sync_write) {
    // claim completed aios first so their buffers are released here
    list<aio_t> completed_ios;
    _claim_completed_aios(h, &completed_ios);
    lock.unlock();
    wait_for_aio(h);
    completed_ios.clear();
    flush_bdev(flush_devs);
    lock.lock();
  } else
#endif
  {
    lock.unlock();
    flush_bdev(flush_devs);
    lock.lock();
  }
}
3020
3021 void BlueFS::flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
3022 {
3023 // NOTE: this is safe to call without a lock.
3024 dout(20) << __func__ << dendl;
3025 for (unsigned i = 0; i < MAX_BDEV; i++) {
3026 if (dirty_bdevs[i])
3027 bdev[i]->flush();
3028 }
3029 }
3030
3031 void BlueFS::flush_bdev()
3032 {
3033 // NOTE: this is safe to call without a lock.
3034 dout(20) << __func__ << dendl;
3035 for (auto p : bdev) {
3036 if (p)
3037 p->flush();
3038 }
3039 }
3040
3041 const char* BlueFS::get_device_name(unsigned id)
3042 {
3043 if (id >= MAX_BDEV) return "BDEV_INV";
3044 const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3045 return names[id];
3046 }
3047
3048 int BlueFS::_expand_slow_device(uint64_t need, PExtentVector& extents)
3049 {
3050 int r = -ENOSPC;
3051 if (slow_dev_expander) {
3052 auto id = _get_slow_device_id();
3053 auto min_alloc_size = alloc_size[id];
3054 ceph_assert(id <= alloc.size() && alloc[id]);
3055 auto min_need = round_up_to(need, min_alloc_size);
3056 need = std::max(need,
3057 slow_dev_expander->get_recommended_expansion_delta(
3058 alloc[id]->get_free(), block_all[id].size()));
3059
3060 need = round_up_to(need, min_alloc_size);
3061 dout(10) << __func__ << " expanding slow device by 0x"
3062 << std::hex << need << std::dec
3063 << dendl;
3064 r = slow_dev_expander->allocate_freespace(min_need, need, extents);
3065 }
3066 return r;
3067 }
3068
3069 int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
3070 PExtentVector* extents)
3071 {
3072 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3073 << " from " << (int)id << dendl;
3074 assert(id < alloc.size());
3075 if (!alloc[id]) {
3076 return -ENOENT;
3077 }
3078 extents->reserve(4); // 4 should be (more than) enough for most allocations
3079 uint64_t min_alloc_size = alloc_size[id];
3080 uint64_t left = round_up_to(len, min_alloc_size);
3081 int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, 0, extents);
3082 if (alloc_len < 0 || alloc_len < (int64_t)left) {
3083 if (alloc_len > 0) {
3084 alloc[id]->release(*extents);
3085 }
3086 if (bdev[id])
3087 derr << __func__ << " failed to allocate 0x" << std::hex << left
3088 << " on bdev " << (int)id
3089 << ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
3090 else
3091 derr << __func__ << " failed to allocate 0x" << std::hex << left
3092 << " on bdev " << (int)id << ", dne" << std::dec << dendl;
3093 if (alloc[id])
3094 alloc[id]->dump();
3095 return -ENOSPC;
3096 }
3097
3098 return 0;
3099 }
3100
// Allocate `len` bytes (rounded up to the device's alloc_size) from device
// `id` and append the resulting extents to `node`.  On failure the fallback
// chain is: next bdev id (id + 1), and — once on BDEV_SLOW — the slow-device
// expander.  Returns 0 on success, -ENOSPC when every fallback is exhausted.
int BlueFS::_allocate(uint8_t id, uint64_t len,
		      bluefs_fnode_t* node)
{
  dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
           << " from " << (int)id << dendl;
  ceph_assert(id < alloc.size());
  int64_t alloc_len = 0;
  PExtentVector extents;
  uint64_t hint = 0;
  if (alloc[id]) {
    // hint the allocator to continue right after the file's last extent on
    // this device, encouraging contiguous on-disk layout
    if (!node->extents.empty() && node->extents.back().bdev == id) {
      hint = node->extents.back().end();
    }
    extents.reserve(4); // 4 should be (more than) enough for most allocations
    alloc_len = alloc[id]->allocate(round_up_to(len, alloc_size[id]),
				    alloc_size[id], hint, &extents);
  }
  if (!alloc[id] ||
      alloc_len < 0 ||
      alloc_len < (int64_t)round_up_to(len, alloc_size[id])) {
    // allocation failed or came back short; release any partial result
    if (alloc_len > 0) {
      alloc[id]->release(extents);
    }
    if (id != BDEV_SLOW) {
      // fall through to the next (slower) device
      if (bdev[id]) {
	dout(1) << __func__ << " failed to allocate 0x" << std::hex << len
		<< " on bdev " << (int)id
		<< ", free 0x" << alloc[id]->get_free()
		<< "; fallback to bdev " << (int)id + 1
		<< std::dec << dendl;
      }
      return _allocate(id + 1, len, node);
    }
    // already on the slow device: try to grow it via the expander
    dout(1) << __func__ << " unable to allocate 0x" << std::hex << len
	    << " on bdev " << (int)id << ", free 0x"
	    << (alloc[id] ? alloc[id]->get_free() : (uint64_t)-1)
	    << "; fallback to slow device expander "
	    << std::dec << dendl;
    extents.clear();
    if (_expand_slow_device(len, extents) == 0) {
      id = _get_slow_device_id();
      // hand the newly gifted space to our allocator, then retry
      for (auto& e : extents) {
	_add_block_extent(id, e.offset, e.length);
      }
      extents.clear();
      auto* last_alloc = alloc[id];
      ceph_assert(last_alloc);
      // try again
      alloc_len = last_alloc->allocate(round_up_to(len, alloc_size[id]),
				       alloc_size[id], hint, &extents);
      if (alloc_len < 0 || alloc_len < (int64_t)len) {
	if (alloc_len > 0) {
	  last_alloc->release(extents);
	}
	derr << __func__ << " failed to allocate 0x" << std::hex << len
	     << " on bdev " << (int)id
	     << ", free 0x" << last_alloc->get_free() << std::dec << dendl;
        return -ENOSPC;
      }
    } else {
      derr << __func__ << " failed to expand slow device to fit +0x"
	   << std::hex << len << std::dec
	   << dendl;
      return -ENOSPC;
    }
  } else {
    // success on the first try: refresh the per-device allocation
    // high-water-mark perf counter
    uint64_t total_allocated =
      block_all[id].size() - alloc[id]->get_free();
    if (max_bytes[id] < total_allocated) {
      logger->set(max_bytes_pcounters[id], total_allocated);
      max_bytes[id] = total_allocated;
    }
  }

  // record the new extents in the fnode
  for (auto& p : extents) {
    node->append_extent(bluefs_extent_t(id, p.offset, p.length));
  }

  return 0;
}
3181
3182 int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
3183 {
3184 dout(10) << __func__ << " file " << f->fnode << " 0x"
3185 << std::hex << off << "~" << len << std::dec << dendl;
3186 if (f->deleted) {
3187 dout(10) << __func__ << " deleted, no-op" << dendl;
3188 return 0;
3189 }
3190 ceph_assert(f->fnode.ino > 1);
3191 uint64_t allocated = f->fnode.get_allocated();
3192 if (off + len > allocated) {
3193 uint64_t want = off + len - allocated;
3194 vselector->sub_usage(f->vselector_hint, f->fnode);
3195
3196 int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
3197 want,
3198 &f->fnode);
3199 vselector->add_usage(f->vselector_hint, f->fnode);
3200 if (r < 0)
3201 return r;
3202 log_t.op_file_update(f->fnode);
3203 }
3204 return 0;
3205 }
3206
3207 void BlueFS::sync_metadata(bool avoid_compact)
3208 {
3209 std::unique_lock<ceph::mutex> l(lock);
3210 if (log_t.empty() && dirty_files.empty()) {
3211 dout(10) << __func__ << " - no pending log events" << dendl;
3212 } else {
3213 dout(10) << __func__ << dendl;
3214 utime_t start = ceph_clock_now();
3215 flush_bdev(); // FIXME?
3216 _flush_and_sync_log(l);
3217 dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
3218 }
3219
3220 if (!avoid_compact) {
3221 _maybe_compact_log(l);
3222 }
3223 }
3224
3225 void BlueFS::_maybe_compact_log(std::unique_lock<ceph::mutex>& l)
3226 {
3227 if (!cct->_conf->bluefs_replay_recovery_disable_compact &&
3228 _should_compact_log()) {
3229 if (cct->_conf->bluefs_compact_log_sync) {
3230 _compact_log_sync();
3231 } else {
3232 _compact_log_async(l);
3233 }
3234 }
3235 }
3236
// Open (creating or truncating as needed) dirname/filename for writing and
// return a FileWriter in *h.
// overwrite=true requires the file to already exist and leaves its extents
// in place; overwrite=false creates the file, or truncates an existing one.
// Returns 0, or -ENOENT when the directory (or, with overwrite, the file)
// does not exist.
int BlueFS::open_for_write(
  const string& dirname,
  const string& filename,
  FileWriter **h,
  bool overwrite)
{
  std::lock_guard l(lock);
  dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
  map<string,DirRef>::iterator p = dir_map.find(dirname);
  DirRef dir;
  if (p == dir_map.end()) {
    // the directory must already exist; we do not create it implicitly
    dout(20) << __func__ << " dir " << dirname
	     << " does not exist" << dendl;
    return -ENOENT;
  } else {
    dir = p->second;
  }

  FileRef file;
  bool create = false;
  bool truncate = false;
  map<string,FileRef>::iterator q = dir->file_map.find(filename);
  if (q == dir->file_map.end()) {
    if (overwrite) {
      // overwrite-in-place requires an existing file
      dout(20) << __func__ << " dir " << dirname << " (" << dir
	       << ") file " << filename
	       << " does not exist" << dendl;
      return -ENOENT;
    }
    // create a brand-new file with a fresh inode number
    file = ceph::make_ref<File>();
    file->fnode.ino = ++ino_last;
    file_map[ino_last] = file;
    dir->file_map[filename] = file;
    ++file->refs;
    create = true;
  } else {
    // overwrite existing file?
    file = q->second;
    if (overwrite) {
      dout(20) << __func__ << " dir " << dirname << " (" << dir
	       << ") file " << filename
	       << " already exists, overwrite in place" << dendl;
    } else {
      // truncate: drop the size and release all extents back to the
      // allocator (deferred via pending_release)
      dout(20) << __func__ << " dir " << dirname << " (" << dir
	       << ") file " << filename
	       << " already exists, truncate + overwrite" << dendl;
      vselector->sub_usage(file->vselector_hint, file->fnode);
      file->fnode.size = 0;
      for (auto& p : file->fnode.extents) {
	pending_release[p.bdev].insert(p.offset, p.length);
      }
      truncate = true;

      file->fnode.clear_extents();
    }
  }
  ceph_assert(file->fnode.ino > 1);

  file->fnode.mtime = ceph_clock_now();
  file->vselector_hint = vselector->get_hint_by_dir(dirname);
  if (create || truncate) {
    vselector->add_usage(file->vselector_hint, file->fnode); // update file count
  }

  dout(20) << __func__ << " mapping " << dirname << "/" << filename
	   << " vsel_hint " << file->vselector_hint
	   << dendl;

  // journal the (possibly new) fnode, and the dir entry for a new file
  log_t.op_file_update(file->fnode);
  if (create)
    log_t.op_dir_link(dirname, filename, file->fnode.ino);

  *h = _create_writer(file);

  // classify the writer (and bump perf counters) by filename suffix
  if (boost::algorithm::ends_with(filename, ".log")) {
    (*h)->writer_type = BlueFS::WRITER_WAL;
    if (logger && !overwrite) {
      logger->inc(l_bluefs_files_written_wal);
    }
  } else if (boost::algorithm::ends_with(filename, ".sst")) {
    (*h)->writer_type = BlueFS::WRITER_SST;
    if (logger) {
      logger->inc(l_bluefs_files_written_sst);
    }
  }

  dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
  return 0;
}
3327
3328 BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
3329 {
3330 FileWriter *w = new FileWriter(f);
3331 for (unsigned i = 0; i < MAX_BDEV; ++i) {
3332 if (bdev[i]) {
3333 w->iocv[i] = new IOContext(cct, NULL);
3334 }
3335 }
3336 return w;
3337 }
3338
3339 void BlueFS::_close_writer(FileWriter *h)
3340 {
3341 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
3342 h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
3343 for (unsigned i=0; i<MAX_BDEV; ++i) {
3344 if (bdev[i]) {
3345 if (h->iocv[i]) {
3346 h->iocv[i]->aio_wait();
3347 bdev[i]->queue_reap_ioc(h->iocv[i]);
3348 }
3349 }
3350 }
3351 delete h;
3352 }
3353
3354 int BlueFS::open_for_read(
3355 const string& dirname,
3356 const string& filename,
3357 FileReader **h,
3358 bool random)
3359 {
3360 std::lock_guard l(lock);
3361 dout(10) << __func__ << " " << dirname << "/" << filename
3362 << (random ? " (random)":" (sequential)") << dendl;
3363 map<string,DirRef>::iterator p = dir_map.find(dirname);
3364 if (p == dir_map.end()) {
3365 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3366 return -ENOENT;
3367 }
3368 DirRef dir = p->second;
3369
3370 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3371 if (q == dir->file_map.end()) {
3372 dout(20) << __func__ << " dir " << dirname << " (" << dir
3373 << ") file " << filename
3374 << " not found" << dendl;
3375 return -ENOENT;
3376 }
3377 File *file = q->second.get();
3378
3379 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
3380 random, false);
3381 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3382 return 0;
3383 }
3384
3385 int BlueFS::rename(
3386 const string& old_dirname, const string& old_filename,
3387 const string& new_dirname, const string& new_filename)
3388 {
3389 std::lock_guard l(lock);
3390 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
3391 << " -> " << new_dirname << "/" << new_filename << dendl;
3392 map<string,DirRef>::iterator p = dir_map.find(old_dirname);
3393 if (p == dir_map.end()) {
3394 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
3395 return -ENOENT;
3396 }
3397 DirRef old_dir = p->second;
3398 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
3399 if (q == old_dir->file_map.end()) {
3400 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
3401 << ") file " << old_filename
3402 << " not found" << dendl;
3403 return -ENOENT;
3404 }
3405 FileRef file = q->second;
3406
3407 p = dir_map.find(new_dirname);
3408 if (p == dir_map.end()) {
3409 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
3410 return -ENOENT;
3411 }
3412 DirRef new_dir = p->second;
3413 q = new_dir->file_map.find(new_filename);
3414 if (q != new_dir->file_map.end()) {
3415 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
3416 << ") file " << new_filename
3417 << " already exists, unlinking" << dendl;
3418 ceph_assert(q->second != file);
3419 log_t.op_dir_unlink(new_dirname, new_filename);
3420 _drop_link(q->second);
3421 }
3422
3423 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
3424 << " " << file->fnode << dendl;
3425
3426 new_dir->file_map[new_filename] = file;
3427 old_dir->file_map.erase(old_filename);
3428
3429 log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
3430 log_t.op_dir_unlink(old_dirname, old_filename);
3431 return 0;
3432 }
3433
3434 int BlueFS::mkdir(const string& dirname)
3435 {
3436 std::lock_guard l(lock);
3437 dout(10) << __func__ << " " << dirname << dendl;
3438 map<string,DirRef>::iterator p = dir_map.find(dirname);
3439 if (p != dir_map.end()) {
3440 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
3441 return -EEXIST;
3442 }
3443 dir_map[dirname] = ceph::make_ref<Dir>();
3444 log_t.op_dir_create(dirname);
3445 return 0;
3446 }
3447
3448 int BlueFS::rmdir(const string& dirname)
3449 {
3450 std::lock_guard l(lock);
3451 dout(10) << __func__ << " " << dirname << dendl;
3452 map<string,DirRef>::iterator p = dir_map.find(dirname);
3453 if (p == dir_map.end()) {
3454 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
3455 return -ENOENT;
3456 }
3457 DirRef dir = p->second;
3458 if (!dir->file_map.empty()) {
3459 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
3460 return -ENOTEMPTY;
3461 }
3462 dir_map.erase(dirname);
3463 log_t.op_dir_remove(dirname);
3464 return 0;
3465 }
3466
3467 bool BlueFS::dir_exists(const string& dirname)
3468 {
3469 std::lock_guard l(lock);
3470 map<string,DirRef>::iterator p = dir_map.find(dirname);
3471 bool exists = p != dir_map.end();
3472 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
3473 return exists;
3474 }
3475
3476 int BlueFS::stat(const string& dirname, const string& filename,
3477 uint64_t *size, utime_t *mtime)
3478 {
3479 std::lock_guard l(lock);
3480 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3481 map<string,DirRef>::iterator p = dir_map.find(dirname);
3482 if (p == dir_map.end()) {
3483 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3484 return -ENOENT;
3485 }
3486 DirRef dir = p->second;
3487 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3488 if (q == dir->file_map.end()) {
3489 dout(20) << __func__ << " dir " << dirname << " (" << dir
3490 << ") file " << filename
3491 << " not found" << dendl;
3492 return -ENOENT;
3493 }
3494 File *file = q->second.get();
3495 dout(10) << __func__ << " " << dirname << "/" << filename
3496 << " " << file->fnode << dendl;
3497 if (size)
3498 *size = file->fnode.size;
3499 if (mtime)
3500 *mtime = file->fnode.mtime;
3501 return 0;
3502 }
3503
3504 int BlueFS::lock_file(const string& dirname, const string& filename,
3505 FileLock **plock)
3506 {
3507 std::lock_guard l(lock);
3508 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3509 map<string,DirRef>::iterator p = dir_map.find(dirname);
3510 if (p == dir_map.end()) {
3511 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3512 return -ENOENT;
3513 }
3514 DirRef dir = p->second;
3515 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3516 FileRef file;
3517 if (q == dir->file_map.end()) {
3518 dout(20) << __func__ << " dir " << dirname << " (" << dir
3519 << ") file " << filename
3520 << " not found, creating" << dendl;
3521 file = ceph::make_ref<File>();
3522 file->fnode.ino = ++ino_last;
3523 file->fnode.mtime = ceph_clock_now();
3524 file_map[ino_last] = file;
3525 dir->file_map[filename] = file;
3526 ++file->refs;
3527 log_t.op_file_update(file->fnode);
3528 log_t.op_dir_link(dirname, filename, file->fnode.ino);
3529 } else {
3530 file = q->second;
3531 if (file->locked) {
3532 dout(10) << __func__ << " already locked" << dendl;
3533 return -ENOLCK;
3534 }
3535 }
3536 file->locked = true;
3537 *plock = new FileLock(file);
3538 dout(10) << __func__ << " locked " << file->fnode
3539 << " with " << *plock << dendl;
3540 return 0;
3541 }
3542
3543 int BlueFS::unlock_file(FileLock *fl)
3544 {
3545 std::lock_guard l(lock);
3546 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
3547 ceph_assert(fl->file->locked);
3548 fl->file->locked = false;
3549 delete fl;
3550 return 0;
3551 }
3552
3553 int BlueFS::readdir(const string& dirname, vector<string> *ls)
3554 {
3555 std::lock_guard l(lock);
3556 dout(10) << __func__ << " " << dirname << dendl;
3557 if (dirname.empty()) {
3558 // list dirs
3559 ls->reserve(dir_map.size() + 2);
3560 for (auto& q : dir_map) {
3561 ls->push_back(q.first);
3562 }
3563 } else {
3564 // list files in dir
3565 map<string,DirRef>::iterator p = dir_map.find(dirname);
3566 if (p == dir_map.end()) {
3567 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3568 return -ENOENT;
3569 }
3570 DirRef dir = p->second;
3571 ls->reserve(dir->file_map.size() + 2);
3572 for (auto& q : dir->file_map) {
3573 ls->push_back(q.first);
3574 }
3575 }
3576 ls->push_back(".");
3577 ls->push_back("..");
3578 return 0;
3579 }
3580
3581 int BlueFS::unlink(const string& dirname, const string& filename)
3582 {
3583 std::lock_guard l(lock);
3584 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3585 map<string,DirRef>::iterator p = dir_map.find(dirname);
3586 if (p == dir_map.end()) {
3587 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3588 return -ENOENT;
3589 }
3590 DirRef dir = p->second;
3591 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3592 if (q == dir->file_map.end()) {
3593 dout(20) << __func__ << " file " << dirname << "/" << filename
3594 << " not found" << dendl;
3595 return -ENOENT;
3596 }
3597 FileRef file = q->second;
3598 if (file->locked) {
3599 dout(20) << __func__ << " file " << dirname << "/" << filename
3600 << " is locked" << dendl;
3601 return -EBUSY;
3602 }
3603 dir->file_map.erase(filename);
3604 log_t.op_dir_unlink(dirname, filename);
3605 _drop_link(file);
3606 return 0;
3607 }
3608
3609 bool BlueFS::wal_is_rotational()
3610 {
3611 if (bdev[BDEV_WAL]) {
3612 return bdev[BDEV_WAL]->is_rotational();
3613 } else if (bdev[BDEV_DB]) {
3614 return bdev[BDEV_DB]->is_rotational();
3615 }
3616 return bdev[BDEV_SLOW]->is_rotational();
3617 }
3618
/*
  Algorithm.
  do_replay_recovery_read is used when the bluefs log ends abruptly but more
  data appears to belong to it.
  The idea is to search the disk for the definition of an extent that would
  have accompanied the bluefs log, and to check whether using it produces a
  healthy bluefs transaction.
  We encode the already-known bluefs log extents and search the disk for those
  bytes.  When we find them, we decode the bytes that follow as an extent.
  We then read that whole extent and check whether, merged with the existing
  log part, it yields a proper bluefs transaction.
*/
3628 int BlueFS::do_replay_recovery_read(FileReader *log_reader,
3629 size_t replay_pos,
3630 size_t read_offset,
3631 size_t read_len,
3632 bufferlist* bl) {
3633 dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos <<
3634 " needs 0x" << read_offset << "~" << read_len << std::dec << dendl;
3635
3636 bluefs_fnode_t& log_fnode = log_reader->file->fnode;
3637 bufferlist bin_extents;
3638 ceph::encode(log_fnode.extents, bin_extents);
3639 dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl;
3640
3641 // cannot process if too small to effectively search
3642 ceph_assert(bin_extents.length() >= 32);
3643 bufferlist last_32;
3644 last_32.substr_of(bin_extents, bin_extents.length() - 32, 32);
3645
3646 //read fixed part from replay_pos to end of bluefs_log extents
3647 bufferlist fixed;
3648 uint64_t e_off = 0;
3649 auto e = log_fnode.seek(replay_pos, &e_off);
3650 ceph_assert(e != log_fnode.extents.end());
3651 int r = bdev[e->bdev]->read(e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev],
3652 cct->_conf->bluefs_buffered_io);
3653 ceph_assert(r == 0);
3654 //capture dev of last good extent
3655 uint8_t last_e_dev = e->bdev;
3656 uint64_t last_e_off = e->offset;
3657 ++e;
3658 while (e != log_fnode.extents.end()) {
3659 r = bdev[e->bdev]->read(e->offset, e->length, &fixed, ioc[e->bdev],
3660 cct->_conf->bluefs_buffered_io);
3661 ceph_assert(r == 0);
3662 last_e_dev = e->bdev;
3663 ++e;
3664 }
3665 ceph_assert(replay_pos + fixed.length() == read_offset);
3666
3667 dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl;
3668
3669 struct compare {
3670 bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const {
3671 if (a.bdev < b.bdev) return true;
3672 if (a.offset < b.offset) return true;
3673 return a.length < b.length;
3674 }
3675 };
3676 std::set<bluefs_extent_t, compare> extents_rejected;
3677 for (int dcnt = 0; dcnt < 3; dcnt++) {
3678 uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV;
3679 if (bdev[dev] == nullptr) continue;
3680 dout(2) << __func__ << " processing " << get_device_name(dev) << dendl;
3681 interval_set<uint64_t> disk_regions;
3682 disk_regions.insert(0, bdev[dev]->get_size());
3683 for (auto f : file_map) {
3684 auto& e = f.second->fnode.extents;
3685 for (auto& p : e) {
3686 if (p.bdev == dev) {
3687 disk_regions.erase(p.offset, p.length);
3688 }
3689 }
3690 }
3691 size_t disk_regions_count = disk_regions.num_intervals();
3692 dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl;
3693
3694 auto reg = disk_regions.lower_bound(last_e_off);
3695 //for all except first, start from beginning
3696 last_e_off = 0;
3697 if (reg == disk_regions.end()) {
3698 reg = disk_regions.begin();
3699 }
3700 const uint64_t chunk_size = 4 * 1024 * 1024;
3701 const uint64_t page_size = 4096;
3702 const uint64_t max_extent_size = 16;
3703 uint64_t overlay_size = last_32.length() + max_extent_size;
3704 for (size_t i = 0; i < disk_regions_count; reg++, i++) {
3705 if (reg == disk_regions.end()) {
3706 reg = disk_regions.begin();
3707 }
3708 uint64_t pos = reg.get_start();
3709 uint64_t len = reg.get_len();
3710
3711 std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]};
3712 char* raw_data = raw_data_p.get();
3713 memset(raw_data, 0, page_size);
3714
3715 while (len > last_32.length()) {
3716 uint64_t chunk_len = len > chunk_size ? chunk_size : len;
3717 dout(5) << __func__ << " read "
3718 << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len << std::dec << dendl;
3719 r = bdev[dev]->read_random(pos, chunk_len, raw_data + page_size, cct->_conf->bluefs_buffered_io);
3720 ceph_assert(r == 0);
3721
3722 //search for fixed_last_32
3723 char* chunk_b = raw_data + page_size;
3724 char* chunk_e = chunk_b + chunk_len;
3725
3726 char* search_b = chunk_b - overlay_size;
3727 char* search_e = chunk_e;
3728
3729 for (char* sp = search_b; ; sp += last_32.length()) {
3730 sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length());
3731 if (sp == nullptr) {
3732 break;
3733 }
3734
3735 char* n = sp + last_32.length();
3736 dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl;
3737 bufferlist test;
3738 test.append(n, std::min<size_t>(max_extent_size, chunk_e - n));
3739 bluefs_extent_t ne;
3740 try {
3741 bufferlist::const_iterator p = test.begin();
3742 ceph::decode(ne, p);
3743 } catch (buffer::error& e) {
3744 continue;
3745 }
3746 if (extents_rejected.count(ne) != 0) {
3747 dout(5) << __func__ << " extent " << ne << " already refected" <<dendl;
3748 continue;
3749 }
3750 //insert as rejected already. if we succeed, it wouldn't make difference.
3751 extents_rejected.insert(ne);
3752
3753 if (ne.bdev >= MAX_BDEV ||
3754 bdev[ne.bdev] == nullptr ||
3755 ne.length > 16 * 1024 * 1024 ||
3756 (ne.length & 4095) != 0 ||
3757 ne.offset + ne.length > bdev[ne.bdev]->get_size() ||
3758 (ne.offset & 4095) != 0) {
3759 dout(5) << __func__ << " refusing extent " << ne << dendl;
3760 continue;
3761 }
3762 dout(5) << __func__ << " checking extent " << ne << dendl;
3763
3764 //read candidate extent - whole
3765 bufferlist candidate;
3766 candidate.append(fixed);
3767 r = bdev[ne.bdev]->read(ne.offset, ne.length, &candidate, ioc[ne.bdev],
3768 cct->_conf->bluefs_buffered_io);
3769 ceph_assert(r == 0);
3770
3771 //check if transaction & crc is ok
3772 bluefs_transaction_t t;
3773 try {
3774 bufferlist::const_iterator p = candidate.cbegin();
3775 decode(t, p);
3776 }
3777 catch (buffer::error& e) {
3778 dout(5) << __func__ << " failed match" << dendl;
3779 continue;
3780 }
3781
3782 //success, it seems a probable candidate
3783 uint64_t l = std::min<uint64_t>(ne.length, read_len);
3784 //trim to required size
3785 bufferlist requested_read;
3786 requested_read.substr_of(candidate, fixed.length(), l);
3787 bl->append(requested_read);
3788 dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl;
3789 log_fnode.append_extent(ne);
3790 log_fnode.recalc_allocated();
3791 log_reader->buf.pos += l;
3792 return l;
3793 }
3794 //save overlay for next search
3795 memcpy(search_b, chunk_e - overlay_size, overlay_size);
3796 pos += chunk_len;
3797 len -= chunk_len;
3798 }
3799 }
3800 }
3801 return 0;
3802 }
3803
3804 void BlueFS::debug_inject_duplicate_gift(unsigned id,
3805 uint64_t offset,
3806 uint64_t len)
3807 {
3808 dout(0) << __func__ << dendl;
3809 if (id < alloc.size() && alloc[id]) {
3810 alloc[id]->init_add_free(offset, len);
3811 }
3812 }
3813
3814 // ===============================================
3815 // OriginalVolumeSelector
3816
3817 void* OriginalVolumeSelector::get_hint_for_log() const {
3818 return reinterpret_cast<void*>(BlueFS::BDEV_WAL);
3819 }
3820 void* OriginalVolumeSelector::get_hint_by_dir(const string& dirname) const {
3821 uint8_t res = BlueFS::BDEV_DB;
3822 if (dirname.length() > 5) {
3823 // the "db.slow" and "db.wal" directory names are hard-coded at
3824 // match up with bluestore. the slow device is always the second
3825 // one (when a dedicated block.db device is present and used at
3826 // bdev 0). the wal device is always last.
3827 if (boost::algorithm::ends_with(dirname, ".slow")) {
3828 res = BlueFS::BDEV_SLOW;
3829 }
3830 else if (boost::algorithm::ends_with(dirname, ".wal")) {
3831 res = BlueFS::BDEV_WAL;
3832 }
3833 }
3834 return reinterpret_cast<void*>(res);
3835 }
3836
3837 uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
3838 {
3839 return (uint8_t)(reinterpret_cast<uint64_t>(hint));
3840 }
3841
3842 void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
3843 {
3844 res.emplace_back(base, db_total);
3845 res.emplace_back(base + ".slow", slow_total);
3846 }
3847
3848 #undef dout_prefix
3849 #define dout_prefix *_dout << "OriginalVolumeSelector: "
3850
3851 void OriginalVolumeSelector::dump(ostream& sout) {
3852 sout<< "wal_total:" << wal_total
3853 << ", db_total:" << db_total
3854 << ", slow_total:" << slow_total
3855 << std::endl;
3856 }