git.proxmox.com Git - ceph.git/blob - ceph/src/os/bluestore/BlueFS.cc
import ceph 15.2.14
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "boost/algorithm/string.hpp"
5 #include "bluestore_common.h"
6 #include "BlueFS.h"
7
8 #include "common/debug.h"
9 #include "common/errno.h"
10 #include "common/perf_counters.h"
11 #include "BlockDevice.h"
12 #include "Allocator.h"
13 #include "include/ceph_assert.h"
14 #include "common/admin_socket.h"
15
// Logging configuration for this file: context, subsystem and line prefix.
// NOTE(review): presumably consumed by the dout()/derr macros - confirm in common/debug.h.
16 #define dout_context cct
17 #define dout_subsys ceph_subsys_bluefs
18 #undef dout_prefix
19 #define dout_prefix *_dout << "bluefs "
// Bring cmd_getval into scope; it is used by the admin socket hook below.
20 using TOPNSPC::common::cmd_getval;
// One MEMPOOL_DEFINE_OBJECT_FACTORY invocation per BlueFS helper type.
// NOTE(review): presumably these bind each type to a mempool - confirm against the mempool header.
21 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
22 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
23 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer);
24 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
25 bluefs_file_reader_buffer, bluefs_file_reader);
26 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader);
27 MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
28
29 static void wal_discard_cb(void *priv, void* priv2) {
30 BlueFS *bluefs = static_cast<BlueFS*>(priv);
31 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
32 bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
33 }
34
35 static void db_discard_cb(void *priv, void* priv2) {
36 BlueFS *bluefs = static_cast<BlueFS*>(priv);
37 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
38 bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
39 }
40
41 static void slow_discard_cb(void *priv, void* priv2) {
42 BlueFS *bluefs = static_cast<BlueFS*>(priv);
43 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
44 bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
45 }
46
47 class BlueFS::SocketHook : public AdminSocketHook {
48 BlueFS* bluefs;
49 public:
50 static BlueFS::SocketHook* create(BlueFS* bluefs)
51 {
52 BlueFS::SocketHook* hook = nullptr;
53 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
54 if (admin_socket) {
55 hook = new BlueFS::SocketHook(bluefs);
56 int r = admin_socket->register_command("bluestore bluefs available "
57 "name=alloc_size,type=CephInt,req=false",
58 hook,
59 "Report available space for bluefs. "
60 "If alloc_size set, make simulation.");
61 if (r != 0) {
62 ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
63 delete hook;
64 hook = nullptr;
65 } else {
66 r = admin_socket->register_command("bluefs stats",
67 hook,
68 "Dump internal statistics for bluefs."
69 "");
70 ceph_assert(r == 0);
71 r = admin_socket->register_command("bluefs debug_inject_read_zeros", hook,
72 "Injects 8K zeros into next BlueFS read. Debug only.");
73 ceph_assert(r == 0);
74 }
75 }
76 return hook;
77 }
78
79 ~SocketHook() {
80 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
81 admin_socket->unregister_commands(this);
82 }
83 private:
84 SocketHook(BlueFS* bluefs) :
85 bluefs(bluefs) {}
86 int call(std::string_view command, const cmdmap_t& cmdmap,
87 Formatter *f,
88 std::ostream& errss,
89 bufferlist& out) override {
90 if (command == "bluestore bluefs available") {
91 int64_t alloc_size = 0;
92 cmd_getval(cmdmap, "alloc_size", alloc_size);
93 if ((alloc_size & (alloc_size - 1)) != 0) {
94 errss << "Invalid allocation size:'" << alloc_size << std::endl;
95 return -EINVAL;
96 }
97 if (alloc_size == 0)
98 alloc_size = bluefs->cct->_conf->bluefs_alloc_size;
99 f->open_object_section("bluefs_available_space");
100 for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
101 if (bluefs->bdev[dev]) {
102 f->open_object_section("dev");
103 f->dump_string("device", bluefs->get_device_name(dev));
104 ceph_assert(bluefs->alloc[dev]);
105 f->dump_int("free", bluefs->alloc[dev]->get_free());
106 f->close_section();
107 }
108 }
109 size_t extra_space = 0;
110 if (bluefs->slow_dev_expander) {
111 extra_space = bluefs->slow_dev_expander->available_freespace(alloc_size);
112 }
113 f->dump_int("available_from_bluestore", extra_space);
114 f->close_section();
115 } else if (command == "bluefs stats") {
116 std::stringstream ss;
117 bluefs->dump_block_extents(ss);
118 bluefs->dump_volume_selector(ss);
119 out.append(ss);
120 } else if (command == "bluefs debug_inject_read_zeros") {
121 bluefs->inject_read_zeros++;
122 } else {
123 errss << "Invalid command" << std::endl;
124 return -ENOSYS;
125 }
126 return 0;
127 }
128 };
129
130 BlueFS::BlueFS(CephContext* cct)
131 : cct(cct),
132 bdev(MAX_BDEV),
133 ioc(MAX_BDEV),
134 block_all(MAX_BDEV)
135 {
136 discard_cb[BDEV_WAL] = wal_discard_cb;
137 discard_cb[BDEV_DB] = db_discard_cb;
138 discard_cb[BDEV_SLOW] = slow_discard_cb;
139 asok_hook = SocketHook::create(this);
140 }
141
142 BlueFS::~BlueFS()
143 {
144 delete asok_hook;
145 for (auto p : ioc) {
146 if (p)
147 p->aio_wait();
148 }
149 for (auto p : bdev) {
150 if (p) {
151 p->close();
152 delete p;
153 }
154 }
155 for (auto p : ioc) {
156 delete p;
157 }
158 }
159
160 void BlueFS::_init_logger()
161 {
162 PerfCountersBuilder b(cct, "bluefs",
163 l_bluefs_first, l_bluefs_last);
164 b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes",
165 "Bytes gifted from BlueStore", NULL, 0, unit_t(UNIT_BYTES));
166 b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes",
167 "Bytes reclaimed by BlueStore", NULL, 0, unit_t(UNIT_BYTES));
168 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
169 "Total bytes (main db device)",
170 "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
171 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
172 "Used bytes (main db device)",
173 "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
174 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
175 "Total bytes (wal device)",
176 "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
177 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
178 "Used bytes (wal device)",
179 "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
180 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
181 "Total bytes (slow device)",
182 "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
183 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
184 "Used bytes (slow device)",
185 "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
186 b.add_u64(l_bluefs_num_files, "num_files", "File count",
187 "f", PerfCountersBuilder::PRIO_USEFUL);
188 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
189 "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
190 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
191 "Compactions of the metadata log");
192 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
193 "Bytes written to the metadata log", "j",
194 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
195 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
196 "Files written to WAL");
197 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
198 "Files written to SSTs");
199 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
200 "Bytes written to WAL", "wal",
201 PerfCountersBuilder::PRIO_CRITICAL);
202 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
203 "Bytes written to SSTs", "sst",
204 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
205 b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
206 "Bytes written to WAL/SSTs at slow device", NULL,
207 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
208 b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
209 "Maximum bytes allocated from WAL");
210 b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
211 "Maximum bytes allocated from DB");
212 b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
213 "Maximum bytes allocated from SLOW");
214
215 b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
216 "random read requests processed");
217 b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
218 "Bytes requested in random read mode", NULL,
219 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
220 b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
221 "random reads requests going to disk");
222 b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
223 "Bytes read from disk in random read mode", NULL,
224 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
225 b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
226 "random read requests processed using prefetch buffer");
227 b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
228 "Bytes read from prefetch buffer in random read mode", NULL,
229 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
230
231 b.add_u64_counter(l_bluefs_read_count, "read_count",
232 "buffered read requests processed");
233 b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
234 "Bytes requested in buffered read mode", NULL,
235 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
236
237 b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
238 "prefetch read requests processed");
239 b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
240 "Bytes requested in prefetch read mode", NULL,
241 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
242 b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate",
243 "How many times bluefs read found page with all 0s");
244 b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors",
245 "How many times bluefs read found transient page with all 0s");
246
247 logger = b.create_perf_counters();
248 cct->get_perfcounters_collection()->add(logger);
249 }
250
251 void BlueFS::_shutdown_logger()
252 {
253 cct->get_perfcounters_collection()->remove(logger);
254 delete logger;
255 }
256
257 void BlueFS::_update_logger_stats()
258 {
259 // we must be holding the lock
260 logger->set(l_bluefs_num_files, file_map.size());
261 logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
262
263 if (alloc[BDEV_WAL]) {
264 logger->set(l_bluefs_wal_total_bytes, block_all[BDEV_WAL].size());
265 logger->set(l_bluefs_wal_used_bytes,
266 block_all[BDEV_WAL].size() - alloc[BDEV_WAL]->get_free());
267 }
268 if (alloc[BDEV_DB]) {
269 logger->set(l_bluefs_db_total_bytes, block_all[BDEV_DB].size());
270 logger->set(l_bluefs_db_used_bytes,
271 block_all[BDEV_DB].size() - alloc[BDEV_DB]->get_free());
272 }
273 if (alloc[BDEV_SLOW]) {
274 logger->set(l_bluefs_slow_total_bytes, block_all[BDEV_SLOW].size());
275 logger->set(l_bluefs_slow_used_bytes,
276 block_all[BDEV_SLOW].size() - alloc[BDEV_SLOW]->get_free());
277 }
278 }
279
280 int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
281 bool shared_with_bluestore)
282 {
283 dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
284 ceph_assert(id < bdev.size());
285 ceph_assert(bdev[id] == NULL);
286 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
287 discard_cb[id], static_cast<void*>(this));
288 if (shared_with_bluestore) {
289 b->set_no_exclusive_lock();
290 }
291 int r = b->open(path);
292 if (r < 0) {
293 delete b;
294 return r;
295 }
296 if (trim) {
297 b->discard(0, b->get_size());
298 }
299
300 dout(1) << __func__ << " bdev " << id << " path " << path
301 << " size " << byte_u_t(b->get_size()) << dendl;
302 bdev[id] = b;
303 ioc[id] = new IOContext(cct, NULL);
304 return 0;
305 }
306
307 bool BlueFS::bdev_support_label(unsigned id)
308 {
309 ceph_assert(id < bdev.size());
310 ceph_assert(bdev[id]);
311 return bdev[id]->supported_bdev_label();
312 }
313
314 uint64_t BlueFS::get_block_device_size(unsigned id)
315 {
316 if (id < bdev.size() && bdev[id])
317 return bdev[id]->get_size();
318 return 0;
319 }
320
321 void BlueFS::_add_block_extent(unsigned id, uint64_t offset, uint64_t length,
322 bool skip)
323 {
324 dout(1) << __func__ << " bdev " << id
325 << " 0x" << std::hex << offset << "~" << length << std::dec
326 << " skip " << skip
327 << dendl;
328
329 ceph_assert(id < bdev.size());
330 ceph_assert(bdev[id]);
331 ceph_assert(bdev[id]->get_size() >= offset + length);
332 block_all[id].insert(offset, length);
333
334 if (id < alloc.size() && alloc[id]) {
335 if (!skip)
336 log_t.op_alloc_add(id, offset, length);
337
338 alloc[id]->init_add_free(offset, length);
339 }
340
341 if (logger)
342 logger->inc(l_bluefs_gift_bytes, length);
343 dout(10) << __func__ << " done" << dendl;
344 }
345
346 int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
347 PExtentVector *extents)
348 {
349 std::unique_lock l(lock);
350 dout(1) << __func__ << " bdev " << id
351 << " want 0x" << std::hex << want << std::dec << dendl;
352 ceph_assert(id < alloc.size());
353 ceph_assert(alloc[id]);
354 int64_t got = 0;
355
356 interval_set<uint64_t> granular;
357 while (want > 0 && !block_unused_too_granular[id].empty()) {
358 auto p = block_unused_too_granular[id].begin();
359 dout(20) << __func__ << " unused " << (int)id << ":"
360 << std::hex << p.get_start() << "~" << p.get_len() << dendl;
361 extents->push_back({p.get_start(), p.get_len()});
362 granular.insert(p.get_start(), p.get_len());
363 if (want >= p.get_len()) {
364 want -= p.get_len();
365 } else {
366 want = 0;
367 }
368 got += p.get_len();
369 block_unused_too_granular[id].erase(p);
370 }
371
372 if (want > 0) {
373 got += alloc[id]->allocate(want, alloc_size[id], 0, extents);
374 ceph_assert(got != 0);
375 if (got < 0) {
376 derr << __func__ << " failed to allocate space to return to bluestore"
377 << dendl;
378 alloc[id]->dump();
379 block_unused_too_granular[id].insert(granular);
380 return got;
381 }
382
383 for (auto& p : *extents) {
384 block_all[id].erase(p.offset, p.length);
385 log_t.op_alloc_rm(id, p.offset, p.length);
386 }
387
388 flush_bdev();
389 int r = _flush_and_sync_log(l);
390 ceph_assert(r == 0);
391 }
392
393 logger->inc(l_bluefs_reclaim_bytes, got);
394 dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want
395 << " got " << *extents << dendl;
396 return 0;
397 }
398
399 void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
400 {
401 dout(10) << __func__ << " bdev " << id << dendl;
402 ceph_assert(alloc[id]);
403 alloc[id]->release(to_release);
404 }
405
406 uint64_t BlueFS::get_used()
407 {
408 std::lock_guard l(lock);
409 uint64_t used = 0;
410 for (unsigned id = 0; id < MAX_BDEV; ++id) {
411 if (alloc[id]) {
412 used += block_all[id].size() - alloc[id]->get_free();
413 }
414 }
415 return used;
416 }
417
418 uint64_t BlueFS::get_total(unsigned id)
419 {
420 std::lock_guard l(lock);
421 ceph_assert(id < block_all.size());
422 return block_all[id].size();
423 }
424
425 uint64_t BlueFS::get_free(unsigned id)
426 {
427 std::lock_guard l(lock);
428 ceph_assert(id < alloc.size());
429 return alloc[id]->get_free();
430 }
431
432 void BlueFS::dump_perf_counters(Formatter *f)
433 {
434 f->open_object_section("bluefs_perf_counters");
435 logger->dump_formatted(f,0);
436 f->close_section();
437 }
438
439 void BlueFS::dump_block_extents(ostream& out)
440 {
441 for (unsigned i = 0; i < MAX_BDEV; ++i) {
442 if (!bdev[i]) {
443 continue;
444 }
445 auto owned = get_total(i);
446 auto free = get_free(i);
447
448 out << i << " : device size 0x" << std::hex << bdev[i]->get_size()
449 << " : own 0x" << block_all[i]
450 << " = 0x" << owned
451 << " : using 0x" << owned - free
452 << std::dec << "(" << byte_u_t(owned - free) << ")";
453 if (i == _get_slow_device_id()) {
454 ceph_assert(slow_dev_expander);
455 ceph_assert(alloc[i]);
456 free = slow_dev_expander->available_freespace(alloc_size[i]);
457 out << std::hex
458 << " : bluestore has 0x" << free
459 << std::dec << "(" << byte_u_t(free) << ") available";
460 }
461 out << "\n";
462 }
463 }
464
465 void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
466 {
467 std::lock_guard l(lock);
468 usage->resize(bdev.size());
469 for (unsigned id = 0; id < bdev.size(); ++id) {
470 if (!bdev[id]) {
471 (*usage)[id] = make_pair(0, 0);
472 continue;
473 }
474 (*usage)[id].first = alloc[id]->get_free();
475 (*usage)[id].second = block_all[id].size();
476 uint64_t used =
477 (block_all[id].size() - (*usage)[id].first) * 100 / block_all[id].size();
478 dout(10) << __func__ << " bdev " << id
479 << " free " << (*usage)[id].first
480 << " (" << byte_u_t((*usage)[id].first) << ")"
481 << " / " << (*usage)[id].second
482 << " (" << byte_u_t((*usage)[id].second) << ")"
483 << ", used " << used << "%"
484 << dendl;
485 }
486 }
487
488 int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
489 {
490 std::lock_guard l(lock);
491 dout(10) << __func__ << " bdev " << id << dendl;
492 if (id >= block_all.size())
493 return -EINVAL;
494 *extents = block_all[id];
495 return 0;
496 }
497
498 int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
499 {
500 std::unique_lock l(lock);
501 dout(1) << __func__
502 << " osd_uuid " << osd_uuid
503 << dendl;
504
505 // set volume selector if not provided before/outside
506 if (vselector == nullptr) {
507 vselector.reset(
508 new OriginalVolumeSelector(
509 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
510 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
511 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
512 }
513
514 _init_alloc();
515 _init_logger();
516
517 super.version = 1;
518 super.block_size = bdev[BDEV_DB]->get_block_size();
519 super.osd_uuid = osd_uuid;
520 super.uuid.generate_random();
521 dout(1) << __func__ << " uuid " << super.uuid << dendl;
522
523 // init log
524 FileRef log_file = ceph::make_ref<File>();
525 log_file->fnode.ino = 1;
526 log_file->vselector_hint = vselector->get_hint_for_log();
527 int r = _allocate(
528 vselector->select_prefer_bdev(log_file->vselector_hint),
529 cct->_conf->bluefs_max_log_runway,
530 &log_file->fnode);
531 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
532 ceph_assert(r == 0);
533 log_writer = _create_writer(log_file);
534
535 // initial txn
536 log_t.op_init();
537 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
538 interval_set<uint64_t>& p = block_all[bdev];
539 if (p.empty())
540 continue;
541 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
542 dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
543 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
544 << dendl;
545 log_t.op_alloc_add(bdev, q.get_start(), q.get_len());
546 }
547 }
548 _flush_and_sync_log(l);
549
550 // write supers
551 super.log_fnode = log_file->fnode;
552 super.memorized_layout = layout;
553 _write_super(BDEV_DB);
554 flush_bdev();
555
556 // clean up
557 super = bluefs_super_t();
558 _close_writer(log_writer);
559 log_writer = NULL;
560 block_all.clear();
561 vselector.reset(nullptr);
562 _stop_alloc();
563 _shutdown_logger();
564
565 dout(10) << __func__ << " success" << dendl;
566 return 0;
567 }
568
569 void BlueFS::_init_alloc()
570 {
571 dout(20) << __func__ << dendl;
572 alloc.resize(MAX_BDEV);
573 alloc_size.resize(MAX_BDEV, 0);
574 pending_release.resize(MAX_BDEV);
575 block_unused_too_granular.resize(MAX_BDEV);
576
577 if (bdev[BDEV_WAL]) {
578 alloc_size[BDEV_WAL] = cct->_conf->bluefs_alloc_size;
579 }
580 if (bdev[BDEV_SLOW]) {
581 alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
582 alloc_size[BDEV_SLOW] = cct->_conf->bluefs_shared_alloc_size;
583 } else {
584 alloc_size[BDEV_DB] = cct->_conf->bluefs_shared_alloc_size;
585 }
586 // new wal and db devices are never shared
587 if (bdev[BDEV_NEWWAL]) {
588 alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
589 }
590 if (bdev[BDEV_NEWDB]) {
591 alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
592 }
593
594 for (unsigned id = 0; id < bdev.size(); ++id) {
595 if (!bdev[id]) {
596 continue;
597 }
598 ceph_assert(bdev[id]->get_size());
599 std::string name = "bluefs-";
600 const char* devnames[] = {"wal","db","slow"};
601 if (id <= BDEV_SLOW)
602 name += devnames[id];
603 else
604 name += to_string(uintptr_t(this));
605 ceph_assert(alloc_size[id]);
606 dout(1) << __func__ << " id " << id
607 << " alloc_size 0x" << std::hex << alloc_size[id]
608 << " size 0x" << bdev[id]->get_size() << std::dec << dendl;
609 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
610 bdev[id]->get_size(),
611 alloc_size[id], name);
612 interval_set<uint64_t>& p = block_all[id];
613 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
614 alloc[id]->init_add_free(q.get_start(), q.get_len());
615 }
616 }
617 }
618
619 void BlueFS::_stop_alloc()
620 {
621 dout(20) << __func__ << dendl;
622 for (auto p : bdev) {
623 if (p)
624 p->discard_drain();
625 }
626
627 for (auto p : alloc) {
628 if (p != nullptr) {
629 p->shutdown();
630 delete p;
631 }
632 }
633 alloc.clear();
634 block_unused_too_granular.clear();
635 }
636
637 int BlueFS::read(uint8_t ndev, uint64_t off, uint64_t len,
638 ceph::buffer::list *pbl, IOContext *ioc, bool buffered)
639 {
640 dout(10) << __func__ << " dev " << int(ndev)
641 << ": 0x" << std::hex << off << "~" << len << std::dec
642 << (buffered ? " buffered" : "")
643 << dendl;
644 int r;
645 bufferlist bl;
646 r = bdev[ndev]->read(off, len, &bl, ioc, buffered);
647 if (r != 0) {
648 return r;
649 }
650 uint64_t block_size = bdev[ndev]->get_block_size();
651 if (inject_read_zeros) {
652 if (len >= block_size * 2) {
653 derr << __func__ << " injecting error, zeros at "
654 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
655 << "~" << (block_size * 2) << std::dec << dendl;
656 //use beginning, replace 8K in the middle with zeros, use tail
657 bufferlist temp;
658 bl.splice(0, len / 2 - block_size, &temp);
659 temp.append_zero(block_size * 2);
660 bl.splice(block_size * 2, len / 2 - block_size, &temp);
661 bl = temp;
662 inject_read_zeros--;
663 }
664 }
665 //make a check if there is a block with all 0
666 uint64_t to_check_len = len;
667 uint64_t skip = p2nphase(off, block_size);
668 if (skip >= to_check_len) {
669 return r;
670 }
671 auto it = bl.begin(skip);
672 to_check_len -= skip;
673 bool all_zeros = false;
674 while (all_zeros == false && to_check_len >= block_size) {
675 // checking 0s step
676 unsigned block_left = block_size;
677 unsigned avail;
678 const char* data;
679 all_zeros = true;
680 while (all_zeros && block_left > 0) {
681 avail = it.get_ptr_and_advance(block_left, &data);
682 block_left -= avail;
683 all_zeros = mem_is_zero(data, avail);
684 }
685 // skipping step
686 while (block_left > 0) {
687 avail = it.get_ptr_and_advance(block_left, &data);
688 block_left -= avail;
689 }
690 to_check_len -= block_size;
691 }
692 if (all_zeros) {
693 logger->inc(l_bluefs_read_zeros_candidate, 1);
694 bufferlist bl_reread;
695 r = bdev[ndev]->read(off, len, &bl_reread, ioc, buffered);
696 if (r != 0) {
697 return r;
698 }
699 // check if both read gave the same
700 if (!bl.contents_equal(bl_reread)) {
701 // report problems to log, but continue, maybe it will be good now...
702 derr << __func__ << " initial read of " << int(ndev)
703 << ": 0x" << std::hex << off << "~" << len
704 << std::dec << ": different then re-read " << dendl;
705 logger->inc(l_bluefs_read_zeros_errors, 1);
706 }
707 // use second read will be better if is different
708 pbl->append(bl_reread);
709 } else {
710 pbl->append(bl);
711 }
712 return r;
713 }
714
715 int BlueFS::read_random(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered)
716 {
717 dout(10) << __func__ << " dev " << int(ndev)
718 << ": 0x" << std::hex << off << "~" << len << std::dec
719 << (buffered ? " buffered" : "")
720 << dendl;
721 int r;
722 r = bdev[ndev]->read_random(off, len, buf, buffered);
723 if (r != 0) {
724 return r;
725 }
726 uint64_t block_size = bdev[ndev]->get_block_size();
727 if (inject_read_zeros) {
728 if (len >= block_size * 2) {
729 derr << __func__ << " injecting error, zeros at "
730 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
731 << "~" << (block_size * 2) << std::dec << dendl;
732 //zero middle 8K
733 memset(buf + len / 2 - block_size, 0, block_size * 2);
734 inject_read_zeros--;
735 }
736 }
737 //make a check if there is a block with all 0
738 uint64_t to_check_len = len;
739 const char* data = buf;
740 uint64_t skip = p2nphase(off, block_size);
741 if (skip >= to_check_len) {
742 return r;
743 }
744 to_check_len -= skip;
745 data += skip;
746
747 bool all_zeros = false;
748 while (all_zeros == false && to_check_len >= block_size) {
749 if (mem_is_zero(data, block_size)) {
750 // at least one block is all zeros
751 all_zeros = true;
752 break;
753 }
754 data += block_size;
755 to_check_len -= block_size;
756 }
757 if (all_zeros) {
758 logger->inc(l_bluefs_read_zeros_candidate, 1);
759 std::unique_ptr<char[]> data_reread(new char[len]);
760 r = bdev[ndev]->read_random(off, len, &data_reread[0], buffered);
761 if (r != 0) {
762 return r;
763 }
764 // check if both read gave the same
765 if (memcmp(buf, &data_reread[0], len) != 0) {
766 derr << __func__ << " initial read of " << int(ndev)
767 << ": 0x" << std::hex << off << "~" << len
768 << std::dec << ": different then re-read " << dendl;
769 logger->inc(l_bluefs_read_zeros_errors, 1);
770 // second read is probably better
771 memcpy(buf, &data_reread[0], len);
772 }
773 }
774 return r;
775 }
776
777 int BlueFS::mount()
778 {
779 dout(1) << __func__ << dendl;
780
781 int r = _open_super();
782 if (r < 0) {
783 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
784 goto out;
785 }
786
787 // set volume selector if not provided before/outside
788 if (vselector == nullptr) {
789 vselector.reset(
790 new OriginalVolumeSelector(
791 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
792 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
793 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
794 }
795
796 block_all.clear();
797 block_all.resize(MAX_BDEV);
798 _init_alloc();
799 _init_logger();
800
801 r = _replay(false, false);
802 if (r < 0) {
803 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
804 _stop_alloc();
805 goto out;
806 }
807
808 // init freelist
809 for (auto& p : file_map) {
810 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
811 for (auto& q : p.second->fnode.extents) {
812 alloc[q.bdev]->init_rm_free(q.offset, q.length);
813 }
814 }
815
816 // set up the log for future writes
817 log_writer = _create_writer(_get_file(1));
818 ceph_assert(log_writer->file->fnode.ino == 1);
819 log_writer->pos = log_writer->file->fnode.size;
820 dout(10) << __func__ << " log write pos set to 0x"
821 << std::hex << log_writer->pos << std::dec
822 << dendl;
823
824 return 0;
825
826 out:
827 super = bluefs_super_t();
828 return r;
829 }
830
831 int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const
832 {
833 if (super.memorized_layout) {
834 if (layout == *super.memorized_layout) {
835 dout(10) << __func__ << " bluefs layout verified positively" << dendl;
836 } else {
837 derr << __func__ << " memorized layout doesn't fit current one" << dendl;
838 return -EIO;
839 }
840 } else {
841 dout(10) << __func__ << " no memorized_layout in bluefs superblock"
842 << dendl;
843 }
844
845 return 0;
846 }
847
848 void BlueFS::umount(bool avoid_compact)
849 {
850 dout(1) << __func__ << dendl;
851
852 sync_metadata(avoid_compact);
853
854 _close_writer(log_writer);
855 log_writer = NULL;
856
857 vselector.reset(nullptr);
858 _stop_alloc();
859 file_map.clear();
860 dir_map.clear();
861 super = bluefs_super_t();
862 log_t.clear();
863 _shutdown_logger();
864 }
865
866 int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
867 {
868 dout(1) << __func__ << dendl;
869
870 if(id == BDEV_NEWDB) {
871 int new_log_dev_cur = BDEV_WAL;
872 int new_log_dev_next = BDEV_WAL;
873 if (!bdev[BDEV_WAL]) {
874 new_log_dev_cur = BDEV_NEWDB;
875 new_log_dev_next = BDEV_DB;
876 }
877 _rewrite_log_and_layout_sync(false,
878 BDEV_NEWDB,
879 new_log_dev_cur,
880 new_log_dev_next,
881 RENAME_DB2SLOW,
882 layout);
883 //}
884 } else if(id == BDEV_NEWWAL) {
885 _rewrite_log_and_layout_sync(false,
886 BDEV_DB,
887 BDEV_NEWWAL,
888 BDEV_WAL,
889 REMOVE_WAL,
890 layout);
891 } else {
892 assert(false);
893 }
894 return 0;
895 }
896
897 void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
898 {
899 if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
900 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
901 if (bdev[BDEV_WAL])
902 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
903 }
904
905 void BlueFS::get_devices(set<string> *ls)
906 {
907 for (unsigned i = 0; i < MAX_BDEV; ++i) {
908 if (bdev[i]) {
909 bdev[i]->get_devices(ls);
910 }
911 }
912 }
913
914 int BlueFS::fsck()
915 {
916 std::lock_guard l(lock);
917 dout(1) << __func__ << dendl;
918 // hrm, i think we check everything on mount...
919 return 0;
920 }
921
922 int BlueFS::_write_super(int dev)
923 {
924 // build superblock
925 bufferlist bl;
926 encode(super, bl);
927 uint32_t crc = bl.crc32c(-1);
928 encode(crc, bl);
929 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
930 dout(10) << __func__ << " superblock " << super.version << dendl;
931 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
932 ceph_assert_always(bl.length() <= get_super_length());
933 bl.append_zero(get_super_length() - bl.length());
934
935 bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
936 dout(20) << __func__ << " v " << super.version
937 << " crc 0x" << std::hex << crc
938 << " offset 0x" << get_super_offset() << std::dec
939 << dendl;
940 return 0;
941 }
942
943 int BlueFS::_open_super()
944 {
945 dout(10) << __func__ << dendl;
946
947 bufferlist bl;
948 uint32_t expected_crc, crc;
949 int r;
950
951 // always the second block
952 r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
953 &bl, ioc[BDEV_DB], false);
954 if (r < 0)
955 return r;
956
957 auto p = bl.cbegin();
958 decode(super, p);
959 {
960 bufferlist t;
961 t.substr_of(bl, 0, p.get_off());
962 crc = t.crc32c(-1);
963 }
964 decode(expected_crc, p);
965 if (crc != expected_crc) {
966 derr << __func__ << " bad crc on superblock, expected 0x"
967 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
968 << dendl;
969 return -EIO;
970 }
971 dout(10) << __func__ << " superblock " << super.version << dendl;
972 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
973 return 0;
974 }
975
976 int BlueFS::_check_new_allocations(const bluefs_fnode_t& fnode,
977 size_t dev_count,
978 boost::dynamic_bitset<uint64_t>* owned_blocks,
979 boost::dynamic_bitset<uint64_t>* used_blocks)
980 {
981 auto& fnode_extents = fnode.extents;
982 for (auto e : fnode_extents) {
983 auto id = e.bdev;
984 bool fail = false;
985 ceph_assert(id < dev_count);
986 apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id],
987 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
988 if (!bs.test(pos)) {
989 fail = true;
990 }
991 }
992 );
993 if (fail) {
994 derr << __func__ << " invalid extent " << int(id)
995 << ": 0x" << std::hex << e.offset << "~" << e.length
996 << std::dec
997 << ": wasn't given but allocated for ino " << fnode.ino
998 << dendl;
999 return -EFAULT;
1000 }
1001
1002 apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
1003 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1004 if (bs.test(pos)) {
1005 fail = true;
1006 }
1007 bs.set(pos);
1008 }
1009 );
1010 if (fail) {
1011 derr << __func__ << " invalid extent " << int(e.bdev)
1012 << ": 0x" << std::hex << e.offset << "~" << e.length
1013 << std::dec << ": duplicate reference, ino " << fnode.ino
1014 << dendl;
1015 return -EFAULT;
1016 }
1017 }
1018 return 0;
1019 }
1020
1021 int BlueFS::_adjust_granularity(
1022 __u8 id, uint64_t *offset, uint64_t *length, bool alloc)
1023 {
1024 const char *op = alloc ? "op_alloc_add" : "op_alloc_rm";
1025 auto oldo = *offset;
1026 auto oldl = *length;
1027 if (*offset & (alloc_size[id] - 1)) {
1028 *offset &= ~(alloc_size[id] - 1);
1029 *offset += alloc_size[id];
1030 if (*length > *offset - oldo) {
1031 if (alloc) {
1032 block_unused_too_granular[id].insert(oldo, *offset - oldo);
1033 } else {
1034 block_unused_too_granular[id].erase(oldo, *offset - oldo);
1035 }
1036 *length -= (*offset - oldo);
1037 } else {
1038 if (alloc) {
1039 block_unused_too_granular[id].insert(oldo, *length);
1040 } else {
1041 block_unused_too_granular[id].erase(oldo, *length);
1042 }
1043 *length = 0;
1044 }
1045 }
1046 if (*length & (alloc_size[id] - 1)) {
1047 *length &= ~(alloc_size[id] - 1);
1048 if (alloc) {
1049 block_unused_too_granular[id].insert(
1050 *offset + *length,
1051 oldo + oldl - *offset - *length);
1052 } else {
1053 block_unused_too_granular[id].erase(
1054 *offset + *length,
1055 oldo + oldl - *offset - *length);
1056 }
1057 }
1058 if (oldo != *offset || oldl != *length) {
1059 dout(10) << __func__ << " " << op << " "
1060 << (int)id << ":" << std::hex << oldo << "~" << oldl
1061 << " -> " << (int)id << ":" << *offset << "~" << *length << dendl;
1062 }
1063 return 0;
1064 }
1065
1066 int BlueFS::_verify_alloc_granularity(
1067 __u8 id, uint64_t offset, uint64_t length, const char *op)
1068 {
1069 if ((offset & (alloc_size[id] - 1)) ||
1070 (length & (alloc_size[id] - 1))) {
1071 derr << __func__ << " " << op << " of " << (int)id
1072 << ":0x" << std::hex << offset << "~" << length << std::dec
1073 << " does not align to alloc_size 0x"
1074 << std::hex << alloc_size[id] << std::dec << dendl;
1075 // be helpful
1076 auto need = alloc_size[id];
1077 while (need && ((offset & (need - 1)) ||
1078 (length & (need - 1)))) {
1079 need >>= 1;
1080 }
1081 if (need) {
1082 const char *which;
1083 if (id == BDEV_SLOW ||
1084 (id == BDEV_DB && !bdev[BDEV_SLOW])) {
1085 which = "bluefs_shared_alloc_size";
1086 } else {
1087 which = "bluefs_alloc_size";
1088 }
1089 derr << "work-around by setting " << which << " = " << need
1090 << " for this OSD" << dendl;
1091 }
1092 return -EFAULT;
1093 }
1094 return 0;
1095 }
1096
1097 int BlueFS::_replay(bool noop, bool to_stdout)
1098 {
1099 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
1100 ino_last = 1; // by the log
1101 log_seq = 0;
1102
1103 FileRef log_file;
1104 log_file = _get_file(1);
1105
1106 // sanity check
1107 for (auto& a : block_unused_too_granular) {
1108 ceph_assert(a.empty());
1109 }
1110
1111 if (!noop) {
1112 log_file->fnode = super.log_fnode;
1113 log_file->vselector_hint =
1114 vselector->get_hint_for_log();
1115 } else {
1116 // do not use fnode from superblock in 'noop' mode - log_file's one should
1117 // be fine and up-to-date
1118 ceph_assert(log_file->fnode.ino == 1);
1119 ceph_assert(log_file->fnode.extents.size() != 0);
1120 }
1121 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
1122 if (unlikely(to_stdout)) {
1123 std::cout << " log_fnode " << super.log_fnode << std::endl;
1124 }
1125
1126 FileReader *log_reader = new FileReader(
1127 log_file, cct->_conf->bluefs_max_prefetch,
1128 false, // !random
1129 true); // ignore eof
1130
1131 bool seen_recs = false;
1132
1133 boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
1134 boost::dynamic_bitset<uint64_t> owned_blocks[MAX_BDEV];
1135
1136 if (cct->_conf->bluefs_log_replay_check_allocations) {
1137 for (size_t i = 0; i < MAX_BDEV; ++i) {
1138 if (alloc_size[i] != 0 && bdev[i] != nullptr) {
1139 used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
1140 owned_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
1141 }
1142 }
1143 }
1144
1145 bool first_log_check = true;
1146
1147 while (true) {
1148 ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
1149 uint64_t pos = log_reader->buf.pos;
1150 uint64_t read_pos = pos;
1151 bufferlist bl;
1152 {
1153 int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size,
1154 &bl, NULL);
1155 if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) {
1156 r += do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl);
1157 }
1158 assert(r == (int)super.block_size);
1159 read_pos += r;
1160 }
1161 uint64_t more = 0;
1162 uint64_t seq;
1163 uuid_d uuid;
1164 {
1165 auto p = bl.cbegin();
1166 __u8 a, b;
1167 uint32_t len;
1168 decode(a, p);
1169 decode(b, p);
1170 decode(len, p);
1171 decode(uuid, p);
1172 decode(seq, p);
1173 if (len + 6 > bl.length()) {
1174 more = round_up_to(len + 6 - bl.length(), super.block_size);
1175 }
1176 }
1177 if (uuid != super.uuid) {
1178 if (seen_recs) {
1179 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1180 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1181 << dendl;
1182 } else {
1183 derr << __func__ << " 0x" << std::hex << pos << std::dec
1184 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1185 << ", block dump: \n";
1186 bufferlist t;
1187 t.substr_of(bl, 0, super.block_size);
1188 t.hexdump(*_dout);
1189 *_dout << dendl;
1190 }
1191 break;
1192 }
1193 if (seq != log_seq + 1) {
1194 if (seen_recs) {
1195 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1196 << ": stop: seq " << seq << " != expected " << log_seq + 1
1197 << dendl;;
1198 } else {
1199 derr << __func__ << " 0x" << std::hex << pos << std::dec
1200 << ": stop: seq " << seq << " != expected " << log_seq + 1
1201 << dendl;;
1202 }
1203 break;
1204 }
1205 if (more) {
1206 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
1207 << " more bytes" << dendl;
1208 bufferlist t;
1209 int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL);
1210 if (r < (int)more) {
1211 dout(10) << __func__ << " 0x" << std::hex << pos
1212 << ": stop: len is 0x" << bl.length() + more << std::dec
1213 << ", which is past eof" << dendl;
1214 if (cct->_conf->bluefs_replay_recovery) {
1215 //try to search for more data
1216 r += do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t);
1217 if (r < (int)more) {
1218 //in normal mode we must read r==more, for recovery it is too strict
1219 break;
1220 }
1221 }
1222 }
1223 ceph_assert(r == (int)more);
1224 bl.claim_append(t);
1225 read_pos += r;
1226 }
1227 seen_recs = true;
1228 bluefs_transaction_t t;
1229 try {
1230 auto p = bl.cbegin();
1231 decode(t, p);
1232 }
1233 catch (buffer::error& e) {
1234 derr << __func__ << " 0x" << std::hex << pos << std::dec
1235 << ": stop: failed to decode: " << e.what()
1236 << dendl;
1237 delete log_reader;
1238 return -EIO;
1239 }
1240 ceph_assert(seq == t.seq);
1241 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1242 << ": " << t << dendl;
1243 if (unlikely(to_stdout)) {
1244 std::cout << " 0x" << std::hex << pos << std::dec
1245 << ": " << t << std::endl;
1246 }
1247
1248 auto p = t.op_bl.cbegin();
1249 while (!p.end()) {
1250 __u8 op;
1251 decode(op, p);
1252 switch (op) {
1253
1254 case bluefs_transaction_t::OP_INIT:
1255 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1256 << ": op_init" << dendl;
1257 if (unlikely(to_stdout)) {
1258 std::cout << " 0x" << std::hex << pos << std::dec
1259 << ": op_init" << std::endl;
1260 }
1261
1262 ceph_assert(t.seq == 1);
1263 break;
1264
1265 case bluefs_transaction_t::OP_JUMP:
1266 {
1267 uint64_t next_seq;
1268 uint64_t offset;
1269 decode(next_seq, p);
1270 decode(offset, p);
1271 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1272 << ": op_jump seq " << next_seq
1273 << " offset 0x" << std::hex << offset << std::dec << dendl;
1274 if (unlikely(to_stdout)) {
1275 std::cout << " 0x" << std::hex << pos << std::dec
1276 << ": op_jump seq " << next_seq
1277 << " offset 0x" << std::hex << offset << std::dec
1278 << std::endl;
1279 }
1280
1281 ceph_assert(next_seq >= log_seq);
1282 log_seq = next_seq - 1; // we will increment it below
1283 uint64_t skip = offset - read_pos;
1284 if (skip) {
1285 bufferlist junk;
1286 int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk,
1287 NULL);
1288 if (r != (int)skip) {
1289 dout(10) << __func__ << " 0x" << std::hex << read_pos
1290 << ": stop: failed to skip to " << offset
1291 << std::dec << dendl;
1292 ceph_abort_msg("problem with op_jump");
1293 }
1294 }
1295 }
1296 break;
1297
1298 case bluefs_transaction_t::OP_JUMP_SEQ:
1299 {
1300 uint64_t next_seq;
1301 decode(next_seq, p);
1302 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1303 << ": op_jump_seq " << next_seq << dendl;
1304 if (unlikely(to_stdout)) {
1305 std::cout << " 0x" << std::hex << pos << std::dec
1306 << ": op_jump_seq " << next_seq << std::endl;
1307 }
1308
1309 ceph_assert(next_seq >= log_seq);
1310 log_seq = next_seq - 1; // we will increment it below
1311 }
1312 break;
1313
1314 case bluefs_transaction_t::OP_ALLOC_ADD:
1315 {
1316 __u8 id;
1317 uint64_t offset, length;
1318 decode(id, p);
1319 decode(offset, p);
1320 decode(length, p);
1321 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1322 << ": op_alloc_add " << " " << (int)id
1323 << ":0x" << std::hex << offset << "~" << length << std::dec
1324 << dendl;
1325 if (unlikely(to_stdout)) {
1326 std::cout << " 0x" << std::hex << pos << std::dec
1327 << ": op_alloc_add " << " " << (int)id
1328 << ":0x" << std::hex << offset << "~" << length << std::dec
1329 << std::endl;
1330 }
1331 if (!noop) {
1332 block_all[id].insert(offset, length);
1333 _adjust_granularity(id, &offset, &length, true);
1334 if (length) {
1335 alloc[id]->init_add_free(offset, length);
1336 }
1337
1338 if (cct->_conf->bluefs_log_replay_check_allocations) {
1339 bool fail = false;
1340 apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id],
1341 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1342 if (bs.test(pos)) {
1343 fail = true;
1344 } else {
1345 bs.set(pos);
1346 }
1347 }
1348 );
1349 if (fail) {
1350 derr << __func__ << " invalid extent " << (int)id
1351 << ": 0x" << std::hex << offset << "~" << length
1352 << std::dec << ": already given" << dendl;
1353 return -EFAULT;
1354 }
1355 apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id],
1356 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1357 if (bs.test(pos)) {
1358 fail = true;
1359 }
1360 }
1361 );
1362 if (fail) {
1363 derr << __func__ << " invalid extent " << int(id)
1364 << ": 0x" << std::hex << offset << "~" << length
1365 << std::dec << ": already in use" << dendl;
1366 return -EFAULT;
1367 }
1368 }
1369 }
1370 }
1371 break;
1372
1373 case bluefs_transaction_t::OP_ALLOC_RM:
1374 {
1375 __u8 id;
1376 uint64_t offset, length;
1377 decode(id, p);
1378 decode(offset, p);
1379 decode(length, p);
1380 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1381 << ": op_alloc_rm " << " " << (int)id
1382 << ":0x" << std::hex << offset << "~" << length << std::dec
1383 << dendl;
1384 if (unlikely(to_stdout)) {
1385 std::cout << " 0x" << std::hex << pos << std::dec
1386 << ": op_alloc_rm " << " " << (int)id
1387 << ":0x" << std::hex << offset << "~" << length << std::dec
1388 << std::endl;
1389 }
1390 if (!noop) {
1391 block_all[id].erase(offset, length);
1392 _adjust_granularity(id, &offset, &length, false);
1393 if (length) {
1394 alloc[id]->init_rm_free(offset, length);
1395 }
1396 if (cct->_conf->bluefs_log_replay_check_allocations) {
1397 bool fail = false;
1398 apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id],
1399 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1400 if (!bs.test(pos)) {
1401 fail = true;
1402 } else {
1403 bs.reset(pos);
1404 }
1405 }
1406 );
1407 if (fail) {
1408 derr << __func__ << " invalid extent " << int(id)
1409 << ": 0x" << std::hex << offset << "~" << length
1410 << std::dec << ": wasn't given" << dendl;
1411 return -EFAULT;
1412 }
1413
1414 apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id],
1415 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1416 if (bs.test(pos)) {
1417 fail = true;
1418 }
1419 }
1420 );
1421 if (fail) {
1422 derr << __func__ << " invalid extent " << (int)id
1423 << ": 0x" << std::hex << offset << "~" << length
1424 << std::dec << ": still in use" << dendl;
1425 return -EFAULT;
1426 }
1427 }
1428 }
1429 }
1430 break;
1431
1432 case bluefs_transaction_t::OP_DIR_LINK:
1433 {
1434 string dirname, filename;
1435 uint64_t ino;
1436 decode(dirname, p);
1437 decode(filename, p);
1438 decode(ino, p);
1439 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1440 << ": op_dir_link " << " " << dirname << "/" << filename
1441 << " to " << ino
1442 << dendl;
1443 if (unlikely(to_stdout)) {
1444 std::cout << " 0x" << std::hex << pos << std::dec
1445 << ": op_dir_link " << " " << dirname << "/" << filename
1446 << " to " << ino
1447 << std::endl;
1448 }
1449
1450 if (!noop) {
1451 FileRef file = _get_file(ino);
1452 ceph_assert(file->fnode.ino);
1453 map<string,DirRef>::iterator q = dir_map.find(dirname);
1454 ceph_assert(q != dir_map.end());
1455 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
1456 ceph_assert(r == q->second->file_map.end());
1457
1458 vselector->sub_usage(file->vselector_hint, file->fnode);
1459 file->vselector_hint =
1460 vselector->get_hint_by_dir(dirname);
1461 vselector->add_usage(file->vselector_hint, file->fnode);
1462
1463 q->second->file_map[filename] = file;
1464 ++file->refs;
1465 }
1466 }
1467 break;
1468
1469 case bluefs_transaction_t::OP_DIR_UNLINK:
1470 {
1471 string dirname, filename;
1472 decode(dirname, p);
1473 decode(filename, p);
1474 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1475 << ": op_dir_unlink " << " " << dirname << "/" << filename
1476 << dendl;
1477 if (unlikely(to_stdout)) {
1478 std::cout << " 0x" << std::hex << pos << std::dec
1479 << ": op_dir_unlink " << " " << dirname << "/" << filename
1480 << std::endl;
1481 }
1482
1483 if (!noop) {
1484 map<string,DirRef>::iterator q = dir_map.find(dirname);
1485 ceph_assert(q != dir_map.end());
1486 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
1487 ceph_assert(r != q->second->file_map.end());
1488 ceph_assert(r->second->refs > 0);
1489 --r->second->refs;
1490 q->second->file_map.erase(r);
1491 }
1492 }
1493 break;
1494
1495 case bluefs_transaction_t::OP_DIR_CREATE:
1496 {
1497 string dirname;
1498 decode(dirname, p);
1499 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1500 << ": op_dir_create " << dirname << dendl;
1501 if (unlikely(to_stdout)) {
1502 std::cout << " 0x" << std::hex << pos << std::dec
1503 << ": op_dir_create " << dirname << std::endl;
1504 }
1505
1506 if (!noop) {
1507 map<string,DirRef>::iterator q = dir_map.find(dirname);
1508 ceph_assert(q == dir_map.end());
1509 dir_map[dirname] = ceph::make_ref<Dir>();
1510 }
1511 }
1512 break;
1513
1514 case bluefs_transaction_t::OP_DIR_REMOVE:
1515 {
1516 string dirname;
1517 decode(dirname, p);
1518 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1519 << ": op_dir_remove " << dirname << dendl;
1520 if (unlikely(to_stdout)) {
1521 std::cout << " 0x" << std::hex << pos << std::dec
1522 << ": op_dir_remove " << dirname << std::endl;
1523 }
1524
1525 if (!noop) {
1526 map<string,DirRef>::iterator q = dir_map.find(dirname);
1527 ceph_assert(q != dir_map.end());
1528 ceph_assert(q->second->file_map.empty());
1529 dir_map.erase(q);
1530 }
1531 }
1532 break;
1533
1534 case bluefs_transaction_t::OP_FILE_UPDATE:
1535 {
1536 bluefs_fnode_t fnode;
1537 decode(fnode, p);
1538 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1539 << ": op_file_update " << " " << fnode << " " << dendl;
1540 if (unlikely(to_stdout)) {
1541 std::cout << " 0x" << std::hex << pos << std::dec
1542 << ": op_file_update " << " " << fnode << std::endl;
1543 }
1544 if (!noop) {
1545 FileRef f = _get_file(fnode.ino);
1546 if (cct->_conf->bluefs_log_replay_check_allocations) {
1547 // check initial log layout
1548 if (first_log_check) {
1549 first_log_check = false;
1550 int r = _check_new_allocations(log_file->fnode,
1551 MAX_BDEV, owned_blocks, used_blocks);
1552 if (r < 0) {
1553 return r;
1554 }
1555 }
1556
1557 auto& fnode_extents = f->fnode.extents;
1558 for (auto e : fnode_extents) {
1559 auto id = e.bdev;
1560 if (int r = _verify_alloc_granularity(id, e.offset, e.length,
1561 "OP_FILE_UPDATE"); r < 0) {
1562 return r;
1563 }
1564 apply_for_bitset_range(e.offset, e.length, alloc_size[id],
1565 used_blocks[id],
1566 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1567 ceph_assert(bs.test(pos));
1568 bs.reset(pos);
1569 }
1570 );
1571 }
1572 }
1573
1574 if (fnode.ino != 1) {
1575 vselector->sub_usage(f->vselector_hint, f->fnode);
1576 }
1577 f->fnode = fnode;
1578 if (fnode.ino != 1) {
1579 vselector->add_usage(f->vselector_hint, f->fnode);
1580 }
1581
1582 if (fnode.ino > ino_last) {
1583 ino_last = fnode.ino;
1584 }
1585 if (cct->_conf->bluefs_log_replay_check_allocations) {
1586 int r = _check_new_allocations(f->fnode,
1587 MAX_BDEV, owned_blocks, used_blocks);
1588 if (r < 0) {
1589 return r;
1590 }
1591 }
1592 }
1593 }
1594 break;
1595
1596 case bluefs_transaction_t::OP_FILE_REMOVE:
1597 {
1598 uint64_t ino;
1599 decode(ino, p);
1600 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1601 << ": op_file_remove " << ino << dendl;
1602 if (unlikely(to_stdout)) {
1603 std::cout << " 0x" << std::hex << pos << std::dec
1604 << ": op_file_remove " << ino << std::endl;
1605 }
1606
1607 if (!noop) {
1608 auto p = file_map.find(ino);
1609 ceph_assert(p != file_map.end());
1610 vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
1611 if (cct->_conf->bluefs_log_replay_check_allocations) {
1612 auto& fnode_extents = p->second->fnode.extents;
1613 for (auto e : fnode_extents) {
1614 auto id = e.bdev;
1615 bool fail = false;
1616 apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id],
1617 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1618 if (!bs.test(pos)) {
1619 fail = true;
1620 }
1621 }
1622 );
1623 if (fail) {
1624 derr << __func__ << " invalid extent " << int(id)
1625 << ": 0x" << std::hex << e.offset << "~" << e.length
1626 << std::dec
1627 << ": wasn't given but is allocated for removed ino " << ino
1628 << dendl;
1629 return -EFAULT;
1630 }
1631
1632 apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
1633 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1634 if (!bs.test(pos)) {
1635 fail = true;
1636 }
1637 bs.reset(pos);
1638 }
1639 );
1640 if (fail) {
1641 derr << __func__ << " invalid extent " << int(id)
1642 << ": 0x" << std::hex << e.offset << "~" << e.length
1643 << std::dec
1644 << ": not in use but is allocated for removed ino " << ino
1645 << dendl;
1646 return -EFAULT;
1647 }
1648 }
1649 }
1650 file_map.erase(p);
1651 }
1652 }
1653 break;
1654
1655 default:
1656 derr << __func__ << " 0x" << std::hex << pos << std::dec
1657 << ": stop: unrecognized op " << (int)op << dendl;
1658 delete log_reader;
1659 return -EIO;
1660 }
1661 }
1662 ceph_assert(p.end());
1663
1664 // we successfully replayed the transaction; bump the seq and log size
1665 ++log_seq;
1666 log_file->fnode.size = log_reader->buf.pos;
1667 }
1668 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
1669
1670 if (!noop && first_log_check &&
1671 cct->_conf->bluefs_log_replay_check_allocations) {
1672 int r = _check_new_allocations(log_file->fnode,
1673 MAX_BDEV, owned_blocks, used_blocks);
1674 if (r < 0) {
1675 return r;
1676 }
1677 }
1678
1679 dout(10) << __func__ << " log file size was 0x"
1680 << std::hex << log_file->fnode.size << std::dec << dendl;
1681 if (unlikely(to_stdout)) {
1682 std::cout << " log file size was 0x"
1683 << std::hex << log_file->fnode.size << std::dec << std::endl;
1684 }
1685
1686 delete log_reader;
1687
1688 if (!noop) {
1689 // verify file link counts are all >0
1690 for (auto& p : file_map) {
1691 if (p.second->refs == 0 &&
1692 p.second->fnode.ino > 1) {
1693 derr << __func__ << " file with link count 0: " << p.second->fnode
1694 << dendl;
1695 return -EIO;
1696 }
1697 }
1698 }
1699
1700 for (unsigned id = 0; id < MAX_BDEV; ++id) {
1701 dout(10) << __func__ << " block_unused_too_granular " << id << ": "
1702 << block_unused_too_granular[id] << dendl;
1703 }
1704 dout(10) << __func__ << " done" << dendl;
1705 return 0;
1706 }
1707
1708 int BlueFS::log_dump()
1709 {
1710 // only dump log file's content
1711 int r = _replay(true, true);
1712 if (r < 0) {
1713 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
1714 return r;
1715 }
1716
1717 return 0;
1718 }
1719
1720 int BlueFS::device_migrate_to_existing(
1721 CephContext *cct,
1722 const set<int>& devs_source,
1723 int dev_target,
1724 const bluefs_layout_t& layout)
1725 {
1726 vector<byte> buf;
1727 bool buffered = cct->_conf->bluefs_buffered_io;
1728
1729 dout(10) << __func__ << " devs_source " << devs_source
1730 << " dev_target " << dev_target << dendl;
1731 assert(dev_target < (int)MAX_BDEV);
1732
1733 int flags = 0;
1734 flags |= devs_source.count(BDEV_DB) ?
1735 (REMOVE_DB | RENAME_SLOW2DB) : 0;
1736 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1737 int dev_target_new = dev_target;
1738
1739 // Slow device without separate DB one is addressed via BDEV_DB
1740 // Hence need renaming.
1741 if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
1742 dev_target_new = BDEV_DB;
1743 dout(0) << __func__ << " super to be written to " << dev_target << dendl;
1744 }
1745
1746 for (auto& [ino, file_ref] : file_map) {
1747 //do not copy log
1748 if (file_ref->fnode.ino == 1) {
1749 continue;
1750 }
1751 dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
1752
1753 auto& fnode_extents = file_ref->fnode.extents;
1754
1755 bool rewrite = std::any_of(
1756 fnode_extents.begin(),
1757 fnode_extents.end(),
1758 [=](auto& ext) {
1759 return ext.bdev != dev_target && devs_source.count(ext.bdev);
1760 });
1761 if (rewrite) {
1762 dout(10) << __func__ << " migrating" << dendl;
1763
1764 // read entire file
1765 bufferlist bl;
1766 for (auto old_ext : fnode_extents) {
1767 buf.resize(old_ext.length);
1768 int r = bdev[old_ext.bdev]->read_random(
1769 old_ext.offset,
1770 old_ext.length,
1771 (char*)&buf.at(0),
1772 buffered);
1773 if (r != 0) {
1774 derr << __func__ << " failed to read 0x" << std::hex
1775 << old_ext.offset << "~" << old_ext.length << std::dec
1776 << " from " << (int)dev_target << dendl;
1777 return -EIO;
1778 }
1779 bl.append((char*)&buf[0], old_ext.length);
1780 }
1781
1782 // write entire file
1783 PExtentVector extents;
1784 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1785 if (l < 0) {
1786 derr << __func__ << " unable to allocate len 0x" << std::hex
1787 << bl.length() << std::dec << " from " << (int)dev_target
1788 << ": " << cpp_strerror(l) << dendl;
1789 return -ENOSPC;
1790 }
1791
1792 uint64_t off = 0;
1793 for (auto& i : extents) {
1794 bufferlist cur;
1795 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1796 ceph_assert(cur_len > 0);
1797 cur.substr_of(bl, off, cur_len);
1798 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1799 ceph_assert(r == 0);
1800 off += cur_len;
1801 }
1802
1803 // release old extents
1804 for (auto old_ext : fnode_extents) {
1805 PExtentVector to_release;
1806 to_release.emplace_back(old_ext.offset, old_ext.length);
1807 alloc[old_ext.bdev]->release(to_release);
1808 }
1809
1810 // update fnode
1811 fnode_extents.clear();
1812 for (auto& i : extents) {
1813 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
1814 }
1815 } else {
1816 for (auto& ext : fnode_extents) {
1817 if (dev_target != dev_target_new && ext.bdev == dev_target) {
1818 dout(20) << __func__ << " " << " ... adjusting extent 0x"
1819 << std::hex << ext.offset << std::dec
1820 << " bdev " << dev_target << " -> " << dev_target_new
1821 << dendl;
1822 ext.bdev = dev_target_new;
1823 }
1824 }
1825 }
1826 }
1827 // new logging device in the current naming scheme
1828 int new_log_dev_cur = bdev[BDEV_WAL] ?
1829 BDEV_WAL :
1830 bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
1831
1832 // new logging device in new naming scheme
1833 int new_log_dev_next = new_log_dev_cur;
1834
1835 if (devs_source.count(new_log_dev_cur)) {
1836 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1837 new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
1838 BDEV_DB :
1839 BDEV_WAL;
1840
1841 dout(0) << __func__ << " log moved from " << new_log_dev_cur
1842 << " to " << new_log_dev_next << dendl;
1843
1844 new_log_dev_cur =
1845 (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
1846 BDEV_SLOW :
1847 new_log_dev_next;
1848 }
1849
1850 _rewrite_log_and_layout_sync(
1851 false,
1852 (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
1853 new_log_dev_cur,
1854 new_log_dev_next,
1855 flags,
1856 layout);
1857 return 0;
1858 }
1859
1860 int BlueFS::device_migrate_to_new(
1861 CephContext *cct,
1862 const set<int>& devs_source,
1863 int dev_target,
1864 const bluefs_layout_t& layout)
1865 {
1866 vector<byte> buf;
1867 bool buffered = cct->_conf->bluefs_buffered_io;
1868
1869 dout(10) << __func__ << " devs_source " << devs_source
1870 << " dev_target " << dev_target << dendl;
1871 assert(dev_target == (int)BDEV_NEWDB || (int)BDEV_NEWWAL);
1872
1873 int flags = 0;
1874
1875 flags |= devs_source.count(BDEV_DB) ?
1876 (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
1877 0;
1878 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1879 int dev_target_new = dev_target; //FIXME: remove, makes no sense
1880
1881 for (auto& p : file_map) {
1882 //do not copy log
1883 if (p.second->fnode.ino == 1) {
1884 continue;
1885 }
1886 dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
1887
1888 auto& fnode_extents = p.second->fnode.extents;
1889
1890 bool rewrite = false;
1891 for (auto ext_it = fnode_extents.begin();
1892 ext_it != p.second->fnode.extents.end();
1893 ++ext_it) {
1894 if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
1895 rewrite = true;
1896 break;
1897 }
1898 }
1899 if (rewrite) {
1900 dout(10) << __func__ << " migrating" << dendl;
1901
1902 // read entire file
1903 bufferlist bl;
1904 for (auto old_ext : fnode_extents) {
1905 buf.resize(old_ext.length);
1906 int r = bdev[old_ext.bdev]->read_random(
1907 old_ext.offset,
1908 old_ext.length,
1909 (char*)&buf.at(0),
1910 buffered);
1911 if (r != 0) {
1912 derr << __func__ << " failed to read 0x" << std::hex
1913 << old_ext.offset << "~" << old_ext.length << std::dec
1914 << " from " << (int)dev_target << dendl;
1915 return -EIO;
1916 }
1917 bl.append((char*)&buf[0], old_ext.length);
1918 }
1919
1920 // write entire file
1921 PExtentVector extents;
1922 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1923 if (l < 0) {
1924 derr << __func__ << " unable to allocate len 0x" << std::hex
1925 << bl.length() << std::dec << " from " << (int)dev_target
1926 << ": " << cpp_strerror(l) << dendl;
1927 return -ENOSPC;
1928 }
1929
1930 uint64_t off = 0;
1931 for (auto& i : extents) {
1932 bufferlist cur;
1933 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1934 ceph_assert(cur_len > 0);
1935 cur.substr_of(bl, off, cur_len);
1936 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1937 ceph_assert(r == 0);
1938 off += cur_len;
1939 }
1940
1941 // release old extents
1942 for (auto old_ext : fnode_extents) {
1943 PExtentVector to_release;
1944 to_release.emplace_back(old_ext.offset, old_ext.length);
1945 alloc[old_ext.bdev]->release(to_release);
1946 }
1947
1948 // update fnode
1949 fnode_extents.clear();
1950 for (auto& i : extents) {
1951 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
1952 }
1953 }
1954 }
1955 // new logging device in the current naming scheme
1956 int new_log_dev_cur =
1957 bdev[BDEV_NEWWAL] ?
1958 BDEV_NEWWAL :
1959 bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
1960 BDEV_WAL :
1961 bdev[BDEV_NEWDB] ?
1962 BDEV_NEWDB :
1963 bdev[BDEV_DB] && !(flags & REMOVE_DB)?
1964 BDEV_DB :
1965 BDEV_SLOW;
1966
1967 // new logging device in new naming scheme
1968 int new_log_dev_next =
1969 new_log_dev_cur == BDEV_NEWWAL ?
1970 BDEV_WAL :
1971 new_log_dev_cur == BDEV_NEWDB ?
1972 BDEV_DB :
1973 new_log_dev_cur;
1974
1975 int super_dev =
1976 dev_target == BDEV_NEWDB ?
1977 BDEV_NEWDB :
1978 bdev[BDEV_DB] ?
1979 BDEV_DB :
1980 BDEV_SLOW;
1981
1982 _rewrite_log_and_layout_sync(
1983 false,
1984 super_dev,
1985 new_log_dev_cur,
1986 new_log_dev_next,
1987 flags,
1988 layout);
1989 return 0;
1990 }
1991
1992 BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
1993 {
1994 auto p = file_map.find(ino);
1995 if (p == file_map.end()) {
1996 FileRef f = ceph::make_ref<File>();
1997 file_map[ino] = f;
1998 dout(30) << __func__ << " ino " << ino << " = " << f
1999 << " (new)" << dendl;
2000 return f;
2001 } else {
2002 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
2003 return p->second;
2004 }
2005 }
2006
2007 void BlueFS::_drop_link(FileRef file)
2008 {
2009 dout(20) << __func__ << " had refs " << file->refs
2010 << " on " << file->fnode << dendl;
2011 ceph_assert(file->refs > 0);
2012 --file->refs;
2013 if (file->refs == 0) {
2014 dout(20) << __func__ << " destroying " << file->fnode << dendl;
2015 ceph_assert(file->num_reading.load() == 0);
2016 vselector->sub_usage(file->vselector_hint, file->fnode);
2017 log_t.op_file_remove(file->fnode.ino);
2018 for (auto& r : file->fnode.extents) {
2019 pending_release[r.bdev].insert(r.offset, r.length);
2020 }
2021 file_map.erase(file->fnode.ino);
2022 file->deleted = true;
2023
2024 if (file->dirty_seq) {
2025 ceph_assert(file->dirty_seq > log_seq_stable);
2026 ceph_assert(dirty_files.count(file->dirty_seq));
2027 auto it = dirty_files[file->dirty_seq].iterator_to(*file);
2028 dirty_files[file->dirty_seq].erase(it);
2029 file->dirty_seq = 0;
2030 }
2031 }
2032 }
2033
2034 int64_t BlueFS::_read_random(
2035 FileReader *h, ///< [in] read from here
2036 uint64_t off, ///< [in] offset
2037 uint64_t len, ///< [in] this many bytes
2038 char *out) ///< [out] optional: or copy it here
2039 {
2040 auto* buf = &h->buf;
2041
2042 int64_t ret = 0;
2043 dout(10) << __func__ << " h " << h
2044 << " 0x" << std::hex << off << "~" << len << std::dec
2045 << " from " << h->file->fnode << dendl;
2046
2047 ++h->file->num_reading;
2048
2049 if (!h->ignore_eof &&
2050 off + len > h->file->fnode.size) {
2051 if (off > h->file->fnode.size)
2052 len = 0;
2053 else
2054 len = h->file->fnode.size - off;
2055 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
2056 << std::hex << len << std::dec << dendl;
2057 }
2058 logger->inc(l_bluefs_read_random_count, 1);
2059 logger->inc(l_bluefs_read_random_bytes, len);
2060
2061 std::shared_lock s_lock(h->lock);
2062 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
2063 while (len > 0) {
2064 if (off < buf->bl_off || off >= buf->get_buf_end()) {
2065 s_lock.unlock();
2066 uint64_t x_off = 0;
2067 auto p = h->file->fnode.seek(off, &x_off);
2068 ceph_assert(p != h->file->fnode.extents.end());
2069 uint64_t l = std::min(p->length - x_off, len);
2070 //hard cap to 1GB
2071 l = std::min(l, uint64_t(1) << 30);
2072 dout(20) << __func__ << " read random 0x"
2073 << std::hex << x_off << "~" << l << std::dec
2074 << " of " << *p << dendl;
2075 int r;
2076 if (!cct->_conf->bluefs_check_for_zeros) {
2077 r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
2078 cct->_conf->bluefs_buffered_io);
2079 } else {
2080 r = read_random(p->bdev, p->offset + x_off, l, out,
2081 cct->_conf->bluefs_buffered_io);
2082 }
2083 ceph_assert(r == 0);
2084 off += l;
2085 len -= l;
2086 ret += l;
2087 out += l;
2088
2089 logger->inc(l_bluefs_read_random_disk_count, 1);
2090 logger->inc(l_bluefs_read_random_disk_bytes, l);
2091 if (len > 0) {
2092 s_lock.lock();
2093 }
2094 } else {
2095 auto left = buf->get_buf_remaining(off);
2096 int64_t r = std::min(len, left);
2097 logger->inc(l_bluefs_read_random_buffer_count, 1);
2098 logger->inc(l_bluefs_read_random_buffer_bytes, r);
2099 dout(20) << __func__ << " left 0x" << std::hex << left
2100 << " 0x" << off << "~" << len << std::dec
2101 << dendl;
2102
2103 if (out) {
2104 auto p = buf->bl.begin();
2105 p.seek(off - buf->bl_off);
2106 p.copy(r, out);
2107 out += r;
2108 }
2109
2110 dout(30) << __func__ << " result chunk (0x"
2111 << std::hex << r << std::dec << " bytes):\n";
2112 bufferlist t;
2113 t.substr_of(buf->bl, off - buf->bl_off, r);
2114 t.hexdump(*_dout);
2115 *_dout << dendl;
2116
2117 off += r;
2118 len -= r;
2119 ret += r;
2120 buf->pos += r;
2121 }
2122 }
2123 dout(20) << __func__ << " got " << ret << dendl;
2124 --h->file->num_reading;
2125 return ret;
2126 }
2127
2128 int64_t BlueFS::_read(
2129 FileReader *h, ///< [in] read from here
2130 FileReaderBuffer *buf, ///< [in] reader state
2131 uint64_t off, ///< [in] offset
2132 size_t len, ///< [in] this many bytes
2133 bufferlist *outbl, ///< [out] optional: reference the result here
2134 char *out) ///< [out] optional: or copy it here
2135 {
2136 bool prefetch = !outbl && !out;
2137 dout(10) << __func__ << " h " << h
2138 << " 0x" << std::hex << off << "~" << len << std::dec
2139 << " from " << h->file->fnode
2140 << (prefetch ? " prefetch" : "")
2141 << dendl;
2142
2143 ++h->file->num_reading;
2144
2145 if (!h->ignore_eof &&
2146 off + len > h->file->fnode.size) {
2147 if (off > h->file->fnode.size)
2148 len = 0;
2149 else
2150 len = h->file->fnode.size - off;
2151 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
2152 << std::hex << len << std::dec << dendl;
2153 }
2154 logger->inc(l_bluefs_read_count, 1);
2155 logger->inc(l_bluefs_read_bytes, len);
2156 if (prefetch) {
2157 logger->inc(l_bluefs_read_prefetch_count, 1);
2158 logger->inc(l_bluefs_read_prefetch_bytes, len);
2159 }
2160
2161 if (outbl)
2162 outbl->clear();
2163
2164 int64_t ret = 0;
2165 std::shared_lock s_lock(h->lock);
2166 while (len > 0) {
2167 size_t left;
2168 if (off < buf->bl_off || off >= buf->get_buf_end()) {
2169 s_lock.unlock();
2170 std::unique_lock u_lock(h->lock);
2171 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
2172 if (off < buf->bl_off || off >= buf->get_buf_end()) {
2173 // if precondition hasn't changed during locking upgrade.
2174 buf->bl.clear();
2175 buf->bl_off = off & super.block_mask();
2176 uint64_t x_off = 0;
2177 auto p = h->file->fnode.seek(buf->bl_off, &x_off);
2178 if (p == h->file->fnode.extents.end()) {
2179 dout(5) << __func__ << " reading less then required "
2180 << ret << "<" << ret + len << dendl;
2181 break;
2182 }
2183
2184 uint64_t want = round_up_to(len + (off & ~super.block_mask()),
2185 super.block_size);
2186 want = std::max(want, buf->max_prefetch);
2187 uint64_t l = std::min(p->length - x_off, want);
2188 //hard cap to 1GB
2189 l = std::min(l, uint64_t(1) << 30);
2190 uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
2191 if (!h->ignore_eof &&
2192 buf->bl_off + l > eof_offset) {
2193 l = eof_offset - buf->bl_off;
2194 }
2195 dout(20) << __func__ << " fetching 0x"
2196 << std::hex << x_off << "~" << l << std::dec
2197 << " of " << *p << dendl;
2198 int r;
2199 if (!cct->_conf->bluefs_check_for_zeros) {
2200 r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
2201 cct->_conf->bluefs_buffered_io);
2202 } else {
2203 r = read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
2204 cct->_conf->bluefs_buffered_io);
2205 }
2206 ceph_assert(r == 0);
2207 }
2208 u_lock.unlock();
2209 s_lock.lock();
2210 // we should recheck if buffer is valid after lock downgrade
2211 continue;
2212 }
2213 left = buf->get_buf_remaining(off);
2214 dout(20) << __func__ << " left 0x" << std::hex << left
2215 << " len 0x" << len << std::dec << dendl;
2216
2217 int64_t r = std::min(len, left);
2218 if (outbl) {
2219 bufferlist t;
2220 t.substr_of(buf->bl, off - buf->bl_off, r);
2221 outbl->claim_append(t);
2222 }
2223 if (out) {
2224 auto p = buf->bl.begin();
2225 p.seek(off - buf->bl_off);
2226 p.copy(r, out);
2227 out += r;
2228 }
2229
2230 dout(30) << __func__ << " result chunk (0x"
2231 << std::hex << r << std::dec << " bytes):\n";
2232 bufferlist t;
2233 t.substr_of(buf->bl, off - buf->bl_off, r);
2234 t.hexdump(*_dout);
2235 *_dout << dendl;
2236
2237 off += r;
2238 len -= r;
2239 ret += r;
2240 buf->pos += r;
2241 }
2242 dout(20) << __func__ << " got " << ret << dendl;
2243 ceph_assert(!outbl || (int)outbl->length() == ret);
2244 --h->file->num_reading;
2245 return ret;
2246 }
2247
2248 void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
2249 {
2250 dout(10) << __func__ << " file " << f->fnode
2251 << " 0x" << std::hex << offset << "~" << length << std::dec
2252 << dendl;
2253 if (offset & ~super.block_mask()) {
2254 offset &= super.block_mask();
2255 length = round_up_to(length, super.block_size);
2256 }
2257 uint64_t x_off = 0;
2258 auto p = f->fnode.seek(offset, &x_off);
2259 while (length > 0 && p != f->fnode.extents.end()) {
2260 uint64_t x_len = std::min(p->length - x_off, length);
2261 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
2262 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
2263 << std:: dec << " of " << *p << dendl;
2264 offset += x_len;
2265 length -= x_len;
2266 }
2267 }
2268
2269 uint64_t BlueFS::_estimate_log_size()
2270 {
2271 int avg_dir_size = 40; // fixme
2272 int avg_file_size = 12;
2273 uint64_t size = 4096 * 2;
2274 size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
2275 for (auto& p : block_all)
2276 size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
2277 size += dir_map.size() + (1 + avg_dir_size);
2278 size += file_map.size() * (1 + avg_dir_size + avg_file_size);
2279 return round_up_to(size, super.block_size);
2280 }
2281
2282 void BlueFS::compact_log()
2283 {
2284 std::unique_lock<ceph::mutex> l(lock);
2285 if (!cct->_conf->bluefs_replay_recovery_disable_compact) {
2286 if (cct->_conf->bluefs_compact_log_sync) {
2287 _compact_log_sync();
2288 } else {
2289 _compact_log_async(l);
2290 }
2291 }
2292 }
2293
2294 bool BlueFS::_should_compact_log()
2295 {
2296 uint64_t current = log_writer->file->fnode.size;
2297 uint64_t expected = _estimate_log_size();
2298 float ratio = (float)current / (float)expected;
2299 dout(10) << __func__ << " current 0x" << std::hex << current
2300 << " expected " << expected << std::dec
2301 << " ratio " << ratio
2302 << (new_log ? " (async compaction in progress)" : "")
2303 << dendl;
2304 if (new_log ||
2305 current < cct->_conf->bluefs_log_compact_min_size ||
2306 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
2307 return false;
2308 }
2309 return true;
2310 }
2311
2312 void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t,
2313 int flags)
2314 {
2315 t->seq = 1;
2316 t->uuid = super.uuid;
2317 dout(20) << __func__ << " op_init" << dendl;
2318
2319 t->op_init();
2320 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
2321 interval_set<uint64_t>& p = block_all[bdev];
2322 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
2323 auto bdev_new = bdev;
2324 if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) {
2325 continue;
2326 }
2327 if ((flags & REMOVE_DB) && bdev == BDEV_DB) {
2328 continue;
2329 }
2330 if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
2331 bdev_new = BDEV_DB;
2332 }
2333 if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
2334 bdev_new = BDEV_SLOW;
2335 }
2336 if (bdev == BDEV_NEWDB) {
2337 // REMOVE_DB xor RENAME_DB
2338 ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
2339 ceph_assert(!(flags & RENAME_SLOW2DB));
2340 bdev_new = BDEV_DB;
2341 }
2342 if (bdev == BDEV_NEWWAL) {
2343 ceph_assert(flags & REMOVE_WAL);
2344 bdev_new = BDEV_WAL;
2345 }
2346 dout(20) << __func__ << " op_alloc_add " << bdev_new << " 0x"
2347 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
2348 << dendl;
2349 t->op_alloc_add(bdev_new, q.get_start(), q.get_len());
2350 }
2351 }
2352 for (auto& [ino, file_ref] : file_map) {
2353 if (ino == 1)
2354 continue;
2355 ceph_assert(ino > 1);
2356
2357 for(auto& e : file_ref->fnode.extents) {
2358 auto bdev = e.bdev;
2359 auto bdev_new = bdev;
2360 ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
2361 if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
2362 bdev_new = BDEV_DB;
2363 }
2364 if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
2365 bdev_new = BDEV_SLOW;
2366 }
2367 if (bdev == BDEV_NEWDB) {
2368 // REMOVE_DB xor RENAME_DB
2369 ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
2370 ceph_assert(!(flags & RENAME_SLOW2DB));
2371 bdev_new = BDEV_DB;
2372 }
2373 if (bdev == BDEV_NEWWAL) {
2374 ceph_assert(flags & REMOVE_WAL);
2375 bdev_new = BDEV_WAL;
2376 }
2377 e.bdev = bdev_new;
2378 }
2379 dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
2380 t->op_file_update(file_ref->fnode);
2381 }
2382 for (auto& [path, dir_ref] : dir_map) {
2383 dout(20) << __func__ << " op_dir_create " << path << dendl;
2384 t->op_dir_create(path);
2385 for (auto& [fname, file_ref] : dir_ref->file_map) {
2386 dout(20) << __func__ << " op_dir_link " << path << "/" << fname
2387 << " to " << file_ref->fnode.ino << dendl;
2388 t->op_dir_link(path, fname, file_ref->fnode.ino);
2389 }
2390 }
2391 }
2392
2393 void BlueFS::_compact_log_sync()
2394 {
2395 dout(10) << __func__ << dendl;
2396 auto prefer_bdev =
2397 vselector->select_prefer_bdev(log_writer->file->vselector_hint);
2398 _rewrite_log_and_layout_sync(true,
2399 BDEV_DB,
2400 prefer_bdev,
2401 prefer_bdev,
2402 0,
2403 super.memorized_layout);
2404 logger->inc(l_bluefs_log_compactions);
2405 }
2406
2407 void BlueFS::_rewrite_log_and_layout_sync(bool allocate_with_fallback,
2408 int super_dev,
2409 int log_dev,
2410 int log_dev_new,
2411 int flags,
2412 std::optional<bluefs_layout_t> layout)
2413 {
2414 File *log_file = log_writer->file.get();
2415
2416 // clear out log (be careful who calls us!!!)
2417 log_t.clear();
2418
2419 dout(20) << __func__ << " super_dev:" << super_dev
2420 << " log_dev:" << log_dev
2421 << " log_dev_new:" << log_dev_new
2422 << " flags:" << flags
2423 << dendl;
2424 bluefs_transaction_t t;
2425 _compact_log_dump_metadata(&t, flags);
2426
2427 dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
2428 t.op_jump_seq(log_seq);
2429
2430 bufferlist bl;
2431 encode(t, bl);
2432 _pad_bl(bl);
2433
2434 uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
2435 dout(20) << __func__ << " need " << need << dendl;
2436
2437 bluefs_fnode_t old_fnode;
2438 int r;
2439 log_file->fnode.swap_extents(old_fnode);
2440 if (allocate_with_fallback) {
2441 r = _allocate(log_dev, need, &log_file->fnode);
2442 ceph_assert(r == 0);
2443 } else {
2444 PExtentVector extents;
2445 r = _allocate_without_fallback(log_dev,
2446 need,
2447 &extents);
2448 ceph_assert(r == 0);
2449 for (auto& p : extents) {
2450 log_file->fnode.append_extent(
2451 bluefs_extent_t(log_dev, p.offset, p.length));
2452 }
2453 }
2454
2455 _close_writer(log_writer);
2456
2457 log_file->fnode.size = bl.length();
2458 vselector->sub_usage(log_file->vselector_hint, old_fnode);
2459 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2460
2461 log_writer = _create_writer(log_file);
2462 log_writer->append(bl);
2463 r = _flush(log_writer, true);
2464 ceph_assert(r == 0);
2465 #ifdef HAVE_LIBAIO
2466 if (!cct->_conf->bluefs_sync_write) {
2467 list<aio_t> completed_ios;
2468 _claim_completed_aios(log_writer, &completed_ios);
2469 wait_for_aio(log_writer);
2470 completed_ios.clear();
2471 }
2472 #endif
2473 flush_bdev();
2474
2475 super.memorized_layout = layout;
2476 super.log_fnode = log_file->fnode;
2477 // rename device if needed
2478 if (log_dev != log_dev_new) {
2479 dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
2480 for (auto& p : super.log_fnode.extents) {
2481 p.bdev = log_dev_new;
2482 }
2483 }
2484 dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;
2485
2486 ++super.version;
2487 _write_super(super_dev);
2488 flush_bdev();
2489
2490 dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
2491 for (auto& r : old_fnode.extents) {
2492 pending_release[r.bdev].insert(r.offset, r.length);
2493 }
2494 }
2495
2496 /*
2497 * 1. Allocate a new extent to continue the log, and then log an event
2498 * that jumps the log write position to the new extent. At this point, the
2499 * old extent(s) won't be written to, and reflect everything to compact.
2500 * New events will be written to the new region that we'll keep.
2501 *
2502 * 2. While still holding the lock, encode a bufferlist that dumps all of the
2503 * in-memory fnodes and names. This will become the new beginning of the
2504 * log. The last event will jump to the log continuation extent from #1.
2505 *
 * 3. Queue a write to a new extent for the new beginning of the log.
2507 *
2508 * 4. Drop lock and wait
2509 *
2510 * 5. Retake the lock.
2511 *
2512 * 6. Update the log_fnode to splice in the new beginning.
2513 *
2514 * 7. Write the new superblock.
2515 *
2516 * 8. Release the old log space. Clean up.
2517 */
2518 void BlueFS::_compact_log_async(std::unique_lock<ceph::mutex>& l)
2519 {
2520 dout(10) << __func__ << dendl;
2521 File *log_file = log_writer->file.get();
2522 ceph_assert(!new_log);
2523 ceph_assert(!new_log_writer);
2524
2525 // create a new log [writer] so that we know compaction is in progress
2526 // (see _should_compact_log)
2527 new_log = ceph::make_ref<File>();
2528 new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
2529
2530 // 0. wait for any racing flushes to complete. (We do not want to block
2531 // in _flush_sync_log with jump_to set or else a racing thread might flush
2532 // our entries and our jump_to update won't be correct.)
2533 while (log_flushing) {
2534 dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
2535 log_cond.wait(l);
2536 }
2537
2538 vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
2539
2540 // 1. allocate new log space and jump to it.
2541 old_log_jump_to = log_file->fnode.get_allocated();
2542 dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
2543 << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
2544 int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
2545 cct->_conf->bluefs_max_log_runway,
2546 &log_file->fnode);
2547 ceph_assert(r == 0);
2548 //adjust usage as flush below will need it
2549 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2550 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2551
2552 // update the log file change and log a jump to the offset where we want to
2553 // write the new entries
2554 log_t.op_file_update(log_file->fnode);
2555 log_t.op_jump(log_seq, old_log_jump_to);
2556
2557 flush_bdev(); // FIXME?
2558
2559 _flush_and_sync_log(l, 0, old_log_jump_to);
2560
2561 // 2. prepare compacted log
2562 bluefs_transaction_t t;
2563 //avoid record two times in log_t and _compact_log_dump_metadata.
2564 log_t.clear();
2565 _compact_log_dump_metadata(&t, 0);
2566
2567 uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
2568 std::max(alloc_size[BDEV_DB],
2569 alloc_size[BDEV_SLOW]));
2570
2571 // conservative estimate for final encoded size
2572 new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
2573 max_alloc_size);
2574 t.op_jump(log_seq, new_log_jump_to);
2575
2576 // allocate
2577 //FIXME: check if we want DB here?
2578 r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
2579 &new_log->fnode);
2580 ceph_assert(r == 0);
2581
2582 // we might have some more ops in log_t due to _allocate call
2583 t.claim_ops(log_t);
2584
2585 bufferlist bl;
2586 encode(t, bl);
2587 _pad_bl(bl);
2588
2589 dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
2590 << std::dec << dendl;
2591
2592 new_log_writer = _create_writer(new_log);
2593 new_log_writer->append(bl);
2594
2595 // 3. flush
2596 r = _flush(new_log_writer, true);
2597 ceph_assert(r == 0);
2598
2599 // 4. wait
2600 _flush_bdev_safely(new_log_writer);
2601
2602 // 5. update our log fnode
2603 // discard first old_log_jump_to extents
2604
2605 dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
2606 << " of " << log_file->fnode.extents << dendl;
2607 uint64_t discarded = 0;
2608 mempool::bluefs::vector<bluefs_extent_t> old_extents;
2609 while (discarded < old_log_jump_to) {
2610 ceph_assert(!log_file->fnode.extents.empty());
2611 bluefs_extent_t& e = log_file->fnode.extents.front();
2612 bluefs_extent_t temp = e;
2613 if (discarded + e.length <= old_log_jump_to) {
2614 dout(10) << __func__ << " remove old log extent " << e << dendl;
2615 discarded += e.length;
2616 log_file->fnode.pop_front_extent();
2617 } else {
2618 dout(10) << __func__ << " remove front of old log extent " << e << dendl;
2619 uint64_t drop = old_log_jump_to - discarded;
2620 temp.length = drop;
2621 e.offset += drop;
2622 e.length -= drop;
2623 discarded += drop;
2624 dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
2625 }
2626 old_extents.push_back(temp);
2627 }
2628 auto from = log_file->fnode.extents.begin();
2629 auto to = log_file->fnode.extents.end();
2630 while (from != to) {
2631 new_log->fnode.append_extent(*from);
2632 ++from;
2633 }
2634
2635 vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
2636
2637 // clear the extents from old log file, they are added to new log
2638 log_file->fnode.clear_extents();
2639 // swap the log files. New log file is the log file now.
2640 new_log->fnode.swap_extents(log_file->fnode);
2641
2642 log_writer->pos = log_writer->file->fnode.size =
2643 log_writer->pos - old_log_jump_to + new_log_jump_to;
2644
2645 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2646
2647 // 6. write the super block to reflect the changes
2648 dout(10) << __func__ << " writing super" << dendl;
2649 super.log_fnode = log_file->fnode;
2650 ++super.version;
2651 _write_super(BDEV_DB);
2652
2653 lock.unlock();
2654 flush_bdev();
2655 lock.lock();
2656
2657 // 7. release old space
2658 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
2659 for (auto& r : old_extents) {
2660 pending_release[r.bdev].insert(r.offset, r.length);
2661 }
2662
2663 // delete the new log, remove from the dirty files list
2664 _close_writer(new_log_writer);
2665 if (new_log->dirty_seq) {
2666 ceph_assert(dirty_files.count(new_log->dirty_seq));
2667 auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
2668 dirty_files[new_log->dirty_seq].erase(it);
2669 }
2670 new_log_writer = nullptr;
2671 new_log = nullptr;
2672 log_cond.notify_all();
2673
2674 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2675 logger->inc(l_bluefs_log_compactions);
2676 }
2677
2678 void BlueFS::_pad_bl(bufferlist& bl)
2679 {
2680 uint64_t partial = bl.length() % super.block_size;
2681 if (partial) {
2682 dout(10) << __func__ << " padding with 0x" << std::hex
2683 << super.block_size - partial << " zeros" << std::dec << dendl;
2684 bl.append_zero(super.block_size - partial);
2685 }
2686 }
2687
2688
2689 int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
2690 uint64_t want_seq,
2691 uint64_t jump_to)
2692 {
2693 while (log_flushing) {
2694 dout(10) << __func__ << " want_seq " << want_seq
2695 << " log is currently flushing, waiting" << dendl;
2696 ceph_assert(!jump_to);
2697 log_cond.wait(l);
2698 }
2699 if (want_seq && want_seq <= log_seq_stable) {
2700 dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
2701 << log_seq_stable << ", done" << dendl;
2702 ceph_assert(!jump_to);
2703 return 0;
2704 }
2705 if (log_t.empty() && dirty_files.empty()) {
2706 dout(10) << __func__ << " want_seq " << want_seq
2707 << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
2708 ceph_assert(!jump_to);
2709 return 0;
2710 }
2711
2712 vector<interval_set<uint64_t>> to_release(pending_release.size());
2713 to_release.swap(pending_release);
2714
2715 uint64_t seq = log_t.seq = ++log_seq;
2716 ceph_assert(want_seq == 0 || want_seq <= seq);
2717 log_t.uuid = super.uuid;
2718
2719 // log dirty files
2720 auto lsi = dirty_files.find(seq);
2721 if (lsi != dirty_files.end()) {
2722 dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
2723 for (auto &f : lsi->second) {
2724 dout(20) << __func__ << " op_file_update " << f.fnode << dendl;
2725 log_t.op_file_update(f.fnode);
2726 }
2727 }
2728
2729 dout(10) << __func__ << " " << log_t << dendl;
2730 ceph_assert(!log_t.empty());
2731
2732 // allocate some more space (before we run out)?
2733 int64_t runway = log_writer->file->fnode.get_allocated() -
2734 log_writer->get_effective_write_pos();
2735 bool just_expanded_log = false;
2736 if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
2737 dout(10) << __func__ << " allocating more log runway (0x"
2738 << std::hex << runway << std::dec << " remaining)" << dendl;
2739 while (new_log_writer) {
2740 dout(10) << __func__ << " waiting for async compaction" << dendl;
2741 log_cond.wait(l);
2742 }
2743 vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
2744 int r = _allocate(
2745 vselector->select_prefer_bdev(log_writer->file->vselector_hint),
2746 cct->_conf->bluefs_max_log_runway,
2747 &log_writer->file->fnode);
2748 ceph_assert(r == 0);
2749 vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
2750 log_t.op_file_update(log_writer->file->fnode);
2751 just_expanded_log = true;
2752 }
2753
2754 bufferlist bl;
2755 bl.reserve(super.block_size);
2756 encode(log_t, bl);
2757 // pad to block boundary
2758 size_t realign = super.block_size - (bl.length() % super.block_size);
2759 if (realign && realign != super.block_size)
2760 bl.append_zero(realign);
2761
2762 logger->inc(l_bluefs_logged_bytes, bl.length());
2763
2764 if (just_expanded_log) {
2765 ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss
2766 }
2767
2768 log_writer->append(bl);
2769
2770 log_t.clear();
2771 log_t.seq = 0; // just so debug output is less confusing
2772 log_flushing = true;
2773
2774 int r = _flush(log_writer, true);
2775 ceph_assert(r == 0);
2776
2777 if (jump_to) {
2778 dout(10) << __func__ << " jumping log offset from 0x" << std::hex
2779 << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
2780 log_writer->pos = jump_to;
2781 vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
2782 log_writer->file->fnode.size = jump_to;
2783 vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
2784 }
2785
2786 _flush_bdev_safely(log_writer);
2787
2788 log_flushing = false;
2789 log_cond.notify_all();
2790
2791 // clean dirty files
2792 if (seq > log_seq_stable) {
2793 log_seq_stable = seq;
2794 dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;
2795
2796 auto p = dirty_files.begin();
2797 while (p != dirty_files.end()) {
2798 if (p->first > log_seq_stable) {
2799 dout(20) << __func__ << " done cleaning up dirty files" << dendl;
2800 break;
2801 }
2802
2803 auto l = p->second.begin();
2804 while (l != p->second.end()) {
2805 File *file = &*l;
2806 ceph_assert(file->dirty_seq > 0);
2807 ceph_assert(file->dirty_seq <= log_seq_stable);
2808 dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
2809 file->dirty_seq = 0;
2810 p->second.erase(l++);
2811 }
2812
2813 ceph_assert(p->second.empty());
2814 dirty_files.erase(p++);
2815 }
2816 } else {
2817 dout(20) << __func__ << " log_seq_stable " << log_seq_stable
2818 << " already >= out seq " << seq
2819 << ", we lost a race against another log flush, done" << dendl;
2820 }
2821
2822 for (unsigned i = 0; i < to_release.size(); ++i) {
2823 if (!to_release[i].empty()) {
2824 /* OK, now we have the guarantee alloc[i] won't be null. */
2825 int r = 0;
2826 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
2827 r = bdev[i]->queue_discard(to_release[i]);
2828 if (r == 0)
2829 continue;
2830 } else if (cct->_conf->bdev_enable_discard) {
2831 for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
2832 bdev[i]->discard(p.get_start(), p.get_len());
2833 }
2834 }
2835 alloc[i]->release(to_release[i]);
2836 }
2837 }
2838
2839 _update_logger_stats();
2840
2841 return 0;
2842 }
2843
2844 int BlueFS::_signal_dirty_to_log(FileWriter *h)
2845 {
2846 h->file->fnode.mtime = ceph_clock_now();
2847 ceph_assert(h->file->fnode.ino >= 1);
2848 if (h->file->dirty_seq == 0) {
2849 h->file->dirty_seq = log_seq + 1;
2850 dirty_files[h->file->dirty_seq].push_back(*h->file);
2851 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2852 << " (was clean)" << dendl;
2853 } else {
2854 if (h->file->dirty_seq != log_seq + 1) {
2855 // need re-dirty, erase from list first
2856 ceph_assert(dirty_files.count(h->file->dirty_seq));
2857 auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
2858 dirty_files[h->file->dirty_seq].erase(it);
2859 h->file->dirty_seq = log_seq + 1;
2860 dirty_files[h->file->dirty_seq].push_back(*h->file);
2861 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2862 << " (was " << h->file->dirty_seq << ")" << dendl;
2863 } else {
2864 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2865 << " (unchanged, do nothing) " << dendl;
2866 }
2867 }
2868 return 0;
2869 }
2870
2871 int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
2872 {
2873 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
2874 << " 0x" << offset << "~" << length << std::dec
2875 << " to " << h->file->fnode << dendl;
2876 if (h->file->deleted) {
2877 dout(10) << __func__ << " deleted, no-op" << dendl;
2878 return 0;
2879 }
2880
2881 ceph_assert(h->file->num_readers.load() == 0);
2882
2883 h->buffer_appender.flush();
2884
2885 bool buffered;
2886 if (h->file->fnode.ino == 1)
2887 buffered = false;
2888 else
2889 buffered = cct->_conf->bluefs_buffered_io;
2890
2891 if (offset + length <= h->pos)
2892 return 0;
2893 if (offset < h->pos) {
2894 length -= h->pos - offset;
2895 offset = h->pos;
2896 dout(10) << " still need 0x"
2897 << std::hex << offset << "~" << length << std::dec
2898 << dendl;
2899 }
2900 ceph_assert(offset <= h->file->fnode.size);
2901
2902 uint64_t allocated = h->file->fnode.get_allocated();
2903 vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
2904 // do not bother to dirty the file if we are overwriting
2905 // previously allocated extents.
2906
2907 if (allocated < offset + length) {
2908 // we should never run out of log space here; see the min runway check
2909 // in _flush_and_sync_log.
2910 ceph_assert(h->file->fnode.ino != 1);
2911 int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
2912 offset + length - allocated,
2913 &h->file->fnode);
2914 if (r < 0) {
2915 derr << __func__ << " allocated: 0x" << std::hex << allocated
2916 << " offset: 0x" << offset << " length: 0x" << length << std::dec
2917 << dendl;
2918 vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
2919 ceph_abort_msg("bluefs enospc");
2920 return r;
2921 }
2922 h->file->is_dirty = true;
2923 }
2924 if (h->file->fnode.size < offset + length) {
2925 h->file->fnode.size = offset + length;
2926 if (h->file->fnode.ino > 1) {
2927 // we do not need to dirty the log file (or it's compacting
2928 // replacement) when the file size changes because replay is
2929 // smart enough to discover it on its own.
2930 h->file->is_dirty = true;
2931 }
2932 }
2933 dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl;
2934
2935 uint64_t x_off = 0;
2936 auto p = h->file->fnode.seek(offset, &x_off);
2937 ceph_assert(p != h->file->fnode.extents.end());
2938 dout(20) << __func__ << " in " << *p << " x_off 0x"
2939 << std::hex << x_off << std::dec << dendl;
2940
2941 unsigned partial = x_off & ~super.block_mask();
2942 bufferlist bl;
2943 if (partial) {
2944 dout(20) << __func__ << " using partial tail 0x"
2945 << std::hex << partial << std::dec << dendl;
2946 ceph_assert(h->tail_block.length() == partial);
2947 bl.claim_append_piecewise(h->tail_block);
2948 x_off -= partial;
2949 offset -= partial;
2950 length += partial;
2951 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
2952 for (auto p : h->iocv) {
2953 if (p) {
2954 p->aio_wait();
2955 }
2956 }
2957 }
2958 if (length == partial + h->buffer.length()) {
2959 /* in case of inital allocation and need to zero, limited flush is unacceptable */
2960 bl.claim_append_piecewise(h->buffer);
2961 } else {
2962 bufferlist t;
2963 h->buffer.splice(0, length, &t);
2964 bl.claim_append_piecewise(t);
2965 t.substr_of(h->buffer, length, h->buffer.length() - length);
2966 h->buffer.swap(t);
2967 dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec
2968 << " unflushed" << dendl;
2969 }
2970 ceph_assert(bl.length() == length);
2971
2972 h->pos = offset + length;
2973
2974 unsigned tail = bl.length() & ~super.block_mask();
2975 if (tail) {
2976 dout(20) << __func__ << " caching tail of 0x"
2977 << std::hex << tail
2978 << " and padding block with 0x" << (super.block_size - tail)
2979 << std::dec << dendl;
2980 h->tail_block.substr_of(bl, bl.length() - tail, tail);
2981 bl.append_zero(super.block_size - tail);
2982 length += super.block_size - tail;
2983 } else {
2984 h->tail_block.clear();
2985 }
2986 ceph_assert(bl.length() == length);
2987
2988 switch (h->writer_type) {
2989 case WRITER_WAL:
2990 logger->inc(l_bluefs_bytes_written_wal, length);
2991 break;
2992 case WRITER_SST:
2993 logger->inc(l_bluefs_bytes_written_sst, length);
2994 break;
2995 }
2996
2997 dout(30) << "dump:\n";
2998 bl.hexdump(*_dout);
2999 *_dout << dendl;
3000
3001 uint64_t bloff = 0;
3002 uint64_t bytes_written_slow = 0;
3003 while (length > 0) {
3004 uint64_t x_len = std::min(p->length - x_off, length);
3005 bufferlist t;
3006 t.substr_of(bl, bloff, x_len);
3007 if (cct->_conf->bluefs_sync_write) {
3008 bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
3009 } else {
3010 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
3011 }
3012 h->dirty_devs[p->bdev] = true;
3013 if (p->bdev == BDEV_SLOW) {
3014 bytes_written_slow += t.length();
3015 }
3016
3017 bloff += x_len;
3018 length -= x_len;
3019 ++p;
3020 x_off = 0;
3021 }
3022 logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
3023 for (unsigned i = 0; i < MAX_BDEV; ++i) {
3024 if (bdev[i]) {
3025 if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
3026 bdev[i]->aio_submit(h->iocv[i]);
3027 }
3028 }
3029 }
3030 vselector->add_usage(h->file->vselector_hint, h->file->fnode);
3031 dout(20) << __func__ << " h " << h << " pos now 0x"
3032 << std::hex << h->pos << std::dec << dendl;
3033 return 0;
3034 }
3035
3036 #ifdef HAVE_LIBAIO
3037 // we need to retire old completed aios so they don't stick around in
3038 // memory indefinitely (along with their bufferlist refs).
3039 void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
3040 {
3041 for (auto p : h->iocv) {
3042 if (p) {
3043 ls->splice(ls->end(), p->running_aios);
3044 }
3045 }
3046 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
3047 }
3048
3049 void BlueFS::wait_for_aio(FileWriter *h)
3050 {
3051 // NOTE: this is safe to call without a lock, as long as our reference is
3052 // stable.
3053 dout(10) << __func__ << " " << h << dendl;
3054 utime_t start = ceph_clock_now();
3055 for (auto p : h->iocv) {
3056 if (p) {
3057 p->aio_wait();
3058 }
3059 }
3060 dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
3061 }
3062 #endif
3063
3064 int BlueFS::_flush(FileWriter *h, bool force, std::unique_lock<ceph::mutex>& l)
3065 {
3066 bool flushed = false;
3067 int r = _flush(h, force, &flushed);
3068 if (r == 0 && flushed) {
3069 _maybe_compact_log(l);
3070 }
3071 return r;
3072 }
3073
3074 int BlueFS::_flush(FileWriter *h, bool force, bool *flushed)
3075 {
3076 h->buffer_appender.flush();
3077 uint64_t length = h->buffer.length();
3078 uint64_t offset = h->pos;
3079 if (flushed) {
3080 *flushed = false;
3081 }
3082 if (!force &&
3083 length < cct->_conf->bluefs_min_flush_size) {
3084 dout(10) << __func__ << " " << h << " ignoring, length " << length
3085 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
3086 << dendl;
3087 return 0;
3088 }
3089 if (length == 0) {
3090 dout(10) << __func__ << " " << h << " no dirty data on "
3091 << h->file->fnode << dendl;
3092 return 0;
3093 }
3094 dout(10) << __func__ << " " << h << " 0x"
3095 << std::hex << offset << "~" << length << std::dec
3096 << " to " << h->file->fnode << dendl;
3097 ceph_assert(h->pos <= h->file->fnode.size);
3098 int r = _flush_range(h, offset, length);
3099 if (flushed) {
3100 *flushed = true;
3101 }
3102 return r;
3103 }
3104
3105 int BlueFS::_truncate(FileWriter *h, uint64_t offset)
3106 {
3107 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
3108 << " file " << h->file->fnode << dendl;
3109 if (h->file->deleted) {
3110 dout(10) << __func__ << " deleted, no-op" << dendl;
3111 return 0;
3112 }
3113
3114 // we never truncate internal log files
3115 ceph_assert(h->file->fnode.ino > 1);
3116
3117 h->buffer_appender.flush();
3118
3119 // truncate off unflushed data?
3120 if (h->pos < offset &&
3121 h->pos + h->buffer.length() > offset) {
3122 bufferlist t;
3123 dout(20) << __func__ << " tossing out last " << offset - h->pos
3124 << " unflushed bytes" << dendl;
3125 t.substr_of(h->buffer, 0, offset - h->pos);
3126 h->buffer.swap(t);
3127 ceph_abort_msg("actually this shouldn't happen");
3128 }
3129 if (h->buffer.length()) {
3130 int r = _flush(h, true);
3131 if (r < 0)
3132 return r;
3133 }
3134 if (offset == h->file->fnode.size) {
3135 return 0; // no-op!
3136 }
3137 if (offset > h->file->fnode.size) {
3138 ceph_abort_msg("truncate up not supported");
3139 }
3140 ceph_assert(h->file->fnode.size >= offset);
3141 vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
3142 h->file->fnode.size = offset;
3143 vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
3144 log_t.op_file_update(h->file->fnode);
3145 return 0;
3146 }
3147
3148 int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l)
3149 {
3150 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
3151 int r = _flush(h, true);
3152 if (r < 0)
3153 return r;
3154 if (h->file->is_dirty) {
3155 _signal_dirty_to_log(h);
3156 h->file->is_dirty = false;
3157 }
3158 uint64_t old_dirty_seq = h->file->dirty_seq;
3159
3160 _flush_bdev_safely(h);
3161
3162 if (old_dirty_seq) {
3163 uint64_t s = log_seq;
3164 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
3165 << ") on " << h->file->fnode << ", flushing log" << dendl;
3166 _flush_and_sync_log(l, old_dirty_seq);
3167 ceph_assert(h->file->dirty_seq == 0 || // cleaned
3168 h->file->dirty_seq > s); // or redirtied by someone else
3169 }
3170 return 0;
3171 }
3172
3173 void BlueFS::_flush_bdev_safely(FileWriter *h)
3174 {
3175 std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
3176 h->dirty_devs.fill(false);
3177 #ifdef HAVE_LIBAIO
3178 if (!cct->_conf->bluefs_sync_write) {
3179 list<aio_t> completed_ios;
3180 _claim_completed_aios(h, &completed_ios);
3181 lock.unlock();
3182 wait_for_aio(h);
3183 completed_ios.clear();
3184 flush_bdev(flush_devs);
3185 lock.lock();
3186 } else
3187 #endif
3188 {
3189 lock.unlock();
3190 flush_bdev(flush_devs);
3191 lock.lock();
3192 }
3193 }
3194
3195 void BlueFS::flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
3196 {
3197 // NOTE: this is safe to call without a lock.
3198 dout(20) << __func__ << dendl;
3199 for (unsigned i = 0; i < MAX_BDEV; i++) {
3200 if (dirty_bdevs[i])
3201 bdev[i]->flush();
3202 }
3203 }
3204
3205 void BlueFS::flush_bdev()
3206 {
3207 // NOTE: this is safe to call without a lock.
3208 dout(20) << __func__ << dendl;
3209 for (auto p : bdev) {
3210 if (p)
3211 p->flush();
3212 }
3213 }
3214
3215 const char* BlueFS::get_device_name(unsigned id)
3216 {
3217 if (id >= MAX_BDEV) return "BDEV_INV";
3218 const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3219 return names[id];
3220 }
3221
3222 int BlueFS::_expand_slow_device(uint64_t need, PExtentVector& extents)
3223 {
3224 int r = -ENOSPC;
3225 if (slow_dev_expander) {
3226 auto id = _get_slow_device_id();
3227 auto min_alloc_size = alloc_size[id];
3228 ceph_assert(id <= alloc.size() && alloc[id]);
3229 auto min_need = round_up_to(need, min_alloc_size);
3230 need = std::max(need,
3231 slow_dev_expander->get_recommended_expansion_delta(
3232 alloc[id]->get_free(), block_all[id].size()));
3233
3234 need = round_up_to(need, min_alloc_size);
3235 dout(10) << __func__ << " expanding slow device by 0x"
3236 << std::hex << need << std::dec
3237 << dendl;
3238 r = slow_dev_expander->allocate_freespace(min_need, need, extents);
3239 }
3240 return r;
3241 }
3242
3243 int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
3244 PExtentVector* extents)
3245 {
3246 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3247 << " from " << (int)id << dendl;
3248 assert(id < alloc.size());
3249 if (!alloc[id]) {
3250 return -ENOENT;
3251 }
3252 extents->reserve(4); // 4 should be (more than) enough for most allocations
3253 uint64_t min_alloc_size = alloc_size[id];
3254 uint64_t left = round_up_to(len, min_alloc_size);
3255 int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, 0, extents);
3256 if (alloc_len < 0 || alloc_len < (int64_t)left) {
3257 if (alloc_len > 0) {
3258 alloc[id]->release(*extents);
3259 }
3260 if (bdev[id])
3261 derr << __func__ << " failed to allocate 0x" << std::hex << left
3262 << " on bdev " << (int)id
3263 << ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
3264 else
3265 derr << __func__ << " failed to allocate 0x" << std::hex << left
3266 << " on bdev " << (int)id << ", dne" << std::dec << dendl;
3267 if (alloc[id])
3268 alloc[id]->dump();
3269 return -ENOSPC;
3270 }
3271
3272 return 0;
3273 }
3274
3275 int BlueFS::_allocate(uint8_t id, uint64_t len,
3276 bluefs_fnode_t* node)
3277 {
3278 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3279 << " from " << (int)id << dendl;
3280 ceph_assert(id < alloc.size());
3281 int64_t alloc_len = 0;
3282 PExtentVector extents;
3283 uint64_t hint = 0;
3284 if (alloc[id]) {
3285 if (!node->extents.empty() && node->extents.back().bdev == id) {
3286 hint = node->extents.back().end();
3287 }
3288 extents.reserve(4); // 4 should be (more than) enough for most allocations
3289 alloc_len = alloc[id]->allocate(round_up_to(len, alloc_size[id]),
3290 alloc_size[id], hint, &extents);
3291 }
3292 if (!alloc[id] ||
3293 alloc_len < 0 ||
3294 alloc_len < (int64_t)round_up_to(len, alloc_size[id])) {
3295 if (alloc_len > 0) {
3296 alloc[id]->release(extents);
3297 }
3298 if (id != BDEV_SLOW) {
3299 if (bdev[id]) {
3300 dout(1) << __func__ << " failed to allocate 0x" << std::hex << len
3301 << " on bdev " << (int)id
3302 << ", free 0x" << alloc[id]->get_free()
3303 << "; fallback to bdev " << (int)id + 1
3304 << std::dec << dendl;
3305 }
3306 return _allocate(id + 1, len, node);
3307 }
3308 dout(1) << __func__ << " unable to allocate 0x" << std::hex << len
3309 << " on bdev " << (int)id << ", free 0x"
3310 << (alloc[id] ? alloc[id]->get_free() : (uint64_t)-1)
3311 << "; fallback to slow device expander "
3312 << std::dec << dendl;
3313 extents.clear();
3314 if (_expand_slow_device(len, extents) == 0) {
3315 id = _get_slow_device_id();
3316 for (auto& e : extents) {
3317 _add_block_extent(id, e.offset, e.length);
3318 }
3319 extents.clear();
3320 auto* last_alloc = alloc[id];
3321 ceph_assert(last_alloc);
3322 // try again
3323 alloc_len = last_alloc->allocate(round_up_to(len, alloc_size[id]),
3324 alloc_size[id], hint, &extents);
3325 if (alloc_len < 0 || alloc_len < (int64_t)len) {
3326 if (alloc_len > 0) {
3327 last_alloc->release(extents);
3328 }
3329 derr << __func__ << " failed to allocate 0x" << std::hex << len
3330 << " on bdev " << (int)id
3331 << ", free 0x" << last_alloc->get_free() << std::dec << dendl;
3332 return -ENOSPC;
3333 }
3334 } else {
3335 derr << __func__ << " failed to expand slow device to fit +0x"
3336 << std::hex << len << std::dec
3337 << dendl;
3338 return -ENOSPC;
3339 }
3340 } else {
3341 uint64_t total_allocated =
3342 block_all[id].size() - alloc[id]->get_free();
3343 if (max_bytes[id] < total_allocated) {
3344 logger->set(max_bytes_pcounters[id], total_allocated);
3345 max_bytes[id] = total_allocated;
3346 }
3347 }
3348
3349 for (auto& p : extents) {
3350 node->append_extent(bluefs_extent_t(id, p.offset, p.length));
3351 }
3352
3353 return 0;
3354 }
3355
3356 int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
3357 {
3358 dout(10) << __func__ << " file " << f->fnode << " 0x"
3359 << std::hex << off << "~" << len << std::dec << dendl;
3360 if (f->deleted) {
3361 dout(10) << __func__ << " deleted, no-op" << dendl;
3362 return 0;
3363 }
3364 ceph_assert(f->fnode.ino > 1);
3365 uint64_t allocated = f->fnode.get_allocated();
3366 if (off + len > allocated) {
3367 uint64_t want = off + len - allocated;
3368 vselector->sub_usage(f->vselector_hint, f->fnode);
3369
3370 int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
3371 want,
3372 &f->fnode);
3373 vselector->add_usage(f->vselector_hint, f->fnode);
3374 if (r < 0)
3375 return r;
3376 log_t.op_file_update(f->fnode);
3377 }
3378 return 0;
3379 }
3380
3381 void BlueFS::sync_metadata(bool avoid_compact)
3382 {
3383 std::unique_lock<ceph::mutex> l(lock);
3384 if (log_t.empty() && dirty_files.empty()) {
3385 dout(10) << __func__ << " - no pending log events" << dendl;
3386 } else {
3387 dout(10) << __func__ << dendl;
3388 utime_t start = ceph_clock_now();
3389 flush_bdev(); // FIXME?
3390 _flush_and_sync_log(l);
3391 dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
3392 }
3393
3394 if (!avoid_compact) {
3395 _maybe_compact_log(l);
3396 }
3397 }
3398
3399 void BlueFS::_maybe_compact_log(std::unique_lock<ceph::mutex>& l)
3400 {
3401 if (!cct->_conf->bluefs_replay_recovery_disable_compact &&
3402 _should_compact_log()) {
3403 if (cct->_conf->bluefs_compact_log_sync) {
3404 _compact_log_sync();
3405 } else {
3406 _compact_log_async(l);
3407 }
3408 }
3409 }
3410
3411 int BlueFS::open_for_write(
3412 std::string_view dirname,
3413 std::string_view filename,
3414 FileWriter **h,
3415 bool overwrite)
3416 {
3417 std::lock_guard l(lock);
3418 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3419 map<string,DirRef>::iterator p = dir_map.find(dirname);
3420 DirRef dir;
3421 if (p == dir_map.end()) {
3422 // implicitly create the dir
3423 dout(20) << __func__ << " dir " << dirname
3424 << " does not exist" << dendl;
3425 return -ENOENT;
3426 } else {
3427 dir = p->second;
3428 }
3429
3430 FileRef file;
3431 bool create = false;
3432 bool truncate = false;
3433 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3434 if (q == dir->file_map.end()) {
3435 if (overwrite) {
3436 dout(20) << __func__ << " dir " << dirname << " (" << dir
3437 << ") file " << filename
3438 << " does not exist" << dendl;
3439 return -ENOENT;
3440 }
3441 file = ceph::make_ref<File>();
3442 file->fnode.ino = ++ino_last;
3443 file_map[ino_last] = file;
3444 dir->file_map[string{filename}] = file;
3445 ++file->refs;
3446 create = true;
3447 } else {
3448 // overwrite existing file?
3449 file = q->second;
3450 if (overwrite) {
3451 dout(20) << __func__ << " dir " << dirname << " (" << dir
3452 << ") file " << filename
3453 << " already exists, overwrite in place" << dendl;
3454 } else {
3455 dout(20) << __func__ << " dir " << dirname << " (" << dir
3456 << ") file " << filename
3457 << " already exists, truncate + overwrite" << dendl;
3458 vselector->sub_usage(file->vselector_hint, file->fnode);
3459 file->fnode.size = 0;
3460 for (auto& p : file->fnode.extents) {
3461 pending_release[p.bdev].insert(p.offset, p.length);
3462 }
3463 truncate = true;
3464
3465 file->fnode.clear_extents();
3466 }
3467 }
3468 ceph_assert(file->fnode.ino > 1);
3469
3470 file->fnode.mtime = ceph_clock_now();
3471 file->vselector_hint = vselector->get_hint_by_dir(dirname);
3472 if (create || truncate) {
3473 vselector->add_usage(file->vselector_hint, file->fnode); // update file count
3474 }
3475
3476 dout(20) << __func__ << " mapping " << dirname << "/" << filename
3477 << " vsel_hint " << file->vselector_hint
3478 << dendl;
3479
3480 log_t.op_file_update(file->fnode);
3481 if (create)
3482 log_t.op_dir_link(dirname, filename, file->fnode.ino);
3483
3484 *h = _create_writer(file);
3485
3486 if (boost::algorithm::ends_with(filename, ".log")) {
3487 (*h)->writer_type = BlueFS::WRITER_WAL;
3488 if (logger && !overwrite) {
3489 logger->inc(l_bluefs_files_written_wal);
3490 }
3491 } else if (boost::algorithm::ends_with(filename, ".sst")) {
3492 (*h)->writer_type = BlueFS::WRITER_SST;
3493 if (logger) {
3494 logger->inc(l_bluefs_files_written_sst);
3495 }
3496 }
3497
3498 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3499 return 0;
3500 }
3501
3502 BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
3503 {
3504 FileWriter *w = new FileWriter(f);
3505 for (unsigned i = 0; i < MAX_BDEV; ++i) {
3506 if (bdev[i]) {
3507 w->iocv[i] = new IOContext(cct, NULL);
3508 }
3509 }
3510 return w;
3511 }
3512
3513 void BlueFS::_close_writer(FileWriter *h)
3514 {
3515 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
3516 h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
3517 for (unsigned i=0; i<MAX_BDEV; ++i) {
3518 if (bdev[i]) {
3519 if (h->iocv[i]) {
3520 h->iocv[i]->aio_wait();
3521 bdev[i]->queue_reap_ioc(h->iocv[i]);
3522 }
3523 }
3524 }
3525 delete h;
3526 }
3527
3528 uint64_t BlueFS::debug_get_dirty_seq(FileWriter *h)
3529 {
3530 std::lock_guard l(lock);
3531 return h->file->dirty_seq;
3532 }
3533
3534 bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev)
3535 {
3536 std::lock_guard l(lock);
3537 return h->dirty_devs[dev];
3538 }
3539
3540 int BlueFS::open_for_read(
3541 std::string_view dirname,
3542 std::string_view filename,
3543 FileReader **h,
3544 bool random)
3545 {
3546 std::lock_guard l(lock);
3547 dout(10) << __func__ << " " << dirname << "/" << filename
3548 << (random ? " (random)":" (sequential)") << dendl;
3549 map<string,DirRef>::iterator p = dir_map.find(dirname);
3550 if (p == dir_map.end()) {
3551 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3552 return -ENOENT;
3553 }
3554 DirRef dir = p->second;
3555
3556 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3557 if (q == dir->file_map.end()) {
3558 dout(20) << __func__ << " dir " << dirname << " (" << dir
3559 << ") file " << filename
3560 << " not found" << dendl;
3561 return -ENOENT;
3562 }
3563 File *file = q->second.get();
3564
3565 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
3566 random, false);
3567 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3568 return 0;
3569 }
3570
3571 int BlueFS::rename(
3572 std::string_view old_dirname, std::string_view old_filename,
3573 std::string_view new_dirname, std::string_view new_filename)
3574 {
3575 std::lock_guard l(lock);
3576 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
3577 << " -> " << new_dirname << "/" << new_filename << dendl;
3578 map<string,DirRef>::iterator p = dir_map.find(old_dirname);
3579 if (p == dir_map.end()) {
3580 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
3581 return -ENOENT;
3582 }
3583 DirRef old_dir = p->second;
3584 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
3585 if (q == old_dir->file_map.end()) {
3586 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
3587 << ") file " << old_filename
3588 << " not found" << dendl;
3589 return -ENOENT;
3590 }
3591 FileRef file = q->second;
3592
3593 p = dir_map.find(new_dirname);
3594 if (p == dir_map.end()) {
3595 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
3596 return -ENOENT;
3597 }
3598 DirRef new_dir = p->second;
3599 q = new_dir->file_map.find(new_filename);
3600 if (q != new_dir->file_map.end()) {
3601 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
3602 << ") file " << new_filename
3603 << " already exists, unlinking" << dendl;
3604 ceph_assert(q->second != file);
3605 log_t.op_dir_unlink(new_dirname, new_filename);
3606 _drop_link(q->second);
3607 }
3608
3609 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
3610 << " " << file->fnode << dendl;
3611
3612 new_dir->file_map[string{new_filename}] = file;
3613 old_dir->file_map.erase(string{old_filename});
3614
3615 log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
3616 log_t.op_dir_unlink(old_dirname, old_filename);
3617 return 0;
3618 }
3619
3620 int BlueFS::mkdir(std::string_view dirname)
3621 {
3622 std::lock_guard l(lock);
3623 dout(10) << __func__ << " " << dirname << dendl;
3624 map<string,DirRef>::iterator p = dir_map.find(dirname);
3625 if (p != dir_map.end()) {
3626 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
3627 return -EEXIST;
3628 }
3629 dir_map[string{dirname}] = ceph::make_ref<Dir>();
3630 log_t.op_dir_create(dirname);
3631 return 0;
3632 }
3633
3634 int BlueFS::rmdir(std::string_view dirname)
3635 {
3636 std::lock_guard l(lock);
3637 dout(10) << __func__ << " " << dirname << dendl;
3638 auto p = dir_map.find(dirname);
3639 if (p == dir_map.end()) {
3640 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
3641 return -ENOENT;
3642 }
3643 DirRef dir = p->second;
3644 if (!dir->file_map.empty()) {
3645 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
3646 return -ENOTEMPTY;
3647 }
3648 dir_map.erase(string{dirname});
3649 log_t.op_dir_remove(dirname);
3650 return 0;
3651 }
3652
3653 bool BlueFS::dir_exists(std::string_view dirname)
3654 {
3655 std::lock_guard l(lock);
3656 map<string,DirRef>::iterator p = dir_map.find(dirname);
3657 bool exists = p != dir_map.end();
3658 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
3659 return exists;
3660 }
3661
3662 int BlueFS::stat(std::string_view dirname, std::string_view filename,
3663 uint64_t *size, utime_t *mtime)
3664 {
3665 std::lock_guard l(lock);
3666 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3667 map<string,DirRef>::iterator p = dir_map.find(dirname);
3668 if (p == dir_map.end()) {
3669 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3670 return -ENOENT;
3671 }
3672 DirRef dir = p->second;
3673 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3674 if (q == dir->file_map.end()) {
3675 dout(20) << __func__ << " dir " << dirname << " (" << dir
3676 << ") file " << filename
3677 << " not found" << dendl;
3678 return -ENOENT;
3679 }
3680 File *file = q->second.get();
3681 dout(10) << __func__ << " " << dirname << "/" << filename
3682 << " " << file->fnode << dendl;
3683 if (size)
3684 *size = file->fnode.size;
3685 if (mtime)
3686 *mtime = file->fnode.mtime;
3687 return 0;
3688 }
3689
3690 int BlueFS::lock_file(std::string_view dirname, std::string_view filename,
3691 FileLock **plock)
3692 {
3693 std::lock_guard l(lock);
3694 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3695 map<string,DirRef>::iterator p = dir_map.find(dirname);
3696 if (p == dir_map.end()) {
3697 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3698 return -ENOENT;
3699 }
3700 DirRef dir = p->second;
3701 auto q = dir->file_map.find(filename);
3702 FileRef file;
3703 if (q == dir->file_map.end()) {
3704 dout(20) << __func__ << " dir " << dirname << " (" << dir
3705 << ") file " << filename
3706 << " not found, creating" << dendl;
3707 file = ceph::make_ref<File>();
3708 file->fnode.ino = ++ino_last;
3709 file->fnode.mtime = ceph_clock_now();
3710 file_map[ino_last] = file;
3711 dir->file_map[string{filename}] = file;
3712 ++file->refs;
3713 log_t.op_file_update(file->fnode);
3714 log_t.op_dir_link(dirname, filename, file->fnode.ino);
3715 } else {
3716 file = q->second;
3717 if (file->locked) {
3718 dout(10) << __func__ << " already locked" << dendl;
3719 return -ENOLCK;
3720 }
3721 }
3722 file->locked = true;
3723 *plock = new FileLock(file);
3724 dout(10) << __func__ << " locked " << file->fnode
3725 << " with " << *plock << dendl;
3726 return 0;
3727 }
3728
3729 int BlueFS::unlock_file(FileLock *fl)
3730 {
3731 std::lock_guard l(lock);
3732 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
3733 ceph_assert(fl->file->locked);
3734 fl->file->locked = false;
3735 delete fl;
3736 return 0;
3737 }
3738
3739 int BlueFS::readdir(std::string_view dirname, vector<string> *ls)
3740 {
3741 // dirname may contain a trailing /
3742 if (!dirname.empty() && dirname.back() == '/') {
3743 dirname.remove_suffix(1);
3744 }
3745 std::lock_guard l(lock);
3746 dout(10) << __func__ << " " << dirname << dendl;
3747 if (dirname.empty()) {
3748 // list dirs
3749 ls->reserve(dir_map.size() + 2);
3750 for (auto& q : dir_map) {
3751 ls->push_back(q.first);
3752 }
3753 } else {
3754 // list files in dir
3755 map<string,DirRef>::iterator p = dir_map.find(dirname);
3756 if (p == dir_map.end()) {
3757 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3758 return -ENOENT;
3759 }
3760 DirRef dir = p->second;
3761 ls->reserve(dir->file_map.size() + 2);
3762 for (auto& q : dir->file_map) {
3763 ls->push_back(q.first);
3764 }
3765 }
3766 ls->push_back(".");
3767 ls->push_back("..");
3768 return 0;
3769 }
3770
3771 int BlueFS::unlink(std::string_view dirname, std::string_view filename)
3772 {
3773 std::lock_guard l(lock);
3774 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3775 map<string,DirRef>::iterator p = dir_map.find(dirname);
3776 if (p == dir_map.end()) {
3777 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3778 return -ENOENT;
3779 }
3780 DirRef dir = p->second;
3781 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3782 if (q == dir->file_map.end()) {
3783 dout(20) << __func__ << " file " << dirname << "/" << filename
3784 << " not found" << dendl;
3785 return -ENOENT;
3786 }
3787 FileRef file = q->second;
3788 if (file->locked) {
3789 dout(20) << __func__ << " file " << dirname << "/" << filename
3790 << " is locked" << dendl;
3791 return -EBUSY;
3792 }
3793 dir->file_map.erase(string{filename});
3794 log_t.op_dir_unlink(dirname, filename);
3795 _drop_link(file);
3796 return 0;
3797 }
3798
3799 bool BlueFS::wal_is_rotational()
3800 {
3801 if (bdev[BDEV_WAL]) {
3802 return bdev[BDEV_WAL]->is_rotational();
3803 } else if (bdev[BDEV_DB]) {
3804 return bdev[BDEV_DB]->is_rotational();
3805 }
3806 return bdev[BDEV_SLOW]->is_rotational();
3807 }
3808
3809 /*
3810 Algorithm.
3811 do_replay_recovery_read is used when bluefs log abruptly ends, but it seems that more data should be there.
3812 Idea is to search disk for definiton of extents that will be accompanied with bluefs log in future,
3813 and try if using it will produce healthy bluefs transaction.
3814 We encode already known bluefs log extents and search disk for these bytes.
3815 When we find it, we decode following bytes as extent.
3816 We read that whole extent and then check if merged with existing log part gives a proper bluefs transaction.
3817 */
3818 int BlueFS::do_replay_recovery_read(FileReader *log_reader,
3819 size_t replay_pos,
3820 size_t read_offset,
3821 size_t read_len,
3822 bufferlist* bl) {
3823 dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos <<
3824 " needs 0x" << read_offset << "~" << read_len << std::dec << dendl;
3825
3826 bluefs_fnode_t& log_fnode = log_reader->file->fnode;
3827 bufferlist bin_extents;
3828 ceph::encode(log_fnode.extents, bin_extents);
3829 dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl;
3830
3831 // cannot process if too small to effectively search
3832 ceph_assert(bin_extents.length() >= 32);
3833 bufferlist last_32;
3834 last_32.substr_of(bin_extents, bin_extents.length() - 32, 32);
3835
3836 //read fixed part from replay_pos to end of bluefs_log extents
3837 bufferlist fixed;
3838 uint64_t e_off = 0;
3839 auto e = log_fnode.seek(replay_pos, &e_off);
3840 ceph_assert(e != log_fnode.extents.end());
3841 int r = bdev[e->bdev]->read(e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev],
3842 cct->_conf->bluefs_buffered_io);
3843 ceph_assert(r == 0);
3844 //capture dev of last good extent
3845 uint8_t last_e_dev = e->bdev;
3846 uint64_t last_e_off = e->offset;
3847 ++e;
3848 while (e != log_fnode.extents.end()) {
3849 r = bdev[e->bdev]->read(e->offset, e->length, &fixed, ioc[e->bdev],
3850 cct->_conf->bluefs_buffered_io);
3851 ceph_assert(r == 0);
3852 last_e_dev = e->bdev;
3853 ++e;
3854 }
3855 ceph_assert(replay_pos + fixed.length() == read_offset);
3856
3857 dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl;
3858
3859 struct compare {
3860 bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const {
3861 if (a.bdev < b.bdev) return true;
3862 if (a.offset < b.offset) return true;
3863 return a.length < b.length;
3864 }
3865 };
3866 std::set<bluefs_extent_t, compare> extents_rejected;
3867 for (int dcnt = 0; dcnt < 3; dcnt++) {
3868 uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV;
3869 if (bdev[dev] == nullptr) continue;
3870 dout(2) << __func__ << " processing " << get_device_name(dev) << dendl;
3871 interval_set<uint64_t> disk_regions;
3872 disk_regions.insert(0, bdev[dev]->get_size());
3873 for (auto f : file_map) {
3874 auto& e = f.second->fnode.extents;
3875 for (auto& p : e) {
3876 if (p.bdev == dev) {
3877 disk_regions.erase(p.offset, p.length);
3878 }
3879 }
3880 }
3881 size_t disk_regions_count = disk_regions.num_intervals();
3882 dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl;
3883
3884 auto reg = disk_regions.lower_bound(last_e_off);
3885 //for all except first, start from beginning
3886 last_e_off = 0;
3887 if (reg == disk_regions.end()) {
3888 reg = disk_regions.begin();
3889 }
3890 const uint64_t chunk_size = 4 * 1024 * 1024;
3891 const uint64_t page_size = 4096;
3892 const uint64_t max_extent_size = 16;
3893 uint64_t overlay_size = last_32.length() + max_extent_size;
3894 for (size_t i = 0; i < disk_regions_count; reg++, i++) {
3895 if (reg == disk_regions.end()) {
3896 reg = disk_regions.begin();
3897 }
3898 uint64_t pos = reg.get_start();
3899 uint64_t len = reg.get_len();
3900
3901 std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]};
3902 char* raw_data = raw_data_p.get();
3903 memset(raw_data, 0, page_size);
3904
3905 while (len > last_32.length()) {
3906 uint64_t chunk_len = len > chunk_size ? chunk_size : len;
3907 dout(5) << __func__ << " read "
3908 << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len << std::dec << dendl;
3909 r = bdev[dev]->read_random(pos, chunk_len, raw_data + page_size, cct->_conf->bluefs_buffered_io);
3910 ceph_assert(r == 0);
3911
3912 //search for fixed_last_32
3913 char* chunk_b = raw_data + page_size;
3914 char* chunk_e = chunk_b + chunk_len;
3915
3916 char* search_b = chunk_b - overlay_size;
3917 char* search_e = chunk_e;
3918
3919 for (char* sp = search_b; ; sp += last_32.length()) {
3920 sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length());
3921 if (sp == nullptr) {
3922 break;
3923 }
3924
3925 char* n = sp + last_32.length();
3926 dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl;
3927 bufferlist test;
3928 test.append(n, std::min<size_t>(max_extent_size, chunk_e - n));
3929 bluefs_extent_t ne;
3930 try {
3931 bufferlist::const_iterator p = test.begin();
3932 ceph::decode(ne, p);
3933 } catch (buffer::error& e) {
3934 continue;
3935 }
3936 if (extents_rejected.count(ne) != 0) {
3937 dout(5) << __func__ << " extent " << ne << " already refected" <<dendl;
3938 continue;
3939 }
3940 //insert as rejected already. if we succeed, it wouldn't make difference.
3941 extents_rejected.insert(ne);
3942
3943 if (ne.bdev >= MAX_BDEV ||
3944 bdev[ne.bdev] == nullptr ||
3945 ne.length > 16 * 1024 * 1024 ||
3946 (ne.length & 4095) != 0 ||
3947 ne.offset + ne.length > bdev[ne.bdev]->get_size() ||
3948 (ne.offset & 4095) != 0) {
3949 dout(5) << __func__ << " refusing extent " << ne << dendl;
3950 continue;
3951 }
3952 dout(5) << __func__ << " checking extent " << ne << dendl;
3953
3954 //read candidate extent - whole
3955 bufferlist candidate;
3956 candidate.append(fixed);
3957 r = bdev[ne.bdev]->read(ne.offset, ne.length, &candidate, ioc[ne.bdev],
3958 cct->_conf->bluefs_buffered_io);
3959 ceph_assert(r == 0);
3960
3961 //check if transaction & crc is ok
3962 bluefs_transaction_t t;
3963 try {
3964 bufferlist::const_iterator p = candidate.cbegin();
3965 decode(t, p);
3966 }
3967 catch (buffer::error& e) {
3968 dout(5) << __func__ << " failed match" << dendl;
3969 continue;
3970 }
3971
3972 //success, it seems a probable candidate
3973 uint64_t l = std::min<uint64_t>(ne.length, read_len);
3974 //trim to required size
3975 bufferlist requested_read;
3976 requested_read.substr_of(candidate, fixed.length(), l);
3977 bl->append(requested_read);
3978 dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl;
3979 log_fnode.append_extent(ne);
3980 log_fnode.recalc_allocated();
3981 log_reader->buf.pos += l;
3982 return l;
3983 }
3984 //save overlay for next search
3985 memcpy(search_b, chunk_e - overlay_size, overlay_size);
3986 pos += chunk_len;
3987 len -= chunk_len;
3988 }
3989 }
3990 }
3991 return 0;
3992 }
3993
3994 void BlueFS::debug_inject_duplicate_gift(unsigned id,
3995 uint64_t offset,
3996 uint64_t len)
3997 {
3998 dout(0) << __func__ << dendl;
3999 if (id < alloc.size() && alloc[id]) {
4000 alloc[id]->init_add_free(offset, len);
4001 }
4002 }
4003
4004 // ===============================================
4005 // OriginalVolumeSelector
4006
4007 void* OriginalVolumeSelector::get_hint_for_log() const {
4008 return reinterpret_cast<void*>(BlueFS::BDEV_WAL);
4009 }
4010 void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
4011 uint8_t res = BlueFS::BDEV_DB;
4012 if (dirname.length() > 5) {
4013 // the "db.slow" and "db.wal" directory names are hard-coded at
4014 // match up with bluestore. the slow device is always the second
4015 // one (when a dedicated block.db device is present and used at
4016 // bdev 0). the wal device is always last.
4017 if (boost::algorithm::ends_with(dirname, ".slow")) {
4018 res = BlueFS::BDEV_SLOW;
4019 }
4020 else if (boost::algorithm::ends_with(dirname, ".wal")) {
4021 res = BlueFS::BDEV_WAL;
4022 }
4023 }
4024 return reinterpret_cast<void*>(res);
4025 }
4026
4027 uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
4028 {
4029 return (uint8_t)(reinterpret_cast<uint64_t>(hint));
4030 }
4031
4032 void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
4033 {
4034 res.emplace_back(base, db_total);
4035 res.emplace_back(base + ".slow", slow_total);
4036 }
4037
4038 #undef dout_prefix
4039 #define dout_prefix *_dout << "OriginalVolumeSelector: "
4040
4041 void OriginalVolumeSelector::dump(ostream& sout) {
4042 sout<< "wal_total:" << wal_total
4043 << ", db_total:" << db_total
4044 << ", slow_total:" << slow_total
4045 << std::endl;
4046 }