]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueFS.cc
import ceph 15.2.14
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "boost/algorithm/string.hpp"
9f95a23c 5#include "bluestore_common.h"
7c673cae
FG
6#include "BlueFS.h"
7
8#include "common/debug.h"
9#include "common/errno.h"
10#include "common/perf_counters.h"
11#include "BlockDevice.h"
12#include "Allocator.h"
11fdf7f2 13#include "include/ceph_assert.h"
eafe8130 14#include "common/admin_socket.h"
7c673cae
FG
15
16#define dout_context cct
17#define dout_subsys ceph_subsys_bluefs
18#undef dout_prefix
19#define dout_prefix *_dout << "bluefs "
9f95a23c 20using TOPNSPC::common::cmd_getval;
7c673cae
FG
21MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
22MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
f91f0fd5 23MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer);
7c673cae 24MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
f91f0fd5
TL
25 bluefs_file_reader_buffer, bluefs_file_reader);
26MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader);
7c673cae
FG
27MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
28
11fdf7f2
TL
29static void wal_discard_cb(void *priv, void* priv2) {
30 BlueFS *bluefs = static_cast<BlueFS*>(priv);
31 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
32 bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
33}
34
35static void db_discard_cb(void *priv, void* priv2) {
36 BlueFS *bluefs = static_cast<BlueFS*>(priv);
37 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
38 bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
39}
40
41static void slow_discard_cb(void *priv, void* priv2) {
42 BlueFS *bluefs = static_cast<BlueFS*>(priv);
43 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
44 bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
45}
7c673cae 46
eafe8130
TL
47class BlueFS::SocketHook : public AdminSocketHook {
48 BlueFS* bluefs;
49public:
50 static BlueFS::SocketHook* create(BlueFS* bluefs)
51 {
52 BlueFS::SocketHook* hook = nullptr;
53 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
54 if (admin_socket) {
55 hook = new BlueFS::SocketHook(bluefs);
9f95a23c 56 int r = admin_socket->register_command("bluestore bluefs available "
eafe8130
TL
57 "name=alloc_size,type=CephInt,req=false",
58 hook,
59 "Report available space for bluefs. "
60 "If alloc_size set, make simulation.");
61 if (r != 0) {
62 ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
63 delete hook;
64 hook = nullptr;
9f95a23c 65 } else {
f6b5b4d7 66 r = admin_socket->register_command("bluefs stats",
9f95a23c
TL
67 hook,
68 "Dump internal statistics for bluefs."
69 "");
70 ceph_assert(r == 0);
cd265ab1
TL
71 r = admin_socket->register_command("bluefs debug_inject_read_zeros", hook,
72 "Injects 8K zeros into next BlueFS read. Debug only.");
73 ceph_assert(r == 0);
eafe8130
TL
74 }
75 }
76 return hook;
77 }
78
79 ~SocketHook() {
80 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
9f95a23c 81 admin_socket->unregister_commands(this);
eafe8130
TL
82 }
83private:
84 SocketHook(BlueFS* bluefs) :
85 bluefs(bluefs) {}
9f95a23c
TL
86 int call(std::string_view command, const cmdmap_t& cmdmap,
87 Formatter *f,
88 std::ostream& errss,
89 bufferlist& out) override {
90 if (command == "bluestore bluefs available") {
91 int64_t alloc_size = 0;
92 cmd_getval(cmdmap, "alloc_size", alloc_size);
93 if ((alloc_size & (alloc_size - 1)) != 0) {
94 errss << "Invalid allocation size:'" << alloc_size << std::endl;
95 return -EINVAL;
96 }
97 if (alloc_size == 0)
98 alloc_size = bluefs->cct->_conf->bluefs_alloc_size;
99 f->open_object_section("bluefs_available_space");
100 for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
101 if (bluefs->bdev[dev]) {
102 f->open_object_section("dev");
103 f->dump_string("device", bluefs->get_device_name(dev));
104 ceph_assert(bluefs->alloc[dev]);
105 f->dump_int("free", bluefs->alloc[dev]->get_free());
106 f->close_section();
107 }
108 }
109 size_t extra_space = 0;
110 if (bluefs->slow_dev_expander) {
111 extra_space = bluefs->slow_dev_expander->available_freespace(alloc_size);
eafe8130 112 }
9f95a23c
TL
113 f->dump_int("available_from_bluestore", extra_space);
114 f->close_section();
115 } else if (command == "bluefs stats") {
116 std::stringstream ss;
117 bluefs->dump_block_extents(ss);
118 bluefs->dump_volume_selector(ss);
eafe8130 119 out.append(ss);
cd265ab1
TL
120 } else if (command == "bluefs debug_inject_read_zeros") {
121 bluefs->inject_read_zeros++;
9f95a23c
TL
122 } else {
123 errss << "Invalid command" << std::endl;
124 return -ENOSYS;
eafe8130 125 }
9f95a23c
TL
126 return 0;
127 }
eafe8130
TL
128};
129
7c673cae
FG
130BlueFS::BlueFS(CephContext* cct)
131 : cct(cct),
132 bdev(MAX_BDEV),
133 ioc(MAX_BDEV),
11fdf7f2 134 block_all(MAX_BDEV)
7c673cae 135{
11fdf7f2
TL
136 discard_cb[BDEV_WAL] = wal_discard_cb;
137 discard_cb[BDEV_DB] = db_discard_cb;
138 discard_cb[BDEV_SLOW] = slow_discard_cb;
eafe8130 139 asok_hook = SocketHook::create(this);
7c673cae
FG
140}
141
142BlueFS::~BlueFS()
143{
eafe8130 144 delete asok_hook;
7c673cae
FG
145 for (auto p : ioc) {
146 if (p)
147 p->aio_wait();
148 }
149 for (auto p : bdev) {
150 if (p) {
151 p->close();
152 delete p;
153 }
154 }
155 for (auto p : ioc) {
156 delete p;
157 }
158}
159
160void BlueFS::_init_logger()
161{
162 PerfCountersBuilder b(cct, "bluefs",
163 l_bluefs_first, l_bluefs_last);
164 b.add_u64_counter(l_bluefs_gift_bytes, "gift_bytes",
11fdf7f2 165 "Bytes gifted from BlueStore", NULL, 0, unit_t(UNIT_BYTES));
7c673cae 166 b.add_u64_counter(l_bluefs_reclaim_bytes, "reclaim_bytes",
11fdf7f2 167 "Bytes reclaimed by BlueStore", NULL, 0, unit_t(UNIT_BYTES));
7c673cae
FG
168 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
169 "Total bytes (main db device)",
11fdf7f2 170 "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
171 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
172 "Used bytes (main db device)",
11fdf7f2 173 "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
174 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
175 "Total bytes (wal device)",
11fdf7f2 176 "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
177 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
178 "Used bytes (wal device)",
11fdf7f2 179 "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
180 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
181 "Total bytes (slow device)",
11fdf7f2 182 "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
183 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
184 "Used bytes (slow device)",
11fdf7f2 185 "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
186 b.add_u64(l_bluefs_num_files, "num_files", "File count",
187 "f", PerfCountersBuilder::PRIO_USEFUL);
188 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
11fdf7f2 189 "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
7c673cae
FG
190 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
191 "Compactions of the metadata log");
192 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
193 "Bytes written to the metadata log", "j",
11fdf7f2 194 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
7c673cae
FG
195 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
196 "Files written to WAL");
197 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
198 "Files written to SSTs");
199 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
200 "Bytes written to WAL", "wal",
201 PerfCountersBuilder::PRIO_CRITICAL);
202 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
203 "Bytes written to SSTs", "sst",
11fdf7f2
TL
204 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
205 b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
206 "Bytes written to WAL/SSTs at slow device", NULL,
207 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
208 b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
209 "Maximum bytes allocated from WAL");
210 b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
211 "Maximum bytes allocated from DB");
212 b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
213 "Maximum bytes allocated from SLOW");
494da23a
TL
214
215 b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
216 "random read requests processed");
217 b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
218 "Bytes requested in random read mode", NULL,
219 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
220 b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
221 "random reads requests going to disk");
222 b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
223 "Bytes read from disk in random read mode", NULL,
224 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
225 b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
226 "random read requests processed using prefetch buffer");
227 b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
228 "Bytes read from prefetch buffer in random read mode", NULL,
229 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
230
231 b.add_u64_counter(l_bluefs_read_count, "read_count",
232 "buffered read requests processed");
233 b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
234 "Bytes requested in buffered read mode", NULL,
235 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
236
237 b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
238 "prefetch read requests processed");
239 b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
240 "Bytes requested in prefetch read mode", NULL,
241 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
cd265ab1
TL
242 b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate",
243 "How many times bluefs read found page with all 0s");
244 b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors",
245 "How many times bluefs read found transient page with all 0s");
494da23a 246
7c673cae
FG
247 logger = b.create_perf_counters();
248 cct->get_perfcounters_collection()->add(logger);
249}
250
251void BlueFS::_shutdown_logger()
252{
253 cct->get_perfcounters_collection()->remove(logger);
254 delete logger;
255}
256
257void BlueFS::_update_logger_stats()
258{
259 // we must be holding the lock
260 logger->set(l_bluefs_num_files, file_map.size());
261 logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
262
263 if (alloc[BDEV_WAL]) {
11fdf7f2 264 logger->set(l_bluefs_wal_total_bytes, block_all[BDEV_WAL].size());
7c673cae 265 logger->set(l_bluefs_wal_used_bytes,
11fdf7f2 266 block_all[BDEV_WAL].size() - alloc[BDEV_WAL]->get_free());
7c673cae
FG
267 }
268 if (alloc[BDEV_DB]) {
11fdf7f2 269 logger->set(l_bluefs_db_total_bytes, block_all[BDEV_DB].size());
7c673cae 270 logger->set(l_bluefs_db_used_bytes,
11fdf7f2 271 block_all[BDEV_DB].size() - alloc[BDEV_DB]->get_free());
7c673cae
FG
272 }
273 if (alloc[BDEV_SLOW]) {
11fdf7f2 274 logger->set(l_bluefs_slow_total_bytes, block_all[BDEV_SLOW].size());
7c673cae 275 logger->set(l_bluefs_slow_used_bytes,
11fdf7f2 276 block_all[BDEV_SLOW].size() - alloc[BDEV_SLOW]->get_free());
7c673cae
FG
277 }
278}
279
11fdf7f2
TL
280int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
281 bool shared_with_bluestore)
7c673cae
FG
282{
283 dout(10) << __func__ << " bdev " << id << " path " << path << dendl;
11fdf7f2
TL
284 ceph_assert(id < bdev.size());
285 ceph_assert(bdev[id] == NULL);
286 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
287 discard_cb[id], static_cast<void*>(this));
288 if (shared_with_bluestore) {
289 b->set_no_exclusive_lock();
290 }
7c673cae
FG
291 int r = b->open(path);
292 if (r < 0) {
293 delete b;
294 return r;
295 }
11fdf7f2
TL
296 if (trim) {
297 b->discard(0, b->get_size());
298 }
299
7c673cae 300 dout(1) << __func__ << " bdev " << id << " path " << path
1adf2230 301 << " size " << byte_u_t(b->get_size()) << dendl;
7c673cae
FG
302 bdev[id] = b;
303 ioc[id] = new IOContext(cct, NULL);
304 return 0;
305}
306
307bool BlueFS::bdev_support_label(unsigned id)
308{
11fdf7f2
TL
309 ceph_assert(id < bdev.size());
310 ceph_assert(bdev[id]);
7c673cae
FG
311 return bdev[id]->supported_bdev_label();
312}
313
314uint64_t BlueFS::get_block_device_size(unsigned id)
315{
316 if (id < bdev.size() && bdev[id])
317 return bdev[id]->get_size();
318 return 0;
319}
320
1911f103
TL
321void BlueFS::_add_block_extent(unsigned id, uint64_t offset, uint64_t length,
322 bool skip)
7c673cae 323{
7c673cae 324 dout(1) << __func__ << " bdev " << id
11fdf7f2 325 << " 0x" << std::hex << offset << "~" << length << std::dec
1911f103 326 << " skip " << skip
7c673cae 327 << dendl;
11fdf7f2
TL
328
329 ceph_assert(id < bdev.size());
330 ceph_assert(bdev[id]);
331 ceph_assert(bdev[id]->get_size() >= offset + length);
7c673cae 332 block_all[id].insert(offset, length);
7c673cae
FG
333
334 if (id < alloc.size() && alloc[id]) {
1911f103
TL
335 if (!skip)
336 log_t.op_alloc_add(id, offset, length);
337
7c673cae
FG
338 alloc[id]->init_add_free(offset, length);
339 }
340
341 if (logger)
342 logger->inc(l_bluefs_gift_bytes, length);
343 dout(10) << __func__ << " done" << dendl;
344}
345
346int BlueFS::reclaim_blocks(unsigned id, uint64_t want,
a8e16298 347 PExtentVector *extents)
7c673cae 348{
11fdf7f2 349 std::unique_lock l(lock);
7c673cae
FG
350 dout(1) << __func__ << " bdev " << id
351 << " want 0x" << std::hex << want << std::dec << dendl;
11fdf7f2
TL
352 ceph_assert(id < alloc.size());
353 ceph_assert(alloc[id]);
9f95a23c
TL
354 int64_t got = 0;
355
356 interval_set<uint64_t> granular;
357 while (want > 0 && !block_unused_too_granular[id].empty()) {
358 auto p = block_unused_too_granular[id].begin();
359 dout(20) << __func__ << " unused " << (int)id << ":"
360 << std::hex << p.get_start() << "~" << p.get_len() << dendl;
361 extents->push_back({p.get_start(), p.get_len()});
362 granular.insert(p.get_start(), p.get_len());
363 if (want >= p.get_len()) {
364 want -= p.get_len();
365 } else {
366 want = 0;
367 }
368 got += p.get_len();
369 block_unused_too_granular[id].erase(p);
7c673cae
FG
370 }
371
9f95a23c
TL
372 if (want > 0) {
373 got += alloc[id]->allocate(want, alloc_size[id], 0, extents);
374 ceph_assert(got != 0);
375 if (got < 0) {
376 derr << __func__ << " failed to allocate space to return to bluestore"
377 << dendl;
378 alloc[id]->dump();
379 block_unused_too_granular[id].insert(granular);
380 return got;
381 }
7c673cae 382
9f95a23c
TL
383 for (auto& p : *extents) {
384 block_all[id].erase(p.offset, p.length);
385 log_t.op_alloc_rm(id, p.offset, p.length);
386 }
387
388 flush_bdev();
389 int r = _flush_and_sync_log(l);
390 ceph_assert(r == 0);
391 }
7c673cae 392
11fdf7f2 393 logger->inc(l_bluefs_reclaim_bytes, got);
7c673cae
FG
394 dout(1) << __func__ << " bdev " << id << " want 0x" << std::hex << want
395 << " got " << *extents << dendl;
396 return 0;
397}
398
11fdf7f2 399void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
7c673cae 400{
11fdf7f2
TL
401 dout(10) << __func__ << " bdev " << id << dendl;
402 ceph_assert(alloc[id]);
403 alloc[id]->release(to_release);
404}
405
406uint64_t BlueFS::get_used()
407{
408 std::lock_guard l(lock);
409 uint64_t used = 0;
410 for (unsigned id = 0; id < MAX_BDEV; ++id) {
411 if (alloc[id]) {
412 used += block_all[id].size() - alloc[id]->get_free();
413 }
7c673cae 414 }
11fdf7f2 415 return used;
7c673cae
FG
416}
417
418uint64_t BlueFS::get_total(unsigned id)
419{
11fdf7f2
TL
420 std::lock_guard l(lock);
421 ceph_assert(id < block_all.size());
422 return block_all[id].size();
7c673cae
FG
423}
424
425uint64_t BlueFS::get_free(unsigned id)
426{
11fdf7f2
TL
427 std::lock_guard l(lock);
428 ceph_assert(id < alloc.size());
7c673cae
FG
429 return alloc[id]->get_free();
430}
431
432void BlueFS::dump_perf_counters(Formatter *f)
433{
434 f->open_object_section("bluefs_perf_counters");
435 logger->dump_formatted(f,0);
436 f->close_section();
437}
438
3efd9988
FG
439void BlueFS::dump_block_extents(ostream& out)
440{
441 for (unsigned i = 0; i < MAX_BDEV; ++i) {
442 if (!bdev[i]) {
443 continue;
444 }
11fdf7f2
TL
445 auto owned = get_total(i);
446 auto free = get_free(i);
1911f103 447
11fdf7f2
TL
448 out << i << " : device size 0x" << std::hex << bdev[i]->get_size()
449 << " : own 0x" << block_all[i]
450 << " = 0x" << owned
451 << " : using 0x" << owned - free
1911f103
TL
452 << std::dec << "(" << byte_u_t(owned - free) << ")";
453 if (i == _get_slow_device_id()) {
454 ceph_assert(slow_dev_expander);
455 ceph_assert(alloc[i]);
456 free = slow_dev_expander->available_freespace(alloc_size[i]);
457 out << std::hex
458 << " : bluestore has 0x" << free
459 << std::dec << "(" << byte_u_t(free) << ") available";
460 }
461 out << "\n";
3efd9988
FG
462 }
463}
7c673cae
FG
464
465void BlueFS::get_usage(vector<pair<uint64_t,uint64_t>> *usage)
466{
11fdf7f2 467 std::lock_guard l(lock);
7c673cae
FG
468 usage->resize(bdev.size());
469 for (unsigned id = 0; id < bdev.size(); ++id) {
470 if (!bdev[id]) {
471 (*usage)[id] = make_pair(0, 0);
472 continue;
473 }
474 (*usage)[id].first = alloc[id]->get_free();
11fdf7f2 475 (*usage)[id].second = block_all[id].size();
7c673cae 476 uint64_t used =
11fdf7f2 477 (block_all[id].size() - (*usage)[id].first) * 100 / block_all[id].size();
7c673cae
FG
478 dout(10) << __func__ << " bdev " << id
479 << " free " << (*usage)[id].first
1adf2230 480 << " (" << byte_u_t((*usage)[id].first) << ")"
7c673cae 481 << " / " << (*usage)[id].second
1adf2230 482 << " (" << byte_u_t((*usage)[id].second) << ")"
7c673cae
FG
483 << ", used " << used << "%"
484 << dendl;
485 }
486}
487
488int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
489{
11fdf7f2 490 std::lock_guard l(lock);
7c673cae
FG
491 dout(10) << __func__ << " bdev " << id << dendl;
492 if (id >= block_all.size())
493 return -EINVAL;
494 *extents = block_all[id];
495 return 0;
496}
497
9f95a23c 498int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
7c673cae 499{
11fdf7f2 500 std::unique_lock l(lock);
7c673cae
FG
501 dout(1) << __func__
502 << " osd_uuid " << osd_uuid
503 << dendl;
504
9f95a23c
TL
505 // set volume selector if not provided before/outside
506 if (vselector == nullptr) {
507 vselector.reset(
508 new OriginalVolumeSelector(
509 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
510 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
511 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
512 }
513
7c673cae
FG
514 _init_alloc();
515 _init_logger();
516
517 super.version = 1;
518 super.block_size = bdev[BDEV_DB]->get_block_size();
519 super.osd_uuid = osd_uuid;
520 super.uuid.generate_random();
521 dout(1) << __func__ << " uuid " << super.uuid << dendl;
522
523 // init log
9f95a23c 524 FileRef log_file = ceph::make_ref<File>();
7c673cae 525 log_file->fnode.ino = 1;
f6b5b4d7 526 log_file->vselector_hint = vselector->get_hint_for_log();
7c673cae 527 int r = _allocate(
9f95a23c 528 vselector->select_prefer_bdev(log_file->vselector_hint),
7c673cae 529 cct->_conf->bluefs_max_log_runway,
94b18763 530 &log_file->fnode);
9f95a23c 531 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
11fdf7f2 532 ceph_assert(r == 0);
7c673cae
FG
533 log_writer = _create_writer(log_file);
534
535 // initial txn
536 log_t.op_init();
537 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
538 interval_set<uint64_t>& p = block_all[bdev];
539 if (p.empty())
540 continue;
541 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
542 dout(20) << __func__ << " op_alloc_add " << bdev << " 0x"
543 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
544 << dendl;
545 log_t.op_alloc_add(bdev, q.get_start(), q.get_len());
546 }
547 }
548 _flush_and_sync_log(l);
549
550 // write supers
551 super.log_fnode = log_file->fnode;
9f95a23c 552 super.memorized_layout = layout;
11fdf7f2 553 _write_super(BDEV_DB);
7c673cae
FG
554 flush_bdev();
555
556 // clean up
557 super = bluefs_super_t();
558 _close_writer(log_writer);
559 log_writer = NULL;
560 block_all.clear();
9f95a23c 561 vselector.reset(nullptr);
7c673cae
FG
562 _stop_alloc();
563 _shutdown_logger();
564
565 dout(10) << __func__ << " success" << dendl;
566 return 0;
567}
568
569void BlueFS::_init_alloc()
570{
571 dout(20) << __func__ << dendl;
572 alloc.resize(MAX_BDEV);
eafe8130 573 alloc_size.resize(MAX_BDEV, 0);
7c673cae 574 pending_release.resize(MAX_BDEV);
9f95a23c 575 block_unused_too_granular.resize(MAX_BDEV);
eafe8130
TL
576
577 if (bdev[BDEV_WAL]) {
578 alloc_size[BDEV_WAL] = cct->_conf->bluefs_alloc_size;
579 }
580 if (bdev[BDEV_SLOW]) {
581 alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
582 alloc_size[BDEV_SLOW] = cct->_conf->bluefs_shared_alloc_size;
583 } else {
584 alloc_size[BDEV_DB] = cct->_conf->bluefs_shared_alloc_size;
585 }
586 // new wal and db devices are never shared
587 if (bdev[BDEV_NEWWAL]) {
588 alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
589 }
590 if (bdev[BDEV_NEWDB]) {
591 alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
592 }
593
7c673cae
FG
594 for (unsigned id = 0; id < bdev.size(); ++id) {
595 if (!bdev[id]) {
596 continue;
597 }
11fdf7f2 598 ceph_assert(bdev[id]->get_size());
eafe8130
TL
599 std::string name = "bluefs-";
600 const char* devnames[] = {"wal","db","slow"};
601 if (id <= BDEV_SLOW)
602 name += devnames[id];
603 else
604 name += to_string(uintptr_t(this));
605 ceph_assert(alloc_size[id]);
606 dout(1) << __func__ << " id " << id
607 << " alloc_size 0x" << std::hex << alloc_size[id]
608 << " size 0x" << bdev[id]->get_size() << std::dec << dendl;
7c673cae
FG
609 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
610 bdev[id]->get_size(),
eafe8130 611 alloc_size[id], name);
7c673cae
FG
612 interval_set<uint64_t>& p = block_all[id];
613 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
614 alloc[id]->init_add_free(q.get_start(), q.get_len());
615 }
616 }
617}
618
619void BlueFS::_stop_alloc()
620{
621 dout(20) << __func__ << dendl;
11fdf7f2
TL
622 for (auto p : bdev) {
623 if (p)
624 p->discard_drain();
625 }
626
7c673cae
FG
627 for (auto p : alloc) {
628 if (p != nullptr) {
629 p->shutdown();
630 delete p;
631 }
632 }
633 alloc.clear();
9f95a23c 634 block_unused_too_granular.clear();
7c673cae
FG
635}
636
cd265ab1
TL
637int BlueFS::read(uint8_t ndev, uint64_t off, uint64_t len,
638 ceph::buffer::list *pbl, IOContext *ioc, bool buffered)
639{
640 dout(10) << __func__ << " dev " << int(ndev)
641 << ": 0x" << std::hex << off << "~" << len << std::dec
642 << (buffered ? " buffered" : "")
643 << dendl;
644 int r;
645 bufferlist bl;
646 r = bdev[ndev]->read(off, len, &bl, ioc, buffered);
647 if (r != 0) {
648 return r;
649 }
650 uint64_t block_size = bdev[ndev]->get_block_size();
651 if (inject_read_zeros) {
652 if (len >= block_size * 2) {
653 derr << __func__ << " injecting error, zeros at "
654 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
655 << "~" << (block_size * 2) << std::dec << dendl;
656 //use beginning, replace 8K in the middle with zeros, use tail
657 bufferlist temp;
658 bl.splice(0, len / 2 - block_size, &temp);
659 temp.append_zero(block_size * 2);
660 bl.splice(block_size * 2, len / 2 - block_size, &temp);
661 bl = temp;
662 inject_read_zeros--;
663 }
664 }
665 //make a check if there is a block with all 0
666 uint64_t to_check_len = len;
667 uint64_t skip = p2nphase(off, block_size);
668 if (skip >= to_check_len) {
669 return r;
670 }
671 auto it = bl.begin(skip);
672 to_check_len -= skip;
673 bool all_zeros = false;
674 while (all_zeros == false && to_check_len >= block_size) {
675 // checking 0s step
676 unsigned block_left = block_size;
677 unsigned avail;
678 const char* data;
679 all_zeros = true;
680 while (all_zeros && block_left > 0) {
681 avail = it.get_ptr_and_advance(block_left, &data);
682 block_left -= avail;
683 all_zeros = mem_is_zero(data, avail);
684 }
685 // skipping step
686 while (block_left > 0) {
687 avail = it.get_ptr_and_advance(block_left, &data);
688 block_left -= avail;
689 }
690 to_check_len -= block_size;
691 }
692 if (all_zeros) {
693 logger->inc(l_bluefs_read_zeros_candidate, 1);
694 bufferlist bl_reread;
695 r = bdev[ndev]->read(off, len, &bl_reread, ioc, buffered);
696 if (r != 0) {
697 return r;
698 }
699 // check if both read gave the same
700 if (!bl.contents_equal(bl_reread)) {
701 // report problems to log, but continue, maybe it will be good now...
702 derr << __func__ << " initial read of " << int(ndev)
703 << ": 0x" << std::hex << off << "~" << len
704 << std::dec << ": different then re-read " << dendl;
705 logger->inc(l_bluefs_read_zeros_errors, 1);
706 }
707 // use second read will be better if is different
708 pbl->append(bl_reread);
709 } else {
710 pbl->append(bl);
711 }
712 return r;
713}
714
715int BlueFS::read_random(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered)
716{
717 dout(10) << __func__ << " dev " << int(ndev)
718 << ": 0x" << std::hex << off << "~" << len << std::dec
719 << (buffered ? " buffered" : "")
720 << dendl;
721 int r;
722 r = bdev[ndev]->read_random(off, len, buf, buffered);
723 if (r != 0) {
724 return r;
725 }
726 uint64_t block_size = bdev[ndev]->get_block_size();
727 if (inject_read_zeros) {
728 if (len >= block_size * 2) {
729 derr << __func__ << " injecting error, zeros at "
730 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
731 << "~" << (block_size * 2) << std::dec << dendl;
732 //zero middle 8K
733 memset(buf + len / 2 - block_size, 0, block_size * 2);
734 inject_read_zeros--;
735 }
736 }
737 //make a check if there is a block with all 0
738 uint64_t to_check_len = len;
739 const char* data = buf;
740 uint64_t skip = p2nphase(off, block_size);
741 if (skip >= to_check_len) {
742 return r;
743 }
744 to_check_len -= skip;
745 data += skip;
746
747 bool all_zeros = false;
748 while (all_zeros == false && to_check_len >= block_size) {
749 if (mem_is_zero(data, block_size)) {
750 // at least one block is all zeros
751 all_zeros = true;
752 break;
753 }
754 data += block_size;
755 to_check_len -= block_size;
756 }
757 if (all_zeros) {
758 logger->inc(l_bluefs_read_zeros_candidate, 1);
759 std::unique_ptr<char[]> data_reread(new char[len]);
760 r = bdev[ndev]->read_random(off, len, &data_reread[0], buffered);
761 if (r != 0) {
762 return r;
763 }
764 // check if both read gave the same
765 if (memcmp(buf, &data_reread[0], len) != 0) {
766 derr << __func__ << " initial read of " << int(ndev)
767 << ": 0x" << std::hex << off << "~" << len
768 << std::dec << ": different then re-read " << dendl;
769 logger->inc(l_bluefs_read_zeros_errors, 1);
770 // second read is probably better
771 memcpy(buf, &data_reread[0], len);
772 }
773 }
774 return r;
775}
776
7c673cae
FG
777int BlueFS::mount()
778{
779 dout(1) << __func__ << dendl;
780
781 int r = _open_super();
782 if (r < 0) {
783 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
784 goto out;
785 }
786
9f95a23c
TL
787 // set volume selector if not provided before/outside
788 if (vselector == nullptr) {
789 vselector.reset(
790 new OriginalVolumeSelector(
791 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
792 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
793 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
794 }
795
7c673cae
FG
796 block_all.clear();
797 block_all.resize(MAX_BDEV);
7c673cae 798 _init_alloc();
494da23a 799 _init_logger();
7c673cae 800
11fdf7f2 801 r = _replay(false, false);
7c673cae
FG
802 if (r < 0) {
803 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
804 _stop_alloc();
805 goto out;
806 }
807
808 // init freelist
809 for (auto& p : file_map) {
810 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
811 for (auto& q : p.second->fnode.extents) {
812 alloc[q.bdev]->init_rm_free(q.offset, q.length);
813 }
814 }
815
816 // set up the log for future writes
817 log_writer = _create_writer(_get_file(1));
11fdf7f2 818 ceph_assert(log_writer->file->fnode.ino == 1);
7c673cae
FG
819 log_writer->pos = log_writer->file->fnode.size;
820 dout(10) << __func__ << " log write pos set to 0x"
821 << std::hex << log_writer->pos << std::dec
822 << dendl;
823
7c673cae
FG
824 return 0;
825
826 out:
827 super = bluefs_super_t();
828 return r;
829}
830
9f95a23c
TL
831int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const
832{
833 if (super.memorized_layout) {
834 if (layout == *super.memorized_layout) {
835 dout(10) << __func__ << " bluefs layout verified positively" << dendl;
836 } else {
837 derr << __func__ << " memorized layout doesn't fit current one" << dendl;
838 return -EIO;
839 }
840 } else {
841 dout(10) << __func__ << " no memorized_layout in bluefs superblock"
842 << dendl;
843 }
844
845 return 0;
846}
847
1911f103 848void BlueFS::umount(bool avoid_compact)
7c673cae
FG
849{
850 dout(1) << __func__ << dendl;
851
1911f103 852 sync_metadata(avoid_compact);
7c673cae
FG
853
854 _close_writer(log_writer);
855 log_writer = NULL;
856
9f95a23c 857 vselector.reset(nullptr);
7c673cae
FG
858 _stop_alloc();
859 file_map.clear();
860 dir_map.clear();
861 super = bluefs_super_t();
862 log_t.clear();
863 _shutdown_logger();
864}
865
9f95a23c 866int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
7c673cae 867{
11fdf7f2
TL
868 dout(1) << __func__ << dendl;
869
870 if(id == BDEV_NEWDB) {
871 int new_log_dev_cur = BDEV_WAL;
872 int new_log_dev_next = BDEV_WAL;
873 if (!bdev[BDEV_WAL]) {
874 new_log_dev_cur = BDEV_NEWDB;
875 new_log_dev_next = BDEV_DB;
876 }
9f95a23c 877 _rewrite_log_and_layout_sync(false,
11fdf7f2
TL
878 BDEV_NEWDB,
879 new_log_dev_cur,
880 new_log_dev_next,
9f95a23c
TL
881 RENAME_DB2SLOW,
882 layout);
11fdf7f2
TL
883 //}
884 } else if(id == BDEV_NEWWAL) {
9f95a23c
TL
885 _rewrite_log_and_layout_sync(false,
886 BDEV_DB,
887 BDEV_NEWWAL,
888 BDEV_WAL,
889 REMOVE_WAL,
890 layout);
11fdf7f2
TL
891 } else {
892 assert(false);
893 }
894 return 0;
895}
896
897void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
898{
899 if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
7c673cae
FG
900 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
901 if (bdev[BDEV_WAL])
902 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
11fdf7f2
TL
903}
904
905void BlueFS::get_devices(set<string> *ls)
906{
907 for (unsigned i = 0; i < MAX_BDEV; ++i) {
908 if (bdev[i]) {
909 bdev[i]->get_devices(ls);
910 }
911 }
7c673cae
FG
912}
913
914int BlueFS::fsck()
915{
11fdf7f2 916 std::lock_guard l(lock);
7c673cae
FG
917 dout(1) << __func__ << dendl;
918 // hrm, i think we check everything on mount...
919 return 0;
920}
921
11fdf7f2 922int BlueFS::_write_super(int dev)
7c673cae
FG
923{
924 // build superblock
925 bufferlist bl;
11fdf7f2 926 encode(super, bl);
7c673cae 927 uint32_t crc = bl.crc32c(-1);
11fdf7f2 928 encode(crc, bl);
7c673cae
FG
929 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
930 dout(10) << __func__ << " superblock " << super.version << dendl;
931 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
9f95a23c 932 ceph_assert_always(bl.length() <= get_super_length());
7c673cae
FG
933 bl.append_zero(get_super_length() - bl.length());
934
11fdf7f2 935 bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
7c673cae
FG
936 dout(20) << __func__ << " v " << super.version
937 << " crc 0x" << std::hex << crc
938 << " offset 0x" << get_super_offset() << std::dec
939 << dendl;
940 return 0;
941}
942
943int BlueFS::_open_super()
944{
945 dout(10) << __func__ << dendl;
946
947 bufferlist bl;
948 uint32_t expected_crc, crc;
949 int r;
950
951 // always the second block
952 r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
953 &bl, ioc[BDEV_DB], false);
954 if (r < 0)
955 return r;
956
11fdf7f2
TL
957 auto p = bl.cbegin();
958 decode(super, p);
7c673cae
FG
959 {
960 bufferlist t;
961 t.substr_of(bl, 0, p.get_off());
962 crc = t.crc32c(-1);
963 }
11fdf7f2 964 decode(expected_crc, p);
7c673cae
FG
965 if (crc != expected_crc) {
966 derr << __func__ << " bad crc on superblock, expected 0x"
967 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
968 << dendl;
969 return -EIO;
970 }
971 dout(10) << __func__ << " superblock " << super.version << dendl;
972 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
973 return 0;
974}
975
9f95a23c
TL
976int BlueFS::_check_new_allocations(const bluefs_fnode_t& fnode,
977 size_t dev_count,
978 boost::dynamic_bitset<uint64_t>* owned_blocks,
979 boost::dynamic_bitset<uint64_t>* used_blocks)
980{
981 auto& fnode_extents = fnode.extents;
982 for (auto e : fnode_extents) {
983 auto id = e.bdev;
984 bool fail = false;
985 ceph_assert(id < dev_count);
986 apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id],
987 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
988 if (!bs.test(pos)) {
989 fail = true;
990 }
991 }
992 );
993 if (fail) {
994 derr << __func__ << " invalid extent " << int(id)
995 << ": 0x" << std::hex << e.offset << "~" << e.length
996 << std::dec
997 << ": wasn't given but allocated for ino " << fnode.ino
998 << dendl;
999 return -EFAULT;
1000 }
1001
1002 apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
1003 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1004 if (bs.test(pos)) {
1005 fail = true;
1006 }
1007 bs.set(pos);
1008 }
1009 );
1010 if (fail) {
1011 derr << __func__ << " invalid extent " << int(e.bdev)
1012 << ": 0x" << std::hex << e.offset << "~" << e.length
1013 << std::dec << ": duplicate reference, ino " << fnode.ino
1014 << dendl;
1015 return -EFAULT;
1016 }
1017 }
1018 return 0;
1019}
1020
1021int BlueFS::_adjust_granularity(
1022 __u8 id, uint64_t *offset, uint64_t *length, bool alloc)
1023{
1024 const char *op = alloc ? "op_alloc_add" : "op_alloc_rm";
1025 auto oldo = *offset;
1026 auto oldl = *length;
1027 if (*offset & (alloc_size[id] - 1)) {
1028 *offset &= ~(alloc_size[id] - 1);
1029 *offset += alloc_size[id];
1030 if (*length > *offset - oldo) {
1031 if (alloc) {
1032 block_unused_too_granular[id].insert(oldo, *offset - oldo);
1033 } else {
1034 block_unused_too_granular[id].erase(oldo, *offset - oldo);
1035 }
1036 *length -= (*offset - oldo);
1037 } else {
1038 if (alloc) {
1039 block_unused_too_granular[id].insert(oldo, *length);
1040 } else {
1041 block_unused_too_granular[id].erase(oldo, *length);
1042 }
1043 *length = 0;
1044 }
1045 }
1046 if (*length & (alloc_size[id] - 1)) {
1047 *length &= ~(alloc_size[id] - 1);
1048 if (alloc) {
1049 block_unused_too_granular[id].insert(
1050 *offset + *length,
1051 oldo + oldl - *offset - *length);
1052 } else {
1053 block_unused_too_granular[id].erase(
1054 *offset + *length,
1055 oldo + oldl - *offset - *length);
1056 }
1057 }
1058 if (oldo != *offset || oldl != *length) {
1059 dout(10) << __func__ << " " << op << " "
1060 << (int)id << ":" << std::hex << oldo << "~" << oldl
1061 << " -> " << (int)id << ":" << *offset << "~" << *length << dendl;
1062 }
1063 return 0;
1064}
1065
1066int BlueFS::_verify_alloc_granularity(
1067 __u8 id, uint64_t offset, uint64_t length, const char *op)
1068{
1069 if ((offset & (alloc_size[id] - 1)) ||
1070 (length & (alloc_size[id] - 1))) {
1071 derr << __func__ << " " << op << " of " << (int)id
1072 << ":0x" << std::hex << offset << "~" << length << std::dec
1073 << " does not align to alloc_size 0x"
1074 << std::hex << alloc_size[id] << std::dec << dendl;
1075 // be helpful
1076 auto need = alloc_size[id];
1077 while (need && ((offset & (need - 1)) ||
1078 (length & (need - 1)))) {
1079 need >>= 1;
1080 }
1081 if (need) {
1082 const char *which;
1083 if (id == BDEV_SLOW ||
1084 (id == BDEV_DB && !bdev[BDEV_SLOW])) {
1085 which = "bluefs_shared_alloc_size";
1086 } else {
1087 which = "bluefs_alloc_size";
1088 }
1089 derr << "work-around by setting " << which << " = " << need
1090 << " for this OSD" << dendl;
1091 }
1092 return -EFAULT;
1093 }
1094 return 0;
1095}
1096
11fdf7f2 1097int BlueFS::_replay(bool noop, bool to_stdout)
7c673cae
FG
1098{
1099 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
1100 ino_last = 1; // by the log
1101 log_seq = 0;
1102
1103 FileRef log_file;
11fdf7f2 1104 log_file = _get_file(1);
9f95a23c
TL
1105
1106 // sanity check
1107 for (auto& a : block_unused_too_granular) {
1108 ceph_assert(a.empty());
1109 }
1110
11fdf7f2
TL
1111 if (!noop) {
1112 log_file->fnode = super.log_fnode;
9f95a23c 1113 log_file->vselector_hint =
f6b5b4d7 1114 vselector->get_hint_for_log();
7c673cae 1115 } else {
11fdf7f2
TL
1116 // do not use fnode from superblock in 'noop' mode - log_file's one should
1117 // be fine and up-to-date
1118 ceph_assert(log_file->fnode.ino == 1);
1119 ceph_assert(log_file->fnode.extents.size() != 0);
7c673cae 1120 }
7c673cae 1121 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
11fdf7f2
TL
1122 if (unlikely(to_stdout)) {
1123 std::cout << " log_fnode " << super.log_fnode << std::endl;
1124 }
7c673cae
FG
1125
1126 FileReader *log_reader = new FileReader(
1127 log_file, cct->_conf->bluefs_max_prefetch,
1128 false, // !random
1129 true); // ignore eof
9f95a23c
TL
1130
1131 bool seen_recs = false;
1132
1133 boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
1134 boost::dynamic_bitset<uint64_t> owned_blocks[MAX_BDEV];
1135
1136 if (cct->_conf->bluefs_log_replay_check_allocations) {
1137 for (size_t i = 0; i < MAX_BDEV; ++i) {
1138 if (alloc_size[i] != 0 && bdev[i] != nullptr) {
1139 used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
1140 owned_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
1141 }
1142 }
1143 }
1144
1145 bool first_log_check = true;
1146
7c673cae 1147 while (true) {
11fdf7f2 1148 ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
7c673cae
FG
1149 uint64_t pos = log_reader->buf.pos;
1150 uint64_t read_pos = pos;
1151 bufferlist bl;
1152 {
1153 int r = _read(log_reader, &log_reader->buf, read_pos, super.block_size,
1154 &bl, NULL);
f6b5b4d7
TL
1155 if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) {
1156 r += do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl);
1157 }
1158 assert(r == (int)super.block_size);
7c673cae
FG
1159 read_pos += r;
1160 }
1161 uint64_t more = 0;
1162 uint64_t seq;
1163 uuid_d uuid;
1164 {
11fdf7f2 1165 auto p = bl.cbegin();
7c673cae
FG
1166 __u8 a, b;
1167 uint32_t len;
11fdf7f2
TL
1168 decode(a, p);
1169 decode(b, p);
1170 decode(len, p);
1171 decode(uuid, p);
1172 decode(seq, p);
7c673cae 1173 if (len + 6 > bl.length()) {
11fdf7f2 1174 more = round_up_to(len + 6 - bl.length(), super.block_size);
7c673cae
FG
1175 }
1176 }
1177 if (uuid != super.uuid) {
9f95a23c
TL
1178 if (seen_recs) {
1179 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1180 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1181 << dendl;
1182 } else {
1183 derr << __func__ << " 0x" << std::hex << pos << std::dec
1184 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1185 << ", block dump: \n";
1186 bufferlist t;
1187 t.substr_of(bl, 0, super.block_size);
1188 t.hexdump(*_dout);
1189 *_dout << dendl;
1190 }
7c673cae
FG
1191 break;
1192 }
1193 if (seq != log_seq + 1) {
9f95a23c
TL
1194 if (seen_recs) {
1195 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1196 << ": stop: seq " << seq << " != expected " << log_seq + 1
1197 << dendl;;
1198 } else {
1199 derr << __func__ << " 0x" << std::hex << pos << std::dec
1200 << ": stop: seq " << seq << " != expected " << log_seq + 1
1201 << dendl;;
1202 }
7c673cae
FG
1203 break;
1204 }
1205 if (more) {
1206 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
1207 << " more bytes" << dendl;
1208 bufferlist t;
1209 int r = _read(log_reader, &log_reader->buf, read_pos, more, &t, NULL);
1210 if (r < (int)more) {
f6b5b4d7
TL
1211 dout(10) << __func__ << " 0x" << std::hex << pos
1212 << ": stop: len is 0x" << bl.length() + more << std::dec
1213 << ", which is past eof" << dendl;
1214 if (cct->_conf->bluefs_replay_recovery) {
1215 //try to search for more data
1216 r += do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t);
1217 if (r < (int)more) {
1218 //in normal mode we must read r==more, for recovery it is too strict
1219 break;
1220 }
1221 }
7c673cae 1222 }
11fdf7f2 1223 ceph_assert(r == (int)more);
7c673cae
FG
1224 bl.claim_append(t);
1225 read_pos += r;
1226 }
9f95a23c 1227 seen_recs = true;
7c673cae
FG
1228 bluefs_transaction_t t;
1229 try {
11fdf7f2
TL
1230 auto p = bl.cbegin();
1231 decode(t, p);
7c673cae
FG
1232 }
1233 catch (buffer::error& e) {
9f95a23c
TL
1234 derr << __func__ << " 0x" << std::hex << pos << std::dec
1235 << ": stop: failed to decode: " << e.what()
1236 << dendl;
7c673cae
FG
1237 delete log_reader;
1238 return -EIO;
1239 }
11fdf7f2 1240 ceph_assert(seq == t.seq);
7c673cae
FG
1241 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1242 << ": " << t << dendl;
11fdf7f2
TL
1243 if (unlikely(to_stdout)) {
1244 std::cout << " 0x" << std::hex << pos << std::dec
1245 << ": " << t << std::endl;
1246 }
7c673cae 1247
11fdf7f2 1248 auto p = t.op_bl.cbegin();
7c673cae
FG
1249 while (!p.end()) {
1250 __u8 op;
11fdf7f2 1251 decode(op, p);
7c673cae
FG
1252 switch (op) {
1253
1254 case bluefs_transaction_t::OP_INIT:
1255 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1256 << ": op_init" << dendl;
11fdf7f2
TL
1257 if (unlikely(to_stdout)) {
1258 std::cout << " 0x" << std::hex << pos << std::dec
1259 << ": op_init" << std::endl;
1260 }
1261
1262 ceph_assert(t.seq == 1);
7c673cae
FG
1263 break;
1264
1265 case bluefs_transaction_t::OP_JUMP:
1266 {
1267 uint64_t next_seq;
1268 uint64_t offset;
11fdf7f2
TL
1269 decode(next_seq, p);
1270 decode(offset, p);
7c673cae
FG
1271 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1272 << ": op_jump seq " << next_seq
1273 << " offset 0x" << std::hex << offset << std::dec << dendl;
11fdf7f2
TL
1274 if (unlikely(to_stdout)) {
1275 std::cout << " 0x" << std::hex << pos << std::dec
1276 << ": op_jump seq " << next_seq
1277 << " offset 0x" << std::hex << offset << std::dec
1278 << std::endl;
1279 }
1280
1281 ceph_assert(next_seq >= log_seq);
7c673cae
FG
1282 log_seq = next_seq - 1; // we will increment it below
1283 uint64_t skip = offset - read_pos;
1284 if (skip) {
1285 bufferlist junk;
1286 int r = _read(log_reader, &log_reader->buf, read_pos, skip, &junk,
1287 NULL);
1288 if (r != (int)skip) {
1289 dout(10) << __func__ << " 0x" << std::hex << read_pos
1290 << ": stop: failed to skip to " << offset
1291 << std::dec << dendl;
11fdf7f2 1292 ceph_abort_msg("problem with op_jump");
7c673cae
FG
1293 }
1294 }
1295 }
1296 break;
1297
1298 case bluefs_transaction_t::OP_JUMP_SEQ:
1299 {
1300 uint64_t next_seq;
11fdf7f2 1301 decode(next_seq, p);
7c673cae
FG
1302 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1303 << ": op_jump_seq " << next_seq << dendl;
11fdf7f2
TL
1304 if (unlikely(to_stdout)) {
1305 std::cout << " 0x" << std::hex << pos << std::dec
1306 << ": op_jump_seq " << next_seq << std::endl;
1307 }
1308
1309 ceph_assert(next_seq >= log_seq);
7c673cae
FG
1310 log_seq = next_seq - 1; // we will increment it below
1311 }
1312 break;
1313
1314 case bluefs_transaction_t::OP_ALLOC_ADD:
1315 {
1316 __u8 id;
1317 uint64_t offset, length;
11fdf7f2
TL
1318 decode(id, p);
1319 decode(offset, p);
1320 decode(length, p);
7c673cae
FG
1321 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1322 << ": op_alloc_add " << " " << (int)id
1323 << ":0x" << std::hex << offset << "~" << length << std::dec
1324 << dendl;
11fdf7f2
TL
1325 if (unlikely(to_stdout)) {
1326 std::cout << " 0x" << std::hex << pos << std::dec
1327 << ": op_alloc_add " << " " << (int)id
1328 << ":0x" << std::hex << offset << "~" << length << std::dec
1329 << std::endl;
1330 }
7c673cae
FG
1331 if (!noop) {
1332 block_all[id].insert(offset, length);
9f95a23c
TL
1333 _adjust_granularity(id, &offset, &length, true);
1334 if (length) {
1335 alloc[id]->init_add_free(offset, length);
1336 }
1337
1338 if (cct->_conf->bluefs_log_replay_check_allocations) {
1339 bool fail = false;
1340 apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id],
1341 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1342 if (bs.test(pos)) {
1343 fail = true;
1344 } else {
1345 bs.set(pos);
1346 }
1347 }
1348 );
1349 if (fail) {
1350 derr << __func__ << " invalid extent " << (int)id
1351 << ": 0x" << std::hex << offset << "~" << length
1352 << std::dec << ": already given" << dendl;
1353 return -EFAULT;
1354 }
1355 apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id],
1356 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1357 if (bs.test(pos)) {
1358 fail = true;
1359 }
1360 }
1361 );
1362 if (fail) {
1363 derr << __func__ << " invalid extent " << int(id)
1364 << ": 0x" << std::hex << offset << "~" << length
1365 << std::dec << ": already in use" << dendl;
1366 return -EFAULT;
1367 }
1368 }
7c673cae
FG
1369 }
1370 }
1371 break;
1372
1373 case bluefs_transaction_t::OP_ALLOC_RM:
1374 {
1375 __u8 id;
1376 uint64_t offset, length;
11fdf7f2
TL
1377 decode(id, p);
1378 decode(offset, p);
1379 decode(length, p);
7c673cae
FG
1380 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1381 << ": op_alloc_rm " << " " << (int)id
1382 << ":0x" << std::hex << offset << "~" << length << std::dec
1383 << dendl;
11fdf7f2
TL
1384 if (unlikely(to_stdout)) {
1385 std::cout << " 0x" << std::hex << pos << std::dec
1386 << ": op_alloc_rm " << " " << (int)id
1387 << ":0x" << std::hex << offset << "~" << length << std::dec
1388 << std::endl;
1389 }
7c673cae
FG
1390 if (!noop) {
1391 block_all[id].erase(offset, length);
9f95a23c
TL
1392 _adjust_granularity(id, &offset, &length, false);
1393 if (length) {
1394 alloc[id]->init_rm_free(offset, length);
1395 }
1396 if (cct->_conf->bluefs_log_replay_check_allocations) {
1397 bool fail = false;
1398 apply_for_bitset_range(offset, length, alloc_size[id], owned_blocks[id],
1399 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1400 if (!bs.test(pos)) {
1401 fail = true;
1402 } else {
1403 bs.reset(pos);
1404 }
1405 }
1406 );
1407 if (fail) {
1408 derr << __func__ << " invalid extent " << int(id)
1409 << ": 0x" << std::hex << offset << "~" << length
1410 << std::dec << ": wasn't given" << dendl;
1411 return -EFAULT;
1412 }
1413
1414 apply_for_bitset_range(offset, length, alloc_size[id], used_blocks[id],
1415 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1416 if (bs.test(pos)) {
1417 fail = true;
1418 }
1419 }
1420 );
1421 if (fail) {
1422 derr << __func__ << " invalid extent " << (int)id
1423 << ": 0x" << std::hex << offset << "~" << length
1424 << std::dec << ": still in use" << dendl;
1425 return -EFAULT;
1426 }
1427 }
1428 }
7c673cae
FG
1429 }
1430 break;
1431
1432 case bluefs_transaction_t::OP_DIR_LINK:
1433 {
1434 string dirname, filename;
1435 uint64_t ino;
11fdf7f2
TL
1436 decode(dirname, p);
1437 decode(filename, p);
1438 decode(ino, p);
7c673cae
FG
1439 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1440 << ": op_dir_link " << " " << dirname << "/" << filename
1441 << " to " << ino
1442 << dendl;
11fdf7f2
TL
1443 if (unlikely(to_stdout)) {
1444 std::cout << " 0x" << std::hex << pos << std::dec
1445 << ": op_dir_link " << " " << dirname << "/" << filename
1446 << " to " << ino
1447 << std::endl;
1448 }
1449
7c673cae
FG
1450 if (!noop) {
1451 FileRef file = _get_file(ino);
11fdf7f2 1452 ceph_assert(file->fnode.ino);
7c673cae 1453 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 1454 ceph_assert(q != dir_map.end());
7c673cae 1455 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2 1456 ceph_assert(r == q->second->file_map.end());
9f95a23c
TL
1457
1458 vselector->sub_usage(file->vselector_hint, file->fnode);
1459 file->vselector_hint =
1460 vselector->get_hint_by_dir(dirname);
1461 vselector->add_usage(file->vselector_hint, file->fnode);
1462
7c673cae
FG
1463 q->second->file_map[filename] = file;
1464 ++file->refs;
1465 }
1466 }
1467 break;
1468
1469 case bluefs_transaction_t::OP_DIR_UNLINK:
1470 {
1471 string dirname, filename;
11fdf7f2
TL
1472 decode(dirname, p);
1473 decode(filename, p);
7c673cae
FG
1474 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1475 << ": op_dir_unlink " << " " << dirname << "/" << filename
1476 << dendl;
11fdf7f2
TL
1477 if (unlikely(to_stdout)) {
1478 std::cout << " 0x" << std::hex << pos << std::dec
1479 << ": op_dir_unlink " << " " << dirname << "/" << filename
1480 << std::endl;
1481 }
1482
7c673cae
FG
1483 if (!noop) {
1484 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 1485 ceph_assert(q != dir_map.end());
7c673cae 1486 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2
TL
1487 ceph_assert(r != q->second->file_map.end());
1488 ceph_assert(r->second->refs > 0);
7c673cae
FG
1489 --r->second->refs;
1490 q->second->file_map.erase(r);
1491 }
1492 }
1493 break;
1494
1495 case bluefs_transaction_t::OP_DIR_CREATE:
1496 {
1497 string dirname;
11fdf7f2 1498 decode(dirname, p);
7c673cae
FG
1499 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1500 << ": op_dir_create " << dirname << dendl;
11fdf7f2
TL
1501 if (unlikely(to_stdout)) {
1502 std::cout << " 0x" << std::hex << pos << std::dec
1503 << ": op_dir_create " << dirname << std::endl;
1504 }
1505
7c673cae
FG
1506 if (!noop) {
1507 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 1508 ceph_assert(q == dir_map.end());
9f95a23c 1509 dir_map[dirname] = ceph::make_ref<Dir>();
7c673cae
FG
1510 }
1511 }
1512 break;
1513
1514 case bluefs_transaction_t::OP_DIR_REMOVE:
1515 {
1516 string dirname;
11fdf7f2 1517 decode(dirname, p);
7c673cae
FG
1518 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1519 << ": op_dir_remove " << dirname << dendl;
11fdf7f2
TL
1520 if (unlikely(to_stdout)) {
1521 std::cout << " 0x" << std::hex << pos << std::dec
1522 << ": op_dir_remove " << dirname << std::endl;
1523 }
1524
7c673cae
FG
1525 if (!noop) {
1526 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2
TL
1527 ceph_assert(q != dir_map.end());
1528 ceph_assert(q->second->file_map.empty());
7c673cae
FG
1529 dir_map.erase(q);
1530 }
1531 }
1532 break;
1533
1534 case bluefs_transaction_t::OP_FILE_UPDATE:
1535 {
1536 bluefs_fnode_t fnode;
11fdf7f2 1537 decode(fnode, p);
7c673cae 1538 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
9f95a23c 1539 << ": op_file_update " << " " << fnode << " " << dendl;
11fdf7f2
TL
1540 if (unlikely(to_stdout)) {
1541 std::cout << " 0x" << std::hex << pos << std::dec
1542 << ": op_file_update " << " " << fnode << std::endl;
1543 }
9f95a23c 1544 if (!noop) {
7c673cae 1545 FileRef f = _get_file(fnode.ino);
9f95a23c
TL
1546 if (cct->_conf->bluefs_log_replay_check_allocations) {
1547 // check initial log layout
1548 if (first_log_check) {
1549 first_log_check = false;
1550 int r = _check_new_allocations(log_file->fnode,
1551 MAX_BDEV, owned_blocks, used_blocks);
1552 if (r < 0) {
1553 return r;
1554 }
1555 }
1556
1557 auto& fnode_extents = f->fnode.extents;
1558 for (auto e : fnode_extents) {
1559 auto id = e.bdev;
1560 if (int r = _verify_alloc_granularity(id, e.offset, e.length,
1561 "OP_FILE_UPDATE"); r < 0) {
1562 return r;
1563 }
1564 apply_for_bitset_range(e.offset, e.length, alloc_size[id],
1565 used_blocks[id],
1566 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1567 ceph_assert(bs.test(pos));
1568 bs.reset(pos);
1569 }
1570 );
1571 }
1572 }
1573
1574 if (fnode.ino != 1) {
1575 vselector->sub_usage(f->vselector_hint, f->fnode);
1576 }
1577 f->fnode = fnode;
1578 if (fnode.ino != 1) {
1579 vselector->add_usage(f->vselector_hint, f->fnode);
1580 }
1581
7c673cae
FG
1582 if (fnode.ino > ino_last) {
1583 ino_last = fnode.ino;
1584 }
9f95a23c
TL
1585 if (cct->_conf->bluefs_log_replay_check_allocations) {
1586 int r = _check_new_allocations(f->fnode,
1587 MAX_BDEV, owned_blocks, used_blocks);
1588 if (r < 0) {
1589 return r;
1590 }
1591 }
7c673cae 1592 }
9f95a23c 1593 }
7c673cae
FG
1594 break;
1595
1596 case bluefs_transaction_t::OP_FILE_REMOVE:
1597 {
1598 uint64_t ino;
11fdf7f2 1599 decode(ino, p);
7c673cae
FG
1600 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1601 << ": op_file_remove " << ino << dendl;
11fdf7f2
TL
1602 if (unlikely(to_stdout)) {
1603 std::cout << " 0x" << std::hex << pos << std::dec
1604 << ": op_file_remove " << ino << std::endl;
1605 }
1606
9f95a23c
TL
1607 if (!noop) {
1608 auto p = file_map.find(ino);
1609 ceph_assert(p != file_map.end());
1610 vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
1611 if (cct->_conf->bluefs_log_replay_check_allocations) {
1612 auto& fnode_extents = p->second->fnode.extents;
1613 for (auto e : fnode_extents) {
1614 auto id = e.bdev;
1615 bool fail = false;
1616 apply_for_bitset_range(e.offset, e.length, alloc_size[id], owned_blocks[id],
1617 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1618 if (!bs.test(pos)) {
1619 fail = true;
1620 }
1621 }
1622 );
1623 if (fail) {
1624 derr << __func__ << " invalid extent " << int(id)
1625 << ": 0x" << std::hex << e.offset << "~" << e.length
1626 << std::dec
1627 << ": wasn't given but is allocated for removed ino " << ino
1628 << dendl;
1629 return -EFAULT;
1630 }
1631
1632 apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
1633 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1634 if (!bs.test(pos)) {
1635 fail = true;
1636 }
1637 bs.reset(pos);
1638 }
1639 );
1640 if (fail) {
1641 derr << __func__ << " invalid extent " << int(id)
1642 << ": 0x" << std::hex << e.offset << "~" << e.length
1643 << std::dec
1644 << ": not in use but is allocated for removed ino " << ino
1645 << dendl;
1646 return -EFAULT;
1647 }
1648 }
1649 }
1650 file_map.erase(p);
1651 }
1652 }
7c673cae
FG
1653 break;
1654
1655 default:
1656 derr << __func__ << " 0x" << std::hex << pos << std::dec
1657 << ": stop: unrecognized op " << (int)op << dendl;
1658 delete log_reader;
1659 return -EIO;
1660 }
1661 }
11fdf7f2 1662 ceph_assert(p.end());
7c673cae
FG
1663
1664 // we successfully replayed the transaction; bump the seq and log size
1665 ++log_seq;
1666 log_file->fnode.size = log_reader->buf.pos;
1667 }
9f95a23c
TL
1668 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
1669
1670 if (!noop && first_log_check &&
1671 cct->_conf->bluefs_log_replay_check_allocations) {
1672 int r = _check_new_allocations(log_file->fnode,
1673 MAX_BDEV, owned_blocks, used_blocks);
1674 if (r < 0) {
1675 return r;
1676 }
1677 }
7c673cae
FG
1678
1679 dout(10) << __func__ << " log file size was 0x"
1680 << std::hex << log_file->fnode.size << std::dec << dendl;
11fdf7f2
TL
1681 if (unlikely(to_stdout)) {
1682 std::cout << " log file size was 0x"
1683 << std::hex << log_file->fnode.size << std::dec << std::endl;
1684 }
1685
7c673cae
FG
1686 delete log_reader;
1687
1688 if (!noop) {
1689 // verify file link counts are all >0
1690 for (auto& p : file_map) {
1691 if (p.second->refs == 0 &&
1692 p.second->fnode.ino > 1) {
1693 derr << __func__ << " file with link count 0: " << p.second->fnode
1694 << dendl;
1695 return -EIO;
1696 }
1697 }
1698 }
1699
9f95a23c
TL
1700 for (unsigned id = 0; id < MAX_BDEV; ++id) {
1701 dout(10) << __func__ << " block_unused_too_granular " << id << ": "
1702 << block_unused_too_granular[id] << dendl;
1703 }
7c673cae
FG
1704 dout(10) << __func__ << " done" << dendl;
1705 return 0;
1706}
1707
11fdf7f2
TL
1708int BlueFS::log_dump()
1709{
1710 // only dump log file's content
1711 int r = _replay(true, true);
1712 if (r < 0) {
1713 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
1714 return r;
1715 }
1716
1717 return 0;
1718}
1719
1720int BlueFS::device_migrate_to_existing(
1721 CephContext *cct,
1722 const set<int>& devs_source,
9f95a23c
TL
1723 int dev_target,
1724 const bluefs_layout_t& layout)
11fdf7f2
TL
1725{
1726 vector<byte> buf;
1727 bool buffered = cct->_conf->bluefs_buffered_io;
1728
eafe8130
TL
1729 dout(10) << __func__ << " devs_source " << devs_source
1730 << " dev_target " << dev_target << dendl;
11fdf7f2
TL
1731 assert(dev_target < (int)MAX_BDEV);
1732
1733 int flags = 0;
1734 flags |= devs_source.count(BDEV_DB) ?
1735 (REMOVE_DB | RENAME_SLOW2DB) : 0;
1736 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1737 int dev_target_new = dev_target;
1738
1739 // Slow device without separate DB one is addressed via BDEV_DB
1740 // Hence need renaming.
1741 if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
1742 dev_target_new = BDEV_DB;
1743 dout(0) << __func__ << " super to be written to " << dev_target << dendl;
1744 }
1745
9f95a23c 1746 for (auto& [ino, file_ref] : file_map) {
11fdf7f2 1747 //do not copy log
9f95a23c 1748 if (file_ref->fnode.ino == 1) {
11fdf7f2
TL
1749 continue;
1750 }
9f95a23c 1751 dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
eafe8130 1752
9f95a23c 1753 auto& fnode_extents = file_ref->fnode.extents;
11fdf7f2 1754
9f95a23c
TL
1755 bool rewrite = std::any_of(
1756 fnode_extents.begin(),
1757 fnode_extents.end(),
1758 [=](auto& ext) {
1759 return ext.bdev != dev_target && devs_source.count(ext.bdev);
1760 });
eafe8130
TL
1761 if (rewrite) {
1762 dout(10) << __func__ << " migrating" << dendl;
1763
1764 // read entire file
1765 bufferlist bl;
1766 for (auto old_ext : fnode_extents) {
1767 buf.resize(old_ext.length);
1768 int r = bdev[old_ext.bdev]->read_random(
1769 old_ext.offset,
1770 old_ext.length,
1771 (char*)&buf.at(0),
1772 buffered);
1773 if (r != 0) {
1774 derr << __func__ << " failed to read 0x" << std::hex
1775 << old_ext.offset << "~" << old_ext.length << std::dec
1776 << " from " << (int)dev_target << dendl;
1777 return -EIO;
1778 }
1779 bl.append((char*)&buf[0], old_ext.length);
1780 }
11fdf7f2 1781
eafe8130
TL
1782 // write entire file
1783 PExtentVector extents;
1784 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1785 if (l < 0) {
1786 derr << __func__ << " unable to allocate len 0x" << std::hex
1787 << bl.length() << std::dec << " from " << (int)dev_target
1788 << ": " << cpp_strerror(l) << dendl;
1789 return -ENOSPC;
1790 }
11fdf7f2 1791
eafe8130
TL
1792 uint64_t off = 0;
1793 for (auto& i : extents) {
1794 bufferlist cur;
1795 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1796 ceph_assert(cur_len > 0);
1797 cur.substr_of(bl, off, cur_len);
1798 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1799 ceph_assert(r == 0);
1800 off += cur_len;
1801 }
1802
1803 // release old extents
1804 for (auto old_ext : fnode_extents) {
1805 PExtentVector to_release;
1806 to_release.emplace_back(old_ext.offset, old_ext.length);
1807 alloc[old_ext.bdev]->release(to_release);
1808 }
1809
1810 // update fnode
1811 fnode_extents.clear();
1812 for (auto& i : extents) {
1813 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
1814 }
1815 } else {
9f95a23c
TL
1816 for (auto& ext : fnode_extents) {
1817 if (dev_target != dev_target_new && ext.bdev == dev_target) {
eafe8130 1818 dout(20) << __func__ << " " << " ... adjusting extent 0x"
9f95a23c 1819 << std::hex << ext.offset << std::dec
eafe8130
TL
1820 << " bdev " << dev_target << " -> " << dev_target_new
1821 << dendl;
9f95a23c 1822 ext.bdev = dev_target_new;
11fdf7f2 1823 }
11fdf7f2
TL
1824 }
1825 }
11fdf7f2
TL
1826 }
1827 // new logging device in the current naming scheme
1828 int new_log_dev_cur = bdev[BDEV_WAL] ?
1829 BDEV_WAL :
1830 bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
1831
1832 // new logging device in new naming scheme
1833 int new_log_dev_next = new_log_dev_cur;
1834
1835 if (devs_source.count(new_log_dev_cur)) {
1836 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1837 new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
1838 BDEV_DB :
1839 BDEV_WAL;
1840
1841 dout(0) << __func__ << " log moved from " << new_log_dev_cur
1842 << " to " << new_log_dev_next << dendl;
1843
1844 new_log_dev_cur =
1845 (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
1846 BDEV_SLOW :
1847 new_log_dev_next;
1848 }
1849
9f95a23c 1850 _rewrite_log_and_layout_sync(
11fdf7f2
TL
1851 false,
1852 (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
1853 new_log_dev_cur,
1854 new_log_dev_next,
9f95a23c
TL
1855 flags,
1856 layout);
11fdf7f2
TL
1857 return 0;
1858}
1859
1860int BlueFS::device_migrate_to_new(
1861 CephContext *cct,
1862 const set<int>& devs_source,
9f95a23c
TL
1863 int dev_target,
1864 const bluefs_layout_t& layout)
11fdf7f2
TL
1865{
1866 vector<byte> buf;
1867 bool buffered = cct->_conf->bluefs_buffered_io;
1868
eafe8130
TL
1869 dout(10) << __func__ << " devs_source " << devs_source
1870 << " dev_target " << dev_target << dendl;
11fdf7f2
TL
1871 assert(dev_target == (int)BDEV_NEWDB || (int)BDEV_NEWWAL);
1872
1873 int flags = 0;
1874
1875 flags |= devs_source.count(BDEV_DB) ?
1876 (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
1877 0;
1878 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
9f95a23c 1879 int dev_target_new = dev_target; //FIXME: remove, makes no sense
11fdf7f2
TL
1880
1881 for (auto& p : file_map) {
1882 //do not copy log
1883 if (p.second->fnode.ino == 1) {
1884 continue;
1885 }
eafe8130
TL
1886 dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
1887
11fdf7f2
TL
1888 auto& fnode_extents = p.second->fnode.extents;
1889
eafe8130 1890 bool rewrite = false;
11fdf7f2 1891 for (auto ext_it = fnode_extents.begin();
eafe8130
TL
1892 ext_it != p.second->fnode.extents.end();
1893 ++ext_it) {
11fdf7f2 1894 if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
eafe8130
TL
1895 rewrite = true;
1896 break;
1897 }
1898 }
1899 if (rewrite) {
1900 dout(10) << __func__ << " migrating" << dendl;
1901
1902 // read entire file
1903 bufferlist bl;
1904 for (auto old_ext : fnode_extents) {
1905 buf.resize(old_ext.length);
1906 int r = bdev[old_ext.bdev]->read_random(
1907 old_ext.offset,
1908 old_ext.length,
1909 (char*)&buf.at(0),
1910 buffered);
1911 if (r != 0) {
1912 derr << __func__ << " failed to read 0x" << std::hex
1913 << old_ext.offset << "~" << old_ext.length << std::dec
1914 << " from " << (int)dev_target << dendl;
1915 return -EIO;
11fdf7f2 1916 }
eafe8130
TL
1917 bl.append((char*)&buf[0], old_ext.length);
1918 }
1919
1920 // write entire file
1921 PExtentVector extents;
1922 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1923 if (l < 0) {
1924 derr << __func__ << " unable to allocate len 0x" << std::hex
1925 << bl.length() << std::dec << " from " << (int)dev_target
1926 << ": " << cpp_strerror(l) << dendl;
1927 return -ENOSPC;
1928 }
1929
1930 uint64_t off = 0;
1931 for (auto& i : extents) {
1932 bufferlist cur;
1933 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1934 ceph_assert(cur_len > 0);
1935 cur.substr_of(bl, off, cur_len);
1936 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1937 ceph_assert(r == 0);
1938 off += cur_len;
1939 }
1940
1941 // release old extents
1942 for (auto old_ext : fnode_extents) {
1943 PExtentVector to_release;
1944 to_release.emplace_back(old_ext.offset, old_ext.length);
1945 alloc[old_ext.bdev]->release(to_release);
1946 }
1947
1948 // update fnode
1949 fnode_extents.clear();
1950 for (auto& i : extents) {
1951 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
11fdf7f2
TL
1952 }
1953 }
11fdf7f2
TL
1954 }
1955 // new logging device in the current naming scheme
1956 int new_log_dev_cur =
1957 bdev[BDEV_NEWWAL] ?
1958 BDEV_NEWWAL :
1959 bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
1960 BDEV_WAL :
1961 bdev[BDEV_NEWDB] ?
1962 BDEV_NEWDB :
1963 bdev[BDEV_DB] && !(flags & REMOVE_DB)?
1964 BDEV_DB :
1965 BDEV_SLOW;
1966
1967 // new logging device in new naming scheme
1968 int new_log_dev_next =
1969 new_log_dev_cur == BDEV_NEWWAL ?
1970 BDEV_WAL :
1971 new_log_dev_cur == BDEV_NEWDB ?
1972 BDEV_DB :
1973 new_log_dev_cur;
1974
1975 int super_dev =
1976 dev_target == BDEV_NEWDB ?
1977 BDEV_NEWDB :
1978 bdev[BDEV_DB] ?
1979 BDEV_DB :
1980 BDEV_SLOW;
1981
9f95a23c 1982 _rewrite_log_and_layout_sync(
11fdf7f2
TL
1983 false,
1984 super_dev,
1985 new_log_dev_cur,
1986 new_log_dev_next,
9f95a23c
TL
1987 flags,
1988 layout);
11fdf7f2
TL
1989 return 0;
1990}
1991
7c673cae
FG
1992BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
1993{
1994 auto p = file_map.find(ino);
1995 if (p == file_map.end()) {
9f95a23c 1996 FileRef f = ceph::make_ref<File>();
7c673cae
FG
1997 file_map[ino] = f;
1998 dout(30) << __func__ << " ino " << ino << " = " << f
1999 << " (new)" << dendl;
2000 return f;
2001 } else {
2002 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
2003 return p->second;
2004 }
2005}
2006
2007void BlueFS::_drop_link(FileRef file)
2008{
2009 dout(20) << __func__ << " had refs " << file->refs
2010 << " on " << file->fnode << dendl;
11fdf7f2 2011 ceph_assert(file->refs > 0);
7c673cae
FG
2012 --file->refs;
2013 if (file->refs == 0) {
2014 dout(20) << __func__ << " destroying " << file->fnode << dendl;
11fdf7f2 2015 ceph_assert(file->num_reading.load() == 0);
9f95a23c 2016 vselector->sub_usage(file->vselector_hint, file->fnode);
7c673cae
FG
2017 log_t.op_file_remove(file->fnode.ino);
2018 for (auto& r : file->fnode.extents) {
2019 pending_release[r.bdev].insert(r.offset, r.length);
2020 }
2021 file_map.erase(file->fnode.ino);
2022 file->deleted = true;
94b18763 2023
7c673cae 2024 if (file->dirty_seq) {
11fdf7f2
TL
2025 ceph_assert(file->dirty_seq > log_seq_stable);
2026 ceph_assert(dirty_files.count(file->dirty_seq));
7c673cae
FG
2027 auto it = dirty_files[file->dirty_seq].iterator_to(*file);
2028 dirty_files[file->dirty_seq].erase(it);
2029 file->dirty_seq = 0;
2030 }
2031 }
2032}
2033
adb31ebb 2034int64_t BlueFS::_read_random(
7c673cae
FG
2035 FileReader *h, ///< [in] read from here
2036 uint64_t off, ///< [in] offset
9f95a23c 2037 uint64_t len, ///< [in] this many bytes
7c673cae
FG
2038 char *out) ///< [out] optional: or copy it here
2039{
494da23a
TL
2040 auto* buf = &h->buf;
2041
adb31ebb 2042 int64_t ret = 0;
7c673cae
FG
2043 dout(10) << __func__ << " h " << h
2044 << " 0x" << std::hex << off << "~" << len << std::dec
2045 << " from " << h->file->fnode << dendl;
2046
2047 ++h->file->num_reading;
2048
2049 if (!h->ignore_eof &&
2050 off + len > h->file->fnode.size) {
2051 if (off > h->file->fnode.size)
2052 len = 0;
2053 else
2054 len = h->file->fnode.size - off;
2055 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
2056 << std::hex << len << std::dec << dendl;
2057 }
494da23a
TL
2058 logger->inc(l_bluefs_read_random_count, 1);
2059 logger->inc(l_bluefs_read_random_bytes, len);
7c673cae 2060
494da23a 2061 std::shared_lock s_lock(h->lock);
f91f0fd5 2062 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
7c673cae 2063 while (len > 0) {
494da23a
TL
2064 if (off < buf->bl_off || off >= buf->get_buf_end()) {
2065 s_lock.unlock();
2066 uint64_t x_off = 0;
2067 auto p = h->file->fnode.seek(off, &x_off);
f6b5b4d7 2068 ceph_assert(p != h->file->fnode.extents.end());
9f95a23c 2069 uint64_t l = std::min(p->length - x_off, len);
adb31ebb
TL
2070 //hard cap to 1GB
2071 l = std::min(l, uint64_t(1) << 30);
494da23a
TL
2072 dout(20) << __func__ << " read random 0x"
2073 << std::hex << x_off << "~" << l << std::dec
2074 << " of " << *p << dendl;
cd265ab1
TL
2075 int r;
2076 if (!cct->_conf->bluefs_check_for_zeros) {
2077 r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
2078 cct->_conf->bluefs_buffered_io);
2079 } else {
2080 r = read_random(p->bdev, p->offset + x_off, l, out,
2081 cct->_conf->bluefs_buffered_io);
2082 }
494da23a
TL
2083 ceph_assert(r == 0);
2084 off += l;
2085 len -= l;
2086 ret += l;
2087 out += l;
2088
2089 logger->inc(l_bluefs_read_random_disk_count, 1);
2090 logger->inc(l_bluefs_read_random_disk_bytes, l);
2091 if (len > 0) {
2092 s_lock.lock();
2093 }
2094 } else {
2095 auto left = buf->get_buf_remaining(off);
adb31ebb 2096 int64_t r = std::min(len, left);
494da23a
TL
2097 logger->inc(l_bluefs_read_random_buffer_count, 1);
2098 logger->inc(l_bluefs_read_random_buffer_bytes, r);
2099 dout(20) << __func__ << " left 0x" << std::hex << left
2100 << " 0x" << off << "~" << len << std::dec
2101 << dendl;
2102
2103 if (out) {
3fec8b72
TL
2104 auto p = buf->bl.begin();
2105 p.seek(off - buf->bl_off);
2106 p.copy(r, out);
494da23a
TL
2107 out += r;
2108 }
7c673cae 2109
494da23a
TL
2110 dout(30) << __func__ << " result chunk (0x"
2111 << std::hex << r << std::dec << " bytes):\n";
2112 bufferlist t;
2113 t.substr_of(buf->bl, off - buf->bl_off, r);
2114 t.hexdump(*_dout);
2115 *_dout << dendl;
2116
2117 off += r;
2118 len -= r;
2119 ret += r;
2120 buf->pos += r;
2121 }
2122 }
7c673cae
FG
2123 dout(20) << __func__ << " got " << ret << dendl;
2124 --h->file->num_reading;
2125 return ret;
2126}
2127
adb31ebb 2128int64_t BlueFS::_read(
7c673cae
FG
2129 FileReader *h, ///< [in] read from here
2130 FileReaderBuffer *buf, ///< [in] reader state
2131 uint64_t off, ///< [in] offset
2132 size_t len, ///< [in] this many bytes
2133 bufferlist *outbl, ///< [out] optional: reference the result here
2134 char *out) ///< [out] optional: or copy it here
2135{
494da23a 2136 bool prefetch = !outbl && !out;
7c673cae
FG
2137 dout(10) << __func__ << " h " << h
2138 << " 0x" << std::hex << off << "~" << len << std::dec
494da23a
TL
2139 << " from " << h->file->fnode
2140 << (prefetch ? " prefetch" : "")
2141 << dendl;
7c673cae
FG
2142
2143 ++h->file->num_reading;
2144
2145 if (!h->ignore_eof &&
2146 off + len > h->file->fnode.size) {
2147 if (off > h->file->fnode.size)
2148 len = 0;
2149 else
2150 len = h->file->fnode.size - off;
2151 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
2152 << std::hex << len << std::dec << dendl;
2153 }
494da23a
TL
2154 logger->inc(l_bluefs_read_count, 1);
2155 logger->inc(l_bluefs_read_bytes, len);
2156 if (prefetch) {
2157 logger->inc(l_bluefs_read_prefetch_count, 1);
2158 logger->inc(l_bluefs_read_prefetch_bytes, len);
2159 }
2160
7c673cae
FG
2161 if (outbl)
2162 outbl->clear();
2163
adb31ebb 2164 int64_t ret = 0;
494da23a 2165 std::shared_lock s_lock(h->lock);
7c673cae
FG
2166 while (len > 0) {
2167 size_t left;
2168 if (off < buf->bl_off || off >= buf->get_buf_end()) {
494da23a
TL
2169 s_lock.unlock();
2170 std::unique_lock u_lock(h->lock);
f91f0fd5 2171 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
494da23a
TL
2172 if (off < buf->bl_off || off >= buf->get_buf_end()) {
2173 // if precondition hasn't changed during locking upgrade.
2174 buf->bl.clear();
2175 buf->bl_off = off & super.block_mask();
2176 uint64_t x_off = 0;
2177 auto p = h->file->fnode.seek(buf->bl_off, &x_off);
f6b5b4d7
TL
2178 if (p == h->file->fnode.extents.end()) {
2179 dout(5) << __func__ << " reading less then required "
2180 << ret << "<" << ret + len << dendl;
2181 break;
2182 }
2183
494da23a
TL
2184 uint64_t want = round_up_to(len + (off & ~super.block_mask()),
2185 super.block_size);
2186 want = std::max(want, buf->max_prefetch);
2187 uint64_t l = std::min(p->length - x_off, want);
adb31ebb
TL
2188 //hard cap to 1GB
2189 l = std::min(l, uint64_t(1) << 30);
494da23a
TL
2190 uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
2191 if (!h->ignore_eof &&
2192 buf->bl_off + l > eof_offset) {
2193 l = eof_offset - buf->bl_off;
2194 }
2195 dout(20) << __func__ << " fetching 0x"
2196 << std::hex << x_off << "~" << l << std::dec
2197 << " of " << *p << dendl;
cd265ab1
TL
2198 int r;
2199 if (!cct->_conf->bluefs_check_for_zeros) {
2200 r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
2201 cct->_conf->bluefs_buffered_io);
2202 } else {
2203 r = read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
2204 cct->_conf->bluefs_buffered_io);
2205 }
494da23a 2206 ceph_assert(r == 0);
7c673cae 2207 }
494da23a
TL
2208 u_lock.unlock();
2209 s_lock.lock();
2210 // we should recheck if buffer is valid after lock downgrade
2211 continue;
7c673cae
FG
2212 }
2213 left = buf->get_buf_remaining(off);
2214 dout(20) << __func__ << " left 0x" << std::hex << left
2215 << " len 0x" << len << std::dec << dendl;
2216
adb31ebb 2217 int64_t r = std::min(len, left);
7c673cae
FG
2218 if (outbl) {
2219 bufferlist t;
2220 t.substr_of(buf->bl, off - buf->bl_off, r);
2221 outbl->claim_append(t);
2222 }
2223 if (out) {
3fec8b72
TL
2224 auto p = buf->bl.begin();
2225 p.seek(off - buf->bl_off);
2226 p.copy(r, out);
7c673cae
FG
2227 out += r;
2228 }
2229
2230 dout(30) << __func__ << " result chunk (0x"
2231 << std::hex << r << std::dec << " bytes):\n";
2232 bufferlist t;
2233 t.substr_of(buf->bl, off - buf->bl_off, r);
2234 t.hexdump(*_dout);
2235 *_dout << dendl;
2236
2237 off += r;
2238 len -= r;
2239 ret += r;
2240 buf->pos += r;
2241 }
7c673cae 2242 dout(20) << __func__ << " got " << ret << dendl;
11fdf7f2 2243 ceph_assert(!outbl || (int)outbl->length() == ret);
7c673cae
FG
2244 --h->file->num_reading;
2245 return ret;
2246}
2247
2248void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
2249{
2250 dout(10) << __func__ << " file " << f->fnode
2251 << " 0x" << std::hex << offset << "~" << length << std::dec
2252 << dendl;
2253 if (offset & ~super.block_mask()) {
2254 offset &= super.block_mask();
11fdf7f2 2255 length = round_up_to(length, super.block_size);
7c673cae
FG
2256 }
2257 uint64_t x_off = 0;
2258 auto p = f->fnode.seek(offset, &x_off);
2259 while (length > 0 && p != f->fnode.extents.end()) {
11fdf7f2 2260 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
2261 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
2262 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
2263 << std:: dec << " of " << *p << dendl;
2264 offset += x_len;
2265 length -= x_len;
2266 }
2267}
2268
2269uint64_t BlueFS::_estimate_log_size()
2270{
2271 int avg_dir_size = 40; // fixme
2272 int avg_file_size = 12;
2273 uint64_t size = 4096 * 2;
2274 size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
2275 for (auto& p : block_all)
2276 size += p.num_intervals() * (1 + 1 + sizeof(uint64_t) * 2);
2277 size += dir_map.size() + (1 + avg_dir_size);
2278 size += file_map.size() * (1 + avg_dir_size + avg_file_size);
11fdf7f2 2279 return round_up_to(size, super.block_size);
7c673cae
FG
2280}
2281
2282void BlueFS::compact_log()
2283{
f6b5b4d7
TL
2284 std::unique_lock<ceph::mutex> l(lock);
2285 if (!cct->_conf->bluefs_replay_recovery_disable_compact) {
2286 if (cct->_conf->bluefs_compact_log_sync) {
2287 _compact_log_sync();
2288 } else {
2289 _compact_log_async(l);
2290 }
7c673cae
FG
2291 }
2292}
2293
2294bool BlueFS::_should_compact_log()
2295{
2296 uint64_t current = log_writer->file->fnode.size;
2297 uint64_t expected = _estimate_log_size();
2298 float ratio = (float)current / (float)expected;
2299 dout(10) << __func__ << " current 0x" << std::hex << current
2300 << " expected " << expected << std::dec
2301 << " ratio " << ratio
2302 << (new_log ? " (async compaction in progress)" : "")
2303 << dendl;
2304 if (new_log ||
2305 current < cct->_conf->bluefs_log_compact_min_size ||
2306 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
2307 return false;
2308 }
2309 return true;
2310}
2311
11fdf7f2
TL
2312void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t,
2313 int flags)
7c673cae
FG
2314{
2315 t->seq = 1;
2316 t->uuid = super.uuid;
2317 dout(20) << __func__ << " op_init" << dendl;
2318
2319 t->op_init();
2320 for (unsigned bdev = 0; bdev < MAX_BDEV; ++bdev) {
2321 interval_set<uint64_t>& p = block_all[bdev];
2322 for (interval_set<uint64_t>::iterator q = p.begin(); q != p.end(); ++q) {
11fdf7f2
TL
2323 auto bdev_new = bdev;
2324 if ((flags & REMOVE_WAL) && bdev == BDEV_WAL) {
2325 continue;
2326 }
2327 if ((flags & REMOVE_DB) && bdev == BDEV_DB) {
2328 continue;
2329 }
2330 if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
2331 bdev_new = BDEV_DB;
2332 }
2333 if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
2334 bdev_new = BDEV_SLOW;
2335 }
2336 if (bdev == BDEV_NEWDB) {
2337 // REMOVE_DB xor RENAME_DB
2338 ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
2339 ceph_assert(!(flags & RENAME_SLOW2DB));
2340 bdev_new = BDEV_DB;
2341 }
2342 if (bdev == BDEV_NEWWAL) {
2343 ceph_assert(flags & REMOVE_WAL);
2344 bdev_new = BDEV_WAL;
2345 }
2346 dout(20) << __func__ << " op_alloc_add " << bdev_new << " 0x"
7c673cae
FG
2347 << std::hex << q.get_start() << "~" << q.get_len() << std::dec
2348 << dendl;
11fdf7f2 2349 t->op_alloc_add(bdev_new, q.get_start(), q.get_len());
7c673cae
FG
2350 }
2351 }
9f95a23c
TL
2352 for (auto& [ino, file_ref] : file_map) {
2353 if (ino == 1)
7c673cae 2354 continue;
9f95a23c 2355 ceph_assert(ino > 1);
11fdf7f2 2356
9f95a23c 2357 for(auto& e : file_ref->fnode.extents) {
11fdf7f2
TL
2358 auto bdev = e.bdev;
2359 auto bdev_new = bdev;
2360 ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
2361 if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
2362 bdev_new = BDEV_DB;
2363 }
2364 if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
2365 bdev_new = BDEV_SLOW;
2366 }
2367 if (bdev == BDEV_NEWDB) {
2368 // REMOVE_DB xor RENAME_DB
2369 ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
2370 ceph_assert(!(flags & RENAME_SLOW2DB));
2371 bdev_new = BDEV_DB;
2372 }
2373 if (bdev == BDEV_NEWWAL) {
2374 ceph_assert(flags & REMOVE_WAL);
2375 bdev_new = BDEV_WAL;
2376 }
2377 e.bdev = bdev_new;
2378 }
9f95a23c
TL
2379 dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
2380 t->op_file_update(file_ref->fnode);
7c673cae 2381 }
9f95a23c
TL
2382 for (auto& [path, dir_ref] : dir_map) {
2383 dout(20) << __func__ << " op_dir_create " << path << dendl;
2384 t->op_dir_create(path);
2385 for (auto& [fname, file_ref] : dir_ref->file_map) {
2386 dout(20) << __func__ << " op_dir_link " << path << "/" << fname
2387 << " to " << file_ref->fnode.ino << dendl;
2388 t->op_dir_link(path, fname, file_ref->fnode.ino);
7c673cae
FG
2389 }
2390 }
2391}
2392
2393void BlueFS::_compact_log_sync()
2394{
2395 dout(10) << __func__ << dendl;
9f95a23c
TL
2396 auto prefer_bdev =
2397 vselector->select_prefer_bdev(log_writer->file->vselector_hint);
2398 _rewrite_log_and_layout_sync(true,
11fdf7f2 2399 BDEV_DB,
9f95a23c
TL
2400 prefer_bdev,
2401 prefer_bdev,
2402 0,
2403 super.memorized_layout);
11fdf7f2
TL
2404 logger->inc(l_bluefs_log_compactions);
2405}
2406
9f95a23c
TL
2407void BlueFS::_rewrite_log_and_layout_sync(bool allocate_with_fallback,
2408 int super_dev,
2409 int log_dev,
2410 int log_dev_new,
2411 int flags,
2412 std::optional<bluefs_layout_t> layout)
11fdf7f2 2413{
7c673cae
FG
2414 File *log_file = log_writer->file.get();
2415
2416 // clear out log (be careful who calls us!!!)
2417 log_t.clear();
2418
11fdf7f2
TL
2419 dout(20) << __func__ << " super_dev:" << super_dev
2420 << " log_dev:" << log_dev
2421 << " log_dev_new:" << log_dev_new
2422 << " flags:" << flags
2423 << dendl;
7c673cae 2424 bluefs_transaction_t t;
11fdf7f2 2425 _compact_log_dump_metadata(&t, flags);
7c673cae
FG
2426
2427 dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
2428 t.op_jump_seq(log_seq);
2429
2430 bufferlist bl;
11fdf7f2 2431 encode(t, bl);
7c673cae
FG
2432 _pad_bl(bl);
2433
2434 uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
2435 dout(20) << __func__ << " need " << need << dendl;
2436
494da23a 2437 bluefs_fnode_t old_fnode;
11fdf7f2 2438 int r;
494da23a 2439 log_file->fnode.swap_extents(old_fnode);
11fdf7f2
TL
2440 if (allocate_with_fallback) {
2441 r = _allocate(log_dev, need, &log_file->fnode);
2442 ceph_assert(r == 0);
2443 } else {
2444 PExtentVector extents;
2445 r = _allocate_without_fallback(log_dev,
2446 need,
2447 &extents);
2448 ceph_assert(r == 0);
2449 for (auto& p : extents) {
2450 log_file->fnode.append_extent(
2451 bluefs_extent_t(log_dev, p.offset, p.length));
2452 }
7c673cae
FG
2453 }
2454
2455 _close_writer(log_writer);
2456
2457 log_file->fnode.size = bl.length();
9f95a23c
TL
2458 vselector->sub_usage(log_file->vselector_hint, old_fnode);
2459 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2460
7c673cae
FG
2461 log_writer = _create_writer(log_file);
2462 log_writer->append(bl);
11fdf7f2
TL
2463 r = _flush(log_writer, true);
2464 ceph_assert(r == 0);
2465#ifdef HAVE_LIBAIO
2466 if (!cct->_conf->bluefs_sync_write) {
2467 list<aio_t> completed_ios;
2468 _claim_completed_aios(log_writer, &completed_ios);
2469 wait_for_aio(log_writer);
2470 completed_ios.clear();
2471 }
2472#endif
224ce89b 2473 flush_bdev();
224ce89b 2474
9f95a23c 2475 super.memorized_layout = layout;
7c673cae 2476 super.log_fnode = log_file->fnode;
11fdf7f2
TL
2477 // rename device if needed
2478 if (log_dev != log_dev_new) {
2479 dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
2480 for (auto& p : super.log_fnode.extents) {
2481 p.bdev = log_dev_new;
2482 }
2483 }
2484 dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;
2485
7c673cae 2486 ++super.version;
11fdf7f2 2487 _write_super(super_dev);
7c673cae
FG
2488 flush_bdev();
2489
494da23a
TL
2490 dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
2491 for (auto& r : old_fnode.extents) {
7c673cae
FG
2492 pending_release[r.bdev].insert(r.offset, r.length);
2493 }
7c673cae
FG
2494}
2495
2496/*
2497 * 1. Allocate a new extent to continue the log, and then log an event
2498 * that jumps the log write position to the new extent. At this point, the
2499 * old extent(s) won't be written to, and reflect everything to compact.
2500 * New events will be written to the new region that we'll keep.
2501 *
2502 * 2. While still holding the lock, encode a bufferlist that dumps all of the
2503 * in-memory fnodes and names. This will become the new beginning of the
2504 * log. The last event will jump to the log continuation extent from #1.
2505 *
2506 * 3. Queue a write to a new extent for the new beginning of the log.
2507 *
2508 * 4. Drop lock and wait
2509 *
2510 * 5. Retake the lock.
2511 *
2512 * 6. Update the log_fnode to splice in the new beginning.
2513 *
2514 * 7. Write the new superblock.
2515 *
2516 * 8. Release the old log space. Clean up.
2517 */
11fdf7f2 2518void BlueFS::_compact_log_async(std::unique_lock<ceph::mutex>& l)
7c673cae
FG
2519{
2520 dout(10) << __func__ << dendl;
2521 File *log_file = log_writer->file.get();
11fdf7f2
TL
2522 ceph_assert(!new_log);
2523 ceph_assert(!new_log_writer);
7c673cae 2524
181888fb
FG
2525 // create a new log [writer] so that we know compaction is in progress
2526 // (see _should_compact_log)
9f95a23c 2527 new_log = ceph::make_ref<File>();
181888fb
FG
2528 new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
2529
3efd9988
FG
2530 // 0. wait for any racing flushes to complete. (We do not want to block
2531 // in _flush_sync_log with jump_to set or else a racing thread might flush
2532 // our entries and our jump_to update won't be correct.)
2533 while (log_flushing) {
2534 dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
2535 log_cond.wait(l);
2536 }
2537
9f95a23c
TL
2538 vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
2539
7c673cae
FG
2540 // 1. allocate new log space and jump to it.
2541 old_log_jump_to = log_file->fnode.get_allocated();
7c673cae 2542 dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
11fdf7f2 2543 << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
9f95a23c
TL
2544 int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
2545 cct->_conf->bluefs_max_log_runway,
2546 &log_file->fnode);
11fdf7f2 2547 ceph_assert(r == 0);
9f95a23c
TL
2548 //adjust usage as flush below will need it
2549 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
7c673cae
FG
2550 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2551
2552 // update the log file change and log a jump to the offset where we want to
2553 // write the new entries
2554 log_t.op_file_update(log_file->fnode);
2555 log_t.op_jump(log_seq, old_log_jump_to);
2556
2557 flush_bdev(); // FIXME?
2558
2559 _flush_and_sync_log(l, 0, old_log_jump_to);
2560
2561 // 2. prepare compacted log
2562 bluefs_transaction_t t;
224ce89b
WB
2563 //avoid record two times in log_t and _compact_log_dump_metadata.
2564 log_t.clear();
11fdf7f2 2565 _compact_log_dump_metadata(&t, 0);
7c673cae 2566
eafe8130
TL
2567 uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
2568 std::max(alloc_size[BDEV_DB],
2569 alloc_size[BDEV_SLOW]));
2570
7c673cae 2571 // conservative estimate for final encoded size
11fdf7f2 2572 new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
eafe8130 2573 max_alloc_size);
7c673cae
FG
2574 t.op_jump(log_seq, new_log_jump_to);
2575
11fdf7f2 2576 // allocate
9f95a23c 2577 //FIXME: check if we want DB here?
11fdf7f2
TL
2578 r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
2579 &new_log->fnode);
2580 ceph_assert(r == 0);
2581
2582 // we might have some more ops in log_t due to _allocate call
2583 t.claim_ops(log_t);
2584
7c673cae 2585 bufferlist bl;
11fdf7f2 2586 encode(t, bl);
7c673cae
FG
2587 _pad_bl(bl);
2588
2589 dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
2590 << std::dec << dendl;
2591
7c673cae
FG
2592 new_log_writer = _create_writer(new_log);
2593 new_log_writer->append(bl);
2594
2595 // 3. flush
2596 r = _flush(new_log_writer, true);
11fdf7f2 2597 ceph_assert(r == 0);
7c673cae
FG
2598
2599 // 4. wait
11fdf7f2 2600 _flush_bdev_safely(new_log_writer);
7c673cae 2601
11fdf7f2 2602 // 5. update our log fnode
7c673cae 2603 // discard first old_log_jump_to extents
9f95a23c 2604
7c673cae
FG
2605 dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
2606 << " of " << log_file->fnode.extents << dendl;
2607 uint64_t discarded = 0;
2608 mempool::bluefs::vector<bluefs_extent_t> old_extents;
2609 while (discarded < old_log_jump_to) {
11fdf7f2 2610 ceph_assert(!log_file->fnode.extents.empty());
7c673cae
FG
2611 bluefs_extent_t& e = log_file->fnode.extents.front();
2612 bluefs_extent_t temp = e;
2613 if (discarded + e.length <= old_log_jump_to) {
2614 dout(10) << __func__ << " remove old log extent " << e << dendl;
2615 discarded += e.length;
94b18763 2616 log_file->fnode.pop_front_extent();
7c673cae
FG
2617 } else {
2618 dout(10) << __func__ << " remove front of old log extent " << e << dendl;
2619 uint64_t drop = old_log_jump_to - discarded;
2620 temp.length = drop;
2621 e.offset += drop;
2622 e.length -= drop;
2623 discarded += drop;
2624 dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
2625 }
2626 old_extents.push_back(temp);
2627 }
94b18763
FG
2628 auto from = log_file->fnode.extents.begin();
2629 auto to = log_file->fnode.extents.end();
2630 while (from != to) {
2631 new_log->fnode.append_extent(*from);
2632 ++from;
2633 }
7c673cae 2634
9f95a23c
TL
2635 vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
2636
7c673cae 2637 // clear the extents from old log file, they are added to new log
94b18763 2638 log_file->fnode.clear_extents();
7c673cae 2639 // swap the log files. New log file is the log file now.
94b18763
FG
2640 new_log->fnode.swap_extents(log_file->fnode);
2641
7c673cae
FG
2642 log_writer->pos = log_writer->file->fnode.size =
2643 log_writer->pos - old_log_jump_to + new_log_jump_to;
2644
9f95a23c
TL
2645 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2646
11fdf7f2 2647 // 6. write the super block to reflect the changes
7c673cae
FG
2648 dout(10) << __func__ << " writing super" << dendl;
2649 super.log_fnode = log_file->fnode;
2650 ++super.version;
11fdf7f2 2651 _write_super(BDEV_DB);
7c673cae
FG
2652
2653 lock.unlock();
2654 flush_bdev();
2655 lock.lock();
2656
11fdf7f2 2657 // 7. release old space
7c673cae
FG
2658 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
2659 for (auto& r : old_extents) {
2660 pending_release[r.bdev].insert(r.offset, r.length);
2661 }
2662
2663 // delete the new log, remove from the dirty files list
2664 _close_writer(new_log_writer);
2665 if (new_log->dirty_seq) {
11fdf7f2 2666 ceph_assert(dirty_files.count(new_log->dirty_seq));
7c673cae
FG
2667 auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
2668 dirty_files[new_log->dirty_seq].erase(it);
2669 }
2670 new_log_writer = nullptr;
2671 new_log = nullptr;
2672 log_cond.notify_all();
2673
2674 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2675 logger->inc(l_bluefs_log_compactions);
2676}
2677
2678void BlueFS::_pad_bl(bufferlist& bl)
2679{
2680 uint64_t partial = bl.length() % super.block_size;
2681 if (partial) {
2682 dout(10) << __func__ << " padding with 0x" << std::hex
2683 << super.block_size - partial << " zeros" << std::dec << dendl;
2684 bl.append_zero(super.block_size - partial);
2685 }
2686}
2687
7c673cae 2688
11fdf7f2 2689int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
7c673cae
FG
2690 uint64_t want_seq,
2691 uint64_t jump_to)
2692{
2693 while (log_flushing) {
2694 dout(10) << __func__ << " want_seq " << want_seq
2695 << " log is currently flushing, waiting" << dendl;
11fdf7f2 2696 ceph_assert(!jump_to);
7c673cae
FG
2697 log_cond.wait(l);
2698 }
2699 if (want_seq && want_seq <= log_seq_stable) {
2700 dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
2701 << log_seq_stable << ", done" << dendl;
11fdf7f2 2702 ceph_assert(!jump_to);
7c673cae
FG
2703 return 0;
2704 }
2705 if (log_t.empty() && dirty_files.empty()) {
2706 dout(10) << __func__ << " want_seq " << want_seq
2707 << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
11fdf7f2 2708 ceph_assert(!jump_to);
7c673cae
FG
2709 return 0;
2710 }
2711
a8e16298
TL
2712 vector<interval_set<uint64_t>> to_release(pending_release.size());
2713 to_release.swap(pending_release);
2714
7c673cae 2715 uint64_t seq = log_t.seq = ++log_seq;
11fdf7f2 2716 ceph_assert(want_seq == 0 || want_seq <= seq);
7c673cae
FG
2717 log_t.uuid = super.uuid;
2718
2719 // log dirty files
2720 auto lsi = dirty_files.find(seq);
2721 if (lsi != dirty_files.end()) {
2722 dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
2723 for (auto &f : lsi->second) {
2724 dout(20) << __func__ << " op_file_update " << f.fnode << dendl;
2725 log_t.op_file_update(f.fnode);
2726 }
2727 }
2728
2729 dout(10) << __func__ << " " << log_t << dendl;
11fdf7f2 2730 ceph_assert(!log_t.empty());
7c673cae
FG
2731
2732 // allocate some more space (before we run out)?
2733 int64_t runway = log_writer->file->fnode.get_allocated() -
2734 log_writer->get_effective_write_pos();
f6b5b4d7 2735 bool just_expanded_log = false;
7c673cae
FG
2736 if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
2737 dout(10) << __func__ << " allocating more log runway (0x"
2738 << std::hex << runway << std::dec << " remaining)" << dendl;
2739 while (new_log_writer) {
2740 dout(10) << __func__ << " waiting for async compaction" << dendl;
2741 log_cond.wait(l);
2742 }
9f95a23c
TL
2743 vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
2744 int r = _allocate(
2745 vselector->select_prefer_bdev(log_writer->file->vselector_hint),
2746 cct->_conf->bluefs_max_log_runway,
2747 &log_writer->file->fnode);
11fdf7f2 2748 ceph_assert(r == 0);
9f95a23c 2749 vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
7c673cae 2750 log_t.op_file_update(log_writer->file->fnode);
f6b5b4d7 2751 just_expanded_log = true;
7c673cae
FG
2752 }
2753
2754 bufferlist bl;
11fdf7f2
TL
2755 bl.reserve(super.block_size);
2756 encode(log_t, bl);
7c673cae 2757 // pad to block boundary
11fdf7f2
TL
2758 size_t realign = super.block_size - (bl.length() % super.block_size);
2759 if (realign && realign != super.block_size)
2760 bl.append_zero(realign);
2761
7c673cae
FG
2762 logger->inc(l_bluefs_logged_bytes, bl.length());
2763
f6b5b4d7
TL
2764 if (just_expanded_log) {
2765 ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss
2766 }
2767
7c673cae
FG
2768 log_writer->append(bl);
2769
2770 log_t.clear();
2771 log_t.seq = 0; // just so debug output is less confusing
2772 log_flushing = true;
2773
2774 int r = _flush(log_writer, true);
11fdf7f2 2775 ceph_assert(r == 0);
7c673cae
FG
2776
2777 if (jump_to) {
2778 dout(10) << __func__ << " jumping log offset from 0x" << std::hex
2779 << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
2780 log_writer->pos = jump_to;
9f95a23c 2781 vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
7c673cae 2782 log_writer->file->fnode.size = jump_to;
9f95a23c 2783 vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
7c673cae
FG
2784 }
2785
2786 _flush_bdev_safely(log_writer);
2787
2788 log_flushing = false;
2789 log_cond.notify_all();
2790
2791 // clean dirty files
2792 if (seq > log_seq_stable) {
2793 log_seq_stable = seq;
2794 dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;
2795
2796 auto p = dirty_files.begin();
2797 while (p != dirty_files.end()) {
2798 if (p->first > log_seq_stable) {
2799 dout(20) << __func__ << " done cleaning up dirty files" << dendl;
2800 break;
2801 }
2802
2803 auto l = p->second.begin();
2804 while (l != p->second.end()) {
2805 File *file = &*l;
11fdf7f2
TL
2806 ceph_assert(file->dirty_seq > 0);
2807 ceph_assert(file->dirty_seq <= log_seq_stable);
7c673cae
FG
2808 dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
2809 file->dirty_seq = 0;
2810 p->second.erase(l++);
2811 }
2812
11fdf7f2 2813 ceph_assert(p->second.empty());
7c673cae
FG
2814 dirty_files.erase(p++);
2815 }
2816 } else {
2817 dout(20) << __func__ << " log_seq_stable " << log_seq_stable
2818 << " already >= out seq " << seq
2819 << ", we lost a race against another log flush, done" << dendl;
2820 }
a8e16298
TL
2821
2822 for (unsigned i = 0; i < to_release.size(); ++i) {
2823 if (!to_release[i].empty()) {
2824 /* OK, now we have the guarantee alloc[i] won't be null. */
11fdf7f2
TL
2825 int r = 0;
2826 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
2827 r = bdev[i]->queue_discard(to_release[i]);
2828 if (r == 0)
2829 continue;
2830 } else if (cct->_conf->bdev_enable_discard) {
2831 for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
2832 bdev[i]->discard(p.get_start(), p.get_len());
2833 }
2834 }
a8e16298
TL
2835 alloc[i]->release(to_release[i]);
2836 }
2837 }
2838
7c673cae
FG
2839 _update_logger_stats();
2840
2841 return 0;
2842}
2843
6d8e3169
FG
2844int BlueFS::_signal_dirty_to_log(FileWriter *h)
2845{
2846 h->file->fnode.mtime = ceph_clock_now();
2847 ceph_assert(h->file->fnode.ino >= 1);
2848 if (h->file->dirty_seq == 0) {
2849 h->file->dirty_seq = log_seq + 1;
2850 dirty_files[h->file->dirty_seq].push_back(*h->file);
2851 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2852 << " (was clean)" << dendl;
2853 } else {
2854 if (h->file->dirty_seq != log_seq + 1) {
2855 // need re-dirty, erase from list first
2856 ceph_assert(dirty_files.count(h->file->dirty_seq));
2857 auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
2858 dirty_files[h->file->dirty_seq].erase(it);
2859 h->file->dirty_seq = log_seq + 1;
2860 dirty_files[h->file->dirty_seq].push_back(*h->file);
2861 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2862 << " (was " << h->file->dirty_seq << ")" << dendl;
2863 } else {
2864 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2865 << " (unchanged, do nothing) " << dendl;
2866 }
2867 }
2868 return 0;
2869}
2870
7c673cae
FG
// Flush `length` bytes starting at file offset `offset` from the writer's
// in-memory buffer to the block device(s) backing h->file.
//
// The part of the range below h->pos is dropped, extra extents are requested
// from _allocate() when the range goes past what the fnode has allocated, a
// trailing partial block is zero-padded (and cached in h->tail_block), and
// the data is written extent by extent, synchronously or as aios depending
// on bluefs_sync_write.
//
// Returns 0. An allocation failure ends in ceph_abort_msg("bluefs enospc").
2871int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
2872{
2873 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
2874 << " 0x" << offset << "~" << length << std::dec
2875 << " to " << h->file->fnode << dendl;
3fec8b72
TL
2876 if (h->file->deleted) {
2877 dout(10) << __func__ << " deleted, no-op" << dendl;
2878 return 0;
2879 }
2880
11fdf7f2 2881 ceph_assert(h->file->num_readers.load() == 0);
7c673cae
FG
2882
2883 h->buffer_appender.flush();
2884
// ino 1 is always written unbuffered; every other file follows the
// bluefs_buffered_io setting.
2885 bool buffered;
2886 if (h->file->fnode.ino == 1)
2887 buffered = false;
2888 else
2889 buffered = cct->_conf->bluefs_buffered_io;
2890
// Nothing to do if the whole range lies below h->pos; otherwise trim the
// request so that it starts at h->pos.
2891 if (offset + length <= h->pos)
2892 return 0;
2893 if (offset < h->pos) {
2894 length -= h->pos - offset;
2895 offset = h->pos;
2896 dout(10) << " still need 0x"
2897 << std::hex << offset << "~" << length << std::dec
2898 << dendl;
2899 }
11fdf7f2 2900 ceph_assert(offset <= h->file->fnode.size);
7c673cae
FG
2901
// The file's usage is subtracted from the volume selector here and added
// back once the fnode is final (end of function, or just before aborting on
// an allocation failure).
2902 uint64_t allocated = h->file->fnode.get_allocated();
9f95a23c 2903 vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
7c673cae
FG
2904 // do not bother to dirty the file if we are overwriting
2905 // previously allocated extents.
6d8e3169 2906
7c673cae
FG
2907 if (allocated < offset + length) {
2908 // we should never run out of log space here; see the min runway check
2909 // in _flush_and_sync_log.
11fdf7f2 2910 ceph_assert(h->file->fnode.ino != 1);
9f95a23c 2911 int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
7c673cae 2912 offset + length - allocated,
94b18763 2913 &h->file->fnode);
7c673cae
FG
2914 if (r < 0) {
2915 derr << __func__ << " allocated: 0x" << std::hex << allocated
2916 << " offset: 0x" << offset << " length: 0x" << length << std::dec
2917 << dendl;
9f95a23c 2918 vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
11fdf7f2 2919 ceph_abort_msg("bluefs enospc");
7c673cae
FG
2920 return r;
2921 }
6d8e3169 2922 h->file->is_dirty = true;
7c673cae
FG
2923 }
2924 if (h->file->fnode.size < offset + length) {
2925 h->file->fnode.size = offset + length;
2926 if (h->file->fnode.ino > 1) {
2927 // we do not need to dirty the log file (or its compacting
2928 // replacement) when the file size changes because replay is
2929 // smart enough to discover it on its own.
6d8e3169 2930 h->file->is_dirty = true;
7c673cae
FG
2931 }
2932 }
6d8e3169 2933 dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl;
7c673cae
FG
2934
// Locate the extent containing `offset`; x_off receives the offset inside
// that extent.
2935 uint64_t x_off = 0;
2936 auto p = h->file->fnode.seek(offset, &x_off);
11fdf7f2 2937 ceph_assert(p != h->file->fnode.extents.end());
7c673cae
FG
2938 dout(20) << __func__ << " in " << *p << " x_off 0x"
2939 << std::hex << x_off << std::dec << dendl;
2940
// If the write does not start on a block boundary, prepend the cached
// tail_block and move offset/x_off back to the boundary so that a whole
// block is rewritten. Outstanding aios are waited for first.
2941 unsigned partial = x_off & ~super.block_mask();
2942 bufferlist bl;
2943 if (partial) {
2944 dout(20) << __func__ << " using partial tail 0x"
2945 << std::hex << partial << std::dec << dendl;
11fdf7f2 2946 ceph_assert(h->tail_block.length() == partial);
31f18b77 2947 bl.claim_append_piecewise(h->tail_block);
7c673cae
FG
2948 x_off -= partial;
2949 offset -= partial;
2950 length += partial;
2951 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
2952 for (auto p : h->iocv) {
2953 if (p) {
2954 p->aio_wait();
2955 }
2956 }
2957 }
// Move the data to write from h->buffer into bl: everything when the request
// covers the whole buffer, otherwise only the leading part.
// NOTE(review): in the else branch splice() has already taken the first
// `length` bytes out of h->buffer (and `length` includes `partial` at this
// point), so the substr_of() at offset `length` that follows looks
// suspicious; confirm whether this branch is ever taken and what it should
// leave behind.
f91f0fd5 2958 if (length == partial + h->buffer.length()) {
9f95a23c 2959 /* in case of initial allocation and need to zero, limited flush is unacceptable */
31f18b77 2960 bl.claim_append_piecewise(h->buffer);
7c673cae
FG
2961 } else {
2962 bufferlist t;
31f18b77
FG
2963 h->buffer.splice(0, length, &t);
2964 bl.claim_append_piecewise(t);
7c673cae
FG
2965 t.substr_of(h->buffer, length, h->buffer.length() - length);
2966 h->buffer.swap(t);
2967 dout(20) << " leaving 0x" << std::hex << h->buffer.length() << std::dec
2968 << " unflushed" << dendl;
2969 }
11fdf7f2 2970 ceph_assert(bl.length() == length);
7c673cae 2971
9f95a23c
TL
2972 h->pos = offset + length;
2973
// Keep a copy of the trailing partial block in h->tail_block and pad bl with
// zeros up to the next block boundary; `length` grows by the padding.
2974 unsigned tail = bl.length() & ~super.block_mask();
2975 if (tail) {
2976 dout(20) << __func__ << " caching tail of 0x"
2977 << std::hex << tail
2978 << " and padding block with 0x" << (super.block_size - tail)
2979 << std::dec << dendl;
2980 h->tail_block.substr_of(bl, bl.length() - tail, tail);
2981 bl.append_zero(super.block_size - tail);
2982 length += super.block_size - tail;
2983 } else {
2984 h->tail_block.clear();
2985 }
9f95a23c
TL
2986 ceph_assert(bl.length() == length);
2987
// Per-writer-type byte counters (padding included).
7c673cae
FG
2988 switch (h->writer_type) {
2989 case WRITER_WAL:
2990 logger->inc(l_bluefs_bytes_written_wal, length);
2991 break;
2992 case WRITER_SST:
2993 logger->inc(l_bluefs_bytes_written_sst, length);
2994 break;
2995 }
2996
2997 dout(30) << "dump:\n";
2998 bl.hexdump(*_dout);
2999 *_dout << dendl;
3000
// Walk the extents starting at p and write bl piece by piece. x_len is capped
// to what remains in the current extent; x_off is non-zero only for the first
// extent.
7c673cae 3001 uint64_t bloff = 0;
11fdf7f2 3002 uint64_t bytes_written_slow = 0;
7c673cae 3003 while (length > 0) {
11fdf7f2 3004 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
3005 bufferlist t;
3006 t.substr_of(bl, bloff, x_len);
7c673cae 3007 if (cct->_conf->bluefs_sync_write) {
11fdf7f2 3008 bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
7c673cae 3009 } else {
11fdf7f2
TL
3010 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
3011 }
3012 h->dirty_devs[p->bdev] = true;
3013 if (p->bdev == BDEV_SLOW) {
3014 bytes_written_slow += t.length();
7c673cae 3015 }
11fdf7f2 3016
7c673cae
FG
3017 bloff += x_len;
3018 length -= x_len;
3019 ++p;
3020 x_off = 0;
3021 }
11fdf7f2 3022 logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
// Submit whatever aios were queued above, per device.
7c673cae
FG
3023 for (unsigned i = 0; i < MAX_BDEV; ++i) {
3024 if (bdev[i]) {
11fdf7f2 3025 if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
7c673cae
FG
3026 bdev[i]->aio_submit(h->iocv[i]);
3027 }
3028 }
3029 }
9f95a23c 3030 vselector->add_usage(h->file->vselector_hint, h->file->fnode);
7c673cae
FG
3031 dout(20) << __func__ << " h " << h << " pos now 0x"
3032 << std::hex << h->pos << std::dec << dendl;
3033 return 0;
3034}
3035
11fdf7f2 3036#ifdef HAVE_LIBAIO
7c673cae
FG
3037// we need to retire old completed aios so they don't stick around in
3038// memory indefinitely (along with their bufferlist refs).
3039void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
3040{
3041 for (auto p : h->iocv) {
3042 if (p) {
3043 ls->splice(ls->end(), p->running_aios);
3044 }
3045 }
3046 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
3047}
3048
3049void BlueFS::wait_for_aio(FileWriter *h)
3050{
3051 // NOTE: this is safe to call without a lock, as long as our reference is
3052 // stable.
3053 dout(10) << __func__ << " " << h << dendl;
3054 utime_t start = ceph_clock_now();
3055 for (auto p : h->iocv) {
3056 if (p) {
3057 p->aio_wait();
3058 }
3059 }
11fdf7f2 3060 dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 3061}
11fdf7f2 3062#endif
7c673cae 3063
f6b5b4d7
TL
3064int BlueFS::_flush(FileWriter *h, bool force, std::unique_lock<ceph::mutex>& l)
3065{
3066 bool flushed = false;
3067 int r = _flush(h, force, &flushed);
3068 if (r == 0 && flushed) {
3069 _maybe_compact_log(l);
3070 }
3071 return r;
3072}
3073
3074int BlueFS::_flush(FileWriter *h, bool force, bool *flushed)
7c673cae
FG
3075{
3076 h->buffer_appender.flush();
3077 uint64_t length = h->buffer.length();
3078 uint64_t offset = h->pos;
f6b5b4d7
TL
3079 if (flushed) {
3080 *flushed = false;
3081 }
7c673cae
FG
3082 if (!force &&
3083 length < cct->_conf->bluefs_min_flush_size) {
3084 dout(10) << __func__ << " " << h << " ignoring, length " << length
3085 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
3086 << dendl;
3087 return 0;
3088 }
3089 if (length == 0) {
3090 dout(10) << __func__ << " " << h << " no dirty data on "
3091 << h->file->fnode << dendl;
3092 return 0;
3093 }
3094 dout(10) << __func__ << " " << h << " 0x"
3095 << std::hex << offset << "~" << length << std::dec
3096 << " to " << h->file->fnode << dendl;
11fdf7f2 3097 ceph_assert(h->pos <= h->file->fnode.size);
f6b5b4d7
TL
3098 int r = _flush_range(h, offset, length);
3099 if (flushed) {
3100 *flushed = true;
3101 }
3102 return r;
7c673cae
FG
3103}
3104
3105int BlueFS::_truncate(FileWriter *h, uint64_t offset)
3106{
3107 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
3108 << " file " << h->file->fnode << dendl;
3109 if (h->file->deleted) {
3110 dout(10) << __func__ << " deleted, no-op" << dendl;
3111 return 0;
3112 }
3113
3114 // we never truncate internal log files
11fdf7f2 3115 ceph_assert(h->file->fnode.ino > 1);
7c673cae
FG
3116
3117 h->buffer_appender.flush();
3118
3119 // truncate off unflushed data?
3120 if (h->pos < offset &&
3121 h->pos + h->buffer.length() > offset) {
3122 bufferlist t;
3123 dout(20) << __func__ << " tossing out last " << offset - h->pos
3124 << " unflushed bytes" << dendl;
3125 t.substr_of(h->buffer, 0, offset - h->pos);
3126 h->buffer.swap(t);
11fdf7f2 3127 ceph_abort_msg("actually this shouldn't happen");
7c673cae
FG
3128 }
3129 if (h->buffer.length()) {
3130 int r = _flush(h, true);
3131 if (r < 0)
3132 return r;
3133 }
3134 if (offset == h->file->fnode.size) {
3135 return 0; // no-op!
3136 }
3137 if (offset > h->file->fnode.size) {
11fdf7f2 3138 ceph_abort_msg("truncate up not supported");
7c673cae 3139 }
11fdf7f2 3140 ceph_assert(h->file->fnode.size >= offset);
9f95a23c 3141 vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
7c673cae 3142 h->file->fnode.size = offset;
9f95a23c 3143 vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
7c673cae
FG
3144 log_t.op_file_update(h->file->fnode);
3145 return 0;
3146}
3147
11fdf7f2 3148int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l)
7c673cae
FG
3149{
3150 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
3151 int r = _flush(h, true);
3152 if (r < 0)
3153 return r;
6d8e3169
FG
3154 if (h->file->is_dirty) {
3155 _signal_dirty_to_log(h);
3156 h->file->is_dirty = false;
3157 }
7c673cae
FG
3158 uint64_t old_dirty_seq = h->file->dirty_seq;
3159
3160 _flush_bdev_safely(h);
3161
3162 if (old_dirty_seq) {
3163 uint64_t s = log_seq;
3164 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
3165 << ") on " << h->file->fnode << ", flushing log" << dendl;
3166 _flush_and_sync_log(l, old_dirty_seq);
11fdf7f2 3167 ceph_assert(h->file->dirty_seq == 0 || // cleaned
7c673cae
FG
3168 h->file->dirty_seq > s); // or redirtied by someone else
3169 }
3170 return 0;
3171}
3172
3173void BlueFS::_flush_bdev_safely(FileWriter *h)
3174{
11fdf7f2
TL
3175 std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
3176 h->dirty_devs.fill(false);
3177#ifdef HAVE_LIBAIO
7c673cae
FG
3178 if (!cct->_conf->bluefs_sync_write) {
3179 list<aio_t> completed_ios;
3180 _claim_completed_aios(h, &completed_ios);
3181 lock.unlock();
3182 wait_for_aio(h);
3183 completed_ios.clear();
11fdf7f2 3184 flush_bdev(flush_devs);
7c673cae 3185 lock.lock();
11fdf7f2
TL
3186 } else
3187#endif
3188 {
7c673cae 3189 lock.unlock();
11fdf7f2 3190 flush_bdev(flush_devs);
7c673cae
FG
3191 lock.lock();
3192 }
3193}
3194
11fdf7f2
TL
3195void BlueFS::flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
3196{
3197 // NOTE: this is safe to call without a lock.
3198 dout(20) << __func__ << dendl;
3199 for (unsigned i = 0; i < MAX_BDEV; i++) {
3200 if (dirty_bdevs[i])
3201 bdev[i]->flush();
3202 }
3203}
3204
7c673cae
FG
3205void BlueFS::flush_bdev()
3206{
3207 // NOTE: this is safe to call without a lock.
3208 dout(20) << __func__ << dendl;
3209 for (auto p : bdev) {
3210 if (p)
3211 p->flush();
3212 }
3213}
3214
eafe8130
TL
3215const char* BlueFS::get_device_name(unsigned id)
3216{
3217 if (id >= MAX_BDEV) return "BDEV_INV";
3218 const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3219 return names[id];
3220}
3221
11fdf7f2
TL
3222int BlueFS::_expand_slow_device(uint64_t need, PExtentVector& extents)
3223{
3224 int r = -ENOSPC;
3225 if (slow_dev_expander) {
1911f103 3226 auto id = _get_slow_device_id();
eafe8130 3227 auto min_alloc_size = alloc_size[id];
1911f103 3228 ceph_assert(id <= alloc.size() && alloc[id]);
11fdf7f2
TL
3229 auto min_need = round_up_to(need, min_alloc_size);
3230 need = std::max(need,
3231 slow_dev_expander->get_recommended_expansion_delta(
3232 alloc[id]->get_free(), block_all[id].size()));
3233
3234 need = round_up_to(need, min_alloc_size);
3235 dout(10) << __func__ << " expanding slow device by 0x"
3236 << std::hex << need << std::dec
3237 << dendl;
3238 r = slow_dev_expander->allocate_freespace(min_need, need, extents);
3239 }
3240 return r;
3241}
3242
3243int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
3244 PExtentVector* extents)
3245{
3246 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3247 << " from " << (int)id << dendl;
3248 assert(id < alloc.size());
11fdf7f2
TL
3249 if (!alloc[id]) {
3250 return -ENOENT;
3251 }
3252 extents->reserve(4); // 4 should be (more than) enough for most allocations
eafe8130
TL
3253 uint64_t min_alloc_size = alloc_size[id];
3254 uint64_t left = round_up_to(len, min_alloc_size);
11fdf7f2 3255 int64_t alloc_len = alloc[id]->allocate(left, min_alloc_size, 0, extents);
eafe8130
TL
3256 if (alloc_len < 0 || alloc_len < (int64_t)left) {
3257 if (alloc_len > 0) {
11fdf7f2
TL
3258 alloc[id]->release(*extents);
3259 }
3260 if (bdev[id])
3261 derr << __func__ << " failed to allocate 0x" << std::hex << left
3262 << " on bdev " << (int)id
3263 << ", free 0x" << alloc[id]->get_free() << std::dec << dendl;
3264 else
3265 derr << __func__ << " failed to allocate 0x" << std::hex << left
3266 << " on bdev " << (int)id << ", dne" << std::dec << dendl;
3267 if (alloc[id])
3268 alloc[id]->dump();
3269 return -ENOSPC;
3270 }
3271
3272 return 0;
3273}
3274
7c673cae 3275int BlueFS::_allocate(uint8_t id, uint64_t len,
94b18763 3276 bluefs_fnode_t* node)
7c673cae
FG
3277{
3278 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3279 << " from " << (int)id << dendl;
11fdf7f2 3280 ceph_assert(id < alloc.size());
b32b8144 3281 int64_t alloc_len = 0;
a8e16298 3282 PExtentVector extents;
11fdf7f2 3283 uint64_t hint = 0;
7c673cae 3284 if (alloc[id]) {
94b18763
FG
3285 if (!node->extents.empty() && node->extents.back().bdev == id) {
3286 hint = node->extents.back().end();
11fdf7f2 3287 }
b32b8144 3288 extents.reserve(4); // 4 should be (more than) enough for most allocations
eafe8130
TL
3289 alloc_len = alloc[id]->allocate(round_up_to(len, alloc_size[id]),
3290 alloc_size[id], hint, &extents);
b32b8144 3291 }
eafe8130
TL
3292 if (!alloc[id] ||
3293 alloc_len < 0 ||
3294 alloc_len < (int64_t)round_up_to(len, alloc_size[id])) {
11fdf7f2 3295 if (alloc_len > 0) {
a8e16298 3296 alloc[id]->release(extents);
b32b8144 3297 }
7c673cae
FG
3298 if (id != BDEV_SLOW) {
3299 if (bdev[id]) {
eafe8130 3300 dout(1) << __func__ << " failed to allocate 0x" << std::hex << len
7c673cae
FG
3301 << " on bdev " << (int)id
3302 << ", free 0x" << alloc[id]->get_free()
3303 << "; fallback to bdev " << (int)id + 1
3304 << std::dec << dendl;
3305 }
94b18763 3306 return _allocate(id + 1, len, node);
7c673cae 3307 }
eafe8130 3308 dout(1) << __func__ << " unable to allocate 0x" << std::hex << len
11fdf7f2
TL
3309 << " on bdev " << (int)id << ", free 0x"
3310 << (alloc[id] ? alloc[id]->get_free() : (uint64_t)-1)
3311 << "; fallback to slow device expander "
3312 << std::dec << dendl;
3313 extents.clear();
eafe8130 3314 if (_expand_slow_device(len, extents) == 0) {
11fdf7f2
TL
3315 id = _get_slow_device_id();
3316 for (auto& e : extents) {
3317 _add_block_extent(id, e.offset, e.length);
3318 }
3319 extents.clear();
3320 auto* last_alloc = alloc[id];
3321 ceph_assert(last_alloc);
3322 // try again
eafe8130
TL
3323 alloc_len = last_alloc->allocate(round_up_to(len, alloc_size[id]),
3324 alloc_size[id], hint, &extents);
3325 if (alloc_len < 0 || alloc_len < (int64_t)len) {
11fdf7f2
TL
3326 if (alloc_len > 0) {
3327 last_alloc->release(extents);
3328 }
eafe8130 3329 derr << __func__ << " failed to allocate 0x" << std::hex << len
11fdf7f2
TL
3330 << " on bdev " << (int)id
3331 << ", free 0x" << last_alloc->get_free() << std::dec << dendl;
3332 return -ENOSPC;
3333 }
3334 } else {
3335 derr << __func__ << " failed to expand slow device to fit +0x"
eafe8130 3336 << std::hex << len << std::dec
11fdf7f2
TL
3337 << dendl;
3338 return -ENOSPC;
3339 }
3340 } else {
3341 uint64_t total_allocated =
3342 block_all[id].size() - alloc[id]->get_free();
3343 if (max_bytes[id] < total_allocated) {
3344 logger->set(max_bytes_pcounters[id], total_allocated);
3345 max_bytes[id] = total_allocated;
3346 }
7c673cae
FG
3347 }
3348
3349 for (auto& p : extents) {
94b18763 3350 node->append_extent(bluefs_extent_t(id, p.offset, p.length));
7c673cae
FG
3351 }
3352
3353 return 0;
3354}
3355
3356int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
3357{
3358 dout(10) << __func__ << " file " << f->fnode << " 0x"
3359 << std::hex << off << "~" << len << std::dec << dendl;
3360 if (f->deleted) {
3361 dout(10) << __func__ << " deleted, no-op" << dendl;
3362 return 0;
3363 }
11fdf7f2 3364 ceph_assert(f->fnode.ino > 1);
7c673cae
FG
3365 uint64_t allocated = f->fnode.get_allocated();
3366 if (off + len > allocated) {
3367 uint64_t want = off + len - allocated;
9f95a23c
TL
3368 vselector->sub_usage(f->vselector_hint, f->fnode);
3369
3370 int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
3371 want,
3372 &f->fnode);
3373 vselector->add_usage(f->vselector_hint, f->fnode);
7c673cae
FG
3374 if (r < 0)
3375 return r;
7c673cae
FG
3376 log_t.op_file_update(f->fnode);
3377 }
3378 return 0;
3379}
3380
1911f103 3381void BlueFS::sync_metadata(bool avoid_compact)
7c673cae 3382{
f6b5b4d7 3383 std::unique_lock<ceph::mutex> l(lock);
9f95a23c 3384 if (log_t.empty() && dirty_files.empty()) {
7c673cae 3385 dout(10) << __func__ << " - no pending log events" << dendl;
11fdf7f2
TL
3386 } else {
3387 dout(10) << __func__ << dendl;
3388 utime_t start = ceph_clock_now();
3389 flush_bdev(); // FIXME?
3390 _flush_and_sync_log(l);
3391 dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 3392 }
7c673cae 3393
f6b5b4d7
TL
3394 if (!avoid_compact) {
3395 _maybe_compact_log(l);
3396 }
3397}
3398
3399void BlueFS::_maybe_compact_log(std::unique_lock<ceph::mutex>& l)
3400{
3401 if (!cct->_conf->bluefs_replay_recovery_disable_compact &&
3402 _should_compact_log()) {
7c673cae
FG
3403 if (cct->_conf->bluefs_compact_log_sync) {
3404 _compact_log_sync();
3405 } else {
3406 _compact_log_async(l);
3407 }
3408 }
7c673cae
FG
3409}
3410
3411int BlueFS::open_for_write(
6d8e3169
FG
3412 std::string_view dirname,
3413 std::string_view filename,
7c673cae
FG
3414 FileWriter **h,
3415 bool overwrite)
3416{
11fdf7f2 3417 std::lock_guard l(lock);
7c673cae
FG
3418 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3419 map<string,DirRef>::iterator p = dir_map.find(dirname);
3420 DirRef dir;
3421 if (p == dir_map.end()) {
3422 // implicitly create the dir
3423 dout(20) << __func__ << " dir " << dirname
3424 << " does not exist" << dendl;
3425 return -ENOENT;
3426 } else {
3427 dir = p->second;
3428 }
3429
3430 FileRef file;
3431 bool create = false;
f6b5b4d7 3432 bool truncate = false;
7c673cae
FG
3433 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3434 if (q == dir->file_map.end()) {
3435 if (overwrite) {
3436 dout(20) << __func__ << " dir " << dirname << " (" << dir
3437 << ") file " << filename
3438 << " does not exist" << dendl;
3439 return -ENOENT;
3440 }
9f95a23c 3441 file = ceph::make_ref<File>();
7c673cae
FG
3442 file->fnode.ino = ++ino_last;
3443 file_map[ino_last] = file;
6d8e3169 3444 dir->file_map[string{filename}] = file;
7c673cae
FG
3445 ++file->refs;
3446 create = true;
3447 } else {
3448 // overwrite existing file?
3449 file = q->second;
3450 if (overwrite) {
3451 dout(20) << __func__ << " dir " << dirname << " (" << dir
3452 << ") file " << filename
3453 << " already exists, overwrite in place" << dendl;
3454 } else {
3455 dout(20) << __func__ << " dir " << dirname << " (" << dir
3456 << ") file " << filename
3457 << " already exists, truncate + overwrite" << dendl;
9f95a23c 3458 vselector->sub_usage(file->vselector_hint, file->fnode);
7c673cae
FG
3459 file->fnode.size = 0;
3460 for (auto& p : file->fnode.extents) {
3461 pending_release[p.bdev].insert(p.offset, p.length);
3462 }
f6b5b4d7 3463 truncate = true;
94b18763
FG
3464
3465 file->fnode.clear_extents();
7c673cae
FG
3466 }
3467 }
11fdf7f2 3468 ceph_assert(file->fnode.ino > 1);
7c673cae
FG
3469
3470 file->fnode.mtime = ceph_clock_now();
9f95a23c 3471 file->vselector_hint = vselector->get_hint_by_dir(dirname);
f6b5b4d7
TL
3472 if (create || truncate) {
3473 vselector->add_usage(file->vselector_hint, file->fnode); // update file count
3474 }
9f95a23c 3475
7c673cae 3476 dout(20) << __func__ << " mapping " << dirname << "/" << filename
9f95a23c
TL
3477 << " vsel_hint " << file->vselector_hint
3478 << dendl;
7c673cae
FG
3479
3480 log_t.op_file_update(file->fnode);
3481 if (create)
3482 log_t.op_dir_link(dirname, filename, file->fnode.ino);
3483
3484 *h = _create_writer(file);
3485
3486 if (boost::algorithm::ends_with(filename, ".log")) {
3487 (*h)->writer_type = BlueFS::WRITER_WAL;
3488 if (logger && !overwrite) {
3489 logger->inc(l_bluefs_files_written_wal);
3490 }
3491 } else if (boost::algorithm::ends_with(filename, ".sst")) {
3492 (*h)->writer_type = BlueFS::WRITER_SST;
3493 if (logger) {
3494 logger->inc(l_bluefs_files_written_sst);
3495 }
3496 }
3497
3498 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3499 return 0;
3500}
3501
3502BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
3503{
3504 FileWriter *w = new FileWriter(f);
3505 for (unsigned i = 0; i < MAX_BDEV; ++i) {
3506 if (bdev[i]) {
3507 w->iocv[i] = new IOContext(cct, NULL);
7c673cae
FG
3508 }
3509 }
3510 return w;
3511}
3512
3513void BlueFS::_close_writer(FileWriter *h)
3514{
3515 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
f91f0fd5 3516 h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
7c673cae
FG
3517 for (unsigned i=0; i<MAX_BDEV; ++i) {
3518 if (bdev[i]) {
11fdf7f2
TL
3519 if (h->iocv[i]) {
3520 h->iocv[i]->aio_wait();
3521 bdev[i]->queue_reap_ioc(h->iocv[i]);
3522 }
7c673cae
FG
3523 }
3524 }
3525 delete h;
3526}
3527
6d8e3169
FG
3528uint64_t BlueFS::debug_get_dirty_seq(FileWriter *h)
3529{
3530 std::lock_guard l(lock);
3531 return h->file->dirty_seq;
3532}
3533
3534bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev)
3535{
3536 std::lock_guard l(lock);
3537 return h->dirty_devs[dev];
3538}
3539
7c673cae 3540int BlueFS::open_for_read(
6d8e3169
FG
3541 std::string_view dirname,
3542 std::string_view filename,
7c673cae
FG
3543 FileReader **h,
3544 bool random)
3545{
11fdf7f2 3546 std::lock_guard l(lock);
7c673cae
FG
3547 dout(10) << __func__ << " " << dirname << "/" << filename
3548 << (random ? " (random)":" (sequential)") << dendl;
3549 map<string,DirRef>::iterator p = dir_map.find(dirname);
3550 if (p == dir_map.end()) {
3551 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3552 return -ENOENT;
3553 }
3554 DirRef dir = p->second;
3555
3556 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3557 if (q == dir->file_map.end()) {
3558 dout(20) << __func__ << " dir " << dirname << " (" << dir
3559 << ") file " << filename
3560 << " not found" << dendl;
3561 return -ENOENT;
3562 }
3563 File *file = q->second.get();
3564
3565 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
3566 random, false);
3567 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3568 return 0;
3569}
3570
3571int BlueFS::rename(
6d8e3169
FG
3572 std::string_view old_dirname, std::string_view old_filename,
3573 std::string_view new_dirname, std::string_view new_filename)
7c673cae 3574{
11fdf7f2 3575 std::lock_guard l(lock);
7c673cae
FG
3576 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
3577 << " -> " << new_dirname << "/" << new_filename << dendl;
3578 map<string,DirRef>::iterator p = dir_map.find(old_dirname);
3579 if (p == dir_map.end()) {
3580 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
3581 return -ENOENT;
3582 }
3583 DirRef old_dir = p->second;
3584 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
3585 if (q == old_dir->file_map.end()) {
3586 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
3587 << ") file " << old_filename
3588 << " not found" << dendl;
3589 return -ENOENT;
3590 }
3591 FileRef file = q->second;
3592
3593 p = dir_map.find(new_dirname);
3594 if (p == dir_map.end()) {
3595 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
3596 return -ENOENT;
3597 }
3598 DirRef new_dir = p->second;
3599 q = new_dir->file_map.find(new_filename);
3600 if (q != new_dir->file_map.end()) {
3601 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
3602 << ") file " << new_filename
3603 << " already exists, unlinking" << dendl;
11fdf7f2 3604 ceph_assert(q->second != file);
7c673cae
FG
3605 log_t.op_dir_unlink(new_dirname, new_filename);
3606 _drop_link(q->second);
3607 }
3608
3609 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
3610 << " " << file->fnode << dendl;
3611
6d8e3169
FG
3612 new_dir->file_map[string{new_filename}] = file;
3613 old_dir->file_map.erase(string{old_filename});
7c673cae
FG
3614
3615 log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
3616 log_t.op_dir_unlink(old_dirname, old_filename);
3617 return 0;
3618}
3619
6d8e3169 3620int BlueFS::mkdir(std::string_view dirname)
7c673cae 3621{
11fdf7f2 3622 std::lock_guard l(lock);
7c673cae
FG
3623 dout(10) << __func__ << " " << dirname << dendl;
3624 map<string,DirRef>::iterator p = dir_map.find(dirname);
3625 if (p != dir_map.end()) {
3626 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
3627 return -EEXIST;
3628 }
6d8e3169 3629 dir_map[string{dirname}] = ceph::make_ref<Dir>();
7c673cae
FG
3630 log_t.op_dir_create(dirname);
3631 return 0;
3632}
3633
6d8e3169 3634int BlueFS::rmdir(std::string_view dirname)
7c673cae 3635{
11fdf7f2 3636 std::lock_guard l(lock);
7c673cae 3637 dout(10) << __func__ << " " << dirname << dendl;
6d8e3169 3638 auto p = dir_map.find(dirname);
7c673cae
FG
3639 if (p == dir_map.end()) {
3640 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
3641 return -ENOENT;
3642 }
3643 DirRef dir = p->second;
3644 if (!dir->file_map.empty()) {
3645 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
3646 return -ENOTEMPTY;
3647 }
6d8e3169 3648 dir_map.erase(string{dirname});
7c673cae
FG
3649 log_t.op_dir_remove(dirname);
3650 return 0;
3651}
3652
6d8e3169 3653bool BlueFS::dir_exists(std::string_view dirname)
7c673cae 3654{
11fdf7f2 3655 std::lock_guard l(lock);
7c673cae
FG
3656 map<string,DirRef>::iterator p = dir_map.find(dirname);
3657 bool exists = p != dir_map.end();
3658 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
3659 return exists;
3660}
3661
6d8e3169 3662int BlueFS::stat(std::string_view dirname, std::string_view filename,
7c673cae
FG
3663 uint64_t *size, utime_t *mtime)
3664{
11fdf7f2 3665 std::lock_guard l(lock);
7c673cae
FG
3666 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3667 map<string,DirRef>::iterator p = dir_map.find(dirname);
3668 if (p == dir_map.end()) {
3669 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3670 return -ENOENT;
3671 }
3672 DirRef dir = p->second;
3673 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3674 if (q == dir->file_map.end()) {
3675 dout(20) << __func__ << " dir " << dirname << " (" << dir
3676 << ") file " << filename
3677 << " not found" << dendl;
3678 return -ENOENT;
3679 }
3680 File *file = q->second.get();
3681 dout(10) << __func__ << " " << dirname << "/" << filename
3682 << " " << file->fnode << dendl;
3683 if (size)
3684 *size = file->fnode.size;
3685 if (mtime)
3686 *mtime = file->fnode.mtime;
3687 return 0;
3688}
3689
6d8e3169 3690int BlueFS::lock_file(std::string_view dirname, std::string_view filename,
7c673cae
FG
3691 FileLock **plock)
3692{
11fdf7f2 3693 std::lock_guard l(lock);
7c673cae
FG
3694 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3695 map<string,DirRef>::iterator p = dir_map.find(dirname);
3696 if (p == dir_map.end()) {
3697 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3698 return -ENOENT;
3699 }
3700 DirRef dir = p->second;
6d8e3169 3701 auto q = dir->file_map.find(filename);
9f95a23c 3702 FileRef file;
7c673cae
FG
3703 if (q == dir->file_map.end()) {
3704 dout(20) << __func__ << " dir " << dirname << " (" << dir
3705 << ") file " << filename
3706 << " not found, creating" << dendl;
9f95a23c 3707 file = ceph::make_ref<File>();
7c673cae
FG
3708 file->fnode.ino = ++ino_last;
3709 file->fnode.mtime = ceph_clock_now();
3710 file_map[ino_last] = file;
6d8e3169 3711 dir->file_map[string{filename}] = file;
7c673cae
FG
3712 ++file->refs;
3713 log_t.op_file_update(file->fnode);
3714 log_t.op_dir_link(dirname, filename, file->fnode.ino);
3715 } else {
9f95a23c 3716 file = q->second;
7c673cae
FG
3717 if (file->locked) {
3718 dout(10) << __func__ << " already locked" << dendl;
11fdf7f2 3719 return -ENOLCK;
7c673cae
FG
3720 }
3721 }
3722 file->locked = true;
3723 *plock = new FileLock(file);
3724 dout(10) << __func__ << " locked " << file->fnode
3725 << " with " << *plock << dendl;
3726 return 0;
3727}
3728
3729int BlueFS::unlock_file(FileLock *fl)
3730{
11fdf7f2 3731 std::lock_guard l(lock);
7c673cae 3732 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
11fdf7f2 3733 ceph_assert(fl->file->locked);
7c673cae
FG
3734 fl->file->locked = false;
3735 delete fl;
3736 return 0;
3737}
3738
6d8e3169 3739int BlueFS::readdir(std::string_view dirname, vector<string> *ls)
7c673cae 3740{
6d8e3169
FG
3741 // dirname may contain a trailing /
3742 if (!dirname.empty() && dirname.back() == '/') {
3743 dirname.remove_suffix(1);
3744 }
11fdf7f2 3745 std::lock_guard l(lock);
7c673cae
FG
3746 dout(10) << __func__ << " " << dirname << dendl;
3747 if (dirname.empty()) {
3748 // list dirs
3749 ls->reserve(dir_map.size() + 2);
3750 for (auto& q : dir_map) {
3751 ls->push_back(q.first);
3752 }
3753 } else {
3754 // list files in dir
3755 map<string,DirRef>::iterator p = dir_map.find(dirname);
3756 if (p == dir_map.end()) {
3757 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3758 return -ENOENT;
3759 }
3760 DirRef dir = p->second;
3761 ls->reserve(dir->file_map.size() + 2);
3762 for (auto& q : dir->file_map) {
3763 ls->push_back(q.first);
3764 }
3765 }
3766 ls->push_back(".");
3767 ls->push_back("..");
3768 return 0;
3769}
3770
6d8e3169 3771int BlueFS::unlink(std::string_view dirname, std::string_view filename)
7c673cae 3772{
11fdf7f2 3773 std::lock_guard l(lock);
7c673cae
FG
3774 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3775 map<string,DirRef>::iterator p = dir_map.find(dirname);
3776 if (p == dir_map.end()) {
3777 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3778 return -ENOENT;
3779 }
3780 DirRef dir = p->second;
3781 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3782 if (q == dir->file_map.end()) {
3783 dout(20) << __func__ << " file " << dirname << "/" << filename
3784 << " not found" << dendl;
3785 return -ENOENT;
3786 }
3787 FileRef file = q->second;
3788 if (file->locked) {
3789 dout(20) << __func__ << " file " << dirname << "/" << filename
3790 << " is locked" << dendl;
3791 return -EBUSY;
3792 }
6d8e3169 3793 dir->file_map.erase(string{filename});
7c673cae
FG
3794 log_t.op_dir_unlink(dirname, filename);
3795 _drop_link(file);
3796 return 0;
3797}
d2e6a577
FG
3798
3799bool BlueFS::wal_is_rotational()
3800{
94b18763
FG
3801 if (bdev[BDEV_WAL]) {
3802 return bdev[BDEV_WAL]->is_rotational();
3803 } else if (bdev[BDEV_DB]) {
3804 return bdev[BDEV_DB]->is_rotational();
3805 }
3806 return bdev[BDEV_SLOW]->is_rotational();
d2e6a577 3807}
9f95a23c 3808
f6b5b4d7
TL
3809/*
3810 Algorithm.
3811 do_replay_recovery_read is used when bluefs log abruptly ends, but it seems that more data should be there.
3812 Idea is to search disk for definiton of extents that will be accompanied with bluefs log in future,
3813 and try if using it will produce healthy bluefs transaction.
3814 We encode already known bluefs log extents and search disk for these bytes.
3815 When we find it, we decode following bytes as extent.
3816 We read that whole extent and then check if merged with existing log part gives a proper bluefs transaction.
3817 */
3818int BlueFS::do_replay_recovery_read(FileReader *log_reader,
3819 size_t replay_pos,
3820 size_t read_offset,
3821 size_t read_len,
3822 bufferlist* bl) {
3823 dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos <<
3824 " needs 0x" << read_offset << "~" << read_len << std::dec << dendl;
3825
3826 bluefs_fnode_t& log_fnode = log_reader->file->fnode;
3827 bufferlist bin_extents;
3828 ceph::encode(log_fnode.extents, bin_extents);
3829 dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl;
3830
3831 // cannot process if too small to effectively search
3832 ceph_assert(bin_extents.length() >= 32);
3833 bufferlist last_32;
3834 last_32.substr_of(bin_extents, bin_extents.length() - 32, 32);
3835
3836 //read fixed part from replay_pos to end of bluefs_log extents
3837 bufferlist fixed;
3838 uint64_t e_off = 0;
3839 auto e = log_fnode.seek(replay_pos, &e_off);
3840 ceph_assert(e != log_fnode.extents.end());
3841 int r = bdev[e->bdev]->read(e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev],
3842 cct->_conf->bluefs_buffered_io);
3843 ceph_assert(r == 0);
3844 //capture dev of last good extent
3845 uint8_t last_e_dev = e->bdev;
3846 uint64_t last_e_off = e->offset;
3847 ++e;
3848 while (e != log_fnode.extents.end()) {
3849 r = bdev[e->bdev]->read(e->offset, e->length, &fixed, ioc[e->bdev],
3850 cct->_conf->bluefs_buffered_io);
3851 ceph_assert(r == 0);
3852 last_e_dev = e->bdev;
3853 ++e;
3854 }
3855 ceph_assert(replay_pos + fixed.length() == read_offset);
3856
3857 dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl;
3858
3859 struct compare {
3860 bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const {
3861 if (a.bdev < b.bdev) return true;
3862 if (a.offset < b.offset) return true;
3863 return a.length < b.length;
3864 }
3865 };
3866 std::set<bluefs_extent_t, compare> extents_rejected;
3867 for (int dcnt = 0; dcnt < 3; dcnt++) {
3868 uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV;
3869 if (bdev[dev] == nullptr) continue;
3870 dout(2) << __func__ << " processing " << get_device_name(dev) << dendl;
3871 interval_set<uint64_t> disk_regions;
3872 disk_regions.insert(0, bdev[dev]->get_size());
3873 for (auto f : file_map) {
3874 auto& e = f.second->fnode.extents;
3875 for (auto& p : e) {
3876 if (p.bdev == dev) {
3877 disk_regions.erase(p.offset, p.length);
3878 }
3879 }
3880 }
3881 size_t disk_regions_count = disk_regions.num_intervals();
3882 dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl;
3883
3884 auto reg = disk_regions.lower_bound(last_e_off);
3885 //for all except first, start from beginning
3886 last_e_off = 0;
3887 if (reg == disk_regions.end()) {
3888 reg = disk_regions.begin();
3889 }
3890 const uint64_t chunk_size = 4 * 1024 * 1024;
3891 const uint64_t page_size = 4096;
3892 const uint64_t max_extent_size = 16;
3893 uint64_t overlay_size = last_32.length() + max_extent_size;
3894 for (size_t i = 0; i < disk_regions_count; reg++, i++) {
3895 if (reg == disk_regions.end()) {
3896 reg = disk_regions.begin();
3897 }
3898 uint64_t pos = reg.get_start();
3899 uint64_t len = reg.get_len();
3900
3901 std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]};
3902 char* raw_data = raw_data_p.get();
3903 memset(raw_data, 0, page_size);
3904
3905 while (len > last_32.length()) {
3906 uint64_t chunk_len = len > chunk_size ? chunk_size : len;
3907 dout(5) << __func__ << " read "
3908 << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len << std::dec << dendl;
3909 r = bdev[dev]->read_random(pos, chunk_len, raw_data + page_size, cct->_conf->bluefs_buffered_io);
3910 ceph_assert(r == 0);
3911
3912 //search for fixed_last_32
3913 char* chunk_b = raw_data + page_size;
3914 char* chunk_e = chunk_b + chunk_len;
3915
3916 char* search_b = chunk_b - overlay_size;
3917 char* search_e = chunk_e;
3918
3919 for (char* sp = search_b; ; sp += last_32.length()) {
3920 sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length());
3921 if (sp == nullptr) {
3922 break;
3923 }
3924
3925 char* n = sp + last_32.length();
3926 dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl;
3927 bufferlist test;
3928 test.append(n, std::min<size_t>(max_extent_size, chunk_e - n));
3929 bluefs_extent_t ne;
3930 try {
3931 bufferlist::const_iterator p = test.begin();
3932 ceph::decode(ne, p);
3933 } catch (buffer::error& e) {
3934 continue;
3935 }
3936 if (extents_rejected.count(ne) != 0) {
3937 dout(5) << __func__ << " extent " << ne << " already refected" <<dendl;
3938 continue;
3939 }
3940 //insert as rejected already. if we succeed, it wouldn't make difference.
3941 extents_rejected.insert(ne);
3942
3943 if (ne.bdev >= MAX_BDEV ||
3944 bdev[ne.bdev] == nullptr ||
3945 ne.length > 16 * 1024 * 1024 ||
3946 (ne.length & 4095) != 0 ||
3947 ne.offset + ne.length > bdev[ne.bdev]->get_size() ||
3948 (ne.offset & 4095) != 0) {
3949 dout(5) << __func__ << " refusing extent " << ne << dendl;
3950 continue;
3951 }
3952 dout(5) << __func__ << " checking extent " << ne << dendl;
3953
3954 //read candidate extent - whole
3955 bufferlist candidate;
3956 candidate.append(fixed);
3957 r = bdev[ne.bdev]->read(ne.offset, ne.length, &candidate, ioc[ne.bdev],
3958 cct->_conf->bluefs_buffered_io);
3959 ceph_assert(r == 0);
3960
3961 //check if transaction & crc is ok
3962 bluefs_transaction_t t;
3963 try {
3964 bufferlist::const_iterator p = candidate.cbegin();
3965 decode(t, p);
3966 }
3967 catch (buffer::error& e) {
3968 dout(5) << __func__ << " failed match" << dendl;
3969 continue;
3970 }
3971
3972 //success, it seems a probable candidate
3973 uint64_t l = std::min<uint64_t>(ne.length, read_len);
3974 //trim to required size
3975 bufferlist requested_read;
3976 requested_read.substr_of(candidate, fixed.length(), l);
3977 bl->append(requested_read);
3978 dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl;
3979 log_fnode.append_extent(ne);
3980 log_fnode.recalc_allocated();
3981 log_reader->buf.pos += l;
3982 return l;
3983 }
3984 //save overlay for next search
3985 memcpy(search_b, chunk_e - overlay_size, overlay_size);
3986 pos += chunk_len;
3987 len -= chunk_len;
3988 }
3989 }
3990 }
3991 return 0;
3992}
3993
9f95a23c
TL
3994void BlueFS::debug_inject_duplicate_gift(unsigned id,
3995 uint64_t offset,
3996 uint64_t len)
3997{
3998 dout(0) << __func__ << dendl;
3999 if (id < alloc.size() && alloc[id]) {
4000 alloc[id]->init_add_free(offset, len);
4001 }
4002}
4003
4004// ===============================================
4005// OriginalVolumeSelector
4006
f6b5b4d7
TL
4007void* OriginalVolumeSelector::get_hint_for_log() const {
4008 return reinterpret_cast<void*>(BlueFS::BDEV_WAL);
9f95a23c 4009}
6d8e3169 4010void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
9f95a23c
TL
4011 uint8_t res = BlueFS::BDEV_DB;
4012 if (dirname.length() > 5) {
4013 // the "db.slow" and "db.wal" directory names are hard-coded at
4014 // match up with bluestore. the slow device is always the second
4015 // one (when a dedicated block.db device is present and used at
4016 // bdev 0). the wal device is always last.
4017 if (boost::algorithm::ends_with(dirname, ".slow")) {
4018 res = BlueFS::BDEV_SLOW;
4019 }
4020 else if (boost::algorithm::ends_with(dirname, ".wal")) {
4021 res = BlueFS::BDEV_WAL;
4022 }
4023 }
4024 return reinterpret_cast<void*>(res);
4025}
4026
4027uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
4028{
4029 return (uint8_t)(reinterpret_cast<uint64_t>(hint));
4030}
4031
4032void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
4033{
4034 res.emplace_back(base, db_total);
4035 res.emplace_back(base + ".slow", slow_total);
4036}
4037
4038#undef dout_prefix
4039#define dout_prefix *_dout << "OriginalVolumeSelector: "
4040
4041void OriginalVolumeSelector::dump(ostream& sout) {
4042 sout<< "wal_total:" << wal_total
4043 << ", db_total:" << db_total
4044 << ", slow_total:" << slow_total
4045 << std::endl;
4046}