]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueFS.cc
bump version to 16.2.6-pve2
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "boost/algorithm/string.hpp"
9f95a23c 5#include "bluestore_common.h"
7c673cae
FG
6#include "BlueFS.h"
7
8#include "common/debug.h"
9#include "common/errno.h"
10#include "common/perf_counters.h"
7c673cae 11#include "Allocator.h"
11fdf7f2 12#include "include/ceph_assert.h"
eafe8130 13#include "common/admin_socket.h"
7c673cae
FG
14
15#define dout_context cct
16#define dout_subsys ceph_subsys_bluefs
17#undef dout_prefix
18#define dout_prefix *_dout << "bluefs "
9f95a23c 19using TOPNSPC::common::cmd_getval;
f67539c2
TL
20
21using std::byte;
22using std::list;
23using std::make_pair;
24using std::map;
25using std::ostream;
26using std::pair;
27using std::set;
28using std::string;
29using std::to_string;
30using std::vector;
31
32using ceph::bufferlist;
33using ceph::decode;
34using ceph::encode;
35using ceph::Formatter;
36
37
7c673cae
FG
38MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
39MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
f91f0fd5 40MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer);
7c673cae 41MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
f91f0fd5
TL
42 bluefs_file_reader_buffer, bluefs_file_reader);
43MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader);
7c673cae
FG
44MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
45
11fdf7f2
TL
46static void wal_discard_cb(void *priv, void* priv2) {
47 BlueFS *bluefs = static_cast<BlueFS*>(priv);
48 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
49 bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
50}
51
52static void db_discard_cb(void *priv, void* priv2) {
53 BlueFS *bluefs = static_cast<BlueFS*>(priv);
54 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
55 bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
56}
57
58static void slow_discard_cb(void *priv, void* priv2) {
59 BlueFS *bluefs = static_cast<BlueFS*>(priv);
60 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
61 bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
62}
7c673cae 63
eafe8130
TL
64class BlueFS::SocketHook : public AdminSocketHook {
65 BlueFS* bluefs;
66public:
67 static BlueFS::SocketHook* create(BlueFS* bluefs)
68 {
69 BlueFS::SocketHook* hook = nullptr;
70 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
71 if (admin_socket) {
72 hook = new BlueFS::SocketHook(bluefs);
f67539c2 73 int r = admin_socket->register_command("bluestore bluefs device info "
eafe8130
TL
74 "name=alloc_size,type=CephInt,req=false",
75 hook,
f67539c2
TL
76 "Shows space report for bluefs devices. "
77 "This also includes an estimation for space "
78 "available to bluefs at main device. "
79 "alloc_size, if set, specifies the custom bluefs "
80 "allocation unit size for the estimation above.");
eafe8130
TL
81 if (r != 0) {
82 ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
83 delete hook;
84 hook = nullptr;
9f95a23c 85 } else {
f6b5b4d7 86 r = admin_socket->register_command("bluefs stats",
9f95a23c
TL
87 hook,
88 "Dump internal statistics for bluefs."
89 "");
90 ceph_assert(r == 0);
f67539c2
TL
91 r = admin_socket->register_command("bluefs files list", hook,
92 "print files in bluefs");
93 ceph_assert(r == 0);
cd265ab1
TL
94 r = admin_socket->register_command("bluefs debug_inject_read_zeros", hook,
95 "Injects 8K zeros into next BlueFS read. Debug only.");
96 ceph_assert(r == 0);
eafe8130
TL
97 }
98 }
99 return hook;
100 }
101
102 ~SocketHook() {
103 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
9f95a23c 104 admin_socket->unregister_commands(this);
eafe8130
TL
105 }
106private:
107 SocketHook(BlueFS* bluefs) :
108 bluefs(bluefs) {}
9f95a23c
TL
109 int call(std::string_view command, const cmdmap_t& cmdmap,
110 Formatter *f,
111 std::ostream& errss,
112 bufferlist& out) override {
f67539c2 113 if (command == "bluestore bluefs device info") {
9f95a23c
TL
114 int64_t alloc_size = 0;
115 cmd_getval(cmdmap, "alloc_size", alloc_size);
116 if ((alloc_size & (alloc_size - 1)) != 0) {
117 errss << "Invalid allocation size:'" << alloc_size << std::endl;
118 return -EINVAL;
119 }
120 if (alloc_size == 0)
f67539c2
TL
121 alloc_size = bluefs->cct->_conf->bluefs_shared_alloc_size;
122 f->open_object_section("bluefs_device_info");
9f95a23c
TL
123 for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
124 if (bluefs->bdev[dev]) {
125 f->open_object_section("dev");
126 f->dump_string("device", bluefs->get_device_name(dev));
127 ceph_assert(bluefs->alloc[dev]);
f67539c2
TL
128 auto total = bluefs->get_total(dev);
129 auto free = bluefs->get_free(dev);
130 auto used = bluefs->get_used(dev);
131
132 f->dump_int("total", total);
133 f->dump_int("free", free);
134 f->dump_int("bluefs_used", used);
135 if (bluefs->is_shared_alloc(dev)) {
136 size_t avail = bluefs->probe_alloc_avail(dev, alloc_size);
137 f->dump_int("bluefs max available", avail);
138 }
139 f->close_section();
140 }
eafe8130 141 }
f67539c2 142
9f95a23c
TL
143 f->close_section();
144 } else if (command == "bluefs stats") {
145 std::stringstream ss;
146 bluefs->dump_block_extents(ss);
147 bluefs->dump_volume_selector(ss);
eafe8130 148 out.append(ss);
f67539c2
TL
149 } else if (command == "bluefs files list") {
150 const char* devnames[3] = {"wal","db","slow"};
151 std::lock_guard l(bluefs->lock);
152 f->open_array_section("files");
153 for (auto &d : bluefs->dir_map) {
154 std::string dir = d.first;
155 for (auto &r : d.second->file_map) {
156 f->open_object_section("file");
157 f->dump_string("name", (dir + "/" + r.first).c_str());
158 std::vector<size_t> sizes;
159 sizes.resize(bluefs->bdev.size());
160 for(auto& i : r.second->fnode.extents) {
161 sizes[i.bdev] += i.length;
162 }
163 for (size_t i = 0; i < sizes.size(); i++) {
164 if (sizes[i]>0) {
165 if (i < sizeof(devnames) / sizeof(*devnames))
166 f->dump_int(devnames[i], sizes[i]);
167 else
168 f->dump_int(("dev-"+to_string(i)).c_str(), sizes[i]);
169 }
170 }
171 f->close_section();
172 }
173 }
174 f->close_section();
175 f->flush(out);
cd265ab1
TL
176 } else if (command == "bluefs debug_inject_read_zeros") {
177 bluefs->inject_read_zeros++;
9f95a23c
TL
178 } else {
179 errss << "Invalid command" << std::endl;
180 return -ENOSYS;
eafe8130 181 }
9f95a23c
TL
182 return 0;
183 }
eafe8130
TL
184};
185
7c673cae
FG
186BlueFS::BlueFS(CephContext* cct)
187 : cct(cct),
188 bdev(MAX_BDEV),
189 ioc(MAX_BDEV),
f67539c2
TL
190 block_reserved(MAX_BDEV),
191 alloc(MAX_BDEV),
192 alloc_size(MAX_BDEV, 0),
193 pending_release(MAX_BDEV)
7c673cae 194{
11fdf7f2
TL
195 discard_cb[BDEV_WAL] = wal_discard_cb;
196 discard_cb[BDEV_DB] = db_discard_cb;
197 discard_cb[BDEV_SLOW] = slow_discard_cb;
eafe8130 198 asok_hook = SocketHook::create(this);
f67539c2 199
7c673cae
FG
200}
201
202BlueFS::~BlueFS()
203{
eafe8130 204 delete asok_hook;
7c673cae
FG
205 for (auto p : ioc) {
206 if (p)
207 p->aio_wait();
208 }
209 for (auto p : bdev) {
210 if (p) {
211 p->close();
212 delete p;
213 }
214 }
215 for (auto p : ioc) {
216 delete p;
217 }
218}
219
220void BlueFS::_init_logger()
221{
222 PerfCountersBuilder b(cct, "bluefs",
223 l_bluefs_first, l_bluefs_last);
7c673cae
FG
224 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
225 "Total bytes (main db device)",
11fdf7f2 226 "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
227 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
228 "Used bytes (main db device)",
11fdf7f2 229 "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
230 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
231 "Total bytes (wal device)",
11fdf7f2 232 "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
233 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
234 "Used bytes (wal device)",
11fdf7f2 235 "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
236 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
237 "Total bytes (slow device)",
11fdf7f2 238 "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
239 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
240 "Used bytes (slow device)",
11fdf7f2 241 "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
242 b.add_u64(l_bluefs_num_files, "num_files", "File count",
243 "f", PerfCountersBuilder::PRIO_USEFUL);
244 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
11fdf7f2 245 "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
7c673cae
FG
246 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
247 "Compactions of the metadata log");
248 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
249 "Bytes written to the metadata log", "j",
11fdf7f2 250 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
7c673cae
FG
251 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
252 "Files written to WAL");
253 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
254 "Files written to SSTs");
255 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
256 "Bytes written to WAL", "wal",
257 PerfCountersBuilder::PRIO_CRITICAL);
258 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
259 "Bytes written to SSTs", "sst",
11fdf7f2
TL
260 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
261 b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
262 "Bytes written to WAL/SSTs at slow device", NULL,
263 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
264 b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
265 "Maximum bytes allocated from WAL");
266 b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
267 "Maximum bytes allocated from DB");
268 b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
269 "Maximum bytes allocated from SLOW");
494da23a
TL
270
271 b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
272 "random read requests processed");
273 b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
274 "Bytes requested in random read mode", NULL,
275 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
276 b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
277 "random reads requests going to disk");
278 b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
279 "Bytes read from disk in random read mode", NULL,
280 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
281 b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
282 "random read requests processed using prefetch buffer");
283 b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
284 "Bytes read from prefetch buffer in random read mode", NULL,
285 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
286
287 b.add_u64_counter(l_bluefs_read_count, "read_count",
288 "buffered read requests processed");
289 b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
290 "Bytes requested in buffered read mode", NULL,
291 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
292
293 b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
294 "prefetch read requests processed");
295 b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
296 "Bytes requested in prefetch read mode", NULL,
297 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
cd265ab1
TL
298 b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate",
299 "How many times bluefs read found page with all 0s");
300 b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors",
301 "How many times bluefs read found transient page with all 0s");
494da23a 302
7c673cae
FG
303 logger = b.create_perf_counters();
304 cct->get_perfcounters_collection()->add(logger);
305}
306
307void BlueFS::_shutdown_logger()
308{
309 cct->get_perfcounters_collection()->remove(logger);
310 delete logger;
311}
312
313void BlueFS::_update_logger_stats()
314{
315 // we must be holding the lock
316 logger->set(l_bluefs_num_files, file_map.size());
317 logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
318
319 if (alloc[BDEV_WAL]) {
f67539c2
TL
320 logger->set(l_bluefs_wal_total_bytes, _get_total(BDEV_WAL));
321 logger->set(l_bluefs_wal_used_bytes, _get_used(BDEV_WAL));
7c673cae
FG
322 }
323 if (alloc[BDEV_DB]) {
f67539c2
TL
324 logger->set(l_bluefs_db_total_bytes, _get_total(BDEV_DB));
325 logger->set(l_bluefs_db_used_bytes, _get_used(BDEV_DB));
7c673cae
FG
326 }
327 if (alloc[BDEV_SLOW]) {
f67539c2
TL
328 logger->set(l_bluefs_slow_total_bytes, _get_total(BDEV_SLOW));
329 logger->set(l_bluefs_slow_used_bytes, _get_used(BDEV_SLOW));
7c673cae
FG
330 }
331}
332
11fdf7f2 333int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
f67539c2
TL
334 uint64_t reserved,
335 bluefs_shared_alloc_context_t* _shared_alloc)
7c673cae 336{
f67539c2
TL
337 dout(10) << __func__ << " bdev " << id << " path " << path << " "
338 << reserved << dendl;
11fdf7f2
TL
339 ceph_assert(id < bdev.size());
340 ceph_assert(bdev[id] == NULL);
341 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
342 discard_cb[id], static_cast<void*>(this));
f67539c2
TL
343 block_reserved[id] = reserved;
344 if (_shared_alloc) {
11fdf7f2
TL
345 b->set_no_exclusive_lock();
346 }
7c673cae
FG
347 int r = b->open(path);
348 if (r < 0) {
349 delete b;
350 return r;
351 }
11fdf7f2
TL
352 if (trim) {
353 b->discard(0, b->get_size());
354 }
355
7c673cae 356 dout(1) << __func__ << " bdev " << id << " path " << path
1adf2230 357 << " size " << byte_u_t(b->get_size()) << dendl;
7c673cae
FG
358 bdev[id] = b;
359 ioc[id] = new IOContext(cct, NULL);
f67539c2
TL
360 if (_shared_alloc) {
361 ceph_assert(!shared_alloc);
362 shared_alloc = _shared_alloc;
363 alloc[id] = shared_alloc->a;
364 shared_alloc_id = id;
365 }
7c673cae
FG
366 return 0;
367}
368
369bool BlueFS::bdev_support_label(unsigned id)
370{
11fdf7f2
TL
371 ceph_assert(id < bdev.size());
372 ceph_assert(bdev[id]);
7c673cae
FG
373 return bdev[id]->supported_bdev_label();
374}
375
f67539c2 376uint64_t BlueFS::get_block_device_size(unsigned id) const
7c673cae
FG
377{
378 if (id < bdev.size() && bdev[id])
379 return bdev[id]->get_size();
380 return 0;
381}
382
f67539c2 383void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
7c673cae 384{
f67539c2
TL
385 dout(10) << __func__ << " bdev " << id << dendl;
386 ceph_assert(alloc[id]);
387 alloc[id]->release(to_release);
388 if (is_shared_alloc(id)) {
389 shared_alloc->bluefs_used -= to_release.size();
7c673cae 390 }
7c673cae
FG
391}
392
f67539c2 393uint64_t BlueFS::get_used()
7c673cae 394{
f67539c2
TL
395 std::lock_guard l(lock);
396 uint64_t used = 0;
397 for (unsigned id = 0; id < MAX_BDEV; ++id) {
398 used += _get_used(id);
7c673cae 399 }
f67539c2
TL
400 return used;
401}
7c673cae 402
f67539c2
TL
403uint64_t BlueFS::_get_used(unsigned id) const
404{
405 uint64_t used = 0;
406 if (!alloc[id])
407 return 0;
9f95a23c 408
f67539c2
TL
409 if (is_shared_alloc(id)) {
410 used = shared_alloc->bluefs_used;
411 } else {
412 used = _get_total(id) - alloc[id]->get_free();
9f95a23c 413 }
f67539c2 414 return used;
7c673cae
FG
415}
416
f67539c2 417uint64_t BlueFS::get_used(unsigned id)
7c673cae 418{
f67539c2 419 ceph_assert(id < alloc.size());
11fdf7f2 420 ceph_assert(alloc[id]);
f67539c2
TL
421 std::lock_guard l(lock);
422 return _get_used(id);
11fdf7f2
TL
423}
424
f67539c2 425uint64_t BlueFS::_get_total(unsigned id) const
11fdf7f2 426{
f67539c2
TL
427 ceph_assert(id < bdev.size());
428 ceph_assert(id < block_reserved.size());
429 return get_block_device_size(id) - block_reserved[id];
7c673cae
FG
430}
431
432uint64_t BlueFS::get_total(unsigned id)
433{
11fdf7f2 434 std::lock_guard l(lock);
f67539c2 435 return _get_total(id);
7c673cae
FG
436}
437
438uint64_t BlueFS::get_free(unsigned id)
439{
11fdf7f2
TL
440 std::lock_guard l(lock);
441 ceph_assert(id < alloc.size());
7c673cae
FG
442 return alloc[id]->get_free();
443}
444
445void BlueFS::dump_perf_counters(Formatter *f)
446{
447 f->open_object_section("bluefs_perf_counters");
448 logger->dump_formatted(f,0);
449 f->close_section();
450}
451
3efd9988
FG
452void BlueFS::dump_block_extents(ostream& out)
453{
454 for (unsigned i = 0; i < MAX_BDEV; ++i) {
455 if (!bdev[i]) {
456 continue;
457 }
f67539c2 458 auto total = get_total(i);
11fdf7f2 459 auto free = get_free(i);
1911f103 460
f67539c2
TL
461 out << i << " : device size 0x" << std::hex << total
462 << " : using 0x" << total - free
463 << std::dec << "(" << byte_u_t(total - free) << ")";
1911f103 464 out << "\n";
3efd9988
FG
465 }
466}
7c673cae 467
7c673cae
FG
468int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
469{
11fdf7f2 470 std::lock_guard l(lock);
7c673cae 471 dout(10) << __func__ << " bdev " << id << dendl;
f67539c2
TL
472 ceph_assert(id < alloc.size());
473 for (auto& p : file_map) {
474 for (auto& q : p.second->fnode.extents) {
475 if (q.bdev == id) {
476 extents->insert(q.offset, q.length);
477 }
478 }
479 }
7c673cae
FG
480 return 0;
481}
482
9f95a23c 483int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
7c673cae 484{
11fdf7f2 485 std::unique_lock l(lock);
7c673cae
FG
486 dout(1) << __func__
487 << " osd_uuid " << osd_uuid
488 << dendl;
489
9f95a23c
TL
490 // set volume selector if not provided before/outside
491 if (vselector == nullptr) {
492 vselector.reset(
493 new OriginalVolumeSelector(
494 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
495 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
496 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
497 }
498
7c673cae
FG
499 _init_alloc();
500 _init_logger();
501
502 super.version = 1;
503 super.block_size = bdev[BDEV_DB]->get_block_size();
504 super.osd_uuid = osd_uuid;
505 super.uuid.generate_random();
506 dout(1) << __func__ << " uuid " << super.uuid << dendl;
507
508 // init log
9f95a23c 509 FileRef log_file = ceph::make_ref<File>();
7c673cae 510 log_file->fnode.ino = 1;
f6b5b4d7 511 log_file->vselector_hint = vselector->get_hint_for_log();
7c673cae 512 int r = _allocate(
9f95a23c 513 vselector->select_prefer_bdev(log_file->vselector_hint),
7c673cae 514 cct->_conf->bluefs_max_log_runway,
94b18763 515 &log_file->fnode);
9f95a23c 516 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
11fdf7f2 517 ceph_assert(r == 0);
7c673cae
FG
518 log_writer = _create_writer(log_file);
519
520 // initial txn
521 log_t.op_init();
7c673cae
FG
522 _flush_and_sync_log(l);
523
524 // write supers
525 super.log_fnode = log_file->fnode;
9f95a23c 526 super.memorized_layout = layout;
11fdf7f2 527 _write_super(BDEV_DB);
7c673cae
FG
528 flush_bdev();
529
530 // clean up
531 super = bluefs_super_t();
532 _close_writer(log_writer);
533 log_writer = NULL;
9f95a23c 534 vselector.reset(nullptr);
7c673cae
FG
535 _stop_alloc();
536 _shutdown_logger();
f67539c2
TL
537 if (shared_alloc) {
538 ceph_assert(shared_alloc->need_init);
539 shared_alloc->need_init = false;
540 }
7c673cae
FG
541
542 dout(10) << __func__ << " success" << dendl;
543 return 0;
544}
545
546void BlueFS::_init_alloc()
547{
548 dout(20) << __func__ << dendl;
eafe8130
TL
549
550 if (bdev[BDEV_WAL]) {
551 alloc_size[BDEV_WAL] = cct->_conf->bluefs_alloc_size;
552 }
553 if (bdev[BDEV_SLOW]) {
554 alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
555 alloc_size[BDEV_SLOW] = cct->_conf->bluefs_shared_alloc_size;
556 } else {
557 alloc_size[BDEV_DB] = cct->_conf->bluefs_shared_alloc_size;
558 }
559 // new wal and db devices are never shared
560 if (bdev[BDEV_NEWWAL]) {
561 alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
562 }
563 if (bdev[BDEV_NEWDB]) {
564 alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
565 }
566
7c673cae
FG
567 for (unsigned id = 0; id < bdev.size(); ++id) {
568 if (!bdev[id]) {
569 continue;
570 }
11fdf7f2 571 ceph_assert(bdev[id]->get_size());
eafe8130 572 ceph_assert(alloc_size[id]);
f67539c2
TL
573 if (is_shared_alloc(id)) {
574 dout(1) << __func__ << " shared, id " << id << std::hex
575 << ", capacity 0x" << bdev[id]->get_size()
576 << ", block size 0x" << alloc_size[id]
577 << std::dec << dendl;
578 } else {
579 std::string name = "bluefs-";
580 const char* devnames[] = { "wal","db","slow" };
581 if (id <= BDEV_SLOW)
582 name += devnames[id];
583 else
584 name += to_string(uintptr_t(this));
585 dout(1) << __func__ << " new, id " << id << std::hex
586 << ", allocator name " << name
587 << ", allocator type " << cct->_conf->bluefs_allocator
588 << ", capacity 0x" << bdev[id]->get_size()
589 << ", block size 0x" << alloc_size[id]
590 << std::dec << dendl;
591 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
592 bdev[id]->get_size(),
593 alloc_size[id], name);
594 alloc[id]->init_add_free(
595 block_reserved[id],
596 _get_total(id));
7c673cae
FG
597 }
598 }
599}
600
601void BlueFS::_stop_alloc()
602{
603 dout(20) << __func__ << dendl;
11fdf7f2
TL
604 for (auto p : bdev) {
605 if (p)
606 p->discard_drain();
607 }
608
f67539c2
TL
609 for (size_t i = 0; i < alloc.size(); ++i) {
610 if (alloc[i] && !is_shared_alloc(i)) {
611 alloc[i]->shutdown();
612 delete alloc[i];
613 alloc[i] = nullptr;
7c673cae
FG
614 }
615 }
7c673cae
FG
616}
617
cd265ab1
TL
618int BlueFS::read(uint8_t ndev, uint64_t off, uint64_t len,
619 ceph::buffer::list *pbl, IOContext *ioc, bool buffered)
620{
621 dout(10) << __func__ << " dev " << int(ndev)
622 << ": 0x" << std::hex << off << "~" << len << std::dec
623 << (buffered ? " buffered" : "")
624 << dendl;
625 int r;
626 bufferlist bl;
627 r = bdev[ndev]->read(off, len, &bl, ioc, buffered);
628 if (r != 0) {
629 return r;
630 }
631 uint64_t block_size = bdev[ndev]->get_block_size();
632 if (inject_read_zeros) {
633 if (len >= block_size * 2) {
634 derr << __func__ << " injecting error, zeros at "
635 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
636 << "~" << (block_size * 2) << std::dec << dendl;
637 //use beginning, replace 8K in the middle with zeros, use tail
638 bufferlist temp;
639 bl.splice(0, len / 2 - block_size, &temp);
f67539c2 640 temp.append(buffer::create(block_size * 2, 0));
cd265ab1
TL
641 bl.splice(block_size * 2, len / 2 - block_size, &temp);
642 bl = temp;
643 inject_read_zeros--;
644 }
645 }
646 //make a check if there is a block with all 0
647 uint64_t to_check_len = len;
648 uint64_t skip = p2nphase(off, block_size);
649 if (skip >= to_check_len) {
650 return r;
651 }
652 auto it = bl.begin(skip);
653 to_check_len -= skip;
654 bool all_zeros = false;
655 while (all_zeros == false && to_check_len >= block_size) {
656 // checking 0s step
657 unsigned block_left = block_size;
658 unsigned avail;
659 const char* data;
660 all_zeros = true;
661 while (all_zeros && block_left > 0) {
662 avail = it.get_ptr_and_advance(block_left, &data);
663 block_left -= avail;
664 all_zeros = mem_is_zero(data, avail);
665 }
666 // skipping step
667 while (block_left > 0) {
668 avail = it.get_ptr_and_advance(block_left, &data);
669 block_left -= avail;
670 }
671 to_check_len -= block_size;
672 }
673 if (all_zeros) {
674 logger->inc(l_bluefs_read_zeros_candidate, 1);
675 bufferlist bl_reread;
676 r = bdev[ndev]->read(off, len, &bl_reread, ioc, buffered);
677 if (r != 0) {
678 return r;
679 }
680 // check if both read gave the same
681 if (!bl.contents_equal(bl_reread)) {
682 // report problems to log, but continue, maybe it will be good now...
683 derr << __func__ << " initial read of " << int(ndev)
684 << ": 0x" << std::hex << off << "~" << len
685 << std::dec << ": different then re-read " << dendl;
686 logger->inc(l_bluefs_read_zeros_errors, 1);
687 }
688 // use second read will be better if is different
689 pbl->append(bl_reread);
690 } else {
691 pbl->append(bl);
692 }
693 return r;
694}
695
696int BlueFS::read_random(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered)
697{
698 dout(10) << __func__ << " dev " << int(ndev)
699 << ": 0x" << std::hex << off << "~" << len << std::dec
700 << (buffered ? " buffered" : "")
701 << dendl;
702 int r;
703 r = bdev[ndev]->read_random(off, len, buf, buffered);
704 if (r != 0) {
705 return r;
706 }
707 uint64_t block_size = bdev[ndev]->get_block_size();
708 if (inject_read_zeros) {
709 if (len >= block_size * 2) {
710 derr << __func__ << " injecting error, zeros at "
711 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
712 << "~" << (block_size * 2) << std::dec << dendl;
713 //zero middle 8K
714 memset(buf + len / 2 - block_size, 0, block_size * 2);
715 inject_read_zeros--;
716 }
717 }
718 //make a check if there is a block with all 0
719 uint64_t to_check_len = len;
720 const char* data = buf;
721 uint64_t skip = p2nphase(off, block_size);
722 if (skip >= to_check_len) {
723 return r;
724 }
725 to_check_len -= skip;
726 data += skip;
727
728 bool all_zeros = false;
729 while (all_zeros == false && to_check_len >= block_size) {
730 if (mem_is_zero(data, block_size)) {
731 // at least one block is all zeros
732 all_zeros = true;
733 break;
734 }
735 data += block_size;
736 to_check_len -= block_size;
737 }
738 if (all_zeros) {
739 logger->inc(l_bluefs_read_zeros_candidate, 1);
740 std::unique_ptr<char[]> data_reread(new char[len]);
741 r = bdev[ndev]->read_random(off, len, &data_reread[0], buffered);
742 if (r != 0) {
743 return r;
744 }
745 // check if both read gave the same
746 if (memcmp(buf, &data_reread[0], len) != 0) {
747 derr << __func__ << " initial read of " << int(ndev)
748 << ": 0x" << std::hex << off << "~" << len
749 << std::dec << ": different then re-read " << dendl;
750 logger->inc(l_bluefs_read_zeros_errors, 1);
751 // second read is probably better
752 memcpy(buf, &data_reread[0], len);
753 }
754 }
755 return r;
756}
757
7c673cae
FG
758int BlueFS::mount()
759{
760 dout(1) << __func__ << dendl;
761
762 int r = _open_super();
763 if (r < 0) {
764 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
765 goto out;
766 }
767
9f95a23c
TL
768 // set volume selector if not provided before/outside
769 if (vselector == nullptr) {
770 vselector.reset(
771 new OriginalVolumeSelector(
772 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
773 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
774 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
775 }
776
7c673cae 777 _init_alloc();
494da23a 778 _init_logger();
7c673cae 779
11fdf7f2 780 r = _replay(false, false);
7c673cae
FG
781 if (r < 0) {
782 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
783 _stop_alloc();
784 goto out;
785 }
786
787 // init freelist
788 for (auto& p : file_map) {
789 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
790 for (auto& q : p.second->fnode.extents) {
f67539c2
TL
791 bool is_shared = is_shared_alloc(q.bdev);
792 ceph_assert(!is_shared || (is_shared && shared_alloc));
793 if (is_shared && shared_alloc->need_init && shared_alloc->a) {
794 shared_alloc->bluefs_used += q.length;
795 alloc[q.bdev]->init_rm_free(q.offset, q.length);
796 } else if (!is_shared) {
797 alloc[q.bdev]->init_rm_free(q.offset, q.length);
798 }
7c673cae
FG
799 }
800 }
f67539c2
TL
801 if (shared_alloc) {
802 shared_alloc->need_init = false;
803 dout(1) << __func__ << " shared_bdev_used = "
804 << shared_alloc->bluefs_used << dendl;
805 } else {
806 dout(1) << __func__ << " shared bdev not used"
807 << dendl;
808 }
7c673cae
FG
809
810 // set up the log for future writes
811 log_writer = _create_writer(_get_file(1));
11fdf7f2 812 ceph_assert(log_writer->file->fnode.ino == 1);
7c673cae
FG
813 log_writer->pos = log_writer->file->fnode.size;
814 dout(10) << __func__ << " log write pos set to 0x"
815 << std::hex << log_writer->pos << std::dec
816 << dendl;
817
7c673cae
FG
818 return 0;
819
820 out:
821 super = bluefs_super_t();
822 return r;
823}
824
9f95a23c
TL
825int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const
826{
827 if (super.memorized_layout) {
828 if (layout == *super.memorized_layout) {
829 dout(10) << __func__ << " bluefs layout verified positively" << dendl;
830 } else {
831 derr << __func__ << " memorized layout doesn't fit current one" << dendl;
832 return -EIO;
833 }
834 } else {
835 dout(10) << __func__ << " no memorized_layout in bluefs superblock"
836 << dendl;
837 }
838
839 return 0;
840}
841
1911f103 842void BlueFS::umount(bool avoid_compact)
7c673cae
FG
843{
844 dout(1) << __func__ << dendl;
845
1911f103 846 sync_metadata(avoid_compact);
7c673cae
FG
847
848 _close_writer(log_writer);
849 log_writer = NULL;
850
9f95a23c 851 vselector.reset(nullptr);
7c673cae
FG
852 _stop_alloc();
853 file_map.clear();
854 dir_map.clear();
855 super = bluefs_super_t();
856 log_t.clear();
857 _shutdown_logger();
858}
859
9f95a23c 860int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
7c673cae 861{
11fdf7f2
TL
862 dout(1) << __func__ << dendl;
863
864 if(id == BDEV_NEWDB) {
865 int new_log_dev_cur = BDEV_WAL;
866 int new_log_dev_next = BDEV_WAL;
867 if (!bdev[BDEV_WAL]) {
868 new_log_dev_cur = BDEV_NEWDB;
869 new_log_dev_next = BDEV_DB;
870 }
9f95a23c 871 _rewrite_log_and_layout_sync(false,
11fdf7f2
TL
872 BDEV_NEWDB,
873 new_log_dev_cur,
874 new_log_dev_next,
9f95a23c
TL
875 RENAME_DB2SLOW,
876 layout);
11fdf7f2
TL
877 //}
878 } else if(id == BDEV_NEWWAL) {
9f95a23c
TL
879 _rewrite_log_and_layout_sync(false,
880 BDEV_DB,
881 BDEV_NEWWAL,
882 BDEV_WAL,
883 REMOVE_WAL,
884 layout);
11fdf7f2
TL
885 } else {
886 assert(false);
887 }
888 return 0;
889}
890
891void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
892{
893 if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
7c673cae
FG
894 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
895 if (bdev[BDEV_WAL])
896 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
11fdf7f2
TL
897}
898
899void BlueFS::get_devices(set<string> *ls)
900{
901 for (unsigned i = 0; i < MAX_BDEV; ++i) {
902 if (bdev[i]) {
903 bdev[i]->get_devices(ls);
904 }
905 }
7c673cae
FG
906}
907
908int BlueFS::fsck()
909{
11fdf7f2 910 std::lock_guard l(lock);
7c673cae
FG
911 dout(1) << __func__ << dendl;
912 // hrm, i think we check everything on mount...
913 return 0;
914}
915
11fdf7f2 916int BlueFS::_write_super(int dev)
7c673cae
FG
917{
918 // build superblock
919 bufferlist bl;
11fdf7f2 920 encode(super, bl);
7c673cae 921 uint32_t crc = bl.crc32c(-1);
11fdf7f2 922 encode(crc, bl);
7c673cae
FG
923 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
924 dout(10) << __func__ << " superblock " << super.version << dendl;
925 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
9f95a23c 926 ceph_assert_always(bl.length() <= get_super_length());
7c673cae
FG
927 bl.append_zero(get_super_length() - bl.length());
928
11fdf7f2 929 bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
7c673cae
FG
930 dout(20) << __func__ << " v " << super.version
931 << " crc 0x" << std::hex << crc
932 << " offset 0x" << get_super_offset() << std::dec
933 << dendl;
934 return 0;
935}
936
937int BlueFS::_open_super()
938{
939 dout(10) << __func__ << dendl;
940
941 bufferlist bl;
942 uint32_t expected_crc, crc;
943 int r;
944
945 // always the second block
946 r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
947 &bl, ioc[BDEV_DB], false);
948 if (r < 0)
949 return r;
950
11fdf7f2
TL
951 auto p = bl.cbegin();
952 decode(super, p);
7c673cae
FG
953 {
954 bufferlist t;
955 t.substr_of(bl, 0, p.get_off());
956 crc = t.crc32c(-1);
957 }
11fdf7f2 958 decode(expected_crc, p);
7c673cae
FG
959 if (crc != expected_crc) {
960 derr << __func__ << " bad crc on superblock, expected 0x"
961 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
962 << dendl;
963 return -EIO;
964 }
965 dout(10) << __func__ << " superblock " << super.version << dendl;
966 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
967 return 0;
968}
969
9f95a23c
TL
970int BlueFS::_check_new_allocations(const bluefs_fnode_t& fnode,
971 size_t dev_count,
9f95a23c
TL
972 boost::dynamic_bitset<uint64_t>* used_blocks)
973{
974 auto& fnode_extents = fnode.extents;
975 for (auto e : fnode_extents) {
976 auto id = e.bdev;
977 bool fail = false;
978 ceph_assert(id < dev_count);
9f95a23c
TL
979
980 apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
981 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
982 if (bs.test(pos)) {
983 fail = true;
984 }
985 bs.set(pos);
986 }
987 );
988 if (fail) {
989 derr << __func__ << " invalid extent " << int(e.bdev)
990 << ": 0x" << std::hex << e.offset << "~" << e.length
991 << std::dec << ": duplicate reference, ino " << fnode.ino
992 << dendl;
993 return -EFAULT;
994 }
995 }
996 return 0;
997}
998
9f95a23c
TL
999int BlueFS::_verify_alloc_granularity(
1000 __u8 id, uint64_t offset, uint64_t length, const char *op)
1001{
1002 if ((offset & (alloc_size[id] - 1)) ||
1003 (length & (alloc_size[id] - 1))) {
1004 derr << __func__ << " " << op << " of " << (int)id
1005 << ":0x" << std::hex << offset << "~" << length << std::dec
1006 << " does not align to alloc_size 0x"
1007 << std::hex << alloc_size[id] << std::dec << dendl;
1008 // be helpful
1009 auto need = alloc_size[id];
1010 while (need && ((offset & (need - 1)) ||
1011 (length & (need - 1)))) {
1012 need >>= 1;
1013 }
1014 if (need) {
1015 const char *which;
1016 if (id == BDEV_SLOW ||
1017 (id == BDEV_DB && !bdev[BDEV_SLOW])) {
1018 which = "bluefs_shared_alloc_size";
1019 } else {
1020 which = "bluefs_alloc_size";
1021 }
1022 derr << "work-around by setting " << which << " = " << need
1023 << " for this OSD" << dendl;
1024 }
1025 return -EFAULT;
1026 }
1027 return 0;
1028}
1029
11fdf7f2 1030int BlueFS::_replay(bool noop, bool to_stdout)
7c673cae
FG
1031{
1032 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
1033 ino_last = 1; // by the log
1034 log_seq = 0;
1035
1036 FileRef log_file;
11fdf7f2 1037 log_file = _get_file(1);
9f95a23c 1038
f67539c2 1039 log_file->fnode = super.log_fnode;
11fdf7f2 1040 if (!noop) {
9f95a23c 1041 log_file->vselector_hint =
f6b5b4d7 1042 vselector->get_hint_for_log();
7c673cae 1043 } else {
11fdf7f2
TL
1044 // do not use fnode from superblock in 'noop' mode - log_file's one should
1045 // be fine and up-to-date
1046 ceph_assert(log_file->fnode.ino == 1);
1047 ceph_assert(log_file->fnode.extents.size() != 0);
7c673cae 1048 }
7c673cae 1049 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
11fdf7f2
TL
1050 if (unlikely(to_stdout)) {
1051 std::cout << " log_fnode " << super.log_fnode << std::endl;
1052 }
7c673cae
FG
1053
1054 FileReader *log_reader = new FileReader(
1055 log_file, cct->_conf->bluefs_max_prefetch,
1056 false, // !random
1057 true); // ignore eof
9f95a23c
TL
1058
1059 bool seen_recs = false;
1060
1061 boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
9f95a23c 1062
f67539c2
TL
1063 if (!noop) {
1064 if (cct->_conf->bluefs_log_replay_check_allocations) {
1065 for (size_t i = 0; i < MAX_BDEV; ++i) {
1066 if (alloc_size[i] != 0 && bdev[i] != nullptr) {
1067 used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
1068 }
9f95a23c
TL
1069 }
1070 }
1071 }
1072
1073 bool first_log_check = true;
1074
7c673cae 1075 while (true) {
11fdf7f2 1076 ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
7c673cae
FG
1077 uint64_t pos = log_reader->buf.pos;
1078 uint64_t read_pos = pos;
1079 bufferlist bl;
1080 {
f67539c2 1081 int r = _read(log_reader, read_pos, super.block_size,
7c673cae 1082 &bl, NULL);
f6b5b4d7
TL
1083 if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) {
1084 r += do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl);
1085 }
1086 assert(r == (int)super.block_size);
7c673cae
FG
1087 read_pos += r;
1088 }
1089 uint64_t more = 0;
1090 uint64_t seq;
1091 uuid_d uuid;
1092 {
11fdf7f2 1093 auto p = bl.cbegin();
7c673cae
FG
1094 __u8 a, b;
1095 uint32_t len;
11fdf7f2
TL
1096 decode(a, p);
1097 decode(b, p);
1098 decode(len, p);
1099 decode(uuid, p);
1100 decode(seq, p);
7c673cae 1101 if (len + 6 > bl.length()) {
11fdf7f2 1102 more = round_up_to(len + 6 - bl.length(), super.block_size);
7c673cae
FG
1103 }
1104 }
1105 if (uuid != super.uuid) {
9f95a23c
TL
1106 if (seen_recs) {
1107 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1108 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1109 << dendl;
1110 } else {
1111 derr << __func__ << " 0x" << std::hex << pos << std::dec
1112 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1113 << ", block dump: \n";
1114 bufferlist t;
1115 t.substr_of(bl, 0, super.block_size);
1116 t.hexdump(*_dout);
1117 *_dout << dendl;
1118 }
7c673cae
FG
1119 break;
1120 }
1121 if (seq != log_seq + 1) {
9f95a23c
TL
1122 if (seen_recs) {
1123 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1124 << ": stop: seq " << seq << " != expected " << log_seq + 1
1125 << dendl;;
1126 } else {
1127 derr << __func__ << " 0x" << std::hex << pos << std::dec
1128 << ": stop: seq " << seq << " != expected " << log_seq + 1
1129 << dendl;;
1130 }
7c673cae
FG
1131 break;
1132 }
1133 if (more) {
1134 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
1135 << " more bytes" << dendl;
1136 bufferlist t;
f67539c2 1137 int r = _read(log_reader, read_pos, more, &t, NULL);
7c673cae 1138 if (r < (int)more) {
f6b5b4d7
TL
1139 dout(10) << __func__ << " 0x" << std::hex << pos
1140 << ": stop: len is 0x" << bl.length() + more << std::dec
1141 << ", which is past eof" << dendl;
1142 if (cct->_conf->bluefs_replay_recovery) {
1143 //try to search for more data
1144 r += do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t);
1145 if (r < (int)more) {
1146 //in normal mode we must read r==more, for recovery it is too strict
1147 break;
1148 }
1149 }
7c673cae 1150 }
11fdf7f2 1151 ceph_assert(r == (int)more);
7c673cae
FG
1152 bl.claim_append(t);
1153 read_pos += r;
1154 }
1155 bluefs_transaction_t t;
1156 try {
11fdf7f2
TL
1157 auto p = bl.cbegin();
1158 decode(t, p);
522d829b 1159 seen_recs = true;
7c673cae 1160 }
f67539c2 1161 catch (ceph::buffer::error& e) {
522d829b
TL
1162 // Multi-block transactions might be incomplete due to unexpected
1163 // power off. Hence let's treat that as a regular stop condition.
1164 if (seen_recs && more) {
1165 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1166 << ": stop: failed to decode: " << e.what()
1167 << dendl;
1168 } else {
1169 derr << __func__ << " 0x" << std::hex << pos << std::dec
1170 << ": stop: failed to decode: " << e.what()
1171 << dendl;
1172 delete log_reader;
1173 return -EIO;
1174 }
1175 break;
7c673cae 1176 }
11fdf7f2 1177 ceph_assert(seq == t.seq);
7c673cae
FG
1178 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1179 << ": " << t << dendl;
11fdf7f2
TL
1180 if (unlikely(to_stdout)) {
1181 std::cout << " 0x" << std::hex << pos << std::dec
1182 << ": " << t << std::endl;
1183 }
7c673cae 1184
11fdf7f2 1185 auto p = t.op_bl.cbegin();
7c673cae
FG
1186 while (!p.end()) {
1187 __u8 op;
11fdf7f2 1188 decode(op, p);
7c673cae
FG
1189 switch (op) {
1190
1191 case bluefs_transaction_t::OP_INIT:
1192 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1193 << ": op_init" << dendl;
11fdf7f2
TL
1194 if (unlikely(to_stdout)) {
1195 std::cout << " 0x" << std::hex << pos << std::dec
1196 << ": op_init" << std::endl;
1197 }
1198
1199 ceph_assert(t.seq == 1);
7c673cae
FG
1200 break;
1201
1202 case bluefs_transaction_t::OP_JUMP:
1203 {
1204 uint64_t next_seq;
1205 uint64_t offset;
11fdf7f2
TL
1206 decode(next_seq, p);
1207 decode(offset, p);
7c673cae
FG
1208 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1209 << ": op_jump seq " << next_seq
1210 << " offset 0x" << std::hex << offset << std::dec << dendl;
11fdf7f2
TL
1211 if (unlikely(to_stdout)) {
1212 std::cout << " 0x" << std::hex << pos << std::dec
1213 << ": op_jump seq " << next_seq
1214 << " offset 0x" << std::hex << offset << std::dec
1215 << std::endl;
1216 }
1217
1218 ceph_assert(next_seq >= log_seq);
7c673cae
FG
1219 log_seq = next_seq - 1; // we will increment it below
1220 uint64_t skip = offset - read_pos;
1221 if (skip) {
1222 bufferlist junk;
f67539c2 1223 int r = _read(log_reader, read_pos, skip, &junk,
7c673cae
FG
1224 NULL);
1225 if (r != (int)skip) {
1226 dout(10) << __func__ << " 0x" << std::hex << read_pos
1227 << ": stop: failed to skip to " << offset
1228 << std::dec << dendl;
11fdf7f2 1229 ceph_abort_msg("problem with op_jump");
7c673cae
FG
1230 }
1231 }
1232 }
1233 break;
1234
1235 case bluefs_transaction_t::OP_JUMP_SEQ:
1236 {
1237 uint64_t next_seq;
11fdf7f2 1238 decode(next_seq, p);
7c673cae
FG
1239 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1240 << ": op_jump_seq " << next_seq << dendl;
11fdf7f2
TL
1241 if (unlikely(to_stdout)) {
1242 std::cout << " 0x" << std::hex << pos << std::dec
1243 << ": op_jump_seq " << next_seq << std::endl;
1244 }
1245
1246 ceph_assert(next_seq >= log_seq);
7c673cae
FG
1247 log_seq = next_seq - 1; // we will increment it below
1248 }
1249 break;
1250
1251 case bluefs_transaction_t::OP_ALLOC_ADD:
f67539c2 1252 // LEGACY, do nothing but read params
7c673cae 1253 {
f67539c2
TL
1254 __u8 id;
1255 uint64_t offset, length;
1256 decode(id, p);
1257 decode(offset, p);
1258 decode(length, p);
1259 }
7c673cae
FG
1260 break;
1261
1262 case bluefs_transaction_t::OP_ALLOC_RM:
f67539c2 1263 // LEGACY, do nothing but read params
7c673cae 1264 {
f67539c2
TL
1265 __u8 id;
1266 uint64_t offset, length;
1267 decode(id, p);
1268 decode(offset, p);
1269 decode(length, p);
1270 }
1271 break;
7c673cae
FG
1272
1273 case bluefs_transaction_t::OP_DIR_LINK:
1274 {
1275 string dirname, filename;
1276 uint64_t ino;
11fdf7f2
TL
1277 decode(dirname, p);
1278 decode(filename, p);
1279 decode(ino, p);
7c673cae
FG
1280 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1281 << ": op_dir_link " << " " << dirname << "/" << filename
1282 << " to " << ino
1283 << dendl;
11fdf7f2
TL
1284 if (unlikely(to_stdout)) {
1285 std::cout << " 0x" << std::hex << pos << std::dec
1286 << ": op_dir_link " << " " << dirname << "/" << filename
1287 << " to " << ino
1288 << std::endl;
1289 }
1290
7c673cae
FG
1291 if (!noop) {
1292 FileRef file = _get_file(ino);
11fdf7f2 1293 ceph_assert(file->fnode.ino);
7c673cae 1294 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 1295 ceph_assert(q != dir_map.end());
7c673cae 1296 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2 1297 ceph_assert(r == q->second->file_map.end());
9f95a23c
TL
1298
1299 vselector->sub_usage(file->vselector_hint, file->fnode);
1300 file->vselector_hint =
1301 vselector->get_hint_by_dir(dirname);
1302 vselector->add_usage(file->vselector_hint, file->fnode);
1303
7c673cae
FG
1304 q->second->file_map[filename] = file;
1305 ++file->refs;
1306 }
1307 }
1308 break;
1309
1310 case bluefs_transaction_t::OP_DIR_UNLINK:
1311 {
1312 string dirname, filename;
11fdf7f2
TL
1313 decode(dirname, p);
1314 decode(filename, p);
7c673cae
FG
1315 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1316 << ": op_dir_unlink " << " " << dirname << "/" << filename
1317 << dendl;
11fdf7f2
TL
1318 if (unlikely(to_stdout)) {
1319 std::cout << " 0x" << std::hex << pos << std::dec
1320 << ": op_dir_unlink " << " " << dirname << "/" << filename
1321 << std::endl;
1322 }
1323
7c673cae
FG
1324 if (!noop) {
1325 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 1326 ceph_assert(q != dir_map.end());
7c673cae 1327 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2
TL
1328 ceph_assert(r != q->second->file_map.end());
1329 ceph_assert(r->second->refs > 0);
7c673cae
FG
1330 --r->second->refs;
1331 q->second->file_map.erase(r);
1332 }
1333 }
1334 break;
1335
1336 case bluefs_transaction_t::OP_DIR_CREATE:
1337 {
1338 string dirname;
11fdf7f2 1339 decode(dirname, p);
7c673cae
FG
1340 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1341 << ": op_dir_create " << dirname << dendl;
11fdf7f2
TL
1342 if (unlikely(to_stdout)) {
1343 std::cout << " 0x" << std::hex << pos << std::dec
1344 << ": op_dir_create " << dirname << std::endl;
1345 }
1346
7c673cae
FG
1347 if (!noop) {
1348 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 1349 ceph_assert(q == dir_map.end());
9f95a23c 1350 dir_map[dirname] = ceph::make_ref<Dir>();
7c673cae
FG
1351 }
1352 }
1353 break;
1354
1355 case bluefs_transaction_t::OP_DIR_REMOVE:
1356 {
1357 string dirname;
11fdf7f2 1358 decode(dirname, p);
7c673cae
FG
1359 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1360 << ": op_dir_remove " << dirname << dendl;
11fdf7f2
TL
1361 if (unlikely(to_stdout)) {
1362 std::cout << " 0x" << std::hex << pos << std::dec
1363 << ": op_dir_remove " << dirname << std::endl;
1364 }
1365
7c673cae
FG
1366 if (!noop) {
1367 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2
TL
1368 ceph_assert(q != dir_map.end());
1369 ceph_assert(q->second->file_map.empty());
7c673cae
FG
1370 dir_map.erase(q);
1371 }
1372 }
1373 break;
1374
1375 case bluefs_transaction_t::OP_FILE_UPDATE:
1376 {
1377 bluefs_fnode_t fnode;
11fdf7f2 1378 decode(fnode, p);
7c673cae 1379 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
9f95a23c 1380 << ": op_file_update " << " " << fnode << " " << dendl;
11fdf7f2
TL
1381 if (unlikely(to_stdout)) {
1382 std::cout << " 0x" << std::hex << pos << std::dec
1383 << ": op_file_update " << " " << fnode << std::endl;
1384 }
9f95a23c 1385 if (!noop) {
7c673cae 1386 FileRef f = _get_file(fnode.ino);
9f95a23c
TL
1387 if (cct->_conf->bluefs_log_replay_check_allocations) {
1388 // check initial log layout
1389 if (first_log_check) {
1390 first_log_check = false;
1391 int r = _check_new_allocations(log_file->fnode,
f67539c2 1392 MAX_BDEV, used_blocks);
9f95a23c
TL
1393 if (r < 0) {
1394 return r;
1395 }
1396 }
1397
1398 auto& fnode_extents = f->fnode.extents;
1399 for (auto e : fnode_extents) {
1400 auto id = e.bdev;
1401 if (int r = _verify_alloc_granularity(id, e.offset, e.length,
1402 "OP_FILE_UPDATE"); r < 0) {
1403 return r;
1404 }
1405 apply_for_bitset_range(e.offset, e.length, alloc_size[id],
1406 used_blocks[id],
1407 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1408 ceph_assert(bs.test(pos));
1409 bs.reset(pos);
1410 }
1411 );
1412 }
1413 }
1414
1415 if (fnode.ino != 1) {
1416 vselector->sub_usage(f->vselector_hint, f->fnode);
1417 }
1418 f->fnode = fnode;
1419 if (fnode.ino != 1) {
1420 vselector->add_usage(f->vselector_hint, f->fnode);
1421 }
1422
7c673cae
FG
1423 if (fnode.ino > ino_last) {
1424 ino_last = fnode.ino;
1425 }
9f95a23c
TL
1426 if (cct->_conf->bluefs_log_replay_check_allocations) {
1427 int r = _check_new_allocations(f->fnode,
f67539c2 1428 MAX_BDEV, used_blocks);
9f95a23c
TL
1429 if (r < 0) {
1430 return r;
1431 }
1432 }
522d829b
TL
1433 } else if (noop && fnode.ino == 1) {
1434 FileRef f = _get_file(fnode.ino);
1435 f->fnode = fnode;
7c673cae 1436 }
9f95a23c 1437 }
7c673cae
FG
1438 break;
1439
1440 case bluefs_transaction_t::OP_FILE_REMOVE:
1441 {
1442 uint64_t ino;
11fdf7f2 1443 decode(ino, p);
7c673cae
FG
1444 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1445 << ": op_file_remove " << ino << dendl;
11fdf7f2
TL
1446 if (unlikely(to_stdout)) {
1447 std::cout << " 0x" << std::hex << pos << std::dec
1448 << ": op_file_remove " << ino << std::endl;
1449 }
1450
9f95a23c
TL
1451 if (!noop) {
1452 auto p = file_map.find(ino);
1453 ceph_assert(p != file_map.end());
1454 vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
1455 if (cct->_conf->bluefs_log_replay_check_allocations) {
1456 auto& fnode_extents = p->second->fnode.extents;
1457 for (auto e : fnode_extents) {
1458 auto id = e.bdev;
1459 bool fail = false;
9f95a23c
TL
1460
1461 apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
1462 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1463 if (!bs.test(pos)) {
1464 fail = true;
1465 }
1466 bs.reset(pos);
1467 }
1468 );
1469 if (fail) {
1470 derr << __func__ << " invalid extent " << int(id)
1471 << ": 0x" << std::hex << e.offset << "~" << e.length
1472 << std::dec
1473 << ": not in use but is allocated for removed ino " << ino
1474 << dendl;
1475 return -EFAULT;
1476 }
1477 }
1478 }
1479 file_map.erase(p);
1480 }
1481 }
7c673cae
FG
1482 break;
1483
1484 default:
1485 derr << __func__ << " 0x" << std::hex << pos << std::dec
1486 << ": stop: unrecognized op " << (int)op << dendl;
1487 delete log_reader;
1488 return -EIO;
1489 }
1490 }
11fdf7f2 1491 ceph_assert(p.end());
7c673cae
FG
1492
1493 // we successfully replayed the transaction; bump the seq and log size
1494 ++log_seq;
1495 log_file->fnode.size = log_reader->buf.pos;
1496 }
f67539c2
TL
1497 if (!noop) {
1498 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
1499 }
9f95a23c
TL
1500 if (!noop && first_log_check &&
1501 cct->_conf->bluefs_log_replay_check_allocations) {
1502 int r = _check_new_allocations(log_file->fnode,
f67539c2 1503 MAX_BDEV, used_blocks);
9f95a23c
TL
1504 if (r < 0) {
1505 return r;
1506 }
1507 }
7c673cae
FG
1508
1509 dout(10) << __func__ << " log file size was 0x"
1510 << std::hex << log_file->fnode.size << std::dec << dendl;
11fdf7f2
TL
1511 if (unlikely(to_stdout)) {
1512 std::cout << " log file size was 0x"
1513 << std::hex << log_file->fnode.size << std::dec << std::endl;
1514 }
1515
7c673cae
FG
1516 delete log_reader;
1517
1518 if (!noop) {
1519 // verify file link counts are all >0
1520 for (auto& p : file_map) {
1521 if (p.second->refs == 0 &&
1522 p.second->fnode.ino > 1) {
1523 derr << __func__ << " file with link count 0: " << p.second->fnode
1524 << dendl;
1525 return -EIO;
1526 }
1527 }
1528 }
1529
1530 dout(10) << __func__ << " done" << dendl;
1531 return 0;
1532}
1533
11fdf7f2
TL
1534int BlueFS::log_dump()
1535{
1536 // only dump log file's content
f67539c2
TL
1537 ceph_assert(log_writer == nullptr && "cannot log_dump on mounted BlueFS");
1538 int r = _open_super();
11fdf7f2 1539 if (r < 0) {
f67539c2 1540 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
11fdf7f2
TL
1541 return r;
1542 }
f67539c2
TL
1543 _init_logger();
1544 r = _replay(true, true);
1545 if (r < 0) {
1546 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
1547 }
1548 _shutdown_logger();
1549 super = bluefs_super_t();
1550 return r;
11fdf7f2
TL
1551}
1552
1553int BlueFS::device_migrate_to_existing(
1554 CephContext *cct,
1555 const set<int>& devs_source,
9f95a23c
TL
1556 int dev_target,
1557 const bluefs_layout_t& layout)
11fdf7f2
TL
1558{
1559 vector<byte> buf;
1560 bool buffered = cct->_conf->bluefs_buffered_io;
1561
eafe8130
TL
1562 dout(10) << __func__ << " devs_source " << devs_source
1563 << " dev_target " << dev_target << dendl;
11fdf7f2
TL
1564 assert(dev_target < (int)MAX_BDEV);
1565
1566 int flags = 0;
1567 flags |= devs_source.count(BDEV_DB) ?
1568 (REMOVE_DB | RENAME_SLOW2DB) : 0;
1569 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1570 int dev_target_new = dev_target;
1571
1572 // Slow device without separate DB one is addressed via BDEV_DB
1573 // Hence need renaming.
1574 if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
1575 dev_target_new = BDEV_DB;
1576 dout(0) << __func__ << " super to be written to " << dev_target << dendl;
1577 }
1578
9f95a23c 1579 for (auto& [ino, file_ref] : file_map) {
11fdf7f2 1580 //do not copy log
9f95a23c 1581 if (file_ref->fnode.ino == 1) {
11fdf7f2
TL
1582 continue;
1583 }
9f95a23c 1584 dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
eafe8130 1585
9f95a23c 1586 auto& fnode_extents = file_ref->fnode.extents;
11fdf7f2 1587
9f95a23c
TL
1588 bool rewrite = std::any_of(
1589 fnode_extents.begin(),
1590 fnode_extents.end(),
1591 [=](auto& ext) {
1592 return ext.bdev != dev_target && devs_source.count(ext.bdev);
1593 });
eafe8130
TL
1594 if (rewrite) {
1595 dout(10) << __func__ << " migrating" << dendl;
1596
1597 // read entire file
1598 bufferlist bl;
1599 for (auto old_ext : fnode_extents) {
1600 buf.resize(old_ext.length);
1601 int r = bdev[old_ext.bdev]->read_random(
1602 old_ext.offset,
1603 old_ext.length,
1604 (char*)&buf.at(0),
1605 buffered);
1606 if (r != 0) {
1607 derr << __func__ << " failed to read 0x" << std::hex
1608 << old_ext.offset << "~" << old_ext.length << std::dec
1609 << " from " << (int)dev_target << dendl;
1610 return -EIO;
1611 }
1612 bl.append((char*)&buf[0], old_ext.length);
1613 }
11fdf7f2 1614
eafe8130
TL
1615 // write entire file
1616 PExtentVector extents;
1617 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1618 if (l < 0) {
1619 derr << __func__ << " unable to allocate len 0x" << std::hex
1620 << bl.length() << std::dec << " from " << (int)dev_target
1621 << ": " << cpp_strerror(l) << dendl;
1622 return -ENOSPC;
1623 }
11fdf7f2 1624
eafe8130
TL
1625 uint64_t off = 0;
1626 for (auto& i : extents) {
1627 bufferlist cur;
1628 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1629 ceph_assert(cur_len > 0);
1630 cur.substr_of(bl, off, cur_len);
1631 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1632 ceph_assert(r == 0);
1633 off += cur_len;
1634 }
1635
1636 // release old extents
1637 for (auto old_ext : fnode_extents) {
1638 PExtentVector to_release;
1639 to_release.emplace_back(old_ext.offset, old_ext.length);
1640 alloc[old_ext.bdev]->release(to_release);
f67539c2
TL
1641 if (is_shared_alloc(old_ext.bdev)) {
1642 shared_alloc->bluefs_used -= to_release.size();
1643 }
eafe8130
TL
1644 }
1645
1646 // update fnode
1647 fnode_extents.clear();
1648 for (auto& i : extents) {
1649 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
1650 }
1651 } else {
9f95a23c
TL
1652 for (auto& ext : fnode_extents) {
1653 if (dev_target != dev_target_new && ext.bdev == dev_target) {
eafe8130 1654 dout(20) << __func__ << " " << " ... adjusting extent 0x"
9f95a23c 1655 << std::hex << ext.offset << std::dec
eafe8130
TL
1656 << " bdev " << dev_target << " -> " << dev_target_new
1657 << dendl;
9f95a23c 1658 ext.bdev = dev_target_new;
11fdf7f2 1659 }
11fdf7f2
TL
1660 }
1661 }
11fdf7f2
TL
1662 }
1663 // new logging device in the current naming scheme
1664 int new_log_dev_cur = bdev[BDEV_WAL] ?
1665 BDEV_WAL :
1666 bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
1667
1668 // new logging device in new naming scheme
1669 int new_log_dev_next = new_log_dev_cur;
1670
1671 if (devs_source.count(new_log_dev_cur)) {
1672 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1673 new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
1674 BDEV_DB :
1675 BDEV_WAL;
1676
1677 dout(0) << __func__ << " log moved from " << new_log_dev_cur
1678 << " to " << new_log_dev_next << dendl;
1679
1680 new_log_dev_cur =
1681 (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
1682 BDEV_SLOW :
1683 new_log_dev_next;
1684 }
1685
9f95a23c 1686 _rewrite_log_and_layout_sync(
11fdf7f2
TL
1687 false,
1688 (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
1689 new_log_dev_cur,
1690 new_log_dev_next,
9f95a23c
TL
1691 flags,
1692 layout);
11fdf7f2
TL
1693 return 0;
1694}
1695
1696int BlueFS::device_migrate_to_new(
1697 CephContext *cct,
1698 const set<int>& devs_source,
9f95a23c
TL
1699 int dev_target,
1700 const bluefs_layout_t& layout)
11fdf7f2
TL
1701{
1702 vector<byte> buf;
1703 bool buffered = cct->_conf->bluefs_buffered_io;
1704
eafe8130
TL
1705 dout(10) << __func__ << " devs_source " << devs_source
1706 << " dev_target " << dev_target << dendl;
11fdf7f2
TL
1707 assert(dev_target == (int)BDEV_NEWDB || (int)BDEV_NEWWAL);
1708
1709 int flags = 0;
1710
1711 flags |= devs_source.count(BDEV_DB) ?
1712 (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
1713 0;
1714 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
9f95a23c 1715 int dev_target_new = dev_target; //FIXME: remove, makes no sense
11fdf7f2
TL
1716
1717 for (auto& p : file_map) {
1718 //do not copy log
1719 if (p.second->fnode.ino == 1) {
1720 continue;
1721 }
eafe8130
TL
1722 dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
1723
11fdf7f2
TL
1724 auto& fnode_extents = p.second->fnode.extents;
1725
eafe8130 1726 bool rewrite = false;
11fdf7f2 1727 for (auto ext_it = fnode_extents.begin();
eafe8130
TL
1728 ext_it != p.second->fnode.extents.end();
1729 ++ext_it) {
11fdf7f2 1730 if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
eafe8130
TL
1731 rewrite = true;
1732 break;
1733 }
1734 }
1735 if (rewrite) {
1736 dout(10) << __func__ << " migrating" << dendl;
1737
1738 // read entire file
1739 bufferlist bl;
1740 for (auto old_ext : fnode_extents) {
1741 buf.resize(old_ext.length);
1742 int r = bdev[old_ext.bdev]->read_random(
1743 old_ext.offset,
1744 old_ext.length,
1745 (char*)&buf.at(0),
1746 buffered);
1747 if (r != 0) {
1748 derr << __func__ << " failed to read 0x" << std::hex
1749 << old_ext.offset << "~" << old_ext.length << std::dec
1750 << " from " << (int)dev_target << dendl;
1751 return -EIO;
11fdf7f2 1752 }
eafe8130
TL
1753 bl.append((char*)&buf[0], old_ext.length);
1754 }
1755
1756 // write entire file
1757 PExtentVector extents;
1758 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1759 if (l < 0) {
1760 derr << __func__ << " unable to allocate len 0x" << std::hex
1761 << bl.length() << std::dec << " from " << (int)dev_target
1762 << ": " << cpp_strerror(l) << dendl;
1763 return -ENOSPC;
1764 }
1765
1766 uint64_t off = 0;
1767 for (auto& i : extents) {
1768 bufferlist cur;
1769 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1770 ceph_assert(cur_len > 0);
1771 cur.substr_of(bl, off, cur_len);
1772 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1773 ceph_assert(r == 0);
1774 off += cur_len;
1775 }
1776
1777 // release old extents
1778 for (auto old_ext : fnode_extents) {
1779 PExtentVector to_release;
1780 to_release.emplace_back(old_ext.offset, old_ext.length);
1781 alloc[old_ext.bdev]->release(to_release);
f67539c2
TL
1782 if (is_shared_alloc(old_ext.bdev)) {
1783 shared_alloc->bluefs_used -= to_release.size();
1784 }
eafe8130
TL
1785 }
1786
1787 // update fnode
1788 fnode_extents.clear();
1789 for (auto& i : extents) {
1790 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
11fdf7f2
TL
1791 }
1792 }
11fdf7f2
TL
1793 }
1794 // new logging device in the current naming scheme
1795 int new_log_dev_cur =
1796 bdev[BDEV_NEWWAL] ?
1797 BDEV_NEWWAL :
1798 bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
1799 BDEV_WAL :
1800 bdev[BDEV_NEWDB] ?
1801 BDEV_NEWDB :
1802 bdev[BDEV_DB] && !(flags & REMOVE_DB)?
1803 BDEV_DB :
1804 BDEV_SLOW;
1805
1806 // new logging device in new naming scheme
1807 int new_log_dev_next =
1808 new_log_dev_cur == BDEV_NEWWAL ?
1809 BDEV_WAL :
1810 new_log_dev_cur == BDEV_NEWDB ?
1811 BDEV_DB :
1812 new_log_dev_cur;
1813
1814 int super_dev =
1815 dev_target == BDEV_NEWDB ?
1816 BDEV_NEWDB :
1817 bdev[BDEV_DB] ?
1818 BDEV_DB :
1819 BDEV_SLOW;
1820
9f95a23c 1821 _rewrite_log_and_layout_sync(
11fdf7f2
TL
1822 false,
1823 super_dev,
1824 new_log_dev_cur,
1825 new_log_dev_next,
9f95a23c
TL
1826 flags,
1827 layout);
11fdf7f2
TL
1828 return 0;
1829}
1830
7c673cae
FG
1831BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
1832{
1833 auto p = file_map.find(ino);
1834 if (p == file_map.end()) {
9f95a23c 1835 FileRef f = ceph::make_ref<File>();
7c673cae
FG
1836 file_map[ino] = f;
1837 dout(30) << __func__ << " ino " << ino << " = " << f
1838 << " (new)" << dendl;
1839 return f;
1840 } else {
1841 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
1842 return p->second;
1843 }
1844}
1845
1846void BlueFS::_drop_link(FileRef file)
1847{
1848 dout(20) << __func__ << " had refs " << file->refs
1849 << " on " << file->fnode << dendl;
11fdf7f2 1850 ceph_assert(file->refs > 0);
7c673cae
FG
1851 --file->refs;
1852 if (file->refs == 0) {
1853 dout(20) << __func__ << " destroying " << file->fnode << dendl;
11fdf7f2 1854 ceph_assert(file->num_reading.load() == 0);
9f95a23c 1855 vselector->sub_usage(file->vselector_hint, file->fnode);
7c673cae
FG
1856 log_t.op_file_remove(file->fnode.ino);
1857 for (auto& r : file->fnode.extents) {
1858 pending_release[r.bdev].insert(r.offset, r.length);
1859 }
1860 file_map.erase(file->fnode.ino);
1861 file->deleted = true;
94b18763 1862
7c673cae 1863 if (file->dirty_seq) {
11fdf7f2
TL
1864 ceph_assert(file->dirty_seq > log_seq_stable);
1865 ceph_assert(dirty_files.count(file->dirty_seq));
7c673cae
FG
1866 auto it = dirty_files[file->dirty_seq].iterator_to(*file);
1867 dirty_files[file->dirty_seq].erase(it);
1868 file->dirty_seq = 0;
1869 }
1870 }
1871}
1872
adb31ebb 1873int64_t BlueFS::_read_random(
7c673cae
FG
1874 FileReader *h, ///< [in] read from here
1875 uint64_t off, ///< [in] offset
9f95a23c 1876 uint64_t len, ///< [in] this many bytes
f67539c2 1877 char *out) ///< [out] copy it here
7c673cae 1878{
494da23a
TL
1879 auto* buf = &h->buf;
1880
adb31ebb 1881 int64_t ret = 0;
7c673cae
FG
1882 dout(10) << __func__ << " h " << h
1883 << " 0x" << std::hex << off << "~" << len << std::dec
1884 << " from " << h->file->fnode << dendl;
1885
1886 ++h->file->num_reading;
1887
1888 if (!h->ignore_eof &&
1889 off + len > h->file->fnode.size) {
1890 if (off > h->file->fnode.size)
1891 len = 0;
1892 else
1893 len = h->file->fnode.size - off;
1894 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
1895 << std::hex << len << std::dec << dendl;
1896 }
494da23a
TL
1897 logger->inc(l_bluefs_read_random_count, 1);
1898 logger->inc(l_bluefs_read_random_bytes, len);
7c673cae 1899
494da23a 1900 std::shared_lock s_lock(h->lock);
f91f0fd5 1901 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
7c673cae 1902 while (len > 0) {
494da23a
TL
1903 if (off < buf->bl_off || off >= buf->get_buf_end()) {
1904 s_lock.unlock();
1905 uint64_t x_off = 0;
1906 auto p = h->file->fnode.seek(off, &x_off);
f6b5b4d7 1907 ceph_assert(p != h->file->fnode.extents.end());
9f95a23c 1908 uint64_t l = std::min(p->length - x_off, len);
adb31ebb
TL
1909 //hard cap to 1GB
1910 l = std::min(l, uint64_t(1) << 30);
494da23a
TL
1911 dout(20) << __func__ << " read random 0x"
1912 << std::hex << x_off << "~" << l << std::dec
1913 << " of " << *p << dendl;
cd265ab1
TL
1914 int r;
1915 if (!cct->_conf->bluefs_check_for_zeros) {
1916 r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
1917 cct->_conf->bluefs_buffered_io);
1918 } else {
1919 r = read_random(p->bdev, p->offset + x_off, l, out,
1920 cct->_conf->bluefs_buffered_io);
1921 }
494da23a
TL
1922 ceph_assert(r == 0);
1923 off += l;
1924 len -= l;
1925 ret += l;
1926 out += l;
1927
1928 logger->inc(l_bluefs_read_random_disk_count, 1);
1929 logger->inc(l_bluefs_read_random_disk_bytes, l);
1930 if (len > 0) {
1931 s_lock.lock();
1932 }
1933 } else {
1934 auto left = buf->get_buf_remaining(off);
adb31ebb 1935 int64_t r = std::min(len, left);
494da23a
TL
1936 logger->inc(l_bluefs_read_random_buffer_count, 1);
1937 logger->inc(l_bluefs_read_random_buffer_bytes, r);
1938 dout(20) << __func__ << " left 0x" << std::hex << left
1939 << " 0x" << off << "~" << len << std::dec
1940 << dendl;
1941
f67539c2
TL
1942 auto p = buf->bl.begin();
1943 p.seek(off - buf->bl_off);
1944 p.copy(r, out);
1945 out += r;
7c673cae 1946
494da23a
TL
1947 dout(30) << __func__ << " result chunk (0x"
1948 << std::hex << r << std::dec << " bytes):\n";
1949 bufferlist t;
1950 t.substr_of(buf->bl, off - buf->bl_off, r);
1951 t.hexdump(*_dout);
1952 *_dout << dendl;
1953
1954 off += r;
1955 len -= r;
1956 ret += r;
1957 buf->pos += r;
1958 }
1959 }
7c673cae
FG
1960 dout(20) << __func__ << " got " << ret << dendl;
1961 --h->file->num_reading;
1962 return ret;
1963}
1964
adb31ebb 1965int64_t BlueFS::_read(
7c673cae 1966 FileReader *h, ///< [in] read from here
7c673cae
FG
1967 uint64_t off, ///< [in] offset
1968 size_t len, ///< [in] this many bytes
1969 bufferlist *outbl, ///< [out] optional: reference the result here
1970 char *out) ///< [out] optional: or copy it here
1971{
f67539c2
TL
1972 FileReaderBuffer *buf = &(h->buf);
1973
494da23a 1974 bool prefetch = !outbl && !out;
7c673cae
FG
1975 dout(10) << __func__ << " h " << h
1976 << " 0x" << std::hex << off << "~" << len << std::dec
494da23a
TL
1977 << " from " << h->file->fnode
1978 << (prefetch ? " prefetch" : "")
1979 << dendl;
7c673cae
FG
1980
1981 ++h->file->num_reading;
1982
1983 if (!h->ignore_eof &&
1984 off + len > h->file->fnode.size) {
1985 if (off > h->file->fnode.size)
1986 len = 0;
1987 else
1988 len = h->file->fnode.size - off;
1989 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
1990 << std::hex << len << std::dec << dendl;
1991 }
494da23a
TL
1992 logger->inc(l_bluefs_read_count, 1);
1993 logger->inc(l_bluefs_read_bytes, len);
1994 if (prefetch) {
1995 logger->inc(l_bluefs_read_prefetch_count, 1);
1996 logger->inc(l_bluefs_read_prefetch_bytes, len);
1997 }
1998
7c673cae
FG
1999 if (outbl)
2000 outbl->clear();
2001
adb31ebb 2002 int64_t ret = 0;
494da23a 2003 std::shared_lock s_lock(h->lock);
7c673cae
FG
2004 while (len > 0) {
2005 size_t left;
2006 if (off < buf->bl_off || off >= buf->get_buf_end()) {
494da23a
TL
2007 s_lock.unlock();
2008 std::unique_lock u_lock(h->lock);
f91f0fd5 2009 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
494da23a
TL
2010 if (off < buf->bl_off || off >= buf->get_buf_end()) {
2011 // if precondition hasn't changed during locking upgrade.
2012 buf->bl.clear();
2013 buf->bl_off = off & super.block_mask();
2014 uint64_t x_off = 0;
2015 auto p = h->file->fnode.seek(buf->bl_off, &x_off);
f6b5b4d7
TL
2016 if (p == h->file->fnode.extents.end()) {
2017 dout(5) << __func__ << " reading less then required "
2018 << ret << "<" << ret + len << dendl;
2019 break;
2020 }
2021
494da23a
TL
2022 uint64_t want = round_up_to(len + (off & ~super.block_mask()),
2023 super.block_size);
2024 want = std::max(want, buf->max_prefetch);
2025 uint64_t l = std::min(p->length - x_off, want);
adb31ebb
TL
2026 //hard cap to 1GB
2027 l = std::min(l, uint64_t(1) << 30);
494da23a
TL
2028 uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
2029 if (!h->ignore_eof &&
2030 buf->bl_off + l > eof_offset) {
2031 l = eof_offset - buf->bl_off;
2032 }
2033 dout(20) << __func__ << " fetching 0x"
2034 << std::hex << x_off << "~" << l << std::dec
2035 << " of " << *p << dendl;
cd265ab1
TL
2036 int r;
2037 if (!cct->_conf->bluefs_check_for_zeros) {
2038 r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
2039 cct->_conf->bluefs_buffered_io);
2040 } else {
2041 r = read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
2042 cct->_conf->bluefs_buffered_io);
2043 }
494da23a 2044 ceph_assert(r == 0);
7c673cae 2045 }
494da23a
TL
2046 u_lock.unlock();
2047 s_lock.lock();
2048 // we should recheck if buffer is valid after lock downgrade
2049 continue;
7c673cae
FG
2050 }
2051 left = buf->get_buf_remaining(off);
2052 dout(20) << __func__ << " left 0x" << std::hex << left
2053 << " len 0x" << len << std::dec << dendl;
2054
adb31ebb 2055 int64_t r = std::min(len, left);
7c673cae
FG
2056 if (outbl) {
2057 bufferlist t;
2058 t.substr_of(buf->bl, off - buf->bl_off, r);
2059 outbl->claim_append(t);
2060 }
2061 if (out) {
f67539c2
TL
2062 auto p = buf->bl.begin();
2063 p.seek(off - buf->bl_off);
2064 p.copy(r, out);
7c673cae
FG
2065 out += r;
2066 }
2067
2068 dout(30) << __func__ << " result chunk (0x"
2069 << std::hex << r << std::dec << " bytes):\n";
2070 bufferlist t;
2071 t.substr_of(buf->bl, off - buf->bl_off, r);
2072 t.hexdump(*_dout);
2073 *_dout << dendl;
2074
2075 off += r;
2076 len -= r;
2077 ret += r;
2078 buf->pos += r;
2079 }
f67539c2 2080
7c673cae 2081 dout(20) << __func__ << " got " << ret << dendl;
11fdf7f2 2082 ceph_assert(!outbl || (int)outbl->length() == ret);
7c673cae
FG
2083 --h->file->num_reading;
2084 return ret;
2085}
2086
2087void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
2088{
2089 dout(10) << __func__ << " file " << f->fnode
2090 << " 0x" << std::hex << offset << "~" << length << std::dec
2091 << dendl;
2092 if (offset & ~super.block_mask()) {
2093 offset &= super.block_mask();
11fdf7f2 2094 length = round_up_to(length, super.block_size);
7c673cae
FG
2095 }
2096 uint64_t x_off = 0;
2097 auto p = f->fnode.seek(offset, &x_off);
2098 while (length > 0 && p != f->fnode.extents.end()) {
11fdf7f2 2099 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
2100 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
2101 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
2102 << std:: dec << " of " << *p << dendl;
2103 offset += x_len;
2104 length -= x_len;
2105 }
2106}
2107
2108uint64_t BlueFS::_estimate_log_size()
2109{
2110 int avg_dir_size = 40; // fixme
2111 int avg_file_size = 12;
2112 uint64_t size = 4096 * 2;
2113 size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
7c673cae
FG
2114 size += dir_map.size() + (1 + avg_dir_size);
2115 size += file_map.size() * (1 + avg_dir_size + avg_file_size);
11fdf7f2 2116 return round_up_to(size, super.block_size);
7c673cae
FG
2117}
2118
2119void BlueFS::compact_log()
2120{
f6b5b4d7
TL
2121 std::unique_lock<ceph::mutex> l(lock);
2122 if (!cct->_conf->bluefs_replay_recovery_disable_compact) {
2123 if (cct->_conf->bluefs_compact_log_sync) {
2124 _compact_log_sync();
2125 } else {
2126 _compact_log_async(l);
2127 }
7c673cae
FG
2128 }
2129}
2130
2131bool BlueFS::_should_compact_log()
2132{
2133 uint64_t current = log_writer->file->fnode.size;
2134 uint64_t expected = _estimate_log_size();
2135 float ratio = (float)current / (float)expected;
2136 dout(10) << __func__ << " current 0x" << std::hex << current
2137 << " expected " << expected << std::dec
2138 << " ratio " << ratio
2139 << (new_log ? " (async compaction in progress)" : "")
2140 << dendl;
2141 if (new_log ||
2142 current < cct->_conf->bluefs_log_compact_min_size ||
2143 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
2144 return false;
2145 }
2146 return true;
2147}
2148
11fdf7f2
TL
2149void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t,
2150 int flags)
7c673cae
FG
2151{
2152 t->seq = 1;
2153 t->uuid = super.uuid;
2154 dout(20) << __func__ << " op_init" << dendl;
2155
2156 t->op_init();
9f95a23c
TL
2157 for (auto& [ino, file_ref] : file_map) {
2158 if (ino == 1)
7c673cae 2159 continue;
9f95a23c 2160 ceph_assert(ino > 1);
11fdf7f2 2161
9f95a23c 2162 for(auto& e : file_ref->fnode.extents) {
11fdf7f2
TL
2163 auto bdev = e.bdev;
2164 auto bdev_new = bdev;
2165 ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
2166 if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
2167 bdev_new = BDEV_DB;
2168 }
2169 if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
2170 bdev_new = BDEV_SLOW;
2171 }
2172 if (bdev == BDEV_NEWDB) {
2173 // REMOVE_DB xor RENAME_DB
2174 ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
2175 ceph_assert(!(flags & RENAME_SLOW2DB));
2176 bdev_new = BDEV_DB;
2177 }
2178 if (bdev == BDEV_NEWWAL) {
2179 ceph_assert(flags & REMOVE_WAL);
2180 bdev_new = BDEV_WAL;
2181 }
2182 e.bdev = bdev_new;
2183 }
9f95a23c
TL
2184 dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
2185 t->op_file_update(file_ref->fnode);
7c673cae 2186 }
9f95a23c
TL
2187 for (auto& [path, dir_ref] : dir_map) {
2188 dout(20) << __func__ << " op_dir_create " << path << dendl;
2189 t->op_dir_create(path);
2190 for (auto& [fname, file_ref] : dir_ref->file_map) {
2191 dout(20) << __func__ << " op_dir_link " << path << "/" << fname
2192 << " to " << file_ref->fnode.ino << dendl;
2193 t->op_dir_link(path, fname, file_ref->fnode.ino);
7c673cae
FG
2194 }
2195 }
2196}
2197
2198void BlueFS::_compact_log_sync()
2199{
2200 dout(10) << __func__ << dendl;
9f95a23c
TL
2201 auto prefer_bdev =
2202 vselector->select_prefer_bdev(log_writer->file->vselector_hint);
2203 _rewrite_log_and_layout_sync(true,
11fdf7f2 2204 BDEV_DB,
9f95a23c
TL
2205 prefer_bdev,
2206 prefer_bdev,
2207 0,
2208 super.memorized_layout);
11fdf7f2
TL
2209 logger->inc(l_bluefs_log_compactions);
2210}
2211
9f95a23c
TL
2212void BlueFS::_rewrite_log_and_layout_sync(bool allocate_with_fallback,
2213 int super_dev,
2214 int log_dev,
2215 int log_dev_new,
2216 int flags,
2217 std::optional<bluefs_layout_t> layout)
11fdf7f2 2218{
7c673cae
FG
2219 File *log_file = log_writer->file.get();
2220
2221 // clear out log (be careful who calls us!!!)
2222 log_t.clear();
2223
11fdf7f2
TL
2224 dout(20) << __func__ << " super_dev:" << super_dev
2225 << " log_dev:" << log_dev
2226 << " log_dev_new:" << log_dev_new
2227 << " flags:" << flags
2228 << dendl;
7c673cae 2229 bluefs_transaction_t t;
11fdf7f2 2230 _compact_log_dump_metadata(&t, flags);
7c673cae
FG
2231
2232 dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
2233 t.op_jump_seq(log_seq);
2234
2235 bufferlist bl;
11fdf7f2 2236 encode(t, bl);
7c673cae
FG
2237 _pad_bl(bl);
2238
2239 uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
2240 dout(20) << __func__ << " need " << need << dendl;
2241
494da23a 2242 bluefs_fnode_t old_fnode;
11fdf7f2 2243 int r;
494da23a 2244 log_file->fnode.swap_extents(old_fnode);
11fdf7f2
TL
2245 if (allocate_with_fallback) {
2246 r = _allocate(log_dev, need, &log_file->fnode);
2247 ceph_assert(r == 0);
2248 } else {
2249 PExtentVector extents;
2250 r = _allocate_without_fallback(log_dev,
2251 need,
2252 &extents);
2253 ceph_assert(r == 0);
2254 for (auto& p : extents) {
2255 log_file->fnode.append_extent(
2256 bluefs_extent_t(log_dev, p.offset, p.length));
2257 }
7c673cae
FG
2258 }
2259
2260 _close_writer(log_writer);
2261
2262 log_file->fnode.size = bl.length();
9f95a23c
TL
2263 vselector->sub_usage(log_file->vselector_hint, old_fnode);
2264 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2265
7c673cae
FG
2266 log_writer = _create_writer(log_file);
2267 log_writer->append(bl);
11fdf7f2
TL
2268 r = _flush(log_writer, true);
2269 ceph_assert(r == 0);
2270#ifdef HAVE_LIBAIO
2271 if (!cct->_conf->bluefs_sync_write) {
2272 list<aio_t> completed_ios;
2273 _claim_completed_aios(log_writer, &completed_ios);
2274 wait_for_aio(log_writer);
2275 completed_ios.clear();
2276 }
2277#endif
224ce89b 2278 flush_bdev();
224ce89b 2279
9f95a23c 2280 super.memorized_layout = layout;
7c673cae 2281 super.log_fnode = log_file->fnode;
11fdf7f2
TL
2282 // rename device if needed
2283 if (log_dev != log_dev_new) {
2284 dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
2285 for (auto& p : super.log_fnode.extents) {
2286 p.bdev = log_dev_new;
2287 }
2288 }
2289 dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;
2290
7c673cae 2291 ++super.version;
11fdf7f2 2292 _write_super(super_dev);
7c673cae
FG
2293 flush_bdev();
2294
494da23a
TL
2295 dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
2296 for (auto& r : old_fnode.extents) {
7c673cae
FG
2297 pending_release[r.bdev].insert(r.offset, r.length);
2298 }
7c673cae
FG
2299}
2300
2301/*
2302 * 1. Allocate a new extent to continue the log, and then log an event
2303 * that jumps the log write position to the new extent. At this point, the
2304 * old extent(s) won't be written to, and reflect everything to compact.
2305 * New events will be written to the new region that we'll keep.
2306 *
2307 * 2. While still holding the lock, encode a bufferlist that dumps all of the
2308 * in-memory fnodes and names. This will become the new beginning of the
2309 * log. The last event will jump to the log continuation extent from #1.
2310 *
2311 * 3. Queue a write to a new extent for the new beginnging of the log.
2312 *
2313 * 4. Drop lock and wait
2314 *
2315 * 5. Retake the lock.
2316 *
2317 * 6. Update the log_fnode to splice in the new beginning.
2318 *
2319 * 7. Write the new superblock.
2320 *
2321 * 8. Release the old log space. Clean up.
2322 */
11fdf7f2 2323void BlueFS::_compact_log_async(std::unique_lock<ceph::mutex>& l)
7c673cae
FG
2324{
2325 dout(10) << __func__ << dendl;
2326 File *log_file = log_writer->file.get();
11fdf7f2
TL
2327 ceph_assert(!new_log);
2328 ceph_assert(!new_log_writer);
7c673cae 2329
181888fb
FG
2330 // create a new log [writer] so that we know compaction is in progress
2331 // (see _should_compact_log)
9f95a23c 2332 new_log = ceph::make_ref<File>();
181888fb
FG
2333 new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
2334
3efd9988
FG
2335 // 0. wait for any racing flushes to complete. (We do not want to block
2336 // in _flush_sync_log with jump_to set or else a racing thread might flush
2337 // our entries and our jump_to update won't be correct.)
2338 while (log_flushing) {
2339 dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
2340 log_cond.wait(l);
2341 }
2342
9f95a23c
TL
2343 vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
2344
7c673cae
FG
2345 // 1. allocate new log space and jump to it.
2346 old_log_jump_to = log_file->fnode.get_allocated();
7c673cae 2347 dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
11fdf7f2 2348 << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
9f95a23c
TL
2349 int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
2350 cct->_conf->bluefs_max_log_runway,
2351 &log_file->fnode);
11fdf7f2 2352 ceph_assert(r == 0);
9f95a23c
TL
2353 //adjust usage as flush below will need it
2354 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
7c673cae
FG
2355 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2356
2357 // update the log file change and log a jump to the offset where we want to
2358 // write the new entries
2359 log_t.op_file_update(log_file->fnode);
2360 log_t.op_jump(log_seq, old_log_jump_to);
2361
2362 flush_bdev(); // FIXME?
2363
2364 _flush_and_sync_log(l, 0, old_log_jump_to);
2365
2366 // 2. prepare compacted log
2367 bluefs_transaction_t t;
224ce89b
WB
2368 //avoid record two times in log_t and _compact_log_dump_metadata.
2369 log_t.clear();
11fdf7f2 2370 _compact_log_dump_metadata(&t, 0);
7c673cae 2371
eafe8130
TL
2372 uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
2373 std::max(alloc_size[BDEV_DB],
2374 alloc_size[BDEV_SLOW]));
2375
7c673cae 2376 // conservative estimate for final encoded size
11fdf7f2 2377 new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
eafe8130 2378 max_alloc_size);
7c673cae
FG
2379 t.op_jump(log_seq, new_log_jump_to);
2380
11fdf7f2 2381 // allocate
9f95a23c 2382 //FIXME: check if we want DB here?
11fdf7f2
TL
2383 r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
2384 &new_log->fnode);
2385 ceph_assert(r == 0);
2386
2387 // we might have some more ops in log_t due to _allocate call
2388 t.claim_ops(log_t);
2389
7c673cae 2390 bufferlist bl;
11fdf7f2 2391 encode(t, bl);
7c673cae
FG
2392 _pad_bl(bl);
2393
2394 dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
2395 << std::dec << dendl;
2396
7c673cae
FG
2397 new_log_writer = _create_writer(new_log);
2398 new_log_writer->append(bl);
2399
2400 // 3. flush
2401 r = _flush(new_log_writer, true);
11fdf7f2 2402 ceph_assert(r == 0);
7c673cae
FG
2403
2404 // 4. wait
11fdf7f2 2405 _flush_bdev_safely(new_log_writer);
7c673cae 2406
11fdf7f2 2407 // 5. update our log fnode
7c673cae 2408 // discard first old_log_jump_to extents
9f95a23c 2409
7c673cae
FG
2410 dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
2411 << " of " << log_file->fnode.extents << dendl;
2412 uint64_t discarded = 0;
2413 mempool::bluefs::vector<bluefs_extent_t> old_extents;
2414 while (discarded < old_log_jump_to) {
11fdf7f2 2415 ceph_assert(!log_file->fnode.extents.empty());
7c673cae
FG
2416 bluefs_extent_t& e = log_file->fnode.extents.front();
2417 bluefs_extent_t temp = e;
2418 if (discarded + e.length <= old_log_jump_to) {
2419 dout(10) << __func__ << " remove old log extent " << e << dendl;
2420 discarded += e.length;
94b18763 2421 log_file->fnode.pop_front_extent();
7c673cae
FG
2422 } else {
2423 dout(10) << __func__ << " remove front of old log extent " << e << dendl;
2424 uint64_t drop = old_log_jump_to - discarded;
2425 temp.length = drop;
2426 e.offset += drop;
2427 e.length -= drop;
2428 discarded += drop;
2429 dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
2430 }
2431 old_extents.push_back(temp);
2432 }
94b18763
FG
2433 auto from = log_file->fnode.extents.begin();
2434 auto to = log_file->fnode.extents.end();
2435 while (from != to) {
2436 new_log->fnode.append_extent(*from);
2437 ++from;
2438 }
7c673cae 2439
9f95a23c
TL
2440 vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
2441
7c673cae 2442 // clear the extents from old log file, they are added to new log
94b18763 2443 log_file->fnode.clear_extents();
7c673cae 2444 // swap the log files. New log file is the log file now.
94b18763
FG
2445 new_log->fnode.swap_extents(log_file->fnode);
2446
7c673cae
FG
2447 log_writer->pos = log_writer->file->fnode.size =
2448 log_writer->pos - old_log_jump_to + new_log_jump_to;
2449
9f95a23c
TL
2450 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2451
11fdf7f2 2452 // 6. write the super block to reflect the changes
7c673cae
FG
2453 dout(10) << __func__ << " writing super" << dendl;
2454 super.log_fnode = log_file->fnode;
2455 ++super.version;
11fdf7f2 2456 _write_super(BDEV_DB);
7c673cae
FG
2457
2458 lock.unlock();
2459 flush_bdev();
2460 lock.lock();
2461
11fdf7f2 2462 // 7. release old space
7c673cae
FG
2463 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
2464 for (auto& r : old_extents) {
2465 pending_release[r.bdev].insert(r.offset, r.length);
2466 }
2467
2468 // delete the new log, remove from the dirty files list
2469 _close_writer(new_log_writer);
2470 if (new_log->dirty_seq) {
11fdf7f2 2471 ceph_assert(dirty_files.count(new_log->dirty_seq));
7c673cae
FG
2472 auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
2473 dirty_files[new_log->dirty_seq].erase(it);
2474 }
2475 new_log_writer = nullptr;
2476 new_log = nullptr;
2477 log_cond.notify_all();
2478
2479 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2480 logger->inc(l_bluefs_log_compactions);
2481}
2482
2483void BlueFS::_pad_bl(bufferlist& bl)
2484{
2485 uint64_t partial = bl.length() % super.block_size;
2486 if (partial) {
2487 dout(10) << __func__ << " padding with 0x" << std::hex
2488 << super.block_size - partial << " zeros" << std::dec << dendl;
2489 bl.append_zero(super.block_size - partial);
2490 }
2491}
2492
7c673cae 2493
11fdf7f2 2494int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
7c673cae
FG
2495 uint64_t want_seq,
2496 uint64_t jump_to)
2497{
2498 while (log_flushing) {
2499 dout(10) << __func__ << " want_seq " << want_seq
2500 << " log is currently flushing, waiting" << dendl;
11fdf7f2 2501 ceph_assert(!jump_to);
7c673cae
FG
2502 log_cond.wait(l);
2503 }
2504 if (want_seq && want_seq <= log_seq_stable) {
2505 dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
2506 << log_seq_stable << ", done" << dendl;
11fdf7f2 2507 ceph_assert(!jump_to);
7c673cae
FG
2508 return 0;
2509 }
2510 if (log_t.empty() && dirty_files.empty()) {
2511 dout(10) << __func__ << " want_seq " << want_seq
2512 << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
11fdf7f2 2513 ceph_assert(!jump_to);
7c673cae
FG
2514 return 0;
2515 }
2516
a8e16298
TL
2517 vector<interval_set<uint64_t>> to_release(pending_release.size());
2518 to_release.swap(pending_release);
2519
7c673cae 2520 uint64_t seq = log_t.seq = ++log_seq;
11fdf7f2 2521 ceph_assert(want_seq == 0 || want_seq <= seq);
7c673cae
FG
2522 log_t.uuid = super.uuid;
2523
2524 // log dirty files
2525 auto lsi = dirty_files.find(seq);
2526 if (lsi != dirty_files.end()) {
2527 dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
2528 for (auto &f : lsi->second) {
2529 dout(20) << __func__ << " op_file_update " << f.fnode << dendl;
2530 log_t.op_file_update(f.fnode);
2531 }
2532 }
2533
2534 dout(10) << __func__ << " " << log_t << dendl;
11fdf7f2 2535 ceph_assert(!log_t.empty());
7c673cae
FG
2536
2537 // allocate some more space (before we run out)?
f67539c2 2538 // BTW: this triggers `flush()` in the `page_aligned_appender` of `log_writer`.
7c673cae
FG
2539 int64_t runway = log_writer->file->fnode.get_allocated() -
2540 log_writer->get_effective_write_pos();
f6b5b4d7 2541 bool just_expanded_log = false;
7c673cae
FG
2542 if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
2543 dout(10) << __func__ << " allocating more log runway (0x"
2544 << std::hex << runway << std::dec << " remaining)" << dendl;
2545 while (new_log_writer) {
2546 dout(10) << __func__ << " waiting for async compaction" << dendl;
2547 log_cond.wait(l);
2548 }
9f95a23c
TL
2549 vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
2550 int r = _allocate(
2551 vselector->select_prefer_bdev(log_writer->file->vselector_hint),
2552 cct->_conf->bluefs_max_log_runway,
2553 &log_writer->file->fnode);
11fdf7f2 2554 ceph_assert(r == 0);
9f95a23c 2555 vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
7c673cae 2556 log_t.op_file_update(log_writer->file->fnode);
f6b5b4d7 2557 just_expanded_log = true;
7c673cae
FG
2558 }
2559
2560 bufferlist bl;
11fdf7f2
TL
2561 bl.reserve(super.block_size);
2562 encode(log_t, bl);
7c673cae 2563 // pad to block boundary
11fdf7f2
TL
2564 size_t realign = super.block_size - (bl.length() % super.block_size);
2565 if (realign && realign != super.block_size)
2566 bl.append_zero(realign);
2567
7c673cae
FG
2568 logger->inc(l_bluefs_logged_bytes, bl.length());
2569
f6b5b4d7
TL
2570 if (just_expanded_log) {
2571 ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss
2572 }
2573
7c673cae
FG
2574 log_writer->append(bl);
2575
2576 log_t.clear();
2577 log_t.seq = 0; // just so debug output is less confusing
2578 log_flushing = true;
2579
2580 int r = _flush(log_writer, true);
11fdf7f2 2581 ceph_assert(r == 0);
7c673cae
FG
2582
2583 if (jump_to) {
2584 dout(10) << __func__ << " jumping log offset from 0x" << std::hex
2585 << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
2586 log_writer->pos = jump_to;
9f95a23c 2587 vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
7c673cae 2588 log_writer->file->fnode.size = jump_to;
9f95a23c 2589 vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
7c673cae
FG
2590 }
2591
2592 _flush_bdev_safely(log_writer);
2593
2594 log_flushing = false;
2595 log_cond.notify_all();
2596
2597 // clean dirty files
2598 if (seq > log_seq_stable) {
2599 log_seq_stable = seq;
2600 dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;
2601
2602 auto p = dirty_files.begin();
2603 while (p != dirty_files.end()) {
2604 if (p->first > log_seq_stable) {
2605 dout(20) << __func__ << " done cleaning up dirty files" << dendl;
2606 break;
2607 }
2608
2609 auto l = p->second.begin();
2610 while (l != p->second.end()) {
2611 File *file = &*l;
11fdf7f2
TL
2612 ceph_assert(file->dirty_seq > 0);
2613 ceph_assert(file->dirty_seq <= log_seq_stable);
7c673cae
FG
2614 dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
2615 file->dirty_seq = 0;
2616 p->second.erase(l++);
2617 }
2618
11fdf7f2 2619 ceph_assert(p->second.empty());
7c673cae
FG
2620 dirty_files.erase(p++);
2621 }
2622 } else {
2623 dout(20) << __func__ << " log_seq_stable " << log_seq_stable
2624 << " already >= out seq " << seq
2625 << ", we lost a race against another log flush, done" << dendl;
2626 }
a8e16298
TL
2627
2628 for (unsigned i = 0; i < to_release.size(); ++i) {
2629 if (!to_release[i].empty()) {
2630 /* OK, now we have the guarantee alloc[i] won't be null. */
11fdf7f2
TL
2631 int r = 0;
2632 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
2633 r = bdev[i]->queue_discard(to_release[i]);
2634 if (r == 0)
2635 continue;
2636 } else if (cct->_conf->bdev_enable_discard) {
2637 for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
2638 bdev[i]->discard(p.get_start(), p.get_len());
2639 }
2640 }
a8e16298 2641 alloc[i]->release(to_release[i]);
f67539c2
TL
2642 if (is_shared_alloc(i)) {
2643 shared_alloc->bluefs_used -= to_release[i].size();
2644 }
a8e16298
TL
2645 }
2646 }
2647
7c673cae
FG
2648 _update_logger_stats();
2649
2650 return 0;
2651}
2652
f67539c2
TL
2653ceph::bufferlist BlueFS::FileWriter::flush_buffer(
2654 CephContext* const cct,
2655 const bool partial,
2656 const unsigned length,
2657 const bluefs_super_t& super)
2658{
2659 ceph::bufferlist bl;
2660 if (partial) {
2661 tail_block.splice(0, tail_block.length(), &bl);
2662 }
2663 const auto remaining_len = length - bl.length();
2664 buffer.splice(0, remaining_len, &bl);
2665 if (buffer.length()) {
2666 dout(20) << " leaving 0x" << std::hex << buffer.length() << std::dec
2667 << " unflushed" << dendl;
2668 }
2669 if (const unsigned tail = bl.length() & ~super.block_mask(); tail) {
2670 const auto padding_len = super.block_size - tail;
2671 dout(20) << __func__ << " caching tail of 0x"
2672 << std::hex << tail
2673 << " and padding block with 0x" << padding_len
2674 << " buffer.length() " << buffer.length()
2675 << std::dec << dendl;
2676 // We need to go through the `buffer_appender` to get a chance to
2677 // preserve in-memory contiguity and not mess with the alignment.
2678 // Otherwise a costly rebuild could happen in e.g. `KernelDevice`.
2679 buffer_appender.append_zero(padding_len);
2680 buffer.splice(buffer.length() - padding_len, padding_len, &bl);
2681 // Deep copy the tail here. This allows to avoid costlier copy on
2682 // bufferlist rebuild in e.g. `KernelDevice` and minimizes number
2683 // of memory allocations.
2684 // The alternative approach would be to place the entire tail and
2685 // padding on a dedicated, 4 KB long memory chunk. This shouldn't
2686 // trigger the rebuild while still being less expensive.
2687 buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail);
2688 buffer.splice(buffer.length() - tail, tail, &tail_block);
2689 } else {
2690 tail_block.clear();
2691 }
2692 return bl;
2693}
2694
522d829b
TL
2695int BlueFS::_signal_dirty_to_log(FileWriter *h)
2696{
2697 h->file->fnode.mtime = ceph_clock_now();
2698 ceph_assert(h->file->fnode.ino >= 1);
2699 if (h->file->dirty_seq == 0) {
2700 h->file->dirty_seq = log_seq + 1;
2701 dirty_files[h->file->dirty_seq].push_back(*h->file);
2702 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2703 << " (was clean)" << dendl;
2704 } else {
2705 if (h->file->dirty_seq != log_seq + 1) {
2706 // need re-dirty, erase from list first
2707 ceph_assert(dirty_files.count(h->file->dirty_seq));
2708 auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
2709 dirty_files[h->file->dirty_seq].erase(it);
2710 h->file->dirty_seq = log_seq + 1;
2711 dirty_files[h->file->dirty_seq].push_back(*h->file);
2712 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2713 << " (was " << h->file->dirty_seq << ")" << dendl;
2714 } else {
2715 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2716 << " (unchanged, do nothing) " << dendl;
2717 }
2718 }
2719 return 0;
2720}
2721
7c673cae
FG
2722int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
2723{
2724 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
2725 << " 0x" << offset << "~" << length << std::dec
2726 << " to " << h->file->fnode << dendl;
f67539c2
TL
2727 if (h->file->deleted) {
2728 dout(10) << __func__ << " deleted, no-op" << dendl;
2729 return 0;
2730 }
7c673cae 2731
f67539c2 2732 ceph_assert(h->file->num_readers.load() == 0);
7c673cae
FG
2733
2734 bool buffered;
2735 if (h->file->fnode.ino == 1)
2736 buffered = false;
2737 else
2738 buffered = cct->_conf->bluefs_buffered_io;
2739
2740 if (offset + length <= h->pos)
2741 return 0;
2742 if (offset < h->pos) {
2743 length -= h->pos - offset;
2744 offset = h->pos;
2745 dout(10) << " still need 0x"
2746 << std::hex << offset << "~" << length << std::dec
2747 << dendl;
2748 }
11fdf7f2 2749 ceph_assert(offset <= h->file->fnode.size);
7c673cae
FG
2750
2751 uint64_t allocated = h->file->fnode.get_allocated();
9f95a23c 2752 vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
7c673cae
FG
2753 // do not bother to dirty the file if we are overwriting
2754 // previously allocated extents.
522d829b 2755
7c673cae
FG
2756 if (allocated < offset + length) {
2757 // we should never run out of log space here; see the min runway check
2758 // in _flush_and_sync_log.
11fdf7f2 2759 ceph_assert(h->file->fnode.ino != 1);
9f95a23c 2760 int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
7c673cae 2761 offset + length - allocated,
94b18763 2762 &h->file->fnode);
7c673cae
FG
2763 if (r < 0) {
2764 derr << __func__ << " allocated: 0x" << std::hex << allocated
2765 << " offset: 0x" << offset << " length: 0x" << length << std::dec
2766 << dendl;
9f95a23c 2767 vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
11fdf7f2 2768 ceph_abort_msg("bluefs enospc");
7c673cae
FG
2769 return r;
2770 }
522d829b 2771 h->file->is_dirty = true;
7c673cae
FG
2772 }
2773 if (h->file->fnode.size < offset + length) {
2774 h->file->fnode.size = offset + length;
2775 if (h->file->fnode.ino > 1) {
2776 // we do not need to dirty the log file (or it's compacting
2777 // replacement) when the file size changes because replay is
2778 // smart enough to discover it on its own.
522d829b 2779 h->file->is_dirty = true;
7c673cae
FG
2780 }
2781 }
522d829b 2782 dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl;
7c673cae
FG
2783
2784 uint64_t x_off = 0;
2785 auto p = h->file->fnode.seek(offset, &x_off);
11fdf7f2 2786 ceph_assert(p != h->file->fnode.extents.end());
7c673cae
FG
2787 dout(20) << __func__ << " in " << *p << " x_off 0x"
2788 << std::hex << x_off << std::dec << dendl;
2789
2790 unsigned partial = x_off & ~super.block_mask();
7c673cae
FG
2791 if (partial) {
2792 dout(20) << __func__ << " using partial tail 0x"
2793 << std::hex << partial << std::dec << dendl;
7c673cae
FG
2794 x_off -= partial;
2795 offset -= partial;
2796 length += partial;
2797 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
2798 for (auto p : h->iocv) {
2799 if (p) {
2800 p->aio_wait();
2801 }
2802 }
2803 }
7c673cae 2804
f67539c2
TL
2805 auto bl = h->flush_buffer(cct, partial, length, super);
2806 ceph_assert(bl.length() >= length);
9f95a23c 2807 h->pos = offset + length;
f67539c2 2808 length = bl.length();
9f95a23c 2809
7c673cae
FG
2810 switch (h->writer_type) {
2811 case WRITER_WAL:
2812 logger->inc(l_bluefs_bytes_written_wal, length);
2813 break;
2814 case WRITER_SST:
2815 logger->inc(l_bluefs_bytes_written_sst, length);
2816 break;
2817 }
2818
2819 dout(30) << "dump:\n";
2820 bl.hexdump(*_dout);
2821 *_dout << dendl;
2822
7c673cae 2823 uint64_t bloff = 0;
11fdf7f2 2824 uint64_t bytes_written_slow = 0;
7c673cae 2825 while (length > 0) {
11fdf7f2 2826 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
2827 bufferlist t;
2828 t.substr_of(bl, bloff, x_len);
7c673cae 2829 if (cct->_conf->bluefs_sync_write) {
11fdf7f2 2830 bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
7c673cae 2831 } else {
11fdf7f2
TL
2832 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
2833 }
2834 h->dirty_devs[p->bdev] = true;
2835 if (p->bdev == BDEV_SLOW) {
2836 bytes_written_slow += t.length();
7c673cae 2837 }
11fdf7f2 2838
7c673cae
FG
2839 bloff += x_len;
2840 length -= x_len;
2841 ++p;
2842 x_off = 0;
2843 }
f67539c2
TL
2844 if (bytes_written_slow) {
2845 logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
2846 }
7c673cae
FG
2847 for (unsigned i = 0; i < MAX_BDEV; ++i) {
2848 if (bdev[i]) {
11fdf7f2 2849 if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
7c673cae
FG
2850 bdev[i]->aio_submit(h->iocv[i]);
2851 }
2852 }
2853 }
9f95a23c 2854 vselector->add_usage(h->file->vselector_hint, h->file->fnode);
7c673cae
FG
2855 dout(20) << __func__ << " h " << h << " pos now 0x"
2856 << std::hex << h->pos << std::dec << dendl;
2857 return 0;
2858}
2859
11fdf7f2 2860#ifdef HAVE_LIBAIO
7c673cae
FG
2861// we need to retire old completed aios so they don't stick around in
2862// memory indefinitely (along with their bufferlist refs).
2863void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
2864{
2865 for (auto p : h->iocv) {
2866 if (p) {
2867 ls->splice(ls->end(), p->running_aios);
2868 }
2869 }
2870 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
2871}
2872
2873void BlueFS::wait_for_aio(FileWriter *h)
2874{
2875 // NOTE: this is safe to call without a lock, as long as our reference is
2876 // stable.
f67539c2
TL
2877 utime_t start;
2878 lgeneric_subdout(cct, bluefs, 10) << __func__;
2879 start = ceph_clock_now();
2880 *_dout << " " << h << dendl;
7c673cae
FG
2881 for (auto p : h->iocv) {
2882 if (p) {
2883 p->aio_wait();
2884 }
2885 }
11fdf7f2 2886 dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 2887}
11fdf7f2 2888#endif
7c673cae 2889
f6b5b4d7
TL
2890int BlueFS::_flush(FileWriter *h, bool force, std::unique_lock<ceph::mutex>& l)
2891{
2892 bool flushed = false;
2893 int r = _flush(h, force, &flushed);
2894 if (r == 0 && flushed) {
2895 _maybe_compact_log(l);
2896 }
2897 return r;
2898}
2899
2900int BlueFS::_flush(FileWriter *h, bool force, bool *flushed)
7c673cae 2901{
f67539c2 2902 uint64_t length = h->get_buffer_length();
7c673cae 2903 uint64_t offset = h->pos;
f6b5b4d7
TL
2904 if (flushed) {
2905 *flushed = false;
2906 }
7c673cae
FG
2907 if (!force &&
2908 length < cct->_conf->bluefs_min_flush_size) {
2909 dout(10) << __func__ << " " << h << " ignoring, length " << length
2910 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
2911 << dendl;
2912 return 0;
2913 }
2914 if (length == 0) {
2915 dout(10) << __func__ << " " << h << " no dirty data on "
2916 << h->file->fnode << dendl;
2917 return 0;
2918 }
2919 dout(10) << __func__ << " " << h << " 0x"
2920 << std::hex << offset << "~" << length << std::dec
2921 << " to " << h->file->fnode << dendl;
11fdf7f2 2922 ceph_assert(h->pos <= h->file->fnode.size);
f6b5b4d7
TL
2923 int r = _flush_range(h, offset, length);
2924 if (flushed) {
2925 *flushed = true;
2926 }
2927 return r;
7c673cae
FG
2928}
2929
2930int BlueFS::_truncate(FileWriter *h, uint64_t offset)
2931{
2932 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
2933 << " file " << h->file->fnode << dendl;
2934 if (h->file->deleted) {
2935 dout(10) << __func__ << " deleted, no-op" << dendl;
2936 return 0;
2937 }
2938
2939 // we never truncate internal log files
11fdf7f2 2940 ceph_assert(h->file->fnode.ino > 1);
7c673cae 2941
7c673cae
FG
2942 // truncate off unflushed data?
2943 if (h->pos < offset &&
f67539c2 2944 h->pos + h->get_buffer_length() > offset) {
7c673cae
FG
2945 dout(20) << __func__ << " tossing out last " << offset - h->pos
2946 << " unflushed bytes" << dendl;
11fdf7f2 2947 ceph_abort_msg("actually this shouldn't happen");
7c673cae 2948 }
f67539c2 2949 if (h->get_buffer_length()) {
7c673cae
FG
2950 int r = _flush(h, true);
2951 if (r < 0)
2952 return r;
2953 }
2954 if (offset == h->file->fnode.size) {
2955 return 0; // no-op!
2956 }
2957 if (offset > h->file->fnode.size) {
11fdf7f2 2958 ceph_abort_msg("truncate up not supported");
7c673cae 2959 }
11fdf7f2 2960 ceph_assert(h->file->fnode.size >= offset);
9f95a23c 2961 vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
7c673cae 2962 h->file->fnode.size = offset;
9f95a23c 2963 vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
7c673cae
FG
2964 log_t.op_file_update(h->file->fnode);
2965 return 0;
2966}
2967
11fdf7f2 2968int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l)
7c673cae
FG
2969{
2970 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
2971 int r = _flush(h, true);
2972 if (r < 0)
2973 return r;
522d829b
TL
2974 if (h->file->is_dirty) {
2975 _signal_dirty_to_log(h);
2976 h->file->is_dirty = false;
2977 }
7c673cae
FG
2978 uint64_t old_dirty_seq = h->file->dirty_seq;
2979
2980 _flush_bdev_safely(h);
2981
2982 if (old_dirty_seq) {
2983 uint64_t s = log_seq;
2984 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
2985 << ") on " << h->file->fnode << ", flushing log" << dendl;
2986 _flush_and_sync_log(l, old_dirty_seq);
11fdf7f2 2987 ceph_assert(h->file->dirty_seq == 0 || // cleaned
7c673cae
FG
2988 h->file->dirty_seq > s); // or redirtied by someone else
2989 }
2990 return 0;
2991}
2992
2993void BlueFS::_flush_bdev_safely(FileWriter *h)
2994{
11fdf7f2
TL
2995 std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
2996 h->dirty_devs.fill(false);
2997#ifdef HAVE_LIBAIO
7c673cae
FG
2998 if (!cct->_conf->bluefs_sync_write) {
2999 list<aio_t> completed_ios;
3000 _claim_completed_aios(h, &completed_ios);
3001 lock.unlock();
3002 wait_for_aio(h);
3003 completed_ios.clear();
11fdf7f2 3004 flush_bdev(flush_devs);
7c673cae 3005 lock.lock();
11fdf7f2
TL
3006 } else
3007#endif
3008 {
7c673cae 3009 lock.unlock();
11fdf7f2 3010 flush_bdev(flush_devs);
7c673cae
FG
3011 lock.lock();
3012 }
3013}
3014
11fdf7f2
TL
3015void BlueFS::flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
3016{
3017 // NOTE: this is safe to call without a lock.
3018 dout(20) << __func__ << dendl;
3019 for (unsigned i = 0; i < MAX_BDEV; i++) {
3020 if (dirty_bdevs[i])
3021 bdev[i]->flush();
3022 }
3023}
3024
7c673cae
FG
3025void BlueFS::flush_bdev()
3026{
3027 // NOTE: this is safe to call without a lock.
3028 dout(20) << __func__ << dendl;
f67539c2
TL
3029 for (unsigned i = 0; i < MAX_BDEV; i++) {
3030 // alloc space from BDEV_SLOW is unexpected.
3031 // So most cases we don't alloc from BDEV_SLOW and so avoiding flush not-used device.
3032 if (bdev[i] && (i != BDEV_SLOW || _get_used(i))) {
3033 bdev[i]->flush();
3034 }
7c673cae
FG
3035 }
3036}
3037
eafe8130
TL
3038const char* BlueFS::get_device_name(unsigned id)
3039{
3040 if (id >= MAX_BDEV) return "BDEV_INV";
3041 const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3042 return names[id];
3043}
3044
11fdf7f2
TL
3045int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
3046 PExtentVector* extents)
3047{
3048 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3049 << " from " << (int)id << dendl;
3050 assert(id < alloc.size());
11fdf7f2
TL
3051 if (!alloc[id]) {
3052 return -ENOENT;
3053 }
3054 extents->reserve(4); // 4 should be (more than) enough for most allocations
f67539c2
TL
3055 int64_t need = round_up_to(len, alloc_size[id]);
3056 int64_t alloc_len = alloc[id]->allocate(need, alloc_size[id], 0, extents);
3057 if (alloc_len < 0 || alloc_len < need) {
eafe8130 3058 if (alloc_len > 0) {
11fdf7f2
TL
3059 alloc[id]->release(*extents);
3060 }
f67539c2
TL
3061 derr << __func__ << " unable to allocate 0x" << std::hex << need
3062 << " on bdev " << (int)id
3063 << ", allocator name " << alloc[id]->get_name()
3064 << ", allocator type " << alloc[id]->get_type()
3065 << ", capacity 0x" << alloc[id]->get_capacity()
3066 << ", block size 0x" << alloc[id]->get_block_size()
3067 << ", free 0x" << alloc[id]->get_free()
3068 << ", fragmentation " << alloc[id]->get_fragmentation()
3069 << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
3070 << std::dec << dendl;
3071 alloc[id]->dump();
11fdf7f2
TL
3072 return -ENOSPC;
3073 }
f67539c2
TL
3074 if (is_shared_alloc(id)) {
3075 shared_alloc->bluefs_used += alloc_len;
3076 }
11fdf7f2
TL
3077
3078 return 0;
3079}
3080
7c673cae 3081int BlueFS::_allocate(uint8_t id, uint64_t len,
94b18763 3082 bluefs_fnode_t* node)
7c673cae
FG
3083{
3084 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3085 << " from " << (int)id << dendl;
11fdf7f2 3086 ceph_assert(id < alloc.size());
b32b8144 3087 int64_t alloc_len = 0;
a8e16298 3088 PExtentVector extents;
11fdf7f2 3089 uint64_t hint = 0;
f67539c2 3090 int64_t need = len;
7c673cae 3091 if (alloc[id]) {
f67539c2 3092 need = round_up_to(len, alloc_size[id]);
94b18763
FG
3093 if (!node->extents.empty() && node->extents.back().bdev == id) {
3094 hint = node->extents.back().end();
11fdf7f2 3095 }
b32b8144 3096 extents.reserve(4); // 4 should be (more than) enough for most allocations
f67539c2 3097 alloc_len = alloc[id]->allocate(need, alloc_size[id], hint, &extents);
b32b8144 3098 }
f67539c2
TL
3099 if (alloc_len < 0 || alloc_len < need) {
3100 if (alloc[id]) {
3101 if (alloc_len > 0) {
3102 alloc[id]->release(extents);
3103 }
3104 dout(1) << __func__ << " unable to allocate 0x" << std::hex << need
3105 << " on bdev " << (int)id
3106 << ", allocator name " << alloc[id]->get_name()
3107 << ", allocator type " << alloc[id]->get_type()
3108 << ", capacity 0x" << alloc[id]->get_capacity()
3109 << ", block size 0x" << alloc[id]->get_block_size()
3110 << ", free 0x" << alloc[id]->get_free()
3111 << ", fragmentation " << alloc[id]->get_fragmentation()
3112 << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
3113 << std::dec << dendl;
b32b8144 3114 }
f67539c2 3115
7c673cae 3116 if (id != BDEV_SLOW) {
f67539c2
TL
3117 dout(20) << __func__ << " fallback to bdev "
3118 << (int)id + 1
3119 << dendl;
94b18763 3120 return _allocate(id + 1, len, node);
11fdf7f2 3121 } else {
f67539c2
TL
3122 derr << __func__ << " allocation failed, needed 0x" << std::hex << need
3123 << dendl;
11fdf7f2 3124 }
f67539c2 3125 return -ENOSPC;
11fdf7f2 3126 } else {
f67539c2
TL
3127 uint64_t used = _get_used(id);
3128 if (max_bytes[id] < used) {
3129 logger->set(max_bytes_pcounters[id], used);
3130 max_bytes[id] = used;
3131 }
3132 if (is_shared_alloc(id)) {
3133 shared_alloc->bluefs_used += alloc_len;
11fdf7f2 3134 }
7c673cae
FG
3135 }
3136
3137 for (auto& p : extents) {
94b18763 3138 node->append_extent(bluefs_extent_t(id, p.offset, p.length));
7c673cae
FG
3139 }
3140
3141 return 0;
3142}
3143
3144int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
3145{
3146 dout(10) << __func__ << " file " << f->fnode << " 0x"
3147 << std::hex << off << "~" << len << std::dec << dendl;
3148 if (f->deleted) {
3149 dout(10) << __func__ << " deleted, no-op" << dendl;
3150 return 0;
3151 }
11fdf7f2 3152 ceph_assert(f->fnode.ino > 1);
7c673cae
FG
3153 uint64_t allocated = f->fnode.get_allocated();
3154 if (off + len > allocated) {
3155 uint64_t want = off + len - allocated;
9f95a23c
TL
3156 vselector->sub_usage(f->vselector_hint, f->fnode);
3157
3158 int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
3159 want,
3160 &f->fnode);
3161 vselector->add_usage(f->vselector_hint, f->fnode);
7c673cae
FG
3162 if (r < 0)
3163 return r;
7c673cae
FG
3164 log_t.op_file_update(f->fnode);
3165 }
3166 return 0;
3167}
3168
1911f103 3169void BlueFS::sync_metadata(bool avoid_compact)
7c673cae 3170{
f67539c2 3171 std::unique_lock l(lock);
9f95a23c 3172 if (log_t.empty() && dirty_files.empty()) {
7c673cae 3173 dout(10) << __func__ << " - no pending log events" << dendl;
11fdf7f2 3174 } else {
f67539c2
TL
3175 utime_t start;
3176 lgeneric_subdout(cct, bluefs, 10) << __func__;
3177 start = ceph_clock_now();
3178 *_dout << dendl;
11fdf7f2
TL
3179 flush_bdev(); // FIXME?
3180 _flush_and_sync_log(l);
3181 dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 3182 }
7c673cae 3183
f6b5b4d7
TL
3184 if (!avoid_compact) {
3185 _maybe_compact_log(l);
3186 }
3187}
3188
3189void BlueFS::_maybe_compact_log(std::unique_lock<ceph::mutex>& l)
3190{
3191 if (!cct->_conf->bluefs_replay_recovery_disable_compact &&
3192 _should_compact_log()) {
7c673cae
FG
3193 if (cct->_conf->bluefs_compact_log_sync) {
3194 _compact_log_sync();
3195 } else {
3196 _compact_log_async(l);
3197 }
3198 }
7c673cae
FG
3199}
3200
3201int BlueFS::open_for_write(
b3b6e05e
TL
3202 std::string_view dirname,
3203 std::string_view filename,
7c673cae
FG
3204 FileWriter **h,
3205 bool overwrite)
3206{
11fdf7f2 3207 std::lock_guard l(lock);
7c673cae
FG
3208 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3209 map<string,DirRef>::iterator p = dir_map.find(dirname);
3210 DirRef dir;
3211 if (p == dir_map.end()) {
3212 // implicitly create the dir
3213 dout(20) << __func__ << " dir " << dirname
3214 << " does not exist" << dendl;
3215 return -ENOENT;
3216 } else {
3217 dir = p->second;
3218 }
3219
3220 FileRef file;
3221 bool create = false;
f6b5b4d7 3222 bool truncate = false;
7c673cae
FG
3223 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3224 if (q == dir->file_map.end()) {
3225 if (overwrite) {
3226 dout(20) << __func__ << " dir " << dirname << " (" << dir
3227 << ") file " << filename
3228 << " does not exist" << dendl;
3229 return -ENOENT;
3230 }
9f95a23c 3231 file = ceph::make_ref<File>();
7c673cae
FG
3232 file->fnode.ino = ++ino_last;
3233 file_map[ino_last] = file;
b3b6e05e 3234 dir->file_map[string{filename}] = file;
7c673cae
FG
3235 ++file->refs;
3236 create = true;
3237 } else {
3238 // overwrite existing file?
3239 file = q->second;
3240 if (overwrite) {
3241 dout(20) << __func__ << " dir " << dirname << " (" << dir
3242 << ") file " << filename
3243 << " already exists, overwrite in place" << dendl;
3244 } else {
3245 dout(20) << __func__ << " dir " << dirname << " (" << dir
3246 << ") file " << filename
3247 << " already exists, truncate + overwrite" << dendl;
9f95a23c 3248 vselector->sub_usage(file->vselector_hint, file->fnode);
7c673cae
FG
3249 file->fnode.size = 0;
3250 for (auto& p : file->fnode.extents) {
3251 pending_release[p.bdev].insert(p.offset, p.length);
3252 }
f6b5b4d7 3253 truncate = true;
94b18763
FG
3254
3255 file->fnode.clear_extents();
7c673cae
FG
3256 }
3257 }
11fdf7f2 3258 ceph_assert(file->fnode.ino > 1);
7c673cae
FG
3259
3260 file->fnode.mtime = ceph_clock_now();
9f95a23c 3261 file->vselector_hint = vselector->get_hint_by_dir(dirname);
f6b5b4d7
TL
3262 if (create || truncate) {
3263 vselector->add_usage(file->vselector_hint, file->fnode); // update file count
3264 }
9f95a23c 3265
7c673cae 3266 dout(20) << __func__ << " mapping " << dirname << "/" << filename
9f95a23c
TL
3267 << " vsel_hint " << file->vselector_hint
3268 << dendl;
7c673cae
FG
3269
3270 log_t.op_file_update(file->fnode);
3271 if (create)
3272 log_t.op_dir_link(dirname, filename, file->fnode.ino);
3273
3274 *h = _create_writer(file);
3275
3276 if (boost::algorithm::ends_with(filename, ".log")) {
3277 (*h)->writer_type = BlueFS::WRITER_WAL;
3278 if (logger && !overwrite) {
3279 logger->inc(l_bluefs_files_written_wal);
3280 }
3281 } else if (boost::algorithm::ends_with(filename, ".sst")) {
3282 (*h)->writer_type = BlueFS::WRITER_SST;
3283 if (logger) {
3284 logger->inc(l_bluefs_files_written_sst);
3285 }
3286 }
3287
3288 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3289 return 0;
3290}
3291
3292BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
3293{
3294 FileWriter *w = new FileWriter(f);
3295 for (unsigned i = 0; i < MAX_BDEV; ++i) {
3296 if (bdev[i]) {
3297 w->iocv[i] = new IOContext(cct, NULL);
7c673cae
FG
3298 }
3299 }
3300 return w;
3301}
3302
3303void BlueFS::_close_writer(FileWriter *h)
3304{
3305 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
f67539c2 3306 //h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
7c673cae
FG
3307 for (unsigned i=0; i<MAX_BDEV; ++i) {
3308 if (bdev[i]) {
11fdf7f2
TL
3309 if (h->iocv[i]) {
3310 h->iocv[i]->aio_wait();
3311 bdev[i]->queue_reap_ioc(h->iocv[i]);
3312 }
7c673cae
FG
3313 }
3314 }
522d829b
TL
3315 // sanity
3316 if (h->file->fnode.size >= (1ull << 30)) {
3317 dout(10) << __func__ << " file is unexpectedly large:" << h->file->fnode << dendl;
3318 }
7c673cae
FG
3319 delete h;
3320}
3321
522d829b
TL
3322uint64_t BlueFS::debug_get_dirty_seq(FileWriter *h)
3323{
3324 std::lock_guard l(lock);
3325 return h->file->dirty_seq;
3326}
3327
3328bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev)
3329{
3330 std::lock_guard l(lock);
3331 return h->dirty_devs[dev];
3332}
3333
7c673cae 3334int BlueFS::open_for_read(
b3b6e05e
TL
3335 std::string_view dirname,
3336 std::string_view filename,
7c673cae
FG
3337 FileReader **h,
3338 bool random)
3339{
11fdf7f2 3340 std::lock_guard l(lock);
7c673cae
FG
3341 dout(10) << __func__ << " " << dirname << "/" << filename
3342 << (random ? " (random)":" (sequential)") << dendl;
3343 map<string,DirRef>::iterator p = dir_map.find(dirname);
3344 if (p == dir_map.end()) {
3345 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3346 return -ENOENT;
3347 }
3348 DirRef dir = p->second;
3349
3350 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3351 if (q == dir->file_map.end()) {
3352 dout(20) << __func__ << " dir " << dirname << " (" << dir
3353 << ") file " << filename
3354 << " not found" << dendl;
3355 return -ENOENT;
3356 }
3357 File *file = q->second.get();
3358
3359 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
3360 random, false);
3361 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3362 return 0;
3363}
3364
3365int BlueFS::rename(
b3b6e05e
TL
3366 std::string_view old_dirname, std::string_view old_filename,
3367 std::string_view new_dirname, std::string_view new_filename)
7c673cae 3368{
11fdf7f2 3369 std::lock_guard l(lock);
7c673cae
FG
3370 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
3371 << " -> " << new_dirname << "/" << new_filename << dendl;
3372 map<string,DirRef>::iterator p = dir_map.find(old_dirname);
3373 if (p == dir_map.end()) {
3374 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
3375 return -ENOENT;
3376 }
3377 DirRef old_dir = p->second;
3378 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
3379 if (q == old_dir->file_map.end()) {
3380 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
3381 << ") file " << old_filename
3382 << " not found" << dendl;
3383 return -ENOENT;
3384 }
3385 FileRef file = q->second;
3386
3387 p = dir_map.find(new_dirname);
3388 if (p == dir_map.end()) {
3389 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
3390 return -ENOENT;
3391 }
3392 DirRef new_dir = p->second;
3393 q = new_dir->file_map.find(new_filename);
3394 if (q != new_dir->file_map.end()) {
3395 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
3396 << ") file " << new_filename
3397 << " already exists, unlinking" << dendl;
11fdf7f2 3398 ceph_assert(q->second != file);
7c673cae
FG
3399 log_t.op_dir_unlink(new_dirname, new_filename);
3400 _drop_link(q->second);
3401 }
3402
3403 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
3404 << " " << file->fnode << dendl;
3405
b3b6e05e
TL
3406 new_dir->file_map[string{new_filename}] = file;
3407 old_dir->file_map.erase(string{old_filename});
7c673cae
FG
3408
3409 log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
3410 log_t.op_dir_unlink(old_dirname, old_filename);
3411 return 0;
3412}
3413
b3b6e05e 3414int BlueFS::mkdir(std::string_view dirname)
7c673cae 3415{
11fdf7f2 3416 std::lock_guard l(lock);
7c673cae
FG
3417 dout(10) << __func__ << " " << dirname << dendl;
3418 map<string,DirRef>::iterator p = dir_map.find(dirname);
3419 if (p != dir_map.end()) {
3420 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
3421 return -EEXIST;
3422 }
b3b6e05e 3423 dir_map[string{dirname}] = ceph::make_ref<Dir>();
7c673cae
FG
3424 log_t.op_dir_create(dirname);
3425 return 0;
3426}
3427
b3b6e05e 3428int BlueFS::rmdir(std::string_view dirname)
7c673cae 3429{
11fdf7f2 3430 std::lock_guard l(lock);
7c673cae 3431 dout(10) << __func__ << " " << dirname << dendl;
b3b6e05e 3432 auto p = dir_map.find(dirname);
7c673cae
FG
3433 if (p == dir_map.end()) {
3434 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
3435 return -ENOENT;
3436 }
3437 DirRef dir = p->second;
3438 if (!dir->file_map.empty()) {
3439 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
3440 return -ENOTEMPTY;
3441 }
b3b6e05e 3442 dir_map.erase(string{dirname});
7c673cae
FG
3443 log_t.op_dir_remove(dirname);
3444 return 0;
3445}
3446
b3b6e05e 3447bool BlueFS::dir_exists(std::string_view dirname)
7c673cae 3448{
11fdf7f2 3449 std::lock_guard l(lock);
7c673cae
FG
3450 map<string,DirRef>::iterator p = dir_map.find(dirname);
3451 bool exists = p != dir_map.end();
3452 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
3453 return exists;
3454}
3455
b3b6e05e 3456int BlueFS::stat(std::string_view dirname, std::string_view filename,
7c673cae
FG
3457 uint64_t *size, utime_t *mtime)
3458{
11fdf7f2 3459 std::lock_guard l(lock);
7c673cae
FG
3460 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3461 map<string,DirRef>::iterator p = dir_map.find(dirname);
3462 if (p == dir_map.end()) {
3463 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3464 return -ENOENT;
3465 }
3466 DirRef dir = p->second;
3467 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3468 if (q == dir->file_map.end()) {
3469 dout(20) << __func__ << " dir " << dirname << " (" << dir
3470 << ") file " << filename
3471 << " not found" << dendl;
3472 return -ENOENT;
3473 }
3474 File *file = q->second.get();
3475 dout(10) << __func__ << " " << dirname << "/" << filename
3476 << " " << file->fnode << dendl;
3477 if (size)
3478 *size = file->fnode.size;
3479 if (mtime)
3480 *mtime = file->fnode.mtime;
3481 return 0;
3482}
3483
b3b6e05e 3484int BlueFS::lock_file(std::string_view dirname, std::string_view filename,
7c673cae
FG
3485 FileLock **plock)
3486{
11fdf7f2 3487 std::lock_guard l(lock);
7c673cae
FG
3488 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3489 map<string,DirRef>::iterator p = dir_map.find(dirname);
3490 if (p == dir_map.end()) {
3491 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3492 return -ENOENT;
3493 }
3494 DirRef dir = p->second;
b3b6e05e 3495 auto q = dir->file_map.find(filename);
9f95a23c 3496 FileRef file;
7c673cae
FG
3497 if (q == dir->file_map.end()) {
3498 dout(20) << __func__ << " dir " << dirname << " (" << dir
3499 << ") file " << filename
3500 << " not found, creating" << dendl;
9f95a23c 3501 file = ceph::make_ref<File>();
7c673cae
FG
3502 file->fnode.ino = ++ino_last;
3503 file->fnode.mtime = ceph_clock_now();
3504 file_map[ino_last] = file;
b3b6e05e 3505 dir->file_map[string{filename}] = file;
7c673cae
FG
3506 ++file->refs;
3507 log_t.op_file_update(file->fnode);
3508 log_t.op_dir_link(dirname, filename, file->fnode.ino);
3509 } else {
9f95a23c 3510 file = q->second;
7c673cae
FG
3511 if (file->locked) {
3512 dout(10) << __func__ << " already locked" << dendl;
11fdf7f2 3513 return -ENOLCK;
7c673cae
FG
3514 }
3515 }
3516 file->locked = true;
3517 *plock = new FileLock(file);
3518 dout(10) << __func__ << " locked " << file->fnode
3519 << " with " << *plock << dendl;
3520 return 0;
3521}
3522
3523int BlueFS::unlock_file(FileLock *fl)
3524{
11fdf7f2 3525 std::lock_guard l(lock);
7c673cae 3526 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
11fdf7f2 3527 ceph_assert(fl->file->locked);
7c673cae
FG
3528 fl->file->locked = false;
3529 delete fl;
3530 return 0;
3531}
3532
b3b6e05e 3533int BlueFS::readdir(std::string_view dirname, vector<string> *ls)
7c673cae 3534{
b3b6e05e
TL
3535 // dirname may contain a trailing /
3536 if (!dirname.empty() && dirname.back() == '/') {
3537 dirname.remove_suffix(1);
3538 }
11fdf7f2 3539 std::lock_guard l(lock);
7c673cae
FG
3540 dout(10) << __func__ << " " << dirname << dendl;
3541 if (dirname.empty()) {
3542 // list dirs
3543 ls->reserve(dir_map.size() + 2);
3544 for (auto& q : dir_map) {
3545 ls->push_back(q.first);
3546 }
3547 } else {
3548 // list files in dir
3549 map<string,DirRef>::iterator p = dir_map.find(dirname);
3550 if (p == dir_map.end()) {
3551 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3552 return -ENOENT;
3553 }
3554 DirRef dir = p->second;
3555 ls->reserve(dir->file_map.size() + 2);
3556 for (auto& q : dir->file_map) {
3557 ls->push_back(q.first);
3558 }
3559 }
3560 ls->push_back(".");
3561 ls->push_back("..");
3562 return 0;
3563}
3564
b3b6e05e 3565int BlueFS::unlink(std::string_view dirname, std::string_view filename)
7c673cae 3566{
11fdf7f2 3567 std::lock_guard l(lock);
7c673cae
FG
3568 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3569 map<string,DirRef>::iterator p = dir_map.find(dirname);
3570 if (p == dir_map.end()) {
3571 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3572 return -ENOENT;
3573 }
3574 DirRef dir = p->second;
3575 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3576 if (q == dir->file_map.end()) {
3577 dout(20) << __func__ << " file " << dirname << "/" << filename
3578 << " not found" << dendl;
3579 return -ENOENT;
3580 }
3581 FileRef file = q->second;
3582 if (file->locked) {
3583 dout(20) << __func__ << " file " << dirname << "/" << filename
3584 << " is locked" << dendl;
3585 return -EBUSY;
3586 }
b3b6e05e 3587 dir->file_map.erase(string{filename});
7c673cae
FG
3588 log_t.op_dir_unlink(dirname, filename);
3589 _drop_link(file);
3590 return 0;
3591}
d2e6a577
FG
3592
3593bool BlueFS::wal_is_rotational()
3594{
94b18763
FG
3595 if (bdev[BDEV_WAL]) {
3596 return bdev[BDEV_WAL]->is_rotational();
3597 } else if (bdev[BDEV_DB]) {
3598 return bdev[BDEV_DB]->is_rotational();
3599 }
3600 return bdev[BDEV_SLOW]->is_rotational();
d2e6a577 3601}
9f95a23c 3602
f6b5b4d7
TL
3603/*
3604 Algorithm.
3605 do_replay_recovery_read is used when bluefs log abruptly ends, but it seems that more data should be there.
3606 Idea is to search disk for definiton of extents that will be accompanied with bluefs log in future,
3607 and try if using it will produce healthy bluefs transaction.
3608 We encode already known bluefs log extents and search disk for these bytes.
3609 When we find it, we decode following bytes as extent.
3610 We read that whole extent and then check if merged with existing log part gives a proper bluefs transaction.
3611 */
3612int BlueFS::do_replay_recovery_read(FileReader *log_reader,
3613 size_t replay_pos,
3614 size_t read_offset,
3615 size_t read_len,
3616 bufferlist* bl) {
3617 dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos <<
3618 " needs 0x" << read_offset << "~" << read_len << std::dec << dendl;
3619
3620 bluefs_fnode_t& log_fnode = log_reader->file->fnode;
3621 bufferlist bin_extents;
f67539c2 3622 ::encode(log_fnode.extents, bin_extents);
f6b5b4d7
TL
3623 dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl;
3624
3625 // cannot process if too small to effectively search
3626 ceph_assert(bin_extents.length() >= 32);
3627 bufferlist last_32;
3628 last_32.substr_of(bin_extents, bin_extents.length() - 32, 32);
3629
3630 //read fixed part from replay_pos to end of bluefs_log extents
3631 bufferlist fixed;
3632 uint64_t e_off = 0;
3633 auto e = log_fnode.seek(replay_pos, &e_off);
3634 ceph_assert(e != log_fnode.extents.end());
3635 int r = bdev[e->bdev]->read(e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev],
3636 cct->_conf->bluefs_buffered_io);
3637 ceph_assert(r == 0);
3638 //capture dev of last good extent
3639 uint8_t last_e_dev = e->bdev;
3640 uint64_t last_e_off = e->offset;
3641 ++e;
3642 while (e != log_fnode.extents.end()) {
3643 r = bdev[e->bdev]->read(e->offset, e->length, &fixed, ioc[e->bdev],
3644 cct->_conf->bluefs_buffered_io);
3645 ceph_assert(r == 0);
3646 last_e_dev = e->bdev;
3647 ++e;
3648 }
3649 ceph_assert(replay_pos + fixed.length() == read_offset);
3650
3651 dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl;
3652
3653 struct compare {
3654 bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const {
3655 if (a.bdev < b.bdev) return true;
3656 if (a.offset < b.offset) return true;
3657 return a.length < b.length;
3658 }
3659 };
3660 std::set<bluefs_extent_t, compare> extents_rejected;
3661 for (int dcnt = 0; dcnt < 3; dcnt++) {
3662 uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV;
3663 if (bdev[dev] == nullptr) continue;
3664 dout(2) << __func__ << " processing " << get_device_name(dev) << dendl;
3665 interval_set<uint64_t> disk_regions;
3666 disk_regions.insert(0, bdev[dev]->get_size());
3667 for (auto f : file_map) {
3668 auto& e = f.second->fnode.extents;
3669 for (auto& p : e) {
3670 if (p.bdev == dev) {
3671 disk_regions.erase(p.offset, p.length);
3672 }
3673 }
3674 }
3675 size_t disk_regions_count = disk_regions.num_intervals();
3676 dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl;
3677
3678 auto reg = disk_regions.lower_bound(last_e_off);
3679 //for all except first, start from beginning
3680 last_e_off = 0;
3681 if (reg == disk_regions.end()) {
3682 reg = disk_regions.begin();
3683 }
3684 const uint64_t chunk_size = 4 * 1024 * 1024;
3685 const uint64_t page_size = 4096;
3686 const uint64_t max_extent_size = 16;
3687 uint64_t overlay_size = last_32.length() + max_extent_size;
3688 for (size_t i = 0; i < disk_regions_count; reg++, i++) {
3689 if (reg == disk_regions.end()) {
3690 reg = disk_regions.begin();
3691 }
3692 uint64_t pos = reg.get_start();
3693 uint64_t len = reg.get_len();
3694
3695 std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]};
3696 char* raw_data = raw_data_p.get();
3697 memset(raw_data, 0, page_size);
3698
3699 while (len > last_32.length()) {
3700 uint64_t chunk_len = len > chunk_size ? chunk_size : len;
3701 dout(5) << __func__ << " read "
3702 << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len << std::dec << dendl;
3703 r = bdev[dev]->read_random(pos, chunk_len, raw_data + page_size, cct->_conf->bluefs_buffered_io);
3704 ceph_assert(r == 0);
3705
3706 //search for fixed_last_32
3707 char* chunk_b = raw_data + page_size;
3708 char* chunk_e = chunk_b + chunk_len;
3709
3710 char* search_b = chunk_b - overlay_size;
3711 char* search_e = chunk_e;
3712
3713 for (char* sp = search_b; ; sp += last_32.length()) {
3714 sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length());
3715 if (sp == nullptr) {
3716 break;
3717 }
3718
3719 char* n = sp + last_32.length();
3720 dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl;
3721 bufferlist test;
3722 test.append(n, std::min<size_t>(max_extent_size, chunk_e - n));
3723 bluefs_extent_t ne;
3724 try {
3725 bufferlist::const_iterator p = test.begin();
f67539c2 3726 ::decode(ne, p);
f6b5b4d7
TL
3727 } catch (buffer::error& e) {
3728 continue;
3729 }
3730 if (extents_rejected.count(ne) != 0) {
3731 dout(5) << __func__ << " extent " << ne << " already refected" <<dendl;
3732 continue;
3733 }
3734 //insert as rejected already. if we succeed, it wouldn't make difference.
3735 extents_rejected.insert(ne);
3736
3737 if (ne.bdev >= MAX_BDEV ||
3738 bdev[ne.bdev] == nullptr ||
3739 ne.length > 16 * 1024 * 1024 ||
3740 (ne.length & 4095) != 0 ||
3741 ne.offset + ne.length > bdev[ne.bdev]->get_size() ||
3742 (ne.offset & 4095) != 0) {
3743 dout(5) << __func__ << " refusing extent " << ne << dendl;
3744 continue;
3745 }
3746 dout(5) << __func__ << " checking extent " << ne << dendl;
3747
3748 //read candidate extent - whole
3749 bufferlist candidate;
3750 candidate.append(fixed);
3751 r = bdev[ne.bdev]->read(ne.offset, ne.length, &candidate, ioc[ne.bdev],
3752 cct->_conf->bluefs_buffered_io);
3753 ceph_assert(r == 0);
3754
3755 //check if transaction & crc is ok
3756 bluefs_transaction_t t;
3757 try {
f67539c2
TL
3758 bufferlist::const_iterator p = candidate.begin();
3759 ::decode(t, p);
f6b5b4d7
TL
3760 }
3761 catch (buffer::error& e) {
3762 dout(5) << __func__ << " failed match" << dendl;
3763 continue;
3764 }
3765
3766 //success, it seems a probable candidate
3767 uint64_t l = std::min<uint64_t>(ne.length, read_len);
3768 //trim to required size
3769 bufferlist requested_read;
3770 requested_read.substr_of(candidate, fixed.length(), l);
3771 bl->append(requested_read);
3772 dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl;
3773 log_fnode.append_extent(ne);
3774 log_fnode.recalc_allocated();
3775 log_reader->buf.pos += l;
3776 return l;
3777 }
3778 //save overlay for next search
3779 memcpy(search_b, chunk_e - overlay_size, overlay_size);
3780 pos += chunk_len;
3781 len -= chunk_len;
3782 }
3783 }
3784 }
3785 return 0;
3786}
3787
f67539c2 3788size_t BlueFS::probe_alloc_avail(int dev, uint64_t alloc_size)
9f95a23c 3789{
f67539c2
TL
3790 size_t total = 0;
3791 auto iterated_allocation = [&](size_t off, size_t len) {
3792 //only count in size that is alloc_size aligned
3793 size_t dist_to_alignment;
3794 size_t offset_in_block = off & (alloc_size - 1);
3795 if (offset_in_block == 0)
3796 dist_to_alignment = 0;
3797 else
3798 dist_to_alignment = alloc_size - offset_in_block;
3799 if (dist_to_alignment >= len)
3800 return;
3801 len -= dist_to_alignment;
3802 total += p2align(len, alloc_size);
3803 };
3804 if (alloc[dev]) {
3805 alloc[dev]->dump(iterated_allocation);
9f95a23c 3806 }
f67539c2 3807 return total;
9f95a23c 3808}
9f95a23c
TL
3809// ===============================================
3810// OriginalVolumeSelector
3811
f6b5b4d7
TL
3812void* OriginalVolumeSelector::get_hint_for_log() const {
3813 return reinterpret_cast<void*>(BlueFS::BDEV_WAL);
9f95a23c 3814}
b3b6e05e 3815void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
9f95a23c
TL
3816 uint8_t res = BlueFS::BDEV_DB;
3817 if (dirname.length() > 5) {
3818 // the "db.slow" and "db.wal" directory names are hard-coded at
3819 // match up with bluestore. the slow device is always the second
3820 // one (when a dedicated block.db device is present and used at
3821 // bdev 0). the wal device is always last.
3822 if (boost::algorithm::ends_with(dirname, ".slow")) {
3823 res = BlueFS::BDEV_SLOW;
3824 }
3825 else if (boost::algorithm::ends_with(dirname, ".wal")) {
3826 res = BlueFS::BDEV_WAL;
3827 }
3828 }
3829 return reinterpret_cast<void*>(res);
3830}
3831
3832uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
3833{
3834 return (uint8_t)(reinterpret_cast<uint64_t>(hint));
3835}
3836
3837void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
3838{
3839 res.emplace_back(base, db_total);
522d829b
TL
3840 res.emplace_back(base + ".slow",
3841 slow_total ? slow_total : db_total); // use fake non-zero value if needed to
3842 // avoid RocksDB complains
9f95a23c
TL
3843}
3844
3845#undef dout_prefix
3846#define dout_prefix *_dout << "OriginalVolumeSelector: "
3847
3848void OriginalVolumeSelector::dump(ostream& sout) {
3849 sout<< "wal_total:" << wal_total
3850 << ", db_total:" << db_total
3851 << ", slow_total:" << slow_total
3852 << std::endl;
3853}
f67539c2
TL
3854
3855// ===============================================
3856// FitToFastVolumeSelector
3857
3858void FitToFastVolumeSelector::get_paths(const std::string& base, paths& res) const {
3859 res.emplace_back(base, 1); // size of the last db_path has no effect
3860}