]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueFS.cc
import ceph pacific 16.2.5
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include "boost/algorithm/string.hpp"
9f95a23c 5#include "bluestore_common.h"
7c673cae
FG
6#include "BlueFS.h"
7
8#include "common/debug.h"
9#include "common/errno.h"
10#include "common/perf_counters.h"
7c673cae 11#include "Allocator.h"
11fdf7f2 12#include "include/ceph_assert.h"
eafe8130 13#include "common/admin_socket.h"
7c673cae
FG
14
15#define dout_context cct
16#define dout_subsys ceph_subsys_bluefs
17#undef dout_prefix
18#define dout_prefix *_dout << "bluefs "
9f95a23c 19using TOPNSPC::common::cmd_getval;
f67539c2
TL
20
21using std::byte;
22using std::list;
23using std::make_pair;
24using std::map;
25using std::ostream;
26using std::pair;
27using std::set;
28using std::string;
29using std::to_string;
30using std::vector;
31
32using ceph::bufferlist;
33using ceph::decode;
34using ceph::encode;
35using ceph::Formatter;
36
37
7c673cae
FG
38MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
39MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
f91f0fd5 40MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer);
7c673cae 41MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
f91f0fd5
TL
42 bluefs_file_reader_buffer, bluefs_file_reader);
43MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader);
7c673cae
FG
44MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
45
11fdf7f2
TL
46static void wal_discard_cb(void *priv, void* priv2) {
47 BlueFS *bluefs = static_cast<BlueFS*>(priv);
48 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
49 bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
50}
51
52static void db_discard_cb(void *priv, void* priv2) {
53 BlueFS *bluefs = static_cast<BlueFS*>(priv);
54 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
55 bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
56}
57
58static void slow_discard_cb(void *priv, void* priv2) {
59 BlueFS *bluefs = static_cast<BlueFS*>(priv);
60 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
61 bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
62}
7c673cae 63
eafe8130
TL
64class BlueFS::SocketHook : public AdminSocketHook {
65 BlueFS* bluefs;
66public:
67 static BlueFS::SocketHook* create(BlueFS* bluefs)
68 {
69 BlueFS::SocketHook* hook = nullptr;
70 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
71 if (admin_socket) {
72 hook = new BlueFS::SocketHook(bluefs);
f67539c2 73 int r = admin_socket->register_command("bluestore bluefs device info "
eafe8130
TL
74 "name=alloc_size,type=CephInt,req=false",
75 hook,
f67539c2
TL
76 "Shows space report for bluefs devices. "
77 "This also includes an estimation for space "
78 "available to bluefs at main device. "
79 "alloc_size, if set, specifies the custom bluefs "
80 "allocation unit size for the estimation above.");
eafe8130
TL
81 if (r != 0) {
82 ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
83 delete hook;
84 hook = nullptr;
9f95a23c 85 } else {
f6b5b4d7 86 r = admin_socket->register_command("bluefs stats",
9f95a23c
TL
87 hook,
88 "Dump internal statistics for bluefs."
89 "");
90 ceph_assert(r == 0);
f67539c2
TL
91 r = admin_socket->register_command("bluefs files list", hook,
92 "print files in bluefs");
93 ceph_assert(r == 0);
cd265ab1
TL
94 r = admin_socket->register_command("bluefs debug_inject_read_zeros", hook,
95 "Injects 8K zeros into next BlueFS read. Debug only.");
96 ceph_assert(r == 0);
eafe8130
TL
97 }
98 }
99 return hook;
100 }
101
102 ~SocketHook() {
103 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
9f95a23c 104 admin_socket->unregister_commands(this);
eafe8130
TL
105 }
106private:
107 SocketHook(BlueFS* bluefs) :
108 bluefs(bluefs) {}
9f95a23c
TL
109 int call(std::string_view command, const cmdmap_t& cmdmap,
110 Formatter *f,
111 std::ostream& errss,
112 bufferlist& out) override {
f67539c2 113 if (command == "bluestore bluefs device info") {
9f95a23c
TL
114 int64_t alloc_size = 0;
115 cmd_getval(cmdmap, "alloc_size", alloc_size);
116 if ((alloc_size & (alloc_size - 1)) != 0) {
117 errss << "Invalid allocation size:'" << alloc_size << std::endl;
118 return -EINVAL;
119 }
120 if (alloc_size == 0)
f67539c2
TL
121 alloc_size = bluefs->cct->_conf->bluefs_shared_alloc_size;
122 f->open_object_section("bluefs_device_info");
9f95a23c
TL
123 for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
124 if (bluefs->bdev[dev]) {
125 f->open_object_section("dev");
126 f->dump_string("device", bluefs->get_device_name(dev));
127 ceph_assert(bluefs->alloc[dev]);
f67539c2
TL
128 auto total = bluefs->get_total(dev);
129 auto free = bluefs->get_free(dev);
130 auto used = bluefs->get_used(dev);
131
132 f->dump_int("total", total);
133 f->dump_int("free", free);
134 f->dump_int("bluefs_used", used);
135 if (bluefs->is_shared_alloc(dev)) {
136 size_t avail = bluefs->probe_alloc_avail(dev, alloc_size);
137 f->dump_int("bluefs max available", avail);
138 }
139 f->close_section();
140 }
eafe8130 141 }
f67539c2 142
9f95a23c
TL
143 f->close_section();
144 } else if (command == "bluefs stats") {
145 std::stringstream ss;
146 bluefs->dump_block_extents(ss);
147 bluefs->dump_volume_selector(ss);
eafe8130 148 out.append(ss);
f67539c2
TL
149 } else if (command == "bluefs files list") {
150 const char* devnames[3] = {"wal","db","slow"};
151 std::lock_guard l(bluefs->lock);
152 f->open_array_section("files");
153 for (auto &d : bluefs->dir_map) {
154 std::string dir = d.first;
155 for (auto &r : d.second->file_map) {
156 f->open_object_section("file");
157 f->dump_string("name", (dir + "/" + r.first).c_str());
158 std::vector<size_t> sizes;
159 sizes.resize(bluefs->bdev.size());
160 for(auto& i : r.second->fnode.extents) {
161 sizes[i.bdev] += i.length;
162 }
163 for (size_t i = 0; i < sizes.size(); i++) {
164 if (sizes[i]>0) {
165 if (i < sizeof(devnames) / sizeof(*devnames))
166 f->dump_int(devnames[i], sizes[i]);
167 else
168 f->dump_int(("dev-"+to_string(i)).c_str(), sizes[i]);
169 }
170 }
171 f->close_section();
172 }
173 }
174 f->close_section();
175 f->flush(out);
cd265ab1
TL
176 } else if (command == "bluefs debug_inject_read_zeros") {
177 bluefs->inject_read_zeros++;
9f95a23c
TL
178 } else {
179 errss << "Invalid command" << std::endl;
180 return -ENOSYS;
eafe8130 181 }
9f95a23c
TL
182 return 0;
183 }
eafe8130
TL
184};
185
7c673cae
FG
186BlueFS::BlueFS(CephContext* cct)
187 : cct(cct),
188 bdev(MAX_BDEV),
189 ioc(MAX_BDEV),
f67539c2
TL
190 block_reserved(MAX_BDEV),
191 alloc(MAX_BDEV),
192 alloc_size(MAX_BDEV, 0),
193 pending_release(MAX_BDEV)
7c673cae 194{
11fdf7f2
TL
195 discard_cb[BDEV_WAL] = wal_discard_cb;
196 discard_cb[BDEV_DB] = db_discard_cb;
197 discard_cb[BDEV_SLOW] = slow_discard_cb;
eafe8130 198 asok_hook = SocketHook::create(this);
f67539c2 199
7c673cae
FG
200}
201
202BlueFS::~BlueFS()
203{
eafe8130 204 delete asok_hook;
7c673cae
FG
205 for (auto p : ioc) {
206 if (p)
207 p->aio_wait();
208 }
209 for (auto p : bdev) {
210 if (p) {
211 p->close();
212 delete p;
213 }
214 }
215 for (auto p : ioc) {
216 delete p;
217 }
218}
219
220void BlueFS::_init_logger()
221{
222 PerfCountersBuilder b(cct, "bluefs",
223 l_bluefs_first, l_bluefs_last);
7c673cae
FG
224 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
225 "Total bytes (main db device)",
11fdf7f2 226 "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
227 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
228 "Used bytes (main db device)",
11fdf7f2 229 "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
230 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
231 "Total bytes (wal device)",
11fdf7f2 232 "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
233 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
234 "Used bytes (wal device)",
11fdf7f2 235 "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
236 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
237 "Total bytes (slow device)",
11fdf7f2 238 "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
239 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
240 "Used bytes (slow device)",
11fdf7f2 241 "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
242 b.add_u64(l_bluefs_num_files, "num_files", "File count",
243 "f", PerfCountersBuilder::PRIO_USEFUL);
244 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
11fdf7f2 245 "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
7c673cae
FG
246 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
247 "Compactions of the metadata log");
248 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
249 "Bytes written to the metadata log", "j",
11fdf7f2 250 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
7c673cae
FG
251 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
252 "Files written to WAL");
253 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
254 "Files written to SSTs");
255 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
256 "Bytes written to WAL", "wal",
257 PerfCountersBuilder::PRIO_CRITICAL);
258 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
259 "Bytes written to SSTs", "sst",
11fdf7f2
TL
260 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
261 b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
262 "Bytes written to WAL/SSTs at slow device", NULL,
263 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
264 b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
265 "Maximum bytes allocated from WAL");
266 b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
267 "Maximum bytes allocated from DB");
268 b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
269 "Maximum bytes allocated from SLOW");
494da23a
TL
270
271 b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
272 "random read requests processed");
273 b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
274 "Bytes requested in random read mode", NULL,
275 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
276 b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
277 "random reads requests going to disk");
278 b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
279 "Bytes read from disk in random read mode", NULL,
280 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
281 b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
282 "random read requests processed using prefetch buffer");
283 b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
284 "Bytes read from prefetch buffer in random read mode", NULL,
285 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
286
287 b.add_u64_counter(l_bluefs_read_count, "read_count",
288 "buffered read requests processed");
289 b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
290 "Bytes requested in buffered read mode", NULL,
291 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
292
293 b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
294 "prefetch read requests processed");
295 b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
296 "Bytes requested in prefetch read mode", NULL,
297 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
cd265ab1
TL
298 b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate",
299 "How many times bluefs read found page with all 0s");
300 b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors",
301 "How many times bluefs read found transient page with all 0s");
494da23a 302
7c673cae
FG
303 logger = b.create_perf_counters();
304 cct->get_perfcounters_collection()->add(logger);
305}
306
307void BlueFS::_shutdown_logger()
308{
309 cct->get_perfcounters_collection()->remove(logger);
310 delete logger;
311}
312
313void BlueFS::_update_logger_stats()
314{
315 // we must be holding the lock
316 logger->set(l_bluefs_num_files, file_map.size());
317 logger->set(l_bluefs_log_bytes, log_writer->file->fnode.size);
318
319 if (alloc[BDEV_WAL]) {
f67539c2
TL
320 logger->set(l_bluefs_wal_total_bytes, _get_total(BDEV_WAL));
321 logger->set(l_bluefs_wal_used_bytes, _get_used(BDEV_WAL));
7c673cae
FG
322 }
323 if (alloc[BDEV_DB]) {
f67539c2
TL
324 logger->set(l_bluefs_db_total_bytes, _get_total(BDEV_DB));
325 logger->set(l_bluefs_db_used_bytes, _get_used(BDEV_DB));
7c673cae
FG
326 }
327 if (alloc[BDEV_SLOW]) {
f67539c2
TL
328 logger->set(l_bluefs_slow_total_bytes, _get_total(BDEV_SLOW));
329 logger->set(l_bluefs_slow_used_bytes, _get_used(BDEV_SLOW));
7c673cae
FG
330 }
331}
332
11fdf7f2 333int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
f67539c2
TL
334 uint64_t reserved,
335 bluefs_shared_alloc_context_t* _shared_alloc)
7c673cae 336{
f67539c2
TL
337 dout(10) << __func__ << " bdev " << id << " path " << path << " "
338 << reserved << dendl;
11fdf7f2
TL
339 ceph_assert(id < bdev.size());
340 ceph_assert(bdev[id] == NULL);
341 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
342 discard_cb[id], static_cast<void*>(this));
f67539c2
TL
343 block_reserved[id] = reserved;
344 if (_shared_alloc) {
11fdf7f2
TL
345 b->set_no_exclusive_lock();
346 }
7c673cae
FG
347 int r = b->open(path);
348 if (r < 0) {
349 delete b;
350 return r;
351 }
11fdf7f2
TL
352 if (trim) {
353 b->discard(0, b->get_size());
354 }
355
7c673cae 356 dout(1) << __func__ << " bdev " << id << " path " << path
1adf2230 357 << " size " << byte_u_t(b->get_size()) << dendl;
7c673cae
FG
358 bdev[id] = b;
359 ioc[id] = new IOContext(cct, NULL);
f67539c2
TL
360 if (_shared_alloc) {
361 ceph_assert(!shared_alloc);
362 shared_alloc = _shared_alloc;
363 alloc[id] = shared_alloc->a;
364 shared_alloc_id = id;
365 }
7c673cae
FG
366 return 0;
367}
368
369bool BlueFS::bdev_support_label(unsigned id)
370{
11fdf7f2
TL
371 ceph_assert(id < bdev.size());
372 ceph_assert(bdev[id]);
7c673cae
FG
373 return bdev[id]->supported_bdev_label();
374}
375
f67539c2 376uint64_t BlueFS::get_block_device_size(unsigned id) const
7c673cae
FG
377{
378 if (id < bdev.size() && bdev[id])
379 return bdev[id]->get_size();
380 return 0;
381}
382
f67539c2 383void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
7c673cae 384{
f67539c2
TL
385 dout(10) << __func__ << " bdev " << id << dendl;
386 ceph_assert(alloc[id]);
387 alloc[id]->release(to_release);
388 if (is_shared_alloc(id)) {
389 shared_alloc->bluefs_used -= to_release.size();
7c673cae 390 }
7c673cae
FG
391}
392
f67539c2 393uint64_t BlueFS::get_used()
7c673cae 394{
f67539c2
TL
395 std::lock_guard l(lock);
396 uint64_t used = 0;
397 for (unsigned id = 0; id < MAX_BDEV; ++id) {
398 used += _get_used(id);
7c673cae 399 }
f67539c2
TL
400 return used;
401}
7c673cae 402
f67539c2
TL
403uint64_t BlueFS::_get_used(unsigned id) const
404{
405 uint64_t used = 0;
406 if (!alloc[id])
407 return 0;
9f95a23c 408
f67539c2
TL
409 if (is_shared_alloc(id)) {
410 used = shared_alloc->bluefs_used;
411 } else {
412 used = _get_total(id) - alloc[id]->get_free();
9f95a23c 413 }
f67539c2 414 return used;
7c673cae
FG
415}
416
f67539c2 417uint64_t BlueFS::get_used(unsigned id)
7c673cae 418{
f67539c2 419 ceph_assert(id < alloc.size());
11fdf7f2 420 ceph_assert(alloc[id]);
f67539c2
TL
421 std::lock_guard l(lock);
422 return _get_used(id);
11fdf7f2
TL
423}
424
f67539c2 425uint64_t BlueFS::_get_total(unsigned id) const
11fdf7f2 426{
f67539c2
TL
427 ceph_assert(id < bdev.size());
428 ceph_assert(id < block_reserved.size());
429 return get_block_device_size(id) - block_reserved[id];
7c673cae
FG
430}
431
432uint64_t BlueFS::get_total(unsigned id)
433{
11fdf7f2 434 std::lock_guard l(lock);
f67539c2 435 return _get_total(id);
7c673cae
FG
436}
437
438uint64_t BlueFS::get_free(unsigned id)
439{
11fdf7f2
TL
440 std::lock_guard l(lock);
441 ceph_assert(id < alloc.size());
7c673cae
FG
442 return alloc[id]->get_free();
443}
444
445void BlueFS::dump_perf_counters(Formatter *f)
446{
447 f->open_object_section("bluefs_perf_counters");
448 logger->dump_formatted(f,0);
449 f->close_section();
450}
451
3efd9988
FG
452void BlueFS::dump_block_extents(ostream& out)
453{
454 for (unsigned i = 0; i < MAX_BDEV; ++i) {
455 if (!bdev[i]) {
456 continue;
457 }
f67539c2 458 auto total = get_total(i);
11fdf7f2 459 auto free = get_free(i);
1911f103 460
f67539c2
TL
461 out << i << " : device size 0x" << std::hex << total
462 << " : using 0x" << total - free
463 << std::dec << "(" << byte_u_t(total - free) << ")";
1911f103 464 out << "\n";
3efd9988
FG
465 }
466}
7c673cae 467
7c673cae
FG
468int BlueFS::get_block_extents(unsigned id, interval_set<uint64_t> *extents)
469{
11fdf7f2 470 std::lock_guard l(lock);
7c673cae 471 dout(10) << __func__ << " bdev " << id << dendl;
f67539c2
TL
472 ceph_assert(id < alloc.size());
473 for (auto& p : file_map) {
474 for (auto& q : p.second->fnode.extents) {
475 if (q.bdev == id) {
476 extents->insert(q.offset, q.length);
477 }
478 }
479 }
7c673cae
FG
480 return 0;
481}
482
9f95a23c 483int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
7c673cae 484{
11fdf7f2 485 std::unique_lock l(lock);
7c673cae
FG
486 dout(1) << __func__
487 << " osd_uuid " << osd_uuid
488 << dendl;
489
9f95a23c
TL
490 // set volume selector if not provided before/outside
491 if (vselector == nullptr) {
492 vselector.reset(
493 new OriginalVolumeSelector(
494 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
495 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
496 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
497 }
498
7c673cae
FG
499 _init_alloc();
500 _init_logger();
501
502 super.version = 1;
503 super.block_size = bdev[BDEV_DB]->get_block_size();
504 super.osd_uuid = osd_uuid;
505 super.uuid.generate_random();
506 dout(1) << __func__ << " uuid " << super.uuid << dendl;
507
508 // init log
9f95a23c 509 FileRef log_file = ceph::make_ref<File>();
7c673cae 510 log_file->fnode.ino = 1;
f6b5b4d7 511 log_file->vselector_hint = vselector->get_hint_for_log();
7c673cae 512 int r = _allocate(
9f95a23c 513 vselector->select_prefer_bdev(log_file->vselector_hint),
7c673cae 514 cct->_conf->bluefs_max_log_runway,
94b18763 515 &log_file->fnode);
9f95a23c 516 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
11fdf7f2 517 ceph_assert(r == 0);
7c673cae
FG
518 log_writer = _create_writer(log_file);
519
520 // initial txn
521 log_t.op_init();
7c673cae
FG
522 _flush_and_sync_log(l);
523
524 // write supers
525 super.log_fnode = log_file->fnode;
9f95a23c 526 super.memorized_layout = layout;
11fdf7f2 527 _write_super(BDEV_DB);
7c673cae
FG
528 flush_bdev();
529
530 // clean up
531 super = bluefs_super_t();
532 _close_writer(log_writer);
533 log_writer = NULL;
9f95a23c 534 vselector.reset(nullptr);
7c673cae
FG
535 _stop_alloc();
536 _shutdown_logger();
f67539c2
TL
537 if (shared_alloc) {
538 ceph_assert(shared_alloc->need_init);
539 shared_alloc->need_init = false;
540 }
7c673cae
FG
541
542 dout(10) << __func__ << " success" << dendl;
543 return 0;
544}
545
546void BlueFS::_init_alloc()
547{
548 dout(20) << __func__ << dendl;
eafe8130
TL
549
550 if (bdev[BDEV_WAL]) {
551 alloc_size[BDEV_WAL] = cct->_conf->bluefs_alloc_size;
552 }
553 if (bdev[BDEV_SLOW]) {
554 alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
555 alloc_size[BDEV_SLOW] = cct->_conf->bluefs_shared_alloc_size;
556 } else {
557 alloc_size[BDEV_DB] = cct->_conf->bluefs_shared_alloc_size;
558 }
559 // new wal and db devices are never shared
560 if (bdev[BDEV_NEWWAL]) {
561 alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
562 }
563 if (bdev[BDEV_NEWDB]) {
564 alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
565 }
566
7c673cae
FG
567 for (unsigned id = 0; id < bdev.size(); ++id) {
568 if (!bdev[id]) {
569 continue;
570 }
11fdf7f2 571 ceph_assert(bdev[id]->get_size());
eafe8130 572 ceph_assert(alloc_size[id]);
f67539c2
TL
573 if (is_shared_alloc(id)) {
574 dout(1) << __func__ << " shared, id " << id << std::hex
575 << ", capacity 0x" << bdev[id]->get_size()
576 << ", block size 0x" << alloc_size[id]
577 << std::dec << dendl;
578 } else {
579 std::string name = "bluefs-";
580 const char* devnames[] = { "wal","db","slow" };
581 if (id <= BDEV_SLOW)
582 name += devnames[id];
583 else
584 name += to_string(uintptr_t(this));
585 dout(1) << __func__ << " new, id " << id << std::hex
586 << ", allocator name " << name
587 << ", allocator type " << cct->_conf->bluefs_allocator
588 << ", capacity 0x" << bdev[id]->get_size()
589 << ", block size 0x" << alloc_size[id]
590 << std::dec << dendl;
591 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
592 bdev[id]->get_size(),
593 alloc_size[id], name);
594 alloc[id]->init_add_free(
595 block_reserved[id],
596 _get_total(id));
7c673cae
FG
597 }
598 }
599}
600
601void BlueFS::_stop_alloc()
602{
603 dout(20) << __func__ << dendl;
11fdf7f2
TL
604 for (auto p : bdev) {
605 if (p)
606 p->discard_drain();
607 }
608
f67539c2
TL
609 for (size_t i = 0; i < alloc.size(); ++i) {
610 if (alloc[i] && !is_shared_alloc(i)) {
611 alloc[i]->shutdown();
612 delete alloc[i];
613 alloc[i] = nullptr;
7c673cae
FG
614 }
615 }
7c673cae
FG
616}
617
cd265ab1
TL
618int BlueFS::read(uint8_t ndev, uint64_t off, uint64_t len,
619 ceph::buffer::list *pbl, IOContext *ioc, bool buffered)
620{
621 dout(10) << __func__ << " dev " << int(ndev)
622 << ": 0x" << std::hex << off << "~" << len << std::dec
623 << (buffered ? " buffered" : "")
624 << dendl;
625 int r;
626 bufferlist bl;
627 r = bdev[ndev]->read(off, len, &bl, ioc, buffered);
628 if (r != 0) {
629 return r;
630 }
631 uint64_t block_size = bdev[ndev]->get_block_size();
632 if (inject_read_zeros) {
633 if (len >= block_size * 2) {
634 derr << __func__ << " injecting error, zeros at "
635 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
636 << "~" << (block_size * 2) << std::dec << dendl;
637 //use beginning, replace 8K in the middle with zeros, use tail
638 bufferlist temp;
639 bl.splice(0, len / 2 - block_size, &temp);
f67539c2 640 temp.append(buffer::create(block_size * 2, 0));
cd265ab1
TL
641 bl.splice(block_size * 2, len / 2 - block_size, &temp);
642 bl = temp;
643 inject_read_zeros--;
644 }
645 }
646 //make a check if there is a block with all 0
647 uint64_t to_check_len = len;
648 uint64_t skip = p2nphase(off, block_size);
649 if (skip >= to_check_len) {
650 return r;
651 }
652 auto it = bl.begin(skip);
653 to_check_len -= skip;
654 bool all_zeros = false;
655 while (all_zeros == false && to_check_len >= block_size) {
656 // checking 0s step
657 unsigned block_left = block_size;
658 unsigned avail;
659 const char* data;
660 all_zeros = true;
661 while (all_zeros && block_left > 0) {
662 avail = it.get_ptr_and_advance(block_left, &data);
663 block_left -= avail;
664 all_zeros = mem_is_zero(data, avail);
665 }
666 // skipping step
667 while (block_left > 0) {
668 avail = it.get_ptr_and_advance(block_left, &data);
669 block_left -= avail;
670 }
671 to_check_len -= block_size;
672 }
673 if (all_zeros) {
674 logger->inc(l_bluefs_read_zeros_candidate, 1);
675 bufferlist bl_reread;
676 r = bdev[ndev]->read(off, len, &bl_reread, ioc, buffered);
677 if (r != 0) {
678 return r;
679 }
680 // check if both read gave the same
681 if (!bl.contents_equal(bl_reread)) {
682 // report problems to log, but continue, maybe it will be good now...
683 derr << __func__ << " initial read of " << int(ndev)
684 << ": 0x" << std::hex << off << "~" << len
685 << std::dec << ": different then re-read " << dendl;
686 logger->inc(l_bluefs_read_zeros_errors, 1);
687 }
688 // use second read will be better if is different
689 pbl->append(bl_reread);
690 } else {
691 pbl->append(bl);
692 }
693 return r;
694}
695
696int BlueFS::read_random(uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered)
697{
698 dout(10) << __func__ << " dev " << int(ndev)
699 << ": 0x" << std::hex << off << "~" << len << std::dec
700 << (buffered ? " buffered" : "")
701 << dendl;
702 int r;
703 r = bdev[ndev]->read_random(off, len, buf, buffered);
704 if (r != 0) {
705 return r;
706 }
707 uint64_t block_size = bdev[ndev]->get_block_size();
708 if (inject_read_zeros) {
709 if (len >= block_size * 2) {
710 derr << __func__ << " injecting error, zeros at "
711 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
712 << "~" << (block_size * 2) << std::dec << dendl;
713 //zero middle 8K
714 memset(buf + len / 2 - block_size, 0, block_size * 2);
715 inject_read_zeros--;
716 }
717 }
718 //make a check if there is a block with all 0
719 uint64_t to_check_len = len;
720 const char* data = buf;
721 uint64_t skip = p2nphase(off, block_size);
722 if (skip >= to_check_len) {
723 return r;
724 }
725 to_check_len -= skip;
726 data += skip;
727
728 bool all_zeros = false;
729 while (all_zeros == false && to_check_len >= block_size) {
730 if (mem_is_zero(data, block_size)) {
731 // at least one block is all zeros
732 all_zeros = true;
733 break;
734 }
735 data += block_size;
736 to_check_len -= block_size;
737 }
738 if (all_zeros) {
739 logger->inc(l_bluefs_read_zeros_candidate, 1);
740 std::unique_ptr<char[]> data_reread(new char[len]);
741 r = bdev[ndev]->read_random(off, len, &data_reread[0], buffered);
742 if (r != 0) {
743 return r;
744 }
745 // check if both read gave the same
746 if (memcmp(buf, &data_reread[0], len) != 0) {
747 derr << __func__ << " initial read of " << int(ndev)
748 << ": 0x" << std::hex << off << "~" << len
749 << std::dec << ": different then re-read " << dendl;
750 logger->inc(l_bluefs_read_zeros_errors, 1);
751 // second read is probably better
752 memcpy(buf, &data_reread[0], len);
753 }
754 }
755 return r;
756}
757
7c673cae
FG
758int BlueFS::mount()
759{
760 dout(1) << __func__ << dendl;
761
762 int r = _open_super();
763 if (r < 0) {
764 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
765 goto out;
766 }
767
9f95a23c
TL
768 // set volume selector if not provided before/outside
769 if (vselector == nullptr) {
770 vselector.reset(
771 new OriginalVolumeSelector(
772 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
773 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
774 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
775 }
776
7c673cae 777 _init_alloc();
494da23a 778 _init_logger();
7c673cae 779
11fdf7f2 780 r = _replay(false, false);
7c673cae
FG
781 if (r < 0) {
782 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
783 _stop_alloc();
784 goto out;
785 }
786
787 // init freelist
788 for (auto& p : file_map) {
789 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
790 for (auto& q : p.second->fnode.extents) {
f67539c2
TL
791 bool is_shared = is_shared_alloc(q.bdev);
792 ceph_assert(!is_shared || (is_shared && shared_alloc));
793 if (is_shared && shared_alloc->need_init && shared_alloc->a) {
794 shared_alloc->bluefs_used += q.length;
795 alloc[q.bdev]->init_rm_free(q.offset, q.length);
796 } else if (!is_shared) {
797 alloc[q.bdev]->init_rm_free(q.offset, q.length);
798 }
7c673cae
FG
799 }
800 }
f67539c2
TL
801 if (shared_alloc) {
802 shared_alloc->need_init = false;
803 dout(1) << __func__ << " shared_bdev_used = "
804 << shared_alloc->bluefs_used << dendl;
805 } else {
806 dout(1) << __func__ << " shared bdev not used"
807 << dendl;
808 }
7c673cae
FG
809
810 // set up the log for future writes
811 log_writer = _create_writer(_get_file(1));
11fdf7f2 812 ceph_assert(log_writer->file->fnode.ino == 1);
7c673cae
FG
813 log_writer->pos = log_writer->file->fnode.size;
814 dout(10) << __func__ << " log write pos set to 0x"
815 << std::hex << log_writer->pos << std::dec
816 << dendl;
817
7c673cae
FG
818 return 0;
819
820 out:
821 super = bluefs_super_t();
822 return r;
823}
824
9f95a23c
TL
825int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const
826{
827 if (super.memorized_layout) {
828 if (layout == *super.memorized_layout) {
829 dout(10) << __func__ << " bluefs layout verified positively" << dendl;
830 } else {
831 derr << __func__ << " memorized layout doesn't fit current one" << dendl;
832 return -EIO;
833 }
834 } else {
835 dout(10) << __func__ << " no memorized_layout in bluefs superblock"
836 << dendl;
837 }
838
839 return 0;
840}
841
1911f103 842void BlueFS::umount(bool avoid_compact)
7c673cae
FG
843{
844 dout(1) << __func__ << dendl;
845
1911f103 846 sync_metadata(avoid_compact);
7c673cae
FG
847
848 _close_writer(log_writer);
849 log_writer = NULL;
850
9f95a23c 851 vselector.reset(nullptr);
7c673cae
FG
852 _stop_alloc();
853 file_map.clear();
854 dir_map.clear();
855 super = bluefs_super_t();
856 log_t.clear();
857 _shutdown_logger();
858}
859
9f95a23c 860int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
7c673cae 861{
11fdf7f2
TL
862 dout(1) << __func__ << dendl;
863
864 if(id == BDEV_NEWDB) {
865 int new_log_dev_cur = BDEV_WAL;
866 int new_log_dev_next = BDEV_WAL;
867 if (!bdev[BDEV_WAL]) {
868 new_log_dev_cur = BDEV_NEWDB;
869 new_log_dev_next = BDEV_DB;
870 }
9f95a23c 871 _rewrite_log_and_layout_sync(false,
11fdf7f2
TL
872 BDEV_NEWDB,
873 new_log_dev_cur,
874 new_log_dev_next,
9f95a23c
TL
875 RENAME_DB2SLOW,
876 layout);
11fdf7f2
TL
877 //}
878 } else if(id == BDEV_NEWWAL) {
9f95a23c
TL
879 _rewrite_log_and_layout_sync(false,
880 BDEV_DB,
881 BDEV_NEWWAL,
882 BDEV_WAL,
883 REMOVE_WAL,
884 layout);
11fdf7f2
TL
885 } else {
886 assert(false);
887 }
888 return 0;
889}
890
891void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
892{
893 if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
7c673cae
FG
894 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
895 if (bdev[BDEV_WAL])
896 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
11fdf7f2
TL
897}
898
899void BlueFS::get_devices(set<string> *ls)
900{
901 for (unsigned i = 0; i < MAX_BDEV; ++i) {
902 if (bdev[i]) {
903 bdev[i]->get_devices(ls);
904 }
905 }
7c673cae
FG
906}
907
908int BlueFS::fsck()
909{
11fdf7f2 910 std::lock_guard l(lock);
7c673cae
FG
911 dout(1) << __func__ << dendl;
912 // hrm, i think we check everything on mount...
913 return 0;
914}
915
11fdf7f2 916int BlueFS::_write_super(int dev)
7c673cae
FG
917{
918 // build superblock
919 bufferlist bl;
11fdf7f2 920 encode(super, bl);
7c673cae 921 uint32_t crc = bl.crc32c(-1);
11fdf7f2 922 encode(crc, bl);
7c673cae
FG
923 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
924 dout(10) << __func__ << " superblock " << super.version << dendl;
925 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
9f95a23c 926 ceph_assert_always(bl.length() <= get_super_length());
7c673cae
FG
927 bl.append_zero(get_super_length() - bl.length());
928
11fdf7f2 929 bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
7c673cae
FG
930 dout(20) << __func__ << " v " << super.version
931 << " crc 0x" << std::hex << crc
932 << " offset 0x" << get_super_offset() << std::dec
933 << dendl;
934 return 0;
935}
936
937int BlueFS::_open_super()
938{
939 dout(10) << __func__ << dendl;
940
941 bufferlist bl;
942 uint32_t expected_crc, crc;
943 int r;
944
945 // always the second block
946 r = bdev[BDEV_DB]->read(get_super_offset(), get_super_length(),
947 &bl, ioc[BDEV_DB], false);
948 if (r < 0)
949 return r;
950
11fdf7f2
TL
951 auto p = bl.cbegin();
952 decode(super, p);
7c673cae
FG
953 {
954 bufferlist t;
955 t.substr_of(bl, 0, p.get_off());
956 crc = t.crc32c(-1);
957 }
11fdf7f2 958 decode(expected_crc, p);
7c673cae
FG
959 if (crc != expected_crc) {
960 derr << __func__ << " bad crc on superblock, expected 0x"
961 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
962 << dendl;
963 return -EIO;
964 }
965 dout(10) << __func__ << " superblock " << super.version << dendl;
966 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
967 return 0;
968}
969
9f95a23c
TL
970int BlueFS::_check_new_allocations(const bluefs_fnode_t& fnode,
971 size_t dev_count,
9f95a23c
TL
972 boost::dynamic_bitset<uint64_t>* used_blocks)
973{
974 auto& fnode_extents = fnode.extents;
975 for (auto e : fnode_extents) {
976 auto id = e.bdev;
977 bool fail = false;
978 ceph_assert(id < dev_count);
9f95a23c
TL
979
980 apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
981 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
982 if (bs.test(pos)) {
983 fail = true;
984 }
985 bs.set(pos);
986 }
987 );
988 if (fail) {
989 derr << __func__ << " invalid extent " << int(e.bdev)
990 << ": 0x" << std::hex << e.offset << "~" << e.length
991 << std::dec << ": duplicate reference, ino " << fnode.ino
992 << dendl;
993 return -EFAULT;
994 }
995 }
996 return 0;
997}
998
9f95a23c
TL
999int BlueFS::_verify_alloc_granularity(
1000 __u8 id, uint64_t offset, uint64_t length, const char *op)
1001{
1002 if ((offset & (alloc_size[id] - 1)) ||
1003 (length & (alloc_size[id] - 1))) {
1004 derr << __func__ << " " << op << " of " << (int)id
1005 << ":0x" << std::hex << offset << "~" << length << std::dec
1006 << " does not align to alloc_size 0x"
1007 << std::hex << alloc_size[id] << std::dec << dendl;
1008 // be helpful
1009 auto need = alloc_size[id];
1010 while (need && ((offset & (need - 1)) ||
1011 (length & (need - 1)))) {
1012 need >>= 1;
1013 }
1014 if (need) {
1015 const char *which;
1016 if (id == BDEV_SLOW ||
1017 (id == BDEV_DB && !bdev[BDEV_SLOW])) {
1018 which = "bluefs_shared_alloc_size";
1019 } else {
1020 which = "bluefs_alloc_size";
1021 }
1022 derr << "work-around by setting " << which << " = " << need
1023 << " for this OSD" << dendl;
1024 }
1025 return -EFAULT;
1026 }
1027 return 0;
1028}
1029
11fdf7f2 1030int BlueFS::_replay(bool noop, bool to_stdout)
7c673cae
FG
1031{
1032 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
1033 ino_last = 1; // by the log
1034 log_seq = 0;
1035
1036 FileRef log_file;
11fdf7f2 1037 log_file = _get_file(1);
9f95a23c 1038
f67539c2 1039 log_file->fnode = super.log_fnode;
11fdf7f2 1040 if (!noop) {
9f95a23c 1041 log_file->vselector_hint =
f6b5b4d7 1042 vselector->get_hint_for_log();
7c673cae 1043 } else {
11fdf7f2
TL
1044 // do not use fnode from superblock in 'noop' mode - log_file's one should
1045 // be fine and up-to-date
1046 ceph_assert(log_file->fnode.ino == 1);
1047 ceph_assert(log_file->fnode.extents.size() != 0);
7c673cae 1048 }
7c673cae 1049 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
11fdf7f2
TL
1050 if (unlikely(to_stdout)) {
1051 std::cout << " log_fnode " << super.log_fnode << std::endl;
1052 }
7c673cae
FG
1053
1054 FileReader *log_reader = new FileReader(
1055 log_file, cct->_conf->bluefs_max_prefetch,
1056 false, // !random
1057 true); // ignore eof
9f95a23c
TL
1058
1059 bool seen_recs = false;
1060
1061 boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
9f95a23c 1062
f67539c2
TL
1063 if (!noop) {
1064 if (cct->_conf->bluefs_log_replay_check_allocations) {
1065 for (size_t i = 0; i < MAX_BDEV; ++i) {
1066 if (alloc_size[i] != 0 && bdev[i] != nullptr) {
1067 used_blocks[i].resize(round_up_to(bdev[i]->get_size(), alloc_size[i]) / alloc_size[i]);
1068 }
9f95a23c
TL
1069 }
1070 }
1071 }
1072
1073 bool first_log_check = true;
1074
7c673cae 1075 while (true) {
11fdf7f2 1076 ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
7c673cae
FG
1077 uint64_t pos = log_reader->buf.pos;
1078 uint64_t read_pos = pos;
1079 bufferlist bl;
1080 {
f67539c2 1081 int r = _read(log_reader, read_pos, super.block_size,
7c673cae 1082 &bl, NULL);
f6b5b4d7
TL
1083 if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) {
1084 r += do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl);
1085 }
1086 assert(r == (int)super.block_size);
7c673cae
FG
1087 read_pos += r;
1088 }
1089 uint64_t more = 0;
1090 uint64_t seq;
1091 uuid_d uuid;
1092 {
11fdf7f2 1093 auto p = bl.cbegin();
7c673cae
FG
1094 __u8 a, b;
1095 uint32_t len;
11fdf7f2
TL
1096 decode(a, p);
1097 decode(b, p);
1098 decode(len, p);
1099 decode(uuid, p);
1100 decode(seq, p);
7c673cae 1101 if (len + 6 > bl.length()) {
11fdf7f2 1102 more = round_up_to(len + 6 - bl.length(), super.block_size);
7c673cae
FG
1103 }
1104 }
1105 if (uuid != super.uuid) {
9f95a23c
TL
1106 if (seen_recs) {
1107 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1108 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1109 << dendl;
1110 } else {
1111 derr << __func__ << " 0x" << std::hex << pos << std::dec
1112 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1113 << ", block dump: \n";
1114 bufferlist t;
1115 t.substr_of(bl, 0, super.block_size);
1116 t.hexdump(*_dout);
1117 *_dout << dendl;
1118 }
7c673cae
FG
1119 break;
1120 }
1121 if (seq != log_seq + 1) {
9f95a23c
TL
1122 if (seen_recs) {
1123 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1124 << ": stop: seq " << seq << " != expected " << log_seq + 1
1125 << dendl;;
1126 } else {
1127 derr << __func__ << " 0x" << std::hex << pos << std::dec
1128 << ": stop: seq " << seq << " != expected " << log_seq + 1
1129 << dendl;;
1130 }
7c673cae
FG
1131 break;
1132 }
1133 if (more) {
1134 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
1135 << " more bytes" << dendl;
1136 bufferlist t;
f67539c2 1137 int r = _read(log_reader, read_pos, more, &t, NULL);
7c673cae 1138 if (r < (int)more) {
f6b5b4d7
TL
1139 dout(10) << __func__ << " 0x" << std::hex << pos
1140 << ": stop: len is 0x" << bl.length() + more << std::dec
1141 << ", which is past eof" << dendl;
1142 if (cct->_conf->bluefs_replay_recovery) {
1143 //try to search for more data
1144 r += do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t);
1145 if (r < (int)more) {
1146 //in normal mode we must read r==more, for recovery it is too strict
1147 break;
1148 }
1149 }
7c673cae 1150 }
11fdf7f2 1151 ceph_assert(r == (int)more);
7c673cae
FG
1152 bl.claim_append(t);
1153 read_pos += r;
1154 }
9f95a23c 1155 seen_recs = true;
7c673cae
FG
1156 bluefs_transaction_t t;
1157 try {
11fdf7f2
TL
1158 auto p = bl.cbegin();
1159 decode(t, p);
7c673cae 1160 }
f67539c2 1161 catch (ceph::buffer::error& e) {
9f95a23c
TL
1162 derr << __func__ << " 0x" << std::hex << pos << std::dec
1163 << ": stop: failed to decode: " << e.what()
1164 << dendl;
7c673cae
FG
1165 delete log_reader;
1166 return -EIO;
1167 }
11fdf7f2 1168 ceph_assert(seq == t.seq);
7c673cae
FG
1169 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1170 << ": " << t << dendl;
11fdf7f2
TL
1171 if (unlikely(to_stdout)) {
1172 std::cout << " 0x" << std::hex << pos << std::dec
1173 << ": " << t << std::endl;
1174 }
7c673cae 1175
11fdf7f2 1176 auto p = t.op_bl.cbegin();
7c673cae
FG
1177 while (!p.end()) {
1178 __u8 op;
11fdf7f2 1179 decode(op, p);
7c673cae
FG
1180 switch (op) {
1181
1182 case bluefs_transaction_t::OP_INIT:
1183 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1184 << ": op_init" << dendl;
11fdf7f2
TL
1185 if (unlikely(to_stdout)) {
1186 std::cout << " 0x" << std::hex << pos << std::dec
1187 << ": op_init" << std::endl;
1188 }
1189
1190 ceph_assert(t.seq == 1);
7c673cae
FG
1191 break;
1192
1193 case bluefs_transaction_t::OP_JUMP:
1194 {
1195 uint64_t next_seq;
1196 uint64_t offset;
11fdf7f2
TL
1197 decode(next_seq, p);
1198 decode(offset, p);
7c673cae
FG
1199 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1200 << ": op_jump seq " << next_seq
1201 << " offset 0x" << std::hex << offset << std::dec << dendl;
11fdf7f2
TL
1202 if (unlikely(to_stdout)) {
1203 std::cout << " 0x" << std::hex << pos << std::dec
1204 << ": op_jump seq " << next_seq
1205 << " offset 0x" << std::hex << offset << std::dec
1206 << std::endl;
1207 }
1208
1209 ceph_assert(next_seq >= log_seq);
7c673cae
FG
1210 log_seq = next_seq - 1; // we will increment it below
1211 uint64_t skip = offset - read_pos;
1212 if (skip) {
1213 bufferlist junk;
f67539c2 1214 int r = _read(log_reader, read_pos, skip, &junk,
7c673cae
FG
1215 NULL);
1216 if (r != (int)skip) {
1217 dout(10) << __func__ << " 0x" << std::hex << read_pos
1218 << ": stop: failed to skip to " << offset
1219 << std::dec << dendl;
11fdf7f2 1220 ceph_abort_msg("problem with op_jump");
7c673cae
FG
1221 }
1222 }
1223 }
1224 break;
1225
1226 case bluefs_transaction_t::OP_JUMP_SEQ:
1227 {
1228 uint64_t next_seq;
11fdf7f2 1229 decode(next_seq, p);
7c673cae
FG
1230 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1231 << ": op_jump_seq " << next_seq << dendl;
11fdf7f2
TL
1232 if (unlikely(to_stdout)) {
1233 std::cout << " 0x" << std::hex << pos << std::dec
1234 << ": op_jump_seq " << next_seq << std::endl;
1235 }
1236
1237 ceph_assert(next_seq >= log_seq);
7c673cae
FG
1238 log_seq = next_seq - 1; // we will increment it below
1239 }
1240 break;
1241
1242 case bluefs_transaction_t::OP_ALLOC_ADD:
f67539c2 1243 // LEGACY, do nothing but read params
7c673cae 1244 {
f67539c2
TL
1245 __u8 id;
1246 uint64_t offset, length;
1247 decode(id, p);
1248 decode(offset, p);
1249 decode(length, p);
1250 }
7c673cae
FG
1251 break;
1252
1253 case bluefs_transaction_t::OP_ALLOC_RM:
f67539c2 1254 // LEGACY, do nothing but read params
7c673cae 1255 {
f67539c2
TL
1256 __u8 id;
1257 uint64_t offset, length;
1258 decode(id, p);
1259 decode(offset, p);
1260 decode(length, p);
1261 }
1262 break;
7c673cae
FG
1263
1264 case bluefs_transaction_t::OP_DIR_LINK:
1265 {
1266 string dirname, filename;
1267 uint64_t ino;
11fdf7f2
TL
1268 decode(dirname, p);
1269 decode(filename, p);
1270 decode(ino, p);
7c673cae
FG
1271 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1272 << ": op_dir_link " << " " << dirname << "/" << filename
1273 << " to " << ino
1274 << dendl;
11fdf7f2
TL
1275 if (unlikely(to_stdout)) {
1276 std::cout << " 0x" << std::hex << pos << std::dec
1277 << ": op_dir_link " << " " << dirname << "/" << filename
1278 << " to " << ino
1279 << std::endl;
1280 }
1281
7c673cae
FG
1282 if (!noop) {
1283 FileRef file = _get_file(ino);
11fdf7f2 1284 ceph_assert(file->fnode.ino);
7c673cae 1285 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 1286 ceph_assert(q != dir_map.end());
7c673cae 1287 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2 1288 ceph_assert(r == q->second->file_map.end());
9f95a23c
TL
1289
1290 vselector->sub_usage(file->vselector_hint, file->fnode);
1291 file->vselector_hint =
1292 vselector->get_hint_by_dir(dirname);
1293 vselector->add_usage(file->vselector_hint, file->fnode);
1294
7c673cae
FG
1295 q->second->file_map[filename] = file;
1296 ++file->refs;
1297 }
1298 }
1299 break;
1300
1301 case bluefs_transaction_t::OP_DIR_UNLINK:
1302 {
1303 string dirname, filename;
11fdf7f2
TL
1304 decode(dirname, p);
1305 decode(filename, p);
7c673cae
FG
1306 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1307 << ": op_dir_unlink " << " " << dirname << "/" << filename
1308 << dendl;
11fdf7f2
TL
1309 if (unlikely(to_stdout)) {
1310 std::cout << " 0x" << std::hex << pos << std::dec
1311 << ": op_dir_unlink " << " " << dirname << "/" << filename
1312 << std::endl;
1313 }
1314
7c673cae
FG
1315 if (!noop) {
1316 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 1317 ceph_assert(q != dir_map.end());
7c673cae 1318 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2
TL
1319 ceph_assert(r != q->second->file_map.end());
1320 ceph_assert(r->second->refs > 0);
7c673cae
FG
1321 --r->second->refs;
1322 q->second->file_map.erase(r);
1323 }
1324 }
1325 break;
1326
1327 case bluefs_transaction_t::OP_DIR_CREATE:
1328 {
1329 string dirname;
11fdf7f2 1330 decode(dirname, p);
7c673cae
FG
1331 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1332 << ": op_dir_create " << dirname << dendl;
11fdf7f2
TL
1333 if (unlikely(to_stdout)) {
1334 std::cout << " 0x" << std::hex << pos << std::dec
1335 << ": op_dir_create " << dirname << std::endl;
1336 }
1337
7c673cae
FG
1338 if (!noop) {
1339 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2 1340 ceph_assert(q == dir_map.end());
9f95a23c 1341 dir_map[dirname] = ceph::make_ref<Dir>();
7c673cae
FG
1342 }
1343 }
1344 break;
1345
1346 case bluefs_transaction_t::OP_DIR_REMOVE:
1347 {
1348 string dirname;
11fdf7f2 1349 decode(dirname, p);
7c673cae
FG
1350 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1351 << ": op_dir_remove " << dirname << dendl;
11fdf7f2
TL
1352 if (unlikely(to_stdout)) {
1353 std::cout << " 0x" << std::hex << pos << std::dec
1354 << ": op_dir_remove " << dirname << std::endl;
1355 }
1356
7c673cae
FG
1357 if (!noop) {
1358 map<string,DirRef>::iterator q = dir_map.find(dirname);
11fdf7f2
TL
1359 ceph_assert(q != dir_map.end());
1360 ceph_assert(q->second->file_map.empty());
7c673cae
FG
1361 dir_map.erase(q);
1362 }
1363 }
1364 break;
1365
1366 case bluefs_transaction_t::OP_FILE_UPDATE:
1367 {
1368 bluefs_fnode_t fnode;
11fdf7f2 1369 decode(fnode, p);
7c673cae 1370 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
9f95a23c 1371 << ": op_file_update " << " " << fnode << " " << dendl;
11fdf7f2
TL
1372 if (unlikely(to_stdout)) {
1373 std::cout << " 0x" << std::hex << pos << std::dec
1374 << ": op_file_update " << " " << fnode << std::endl;
1375 }
9f95a23c 1376 if (!noop) {
7c673cae 1377 FileRef f = _get_file(fnode.ino);
9f95a23c
TL
1378 if (cct->_conf->bluefs_log_replay_check_allocations) {
1379 // check initial log layout
1380 if (first_log_check) {
1381 first_log_check = false;
1382 int r = _check_new_allocations(log_file->fnode,
f67539c2 1383 MAX_BDEV, used_blocks);
9f95a23c
TL
1384 if (r < 0) {
1385 return r;
1386 }
1387 }
1388
1389 auto& fnode_extents = f->fnode.extents;
1390 for (auto e : fnode_extents) {
1391 auto id = e.bdev;
1392 if (int r = _verify_alloc_granularity(id, e.offset, e.length,
1393 "OP_FILE_UPDATE"); r < 0) {
1394 return r;
1395 }
1396 apply_for_bitset_range(e.offset, e.length, alloc_size[id],
1397 used_blocks[id],
1398 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1399 ceph_assert(bs.test(pos));
1400 bs.reset(pos);
1401 }
1402 );
1403 }
1404 }
1405
1406 if (fnode.ino != 1) {
1407 vselector->sub_usage(f->vselector_hint, f->fnode);
1408 }
1409 f->fnode = fnode;
1410 if (fnode.ino != 1) {
1411 vselector->add_usage(f->vselector_hint, f->fnode);
1412 }
1413
7c673cae
FG
1414 if (fnode.ino > ino_last) {
1415 ino_last = fnode.ino;
1416 }
9f95a23c
TL
1417 if (cct->_conf->bluefs_log_replay_check_allocations) {
1418 int r = _check_new_allocations(f->fnode,
f67539c2 1419 MAX_BDEV, used_blocks);
9f95a23c
TL
1420 if (r < 0) {
1421 return r;
1422 }
1423 }
7c673cae 1424 }
9f95a23c 1425 }
7c673cae
FG
1426 break;
1427
1428 case bluefs_transaction_t::OP_FILE_REMOVE:
1429 {
1430 uint64_t ino;
11fdf7f2 1431 decode(ino, p);
7c673cae
FG
1432 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1433 << ": op_file_remove " << ino << dendl;
11fdf7f2
TL
1434 if (unlikely(to_stdout)) {
1435 std::cout << " 0x" << std::hex << pos << std::dec
1436 << ": op_file_remove " << ino << std::endl;
1437 }
1438
9f95a23c
TL
1439 if (!noop) {
1440 auto p = file_map.find(ino);
1441 ceph_assert(p != file_map.end());
1442 vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
1443 if (cct->_conf->bluefs_log_replay_check_allocations) {
1444 auto& fnode_extents = p->second->fnode.extents;
1445 for (auto e : fnode_extents) {
1446 auto id = e.bdev;
1447 bool fail = false;
9f95a23c
TL
1448
1449 apply_for_bitset_range(e.offset, e.length, alloc_size[id], used_blocks[id],
1450 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
1451 if (!bs.test(pos)) {
1452 fail = true;
1453 }
1454 bs.reset(pos);
1455 }
1456 );
1457 if (fail) {
1458 derr << __func__ << " invalid extent " << int(id)
1459 << ": 0x" << std::hex << e.offset << "~" << e.length
1460 << std::dec
1461 << ": not in use but is allocated for removed ino " << ino
1462 << dendl;
1463 return -EFAULT;
1464 }
1465 }
1466 }
1467 file_map.erase(p);
1468 }
1469 }
7c673cae
FG
1470 break;
1471
1472 default:
1473 derr << __func__ << " 0x" << std::hex << pos << std::dec
1474 << ": stop: unrecognized op " << (int)op << dendl;
1475 delete log_reader;
1476 return -EIO;
1477 }
1478 }
11fdf7f2 1479 ceph_assert(p.end());
7c673cae
FG
1480
1481 // we successfully replayed the transaction; bump the seq and log size
1482 ++log_seq;
1483 log_file->fnode.size = log_reader->buf.pos;
1484 }
f67539c2
TL
1485 if (!noop) {
1486 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
1487 }
9f95a23c
TL
1488 if (!noop && first_log_check &&
1489 cct->_conf->bluefs_log_replay_check_allocations) {
1490 int r = _check_new_allocations(log_file->fnode,
f67539c2 1491 MAX_BDEV, used_blocks);
9f95a23c
TL
1492 if (r < 0) {
1493 return r;
1494 }
1495 }
7c673cae
FG
1496
1497 dout(10) << __func__ << " log file size was 0x"
1498 << std::hex << log_file->fnode.size << std::dec << dendl;
11fdf7f2
TL
1499 if (unlikely(to_stdout)) {
1500 std::cout << " log file size was 0x"
1501 << std::hex << log_file->fnode.size << std::dec << std::endl;
1502 }
1503
7c673cae
FG
1504 delete log_reader;
1505
1506 if (!noop) {
1507 // verify file link counts are all >0
1508 for (auto& p : file_map) {
1509 if (p.second->refs == 0 &&
1510 p.second->fnode.ino > 1) {
1511 derr << __func__ << " file with link count 0: " << p.second->fnode
1512 << dendl;
1513 return -EIO;
1514 }
1515 }
1516 }
1517
1518 dout(10) << __func__ << " done" << dendl;
1519 return 0;
1520}
1521
11fdf7f2
TL
1522int BlueFS::log_dump()
1523{
1524 // only dump log file's content
f67539c2
TL
1525 ceph_assert(log_writer == nullptr && "cannot log_dump on mounted BlueFS");
1526 int r = _open_super();
11fdf7f2 1527 if (r < 0) {
f67539c2 1528 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
11fdf7f2
TL
1529 return r;
1530 }
f67539c2
TL
1531 _init_logger();
1532 r = _replay(true, true);
1533 if (r < 0) {
1534 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
1535 }
1536 _shutdown_logger();
1537 super = bluefs_super_t();
1538 return r;
11fdf7f2
TL
1539}
1540
1541int BlueFS::device_migrate_to_existing(
1542 CephContext *cct,
1543 const set<int>& devs_source,
9f95a23c
TL
1544 int dev_target,
1545 const bluefs_layout_t& layout)
11fdf7f2
TL
1546{
1547 vector<byte> buf;
1548 bool buffered = cct->_conf->bluefs_buffered_io;
1549
eafe8130
TL
1550 dout(10) << __func__ << " devs_source " << devs_source
1551 << " dev_target " << dev_target << dendl;
11fdf7f2
TL
1552 assert(dev_target < (int)MAX_BDEV);
1553
1554 int flags = 0;
1555 flags |= devs_source.count(BDEV_DB) ?
1556 (REMOVE_DB | RENAME_SLOW2DB) : 0;
1557 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1558 int dev_target_new = dev_target;
1559
1560 // Slow device without separate DB one is addressed via BDEV_DB
1561 // Hence need renaming.
1562 if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
1563 dev_target_new = BDEV_DB;
1564 dout(0) << __func__ << " super to be written to " << dev_target << dendl;
1565 }
1566
9f95a23c 1567 for (auto& [ino, file_ref] : file_map) {
11fdf7f2 1568 //do not copy log
9f95a23c 1569 if (file_ref->fnode.ino == 1) {
11fdf7f2
TL
1570 continue;
1571 }
9f95a23c 1572 dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
eafe8130 1573
9f95a23c 1574 auto& fnode_extents = file_ref->fnode.extents;
11fdf7f2 1575
9f95a23c
TL
1576 bool rewrite = std::any_of(
1577 fnode_extents.begin(),
1578 fnode_extents.end(),
1579 [=](auto& ext) {
1580 return ext.bdev != dev_target && devs_source.count(ext.bdev);
1581 });
eafe8130
TL
1582 if (rewrite) {
1583 dout(10) << __func__ << " migrating" << dendl;
1584
1585 // read entire file
1586 bufferlist bl;
1587 for (auto old_ext : fnode_extents) {
1588 buf.resize(old_ext.length);
1589 int r = bdev[old_ext.bdev]->read_random(
1590 old_ext.offset,
1591 old_ext.length,
1592 (char*)&buf.at(0),
1593 buffered);
1594 if (r != 0) {
1595 derr << __func__ << " failed to read 0x" << std::hex
1596 << old_ext.offset << "~" << old_ext.length << std::dec
1597 << " from " << (int)dev_target << dendl;
1598 return -EIO;
1599 }
1600 bl.append((char*)&buf[0], old_ext.length);
1601 }
11fdf7f2 1602
eafe8130
TL
1603 // write entire file
1604 PExtentVector extents;
1605 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1606 if (l < 0) {
1607 derr << __func__ << " unable to allocate len 0x" << std::hex
1608 << bl.length() << std::dec << " from " << (int)dev_target
1609 << ": " << cpp_strerror(l) << dendl;
1610 return -ENOSPC;
1611 }
11fdf7f2 1612
eafe8130
TL
1613 uint64_t off = 0;
1614 for (auto& i : extents) {
1615 bufferlist cur;
1616 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1617 ceph_assert(cur_len > 0);
1618 cur.substr_of(bl, off, cur_len);
1619 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1620 ceph_assert(r == 0);
1621 off += cur_len;
1622 }
1623
1624 // release old extents
1625 for (auto old_ext : fnode_extents) {
1626 PExtentVector to_release;
1627 to_release.emplace_back(old_ext.offset, old_ext.length);
1628 alloc[old_ext.bdev]->release(to_release);
f67539c2
TL
1629 if (is_shared_alloc(old_ext.bdev)) {
1630 shared_alloc->bluefs_used -= to_release.size();
1631 }
eafe8130
TL
1632 }
1633
1634 // update fnode
1635 fnode_extents.clear();
1636 for (auto& i : extents) {
1637 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
1638 }
1639 } else {
9f95a23c
TL
1640 for (auto& ext : fnode_extents) {
1641 if (dev_target != dev_target_new && ext.bdev == dev_target) {
eafe8130 1642 dout(20) << __func__ << " " << " ... adjusting extent 0x"
9f95a23c 1643 << std::hex << ext.offset << std::dec
eafe8130
TL
1644 << " bdev " << dev_target << " -> " << dev_target_new
1645 << dendl;
9f95a23c 1646 ext.bdev = dev_target_new;
11fdf7f2 1647 }
11fdf7f2
TL
1648 }
1649 }
11fdf7f2
TL
1650 }
1651 // new logging device in the current naming scheme
1652 int new_log_dev_cur = bdev[BDEV_WAL] ?
1653 BDEV_WAL :
1654 bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
1655
1656 // new logging device in new naming scheme
1657 int new_log_dev_next = new_log_dev_cur;
1658
1659 if (devs_source.count(new_log_dev_cur)) {
1660 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1661 new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
1662 BDEV_DB :
1663 BDEV_WAL;
1664
1665 dout(0) << __func__ << " log moved from " << new_log_dev_cur
1666 << " to " << new_log_dev_next << dendl;
1667
1668 new_log_dev_cur =
1669 (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
1670 BDEV_SLOW :
1671 new_log_dev_next;
1672 }
1673
9f95a23c 1674 _rewrite_log_and_layout_sync(
11fdf7f2
TL
1675 false,
1676 (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
1677 new_log_dev_cur,
1678 new_log_dev_next,
9f95a23c
TL
1679 flags,
1680 layout);
11fdf7f2
TL
1681 return 0;
1682}
1683
1684int BlueFS::device_migrate_to_new(
1685 CephContext *cct,
1686 const set<int>& devs_source,
9f95a23c
TL
1687 int dev_target,
1688 const bluefs_layout_t& layout)
11fdf7f2
TL
1689{
1690 vector<byte> buf;
1691 bool buffered = cct->_conf->bluefs_buffered_io;
1692
eafe8130
TL
1693 dout(10) << __func__ << " devs_source " << devs_source
1694 << " dev_target " << dev_target << dendl;
11fdf7f2
TL
1695 assert(dev_target == (int)BDEV_NEWDB || (int)BDEV_NEWWAL);
1696
1697 int flags = 0;
1698
1699 flags |= devs_source.count(BDEV_DB) ?
1700 (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
1701 0;
1702 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
9f95a23c 1703 int dev_target_new = dev_target; //FIXME: remove, makes no sense
11fdf7f2
TL
1704
1705 for (auto& p : file_map) {
1706 //do not copy log
1707 if (p.second->fnode.ino == 1) {
1708 continue;
1709 }
eafe8130
TL
1710 dout(10) << __func__ << " " << p.first << " " << p.second->fnode << dendl;
1711
11fdf7f2
TL
1712 auto& fnode_extents = p.second->fnode.extents;
1713
eafe8130 1714 bool rewrite = false;
11fdf7f2 1715 for (auto ext_it = fnode_extents.begin();
eafe8130
TL
1716 ext_it != p.second->fnode.extents.end();
1717 ++ext_it) {
11fdf7f2 1718 if (ext_it->bdev != dev_target && devs_source.count(ext_it->bdev)) {
eafe8130
TL
1719 rewrite = true;
1720 break;
1721 }
1722 }
1723 if (rewrite) {
1724 dout(10) << __func__ << " migrating" << dendl;
1725
1726 // read entire file
1727 bufferlist bl;
1728 for (auto old_ext : fnode_extents) {
1729 buf.resize(old_ext.length);
1730 int r = bdev[old_ext.bdev]->read_random(
1731 old_ext.offset,
1732 old_ext.length,
1733 (char*)&buf.at(0),
1734 buffered);
1735 if (r != 0) {
1736 derr << __func__ << " failed to read 0x" << std::hex
1737 << old_ext.offset << "~" << old_ext.length << std::dec
1738 << " from " << (int)dev_target << dendl;
1739 return -EIO;
11fdf7f2 1740 }
eafe8130
TL
1741 bl.append((char*)&buf[0], old_ext.length);
1742 }
1743
1744 // write entire file
1745 PExtentVector extents;
1746 auto l = _allocate_without_fallback(dev_target, bl.length(), &extents);
1747 if (l < 0) {
1748 derr << __func__ << " unable to allocate len 0x" << std::hex
1749 << bl.length() << std::dec << " from " << (int)dev_target
1750 << ": " << cpp_strerror(l) << dendl;
1751 return -ENOSPC;
1752 }
1753
1754 uint64_t off = 0;
1755 for (auto& i : extents) {
1756 bufferlist cur;
1757 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1758 ceph_assert(cur_len > 0);
1759 cur.substr_of(bl, off, cur_len);
1760 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1761 ceph_assert(r == 0);
1762 off += cur_len;
1763 }
1764
1765 // release old extents
1766 for (auto old_ext : fnode_extents) {
1767 PExtentVector to_release;
1768 to_release.emplace_back(old_ext.offset, old_ext.length);
1769 alloc[old_ext.bdev]->release(to_release);
f67539c2
TL
1770 if (is_shared_alloc(old_ext.bdev)) {
1771 shared_alloc->bluefs_used -= to_release.size();
1772 }
eafe8130
TL
1773 }
1774
1775 // update fnode
1776 fnode_extents.clear();
1777 for (auto& i : extents) {
1778 fnode_extents.emplace_back(dev_target_new, i.offset, i.length);
11fdf7f2
TL
1779 }
1780 }
11fdf7f2
TL
1781 }
1782 // new logging device in the current naming scheme
1783 int new_log_dev_cur =
1784 bdev[BDEV_NEWWAL] ?
1785 BDEV_NEWWAL :
1786 bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
1787 BDEV_WAL :
1788 bdev[BDEV_NEWDB] ?
1789 BDEV_NEWDB :
1790 bdev[BDEV_DB] && !(flags & REMOVE_DB)?
1791 BDEV_DB :
1792 BDEV_SLOW;
1793
1794 // new logging device in new naming scheme
1795 int new_log_dev_next =
1796 new_log_dev_cur == BDEV_NEWWAL ?
1797 BDEV_WAL :
1798 new_log_dev_cur == BDEV_NEWDB ?
1799 BDEV_DB :
1800 new_log_dev_cur;
1801
1802 int super_dev =
1803 dev_target == BDEV_NEWDB ?
1804 BDEV_NEWDB :
1805 bdev[BDEV_DB] ?
1806 BDEV_DB :
1807 BDEV_SLOW;
1808
9f95a23c 1809 _rewrite_log_and_layout_sync(
11fdf7f2
TL
1810 false,
1811 super_dev,
1812 new_log_dev_cur,
1813 new_log_dev_next,
9f95a23c
TL
1814 flags,
1815 layout);
11fdf7f2
TL
1816 return 0;
1817}
1818
7c673cae
FG
1819BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
1820{
1821 auto p = file_map.find(ino);
1822 if (p == file_map.end()) {
9f95a23c 1823 FileRef f = ceph::make_ref<File>();
7c673cae
FG
1824 file_map[ino] = f;
1825 dout(30) << __func__ << " ino " << ino << " = " << f
1826 << " (new)" << dendl;
1827 return f;
1828 } else {
1829 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
1830 return p->second;
1831 }
1832}
1833
1834void BlueFS::_drop_link(FileRef file)
1835{
1836 dout(20) << __func__ << " had refs " << file->refs
1837 << " on " << file->fnode << dendl;
11fdf7f2 1838 ceph_assert(file->refs > 0);
7c673cae
FG
1839 --file->refs;
1840 if (file->refs == 0) {
1841 dout(20) << __func__ << " destroying " << file->fnode << dendl;
11fdf7f2 1842 ceph_assert(file->num_reading.load() == 0);
9f95a23c 1843 vselector->sub_usage(file->vselector_hint, file->fnode);
7c673cae
FG
1844 log_t.op_file_remove(file->fnode.ino);
1845 for (auto& r : file->fnode.extents) {
1846 pending_release[r.bdev].insert(r.offset, r.length);
1847 }
1848 file_map.erase(file->fnode.ino);
1849 file->deleted = true;
94b18763 1850
7c673cae 1851 if (file->dirty_seq) {
11fdf7f2
TL
1852 ceph_assert(file->dirty_seq > log_seq_stable);
1853 ceph_assert(dirty_files.count(file->dirty_seq));
7c673cae
FG
1854 auto it = dirty_files[file->dirty_seq].iterator_to(*file);
1855 dirty_files[file->dirty_seq].erase(it);
1856 file->dirty_seq = 0;
1857 }
1858 }
1859}
1860
adb31ebb 1861int64_t BlueFS::_read_random(
7c673cae
FG
1862 FileReader *h, ///< [in] read from here
1863 uint64_t off, ///< [in] offset
9f95a23c 1864 uint64_t len, ///< [in] this many bytes
f67539c2 1865 char *out) ///< [out] copy it here
7c673cae 1866{
494da23a
TL
1867 auto* buf = &h->buf;
1868
adb31ebb 1869 int64_t ret = 0;
7c673cae
FG
1870 dout(10) << __func__ << " h " << h
1871 << " 0x" << std::hex << off << "~" << len << std::dec
1872 << " from " << h->file->fnode << dendl;
1873
1874 ++h->file->num_reading;
1875
1876 if (!h->ignore_eof &&
1877 off + len > h->file->fnode.size) {
1878 if (off > h->file->fnode.size)
1879 len = 0;
1880 else
1881 len = h->file->fnode.size - off;
1882 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
1883 << std::hex << len << std::dec << dendl;
1884 }
494da23a
TL
1885 logger->inc(l_bluefs_read_random_count, 1);
1886 logger->inc(l_bluefs_read_random_bytes, len);
7c673cae 1887
494da23a 1888 std::shared_lock s_lock(h->lock);
f91f0fd5 1889 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
7c673cae 1890 while (len > 0) {
494da23a
TL
1891 if (off < buf->bl_off || off >= buf->get_buf_end()) {
1892 s_lock.unlock();
1893 uint64_t x_off = 0;
1894 auto p = h->file->fnode.seek(off, &x_off);
f6b5b4d7 1895 ceph_assert(p != h->file->fnode.extents.end());
9f95a23c 1896 uint64_t l = std::min(p->length - x_off, len);
adb31ebb
TL
1897 //hard cap to 1GB
1898 l = std::min(l, uint64_t(1) << 30);
494da23a
TL
1899 dout(20) << __func__ << " read random 0x"
1900 << std::hex << x_off << "~" << l << std::dec
1901 << " of " << *p << dendl;
cd265ab1
TL
1902 int r;
1903 if (!cct->_conf->bluefs_check_for_zeros) {
1904 r = bdev[p->bdev]->read_random(p->offset + x_off, l, out,
1905 cct->_conf->bluefs_buffered_io);
1906 } else {
1907 r = read_random(p->bdev, p->offset + x_off, l, out,
1908 cct->_conf->bluefs_buffered_io);
1909 }
494da23a
TL
1910 ceph_assert(r == 0);
1911 off += l;
1912 len -= l;
1913 ret += l;
1914 out += l;
1915
1916 logger->inc(l_bluefs_read_random_disk_count, 1);
1917 logger->inc(l_bluefs_read_random_disk_bytes, l);
1918 if (len > 0) {
1919 s_lock.lock();
1920 }
1921 } else {
1922 auto left = buf->get_buf_remaining(off);
adb31ebb 1923 int64_t r = std::min(len, left);
494da23a
TL
1924 logger->inc(l_bluefs_read_random_buffer_count, 1);
1925 logger->inc(l_bluefs_read_random_buffer_bytes, r);
1926 dout(20) << __func__ << " left 0x" << std::hex << left
1927 << " 0x" << off << "~" << len << std::dec
1928 << dendl;
1929
f67539c2
TL
1930 auto p = buf->bl.begin();
1931 p.seek(off - buf->bl_off);
1932 p.copy(r, out);
1933 out += r;
7c673cae 1934
494da23a
TL
1935 dout(30) << __func__ << " result chunk (0x"
1936 << std::hex << r << std::dec << " bytes):\n";
1937 bufferlist t;
1938 t.substr_of(buf->bl, off - buf->bl_off, r);
1939 t.hexdump(*_dout);
1940 *_dout << dendl;
1941
1942 off += r;
1943 len -= r;
1944 ret += r;
1945 buf->pos += r;
1946 }
1947 }
7c673cae
FG
1948 dout(20) << __func__ << " got " << ret << dendl;
1949 --h->file->num_reading;
1950 return ret;
1951}
1952
adb31ebb 1953int64_t BlueFS::_read(
7c673cae 1954 FileReader *h, ///< [in] read from here
7c673cae
FG
1955 uint64_t off, ///< [in] offset
1956 size_t len, ///< [in] this many bytes
1957 bufferlist *outbl, ///< [out] optional: reference the result here
1958 char *out) ///< [out] optional: or copy it here
1959{
f67539c2
TL
1960 FileReaderBuffer *buf = &(h->buf);
1961
494da23a 1962 bool prefetch = !outbl && !out;
7c673cae
FG
1963 dout(10) << __func__ << " h " << h
1964 << " 0x" << std::hex << off << "~" << len << std::dec
494da23a
TL
1965 << " from " << h->file->fnode
1966 << (prefetch ? " prefetch" : "")
1967 << dendl;
7c673cae
FG
1968
1969 ++h->file->num_reading;
1970
1971 if (!h->ignore_eof &&
1972 off + len > h->file->fnode.size) {
1973 if (off > h->file->fnode.size)
1974 len = 0;
1975 else
1976 len = h->file->fnode.size - off;
1977 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
1978 << std::hex << len << std::dec << dendl;
1979 }
494da23a
TL
1980 logger->inc(l_bluefs_read_count, 1);
1981 logger->inc(l_bluefs_read_bytes, len);
1982 if (prefetch) {
1983 logger->inc(l_bluefs_read_prefetch_count, 1);
1984 logger->inc(l_bluefs_read_prefetch_bytes, len);
1985 }
1986
7c673cae
FG
1987 if (outbl)
1988 outbl->clear();
1989
adb31ebb 1990 int64_t ret = 0;
494da23a 1991 std::shared_lock s_lock(h->lock);
7c673cae
FG
1992 while (len > 0) {
1993 size_t left;
1994 if (off < buf->bl_off || off >= buf->get_buf_end()) {
494da23a
TL
1995 s_lock.unlock();
1996 std::unique_lock u_lock(h->lock);
f91f0fd5 1997 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
494da23a
TL
1998 if (off < buf->bl_off || off >= buf->get_buf_end()) {
1999 // if precondition hasn't changed during locking upgrade.
2000 buf->bl.clear();
2001 buf->bl_off = off & super.block_mask();
2002 uint64_t x_off = 0;
2003 auto p = h->file->fnode.seek(buf->bl_off, &x_off);
f6b5b4d7
TL
2004 if (p == h->file->fnode.extents.end()) {
2005 dout(5) << __func__ << " reading less then required "
2006 << ret << "<" << ret + len << dendl;
2007 break;
2008 }
2009
494da23a
TL
2010 uint64_t want = round_up_to(len + (off & ~super.block_mask()),
2011 super.block_size);
2012 want = std::max(want, buf->max_prefetch);
2013 uint64_t l = std::min(p->length - x_off, want);
adb31ebb
TL
2014 //hard cap to 1GB
2015 l = std::min(l, uint64_t(1) << 30);
494da23a
TL
2016 uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
2017 if (!h->ignore_eof &&
2018 buf->bl_off + l > eof_offset) {
2019 l = eof_offset - buf->bl_off;
2020 }
2021 dout(20) << __func__ << " fetching 0x"
2022 << std::hex << x_off << "~" << l << std::dec
2023 << " of " << *p << dendl;
cd265ab1
TL
2024 int r;
2025 if (!cct->_conf->bluefs_check_for_zeros) {
2026 r = bdev[p->bdev]->read(p->offset + x_off, l, &buf->bl, ioc[p->bdev],
2027 cct->_conf->bluefs_buffered_io);
2028 } else {
2029 r = read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
2030 cct->_conf->bluefs_buffered_io);
2031 }
494da23a 2032 ceph_assert(r == 0);
7c673cae 2033 }
494da23a
TL
2034 u_lock.unlock();
2035 s_lock.lock();
2036 // we should recheck if buffer is valid after lock downgrade
2037 continue;
7c673cae
FG
2038 }
2039 left = buf->get_buf_remaining(off);
2040 dout(20) << __func__ << " left 0x" << std::hex << left
2041 << " len 0x" << len << std::dec << dendl;
2042
adb31ebb 2043 int64_t r = std::min(len, left);
7c673cae
FG
2044 if (outbl) {
2045 bufferlist t;
2046 t.substr_of(buf->bl, off - buf->bl_off, r);
2047 outbl->claim_append(t);
2048 }
2049 if (out) {
f67539c2
TL
2050 auto p = buf->bl.begin();
2051 p.seek(off - buf->bl_off);
2052 p.copy(r, out);
7c673cae
FG
2053 out += r;
2054 }
2055
2056 dout(30) << __func__ << " result chunk (0x"
2057 << std::hex << r << std::dec << " bytes):\n";
2058 bufferlist t;
2059 t.substr_of(buf->bl, off - buf->bl_off, r);
2060 t.hexdump(*_dout);
2061 *_dout << dendl;
2062
2063 off += r;
2064 len -= r;
2065 ret += r;
2066 buf->pos += r;
2067 }
f67539c2 2068
7c673cae 2069 dout(20) << __func__ << " got " << ret << dendl;
11fdf7f2 2070 ceph_assert(!outbl || (int)outbl->length() == ret);
7c673cae
FG
2071 --h->file->num_reading;
2072 return ret;
2073}
2074
2075void BlueFS::_invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
2076{
2077 dout(10) << __func__ << " file " << f->fnode
2078 << " 0x" << std::hex << offset << "~" << length << std::dec
2079 << dendl;
2080 if (offset & ~super.block_mask()) {
2081 offset &= super.block_mask();
11fdf7f2 2082 length = round_up_to(length, super.block_size);
7c673cae
FG
2083 }
2084 uint64_t x_off = 0;
2085 auto p = f->fnode.seek(offset, &x_off);
2086 while (length > 0 && p != f->fnode.extents.end()) {
11fdf7f2 2087 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
2088 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
2089 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
2090 << std:: dec << " of " << *p << dendl;
2091 offset += x_len;
2092 length -= x_len;
2093 }
2094}
2095
2096uint64_t BlueFS::_estimate_log_size()
2097{
2098 int avg_dir_size = 40; // fixme
2099 int avg_file_size = 12;
2100 uint64_t size = 4096 * 2;
2101 size += file_map.size() * (1 + sizeof(bluefs_fnode_t));
7c673cae
FG
2102 size += dir_map.size() + (1 + avg_dir_size);
2103 size += file_map.size() * (1 + avg_dir_size + avg_file_size);
11fdf7f2 2104 return round_up_to(size, super.block_size);
7c673cae
FG
2105}
2106
2107void BlueFS::compact_log()
2108{
f6b5b4d7
TL
2109 std::unique_lock<ceph::mutex> l(lock);
2110 if (!cct->_conf->bluefs_replay_recovery_disable_compact) {
2111 if (cct->_conf->bluefs_compact_log_sync) {
2112 _compact_log_sync();
2113 } else {
2114 _compact_log_async(l);
2115 }
7c673cae
FG
2116 }
2117}
2118
2119bool BlueFS::_should_compact_log()
2120{
2121 uint64_t current = log_writer->file->fnode.size;
2122 uint64_t expected = _estimate_log_size();
2123 float ratio = (float)current / (float)expected;
2124 dout(10) << __func__ << " current 0x" << std::hex << current
2125 << " expected " << expected << std::dec
2126 << " ratio " << ratio
2127 << (new_log ? " (async compaction in progress)" : "")
2128 << dendl;
2129 if (new_log ||
2130 current < cct->_conf->bluefs_log_compact_min_size ||
2131 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
2132 return false;
2133 }
2134 return true;
2135}
2136
11fdf7f2
TL
2137void BlueFS::_compact_log_dump_metadata(bluefs_transaction_t *t,
2138 int flags)
7c673cae
FG
2139{
2140 t->seq = 1;
2141 t->uuid = super.uuid;
2142 dout(20) << __func__ << " op_init" << dendl;
2143
2144 t->op_init();
9f95a23c
TL
2145 for (auto& [ino, file_ref] : file_map) {
2146 if (ino == 1)
7c673cae 2147 continue;
9f95a23c 2148 ceph_assert(ino > 1);
11fdf7f2 2149
9f95a23c 2150 for(auto& e : file_ref->fnode.extents) {
11fdf7f2
TL
2151 auto bdev = e.bdev;
2152 auto bdev_new = bdev;
2153 ceph_assert(!((flags & REMOVE_WAL) && bdev == BDEV_WAL));
2154 if ((flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
2155 bdev_new = BDEV_DB;
2156 }
2157 if ((flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
2158 bdev_new = BDEV_SLOW;
2159 }
2160 if (bdev == BDEV_NEWDB) {
2161 // REMOVE_DB xor RENAME_DB
2162 ceph_assert(!(flags & REMOVE_DB) != !(flags & RENAME_DB2SLOW));
2163 ceph_assert(!(flags & RENAME_SLOW2DB));
2164 bdev_new = BDEV_DB;
2165 }
2166 if (bdev == BDEV_NEWWAL) {
2167 ceph_assert(flags & REMOVE_WAL);
2168 bdev_new = BDEV_WAL;
2169 }
2170 e.bdev = bdev_new;
2171 }
9f95a23c
TL
2172 dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
2173 t->op_file_update(file_ref->fnode);
7c673cae 2174 }
9f95a23c
TL
2175 for (auto& [path, dir_ref] : dir_map) {
2176 dout(20) << __func__ << " op_dir_create " << path << dendl;
2177 t->op_dir_create(path);
2178 for (auto& [fname, file_ref] : dir_ref->file_map) {
2179 dout(20) << __func__ << " op_dir_link " << path << "/" << fname
2180 << " to " << file_ref->fnode.ino << dendl;
2181 t->op_dir_link(path, fname, file_ref->fnode.ino);
7c673cae
FG
2182 }
2183 }
2184}
2185
2186void BlueFS::_compact_log_sync()
2187{
2188 dout(10) << __func__ << dendl;
9f95a23c
TL
2189 auto prefer_bdev =
2190 vselector->select_prefer_bdev(log_writer->file->vselector_hint);
2191 _rewrite_log_and_layout_sync(true,
11fdf7f2 2192 BDEV_DB,
9f95a23c
TL
2193 prefer_bdev,
2194 prefer_bdev,
2195 0,
2196 super.memorized_layout);
11fdf7f2
TL
2197 logger->inc(l_bluefs_log_compactions);
2198}
2199
9f95a23c
TL
2200void BlueFS::_rewrite_log_and_layout_sync(bool allocate_with_fallback,
2201 int super_dev,
2202 int log_dev,
2203 int log_dev_new,
2204 int flags,
2205 std::optional<bluefs_layout_t> layout)
11fdf7f2 2206{
7c673cae
FG
2207 File *log_file = log_writer->file.get();
2208
2209 // clear out log (be careful who calls us!!!)
2210 log_t.clear();
2211
11fdf7f2
TL
2212 dout(20) << __func__ << " super_dev:" << super_dev
2213 << " log_dev:" << log_dev
2214 << " log_dev_new:" << log_dev_new
2215 << " flags:" << flags
2216 << dendl;
7c673cae 2217 bluefs_transaction_t t;
11fdf7f2 2218 _compact_log_dump_metadata(&t, flags);
7c673cae
FG
2219
2220 dout(20) << __func__ << " op_jump_seq " << log_seq << dendl;
2221 t.op_jump_seq(log_seq);
2222
2223 bufferlist bl;
11fdf7f2 2224 encode(t, bl);
7c673cae
FG
2225 _pad_bl(bl);
2226
2227 uint64_t need = bl.length() + cct->_conf->bluefs_max_log_runway;
2228 dout(20) << __func__ << " need " << need << dendl;
2229
494da23a 2230 bluefs_fnode_t old_fnode;
11fdf7f2 2231 int r;
494da23a 2232 log_file->fnode.swap_extents(old_fnode);
11fdf7f2
TL
2233 if (allocate_with_fallback) {
2234 r = _allocate(log_dev, need, &log_file->fnode);
2235 ceph_assert(r == 0);
2236 } else {
2237 PExtentVector extents;
2238 r = _allocate_without_fallback(log_dev,
2239 need,
2240 &extents);
2241 ceph_assert(r == 0);
2242 for (auto& p : extents) {
2243 log_file->fnode.append_extent(
2244 bluefs_extent_t(log_dev, p.offset, p.length));
2245 }
7c673cae
FG
2246 }
2247
2248 _close_writer(log_writer);
2249
2250 log_file->fnode.size = bl.length();
9f95a23c
TL
2251 vselector->sub_usage(log_file->vselector_hint, old_fnode);
2252 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2253
7c673cae
FG
2254 log_writer = _create_writer(log_file);
2255 log_writer->append(bl);
11fdf7f2
TL
2256 r = _flush(log_writer, true);
2257 ceph_assert(r == 0);
2258#ifdef HAVE_LIBAIO
2259 if (!cct->_conf->bluefs_sync_write) {
2260 list<aio_t> completed_ios;
2261 _claim_completed_aios(log_writer, &completed_ios);
2262 wait_for_aio(log_writer);
2263 completed_ios.clear();
2264 }
2265#endif
224ce89b 2266 flush_bdev();
224ce89b 2267
9f95a23c 2268 super.memorized_layout = layout;
7c673cae 2269 super.log_fnode = log_file->fnode;
11fdf7f2
TL
2270 // rename device if needed
2271 if (log_dev != log_dev_new) {
2272 dout(10) << __func__ << " renaming log extents to " << log_dev_new << dendl;
2273 for (auto& p : super.log_fnode.extents) {
2274 p.bdev = log_dev_new;
2275 }
2276 }
2277 dout(10) << __func__ << " writing super, log fnode: " << super.log_fnode << dendl;
2278
7c673cae 2279 ++super.version;
11fdf7f2 2280 _write_super(super_dev);
7c673cae
FG
2281 flush_bdev();
2282
494da23a
TL
2283 dout(10) << __func__ << " release old log extents " << old_fnode.extents << dendl;
2284 for (auto& r : old_fnode.extents) {
7c673cae
FG
2285 pending_release[r.bdev].insert(r.offset, r.length);
2286 }
7c673cae
FG
2287}
2288
2289/*
2290 * 1. Allocate a new extent to continue the log, and then log an event
2291 * that jumps the log write position to the new extent. At this point, the
2292 * old extent(s) won't be written to, and reflect everything to compact.
2293 * New events will be written to the new region that we'll keep.
2294 *
2295 * 2. While still holding the lock, encode a bufferlist that dumps all of the
2296 * in-memory fnodes and names. This will become the new beginning of the
2297 * log. The last event will jump to the log continuation extent from #1.
2298 *
2299 * 3. Queue a write to a new extent for the new beginnging of the log.
2300 *
2301 * 4. Drop lock and wait
2302 *
2303 * 5. Retake the lock.
2304 *
2305 * 6. Update the log_fnode to splice in the new beginning.
2306 *
2307 * 7. Write the new superblock.
2308 *
2309 * 8. Release the old log space. Clean up.
2310 */
11fdf7f2 2311void BlueFS::_compact_log_async(std::unique_lock<ceph::mutex>& l)
7c673cae
FG
2312{
2313 dout(10) << __func__ << dendl;
2314 File *log_file = log_writer->file.get();
11fdf7f2
TL
2315 ceph_assert(!new_log);
2316 ceph_assert(!new_log_writer);
7c673cae 2317
181888fb
FG
2318 // create a new log [writer] so that we know compaction is in progress
2319 // (see _should_compact_log)
9f95a23c 2320 new_log = ceph::make_ref<File>();
181888fb
FG
2321 new_log->fnode.ino = 0; // so that _flush_range won't try to log the fnode
2322
3efd9988
FG
2323 // 0. wait for any racing flushes to complete. (We do not want to block
2324 // in _flush_sync_log with jump_to set or else a racing thread might flush
2325 // our entries and our jump_to update won't be correct.)
2326 while (log_flushing) {
2327 dout(10) << __func__ << " log is currently flushing, waiting" << dendl;
2328 log_cond.wait(l);
2329 }
2330
9f95a23c
TL
2331 vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
2332
7c673cae
FG
2333 // 1. allocate new log space and jump to it.
2334 old_log_jump_to = log_file->fnode.get_allocated();
7c673cae 2335 dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
11fdf7f2 2336 << " need 0x" << (old_log_jump_to + cct->_conf->bluefs_max_log_runway) << std::dec << dendl;
9f95a23c
TL
2337 int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
2338 cct->_conf->bluefs_max_log_runway,
2339 &log_file->fnode);
11fdf7f2 2340 ceph_assert(r == 0);
9f95a23c
TL
2341 //adjust usage as flush below will need it
2342 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
7c673cae
FG
2343 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2344
2345 // update the log file change and log a jump to the offset where we want to
2346 // write the new entries
2347 log_t.op_file_update(log_file->fnode);
2348 log_t.op_jump(log_seq, old_log_jump_to);
2349
2350 flush_bdev(); // FIXME?
2351
2352 _flush_and_sync_log(l, 0, old_log_jump_to);
2353
2354 // 2. prepare compacted log
2355 bluefs_transaction_t t;
224ce89b
WB
2356 //avoid record two times in log_t and _compact_log_dump_metadata.
2357 log_t.clear();
11fdf7f2 2358 _compact_log_dump_metadata(&t, 0);
7c673cae 2359
eafe8130
TL
2360 uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
2361 std::max(alloc_size[BDEV_DB],
2362 alloc_size[BDEV_SLOW]));
2363
7c673cae 2364 // conservative estimate for final encoded size
11fdf7f2 2365 new_log_jump_to = round_up_to(t.op_bl.length() + super.block_size * 2,
eafe8130 2366 max_alloc_size);
7c673cae
FG
2367 t.op_jump(log_seq, new_log_jump_to);
2368
11fdf7f2 2369 // allocate
9f95a23c 2370 //FIXME: check if we want DB here?
11fdf7f2
TL
2371 r = _allocate(BlueFS::BDEV_DB, new_log_jump_to,
2372 &new_log->fnode);
2373 ceph_assert(r == 0);
2374
2375 // we might have some more ops in log_t due to _allocate call
2376 t.claim_ops(log_t);
2377
7c673cae 2378 bufferlist bl;
11fdf7f2 2379 encode(t, bl);
7c673cae
FG
2380 _pad_bl(bl);
2381
2382 dout(10) << __func__ << " new_log_jump_to 0x" << std::hex << new_log_jump_to
2383 << std::dec << dendl;
2384
7c673cae
FG
2385 new_log_writer = _create_writer(new_log);
2386 new_log_writer->append(bl);
2387
2388 // 3. flush
2389 r = _flush(new_log_writer, true);
11fdf7f2 2390 ceph_assert(r == 0);
7c673cae
FG
2391
2392 // 4. wait
11fdf7f2 2393 _flush_bdev_safely(new_log_writer);
7c673cae 2394
11fdf7f2 2395 // 5. update our log fnode
7c673cae 2396 // discard first old_log_jump_to extents
9f95a23c 2397
7c673cae
FG
2398 dout(10) << __func__ << " remove 0x" << std::hex << old_log_jump_to << std::dec
2399 << " of " << log_file->fnode.extents << dendl;
2400 uint64_t discarded = 0;
2401 mempool::bluefs::vector<bluefs_extent_t> old_extents;
2402 while (discarded < old_log_jump_to) {
11fdf7f2 2403 ceph_assert(!log_file->fnode.extents.empty());
7c673cae
FG
2404 bluefs_extent_t& e = log_file->fnode.extents.front();
2405 bluefs_extent_t temp = e;
2406 if (discarded + e.length <= old_log_jump_to) {
2407 dout(10) << __func__ << " remove old log extent " << e << dendl;
2408 discarded += e.length;
94b18763 2409 log_file->fnode.pop_front_extent();
7c673cae
FG
2410 } else {
2411 dout(10) << __func__ << " remove front of old log extent " << e << dendl;
2412 uint64_t drop = old_log_jump_to - discarded;
2413 temp.length = drop;
2414 e.offset += drop;
2415 e.length -= drop;
2416 discarded += drop;
2417 dout(10) << __func__ << " kept " << e << " removed " << temp << dendl;
2418 }
2419 old_extents.push_back(temp);
2420 }
94b18763
FG
2421 auto from = log_file->fnode.extents.begin();
2422 auto to = log_file->fnode.extents.end();
2423 while (from != to) {
2424 new_log->fnode.append_extent(*from);
2425 ++from;
2426 }
7c673cae 2427
9f95a23c
TL
2428 vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
2429
7c673cae 2430 // clear the extents from old log file, they are added to new log
94b18763 2431 log_file->fnode.clear_extents();
7c673cae 2432 // swap the log files. New log file is the log file now.
94b18763
FG
2433 new_log->fnode.swap_extents(log_file->fnode);
2434
7c673cae
FG
2435 log_writer->pos = log_writer->file->fnode.size =
2436 log_writer->pos - old_log_jump_to + new_log_jump_to;
2437
9f95a23c
TL
2438 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
2439
11fdf7f2 2440 // 6. write the super block to reflect the changes
7c673cae
FG
2441 dout(10) << __func__ << " writing super" << dendl;
2442 super.log_fnode = log_file->fnode;
2443 ++super.version;
11fdf7f2 2444 _write_super(BDEV_DB);
7c673cae
FG
2445
2446 lock.unlock();
2447 flush_bdev();
2448 lock.lock();
2449
11fdf7f2 2450 // 7. release old space
7c673cae
FG
2451 dout(10) << __func__ << " release old log extents " << old_extents << dendl;
2452 for (auto& r : old_extents) {
2453 pending_release[r.bdev].insert(r.offset, r.length);
2454 }
2455
2456 // delete the new log, remove from the dirty files list
2457 _close_writer(new_log_writer);
2458 if (new_log->dirty_seq) {
11fdf7f2 2459 ceph_assert(dirty_files.count(new_log->dirty_seq));
7c673cae
FG
2460 auto it = dirty_files[new_log->dirty_seq].iterator_to(*new_log);
2461 dirty_files[new_log->dirty_seq].erase(it);
2462 }
2463 new_log_writer = nullptr;
2464 new_log = nullptr;
2465 log_cond.notify_all();
2466
2467 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2468 logger->inc(l_bluefs_log_compactions);
2469}
2470
2471void BlueFS::_pad_bl(bufferlist& bl)
2472{
2473 uint64_t partial = bl.length() % super.block_size;
2474 if (partial) {
2475 dout(10) << __func__ << " padding with 0x" << std::hex
2476 << super.block_size - partial << " zeros" << std::dec << dendl;
2477 bl.append_zero(super.block_size - partial);
2478 }
2479}
2480
7c673cae 2481
11fdf7f2 2482int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
7c673cae
FG
2483 uint64_t want_seq,
2484 uint64_t jump_to)
2485{
2486 while (log_flushing) {
2487 dout(10) << __func__ << " want_seq " << want_seq
2488 << " log is currently flushing, waiting" << dendl;
11fdf7f2 2489 ceph_assert(!jump_to);
7c673cae
FG
2490 log_cond.wait(l);
2491 }
2492 if (want_seq && want_seq <= log_seq_stable) {
2493 dout(10) << __func__ << " want_seq " << want_seq << " <= log_seq_stable "
2494 << log_seq_stable << ", done" << dendl;
11fdf7f2 2495 ceph_assert(!jump_to);
7c673cae
FG
2496 return 0;
2497 }
2498 if (log_t.empty() && dirty_files.empty()) {
2499 dout(10) << __func__ << " want_seq " << want_seq
2500 << " " << log_t << " not dirty, dirty_files empty, no-op" << dendl;
11fdf7f2 2501 ceph_assert(!jump_to);
7c673cae
FG
2502 return 0;
2503 }
2504
a8e16298
TL
2505 vector<interval_set<uint64_t>> to_release(pending_release.size());
2506 to_release.swap(pending_release);
2507
7c673cae 2508 uint64_t seq = log_t.seq = ++log_seq;
11fdf7f2 2509 ceph_assert(want_seq == 0 || want_seq <= seq);
7c673cae
FG
2510 log_t.uuid = super.uuid;
2511
2512 // log dirty files
2513 auto lsi = dirty_files.find(seq);
2514 if (lsi != dirty_files.end()) {
2515 dout(20) << __func__ << " " << lsi->second.size() << " dirty_files" << dendl;
2516 for (auto &f : lsi->second) {
2517 dout(20) << __func__ << " op_file_update " << f.fnode << dendl;
2518 log_t.op_file_update(f.fnode);
2519 }
2520 }
2521
2522 dout(10) << __func__ << " " << log_t << dendl;
11fdf7f2 2523 ceph_assert(!log_t.empty());
7c673cae
FG
2524
2525 // allocate some more space (before we run out)?
f67539c2 2526 // BTW: this triggers `flush()` in the `page_aligned_appender` of `log_writer`.
7c673cae
FG
2527 int64_t runway = log_writer->file->fnode.get_allocated() -
2528 log_writer->get_effective_write_pos();
f6b5b4d7 2529 bool just_expanded_log = false;
7c673cae
FG
2530 if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
2531 dout(10) << __func__ << " allocating more log runway (0x"
2532 << std::hex << runway << std::dec << " remaining)" << dendl;
2533 while (new_log_writer) {
2534 dout(10) << __func__ << " waiting for async compaction" << dendl;
2535 log_cond.wait(l);
2536 }
9f95a23c
TL
2537 vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
2538 int r = _allocate(
2539 vselector->select_prefer_bdev(log_writer->file->vselector_hint),
2540 cct->_conf->bluefs_max_log_runway,
2541 &log_writer->file->fnode);
11fdf7f2 2542 ceph_assert(r == 0);
9f95a23c 2543 vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode);
7c673cae 2544 log_t.op_file_update(log_writer->file->fnode);
f6b5b4d7 2545 just_expanded_log = true;
7c673cae
FG
2546 }
2547
2548 bufferlist bl;
11fdf7f2
TL
2549 bl.reserve(super.block_size);
2550 encode(log_t, bl);
7c673cae 2551 // pad to block boundary
11fdf7f2
TL
2552 size_t realign = super.block_size - (bl.length() % super.block_size);
2553 if (realign && realign != super.block_size)
2554 bl.append_zero(realign);
2555
7c673cae
FG
2556 logger->inc(l_bluefs_logged_bytes, bl.length());
2557
f6b5b4d7
TL
2558 if (just_expanded_log) {
2559 ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss
2560 }
2561
7c673cae
FG
2562 log_writer->append(bl);
2563
2564 log_t.clear();
2565 log_t.seq = 0; // just so debug output is less confusing
2566 log_flushing = true;
2567
2568 int r = _flush(log_writer, true);
11fdf7f2 2569 ceph_assert(r == 0);
7c673cae
FG
2570
2571 if (jump_to) {
2572 dout(10) << __func__ << " jumping log offset from 0x" << std::hex
2573 << log_writer->pos << " -> 0x" << jump_to << std::dec << dendl;
2574 log_writer->pos = jump_to;
9f95a23c 2575 vselector->sub_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
7c673cae 2576 log_writer->file->fnode.size = jump_to;
9f95a23c 2577 vselector->add_usage(log_writer->file->vselector_hint, log_writer->file->fnode.size);
7c673cae
FG
2578 }
2579
2580 _flush_bdev_safely(log_writer);
2581
2582 log_flushing = false;
2583 log_cond.notify_all();
2584
2585 // clean dirty files
2586 if (seq > log_seq_stable) {
2587 log_seq_stable = seq;
2588 dout(20) << __func__ << " log_seq_stable " << log_seq_stable << dendl;
2589
2590 auto p = dirty_files.begin();
2591 while (p != dirty_files.end()) {
2592 if (p->first > log_seq_stable) {
2593 dout(20) << __func__ << " done cleaning up dirty files" << dendl;
2594 break;
2595 }
2596
2597 auto l = p->second.begin();
2598 while (l != p->second.end()) {
2599 File *file = &*l;
11fdf7f2
TL
2600 ceph_assert(file->dirty_seq > 0);
2601 ceph_assert(file->dirty_seq <= log_seq_stable);
7c673cae
FG
2602 dout(20) << __func__ << " cleaned file " << file->fnode << dendl;
2603 file->dirty_seq = 0;
2604 p->second.erase(l++);
2605 }
2606
11fdf7f2 2607 ceph_assert(p->second.empty());
7c673cae
FG
2608 dirty_files.erase(p++);
2609 }
2610 } else {
2611 dout(20) << __func__ << " log_seq_stable " << log_seq_stable
2612 << " already >= out seq " << seq
2613 << ", we lost a race against another log flush, done" << dendl;
2614 }
a8e16298
TL
2615
2616 for (unsigned i = 0; i < to_release.size(); ++i) {
2617 if (!to_release[i].empty()) {
2618 /* OK, now we have the guarantee alloc[i] won't be null. */
11fdf7f2
TL
2619 int r = 0;
2620 if (cct->_conf->bdev_enable_discard && cct->_conf->bdev_async_discard) {
2621 r = bdev[i]->queue_discard(to_release[i]);
2622 if (r == 0)
2623 continue;
2624 } else if (cct->_conf->bdev_enable_discard) {
2625 for (auto p = to_release[i].begin(); p != to_release[i].end(); ++p) {
2626 bdev[i]->discard(p.get_start(), p.get_len());
2627 }
2628 }
a8e16298 2629 alloc[i]->release(to_release[i]);
f67539c2
TL
2630 if (is_shared_alloc(i)) {
2631 shared_alloc->bluefs_used -= to_release[i].size();
2632 }
a8e16298
TL
2633 }
2634 }
2635
7c673cae
FG
2636 _update_logger_stats();
2637
2638 return 0;
2639}
2640
f67539c2
TL
2641ceph::bufferlist BlueFS::FileWriter::flush_buffer(
2642 CephContext* const cct,
2643 const bool partial,
2644 const unsigned length,
2645 const bluefs_super_t& super)
2646{
2647 ceph::bufferlist bl;
2648 if (partial) {
2649 tail_block.splice(0, tail_block.length(), &bl);
2650 }
2651 const auto remaining_len = length - bl.length();
2652 buffer.splice(0, remaining_len, &bl);
2653 if (buffer.length()) {
2654 dout(20) << " leaving 0x" << std::hex << buffer.length() << std::dec
2655 << " unflushed" << dendl;
2656 }
2657 if (const unsigned tail = bl.length() & ~super.block_mask(); tail) {
2658 const auto padding_len = super.block_size - tail;
2659 dout(20) << __func__ << " caching tail of 0x"
2660 << std::hex << tail
2661 << " and padding block with 0x" << padding_len
2662 << " buffer.length() " << buffer.length()
2663 << std::dec << dendl;
2664 // We need to go through the `buffer_appender` to get a chance to
2665 // preserve in-memory contiguity and not mess with the alignment.
2666 // Otherwise a costly rebuild could happen in e.g. `KernelDevice`.
2667 buffer_appender.append_zero(padding_len);
2668 buffer.splice(buffer.length() - padding_len, padding_len, &bl);
2669 // Deep copy the tail here. This allows to avoid costlier copy on
2670 // bufferlist rebuild in e.g. `KernelDevice` and minimizes number
2671 // of memory allocations.
2672 // The alternative approach would be to place the entire tail and
2673 // padding on a dedicated, 4 KB long memory chunk. This shouldn't
2674 // trigger the rebuild while still being less expensive.
2675 buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail);
2676 buffer.splice(buffer.length() - tail, tail, &tail_block);
2677 } else {
2678 tail_block.clear();
2679 }
2680 return bl;
2681}
2682
7c673cae
FG
2683int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
2684{
2685 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
2686 << " 0x" << offset << "~" << length << std::dec
2687 << " to " << h->file->fnode << dendl;
f67539c2
TL
2688 if (h->file->deleted) {
2689 dout(10) << __func__ << " deleted, no-op" << dendl;
2690 return 0;
2691 }
7c673cae 2692
f67539c2 2693 ceph_assert(h->file->num_readers.load() == 0);
7c673cae
FG
2694
2695 bool buffered;
2696 if (h->file->fnode.ino == 1)
2697 buffered = false;
2698 else
2699 buffered = cct->_conf->bluefs_buffered_io;
2700
2701 if (offset + length <= h->pos)
2702 return 0;
2703 if (offset < h->pos) {
2704 length -= h->pos - offset;
2705 offset = h->pos;
2706 dout(10) << " still need 0x"
2707 << std::hex << offset << "~" << length << std::dec
2708 << dendl;
2709 }
11fdf7f2 2710 ceph_assert(offset <= h->file->fnode.size);
7c673cae
FG
2711
2712 uint64_t allocated = h->file->fnode.get_allocated();
9f95a23c 2713 vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
7c673cae
FG
2714 // do not bother to dirty the file if we are overwriting
2715 // previously allocated extents.
2716 bool must_dirty = false;
2717 if (allocated < offset + length) {
2718 // we should never run out of log space here; see the min runway check
2719 // in _flush_and_sync_log.
11fdf7f2 2720 ceph_assert(h->file->fnode.ino != 1);
9f95a23c 2721 int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
7c673cae 2722 offset + length - allocated,
94b18763 2723 &h->file->fnode);
7c673cae
FG
2724 if (r < 0) {
2725 derr << __func__ << " allocated: 0x" << std::hex << allocated
2726 << " offset: 0x" << offset << " length: 0x" << length << std::dec
2727 << dendl;
9f95a23c 2728 vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
11fdf7f2 2729 ceph_abort_msg("bluefs enospc");
7c673cae
FG
2730 return r;
2731 }
7c673cae
FG
2732 must_dirty = true;
2733 }
2734 if (h->file->fnode.size < offset + length) {
2735 h->file->fnode.size = offset + length;
2736 if (h->file->fnode.ino > 1) {
2737 // we do not need to dirty the log file (or it's compacting
2738 // replacement) when the file size changes because replay is
2739 // smart enough to discover it on its own.
2740 must_dirty = true;
2741 }
2742 }
2743 if (must_dirty) {
2744 h->file->fnode.mtime = ceph_clock_now();
11fdf7f2 2745 ceph_assert(h->file->fnode.ino >= 1);
7c673cae
FG
2746 if (h->file->dirty_seq == 0) {
2747 h->file->dirty_seq = log_seq + 1;
2748 dirty_files[h->file->dirty_seq].push_back(*h->file);
2749 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2750 << " (was clean)" << dendl;
2751 } else {
2752 if (h->file->dirty_seq != log_seq + 1) {
2753 // need re-dirty, erase from list first
11fdf7f2 2754 ceph_assert(dirty_files.count(h->file->dirty_seq));
7c673cae
FG
2755 auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
2756 dirty_files[h->file->dirty_seq].erase(it);
2757 h->file->dirty_seq = log_seq + 1;
2758 dirty_files[h->file->dirty_seq].push_back(*h->file);
2759 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2760 << " (was " << h->file->dirty_seq << ")" << dendl;
2761 } else {
2762 dout(20) << __func__ << " dirty_seq = " << log_seq + 1
2763 << " (unchanged, do nothing) " << dendl;
2764 }
2765 }
2766 }
2767 dout(20) << __func__ << " file now " << h->file->fnode << dendl;
2768
2769 uint64_t x_off = 0;
2770 auto p = h->file->fnode.seek(offset, &x_off);
11fdf7f2 2771 ceph_assert(p != h->file->fnode.extents.end());
7c673cae
FG
2772 dout(20) << __func__ << " in " << *p << " x_off 0x"
2773 << std::hex << x_off << std::dec << dendl;
2774
2775 unsigned partial = x_off & ~super.block_mask();
7c673cae
FG
2776 if (partial) {
2777 dout(20) << __func__ << " using partial tail 0x"
2778 << std::hex << partial << std::dec << dendl;
7c673cae
FG
2779 x_off -= partial;
2780 offset -= partial;
2781 length += partial;
2782 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
2783 for (auto p : h->iocv) {
2784 if (p) {
2785 p->aio_wait();
2786 }
2787 }
2788 }
7c673cae 2789
f67539c2
TL
2790 auto bl = h->flush_buffer(cct, partial, length, super);
2791 ceph_assert(bl.length() >= length);
9f95a23c 2792 h->pos = offset + length;
f67539c2 2793 length = bl.length();
9f95a23c 2794
7c673cae
FG
2795 switch (h->writer_type) {
2796 case WRITER_WAL:
2797 logger->inc(l_bluefs_bytes_written_wal, length);
2798 break;
2799 case WRITER_SST:
2800 logger->inc(l_bluefs_bytes_written_sst, length);
2801 break;
2802 }
2803
2804 dout(30) << "dump:\n";
2805 bl.hexdump(*_dout);
2806 *_dout << dendl;
2807
7c673cae 2808 uint64_t bloff = 0;
11fdf7f2 2809 uint64_t bytes_written_slow = 0;
7c673cae 2810 while (length > 0) {
11fdf7f2 2811 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
2812 bufferlist t;
2813 t.substr_of(bl, bloff, x_len);
7c673cae 2814 if (cct->_conf->bluefs_sync_write) {
11fdf7f2 2815 bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
7c673cae 2816 } else {
11fdf7f2
TL
2817 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
2818 }
2819 h->dirty_devs[p->bdev] = true;
2820 if (p->bdev == BDEV_SLOW) {
2821 bytes_written_slow += t.length();
7c673cae 2822 }
11fdf7f2 2823
7c673cae
FG
2824 bloff += x_len;
2825 length -= x_len;
2826 ++p;
2827 x_off = 0;
2828 }
f67539c2
TL
2829 if (bytes_written_slow) {
2830 logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
2831 }
7c673cae
FG
2832 for (unsigned i = 0; i < MAX_BDEV; ++i) {
2833 if (bdev[i]) {
11fdf7f2 2834 if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
7c673cae
FG
2835 bdev[i]->aio_submit(h->iocv[i]);
2836 }
2837 }
2838 }
9f95a23c 2839 vselector->add_usage(h->file->vselector_hint, h->file->fnode);
7c673cae
FG
2840 dout(20) << __func__ << " h " << h << " pos now 0x"
2841 << std::hex << h->pos << std::dec << dendl;
2842 return 0;
2843}
2844
11fdf7f2 2845#ifdef HAVE_LIBAIO
7c673cae
FG
2846// we need to retire old completed aios so they don't stick around in
2847// memory indefinitely (along with their bufferlist refs).
2848void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
2849{
2850 for (auto p : h->iocv) {
2851 if (p) {
2852 ls->splice(ls->end(), p->running_aios);
2853 }
2854 }
2855 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
2856}
2857
2858void BlueFS::wait_for_aio(FileWriter *h)
2859{
2860 // NOTE: this is safe to call without a lock, as long as our reference is
2861 // stable.
f67539c2
TL
2862 utime_t start;
2863 lgeneric_subdout(cct, bluefs, 10) << __func__;
2864 start = ceph_clock_now();
2865 *_dout << " " << h << dendl;
7c673cae
FG
2866 for (auto p : h->iocv) {
2867 if (p) {
2868 p->aio_wait();
2869 }
2870 }
11fdf7f2 2871 dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 2872}
11fdf7f2 2873#endif
7c673cae 2874
f6b5b4d7
TL
2875int BlueFS::_flush(FileWriter *h, bool force, std::unique_lock<ceph::mutex>& l)
2876{
2877 bool flushed = false;
2878 int r = _flush(h, force, &flushed);
2879 if (r == 0 && flushed) {
2880 _maybe_compact_log(l);
2881 }
2882 return r;
2883}
2884
2885int BlueFS::_flush(FileWriter *h, bool force, bool *flushed)
7c673cae 2886{
f67539c2 2887 uint64_t length = h->get_buffer_length();
7c673cae 2888 uint64_t offset = h->pos;
f6b5b4d7
TL
2889 if (flushed) {
2890 *flushed = false;
2891 }
7c673cae
FG
2892 if (!force &&
2893 length < cct->_conf->bluefs_min_flush_size) {
2894 dout(10) << __func__ << " " << h << " ignoring, length " << length
2895 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
2896 << dendl;
2897 return 0;
2898 }
2899 if (length == 0) {
2900 dout(10) << __func__ << " " << h << " no dirty data on "
2901 << h->file->fnode << dendl;
2902 return 0;
2903 }
2904 dout(10) << __func__ << " " << h << " 0x"
2905 << std::hex << offset << "~" << length << std::dec
2906 << " to " << h->file->fnode << dendl;
11fdf7f2 2907 ceph_assert(h->pos <= h->file->fnode.size);
f6b5b4d7
TL
2908 int r = _flush_range(h, offset, length);
2909 if (flushed) {
2910 *flushed = true;
2911 }
2912 return r;
7c673cae
FG
2913}
2914
2915int BlueFS::_truncate(FileWriter *h, uint64_t offset)
2916{
2917 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
2918 << " file " << h->file->fnode << dendl;
2919 if (h->file->deleted) {
2920 dout(10) << __func__ << " deleted, no-op" << dendl;
2921 return 0;
2922 }
2923
2924 // we never truncate internal log files
11fdf7f2 2925 ceph_assert(h->file->fnode.ino > 1);
7c673cae 2926
7c673cae
FG
2927 // truncate off unflushed data?
2928 if (h->pos < offset &&
f67539c2 2929 h->pos + h->get_buffer_length() > offset) {
7c673cae
FG
2930 dout(20) << __func__ << " tossing out last " << offset - h->pos
2931 << " unflushed bytes" << dendl;
11fdf7f2 2932 ceph_abort_msg("actually this shouldn't happen");
7c673cae 2933 }
f67539c2 2934 if (h->get_buffer_length()) {
7c673cae
FG
2935 int r = _flush(h, true);
2936 if (r < 0)
2937 return r;
2938 }
2939 if (offset == h->file->fnode.size) {
2940 return 0; // no-op!
2941 }
2942 if (offset > h->file->fnode.size) {
11fdf7f2 2943 ceph_abort_msg("truncate up not supported");
7c673cae 2944 }
11fdf7f2 2945 ceph_assert(h->file->fnode.size >= offset);
9f95a23c 2946 vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
7c673cae 2947 h->file->fnode.size = offset;
9f95a23c 2948 vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
7c673cae
FG
2949 log_t.op_file_update(h->file->fnode);
2950 return 0;
2951}
2952
11fdf7f2 2953int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l)
7c673cae
FG
2954{
2955 dout(10) << __func__ << " " << h << " " << h->file->fnode << dendl;
2956 int r = _flush(h, true);
2957 if (r < 0)
2958 return r;
2959 uint64_t old_dirty_seq = h->file->dirty_seq;
2960
2961 _flush_bdev_safely(h);
2962
2963 if (old_dirty_seq) {
2964 uint64_t s = log_seq;
2965 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
2966 << ") on " << h->file->fnode << ", flushing log" << dendl;
2967 _flush_and_sync_log(l, old_dirty_seq);
11fdf7f2 2968 ceph_assert(h->file->dirty_seq == 0 || // cleaned
7c673cae
FG
2969 h->file->dirty_seq > s); // or redirtied by someone else
2970 }
2971 return 0;
2972}
2973
2974void BlueFS::_flush_bdev_safely(FileWriter *h)
2975{
11fdf7f2
TL
2976 std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
2977 h->dirty_devs.fill(false);
2978#ifdef HAVE_LIBAIO
7c673cae
FG
2979 if (!cct->_conf->bluefs_sync_write) {
2980 list<aio_t> completed_ios;
2981 _claim_completed_aios(h, &completed_ios);
2982 lock.unlock();
2983 wait_for_aio(h);
2984 completed_ios.clear();
11fdf7f2 2985 flush_bdev(flush_devs);
7c673cae 2986 lock.lock();
11fdf7f2
TL
2987 } else
2988#endif
2989 {
7c673cae 2990 lock.unlock();
11fdf7f2 2991 flush_bdev(flush_devs);
7c673cae
FG
2992 lock.lock();
2993 }
2994}
2995
11fdf7f2
TL
2996void BlueFS::flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
2997{
2998 // NOTE: this is safe to call without a lock.
2999 dout(20) << __func__ << dendl;
3000 for (unsigned i = 0; i < MAX_BDEV; i++) {
3001 if (dirty_bdevs[i])
3002 bdev[i]->flush();
3003 }
3004}
3005
7c673cae
FG
3006void BlueFS::flush_bdev()
3007{
3008 // NOTE: this is safe to call without a lock.
3009 dout(20) << __func__ << dendl;
f67539c2
TL
3010 for (unsigned i = 0; i < MAX_BDEV; i++) {
3011 // alloc space from BDEV_SLOW is unexpected.
3012 // So most cases we don't alloc from BDEV_SLOW and so avoiding flush not-used device.
3013 if (bdev[i] && (i != BDEV_SLOW || _get_used(i))) {
3014 bdev[i]->flush();
3015 }
7c673cae
FG
3016 }
3017}
3018
eafe8130
TL
3019const char* BlueFS::get_device_name(unsigned id)
3020{
3021 if (id >= MAX_BDEV) return "BDEV_INV";
3022 const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3023 return names[id];
3024}
3025
11fdf7f2
TL
3026int BlueFS::_allocate_without_fallback(uint8_t id, uint64_t len,
3027 PExtentVector* extents)
3028{
3029 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3030 << " from " << (int)id << dendl;
3031 assert(id < alloc.size());
11fdf7f2
TL
3032 if (!alloc[id]) {
3033 return -ENOENT;
3034 }
3035 extents->reserve(4); // 4 should be (more than) enough for most allocations
f67539c2
TL
3036 int64_t need = round_up_to(len, alloc_size[id]);
3037 int64_t alloc_len = alloc[id]->allocate(need, alloc_size[id], 0, extents);
3038 if (alloc_len < 0 || alloc_len < need) {
eafe8130 3039 if (alloc_len > 0) {
11fdf7f2
TL
3040 alloc[id]->release(*extents);
3041 }
f67539c2
TL
3042 derr << __func__ << " unable to allocate 0x" << std::hex << need
3043 << " on bdev " << (int)id
3044 << ", allocator name " << alloc[id]->get_name()
3045 << ", allocator type " << alloc[id]->get_type()
3046 << ", capacity 0x" << alloc[id]->get_capacity()
3047 << ", block size 0x" << alloc[id]->get_block_size()
3048 << ", free 0x" << alloc[id]->get_free()
3049 << ", fragmentation " << alloc[id]->get_fragmentation()
3050 << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
3051 << std::dec << dendl;
3052 alloc[id]->dump();
11fdf7f2
TL
3053 return -ENOSPC;
3054 }
f67539c2
TL
3055 if (is_shared_alloc(id)) {
3056 shared_alloc->bluefs_used += alloc_len;
3057 }
11fdf7f2
TL
3058
3059 return 0;
3060}
3061
7c673cae 3062int BlueFS::_allocate(uint8_t id, uint64_t len,
94b18763 3063 bluefs_fnode_t* node)
7c673cae
FG
3064{
3065 dout(10) << __func__ << " len 0x" << std::hex << len << std::dec
3066 << " from " << (int)id << dendl;
11fdf7f2 3067 ceph_assert(id < alloc.size());
b32b8144 3068 int64_t alloc_len = 0;
a8e16298 3069 PExtentVector extents;
11fdf7f2 3070 uint64_t hint = 0;
f67539c2 3071 int64_t need = len;
7c673cae 3072 if (alloc[id]) {
f67539c2 3073 need = round_up_to(len, alloc_size[id]);
94b18763
FG
3074 if (!node->extents.empty() && node->extents.back().bdev == id) {
3075 hint = node->extents.back().end();
11fdf7f2 3076 }
b32b8144 3077 extents.reserve(4); // 4 should be (more than) enough for most allocations
f67539c2 3078 alloc_len = alloc[id]->allocate(need, alloc_size[id], hint, &extents);
b32b8144 3079 }
f67539c2
TL
3080 if (alloc_len < 0 || alloc_len < need) {
3081 if (alloc[id]) {
3082 if (alloc_len > 0) {
3083 alloc[id]->release(extents);
3084 }
3085 dout(1) << __func__ << " unable to allocate 0x" << std::hex << need
3086 << " on bdev " << (int)id
3087 << ", allocator name " << alloc[id]->get_name()
3088 << ", allocator type " << alloc[id]->get_type()
3089 << ", capacity 0x" << alloc[id]->get_capacity()
3090 << ", block size 0x" << alloc[id]->get_block_size()
3091 << ", free 0x" << alloc[id]->get_free()
3092 << ", fragmentation " << alloc[id]->get_fragmentation()
3093 << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
3094 << std::dec << dendl;
b32b8144 3095 }
f67539c2 3096
7c673cae 3097 if (id != BDEV_SLOW) {
f67539c2
TL
3098 dout(20) << __func__ << " fallback to bdev "
3099 << (int)id + 1
3100 << dendl;
94b18763 3101 return _allocate(id + 1, len, node);
11fdf7f2 3102 } else {
f67539c2
TL
3103 derr << __func__ << " allocation failed, needed 0x" << std::hex << need
3104 << dendl;
11fdf7f2 3105 }
f67539c2 3106 return -ENOSPC;
11fdf7f2 3107 } else {
f67539c2
TL
3108 uint64_t used = _get_used(id);
3109 if (max_bytes[id] < used) {
3110 logger->set(max_bytes_pcounters[id], used);
3111 max_bytes[id] = used;
3112 }
3113 if (is_shared_alloc(id)) {
3114 shared_alloc->bluefs_used += alloc_len;
11fdf7f2 3115 }
7c673cae
FG
3116 }
3117
3118 for (auto& p : extents) {
94b18763 3119 node->append_extent(bluefs_extent_t(id, p.offset, p.length));
7c673cae
FG
3120 }
3121
3122 return 0;
3123}
3124
3125int BlueFS::_preallocate(FileRef f, uint64_t off, uint64_t len)
3126{
3127 dout(10) << __func__ << " file " << f->fnode << " 0x"
3128 << std::hex << off << "~" << len << std::dec << dendl;
3129 if (f->deleted) {
3130 dout(10) << __func__ << " deleted, no-op" << dendl;
3131 return 0;
3132 }
11fdf7f2 3133 ceph_assert(f->fnode.ino > 1);
7c673cae
FG
3134 uint64_t allocated = f->fnode.get_allocated();
3135 if (off + len > allocated) {
3136 uint64_t want = off + len - allocated;
9f95a23c
TL
3137 vselector->sub_usage(f->vselector_hint, f->fnode);
3138
3139 int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
3140 want,
3141 &f->fnode);
3142 vselector->add_usage(f->vselector_hint, f->fnode);
7c673cae
FG
3143 if (r < 0)
3144 return r;
7c673cae
FG
3145 log_t.op_file_update(f->fnode);
3146 }
3147 return 0;
3148}
3149
1911f103 3150void BlueFS::sync_metadata(bool avoid_compact)
7c673cae 3151{
f67539c2 3152 std::unique_lock l(lock);
9f95a23c 3153 if (log_t.empty() && dirty_files.empty()) {
7c673cae 3154 dout(10) << __func__ << " - no pending log events" << dendl;
11fdf7f2 3155 } else {
f67539c2
TL
3156 utime_t start;
3157 lgeneric_subdout(cct, bluefs, 10) << __func__;
3158 start = ceph_clock_now();
3159 *_dout << dendl;
11fdf7f2
TL
3160 flush_bdev(); // FIXME?
3161 _flush_and_sync_log(l);
3162 dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 3163 }
7c673cae 3164
f6b5b4d7
TL
3165 if (!avoid_compact) {
3166 _maybe_compact_log(l);
3167 }
3168}
3169
3170void BlueFS::_maybe_compact_log(std::unique_lock<ceph::mutex>& l)
3171{
3172 if (!cct->_conf->bluefs_replay_recovery_disable_compact &&
3173 _should_compact_log()) {
7c673cae
FG
3174 if (cct->_conf->bluefs_compact_log_sync) {
3175 _compact_log_sync();
3176 } else {
3177 _compact_log_async(l);
3178 }
3179 }
7c673cae
FG
3180}
3181
3182int BlueFS::open_for_write(
b3b6e05e
TL
3183 std::string_view dirname,
3184 std::string_view filename,
7c673cae
FG
3185 FileWriter **h,
3186 bool overwrite)
3187{
11fdf7f2 3188 std::lock_guard l(lock);
7c673cae
FG
3189 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3190 map<string,DirRef>::iterator p = dir_map.find(dirname);
3191 DirRef dir;
3192 if (p == dir_map.end()) {
3193 // implicitly create the dir
3194 dout(20) << __func__ << " dir " << dirname
3195 << " does not exist" << dendl;
3196 return -ENOENT;
3197 } else {
3198 dir = p->second;
3199 }
3200
3201 FileRef file;
3202 bool create = false;
f6b5b4d7 3203 bool truncate = false;
7c673cae
FG
3204 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3205 if (q == dir->file_map.end()) {
3206 if (overwrite) {
3207 dout(20) << __func__ << " dir " << dirname << " (" << dir
3208 << ") file " << filename
3209 << " does not exist" << dendl;
3210 return -ENOENT;
3211 }
9f95a23c 3212 file = ceph::make_ref<File>();
7c673cae
FG
3213 file->fnode.ino = ++ino_last;
3214 file_map[ino_last] = file;
b3b6e05e 3215 dir->file_map[string{filename}] = file;
7c673cae
FG
3216 ++file->refs;
3217 create = true;
3218 } else {
3219 // overwrite existing file?
3220 file = q->second;
3221 if (overwrite) {
3222 dout(20) << __func__ << " dir " << dirname << " (" << dir
3223 << ") file " << filename
3224 << " already exists, overwrite in place" << dendl;
3225 } else {
3226 dout(20) << __func__ << " dir " << dirname << " (" << dir
3227 << ") file " << filename
3228 << " already exists, truncate + overwrite" << dendl;
9f95a23c 3229 vselector->sub_usage(file->vselector_hint, file->fnode);
7c673cae
FG
3230 file->fnode.size = 0;
3231 for (auto& p : file->fnode.extents) {
3232 pending_release[p.bdev].insert(p.offset, p.length);
3233 }
f6b5b4d7 3234 truncate = true;
94b18763
FG
3235
3236 file->fnode.clear_extents();
7c673cae
FG
3237 }
3238 }
11fdf7f2 3239 ceph_assert(file->fnode.ino > 1);
7c673cae
FG
3240
3241 file->fnode.mtime = ceph_clock_now();
9f95a23c 3242 file->vselector_hint = vselector->get_hint_by_dir(dirname);
f6b5b4d7
TL
3243 if (create || truncate) {
3244 vselector->add_usage(file->vselector_hint, file->fnode); // update file count
3245 }
9f95a23c 3246
7c673cae 3247 dout(20) << __func__ << " mapping " << dirname << "/" << filename
9f95a23c
TL
3248 << " vsel_hint " << file->vselector_hint
3249 << dendl;
7c673cae
FG
3250
3251 log_t.op_file_update(file->fnode);
3252 if (create)
3253 log_t.op_dir_link(dirname, filename, file->fnode.ino);
3254
3255 *h = _create_writer(file);
3256
3257 if (boost::algorithm::ends_with(filename, ".log")) {
3258 (*h)->writer_type = BlueFS::WRITER_WAL;
3259 if (logger && !overwrite) {
3260 logger->inc(l_bluefs_files_written_wal);
3261 }
3262 } else if (boost::algorithm::ends_with(filename, ".sst")) {
3263 (*h)->writer_type = BlueFS::WRITER_SST;
3264 if (logger) {
3265 logger->inc(l_bluefs_files_written_sst);
3266 }
3267 }
3268
3269 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3270 return 0;
3271}
3272
3273BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
3274{
3275 FileWriter *w = new FileWriter(f);
3276 for (unsigned i = 0; i < MAX_BDEV; ++i) {
3277 if (bdev[i]) {
3278 w->iocv[i] = new IOContext(cct, NULL);
7c673cae
FG
3279 }
3280 }
3281 return w;
3282}
3283
3284void BlueFS::_close_writer(FileWriter *h)
3285{
3286 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
f67539c2 3287 //h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
7c673cae
FG
3288 for (unsigned i=0; i<MAX_BDEV; ++i) {
3289 if (bdev[i]) {
11fdf7f2
TL
3290 if (h->iocv[i]) {
3291 h->iocv[i]->aio_wait();
3292 bdev[i]->queue_reap_ioc(h->iocv[i]);
3293 }
7c673cae
FG
3294 }
3295 }
3296 delete h;
3297}
3298
3299int BlueFS::open_for_read(
b3b6e05e
TL
3300 std::string_view dirname,
3301 std::string_view filename,
7c673cae
FG
3302 FileReader **h,
3303 bool random)
3304{
11fdf7f2 3305 std::lock_guard l(lock);
7c673cae
FG
3306 dout(10) << __func__ << " " << dirname << "/" << filename
3307 << (random ? " (random)":" (sequential)") << dendl;
3308 map<string,DirRef>::iterator p = dir_map.find(dirname);
3309 if (p == dir_map.end()) {
3310 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3311 return -ENOENT;
3312 }
3313 DirRef dir = p->second;
3314
3315 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3316 if (q == dir->file_map.end()) {
3317 dout(20) << __func__ << " dir " << dirname << " (" << dir
3318 << ") file " << filename
3319 << " not found" << dendl;
3320 return -ENOENT;
3321 }
3322 File *file = q->second.get();
3323
3324 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
3325 random, false);
3326 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
3327 return 0;
3328}
3329
3330int BlueFS::rename(
b3b6e05e
TL
3331 std::string_view old_dirname, std::string_view old_filename,
3332 std::string_view new_dirname, std::string_view new_filename)
7c673cae 3333{
11fdf7f2 3334 std::lock_guard l(lock);
7c673cae
FG
3335 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
3336 << " -> " << new_dirname << "/" << new_filename << dendl;
3337 map<string,DirRef>::iterator p = dir_map.find(old_dirname);
3338 if (p == dir_map.end()) {
3339 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
3340 return -ENOENT;
3341 }
3342 DirRef old_dir = p->second;
3343 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
3344 if (q == old_dir->file_map.end()) {
3345 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
3346 << ") file " << old_filename
3347 << " not found" << dendl;
3348 return -ENOENT;
3349 }
3350 FileRef file = q->second;
3351
3352 p = dir_map.find(new_dirname);
3353 if (p == dir_map.end()) {
3354 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
3355 return -ENOENT;
3356 }
3357 DirRef new_dir = p->second;
3358 q = new_dir->file_map.find(new_filename);
3359 if (q != new_dir->file_map.end()) {
3360 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
3361 << ") file " << new_filename
3362 << " already exists, unlinking" << dendl;
11fdf7f2 3363 ceph_assert(q->second != file);
7c673cae
FG
3364 log_t.op_dir_unlink(new_dirname, new_filename);
3365 _drop_link(q->second);
3366 }
3367
3368 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
3369 << " " << file->fnode << dendl;
3370
b3b6e05e
TL
3371 new_dir->file_map[string{new_filename}] = file;
3372 old_dir->file_map.erase(string{old_filename});
7c673cae
FG
3373
3374 log_t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
3375 log_t.op_dir_unlink(old_dirname, old_filename);
3376 return 0;
3377}
3378
b3b6e05e 3379int BlueFS::mkdir(std::string_view dirname)
7c673cae 3380{
11fdf7f2 3381 std::lock_guard l(lock);
7c673cae
FG
3382 dout(10) << __func__ << " " << dirname << dendl;
3383 map<string,DirRef>::iterator p = dir_map.find(dirname);
3384 if (p != dir_map.end()) {
3385 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
3386 return -EEXIST;
3387 }
b3b6e05e 3388 dir_map[string{dirname}] = ceph::make_ref<Dir>();
7c673cae
FG
3389 log_t.op_dir_create(dirname);
3390 return 0;
3391}
3392
b3b6e05e 3393int BlueFS::rmdir(std::string_view dirname)
7c673cae 3394{
11fdf7f2 3395 std::lock_guard l(lock);
7c673cae 3396 dout(10) << __func__ << " " << dirname << dendl;
b3b6e05e 3397 auto p = dir_map.find(dirname);
7c673cae
FG
3398 if (p == dir_map.end()) {
3399 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
3400 return -ENOENT;
3401 }
3402 DirRef dir = p->second;
3403 if (!dir->file_map.empty()) {
3404 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
3405 return -ENOTEMPTY;
3406 }
b3b6e05e 3407 dir_map.erase(string{dirname});
7c673cae
FG
3408 log_t.op_dir_remove(dirname);
3409 return 0;
3410}
3411
b3b6e05e 3412bool BlueFS::dir_exists(std::string_view dirname)
7c673cae 3413{
11fdf7f2 3414 std::lock_guard l(lock);
7c673cae
FG
3415 map<string,DirRef>::iterator p = dir_map.find(dirname);
3416 bool exists = p != dir_map.end();
3417 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
3418 return exists;
3419}
3420
b3b6e05e 3421int BlueFS::stat(std::string_view dirname, std::string_view filename,
7c673cae
FG
3422 uint64_t *size, utime_t *mtime)
3423{
11fdf7f2 3424 std::lock_guard l(lock);
7c673cae
FG
3425 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3426 map<string,DirRef>::iterator p = dir_map.find(dirname);
3427 if (p == dir_map.end()) {
3428 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3429 return -ENOENT;
3430 }
3431 DirRef dir = p->second;
3432 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3433 if (q == dir->file_map.end()) {
3434 dout(20) << __func__ << " dir " << dirname << " (" << dir
3435 << ") file " << filename
3436 << " not found" << dendl;
3437 return -ENOENT;
3438 }
3439 File *file = q->second.get();
3440 dout(10) << __func__ << " " << dirname << "/" << filename
3441 << " " << file->fnode << dendl;
3442 if (size)
3443 *size = file->fnode.size;
3444 if (mtime)
3445 *mtime = file->fnode.mtime;
3446 return 0;
3447}
3448
b3b6e05e 3449int BlueFS::lock_file(std::string_view dirname, std::string_view filename,
7c673cae
FG
3450 FileLock **plock)
3451{
11fdf7f2 3452 std::lock_guard l(lock);
7c673cae
FG
3453 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3454 map<string,DirRef>::iterator p = dir_map.find(dirname);
3455 if (p == dir_map.end()) {
3456 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3457 return -ENOENT;
3458 }
3459 DirRef dir = p->second;
b3b6e05e 3460 auto q = dir->file_map.find(filename);
9f95a23c 3461 FileRef file;
7c673cae
FG
3462 if (q == dir->file_map.end()) {
3463 dout(20) << __func__ << " dir " << dirname << " (" << dir
3464 << ") file " << filename
3465 << " not found, creating" << dendl;
9f95a23c 3466 file = ceph::make_ref<File>();
7c673cae
FG
3467 file->fnode.ino = ++ino_last;
3468 file->fnode.mtime = ceph_clock_now();
3469 file_map[ino_last] = file;
b3b6e05e 3470 dir->file_map[string{filename}] = file;
7c673cae
FG
3471 ++file->refs;
3472 log_t.op_file_update(file->fnode);
3473 log_t.op_dir_link(dirname, filename, file->fnode.ino);
3474 } else {
9f95a23c 3475 file = q->second;
7c673cae
FG
3476 if (file->locked) {
3477 dout(10) << __func__ << " already locked" << dendl;
11fdf7f2 3478 return -ENOLCK;
7c673cae
FG
3479 }
3480 }
3481 file->locked = true;
3482 *plock = new FileLock(file);
3483 dout(10) << __func__ << " locked " << file->fnode
3484 << " with " << *plock << dendl;
3485 return 0;
3486}
3487
3488int BlueFS::unlock_file(FileLock *fl)
3489{
11fdf7f2 3490 std::lock_guard l(lock);
7c673cae 3491 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
11fdf7f2 3492 ceph_assert(fl->file->locked);
7c673cae
FG
3493 fl->file->locked = false;
3494 delete fl;
3495 return 0;
3496}
3497
b3b6e05e 3498int BlueFS::readdir(std::string_view dirname, vector<string> *ls)
7c673cae 3499{
b3b6e05e
TL
3500 // dirname may contain a trailing /
3501 if (!dirname.empty() && dirname.back() == '/') {
3502 dirname.remove_suffix(1);
3503 }
11fdf7f2 3504 std::lock_guard l(lock);
7c673cae
FG
3505 dout(10) << __func__ << " " << dirname << dendl;
3506 if (dirname.empty()) {
3507 // list dirs
3508 ls->reserve(dir_map.size() + 2);
3509 for (auto& q : dir_map) {
3510 ls->push_back(q.first);
3511 }
3512 } else {
3513 // list files in dir
3514 map<string,DirRef>::iterator p = dir_map.find(dirname);
3515 if (p == dir_map.end()) {
3516 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3517 return -ENOENT;
3518 }
3519 DirRef dir = p->second;
3520 ls->reserve(dir->file_map.size() + 2);
3521 for (auto& q : dir->file_map) {
3522 ls->push_back(q.first);
3523 }
3524 }
3525 ls->push_back(".");
3526 ls->push_back("..");
3527 return 0;
3528}
3529
b3b6e05e 3530int BlueFS::unlink(std::string_view dirname, std::string_view filename)
7c673cae 3531{
11fdf7f2 3532 std::lock_guard l(lock);
7c673cae
FG
3533 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
3534 map<string,DirRef>::iterator p = dir_map.find(dirname);
3535 if (p == dir_map.end()) {
3536 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
3537 return -ENOENT;
3538 }
3539 DirRef dir = p->second;
3540 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3541 if (q == dir->file_map.end()) {
3542 dout(20) << __func__ << " file " << dirname << "/" << filename
3543 << " not found" << dendl;
3544 return -ENOENT;
3545 }
3546 FileRef file = q->second;
3547 if (file->locked) {
3548 dout(20) << __func__ << " file " << dirname << "/" << filename
3549 << " is locked" << dendl;
3550 return -EBUSY;
3551 }
b3b6e05e 3552 dir->file_map.erase(string{filename});
7c673cae
FG
3553 log_t.op_dir_unlink(dirname, filename);
3554 _drop_link(file);
3555 return 0;
3556}
d2e6a577
FG
3557
3558bool BlueFS::wal_is_rotational()
3559{
94b18763
FG
3560 if (bdev[BDEV_WAL]) {
3561 return bdev[BDEV_WAL]->is_rotational();
3562 } else if (bdev[BDEV_DB]) {
3563 return bdev[BDEV_DB]->is_rotational();
3564 }
3565 return bdev[BDEV_SLOW]->is_rotational();
d2e6a577 3566}
9f95a23c 3567
f6b5b4d7
TL
3568/*
3569 Algorithm.
3570 do_replay_recovery_read is used when bluefs log abruptly ends, but it seems that more data should be there.
3571 Idea is to search disk for definiton of extents that will be accompanied with bluefs log in future,
3572 and try if using it will produce healthy bluefs transaction.
3573 We encode already known bluefs log extents and search disk for these bytes.
3574 When we find it, we decode following bytes as extent.
3575 We read that whole extent and then check if merged with existing log part gives a proper bluefs transaction.
3576 */
3577int BlueFS::do_replay_recovery_read(FileReader *log_reader,
3578 size_t replay_pos,
3579 size_t read_offset,
3580 size_t read_len,
3581 bufferlist* bl) {
3582 dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos <<
3583 " needs 0x" << read_offset << "~" << read_len << std::dec << dendl;
3584
3585 bluefs_fnode_t& log_fnode = log_reader->file->fnode;
3586 bufferlist bin_extents;
f67539c2 3587 ::encode(log_fnode.extents, bin_extents);
f6b5b4d7
TL
3588 dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl;
3589
3590 // cannot process if too small to effectively search
3591 ceph_assert(bin_extents.length() >= 32);
3592 bufferlist last_32;
3593 last_32.substr_of(bin_extents, bin_extents.length() - 32, 32);
3594
3595 //read fixed part from replay_pos to end of bluefs_log extents
3596 bufferlist fixed;
3597 uint64_t e_off = 0;
3598 auto e = log_fnode.seek(replay_pos, &e_off);
3599 ceph_assert(e != log_fnode.extents.end());
3600 int r = bdev[e->bdev]->read(e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev],
3601 cct->_conf->bluefs_buffered_io);
3602 ceph_assert(r == 0);
3603 //capture dev of last good extent
3604 uint8_t last_e_dev = e->bdev;
3605 uint64_t last_e_off = e->offset;
3606 ++e;
3607 while (e != log_fnode.extents.end()) {
3608 r = bdev[e->bdev]->read(e->offset, e->length, &fixed, ioc[e->bdev],
3609 cct->_conf->bluefs_buffered_io);
3610 ceph_assert(r == 0);
3611 last_e_dev = e->bdev;
3612 ++e;
3613 }
3614 ceph_assert(replay_pos + fixed.length() == read_offset);
3615
3616 dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl;
3617
3618 struct compare {
3619 bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const {
3620 if (a.bdev < b.bdev) return true;
3621 if (a.offset < b.offset) return true;
3622 return a.length < b.length;
3623 }
3624 };
3625 std::set<bluefs_extent_t, compare> extents_rejected;
3626 for (int dcnt = 0; dcnt < 3; dcnt++) {
3627 uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV;
3628 if (bdev[dev] == nullptr) continue;
3629 dout(2) << __func__ << " processing " << get_device_name(dev) << dendl;
3630 interval_set<uint64_t> disk_regions;
3631 disk_regions.insert(0, bdev[dev]->get_size());
3632 for (auto f : file_map) {
3633 auto& e = f.second->fnode.extents;
3634 for (auto& p : e) {
3635 if (p.bdev == dev) {
3636 disk_regions.erase(p.offset, p.length);
3637 }
3638 }
3639 }
3640 size_t disk_regions_count = disk_regions.num_intervals();
3641 dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl;
3642
3643 auto reg = disk_regions.lower_bound(last_e_off);
3644 //for all except first, start from beginning
3645 last_e_off = 0;
3646 if (reg == disk_regions.end()) {
3647 reg = disk_regions.begin();
3648 }
3649 const uint64_t chunk_size = 4 * 1024 * 1024;
3650 const uint64_t page_size = 4096;
3651 const uint64_t max_extent_size = 16;
3652 uint64_t overlay_size = last_32.length() + max_extent_size;
3653 for (size_t i = 0; i < disk_regions_count; reg++, i++) {
3654 if (reg == disk_regions.end()) {
3655 reg = disk_regions.begin();
3656 }
3657 uint64_t pos = reg.get_start();
3658 uint64_t len = reg.get_len();
3659
3660 std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]};
3661 char* raw_data = raw_data_p.get();
3662 memset(raw_data, 0, page_size);
3663
3664 while (len > last_32.length()) {
3665 uint64_t chunk_len = len > chunk_size ? chunk_size : len;
3666 dout(5) << __func__ << " read "
3667 << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len << std::dec << dendl;
3668 r = bdev[dev]->read_random(pos, chunk_len, raw_data + page_size, cct->_conf->bluefs_buffered_io);
3669 ceph_assert(r == 0);
3670
3671 //search for fixed_last_32
3672 char* chunk_b = raw_data + page_size;
3673 char* chunk_e = chunk_b + chunk_len;
3674
3675 char* search_b = chunk_b - overlay_size;
3676 char* search_e = chunk_e;
3677
3678 for (char* sp = search_b; ; sp += last_32.length()) {
3679 sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length());
3680 if (sp == nullptr) {
3681 break;
3682 }
3683
3684 char* n = sp + last_32.length();
3685 dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl;
3686 bufferlist test;
3687 test.append(n, std::min<size_t>(max_extent_size, chunk_e - n));
3688 bluefs_extent_t ne;
3689 try {
3690 bufferlist::const_iterator p = test.begin();
f67539c2 3691 ::decode(ne, p);
f6b5b4d7
TL
3692 } catch (buffer::error& e) {
3693 continue;
3694 }
3695 if (extents_rejected.count(ne) != 0) {
3696 dout(5) << __func__ << " extent " << ne << " already refected" <<dendl;
3697 continue;
3698 }
3699 //insert as rejected already. if we succeed, it wouldn't make difference.
3700 extents_rejected.insert(ne);
3701
3702 if (ne.bdev >= MAX_BDEV ||
3703 bdev[ne.bdev] == nullptr ||
3704 ne.length > 16 * 1024 * 1024 ||
3705 (ne.length & 4095) != 0 ||
3706 ne.offset + ne.length > bdev[ne.bdev]->get_size() ||
3707 (ne.offset & 4095) != 0) {
3708 dout(5) << __func__ << " refusing extent " << ne << dendl;
3709 continue;
3710 }
3711 dout(5) << __func__ << " checking extent " << ne << dendl;
3712
3713 //read candidate extent - whole
3714 bufferlist candidate;
3715 candidate.append(fixed);
3716 r = bdev[ne.bdev]->read(ne.offset, ne.length, &candidate, ioc[ne.bdev],
3717 cct->_conf->bluefs_buffered_io);
3718 ceph_assert(r == 0);
3719
3720 //check if transaction & crc is ok
3721 bluefs_transaction_t t;
3722 try {
f67539c2
TL
3723 bufferlist::const_iterator p = candidate.begin();
3724 ::decode(t, p);
f6b5b4d7
TL
3725 }
3726 catch (buffer::error& e) {
3727 dout(5) << __func__ << " failed match" << dendl;
3728 continue;
3729 }
3730
3731 //success, it seems a probable candidate
3732 uint64_t l = std::min<uint64_t>(ne.length, read_len);
3733 //trim to required size
3734 bufferlist requested_read;
3735 requested_read.substr_of(candidate, fixed.length(), l);
3736 bl->append(requested_read);
3737 dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl;
3738 log_fnode.append_extent(ne);
3739 log_fnode.recalc_allocated();
3740 log_reader->buf.pos += l;
3741 return l;
3742 }
3743 //save overlay for next search
3744 memcpy(search_b, chunk_e - overlay_size, overlay_size);
3745 pos += chunk_len;
3746 len -= chunk_len;
3747 }
3748 }
3749 }
3750 return 0;
3751}
3752
f67539c2 3753size_t BlueFS::probe_alloc_avail(int dev, uint64_t alloc_size)
9f95a23c 3754{
f67539c2
TL
3755 size_t total = 0;
3756 auto iterated_allocation = [&](size_t off, size_t len) {
3757 //only count in size that is alloc_size aligned
3758 size_t dist_to_alignment;
3759 size_t offset_in_block = off & (alloc_size - 1);
3760 if (offset_in_block == 0)
3761 dist_to_alignment = 0;
3762 else
3763 dist_to_alignment = alloc_size - offset_in_block;
3764 if (dist_to_alignment >= len)
3765 return;
3766 len -= dist_to_alignment;
3767 total += p2align(len, alloc_size);
3768 };
3769 if (alloc[dev]) {
3770 alloc[dev]->dump(iterated_allocation);
9f95a23c 3771 }
f67539c2 3772 return total;
9f95a23c 3773}
9f95a23c
TL
3774// ===============================================
3775// OriginalVolumeSelector
3776
f6b5b4d7
TL
3777void* OriginalVolumeSelector::get_hint_for_log() const {
3778 return reinterpret_cast<void*>(BlueFS::BDEV_WAL);
9f95a23c 3779}
b3b6e05e 3780void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
9f95a23c
TL
3781 uint8_t res = BlueFS::BDEV_DB;
3782 if (dirname.length() > 5) {
3783 // the "db.slow" and "db.wal" directory names are hard-coded at
3784 // match up with bluestore. the slow device is always the second
3785 // one (when a dedicated block.db device is present and used at
3786 // bdev 0). the wal device is always last.
3787 if (boost::algorithm::ends_with(dirname, ".slow")) {
3788 res = BlueFS::BDEV_SLOW;
3789 }
3790 else if (boost::algorithm::ends_with(dirname, ".wal")) {
3791 res = BlueFS::BDEV_WAL;
3792 }
3793 }
3794 return reinterpret_cast<void*>(res);
3795}
3796
3797uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
3798{
3799 return (uint8_t)(reinterpret_cast<uint64_t>(hint));
3800}
3801
3802void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
3803{
3804 res.emplace_back(base, db_total);
3805 res.emplace_back(base + ".slow", slow_total);
3806}
3807
3808#undef dout_prefix
3809#define dout_prefix *_dout << "OriginalVolumeSelector: "
3810
3811void OriginalVolumeSelector::dump(ostream& sout) {
3812 sout<< "wal_total:" << wal_total
3813 << ", db_total:" << db_total
3814 << ", slow_total:" << slow_total
3815 << std::endl;
3816}
f67539c2
TL
3817
3818// ===============================================
3819// FitToFastVolumeSelector
3820
3821void FitToFastVolumeSelector::get_paths(const std::string& base, paths& res) const {
3822 res.emplace_back(base, 1); // size of the last db_path has no effect
3823}