]> git.proxmox.com Git - ceph.git/blame - ceph/src/os/bluestore/BlueFS.cc
bump version to 18.2.2-pve1
[ceph.git] / ceph / src / os / bluestore / BlueFS.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
39ae355f 3#include <chrono>
7c673cae 4#include "boost/algorithm/string.hpp"
9f95a23c 5#include "bluestore_common.h"
7c673cae
FG
6#include "BlueFS.h"
7
8#include "common/debug.h"
9#include "common/errno.h"
10#include "common/perf_counters.h"
7c673cae 11#include "Allocator.h"
11fdf7f2 12#include "include/ceph_assert.h"
eafe8130 13#include "common/admin_socket.h"
7c673cae
FG
14
15#define dout_context cct
16#define dout_subsys ceph_subsys_bluefs
17#undef dout_prefix
18#define dout_prefix *_dout << "bluefs "
9f95a23c 19using TOPNSPC::common::cmd_getval;
f67539c2
TL
20
21using std::byte;
22using std::list;
23using std::make_pair;
24using std::map;
25using std::ostream;
26using std::pair;
27using std::set;
28using std::string;
29using std::to_string;
30using std::vector;
39ae355f 31using std::chrono::duration;
39ae355f 32using std::chrono::seconds;
f67539c2
TL
33
34using ceph::bufferlist;
35using ceph::decode;
36using ceph::encode;
37using ceph::Formatter;
38
39
7c673cae
FG
40MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::File, bluefs_file, bluefs);
41MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::Dir, bluefs_dir, bluefs);
f91f0fd5 42MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileWriter, bluefs_file_writer, bluefs_file_writer);
7c673cae 43MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReaderBuffer,
f91f0fd5
TL
44 bluefs_file_reader_buffer, bluefs_file_reader);
45MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileReader, bluefs_file_reader, bluefs_file_reader);
7c673cae
FG
46MEMPOOL_DEFINE_OBJECT_FACTORY(BlueFS::FileLock, bluefs_file_lock, bluefs);
47
11fdf7f2
TL
48static void wal_discard_cb(void *priv, void* priv2) {
49 BlueFS *bluefs = static_cast<BlueFS*>(priv);
50 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
51 bluefs->handle_discard(BlueFS::BDEV_WAL, *tmp);
52}
53
54static void db_discard_cb(void *priv, void* priv2) {
55 BlueFS *bluefs = static_cast<BlueFS*>(priv);
56 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
57 bluefs->handle_discard(BlueFS::BDEV_DB, *tmp);
58}
59
60static void slow_discard_cb(void *priv, void* priv2) {
61 BlueFS *bluefs = static_cast<BlueFS*>(priv);
62 interval_set<uint64_t> *tmp = static_cast<interval_set<uint64_t>*>(priv2);
63 bluefs->handle_discard(BlueFS::BDEV_SLOW, *tmp);
64}
7c673cae 65
eafe8130
TL
66class BlueFS::SocketHook : public AdminSocketHook {
67 BlueFS* bluefs;
68public:
69 static BlueFS::SocketHook* create(BlueFS* bluefs)
70 {
71 BlueFS::SocketHook* hook = nullptr;
72 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
73 if (admin_socket) {
74 hook = new BlueFS::SocketHook(bluefs);
f67539c2 75 int r = admin_socket->register_command("bluestore bluefs device info "
eafe8130
TL
76 "name=alloc_size,type=CephInt,req=false",
77 hook,
f67539c2
TL
78 "Shows space report for bluefs devices. "
79 "This also includes an estimation for space "
80 "available to bluefs at main device. "
81 "alloc_size, if set, specifies the custom bluefs "
82 "allocation unit size for the estimation above.");
eafe8130
TL
83 if (r != 0) {
84 ldout(bluefs->cct, 1) << __func__ << " cannot register SocketHook" << dendl;
85 delete hook;
86 hook = nullptr;
9f95a23c 87 } else {
f6b5b4d7 88 r = admin_socket->register_command("bluefs stats",
9f95a23c
TL
89 hook,
90 "Dump internal statistics for bluefs."
91 "");
92 ceph_assert(r == 0);
f67539c2
TL
93 r = admin_socket->register_command("bluefs files list", hook,
94 "print files in bluefs");
95 ceph_assert(r == 0);
cd265ab1
TL
96 r = admin_socket->register_command("bluefs debug_inject_read_zeros", hook,
97 "Injects 8K zeros into next BlueFS read. Debug only.");
98 ceph_assert(r == 0);
eafe8130
TL
99 }
100 }
101 return hook;
102 }
103
104 ~SocketHook() {
105 AdminSocket* admin_socket = bluefs->cct->get_admin_socket();
9f95a23c 106 admin_socket->unregister_commands(this);
eafe8130
TL
107 }
108private:
109 SocketHook(BlueFS* bluefs) :
110 bluefs(bluefs) {}
9f95a23c 111 int call(std::string_view command, const cmdmap_t& cmdmap,
39ae355f 112 const bufferlist&,
9f95a23c
TL
113 Formatter *f,
114 std::ostream& errss,
115 bufferlist& out) override {
f67539c2 116 if (command == "bluestore bluefs device info") {
9f95a23c
TL
117 int64_t alloc_size = 0;
118 cmd_getval(cmdmap, "alloc_size", alloc_size);
119 if ((alloc_size & (alloc_size - 1)) != 0) {
120 errss << "Invalid allocation size:'" << alloc_size << std::endl;
121 return -EINVAL;
122 }
123 if (alloc_size == 0)
f67539c2
TL
124 alloc_size = bluefs->cct->_conf->bluefs_shared_alloc_size;
125 f->open_object_section("bluefs_device_info");
9f95a23c
TL
126 for (unsigned dev = BDEV_WAL; dev <= BDEV_SLOW; dev++) {
127 if (bluefs->bdev[dev]) {
128 f->open_object_section("dev");
129 f->dump_string("device", bluefs->get_device_name(dev));
130 ceph_assert(bluefs->alloc[dev]);
f67539c2
TL
131 auto total = bluefs->get_total(dev);
132 auto free = bluefs->get_free(dev);
133 auto used = bluefs->get_used(dev);
134
135 f->dump_int("total", total);
136 f->dump_int("free", free);
137 f->dump_int("bluefs_used", used);
138 if (bluefs->is_shared_alloc(dev)) {
139 size_t avail = bluefs->probe_alloc_avail(dev, alloc_size);
140 f->dump_int("bluefs max available", avail);
141 }
142 f->close_section();
143 }
eafe8130 144 }
f67539c2 145
9f95a23c
TL
146 f->close_section();
147 } else if (command == "bluefs stats") {
148 std::stringstream ss;
149 bluefs->dump_block_extents(ss);
150 bluefs->dump_volume_selector(ss);
eafe8130 151 out.append(ss);
f67539c2
TL
152 } else if (command == "bluefs files list") {
153 const char* devnames[3] = {"wal","db","slow"};
20effc67 154 std::lock_guard l(bluefs->nodes.lock);
f67539c2 155 f->open_array_section("files");
20effc67 156 for (auto &d : bluefs->nodes.dir_map) {
f67539c2
TL
157 std::string dir = d.first;
158 for (auto &r : d.second->file_map) {
159 f->open_object_section("file");
160 f->dump_string("name", (dir + "/" + r.first).c_str());
161 std::vector<size_t> sizes;
162 sizes.resize(bluefs->bdev.size());
163 for(auto& i : r.second->fnode.extents) {
164 sizes[i.bdev] += i.length;
165 }
166 for (size_t i = 0; i < sizes.size(); i++) {
167 if (sizes[i]>0) {
168 if (i < sizeof(devnames) / sizeof(*devnames))
169 f->dump_int(devnames[i], sizes[i]);
170 else
171 f->dump_int(("dev-"+to_string(i)).c_str(), sizes[i]);
172 }
173 }
174 f->close_section();
175 }
176 }
177 f->close_section();
178 f->flush(out);
cd265ab1
TL
179 } else if (command == "bluefs debug_inject_read_zeros") {
180 bluefs->inject_read_zeros++;
9f95a23c
TL
181 } else {
182 errss << "Invalid command" << std::endl;
183 return -ENOSYS;
eafe8130 184 }
9f95a23c
TL
185 return 0;
186 }
eafe8130
TL
187};
188
7c673cae
FG
189BlueFS::BlueFS(CephContext* cct)
190 : cct(cct),
191 bdev(MAX_BDEV),
192 ioc(MAX_BDEV),
f67539c2
TL
193 block_reserved(MAX_BDEV),
194 alloc(MAX_BDEV),
20effc67 195 alloc_size(MAX_BDEV, 0)
7c673cae 196{
20effc67 197 dirty.pending_release.resize(MAX_BDEV);
11fdf7f2
TL
198 discard_cb[BDEV_WAL] = wal_discard_cb;
199 discard_cb[BDEV_DB] = db_discard_cb;
200 discard_cb[BDEV_SLOW] = slow_discard_cb;
eafe8130 201 asok_hook = SocketHook::create(this);
7c673cae
FG
202}
203
204BlueFS::~BlueFS()
205{
eafe8130 206 delete asok_hook;
7c673cae
FG
207 for (auto p : ioc) {
208 if (p)
209 p->aio_wait();
210 }
211 for (auto p : bdev) {
212 if (p) {
213 p->close();
214 delete p;
215 }
216 }
217 for (auto p : ioc) {
218 delete p;
219 }
220}
221
222void BlueFS::_init_logger()
223{
224 PerfCountersBuilder b(cct, "bluefs",
225 l_bluefs_first, l_bluefs_last);
7c673cae
FG
226 b.add_u64(l_bluefs_db_total_bytes, "db_total_bytes",
227 "Total bytes (main db device)",
11fdf7f2 228 "b", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
229 b.add_u64(l_bluefs_db_used_bytes, "db_used_bytes",
230 "Used bytes (main db device)",
11fdf7f2 231 "u", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
232 b.add_u64(l_bluefs_wal_total_bytes, "wal_total_bytes",
233 "Total bytes (wal device)",
11fdf7f2 234 "walb", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
235 b.add_u64(l_bluefs_wal_used_bytes, "wal_used_bytes",
236 "Used bytes (wal device)",
11fdf7f2 237 "walu", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
238 b.add_u64(l_bluefs_slow_total_bytes, "slow_total_bytes",
239 "Total bytes (slow device)",
11fdf7f2 240 "slob", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
241 b.add_u64(l_bluefs_slow_used_bytes, "slow_used_bytes",
242 "Used bytes (slow device)",
11fdf7f2 243 "slou", PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
7c673cae
FG
244 b.add_u64(l_bluefs_num_files, "num_files", "File count",
245 "f", PerfCountersBuilder::PRIO_USEFUL);
246 b.add_u64(l_bluefs_log_bytes, "log_bytes", "Size of the metadata log",
11fdf7f2 247 "jlen", PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
7c673cae
FG
248 b.add_u64_counter(l_bluefs_log_compactions, "log_compactions",
249 "Compactions of the metadata log");
1e59de90
TL
250 b.add_u64_counter(l_bluefs_log_write_count, "log_write_count",
251 "Write op count to the metadata log");
7c673cae 252 b.add_u64_counter(l_bluefs_logged_bytes, "logged_bytes",
20effc67
TL
253 "Bytes written to the metadata log",
254 "j",
11fdf7f2 255 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
7c673cae
FG
256 b.add_u64_counter(l_bluefs_files_written_wal, "files_written_wal",
257 "Files written to WAL");
258 b.add_u64_counter(l_bluefs_files_written_sst, "files_written_sst",
259 "Files written to SSTs");
1e59de90
TL
260 b.add_u64_counter(l_bluefs_write_count_wal, "write_count_wal",
261 "Write op count to WAL");
262 b.add_u64_counter(l_bluefs_write_count_sst, "write_count_sst",
263 "Write op count to SSTs");
7c673cae 264 b.add_u64_counter(l_bluefs_bytes_written_wal, "bytes_written_wal",
20effc67
TL
265 "Bytes written to WAL",
266 "walb",
7c673cae
FG
267 PerfCountersBuilder::PRIO_CRITICAL);
268 b.add_u64_counter(l_bluefs_bytes_written_sst, "bytes_written_sst",
20effc67
TL
269 "Bytes written to SSTs",
270 "sstb",
11fdf7f2
TL
271 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
272 b.add_u64_counter(l_bluefs_bytes_written_slow, "bytes_written_slow",
20effc67
TL
273 "Bytes written to WAL/SSTs at slow device",
274 "slwb",
275 PerfCountersBuilder::PRIO_CRITICAL, unit_t(UNIT_BYTES));
11fdf7f2 276 b.add_u64_counter(l_bluefs_max_bytes_wal, "max_bytes_wal",
20effc67
TL
277 "Maximum bytes allocated from WAL",
278 "mxwb",
279 PerfCountersBuilder::PRIO_INTERESTING,
280 unit_t(UNIT_BYTES));
11fdf7f2 281 b.add_u64_counter(l_bluefs_max_bytes_db, "max_bytes_db",
20effc67
TL
282 "Maximum bytes allocated from DB",
283 "mxdb",
284 PerfCountersBuilder::PRIO_INTERESTING,
285 unit_t(UNIT_BYTES));
11fdf7f2 286 b.add_u64_counter(l_bluefs_max_bytes_slow, "max_bytes_slow",
20effc67
TL
287 "Maximum bytes allocated from SLOW",
288 "mxwb",
289 PerfCountersBuilder::PRIO_INTERESTING,
290 unit_t(UNIT_BYTES));
291 b.add_u64_counter(l_bluefs_main_alloc_unit, "alloc_unit_main",
292 "Allocation unit size (in bytes) for primary/shared device",
293 "aumb",
294 PerfCountersBuilder::PRIO_CRITICAL,
295 unit_t(UNIT_BYTES));
296 b.add_u64_counter(l_bluefs_db_alloc_unit, "alloc_unit_db",
297 "Allocation unit size (in bytes) for standalone DB device",
298 "audb",
299 PerfCountersBuilder::PRIO_CRITICAL,
300 unit_t(UNIT_BYTES));
301 b.add_u64_counter(l_bluefs_wal_alloc_unit, "alloc_unit_wal",
302 "Allocation unit size (in bytes) for standalone WAL device",
303 "auwb",
304 PerfCountersBuilder::PRIO_CRITICAL,
305 unit_t(UNIT_BYTES));
494da23a 306 b.add_u64_counter(l_bluefs_read_random_count, "read_random_count",
20effc67
TL
307 "random read requests processed",
308 NULL,
309 PerfCountersBuilder::PRIO_USEFUL);
494da23a 310 b.add_u64_counter(l_bluefs_read_random_bytes, "read_random_bytes",
20effc67
TL
311 "Bytes requested in random read mode",
312 NULL,
494da23a
TL
313 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
314 b.add_u64_counter(l_bluefs_read_random_disk_count, "read_random_disk_count",
20effc67
TL
315 "random reads requests going to disk",
316 NULL,
317 PerfCountersBuilder::PRIO_USEFUL);
494da23a 318 b.add_u64_counter(l_bluefs_read_random_disk_bytes, "read_random_disk_bytes",
20effc67
TL
319 "Bytes read from disk in random read mode",
320 "rrb",
321 PerfCountersBuilder::PRIO_INTERESTING,
322 unit_t(UNIT_BYTES));
323 b.add_u64_counter(l_bluefs_read_random_disk_bytes_wal, "read_random_disk_bytes_wal",
324 "random reads requests going to WAL disk",
325 NULL,
326 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
327 b.add_u64_counter(l_bluefs_read_random_disk_bytes_db, "read_random_disk_bytes_db",
328 "random reads requests going to DB disk",
329 NULL,
494da23a 330 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
20effc67
TL
331 b.add_u64_counter(l_bluefs_read_random_disk_bytes_slow, "read_random_disk_bytes_slow",
332 "random reads requests going to main disk",
333 "rrsb",
334 PerfCountersBuilder::PRIO_INTERESTING,
335 unit_t(UNIT_BYTES));
494da23a 336 b.add_u64_counter(l_bluefs_read_random_buffer_count, "read_random_buffer_count",
20effc67
TL
337 "random read requests processed using prefetch buffer",
338 NULL,
339 PerfCountersBuilder::PRIO_USEFUL);
494da23a 340 b.add_u64_counter(l_bluefs_read_random_buffer_bytes, "read_random_buffer_bytes",
20effc67
TL
341 "Bytes read from prefetch buffer in random read mode",
342 NULL,
494da23a 343 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
494da23a 344 b.add_u64_counter(l_bluefs_read_count, "read_count",
20effc67
TL
345 "buffered read requests processed",
346 NULL,
347 PerfCountersBuilder::PRIO_USEFUL);
494da23a 348 b.add_u64_counter(l_bluefs_read_bytes, "read_bytes",
20effc67
TL
349 "Bytes requested in buffered read mode",
350 NULL,
494da23a 351 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
20effc67
TL
352 b.add_u64_counter(l_bluefs_read_disk_count, "read_disk_count",
353 "buffered reads requests going to disk",
354 NULL,
355 PerfCountersBuilder::PRIO_USEFUL);
356 b.add_u64_counter(l_bluefs_read_disk_bytes, "read_disk_bytes",
357 "Bytes read in buffered mode from disk",
358 "rb",
359 PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
360 b.add_u64_counter(l_bluefs_read_disk_bytes_wal, "read_disk_bytes_wal",
361 "reads requests going to WAL disk",
362 NULL,
363 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
364 b.add_u64_counter(l_bluefs_read_disk_bytes_db, "read_disk_bytes_db",
365 "reads requests going to DB disk",
366 NULL,
367 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
368 b.add_u64_counter(l_bluefs_read_disk_bytes_slow, "read_disk_bytes_slow",
369 "reads requests going to main disk",
370 "rsb",
371 PerfCountersBuilder::PRIO_INTERESTING, unit_t(UNIT_BYTES));
494da23a 372 b.add_u64_counter(l_bluefs_read_prefetch_count, "read_prefetch_count",
20effc67
TL
373 "prefetch read requests processed",
374 NULL,
375 PerfCountersBuilder::PRIO_USEFUL);
494da23a 376 b.add_u64_counter(l_bluefs_read_prefetch_bytes, "read_prefetch_bytes",
20effc67
TL
377 "Bytes requested in prefetch read mode",
378 NULL,
494da23a 379 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
1e59de90
TL
380 b.add_u64_counter(l_bluefs_write_count, "write_count",
381 "Write requests processed");
382 b.add_u64_counter(l_bluefs_write_disk_count, "write_disk_count",
383 "Write requests sent to disk");
384 b.add_u64_counter(l_bluefs_write_bytes, "write_bytes",
385 "Bytes written", NULL,
386 PerfCountersBuilder::PRIO_USEFUL, unit_t(UNIT_BYTES));
39ae355f
TL
387 b.add_time_avg (l_bluefs_compaction_lat, "compact_lat",
388 "Average bluefs log compaction latency",
389 "c__t",
390 PerfCountersBuilder::PRIO_INTERESTING);
391 b.add_time_avg (l_bluefs_compaction_lock_lat, "compact_lock_lat",
392 "Average lock duration while compacting bluefs log",
393 "c_lt",
394 PerfCountersBuilder::PRIO_INTERESTING);
395 b.add_u64_counter(l_bluefs_alloc_shared_dev_fallbacks, "alloc_slow_fallback",
396 "Amount of allocations that required fallback to "
397 " slow/shared device",
398 "asdf",
399 PerfCountersBuilder::PRIO_USEFUL);
400 b.add_u64_counter(l_bluefs_alloc_shared_size_fallbacks, "alloc_slow_size_fallback",
401 "Amount of allocations that required fallback to shared device's "
402 "regular unit size",
403 "assf",
404 PerfCountersBuilder::PRIO_USEFUL);
cd265ab1
TL
405 b.add_u64(l_bluefs_read_zeros_candidate, "read_zeros_candidate",
406 "How many times bluefs read found page with all 0s");
407 b.add_u64(l_bluefs_read_zeros_errors, "read_zeros_errors",
408 "How many times bluefs read found transient page with all 0s");
494da23a 409
7c673cae
FG
410 logger = b.create_perf_counters();
411 cct->get_perfcounters_collection()->add(logger);
412}
413
414void BlueFS::_shutdown_logger()
415{
416 cct->get_perfcounters_collection()->remove(logger);
417 delete logger;
418}
419
420void BlueFS::_update_logger_stats()
421{
7c673cae 422 if (alloc[BDEV_WAL]) {
f67539c2
TL
423 logger->set(l_bluefs_wal_total_bytes, _get_total(BDEV_WAL));
424 logger->set(l_bluefs_wal_used_bytes, _get_used(BDEV_WAL));
7c673cae
FG
425 }
426 if (alloc[BDEV_DB]) {
f67539c2
TL
427 logger->set(l_bluefs_db_total_bytes, _get_total(BDEV_DB));
428 logger->set(l_bluefs_db_used_bytes, _get_used(BDEV_DB));
7c673cae
FG
429 }
430 if (alloc[BDEV_SLOW]) {
f67539c2
TL
431 logger->set(l_bluefs_slow_total_bytes, _get_total(BDEV_SLOW));
432 logger->set(l_bluefs_slow_used_bytes, _get_used(BDEV_SLOW));
7c673cae
FG
433 }
434}
435
11fdf7f2 436int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
f67539c2
TL
437 uint64_t reserved,
438 bluefs_shared_alloc_context_t* _shared_alloc)
7c673cae 439{
f67539c2
TL
440 dout(10) << __func__ << " bdev " << id << " path " << path << " "
441 << reserved << dendl;
11fdf7f2
TL
442 ceph_assert(id < bdev.size());
443 ceph_assert(bdev[id] == NULL);
444 BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
445 discard_cb[id], static_cast<void*>(this));
f67539c2
TL
446 block_reserved[id] = reserved;
447 if (_shared_alloc) {
11fdf7f2
TL
448 b->set_no_exclusive_lock();
449 }
7c673cae
FG
450 int r = b->open(path);
451 if (r < 0) {
452 delete b;
453 return r;
454 }
11fdf7f2 455 if (trim) {
1e59de90
TL
456 interval_set<uint64_t> whole_device;
457 whole_device.insert(0, b->get_size());
458 b->try_discard(whole_device, false);
11fdf7f2
TL
459 }
460
7c673cae 461 dout(1) << __func__ << " bdev " << id << " path " << path
1adf2230 462 << " size " << byte_u_t(b->get_size()) << dendl;
7c673cae
FG
463 bdev[id] = b;
464 ioc[id] = new IOContext(cct, NULL);
f67539c2
TL
465 if (_shared_alloc) {
466 ceph_assert(!shared_alloc);
467 shared_alloc = _shared_alloc;
468 alloc[id] = shared_alloc->a;
469 shared_alloc_id = id;
470 }
7c673cae
FG
471 return 0;
472}
473
474bool BlueFS::bdev_support_label(unsigned id)
475{
11fdf7f2
TL
476 ceph_assert(id < bdev.size());
477 ceph_assert(bdev[id]);
7c673cae
FG
478 return bdev[id]->supported_bdev_label();
479}
480
f67539c2 481uint64_t BlueFS::get_block_device_size(unsigned id) const
7c673cae
FG
482{
483 if (id < bdev.size() && bdev[id])
484 return bdev[id]->get_size();
485 return 0;
486}
487
f67539c2 488void BlueFS::handle_discard(unsigned id, interval_set<uint64_t>& to_release)
7c673cae 489{
f67539c2
TL
490 dout(10) << __func__ << " bdev " << id << dendl;
491 ceph_assert(alloc[id]);
492 alloc[id]->release(to_release);
493 if (is_shared_alloc(id)) {
494 shared_alloc->bluefs_used -= to_release.size();
7c673cae 495 }
7c673cae
FG
496}
497
f67539c2 498uint64_t BlueFS::get_used()
7c673cae 499{
f67539c2
TL
500 uint64_t used = 0;
501 for (unsigned id = 0; id < MAX_BDEV; ++id) {
502 used += _get_used(id);
7c673cae 503 }
f67539c2
TL
504 return used;
505}
7c673cae 506
f67539c2
TL
507uint64_t BlueFS::_get_used(unsigned id) const
508{
509 uint64_t used = 0;
510 if (!alloc[id])
511 return 0;
9f95a23c 512
f67539c2
TL
513 if (is_shared_alloc(id)) {
514 used = shared_alloc->bluefs_used;
515 } else {
516 used = _get_total(id) - alloc[id]->get_free();
9f95a23c 517 }
f67539c2 518 return used;
7c673cae
FG
519}
520
f67539c2 521uint64_t BlueFS::get_used(unsigned id)
7c673cae 522{
f67539c2 523 ceph_assert(id < alloc.size());
11fdf7f2 524 ceph_assert(alloc[id]);
f67539c2 525 return _get_used(id);
11fdf7f2
TL
526}
527
f67539c2 528uint64_t BlueFS::_get_total(unsigned id) const
11fdf7f2 529{
f67539c2
TL
530 ceph_assert(id < bdev.size());
531 ceph_assert(id < block_reserved.size());
532 return get_block_device_size(id) - block_reserved[id];
7c673cae
FG
533}
534
535uint64_t BlueFS::get_total(unsigned id)
536{
f67539c2 537 return _get_total(id);
7c673cae
FG
538}
539
540uint64_t BlueFS::get_free(unsigned id)
541{
11fdf7f2 542 ceph_assert(id < alloc.size());
7c673cae
FG
543 return alloc[id]->get_free();
544}
545
546void BlueFS::dump_perf_counters(Formatter *f)
547{
548 f->open_object_section("bluefs_perf_counters");
1e59de90 549 logger->dump_formatted(f, false, false);
7c673cae
FG
550 f->close_section();
551}
552
3efd9988
FG
553void BlueFS::dump_block_extents(ostream& out)
554{
555 for (unsigned i = 0; i < MAX_BDEV; ++i) {
556 if (!bdev[i]) {
557 continue;
558 }
f67539c2 559 auto total = get_total(i);
11fdf7f2 560 auto free = get_free(i);
1911f103 561
f67539c2
TL
562 out << i << " : device size 0x" << std::hex << total
563 << " : using 0x" << total - free
564 << std::dec << "(" << byte_u_t(total - free) << ")";
1911f103 565 out << "\n";
3efd9988
FG
566 }
567}
7c673cae 568
1e59de90
TL
569void BlueFS::foreach_block_extents(
570 unsigned id,
571 std::function<void(uint64_t, uint32_t)> fn)
7c673cae 572{
20effc67 573 std::lock_guard nl(nodes.lock);
7c673cae 574 dout(10) << __func__ << " bdev " << id << dendl;
f67539c2 575 ceph_assert(id < alloc.size());
20effc67 576 for (auto& p : nodes.file_map) {
f67539c2
TL
577 for (auto& q : p.second->fnode.extents) {
578 if (q.bdev == id) {
1e59de90 579 fn(q.offset, q.length);
f67539c2
TL
580 }
581 }
582 }
7c673cae
FG
583}
584
9f95a23c 585int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
7c673cae 586{
7c673cae
FG
587 dout(1) << __func__
588 << " osd_uuid " << osd_uuid
589 << dendl;
590
9f95a23c
TL
591 // set volume selector if not provided before/outside
592 if (vselector == nullptr) {
593 vselector.reset(
594 new OriginalVolumeSelector(
595 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
596 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
597 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
598 }
599
7c673cae 600 _init_logger();
20effc67 601 _init_alloc();
7c673cae 602
39ae355f 603 super.version = 0;
7c673cae
FG
604 super.block_size = bdev[BDEV_DB]->get_block_size();
605 super.osd_uuid = osd_uuid;
606 super.uuid.generate_random();
607 dout(1) << __func__ << " uuid " << super.uuid << dendl;
608
609 // init log
9f95a23c 610 FileRef log_file = ceph::make_ref<File>();
7c673cae 611 log_file->fnode.ino = 1;
f6b5b4d7 612 log_file->vselector_hint = vselector->get_hint_for_log();
7c673cae 613 int r = _allocate(
9f95a23c 614 vselector->select_prefer_bdev(log_file->vselector_hint),
7c673cae 615 cct->_conf->bluefs_max_log_runway,
39ae355f 616 0,
94b18763 617 &log_file->fnode);
9f95a23c 618 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
11fdf7f2 619 ceph_assert(r == 0);
20effc67 620 log.writer = _create_writer(log_file);
7c673cae
FG
621
622 // initial txn
20effc67
TL
623 ceph_assert(log.seq_live == 1);
624 log.t.seq = 1;
625 log.t.op_init();
626 _flush_and_sync_log_LD();
7c673cae
FG
627
628 // write supers
629 super.log_fnode = log_file->fnode;
9f95a23c 630 super.memorized_layout = layout;
11fdf7f2 631 _write_super(BDEV_DB);
20effc67 632 _flush_bdev();
7c673cae
FG
633
634 // clean up
635 super = bluefs_super_t();
20effc67
TL
636 _close_writer(log.writer);
637 log.writer = NULL;
9f95a23c 638 vselector.reset(nullptr);
7c673cae
FG
639 _stop_alloc();
640 _shutdown_logger();
f67539c2
TL
641 if (shared_alloc) {
642 ceph_assert(shared_alloc->need_init);
643 shared_alloc->need_init = false;
644 }
7c673cae
FG
645
646 dout(10) << __func__ << " success" << dendl;
647 return 0;
648}
649
650void BlueFS::_init_alloc()
651{
652 dout(20) << __func__ << dendl;
eafe8130 653
20effc67 654 size_t wal_alloc_size = 0;
eafe8130 655 if (bdev[BDEV_WAL]) {
20effc67
TL
656 wal_alloc_size = cct->_conf->bluefs_alloc_size;
657 alloc_size[BDEV_WAL] = wal_alloc_size;
eafe8130 658 }
20effc67
TL
659 logger->set(l_bluefs_wal_alloc_unit, wal_alloc_size);
660
aee94f69
TL
661
662 uint64_t shared_alloc_size = cct->_conf->bluefs_shared_alloc_size;
663 if (shared_alloc && shared_alloc->a) {
664 uint64_t unit = shared_alloc->a->get_block_size();
665 shared_alloc_size = std::max(
666 unit,
667 shared_alloc_size);
668 ceph_assert(0 == p2phase(shared_alloc_size, unit));
669 }
eafe8130
TL
670 if (bdev[BDEV_SLOW]) {
671 alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
aee94f69 672 alloc_size[BDEV_SLOW] = shared_alloc_size;
eafe8130 673 } else {
aee94f69
TL
674 alloc_size[BDEV_DB] = shared_alloc_size;
675 alloc_size[BDEV_SLOW] = 0;
eafe8130 676 }
aee94f69
TL
677 logger->set(l_bluefs_db_alloc_unit, alloc_size[BDEV_DB]);
678 logger->set(l_bluefs_main_alloc_unit, alloc_size[BDEV_SLOW]);
eafe8130
TL
679 // new wal and db devices are never shared
680 if (bdev[BDEV_NEWWAL]) {
681 alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
682 }
683 if (bdev[BDEV_NEWDB]) {
684 alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
685 }
686
7c673cae
FG
687 for (unsigned id = 0; id < bdev.size(); ++id) {
688 if (!bdev[id]) {
689 continue;
690 }
11fdf7f2 691 ceph_assert(bdev[id]->get_size());
f67539c2
TL
692 if (is_shared_alloc(id)) {
693 dout(1) << __func__ << " shared, id " << id << std::hex
694 << ", capacity 0x" << bdev[id]->get_size()
695 << ", block size 0x" << alloc_size[id]
696 << std::dec << dendl;
697 } else {
aee94f69 698 ceph_assert(alloc_size[id]);
f67539c2
TL
699 std::string name = "bluefs-";
700 const char* devnames[] = { "wal","db","slow" };
701 if (id <= BDEV_SLOW)
702 name += devnames[id];
703 else
704 name += to_string(uintptr_t(this));
705 dout(1) << __func__ << " new, id " << id << std::hex
706 << ", allocator name " << name
707 << ", allocator type " << cct->_conf->bluefs_allocator
708 << ", capacity 0x" << bdev[id]->get_size()
709 << ", block size 0x" << alloc_size[id]
710 << std::dec << dendl;
711 alloc[id] = Allocator::create(cct, cct->_conf->bluefs_allocator,
712 bdev[id]->get_size(),
20effc67
TL
713 alloc_size[id],
714 0, 0,
715 name);
f67539c2
TL
716 alloc[id]->init_add_free(
717 block_reserved[id],
718 _get_total(id));
7c673cae
FG
719 }
720 }
721}
722
723void BlueFS::_stop_alloc()
724{
725 dout(20) << __func__ << dendl;
11fdf7f2
TL
726 for (auto p : bdev) {
727 if (p)
728 p->discard_drain();
729 }
730
f67539c2
TL
731 for (size_t i = 0; i < alloc.size(); ++i) {
732 if (alloc[i] && !is_shared_alloc(i)) {
733 alloc[i]->shutdown();
734 delete alloc[i];
735 alloc[i] = nullptr;
7c673cae
FG
736 }
737 }
7c673cae
FG
738}
739
20effc67
TL
740int BlueFS::_read_and_check(uint8_t ndev, uint64_t off, uint64_t len,
741 ceph::buffer::list *pbl, IOContext *ioc, bool buffered)
cd265ab1
TL
742{
743 dout(10) << __func__ << " dev " << int(ndev)
744 << ": 0x" << std::hex << off << "~" << len << std::dec
745 << (buffered ? " buffered" : "")
746 << dendl;
747 int r;
748 bufferlist bl;
20effc67 749 r = _bdev_read(ndev, off, len, &bl, ioc, buffered);
cd265ab1
TL
750 if (r != 0) {
751 return r;
752 }
753 uint64_t block_size = bdev[ndev]->get_block_size();
754 if (inject_read_zeros) {
755 if (len >= block_size * 2) {
756 derr << __func__ << " injecting error, zeros at "
757 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
758 << "~" << (block_size * 2) << std::dec << dendl;
759 //use beginning, replace 8K in the middle with zeros, use tail
760 bufferlist temp;
761 bl.splice(0, len / 2 - block_size, &temp);
f67539c2 762 temp.append(buffer::create(block_size * 2, 0));
cd265ab1
TL
763 bl.splice(block_size * 2, len / 2 - block_size, &temp);
764 bl = temp;
765 inject_read_zeros--;
766 }
767 }
768 //make a check if there is a block with all 0
769 uint64_t to_check_len = len;
770 uint64_t skip = p2nphase(off, block_size);
771 if (skip >= to_check_len) {
772 return r;
773 }
774 auto it = bl.begin(skip);
775 to_check_len -= skip;
776 bool all_zeros = false;
777 while (all_zeros == false && to_check_len >= block_size) {
778 // checking 0s step
779 unsigned block_left = block_size;
780 unsigned avail;
781 const char* data;
782 all_zeros = true;
783 while (all_zeros && block_left > 0) {
784 avail = it.get_ptr_and_advance(block_left, &data);
785 block_left -= avail;
786 all_zeros = mem_is_zero(data, avail);
787 }
788 // skipping step
789 while (block_left > 0) {
790 avail = it.get_ptr_and_advance(block_left, &data);
791 block_left -= avail;
792 }
793 to_check_len -= block_size;
794 }
795 if (all_zeros) {
796 logger->inc(l_bluefs_read_zeros_candidate, 1);
797 bufferlist bl_reread;
20effc67 798 r = _bdev_read(ndev, off, len, &bl_reread, ioc, buffered);
cd265ab1
TL
799 if (r != 0) {
800 return r;
801 }
802 // check if both read gave the same
803 if (!bl.contents_equal(bl_reread)) {
804 // report problems to log, but continue, maybe it will be good now...
805 derr << __func__ << " initial read of " << int(ndev)
806 << ": 0x" << std::hex << off << "~" << len
807 << std::dec << ": different then re-read " << dendl;
808 logger->inc(l_bluefs_read_zeros_errors, 1);
809 }
810 // use second read will be better if is different
811 pbl->append(bl_reread);
812 } else {
813 pbl->append(bl);
814 }
815 return r;
816}
817
20effc67
TL
818int BlueFS::_read_random_and_check(
819 uint8_t ndev, uint64_t off, uint64_t len, char *buf, bool buffered)
cd265ab1
TL
820{
821 dout(10) << __func__ << " dev " << int(ndev)
822 << ": 0x" << std::hex << off << "~" << len << std::dec
823 << (buffered ? " buffered" : "")
824 << dendl;
825 int r;
20effc67 826 r = _bdev_read_random(ndev, off, len, buf, buffered);
cd265ab1
TL
827 if (r != 0) {
828 return r;
829 }
830 uint64_t block_size = bdev[ndev]->get_block_size();
831 if (inject_read_zeros) {
832 if (len >= block_size * 2) {
833 derr << __func__ << " injecting error, zeros at "
834 << int(ndev) << ": 0x" << std::hex << (off + len / 2)
835 << "~" << (block_size * 2) << std::dec << dendl;
836 //zero middle 8K
837 memset(buf + len / 2 - block_size, 0, block_size * 2);
838 inject_read_zeros--;
839 }
840 }
841 //make a check if there is a block with all 0
842 uint64_t to_check_len = len;
843 const char* data = buf;
844 uint64_t skip = p2nphase(off, block_size);
845 if (skip >= to_check_len) {
846 return r;
847 }
848 to_check_len -= skip;
849 data += skip;
850
851 bool all_zeros = false;
852 while (all_zeros == false && to_check_len >= block_size) {
853 if (mem_is_zero(data, block_size)) {
854 // at least one block is all zeros
855 all_zeros = true;
856 break;
857 }
858 data += block_size;
859 to_check_len -= block_size;
860 }
861 if (all_zeros) {
862 logger->inc(l_bluefs_read_zeros_candidate, 1);
863 std::unique_ptr<char[]> data_reread(new char[len]);
20effc67 864 r = _bdev_read_random(ndev, off, len, &data_reread[0], buffered);
cd265ab1
TL
865 if (r != 0) {
866 return r;
867 }
868 // check if both read gave the same
869 if (memcmp(buf, &data_reread[0], len) != 0) {
870 derr << __func__ << " initial read of " << int(ndev)
871 << ": 0x" << std::hex << off << "~" << len
872 << std::dec << ": different then re-read " << dendl;
873 logger->inc(l_bluefs_read_zeros_errors, 1);
874 // second read is probably better
875 memcpy(buf, &data_reread[0], len);
876 }
877 }
878 return r;
879}
880
20effc67
TL
881int BlueFS::_bdev_read(uint8_t ndev, uint64_t off, uint64_t len,
882 ceph::buffer::list* pbl, IOContext* ioc, bool buffered)
883{
884 int cnt = 0;
885 switch (ndev) {
886 case BDEV_WAL: cnt = l_bluefs_read_disk_bytes_wal; break;
887 case BDEV_DB: cnt = l_bluefs_read_disk_bytes_db; break;
888 case BDEV_SLOW: cnt = l_bluefs_read_disk_bytes_slow; break;
889
890 }
891 if (cnt) {
892 logger->inc(cnt, len);
893 }
894 return bdev[ndev]->read(off, len, pbl, ioc, buffered);
895}
896
897int BlueFS::_bdev_read_random(uint8_t ndev, uint64_t off, uint64_t len,
898 char* buf, bool buffered)
899{
900 int cnt = 0;
901 switch (ndev) {
902 case BDEV_WAL: cnt = l_bluefs_read_random_disk_bytes_wal; break;
903 case BDEV_DB: cnt = l_bluefs_read_random_disk_bytes_db; break;
904 case BDEV_SLOW: cnt = l_bluefs_read_random_disk_bytes_slow; break;
905 }
906 if (cnt) {
907 logger->inc(cnt, len);
908 }
909 return bdev[ndev]->read_random(off, len, buf, buffered);
910}
911
7c673cae
FG
912int BlueFS::mount()
913{
914 dout(1) << __func__ << dendl;
915
20effc67 916 _init_logger();
7c673cae
FG
917 int r = _open_super();
918 if (r < 0) {
919 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
920 goto out;
921 }
922
9f95a23c
TL
923 // set volume selector if not provided before/outside
924 if (vselector == nullptr) {
925 vselector.reset(
926 new OriginalVolumeSelector(
927 get_block_device_size(BlueFS::BDEV_WAL) * 95 / 100,
928 get_block_device_size(BlueFS::BDEV_DB) * 95 / 100,
929 get_block_device_size(BlueFS::BDEV_SLOW) * 95 / 100));
930 }
931
7c673cae
FG
932 _init_alloc();
933
11fdf7f2 934 r = _replay(false, false);
7c673cae
FG
935 if (r < 0) {
936 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
937 _stop_alloc();
938 goto out;
939 }
940
941 // init freelist
20effc67 942 for (auto& p : nodes.file_map) {
7c673cae
FG
943 dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
944 for (auto& q : p.second->fnode.extents) {
f67539c2
TL
945 bool is_shared = is_shared_alloc(q.bdev);
946 ceph_assert(!is_shared || (is_shared && shared_alloc));
947 if (is_shared && shared_alloc->need_init && shared_alloc->a) {
948 shared_alloc->bluefs_used += q.length;
949 alloc[q.bdev]->init_rm_free(q.offset, q.length);
950 } else if (!is_shared) {
951 alloc[q.bdev]->init_rm_free(q.offset, q.length);
952 }
7c673cae
FG
953 }
954 }
f67539c2
TL
955 if (shared_alloc) {
956 shared_alloc->need_init = false;
957 dout(1) << __func__ << " shared_bdev_used = "
958 << shared_alloc->bluefs_used << dendl;
959 } else {
960 dout(1) << __func__ << " shared bdev not used"
961 << dendl;
962 }
7c673cae
FG
963
964 // set up the log for future writes
20effc67
TL
965 log.writer = _create_writer(_get_file(1));
966 ceph_assert(log.writer->file->fnode.ino == 1);
967 log.writer->pos = log.writer->file->fnode.size;
968 log.writer->file->fnode.reset_delta();
7c673cae 969 dout(10) << __func__ << " log write pos set to 0x"
20effc67 970 << std::hex << log.writer->pos << std::dec
7c673cae 971 << dendl;
20effc67
TL
972 // update log size
973 logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
7c673cae
FG
974 return 0;
975
976 out:
977 super = bluefs_super_t();
978 return r;
979}
980
9f95a23c
TL
981int BlueFS::maybe_verify_layout(const bluefs_layout_t& layout) const
982{
983 if (super.memorized_layout) {
984 if (layout == *super.memorized_layout) {
985 dout(10) << __func__ << " bluefs layout verified positively" << dendl;
986 } else {
987 derr << __func__ << " memorized layout doesn't fit current one" << dendl;
988 return -EIO;
989 }
990 } else {
991 dout(10) << __func__ << " no memorized_layout in bluefs superblock"
992 << dendl;
993 }
994
995 return 0;
996}
997
1911f103 998void BlueFS::umount(bool avoid_compact)
7c673cae
FG
999{
1000 dout(1) << __func__ << dendl;
1001
1911f103 1002 sync_metadata(avoid_compact);
20effc67
TL
1003 if (cct->_conf->bluefs_check_volume_selector_on_umount) {
1004 _check_vselector_LNF();
1005 }
1006 _close_writer(log.writer);
1007 log.writer = NULL;
1008 log.t.clear();
7c673cae 1009
9f95a23c 1010 vselector.reset(nullptr);
7c673cae 1011 _stop_alloc();
20effc67
TL
1012 nodes.file_map.clear();
1013 nodes.dir_map.clear();
7c673cae 1014 super = bluefs_super_t();
7c673cae
FG
1015 _shutdown_logger();
1016}
1017
9f95a23c 1018int BlueFS::prepare_new_device(int id, const bluefs_layout_t& layout)
7c673cae 1019{
11fdf7f2
TL
1020 dout(1) << __func__ << dendl;
1021
1022 if(id == BDEV_NEWDB) {
1023 int new_log_dev_cur = BDEV_WAL;
1024 int new_log_dev_next = BDEV_WAL;
1025 if (!bdev[BDEV_WAL]) {
1026 new_log_dev_cur = BDEV_NEWDB;
1027 new_log_dev_next = BDEV_DB;
1028 }
20effc67 1029 _rewrite_log_and_layout_sync_LNF_LD(false,
11fdf7f2
TL
1030 BDEV_NEWDB,
1031 new_log_dev_cur,
1032 new_log_dev_next,
9f95a23c
TL
1033 RENAME_DB2SLOW,
1034 layout);
11fdf7f2 1035 } else if(id == BDEV_NEWWAL) {
20effc67 1036 _rewrite_log_and_layout_sync_LNF_LD(false,
9f95a23c
TL
1037 BDEV_DB,
1038 BDEV_NEWWAL,
1039 BDEV_WAL,
1040 REMOVE_WAL,
1041 layout);
11fdf7f2
TL
1042 } else {
1043 assert(false);
1044 }
1045 return 0;
1046}
1047
1048void BlueFS::collect_metadata(map<string,string> *pm, unsigned skip_bdev_id)
1049{
1050 if (skip_bdev_id != BDEV_DB && bdev[BDEV_DB])
7c673cae
FG
1051 bdev[BDEV_DB]->collect_metadata("bluefs_db_", pm);
1052 if (bdev[BDEV_WAL])
1053 bdev[BDEV_WAL]->collect_metadata("bluefs_wal_", pm);
11fdf7f2
TL
1054}
1055
1056void BlueFS::get_devices(set<string> *ls)
1057{
1058 for (unsigned i = 0; i < MAX_BDEV; ++i) {
1059 if (bdev[i]) {
1060 bdev[i]->get_devices(ls);
1061 }
1062 }
7c673cae
FG
1063}
1064
1065int BlueFS::fsck()
1066{
7c673cae
FG
1067 dout(1) << __func__ << dendl;
1068 // hrm, i think we check everything on mount...
1069 return 0;
1070}
1071
11fdf7f2 1072int BlueFS::_write_super(int dev)
7c673cae 1073{
39ae355f 1074 ++super.version;
7c673cae
FG
1075 // build superblock
1076 bufferlist bl;
11fdf7f2 1077 encode(super, bl);
7c673cae 1078 uint32_t crc = bl.crc32c(-1);
11fdf7f2 1079 encode(crc, bl);
7c673cae
FG
1080 dout(10) << __func__ << " super block length(encoded): " << bl.length() << dendl;
1081 dout(10) << __func__ << " superblock " << super.version << dendl;
1082 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
9f95a23c 1083 ceph_assert_always(bl.length() <= get_super_length());
7c673cae
FG
1084 bl.append_zero(get_super_length() - bl.length());
1085
11fdf7f2 1086 bdev[dev]->write(get_super_offset(), bl, false, WRITE_LIFE_SHORT);
7c673cae
FG
1087 dout(20) << __func__ << " v " << super.version
1088 << " crc 0x" << std::hex << crc
1089 << " offset 0x" << get_super_offset() << std::dec
1090 << dendl;
1091 return 0;
1092}
1093
1094int BlueFS::_open_super()
1095{
1096 dout(10) << __func__ << dendl;
1097
1098 bufferlist bl;
1099 uint32_t expected_crc, crc;
1100 int r;
1101
1102 // always the second block
20effc67
TL
1103 r = _bdev_read(BDEV_DB, get_super_offset(), get_super_length(),
1104 &bl, ioc[BDEV_DB], false);
7c673cae
FG
1105 if (r < 0)
1106 return r;
1107
11fdf7f2
TL
1108 auto p = bl.cbegin();
1109 decode(super, p);
7c673cae
FG
1110 {
1111 bufferlist t;
1112 t.substr_of(bl, 0, p.get_off());
1113 crc = t.crc32c(-1);
1114 }
11fdf7f2 1115 decode(expected_crc, p);
7c673cae
FG
1116 if (crc != expected_crc) {
1117 derr << __func__ << " bad crc on superblock, expected 0x"
1118 << std::hex << expected_crc << " != actual 0x" << crc << std::dec
1119 << dendl;
1120 return -EIO;
1121 }
1122 dout(10) << __func__ << " superblock " << super.version << dendl;
1123 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
1124 return 0;
1125}
1126
20effc67
TL
1127int BlueFS::_check_allocations(const bluefs_fnode_t& fnode,
1128 boost::dynamic_bitset<uint64_t>* used_blocks,
1129 bool is_alloc, //true when allocating, false when deallocating
1130 const char* op_name)
9f95a23c
TL
1131{
1132 auto& fnode_extents = fnode.extents;
1133 for (auto e : fnode_extents) {
1134 auto id = e.bdev;
1135 bool fail = false;
20effc67 1136 ceph_assert(id < MAX_BDEV);
39ae355f
TL
1137 ceph_assert(bdev[id]);
1138 // let's use minimal allocation unit we can have
1139 auto alloc_unit = bdev[id]->get_block_size();
1140
20effc67 1141 if (int r = _verify_alloc_granularity(id, e.offset, e.length,
39ae355f 1142 alloc_unit,
20effc67
TL
1143 op_name); r < 0) {
1144 return r;
1145 }
9f95a23c 1146
39ae355f 1147 apply_for_bitset_range(e.offset, e.length, alloc_unit, used_blocks[id],
9f95a23c 1148 [&](uint64_t pos, boost::dynamic_bitset<uint64_t> &bs) {
20effc67
TL
1149 if (is_alloc == bs.test(pos)) {
1150 fail = true;
1151 } else {
1152 bs.flip(pos);
1153 }
9f95a23c
TL
1154 }
1155 );
1156 if (fail) {
20effc67
TL
1157 derr << __func__ << " " << op_name << " invalid extent " << int(e.bdev)
1158 << ": 0x" << std::hex << e.offset << "~" << e.length << std::dec
1159 << (is_alloc == true ?
1160 ": duplicate reference, ino " : ": double free, ino ")
1161 << fnode.ino << dendl;
9f95a23c
TL
1162 return -EFAULT;
1163 }
1164 }
1165 return 0;
1166}
1167
9f95a23c 1168int BlueFS::_verify_alloc_granularity(
39ae355f 1169 __u8 id, uint64_t offset, uint64_t length, uint64_t alloc_unit, const char *op)
9f95a23c 1170{
39ae355f
TL
1171 if ((offset & (alloc_unit - 1)) ||
1172 (length & (alloc_unit - 1))) {
9f95a23c
TL
1173 derr << __func__ << " " << op << " of " << (int)id
1174 << ":0x" << std::hex << offset << "~" << length << std::dec
1175 << " does not align to alloc_size 0x"
39ae355f 1176 << std::hex << alloc_unit << std::dec << dendl;
9f95a23c
TL
1177 return -EFAULT;
1178 }
1179 return 0;
1180}
1181
11fdf7f2 1182int BlueFS::_replay(bool noop, bool to_stdout)
7c673cae
FG
1183{
1184 dout(10) << __func__ << (noop ? " NO-OP" : "") << dendl;
1185 ino_last = 1; // by the log
20effc67 1186 uint64_t log_seq = 0;
7c673cae
FG
1187
1188 FileRef log_file;
11fdf7f2 1189 log_file = _get_file(1);
9f95a23c 1190
f67539c2 1191 log_file->fnode = super.log_fnode;
11fdf7f2 1192 if (!noop) {
9f95a23c 1193 log_file->vselector_hint =
f6b5b4d7 1194 vselector->get_hint_for_log();
7c673cae 1195 }
7c673cae 1196 dout(10) << __func__ << " log_fnode " << super.log_fnode << dendl;
11fdf7f2
TL
1197 if (unlikely(to_stdout)) {
1198 std::cout << " log_fnode " << super.log_fnode << std::endl;
1199 }
7c673cae
FG
1200
1201 FileReader *log_reader = new FileReader(
1202 log_file, cct->_conf->bluefs_max_prefetch,
1203 false, // !random
1204 true); // ignore eof
9f95a23c
TL
1205
1206 bool seen_recs = false;
1207
1208 boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
9f95a23c 1209
f67539c2
TL
1210 if (!noop) {
1211 if (cct->_conf->bluefs_log_replay_check_allocations) {
1212 for (size_t i = 0; i < MAX_BDEV; ++i) {
39ae355f
TL
1213 if (bdev[i] != nullptr) {
1214 // let's use minimal allocation unit we can have
1215 auto au = bdev[i]->get_block_size();
1216 //hmm... on 32TB/4K drive this would take 1GB RAM!!!
1217 used_blocks[i].resize(round_up_to(bdev[i]->get_size(), au) / au);
f67539c2 1218 }
9f95a23c 1219 }
20effc67
TL
1220 // check initial log layout
1221 int r = _check_allocations(log_file->fnode,
1222 used_blocks, true, "Log from super");
1223 if (r < 0) {
1224 return r;
1225 }
9f95a23c
TL
1226 }
1227 }
1228
7c673cae 1229 while (true) {
11fdf7f2 1230 ceph_assert((log_reader->buf.pos & ~super.block_mask()) == 0);
7c673cae
FG
1231 uint64_t pos = log_reader->buf.pos;
1232 uint64_t read_pos = pos;
1233 bufferlist bl;
1234 {
f67539c2 1235 int r = _read(log_reader, read_pos, super.block_size,
7c673cae 1236 &bl, NULL);
f6b5b4d7 1237 if (r != (int)super.block_size && cct->_conf->bluefs_replay_recovery) {
20effc67 1238 r += _do_replay_recovery_read(log_reader, pos, read_pos + r, super.block_size - r, &bl);
f6b5b4d7
TL
1239 }
1240 assert(r == (int)super.block_size);
7c673cae
FG
1241 read_pos += r;
1242 }
1243 uint64_t more = 0;
1244 uint64_t seq;
1245 uuid_d uuid;
1246 {
11fdf7f2 1247 auto p = bl.cbegin();
7c673cae
FG
1248 __u8 a, b;
1249 uint32_t len;
11fdf7f2
TL
1250 decode(a, p);
1251 decode(b, p);
1252 decode(len, p);
1253 decode(uuid, p);
1254 decode(seq, p);
7c673cae 1255 if (len + 6 > bl.length()) {
11fdf7f2 1256 more = round_up_to(len + 6 - bl.length(), super.block_size);
7c673cae
FG
1257 }
1258 }
1259 if (uuid != super.uuid) {
9f95a23c
TL
1260 if (seen_recs) {
1261 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1262 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1263 << dendl;
1264 } else {
1265 derr << __func__ << " 0x" << std::hex << pos << std::dec
1266 << ": stop: uuid " << uuid << " != super.uuid " << super.uuid
1267 << ", block dump: \n";
1268 bufferlist t;
1269 t.substr_of(bl, 0, super.block_size);
1270 t.hexdump(*_dout);
1271 *_dout << dendl;
1272 }
7c673cae
FG
1273 break;
1274 }
1275 if (seq != log_seq + 1) {
9f95a23c
TL
1276 if (seen_recs) {
1277 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1278 << ": stop: seq " << seq << " != expected " << log_seq + 1
1279 << dendl;;
1280 } else {
1281 derr << __func__ << " 0x" << std::hex << pos << std::dec
1282 << ": stop: seq " << seq << " != expected " << log_seq + 1
1283 << dendl;;
1284 }
7c673cae
FG
1285 break;
1286 }
1287 if (more) {
1288 dout(20) << __func__ << " need 0x" << std::hex << more << std::dec
1289 << " more bytes" << dendl;
1290 bufferlist t;
f67539c2 1291 int r = _read(log_reader, read_pos, more, &t, NULL);
7c673cae 1292 if (r < (int)more) {
f6b5b4d7
TL
1293 dout(10) << __func__ << " 0x" << std::hex << pos
1294 << ": stop: len is 0x" << bl.length() + more << std::dec
1295 << ", which is past eof" << dendl;
1296 if (cct->_conf->bluefs_replay_recovery) {
1297 //try to search for more data
20effc67 1298 r += _do_replay_recovery_read(log_reader, pos, read_pos + r, more - r, &t);
f6b5b4d7
TL
1299 if (r < (int)more) {
1300 //in normal mode we must read r==more, for recovery it is too strict
1301 break;
1302 }
1303 }
7c673cae 1304 }
11fdf7f2 1305 ceph_assert(r == (int)more);
7c673cae
FG
1306 bl.claim_append(t);
1307 read_pos += r;
1308 }
1309 bluefs_transaction_t t;
1310 try {
11fdf7f2
TL
1311 auto p = bl.cbegin();
1312 decode(t, p);
522d829b 1313 seen_recs = true;
7c673cae 1314 }
f67539c2 1315 catch (ceph::buffer::error& e) {
522d829b
TL
1316 // Multi-block transactions might be incomplete due to unexpected
1317 // power off. Hence let's treat that as a regular stop condition.
1318 if (seen_recs && more) {
1319 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1320 << ": stop: failed to decode: " << e.what()
1321 << dendl;
1322 } else {
1323 derr << __func__ << " 0x" << std::hex << pos << std::dec
1324 << ": stop: failed to decode: " << e.what()
1325 << dendl;
1326 delete log_reader;
1327 return -EIO;
1328 }
1329 break;
7c673cae 1330 }
11fdf7f2 1331 ceph_assert(seq == t.seq);
7c673cae
FG
1332 dout(10) << __func__ << " 0x" << std::hex << pos << std::dec
1333 << ": " << t << dendl;
11fdf7f2
TL
1334 if (unlikely(to_stdout)) {
1335 std::cout << " 0x" << std::hex << pos << std::dec
1336 << ": " << t << std::endl;
1337 }
7c673cae 1338
11fdf7f2 1339 auto p = t.op_bl.cbegin();
39ae355f 1340 auto pos0 = pos;
7c673cae 1341 while (!p.end()) {
39ae355f 1342 pos = pos0 + p.get_off();
7c673cae 1343 __u8 op;
11fdf7f2 1344 decode(op, p);
7c673cae
FG
1345 switch (op) {
1346
1347 case bluefs_transaction_t::OP_INIT:
1348 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1349 << ": op_init" << dendl;
11fdf7f2
TL
1350 if (unlikely(to_stdout)) {
1351 std::cout << " 0x" << std::hex << pos << std::dec
1352 << ": op_init" << std::endl;
1353 }
1354
1355 ceph_assert(t.seq == 1);
7c673cae
FG
1356 break;
1357
1358 case bluefs_transaction_t::OP_JUMP:
1359 {
1360 uint64_t next_seq;
1361 uint64_t offset;
11fdf7f2
TL
1362 decode(next_seq, p);
1363 decode(offset, p);
7c673cae
FG
1364 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1365 << ": op_jump seq " << next_seq
1366 << " offset 0x" << std::hex << offset << std::dec << dendl;
11fdf7f2
TL
1367 if (unlikely(to_stdout)) {
1368 std::cout << " 0x" << std::hex << pos << std::dec
1369 << ": op_jump seq " << next_seq
1370 << " offset 0x" << std::hex << offset << std::dec
1371 << std::endl;
1372 }
1373
20effc67 1374 ceph_assert(next_seq > log_seq);
7c673cae
FG
1375 log_seq = next_seq - 1; // we will increment it below
1376 uint64_t skip = offset - read_pos;
1377 if (skip) {
1378 bufferlist junk;
f67539c2 1379 int r = _read(log_reader, read_pos, skip, &junk,
7c673cae
FG
1380 NULL);
1381 if (r != (int)skip) {
1382 dout(10) << __func__ << " 0x" << std::hex << read_pos
1383 << ": stop: failed to skip to " << offset
1384 << std::dec << dendl;
11fdf7f2 1385 ceph_abort_msg("problem with op_jump");
7c673cae
FG
1386 }
1387 }
1388 }
1389 break;
1390
1391 case bluefs_transaction_t::OP_JUMP_SEQ:
1392 {
1393 uint64_t next_seq;
11fdf7f2 1394 decode(next_seq, p);
7c673cae
FG
1395 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1396 << ": op_jump_seq " << next_seq << dendl;
11fdf7f2
TL
1397 if (unlikely(to_stdout)) {
1398 std::cout << " 0x" << std::hex << pos << std::dec
1399 << ": op_jump_seq " << next_seq << std::endl;
1400 }
1401
20effc67 1402 ceph_assert(next_seq > log_seq);
7c673cae
FG
1403 log_seq = next_seq - 1; // we will increment it below
1404 }
1405 break;
1406
1407 case bluefs_transaction_t::OP_ALLOC_ADD:
f67539c2 1408 // LEGACY, do nothing but read params
7c673cae 1409 {
f67539c2
TL
1410 __u8 id;
1411 uint64_t offset, length;
1412 decode(id, p);
1413 decode(offset, p);
1414 decode(length, p);
1415 }
7c673cae
FG
1416 break;
1417
1418 case bluefs_transaction_t::OP_ALLOC_RM:
f67539c2 1419 // LEGACY, do nothing but read params
7c673cae 1420 {
f67539c2
TL
1421 __u8 id;
1422 uint64_t offset, length;
1423 decode(id, p);
1424 decode(offset, p);
1425 decode(length, p);
1426 }
1427 break;
7c673cae
FG
1428
1429 case bluefs_transaction_t::OP_DIR_LINK:
1430 {
1431 string dirname, filename;
1432 uint64_t ino;
11fdf7f2
TL
1433 decode(dirname, p);
1434 decode(filename, p);
1435 decode(ino, p);
7c673cae
FG
1436 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1437 << ": op_dir_link " << " " << dirname << "/" << filename
1438 << " to " << ino
1439 << dendl;
11fdf7f2
TL
1440 if (unlikely(to_stdout)) {
1441 std::cout << " 0x" << std::hex << pos << std::dec
1442 << ": op_dir_link " << " " << dirname << "/" << filename
1443 << " to " << ino
1444 << std::endl;
1445 }
1446
7c673cae
FG
1447 if (!noop) {
1448 FileRef file = _get_file(ino);
11fdf7f2 1449 ceph_assert(file->fnode.ino);
20effc67
TL
1450 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1451 ceph_assert(q != nodes.dir_map.end());
7c673cae 1452 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2 1453 ceph_assert(r == q->second->file_map.end());
9f95a23c
TL
1454
1455 vselector->sub_usage(file->vselector_hint, file->fnode);
1456 file->vselector_hint =
1457 vselector->get_hint_by_dir(dirname);
1458 vselector->add_usage(file->vselector_hint, file->fnode);
1459
7c673cae
FG
1460 q->second->file_map[filename] = file;
1461 ++file->refs;
1462 }
1463 }
1464 break;
1465
1466 case bluefs_transaction_t::OP_DIR_UNLINK:
1467 {
1468 string dirname, filename;
11fdf7f2
TL
1469 decode(dirname, p);
1470 decode(filename, p);
7c673cae
FG
1471 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1472 << ": op_dir_unlink " << " " << dirname << "/" << filename
1473 << dendl;
11fdf7f2
TL
1474 if (unlikely(to_stdout)) {
1475 std::cout << " 0x" << std::hex << pos << std::dec
1476 << ": op_dir_unlink " << " " << dirname << "/" << filename
1477 << std::endl;
1478 }
1479
7c673cae 1480 if (!noop) {
20effc67
TL
1481 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1482 ceph_assert(q != nodes.dir_map.end());
7c673cae 1483 map<string,FileRef>::iterator r = q->second->file_map.find(filename);
11fdf7f2
TL
1484 ceph_assert(r != q->second->file_map.end());
1485 ceph_assert(r->second->refs > 0);
7c673cae
FG
1486 --r->second->refs;
1487 q->second->file_map.erase(r);
1488 }
1489 }
1490 break;
1491
1492 case bluefs_transaction_t::OP_DIR_CREATE:
1493 {
1494 string dirname;
11fdf7f2 1495 decode(dirname, p);
7c673cae
FG
1496 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1497 << ": op_dir_create " << dirname << dendl;
11fdf7f2
TL
1498 if (unlikely(to_stdout)) {
1499 std::cout << " 0x" << std::hex << pos << std::dec
1500 << ": op_dir_create " << dirname << std::endl;
1501 }
1502
7c673cae 1503 if (!noop) {
20effc67
TL
1504 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1505 ceph_assert(q == nodes.dir_map.end());
1506 nodes.dir_map[dirname] = ceph::make_ref<Dir>();
7c673cae
FG
1507 }
1508 }
1509 break;
1510
1511 case bluefs_transaction_t::OP_DIR_REMOVE:
1512 {
1513 string dirname;
11fdf7f2 1514 decode(dirname, p);
7c673cae
FG
1515 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1516 << ": op_dir_remove " << dirname << dendl;
11fdf7f2
TL
1517 if (unlikely(to_stdout)) {
1518 std::cout << " 0x" << std::hex << pos << std::dec
1519 << ": op_dir_remove " << dirname << std::endl;
1520 }
1521
7c673cae 1522 if (!noop) {
20effc67
TL
1523 map<string,DirRef>::iterator q = nodes.dir_map.find(dirname);
1524 ceph_assert(q != nodes.dir_map.end());
11fdf7f2 1525 ceph_assert(q->second->file_map.empty());
20effc67 1526 nodes.dir_map.erase(q);
7c673cae
FG
1527 }
1528 }
1529 break;
1530
1531 case bluefs_transaction_t::OP_FILE_UPDATE:
1532 {
1533 bluefs_fnode_t fnode;
11fdf7f2 1534 decode(fnode, p);
7c673cae 1535 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
9f95a23c 1536 << ": op_file_update " << " " << fnode << " " << dendl;
11fdf7f2
TL
1537 if (unlikely(to_stdout)) {
1538 std::cout << " 0x" << std::hex << pos << std::dec
1539 << ": op_file_update " << " " << fnode << std::endl;
1540 }
9f95a23c 1541 if (!noop) {
7c673cae 1542 FileRef f = _get_file(fnode.ino);
20effc67
TL
1543 if (cct->_conf->bluefs_log_replay_check_allocations) {
1544 int r = _check_allocations(f->fnode,
1545 used_blocks, false, "OP_FILE_UPDATE");
1546 if (r < 0) {
1547 return r;
9f95a23c
TL
1548 }
1549 }
9f95a23c
TL
1550 if (fnode.ino != 1) {
1551 vselector->sub_usage(f->vselector_hint, f->fnode);
1552 }
1553 f->fnode = fnode;
1554 if (fnode.ino != 1) {
1555 vselector->add_usage(f->vselector_hint, f->fnode);
1556 }
1557
7c673cae
FG
1558 if (fnode.ino > ino_last) {
1559 ino_last = fnode.ino;
1560 }
9f95a23c 1561 if (cct->_conf->bluefs_log_replay_check_allocations) {
20effc67
TL
1562 int r = _check_allocations(f->fnode,
1563 used_blocks, true, "OP_FILE_UPDATE");
9f95a23c
TL
1564 if (r < 0) {
1565 return r;
1566 }
1567 }
522d829b
TL
1568 } else if (noop && fnode.ino == 1) {
1569 FileRef f = _get_file(fnode.ino);
1570 f->fnode = fnode;
7c673cae 1571 }
9f95a23c 1572 }
7c673cae 1573 break;
20effc67
TL
1574 case bluefs_transaction_t::OP_FILE_UPDATE_INC:
1575 {
1576 bluefs_fnode_delta_t delta;
1577 decode(delta, p);
1578 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1579 << ": op_file_update_inc " << " " << delta << " " << dendl;
1580 if (unlikely(to_stdout)) {
1581 std::cout << " 0x" << std::hex << pos << std::dec
1582 << ": op_file_update_inc " << " " << delta << std::endl;
1583 }
1584 if (!noop) {
1585 FileRef f = _get_file(delta.ino);
1586 bluefs_fnode_t& fnode = f->fnode;
1587 if (delta.offset != fnode.allocated) {
1588 derr << __func__ << " invalid op_file_update_inc, new extents miss end of file"
1589 << " fnode=" << fnode
1590 << " delta=" << delta
1591 << dendl;
1592 ceph_assert(delta.offset == fnode.allocated);
1593 }
1594 if (cct->_conf->bluefs_log_replay_check_allocations) {
1595 int r = _check_allocations(fnode,
1596 used_blocks, false, "OP_FILE_UPDATE_INC");
1597 if (r < 0) {
1598 return r;
1599 }
1600 }
1601
1602 fnode.ino = delta.ino;
1603 fnode.mtime = delta.mtime;
1604 if (fnode.ino != 1) {
1605 vselector->sub_usage(f->vselector_hint, fnode);
1606 }
1607 fnode.size = delta.size;
1608 fnode.claim_extents(delta.extents);
1609 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1610 << ": op_file_update_inc produced " << " " << fnode << " " << dendl;
1611
1612 if (fnode.ino != 1) {
1613 vselector->add_usage(f->vselector_hint, fnode);
1614 }
1615
1616 if (fnode.ino > ino_last) {
1617 ino_last = fnode.ino;
1618 }
1619 if (cct->_conf->bluefs_log_replay_check_allocations) {
1620 int r = _check_allocations(f->fnode,
1621 used_blocks, true, "OP_FILE_UPDATE_INC");
1622 if (r < 0) {
1623 return r;
1624 }
1625 }
1626 } else if (noop && delta.ino == 1) {
1627 // we need to track bluefs log, even in noop mode
1628 FileRef f = _get_file(1);
1629 bluefs_fnode_t& fnode = f->fnode;
1630 fnode.ino = delta.ino;
1631 fnode.mtime = delta.mtime;
1632 fnode.size = delta.size;
1633 fnode.claim_extents(delta.extents);
1634 }
1635 }
1636 break;
7c673cae
FG
1637
1638 case bluefs_transaction_t::OP_FILE_REMOVE:
1639 {
1640 uint64_t ino;
11fdf7f2 1641 decode(ino, p);
7c673cae
FG
1642 dout(20) << __func__ << " 0x" << std::hex << pos << std::dec
1643 << ": op_file_remove " << ino << dendl;
11fdf7f2
TL
1644 if (unlikely(to_stdout)) {
1645 std::cout << " 0x" << std::hex << pos << std::dec
1646 << ": op_file_remove " << ino << std::endl;
1647 }
1648
9f95a23c 1649 if (!noop) {
20effc67
TL
1650 auto p = nodes.file_map.find(ino);
1651 ceph_assert(p != nodes.file_map.end());
9f95a23c
TL
1652 vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
1653 if (cct->_conf->bluefs_log_replay_check_allocations) {
20effc67
TL
1654 int r = _check_allocations(p->second->fnode,
1655 used_blocks, false, "OP_FILE_REMOVE");
1656 if (r < 0) {
1657 return r;
9f95a23c
TL
1658 }
1659 }
20effc67 1660 nodes.file_map.erase(p);
9f95a23c
TL
1661 }
1662 }
7c673cae
FG
1663 break;
1664
1665 default:
1666 derr << __func__ << " 0x" << std::hex << pos << std::dec
1667 << ": stop: unrecognized op " << (int)op << dendl;
1668 delete log_reader;
1669 return -EIO;
1670 }
1671 }
11fdf7f2 1672 ceph_assert(p.end());
7c673cae
FG
1673
1674 // we successfully replayed the transaction; bump the seq and log size
1675 ++log_seq;
1676 log_file->fnode.size = log_reader->buf.pos;
1677 }
f67539c2
TL
1678 if (!noop) {
1679 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
20effc67
TL
1680 log.seq_live = log_seq + 1;
1681 dirty.seq_live = log_seq + 1;
1682 log.t.seq = log.seq_live;
1683 dirty.seq_stable = log_seq;
9f95a23c 1684 }
7c673cae
FG
1685
1686 dout(10) << __func__ << " log file size was 0x"
1687 << std::hex << log_file->fnode.size << std::dec << dendl;
11fdf7f2
TL
1688 if (unlikely(to_stdout)) {
1689 std::cout << " log file size was 0x"
1690 << std::hex << log_file->fnode.size << std::dec << std::endl;
1691 }
1692
7c673cae
FG
1693 delete log_reader;
1694
1695 if (!noop) {
1696 // verify file link counts are all >0
20effc67 1697 for (auto& p : nodes.file_map) {
7c673cae
FG
1698 if (p.second->refs == 0 &&
1699 p.second->fnode.ino > 1) {
1700 derr << __func__ << " file with link count 0: " << p.second->fnode
1701 << dendl;
1702 return -EIO;
1703 }
1704 }
1705 }
20effc67
TL
1706 // reflect file count in logger
1707 logger->set(l_bluefs_num_files, nodes.file_map.size());
7c673cae
FG
1708
1709 dout(10) << __func__ << " done" << dendl;
1710 return 0;
1711}
1712
11fdf7f2
TL
1713int BlueFS::log_dump()
1714{
1715 // only dump log file's content
20effc67
TL
1716 ceph_assert(log.writer == nullptr && "cannot log_dump on mounted BlueFS");
1717 _init_logger();
f67539c2 1718 int r = _open_super();
11fdf7f2 1719 if (r < 0) {
f67539c2 1720 derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
11fdf7f2
TL
1721 return r;
1722 }
f67539c2
TL
1723 r = _replay(true, true);
1724 if (r < 0) {
1725 derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
1726 }
1727 _shutdown_logger();
1728 super = bluefs_super_t();
1729 return r;
11fdf7f2
TL
1730}
1731
1732int BlueFS::device_migrate_to_existing(
1733 CephContext *cct,
1734 const set<int>& devs_source,
9f95a23c
TL
1735 int dev_target,
1736 const bluefs_layout_t& layout)
11fdf7f2
TL
1737{
1738 vector<byte> buf;
1739 bool buffered = cct->_conf->bluefs_buffered_io;
1740
eafe8130
TL
1741 dout(10) << __func__ << " devs_source " << devs_source
1742 << " dev_target " << dev_target << dendl;
11fdf7f2
TL
1743 assert(dev_target < (int)MAX_BDEV);
1744
1745 int flags = 0;
1746 flags |= devs_source.count(BDEV_DB) ?
1747 (REMOVE_DB | RENAME_SLOW2DB) : 0;
1748 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
1749 int dev_target_new = dev_target;
1750
1751 // Slow device without separate DB one is addressed via BDEV_DB
1752 // Hence need renaming.
1753 if ((flags & REMOVE_DB) && dev_target == BDEV_SLOW) {
1754 dev_target_new = BDEV_DB;
1755 dout(0) << __func__ << " super to be written to " << dev_target << dendl;
1756 }
1757
20effc67 1758 for (auto& [ino, file_ref] : nodes.file_map) {
11fdf7f2 1759 //do not copy log
39ae355f 1760 if (ino == 1) {
11fdf7f2
TL
1761 continue;
1762 }
9f95a23c 1763 dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
eafe8130 1764
20effc67 1765 vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode);
11fdf7f2 1766
9f95a23c 1767 bool rewrite = std::any_of(
39ae355f
TL
1768 file_ref->fnode.extents.begin(),
1769 file_ref->fnode.extents.end(),
9f95a23c
TL
1770 [=](auto& ext) {
1771 return ext.bdev != dev_target && devs_source.count(ext.bdev);
1772 });
eafe8130
TL
1773 if (rewrite) {
1774 dout(10) << __func__ << " migrating" << dendl;
39ae355f
TL
1775 bluefs_fnode_t old_fnode;
1776 old_fnode.swap_extents(file_ref->fnode);
1777 auto& old_fnode_extents = old_fnode.extents;
eafe8130
TL
1778 // read entire file
1779 bufferlist bl;
39ae355f 1780 for (const auto &old_ext : old_fnode_extents) {
eafe8130 1781 buf.resize(old_ext.length);
20effc67 1782 int r = _bdev_read_random(old_ext.bdev,
eafe8130
TL
1783 old_ext.offset,
1784 old_ext.length,
1785 (char*)&buf.at(0),
1786 buffered);
1787 if (r != 0) {
1788 derr << __func__ << " failed to read 0x" << std::hex
1789 << old_ext.offset << "~" << old_ext.length << std::dec
1790 << " from " << (int)dev_target << dendl;
1791 return -EIO;
1792 }
1793 bl.append((char*)&buf[0], old_ext.length);
1794 }
11fdf7f2 1795
eafe8130 1796 // write entire file
39ae355f
TL
1797 auto l = _allocate(dev_target, bl.length(), 0,
1798 &file_ref->fnode, 0, false);
eafe8130
TL
1799 if (l < 0) {
1800 derr << __func__ << " unable to allocate len 0x" << std::hex
1801 << bl.length() << std::dec << " from " << (int)dev_target
1802 << ": " << cpp_strerror(l) << dendl;
1803 return -ENOSPC;
1804 }
11fdf7f2 1805
eafe8130 1806 uint64_t off = 0;
39ae355f 1807 for (auto& i : file_ref->fnode.extents) {
eafe8130
TL
1808 bufferlist cur;
1809 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1810 ceph_assert(cur_len > 0);
1811 cur.substr_of(bl, off, cur_len);
1812 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1813 ceph_assert(r == 0);
1814 off += cur_len;
1815 }
1816
1817 // release old extents
39ae355f 1818 for (const auto &old_ext : old_fnode_extents) {
eafe8130
TL
1819 PExtentVector to_release;
1820 to_release.emplace_back(old_ext.offset, old_ext.length);
1821 alloc[old_ext.bdev]->release(to_release);
f67539c2
TL
1822 if (is_shared_alloc(old_ext.bdev)) {
1823 shared_alloc->bluefs_used -= to_release.size();
1824 }
eafe8130
TL
1825 }
1826
1827 // update fnode
39ae355f
TL
1828 for (auto& i : file_ref->fnode.extents) {
1829 i.bdev = dev_target_new;
eafe8130
TL
1830 }
1831 } else {
39ae355f 1832 for (auto& ext : file_ref->fnode.extents) {
9f95a23c 1833 if (dev_target != dev_target_new && ext.bdev == dev_target) {
eafe8130 1834 dout(20) << __func__ << " " << " ... adjusting extent 0x"
9f95a23c 1835 << std::hex << ext.offset << std::dec
eafe8130
TL
1836 << " bdev " << dev_target << " -> " << dev_target_new
1837 << dendl;
9f95a23c 1838 ext.bdev = dev_target_new;
11fdf7f2 1839 }
11fdf7f2
TL
1840 }
1841 }
20effc67 1842 vselector->add_usage(file_ref->vselector_hint, file_ref->fnode);
11fdf7f2
TL
1843 }
1844 // new logging device in the current naming scheme
1845 int new_log_dev_cur = bdev[BDEV_WAL] ?
1846 BDEV_WAL :
1847 bdev[BDEV_DB] ? BDEV_DB : BDEV_SLOW;
1848
1849 // new logging device in new naming scheme
1850 int new_log_dev_next = new_log_dev_cur;
1851
1852 if (devs_source.count(new_log_dev_cur)) {
1853 // SLOW device is addressed via BDEV_DB too hence either WAL or DB
1854 new_log_dev_next = (flags & REMOVE_WAL) || !bdev[BDEV_WAL] ?
1855 BDEV_DB :
1856 BDEV_WAL;
1857
1858 dout(0) << __func__ << " log moved from " << new_log_dev_cur
1859 << " to " << new_log_dev_next << dendl;
1860
1861 new_log_dev_cur =
1862 (flags & REMOVE_DB) && new_log_dev_next == BDEV_DB ?
1863 BDEV_SLOW :
1864 new_log_dev_next;
1865 }
1866
20effc67 1867 _rewrite_log_and_layout_sync_LNF_LD(
11fdf7f2
TL
1868 false,
1869 (flags & REMOVE_DB) ? BDEV_SLOW : BDEV_DB,
1870 new_log_dev_cur,
1871 new_log_dev_next,
9f95a23c
TL
1872 flags,
1873 layout);
11fdf7f2
TL
1874 return 0;
1875}
1876
1877int BlueFS::device_migrate_to_new(
1878 CephContext *cct,
1879 const set<int>& devs_source,
9f95a23c
TL
1880 int dev_target,
1881 const bluefs_layout_t& layout)
11fdf7f2
TL
1882{
1883 vector<byte> buf;
1884 bool buffered = cct->_conf->bluefs_buffered_io;
1885
eafe8130
TL
1886 dout(10) << __func__ << " devs_source " << devs_source
1887 << " dev_target " << dev_target << dendl;
20effc67 1888 assert(dev_target == (int)BDEV_NEWDB || dev_target == (int)BDEV_NEWWAL);
11fdf7f2
TL
1889
1890 int flags = 0;
1891
1892 flags |= devs_source.count(BDEV_DB) ?
1893 (!bdev[BDEV_SLOW] ? RENAME_DB2SLOW: REMOVE_DB) :
1894 0;
1895 flags |= devs_source.count(BDEV_WAL) ? REMOVE_WAL : 0;
9f95a23c 1896 int dev_target_new = dev_target; //FIXME: remove, makes no sense
11fdf7f2 1897
39ae355f 1898 for (auto& [ino, file_ref] : nodes.file_map) {
11fdf7f2 1899 //do not copy log
39ae355f 1900 if (ino == 1) {
11fdf7f2
TL
1901 continue;
1902 }
39ae355f 1903 dout(10) << __func__ << " " << ino << " " << file_ref->fnode << dendl;
eafe8130 1904
39ae355f 1905 vselector->sub_usage(file_ref->vselector_hint, file_ref->fnode);
11fdf7f2 1906
39ae355f
TL
1907 bool rewrite = std::any_of(
1908 file_ref->fnode.extents.begin(),
1909 file_ref->fnode.extents.end(),
1910 [=](auto& ext) {
1911 return ext.bdev != dev_target && devs_source.count(ext.bdev);
1912 });
eafe8130
TL
1913 if (rewrite) {
1914 dout(10) << __func__ << " migrating" << dendl;
39ae355f
TL
1915 bluefs_fnode_t old_fnode;
1916 old_fnode.swap_extents(file_ref->fnode);
1917 auto& old_fnode_extents = old_fnode.extents;
eafe8130
TL
1918 // read entire file
1919 bufferlist bl;
39ae355f 1920 for (const auto &old_ext : old_fnode_extents) {
eafe8130 1921 buf.resize(old_ext.length);
20effc67 1922 int r = _bdev_read_random(old_ext.bdev,
eafe8130
TL
1923 old_ext.offset,
1924 old_ext.length,
1925 (char*)&buf.at(0),
1926 buffered);
1927 if (r != 0) {
1928 derr << __func__ << " failed to read 0x" << std::hex
1929 << old_ext.offset << "~" << old_ext.length << std::dec
1930 << " from " << (int)dev_target << dendl;
1931 return -EIO;
11fdf7f2 1932 }
eafe8130
TL
1933 bl.append((char*)&buf[0], old_ext.length);
1934 }
1935
1936 // write entire file
39ae355f
TL
1937 auto l = _allocate(dev_target, bl.length(), 0,
1938 &file_ref->fnode, 0, false);
eafe8130
TL
1939 if (l < 0) {
1940 derr << __func__ << " unable to allocate len 0x" << std::hex
1941 << bl.length() << std::dec << " from " << (int)dev_target
1942 << ": " << cpp_strerror(l) << dendl;
1943 return -ENOSPC;
1944 }
1945
1946 uint64_t off = 0;
39ae355f 1947 for (auto& i : file_ref->fnode.extents) {
eafe8130
TL
1948 bufferlist cur;
1949 uint64_t cur_len = std::min<uint64_t>(i.length, bl.length() - off);
1950 ceph_assert(cur_len > 0);
1951 cur.substr_of(bl, off, cur_len);
1952 int r = bdev[dev_target]->write(i.offset, cur, buffered);
1953 ceph_assert(r == 0);
1954 off += cur_len;
1955 }
1956
1957 // release old extents
39ae355f 1958 for (const auto &old_ext : old_fnode_extents) {
eafe8130
TL
1959 PExtentVector to_release;
1960 to_release.emplace_back(old_ext.offset, old_ext.length);
1961 alloc[old_ext.bdev]->release(to_release);
f67539c2
TL
1962 if (is_shared_alloc(old_ext.bdev)) {
1963 shared_alloc->bluefs_used -= to_release.size();
1964 }
eafe8130
TL
1965 }
1966
1967 // update fnode
39ae355f
TL
1968 for (auto& i : file_ref->fnode.extents) {
1969 i.bdev = dev_target_new;
11fdf7f2
TL
1970 }
1971 }
11fdf7f2
TL
1972 }
1973 // new logging device in the current naming scheme
1974 int new_log_dev_cur =
1975 bdev[BDEV_NEWWAL] ?
1976 BDEV_NEWWAL :
1977 bdev[BDEV_WAL] && !(flags & REMOVE_WAL) ?
1978 BDEV_WAL :
1979 bdev[BDEV_NEWDB] ?
1980 BDEV_NEWDB :
1981 bdev[BDEV_DB] && !(flags & REMOVE_DB)?
1982 BDEV_DB :
1983 BDEV_SLOW;
1984
1985 // new logging device in new naming scheme
1986 int new_log_dev_next =
1987 new_log_dev_cur == BDEV_NEWWAL ?
1988 BDEV_WAL :
1989 new_log_dev_cur == BDEV_NEWDB ?
1990 BDEV_DB :
1991 new_log_dev_cur;
1992
1993 int super_dev =
1994 dev_target == BDEV_NEWDB ?
1995 BDEV_NEWDB :
1996 bdev[BDEV_DB] ?
1997 BDEV_DB :
1998 BDEV_SLOW;
1999
20effc67 2000 _rewrite_log_and_layout_sync_LNF_LD(
11fdf7f2
TL
2001 false,
2002 super_dev,
2003 new_log_dev_cur,
2004 new_log_dev_next,
9f95a23c
TL
2005 flags,
2006 layout);
11fdf7f2
TL
2007 return 0;
2008}
2009
7c673cae
FG
2010BlueFS::FileRef BlueFS::_get_file(uint64_t ino)
2011{
20effc67
TL
2012 auto p = nodes.file_map.find(ino);
2013 if (p == nodes.file_map.end()) {
9f95a23c 2014 FileRef f = ceph::make_ref<File>();
20effc67
TL
2015 nodes.file_map[ino] = f;
2016 // track files count in logger
2017 logger->set(l_bluefs_num_files, nodes.file_map.size());
7c673cae
FG
2018 dout(30) << __func__ << " ino " << ino << " = " << f
2019 << " (new)" << dendl;
2020 return f;
2021 } else {
2022 dout(30) << __func__ << " ino " << ino << " = " << p->second << dendl;
2023 return p->second;
2024 }
2025}
2026
20effc67
TL
2027
2028/**
2029To modify fnode both FileWriter::lock and File::lock must be obtained.
2030The special case is when we modify bluefs log (ino 1) or
2031we are compacting log (ino 0).
2032
2033In any case it is enough to hold File::lock to be sure fnode will not be modified.
2034*/
2035struct lock_fnode_print {
2036 BlueFS::FileRef file;
2037 lock_fnode_print(BlueFS::FileRef file) : file(file) {};
2038};
2039std::ostream& operator<<(std::ostream& out, const lock_fnode_print& to_lock) {
2040 std::lock_guard l(to_lock.file->lock);
2041 out << to_lock.file->fnode;
2042 return out;
2043}
2044
2045void BlueFS::_drop_link_D(FileRef file)
7c673cae
FG
2046{
2047 dout(20) << __func__ << " had refs " << file->refs
20effc67 2048 << " on " << lock_fnode_print(file) << dendl;
11fdf7f2 2049 ceph_assert(file->refs > 0);
20effc67
TL
2050 ceph_assert(ceph_mutex_is_locked(log.lock));
2051 ceph_assert(ceph_mutex_is_locked(nodes.lock));
2052
7c673cae
FG
2053 --file->refs;
2054 if (file->refs == 0) {
2055 dout(20) << __func__ << " destroying " << file->fnode << dendl;
11fdf7f2 2056 ceph_assert(file->num_reading.load() == 0);
9f95a23c 2057 vselector->sub_usage(file->vselector_hint, file->fnode);
20effc67
TL
2058 log.t.op_file_remove(file->fnode.ino);
2059 nodes.file_map.erase(file->fnode.ino);
2060 logger->set(l_bluefs_num_files, nodes.file_map.size());
7c673cae 2061 file->deleted = true;
94b18763 2062
20effc67
TL
2063 std::lock_guard dl(dirty.lock);
2064 for (auto& r : file->fnode.extents) {
2065 dirty.pending_release[r.bdev].insert(r.offset, r.length);
2066 }
2067 if (file->dirty_seq > dirty.seq_stable) {
2068 // retract request to serialize changes
2069 ceph_assert(dirty.files.count(file->dirty_seq));
2070 auto it = dirty.files[file->dirty_seq].iterator_to(*file);
2071 dirty.files[file->dirty_seq].erase(it);
2072 file->dirty_seq = dirty.seq_stable;
7c673cae
FG
2073 }
2074 }
2075}
2076
adb31ebb 2077int64_t BlueFS::_read_random(
7c673cae
FG
2078 FileReader *h, ///< [in] read from here
2079 uint64_t off, ///< [in] offset
9f95a23c 2080 uint64_t len, ///< [in] this many bytes
f67539c2 2081 char *out) ///< [out] copy it here
7c673cae 2082{
494da23a
TL
2083 auto* buf = &h->buf;
2084
adb31ebb 2085 int64_t ret = 0;
7c673cae
FG
2086 dout(10) << __func__ << " h " << h
2087 << " 0x" << std::hex << off << "~" << len << std::dec
20effc67 2088 << " from " << lock_fnode_print(h->file) << dendl;
7c673cae
FG
2089
2090 ++h->file->num_reading;
2091
2092 if (!h->ignore_eof &&
2093 off + len > h->file->fnode.size) {
2094 if (off > h->file->fnode.size)
2095 len = 0;
2096 else
2097 len = h->file->fnode.size - off;
2098 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
2099 << std::hex << len << std::dec << dendl;
2100 }
494da23a
TL
2101 logger->inc(l_bluefs_read_random_count, 1);
2102 logger->inc(l_bluefs_read_random_bytes, len);
7c673cae 2103
494da23a 2104 std::shared_lock s_lock(h->lock);
f91f0fd5 2105 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
7c673cae 2106 while (len > 0) {
494da23a
TL
2107 if (off < buf->bl_off || off >= buf->get_buf_end()) {
2108 s_lock.unlock();
2109 uint64_t x_off = 0;
2110 auto p = h->file->fnode.seek(off, &x_off);
f6b5b4d7 2111 ceph_assert(p != h->file->fnode.extents.end());
9f95a23c 2112 uint64_t l = std::min(p->length - x_off, len);
adb31ebb
TL
2113 //hard cap to 1GB
2114 l = std::min(l, uint64_t(1) << 30);
494da23a
TL
2115 dout(20) << __func__ << " read random 0x"
2116 << std::hex << x_off << "~" << l << std::dec
2117 << " of " << *p << dendl;
cd265ab1
TL
2118 int r;
2119 if (!cct->_conf->bluefs_check_for_zeros) {
20effc67
TL
2120 r = _bdev_read_random(p->bdev, p->offset + x_off, l, out,
2121 cct->_conf->bluefs_buffered_io);
cd265ab1 2122 } else {
20effc67 2123 r = _read_random_and_check(p->bdev, p->offset + x_off, l, out,
cd265ab1
TL
2124 cct->_conf->bluefs_buffered_io);
2125 }
494da23a
TL
2126 ceph_assert(r == 0);
2127 off += l;
2128 len -= l;
2129 ret += l;
2130 out += l;
2131
2132 logger->inc(l_bluefs_read_random_disk_count, 1);
2133 logger->inc(l_bluefs_read_random_disk_bytes, l);
2134 if (len > 0) {
2135 s_lock.lock();
2136 }
2137 } else {
2138 auto left = buf->get_buf_remaining(off);
adb31ebb 2139 int64_t r = std::min(len, left);
494da23a
TL
2140 logger->inc(l_bluefs_read_random_buffer_count, 1);
2141 logger->inc(l_bluefs_read_random_buffer_bytes, r);
2142 dout(20) << __func__ << " left 0x" << std::hex << left
2143 << " 0x" << off << "~" << len << std::dec
2144 << dendl;
2145
f67539c2
TL
2146 auto p = buf->bl.begin();
2147 p.seek(off - buf->bl_off);
2148 p.copy(r, out);
2149 out += r;
7c673cae 2150
494da23a
TL
2151 dout(30) << __func__ << " result chunk (0x"
2152 << std::hex << r << std::dec << " bytes):\n";
2153 bufferlist t;
2154 t.substr_of(buf->bl, off - buf->bl_off, r);
2155 t.hexdump(*_dout);
2156 *_dout << dendl;
2157
2158 off += r;
2159 len -= r;
2160 ret += r;
2161 buf->pos += r;
2162 }
2163 }
39ae355f
TL
2164 dout(20) << __func__ << std::hex
2165 << " got 0x" << ret
2166 << std::dec << dendl;
7c673cae
FG
2167 --h->file->num_reading;
2168 return ret;
2169}
2170
// Buffered read of up to len bytes at off from the file behind reader h.
//
// Data is served from the reader's buffer (h->buf). When the requested
// offset is not covered by the buffer, the buffer is discarded and refilled
// from the block device starting at the block-aligned offset.
// When neither outbl nor out is supplied the call is a prefetch: it is
// counted in the prefetch counters and no result is handed back through
// outbl/out.
// Unless h->ignore_eof is set, len is clipped to the file size.
//
// Locking: h->lock is held shared while copying out of the buffer and is
// exchanged for a unique lock while the buffer is refilled.
//
// Returns the number of bytes read; this can be less than requested when
// the fnode has no extent for the (block-aligned) offset.
adb31ebb 2171int64_t BlueFS::_read(
7c673cae 2172 FileReader *h, ///< [in] read from here
7c673cae
FG
2173 uint64_t off, ///< [in] offset
2174 size_t len, ///< [in] this many bytes
2175 bufferlist *outbl, ///< [out] optional: reference the result here
2176 char *out) ///< [out] optional: or copy it here
2177{
f67539c2
TL
2178 FileReaderBuffer *buf = &(h->buf);
2179
494da23a 2180 bool prefetch = !outbl && !out;
7c673cae
FG
2181 dout(10) << __func__ << " h " << h
2182 << " 0x" << std::hex << off << "~" << len << std::dec
20effc67 2183 << " from " << lock_fnode_print(h->file)
494da23a
TL
2184 << (prefetch ? " prefetch" : "")
2185 << dendl;
7c673cae
FG
2186
2187 ++h->file->num_reading;
2188
2189 if (!h->ignore_eof &&
2190 off + len > h->file->fnode.size) {
2191 if (off > h->file->fnode.size)
2192 len = 0;
2193 else
2194 len = h->file->fnode.size - off;
2195 dout(20) << __func__ << " reaching (or past) eof, len clipped to 0x"
2196 << std::hex << len << std::dec << dendl;
2197 }
494da23a
TL
2198 logger->inc(l_bluefs_read_count, 1);
2199 logger->inc(l_bluefs_read_bytes, len);
2200 if (prefetch) {
2201 logger->inc(l_bluefs_read_prefetch_count, 1);
2202 logger->inc(l_bluefs_read_prefetch_bytes, len);
2203 }
2204
7c673cae
FG
2205 if (outbl)
2206 outbl->clear();
2207
adb31ebb 2208 int64_t ret = 0;
494da23a 2209 std::shared_lock s_lock(h->lock);
7c673cae
FG
2210 while (len > 0) {
2211 size_t left;
2212 if (off < buf->bl_off || off >= buf->get_buf_end()) {
// Buffer miss: give up the shared lock and take the lock exclusively,
// because the buffer is about to be replaced.
494da23a
TL
2213 s_lock.unlock();
2214 std::unique_lock u_lock(h->lock);
f91f0fd5 2215 buf->bl.reassign_to_mempool(mempool::mempool_bluefs_file_reader);
494da23a
TL
2216 if (off < buf->bl_off || off >= buf->get_buf_end()) {
2217 // if precondition hasn't changed during locking upgrade.
2218 buf->bl.clear();
2219 buf->bl_off = off & super.block_mask();
2220 uint64_t x_off = 0;
2221 auto p = h->file->fnode.seek(buf->bl_off, &x_off);
f6b5b4d7
TL
2222 if (p == h->file->fnode.extents.end()) {
2223 dout(5) << __func__ << " reading less then required "
2224 << ret << "<" << ret + len << dendl;
2225 break;
2226 }
2227
// Fetch size: the block-rounded request, raised to at least max_prefetch,
// then limited to the current extent, to 1GB and (unless ignore_eof) to
// the block-rounded end of file.
494da23a
TL
2228 uint64_t want = round_up_to(len + (off & ~super.block_mask()),
2229 super.block_size);
2230 want = std::max(want, buf->max_prefetch);
2231 uint64_t l = std::min(p->length - x_off, want);
adb31ebb
TL
2232 //hard cap to 1GB
2233 l = std::min(l, uint64_t(1) << 30);
494da23a
TL
2234 uint64_t eof_offset = round_up_to(h->file->fnode.size, super.block_size);
2235 if (!h->ignore_eof &&
2236 buf->bl_off + l > eof_offset) {
2237 l = eof_offset - buf->bl_off;
2238 }
2239 dout(20) << __func__ << " fetching 0x"
2240 << std::hex << x_off << "~" << l << std::dec
2241 << " of " << *p << dendl;
cd265ab1 2242 int r;
39ae355f
TL
2243 // when reading BlueFS log (only happens on startup) use non-buffered io
2244 // it makes it in sync with logic in _flush_range()
2245 bool use_buffered_io = h->file->fnode.ino == 1 ? false : cct->_conf->bluefs_buffered_io;
cd265ab1 2246 if (!cct->_conf->bluefs_check_for_zeros) {
20effc67 2247 r = _bdev_read(p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
39ae355f 2248 use_buffered_io);
cd265ab1 2249 } else {
20effc67
TL
2250 r = _read_and_check(
2251 p->bdev, p->offset + x_off, l, &buf->bl, ioc[p->bdev],
39ae355f 2252 use_buffered_io);
cd265ab1 2253 }
20effc67
TL
2254 logger->inc(l_bluefs_read_disk_count, 1);
2255 logger->inc(l_bluefs_read_disk_bytes, l);
2256
494da23a 2257 ceph_assert(r == 0);
7c673cae 2258 }
494da23a
TL
2259 u_lock.unlock();
2260 s_lock.lock();
2261 // we should recheck if buffer is valid after lock downgrade
2262 continue;
7c673cae
FG
2263 }
// Buffer hit: hand out as much as the buffer holds for this offset.
2264 left = buf->get_buf_remaining(off);
2265 dout(20) << __func__ << " left 0x" << std::hex << left
2266 << " len 0x" << len << std::dec << dendl;
2267
adb31ebb 2268 int64_t r = std::min(len, left);
7c673cae
FG
2269 if (outbl) {
2270 bufferlist t;
2271 t.substr_of(buf->bl, off - buf->bl_off, r);
2272 outbl->claim_append(t);
2273 }
2274 if (out) {
f67539c2
TL
2275 auto p = buf->bl.begin();
2276 p.seek(off - buf->bl_off);
2277 p.copy(r, out);
7c673cae
FG
2278 out += r;
2279 }
2280
2281 dout(30) << __func__ << " result chunk (0x"
2282 << std::hex << r << std::dec << " bytes):\n";
2283 bufferlist t;
2284 t.substr_of(buf->bl, off - buf->bl_off, r);
2285 t.hexdump(*_dout);
2286 *_dout << dendl;
2287
2288 off += r;
2289 len -= r;
2290 ret += r;
2291 buf->pos += r;
2292 }
f67539c2 2293
39ae355f
TL
2294 dout(20) << __func__ << std::hex
2295 << " got 0x" << ret
2296 << std::dec << dendl;
11fdf7f2 2297 ceph_assert(!outbl || (int)outbl->length() == ret);
7c673cae
FG
2298 --h->file->num_reading;
2299 return ret;
2300}
2301
20effc67 2302void BlueFS::invalidate_cache(FileRef f, uint64_t offset, uint64_t length)
7c673cae 2303{
20effc67 2304 std::lock_guard l(f->lock);
7c673cae
FG
2305 dout(10) << __func__ << " file " << f->fnode
2306 << " 0x" << std::hex << offset << "~" << length << std::dec
2307 << dendl;
2308 if (offset & ~super.block_mask()) {
2309 offset &= super.block_mask();
11fdf7f2 2310 length = round_up_to(length, super.block_size);
7c673cae
FG
2311 }
2312 uint64_t x_off = 0;
2313 auto p = f->fnode.seek(offset, &x_off);
2314 while (length > 0 && p != f->fnode.extents.end()) {
11fdf7f2 2315 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
2316 bdev[p->bdev]->invalidate_cache(p->offset + x_off, x_len);
2317 dout(20) << __func__ << " 0x" << std::hex << x_off << "~" << x_len
2318 << std:: dec << " of " << *p << dendl;
2319 offset += x_len;
2320 length -= x_len;
2321 }
2322}
2323
39ae355f
TL
2324
2325uint64_t BlueFS::_estimate_transaction_size(bluefs_transaction_t* t)
2326{
2327 uint64_t max_alloc_size = std::max(alloc_size[BDEV_WAL],
2328 std::max(alloc_size[BDEV_DB],
2329 alloc_size[BDEV_SLOW]));
2330
2331 // conservative estimate for final encoded size
2332 return round_up_to(t->op_bl.length() + super.block_size * 2, max_alloc_size);
2333}
2334
2335uint64_t BlueFS::_make_initial_transaction(uint64_t start_seq,
2336 bluefs_fnode_t& fnode,
2337 uint64_t expected_final_size,
2338 bufferlist* out)
2339{
2340 bluefs_transaction_t t0;
2341 t0.seq = start_seq;
2342 t0.uuid = super.uuid;
2343 t0.op_init();
2344 t0.op_file_update_inc(fnode);
2345 t0.op_jump(start_seq, expected_final_size); // this is a fixed size op,
2346 // hence it's valid with fake
2347 // params for overall txc size
2348 // estimation
2349 if (!out) {
2350 return _estimate_transaction_size(&t0);
2351 }
2352
2353 ceph_assert(expected_final_size > 0);
2354 out->reserve(expected_final_size);
2355 encode(t0, *out);
2356 // make sure we're not wrong aboth the size
2357 ceph_assert(out->length() <= expected_final_size);
2358 _pad_bl(*out, expected_final_size);
2359 return expected_final_size;
2360}
2361
20effc67 2362uint64_t BlueFS::_estimate_log_size_N()
7c673cae 2363{
20effc67 2364 std::lock_guard nl(nodes.lock);
7c673cae
FG
2365 int avg_dir_size = 40; // fixme
2366 int avg_file_size = 12;
2367 uint64_t size = 4096 * 2;
20effc67
TL
2368 size += nodes.file_map.size() * (1 + sizeof(bluefs_fnode_t));
2369 size += nodes.dir_map.size() + (1 + avg_dir_size);
2370 size += nodes.file_map.size() * (1 + avg_dir_size + avg_file_size);
11fdf7f2 2371 return round_up_to(size, super.block_size);
7c673cae
FG
2372}
2373
20effc67 2374void BlueFS::compact_log()/*_LNF_LD_NF_D*/
7c673cae 2375{
f6b5b4d7
TL
2376 if (!cct->_conf->bluefs_replay_recovery_disable_compact) {
2377 if (cct->_conf->bluefs_compact_log_sync) {
20effc67 2378 _compact_log_sync_LNF_LD();
f6b5b4d7 2379 } else {
20effc67 2380 _compact_log_async_LD_LNF_D();
f6b5b4d7 2381 }
7c673cae
FG
2382 }
2383}
2384
20effc67 2385bool BlueFS::_should_start_compact_log_L_N()
7c673cae 2386{
20effc67
TL
2387 if (log_is_compacting.load() == true) {
2388 // compaction is already running
2389 return false;
2390 }
2391 uint64_t current;
2392 {
2393 std::lock_guard ll(log.lock);
2394 current = log.writer->file->fnode.size;
2395 }
2396 uint64_t expected = _estimate_log_size_N();
7c673cae
FG
2397 float ratio = (float)current / (float)expected;
2398 dout(10) << __func__ << " current 0x" << std::hex << current
2399 << " expected " << expected << std::dec
2400 << " ratio " << ratio
7c673cae 2401 << dendl;
20effc67 2402 if (current < cct->_conf->bluefs_log_compact_min_size ||
7c673cae
FG
2403 ratio < cct->_conf->bluefs_log_compact_min_ratio) {
2404 return false;
2405 }
2406 return true;
2407}
2408
39ae355f
TL
2409void BlueFS::_compact_log_dump_metadata_NF(uint64_t start_seq,
2410 bluefs_transaction_t *t,
2411 int bdev_update_flags,
2412 uint64_t capture_before_seq)
7c673cae 2413{
39ae355f
TL
2414 dout(20) << __func__ << dendl;
2415 t->seq = start_seq;
7c673cae 2416 t->uuid = super.uuid;
7c673cae 2417
20effc67
TL
2418 std::lock_guard nl(nodes.lock);
2419
20effc67
TL
2420 for (auto& [ino, file_ref] : nodes.file_map) {
2421 if (ino == 1)
2422 continue;
2423 ceph_assert(ino > 1);
2424 std::lock_guard fl(file_ref->lock);
39ae355f
TL
2425 if (bdev_update_flags) {
2426 for(auto& e : file_ref->fnode.extents) {
2427 auto bdev = e.bdev;
2428 auto bdev_new = bdev;
2429 ceph_assert(!((bdev_update_flags & REMOVE_WAL) && bdev == BDEV_WAL));
2430 if ((bdev_update_flags & RENAME_SLOW2DB) && bdev == BDEV_SLOW) {
2431 bdev_new = BDEV_DB;
2432 }
2433 if ((bdev_update_flags & RENAME_DB2SLOW) && bdev == BDEV_DB) {
2434 bdev_new = BDEV_SLOW;
2435 }
2436 if (bdev == BDEV_NEWDB) {
2437 // REMOVE_DB xor RENAME_DB
2438 ceph_assert(!(bdev_update_flags & REMOVE_DB) != !(bdev_update_flags & RENAME_DB2SLOW));
2439 ceph_assert(!(bdev_update_flags & RENAME_SLOW2DB));
2440 bdev_new = BDEV_DB;
2441 }
2442 if (bdev == BDEV_NEWWAL) {
2443 ceph_assert(bdev_update_flags & REMOVE_WAL);
2444 bdev_new = BDEV_WAL;
2445 }
2446 e.bdev = bdev_new;
2447 }
2448 }
2449 if (capture_before_seq == 0 || file_ref->dirty_seq < capture_before_seq) {
20effc67
TL
2450 dout(20) << __func__ << " op_file_update " << file_ref->fnode << dendl;
2451 } else {
2452 dout(20) << __func__ << " op_file_update just modified, dirty_seq="
39ae355f 2453 << file_ref->dirty_seq << " " << file_ref->fnode << dendl;
20effc67
TL
2454 }
2455 t->op_file_update(file_ref->fnode);
2456 }
2457 for (auto& [path, dir_ref] : nodes.dir_map) {
9f95a23c
TL
2458 dout(20) << __func__ << " op_dir_create " << path << dendl;
2459 t->op_dir_create(path);
2460 for (auto& [fname, file_ref] : dir_ref->file_map) {
2461 dout(20) << __func__ << " op_dir_link " << path << "/" << fname
2462 << " to " << file_ref->fnode.ino << dendl;
2463 t->op_dir_link(path, fname, file_ref->fnode.ino);
7c673cae
FG
2464 }
2465 }
2466}
2467
20effc67 2468void BlueFS::_compact_log_sync_LNF_LD()
7c673cae
FG
2469{
2470 dout(10) << __func__ << dendl;
20effc67
TL
2471 uint8_t prefer_bdev;
2472 {
2473 std::lock_guard ll(log.lock);
2474 prefer_bdev =
2475 vselector->select_prefer_bdev(log.writer->file->vselector_hint);
2476 }
2477 _rewrite_log_and_layout_sync_LNF_LD(true,
11fdf7f2 2478 BDEV_DB,
9f95a23c
TL
2479 prefer_bdev,
2480 prefer_bdev,
2481 0,
2482 super.memorized_layout);
11fdf7f2
TL
2483 logger->inc(l_bluefs_log_compactions);
2484}
2485
39ae355f
TL
2486/*
2487 * SYNC LOG COMPACTION
2488 *
2489 * 0. Lock the log completely through the whole procedure
2490 *
2491 * 1. Build new log. It will include log's starter and compacted metadata
2492 * body. Jump op appended to the starter will link the pieces together.
2493 *
2494 * 2. Write out new log's content
2495 *
2496 * 3. Write out new superblock. This includes relevant device layout update.
2497 *
2498 * 4. Finalization. Old space release.
2499 */
2500
// Rewrites the whole BlueFS log synchronously and writes a new superblock.
// log.lock is held from Part 0 until the function returns.
//
// permit_dev_fallback - forwarded to _allocate() for the new log space;
//                       only allowed when log_dev == log_dev_new (asserted)
// super_dev           - device passed to _write_super()
// log_dev             - device the new log space is allocated from
// log_dev_new         - device id recorded for the log extents in the
//                       persistent fnode (differs from log_dev during
//                       device migration)
// flags               - device update flags forwarded to
//                       _compact_log_dump_metadata_NF()
// layout              - stored as super.memorized_layout before the
//                       superblock is written
//
// Allocation failures are not reported to the caller; they trip an assert.
2501void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool permit_dev_fallback,
20effc67
TL
2502 int super_dev,
2503 int log_dev,
2504 int log_dev_new,
2505 int flags,
2506 std::optional<bluefs_layout_t> layout)
11fdf7f2 2507{
39ae355f
TL
2508 // we substitute log_dev with log_dev_new for new allocations below
2509 // and permitting fallback allocations prevents such a substitution
2510 ceph_assert((permit_dev_fallback && log_dev == log_dev_new) ||
2511 !permit_dev_fallback);
2512
2513 dout(10) << __func__ << " super_dev:" << super_dev
2514 << " log_dev:" << log_dev
2515 << " log_dev_new:" << log_dev_new
2516 << " flags:" << flags
2517 << " seq:" << log.seq_live
2518 << dendl;
2519 utime_t mtime = ceph_clock_now();
2520 uint64_t starter_seq = 1;
2521
2522 // Part 0.
2523 // Lock the log totally till the end of the procedure
20effc67 2524 std::lock_guard ll(log.lock);
39ae355f 2525 auto t0 = mono_clock::now();
20effc67
TL
2526
2527 File *log_file = log.writer->file.get();
39ae355f 2528 bluefs_fnode_t fnode_tail;
20effc67
TL
2529 // log.t.seq is always set to current live seq
2530 ceph_assert(log.t.seq == log.seq_live);
2531 // Capturing entire state. Dump anything that has been stored there.
2532 log.t.clear();
2533 log.t.seq = log.seq_live;
2534 // From now on, no changes to log.t are permitted until we finish rewriting log.
2535 // Can allow dirty to remain dirty - log.seq_live will not change.
7c673cae 2536
39ae355f
TL
2537 //
2538 // Part 1.
2539 // Build new log starter and compacted metadata body
2540 // 1.1. Build full compacted meta transaction.
2541 // Encode a bluefs transaction that dumps all of the in-memory fnodes
2542 // and names.
2543 // This might be pretty large and its allocation map can exceed
2544 // superblock size. Hence instead we'll need log starter part which
2545 // goes to superblock and refers that new meta through op_update_inc.
2546 // 1.2. Allocate space for the above transaction
2547 // using its size estimation.
2548 // 1.3. Allocate the space required for the starter part of the new log.
2549 // It should be small enough to fit into superblock.
2550 // 1.4 Building new log persistent fnode representation which will
2551 // finally land to disk.
2552 // Depending on input parameters we might need to perform device ids
2553 // rename - runtime and persistent replicas should be different when we
2554 // are in the device migration process.
2555 // 1.5 Store starter fnode to run-time superblock, to be written out later.
2556 // It doesn't contain compacted meta to fit relevant allocation map into
2557 // superblock.
2558 // 1.6 Proceed building new log persistent fnode representation.
2559 // Will add log tail with compacted meta extents from 1.1.
2560 // Device rename applied as well
2561 //
2562 // 1.7. Encode new log fnode starter,
2563 // It will include op_init, new log's op_update_inc
2564 // and jump to the compacted meta transaction beginning.
2565 // Superblock will reference this starter part
2566 //
2567 // 1.8. Encode compacted meta transaction,
2568 // extend the transaction with a jump to proper sequence no
2569 //
2570
2571
2572 // 1.1 Build full compacted meta transaction
2573 bluefs_transaction_t compacted_meta_t;
2574 _compact_log_dump_metadata_NF(starter_seq + 1, &compacted_meta_t, flags, 0);
2575
2576 // 1.2 Allocate the space required for the compacted meta transaction
2577 uint64_t compacted_meta_need =
2578 _estimate_transaction_size(&compacted_meta_t) +
2579 cct->_conf->bluefs_max_log_runway;
2580
2581 dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need << dendl;
2582
2583 int r = _allocate(log_dev, compacted_meta_need, 0, &fnode_tail, 0,
2584 permit_dev_fallback);
2585 ceph_assert(r == 0);
7c673cae 2586
7c673cae 2587
39ae355f
TL
2588 // 1.3 Allocate the space required for the starter part of the new log.
2589 // estimate new log fnode size to be referenced from superblock
2590 // hence use dummy fnode and jump parameters
2591 uint64_t starter_need = _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr);
7c673cae 2592
39ae355f
TL
2593 bluefs_fnode_t fnode_starter(log_file->fnode.ino, 0, mtime);
2594 r = _allocate(log_dev, starter_need, 0, &fnode_starter, 0,
2595 permit_dev_fallback);
2596 ceph_assert(r == 0);
7c673cae 2597
39ae355f
TL
2598 // 1.4 Building starter fnode
2599 bluefs_fnode_t fnode_persistent(fnode_starter.ino, 0, mtime);
2600 for (auto p : fnode_starter.extents) {
2601 // rename device if needed - this is possible when fallback allocations
2602 // are prohibited only. Which means every extent is targeted to the same
2603 // device and we can unconditionally update them.
2604 if (log_dev != log_dev_new) {
2605 dout(10) << __func__ << " renaming log extents to "
2606 << log_dev_new << dendl;
2607 p.bdev = log_dev_new;
11fdf7f2 2608 }
39ae355f 2609 fnode_persistent.append_extent(p);
7c673cae
FG
2610 }
2611
39ae355f
TL
2612 // 1.5 Store starter fnode to run-time superblock, to be written out later
2613 super.log_fnode = fnode_persistent;
7c673cae 2614
39ae355f
TL
2615 // 1.6 Proceed building new log persistent fnode representation
2616 // we'll build incremental update starting from this point
2617 fnode_persistent.reset_delta();
2618 for (auto p : fnode_tail.extents) {
2619 // rename device if needed - this is possible when fallback allocations
2620 // are prohibited only. Which means every extent is targeted to the same
2621 // device and we can unconditionally update them.
2622 if (log_dev != log_dev_new) {
2623 dout(10) << __func__ << " renaming log extents to "
2624 << log_dev_new << dendl;
2625 p.bdev = log_dev_new;
2626 }
2627 fnode_persistent.append_extent(p);
2628 }
2629
2630 // 1.7 Encode new log fnode
2631 // This will flush incremental part of fnode_persistent only.
2632 bufferlist starter_bl;
2633 _make_initial_transaction(starter_seq, fnode_persistent, starter_need, &starter_bl);
9f95a23c 2634
39ae355f
TL
2635 // 1.8 Encode compacted meta transaction
2636 dout(20) << __func__ << " op_jump_seq " << log.seq_live << dendl;
2637 // hopefully "compact_meta_need" estimation provides enough extra space
2638 // for this op, assert below if not
2639 compacted_meta_t.op_jump_seq(log.seq_live);
2640
2641 bufferlist compacted_meta_bl;
2642 encode(compacted_meta_t, compacted_meta_bl);
2643 _pad_bl(compacted_meta_bl);
2644 ceph_assert(compacted_meta_bl.length() <= compacted_meta_need);
2645
2646 //
2647 // Part 2
2648 // Write out new log's content
2649 // 2.1. Build the full runtime new log's fnode
2650 //
2651 // 2.2. Write out new log's
2652 //
2653 // 2.3. Do flush and wait for completion through flush_bdev()
2654 //
2655 // 2.4. Finalize log update
2656 // Update all sequence numbers
2657 //
2658
2659 // 2.1 Build the full runtime new log's fnode
// old_log_fnode is first assembled as the new runtime fnode (starter plus
// tail extents, keeping the runtime device ids); after the final swap it
// holds the previous log fnode, whose extents are released in Part 4.
2660 bluefs_fnode_t old_log_fnode;
2661 old_log_fnode.swap(fnode_starter);
2662 old_log_fnode.clone_extents(fnode_tail);
2663 old_log_fnode.reset_delta();
2664 log_file->fnode.swap(old_log_fnode);
2665
2666 // 2.2 Write out new log's content
2667 // Get rid of old writer
2668 _close_writer(log.writer);
2669 // Make new log writer and stage new log's content writing
20effc67 2670 log.writer = _create_writer(log_file);
39ae355f
TL
2671 log.writer->append(starter_bl);
2672 log.writer->append(compacted_meta_bl);
2673
2674 // 2.3 Do flush and wait for completion through flush_bdev()
20effc67 2675 _flush_special(log.writer);
11fdf7f2
TL
2676#ifdef HAVE_LIBAIO
2677 if (!cct->_conf->bluefs_sync_write) {
2678 list<aio_t> completed_ios;
20effc67
TL
2679 _claim_completed_aios(log.writer, &completed_ios);
2680 _wait_for_aio(log.writer);
11fdf7f2
TL
2681 completed_ios.clear();
2682 }
2683#endif
20effc67 2684 _flush_bdev();
39ae355f
TL
2685
2686 // 2.4 Finalize log update
1d09f67e
TL
2687 ++log.seq_live;
2688 dirty.seq_live = log.seq_live;
2689 log.t.seq = log.seq_live;
39ae355f
TL
2690 vselector->sub_usage(log_file->vselector_hint, old_log_fnode);
2691 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
224ce89b 2692
39ae355f
TL
2693 // Part 3.
2694 // Write out new superblock to reflect all the changes.
2695 //
11fdf7f2 2696
39ae355f 2697 super.memorized_layout = layout;
11fdf7f2 2698 _write_super(super_dev);
20effc67 2699 _flush_bdev();
7c673cae 2700
39ae355f
TL
2701 // we're mostly done
2702 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2703 logger->inc(l_bluefs_log_compactions);
2704
2705 // Part 4
2706 // Finalization. Release old space.
2707 //
2708 {
2709 dout(10) << __func__
2710 << " release old log extents " << old_log_fnode.extents
2711 << dendl;
2712 std::lock_guard dl(dirty.lock);
2713 for (auto& r : old_log_fnode.extents) {
2714 dirty.pending_release[r.bdev].insert(r.offset, r.length);
2715 }
7c673cae 2716 }
39ae355f 2717 logger->tinc(l_bluefs_compaction_lock_lat, mono_clock::now() - t0);
7c673cae
FG
2718}
2719
2720/*
39ae355f 2721 * ASYNC LOG COMPACTION
7c673cae 2722 *
39ae355f
TL
2723 * 0. Lock the log and forbid its extension. The former covers just
2724 * a part of the below procedure while the latter spans over it
2725 * completely.
2726 * 1. Allocate a new extent to continue the log, and then log an event
2727 * that jumps the log write position to the new extent. At this point, the
2728 * old extent(s) won't be written to, and reflect everything to compact.
2729 * New events will be written to the new region that we'll keep.
2730 * The latter will finally become new log tail on compaction completion.
7c673cae 2731 *
39ae355f
TL
2732 * 2. Build new log. It will include log's starter, compacted metadata
2733 * body and the above tail. Jump ops appended to the starter and meta body
2734 * will link the pieces together. Log's lock is released in the middle of the
2735 * process to permit parallel access to it.
7c673cae 2736 *
39ae355f 2737 * 3. Write out new log's content.
7c673cae 2738 *
39ae355f 2739 * 4. Write out new superblock to reflect all the changes.
7c673cae 2740 *
39ae355f 2741 * 5. Apply new log fnode, log is locked for a while.
7c673cae 2742 *
39ae355f 2743 * 6. Finalization. Clean up, old space release and total unlocking.
7c673cae 2744 */
20effc67
TL
2745
// Compacts the BlueFS log while keeping log.lock held only for parts of the
// procedure (see the ASYNC LOG COMPACTION overview above for the step list).
//
// - Returns immediately, doing nothing, if log_is_compacting was already set
//   (only one compaction may run at a time).
// - log.lock is taken in Part 0 and released at the end of step 2.1; it is
//   taken again for Part 5. dirty.lock is taken directly in step 6.3.
// - log_forbidden_to_expand is set from Part 1 until step 6.1; waiters on
//   log_cond are woken once it is cleared.
// - Every _allocate() failure is fatal here (ceph_assert(r == 0)).
// - The extents of the pre-compaction log are queued in dirty.pending_release
//   at the very end.
2746void BlueFS::_compact_log_async_LD_LNF_D() //also locks FW for new_writer
7c673cae
FG
2747{
2748 dout(10) << __func__ << dendl;
39ae355f
TL
2749 utime_t mtime = ceph_clock_now();
2750 uint64_t starter_seq = 1;
2751 uint64_t old_log_jump_to = 0;
2752
2753 // Part 0.
2754 // Lock the log and forbid its expansion and other compactions
2755
20effc67
TL
2756 // only one compaction allowed at one time
2757 bool old_is_comp = std::atomic_exchange(&log_is_compacting, true);
2758 if (old_is_comp) {
2759 dout(10) << __func__ << " ongoing" <<dendl;
2760 return;
2761 }
39ae355f 2762 // lock log's run-time structures for a while
20effc67 2763 log.lock.lock();
39ae355f 2764 auto t0 = mono_clock::now();
181888fb 2765
20effc67
TL
2766 // Part 1.
2767 // Prepare current log for jumping into it.
2768 // 1. Allocate extent
2769 // 2. Update op to log
2770 // 3. Jump op to log
2771 // During that, no one else can write to log, otherwise we risk jumping backwards.
2772 // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.
2773
2774 // signal _maybe_extend_log that expansion of log is temporarily unacceptable
2775 bool old_forbidden = atomic_exchange(&log_forbidden_to_expand, true);
2776 ceph_assert(old_forbidden == false);
3efd9988 2777
39ae355f
TL
2778 //
2779 // Part 1.
2780 // Prepare current log for jumping into it.
2781 // 1.1. Allocate extent
2782 // 1.2. Save log's fnode extents and add new extents
2783 // 1.3. Update op to log
2784 // 1.4. Jump op to log
2785 // During that, no one else can write to log, otherwise we risk jumping backwards.
2786 // We need to sync log, because we are injecting discontinuity, and writer is not prepared for that.
9f95a23c 2787
39ae355f
TL
2788 // 1.1 allocate new log extents and store them at fnode_tail
2789 File *log_file = log.writer->file.get();
7c673cae 2790 old_log_jump_to = log_file->fnode.get_allocated();
39ae355f 2791 bluefs_fnode_t fnode_tail;
20effc67 2792 uint64_t runway = log_file->fnode.get_allocated() - log.writer->get_effective_write_pos();
7c673cae 2793 dout(10) << __func__ << " old_log_jump_to 0x" << std::hex << old_log_jump_to
39ae355f 2794 << " need 0x" << cct->_conf->bluefs_max_log_runway << std::dec << dendl;
9f95a23c
TL
2795 int r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
2796 cct->_conf->bluefs_max_log_runway,
39ae355f
TL
2797 0,
2798 &fnode_tail);
11fdf7f2 2799 ceph_assert(r == 0);
39ae355f
TL
2800
2801 // 1.2 save log's fnode extents and add new extents
2802 bluefs_fnode_t old_log_fnode(log_file->fnode);
2803 log_file->fnode.clone_extents(fnode_tail);
9f95a23c 2804 //adjust usage as flush below will need it
39ae355f 2805 vselector->sub_usage(log_file->vselector_hint, old_log_fnode);
9f95a23c 2806 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
7c673cae
FG
2807 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2808
39ae355f 2809 // 1.3 update the log file change and log a jump to the offset where we want to
7c673cae 2810 // write the new entries
39ae355f
TL
2811 log.t.op_file_update_inc(log_file->fnode);
2812
2813 // 1.4 jump to new position should mean next seq
20effc67
TL
2814 log.t.op_jump(log.seq_live + 1, old_log_jump_to);
2815 uint64_t seq_now = log.seq_live;
2816 // we need to flush all bdev because we will be streaming all dirty files to log
2817 // TODO - think - if _flush_and_sync_log_jump will not add dirty files nor release pending allocations
2818 // then flush_bdev() will not be necessary
2819 _flush_bdev();
2820 _flush_and_sync_log_jump_D(old_log_jump_to, runway);
2821
39ae355f
TL
2822 //
2823 // Part 2.
2824 // Build new log starter and compacted metadata body
2825 // 2.1. Build full compacted meta transaction.
2826 // While still holding the lock, encode a bluefs transaction
2827 // that dumps all of the in-memory fnodes and names.
2828 // This might be pretty large and its allocation map can exceed
2829 // superblock size. Hence instead we'll need log starter part which
2830 // goes to superblock and refers that new meta through op_update_inc.
2831 // 2.2. After releasing the lock allocate space for the above transaction
2832 // using its size estimation.
2833 // Then build tailing list of extents which consists of these
2834 // newly allocated extents followed by ones from Part 1.
2835 // 2.3. Allocate the space required for the starter part of the new log.
2836 // It should be small enough to fit into superblock.
2837 // Effectively we start building new log fnode here.
2838 // 2.4. Store starter fnode to run-time superblock, to be written out later
2839 // 2.5. Finalize new log's fnode building
2840 // This will include log's starter and tailing extents built at 2.2
2841 // 2.6. Encode new log fnode starter,
2842 // It will include op_init, new log's op_update_inc
2843 // and jump to the compacted meta transaction beginning.
2844 // Superblock will reference this starter part
2845 // 2.7. Encode compacted meta transaction,
2846 // extend the transaction with a jump to the log tail from 1.1 before
2847 // encoding.
2848 //
2849
2850 // 2.1 Build full compacted meta transaction
2851 bluefs_transaction_t compacted_meta_t;
2852 _compact_log_dump_metadata_NF(starter_seq + 1, &compacted_meta_t, 0, seq_now);
2853
2854 // now state is captured to compacted_meta_t,
2855 // current log can be used to write to,
2856 //ops in log will be continuation of captured state
2857 logger->tinc(l_bluefs_compaction_lock_lat, mono_clock::now() - t0);
20effc67 2858 log.lock.unlock();
7c673cae 2859
39ae355f
TL
2860 // 2.2 Allocate the space required for the compacted meta transaction
2861 uint64_t compacted_meta_need = _estimate_transaction_size(&compacted_meta_t);
2862 dout(20) << __func__ << " compacted_meta_need " << compacted_meta_need
2863 << dendl;
2864 {
2865 bluefs_fnode_t fnode_pre_tail;
2866 // do allocate
2867 r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
2868 compacted_meta_need,
2869 0,
2870 &fnode_pre_tail);
2871 ceph_assert(r == 0);
2872 // build trailing list of extents in fnode_tail,
2873 // this will include newly allocated extents for compacted meta
2874 // and aux extents allocated at step 1.1
2875 fnode_pre_tail.claim_extents(fnode_tail.extents);
2876 fnode_tail.swap_extents(fnode_pre_tail);
2877 }
eafe8130 2878
39ae355f
TL
2879 // 2.3 Allocate the space required for the starter part of the new log.
2880 // Start building New log fnode
2881 FileRef new_log = nullptr;
2882 new_log = ceph::make_ref<File>();
2883 new_log->fnode.ino = log_file->fnode.ino;
2884 new_log->fnode.mtime = mtime;
2885 // Estimate the required space
2886 uint64_t starter_need =
2887 _make_initial_transaction(starter_seq, fnode_tail, 0, nullptr);
2888 // and now allocate and store at new_log_fnode
2889 r = _allocate(vselector->select_prefer_bdev(log_file->vselector_hint),
2890 starter_need,
2891 0,
2892 &new_log->fnode);
11fdf7f2
TL
2893 ceph_assert(r == 0);
2894
39ae355f
TL
2895 // 2.4 Store starter fnode to run-time superblock, to be written out later
2896 super.log_fnode = new_log->fnode;
7c673cae 2897
39ae355f
TL
2898 // 2.5 Finalize new log's fnode building
2899 // start collecting new log fnode updates (to make op_update_inc later)
2900 // since this point. This will include compacted meta from 2.2 and aux
2901 // extents from 1.1.
2902 new_log->fnode.reset_delta();
2903 new_log->fnode.claim_extents(fnode_tail.extents);
7c673cae 2904
39ae355f
TL
2905 // 2.6 Encode new log fnode
2906 bufferlist starter_bl;
2907 _make_initial_transaction(starter_seq, new_log->fnode, starter_need,
2908 &starter_bl);
7c673cae 2909
39ae355f
TL
2910 // 2.7 Encode compacted meta transaction,
2911 dout(20) << __func__
2912 << " new_log jump seq " << seq_now
2913 << std::hex << " offset 0x" << starter_need + compacted_meta_need
2914 << std::dec << dendl;
2915 // Extend compacted_meta transaction with a jump to the new log tail.
2916 // Hopefully the "compacted_meta_need" estimation provides enough extra space
2917 // for this new jump, assert below if not
2918 compacted_meta_t.op_jump(seq_now, starter_need + compacted_meta_need);
2919 // Now do encoding and padding
2920 bufferlist compacted_meta_bl;
2921 compacted_meta_bl.reserve(compacted_meta_need);
2922 encode(compacted_meta_t, compacted_meta_bl);
2923 ceph_assert(compacted_meta_bl.length() <= compacted_meta_need);
2924 _pad_bl(compacted_meta_bl, compacted_meta_need);
2925
2926 //
2927 // Part 3.
2928 // Write out new log's content
2929 // 3.1 Stage new log's content writing
2930 // 3.2 Do flush and wait for completion through flush_bdev()
2931 //
2932
2933 // 3.1 Stage new log's content writing
2934 // Make new log writer and append bufferlists to write out.
2935 FileWriter *new_log_writer = _create_writer(new_log);
2936 // And append all new log's bufferlists to write out.
2937 new_log_writer->append(starter_bl);
2938 new_log_writer->append(compacted_meta_bl);
2939
2940 // 3.2. flush and wait
20effc67 2941 _flush_special(new_log_writer);
39ae355f 2942 _flush_bdev(new_log_writer, false); // do not check log.lock is locked
7c673cae 2943
39ae355f
TL
2944 // Part 4.
2945 // Write out new superblock to reflect all the changes.
2946 //
7c673cae 2947
20effc67
TL
2948 _write_super(BDEV_DB);
2949 _flush_bdev();
2950
39ae355f
TL
2951 // Part 5.
2952 // Apply new log fnode
2953 //
2954
2955 // we need to acquire log's lock back at this point
20effc67 2956 log.lock.lock();
39ae355f 2957 // Reconstruct actual log object from the new one.
9f95a23c 2958 vselector->sub_usage(log_file->vselector_hint, log_file->fnode);
39ae355f
TL
2959 log_file->fnode.size =
2960 log.writer->pos - old_log_jump_to + starter_need + compacted_meta_need;
2961 log_file->fnode.mtime = std::max(mtime, log_file->fnode.mtime);
2962 log_file->fnode.swap_extents(new_log->fnode);
2963 // update log's writer
2964 log.writer->pos = log.writer->file->fnode.size;
9f95a23c 2965 vselector->add_usage(log_file->vselector_hint, log_file->fnode);
39ae355f 2966 // and unlock
20effc67 2967 log.lock.unlock();
7c673cae 2968
39ae355f
TL
2969 // we're mostly done
2970 dout(10) << __func__ << " log extents " << log_file->fnode.extents << dendl;
2971 logger->inc(l_bluefs_log_compactions);
2972
2973 //Part 6.
2974 // Finalization
2975 // 6.1 Permit log's extension, forbidden at step 0.
2976 //
2977 // 6.2 Release the new log writer
2978 //
2979 // 6.3 Release old space
2980 //
2981 // 6.4. Enable other compactions
2982 //
2983
2984 // 6.1 Permit log's extension, forbidden at step 0.
20effc67
TL
2985 old_forbidden = atomic_exchange(&log_forbidden_to_expand, false);
2986 ceph_assert(old_forbidden == true);
2987 //to wake up if someone was in need of expanding log
2988 log_cond.notify_all();
7c673cae 2989
39ae355f
TL
2990 // 6.2 Release the new log writer
2991 _close_writer(new_log_writer);
2992 new_log_writer = nullptr;
2993 new_log = nullptr;
2994
2995 // 6.3 Release old space
20effc67 2996 {
39ae355f
TL
2997 dout(10) << __func__
2998 << " release old log extents " << old_log_fnode.extents
2999 << dendl;
20effc67 3000 std::lock_guard dl(dirty.lock);
39ae355f 3001 for (auto& r : old_log_fnode.extents) {
20effc67
TL
3002 dirty.pending_release[r.bdev].insert(r.offset, r.length);
3003 }
7c673cae
FG
3004 }
3005
39ae355f 3006 // 6.4. Enable other compactions
20effc67
TL
3007 old_is_comp = atomic_exchange(&log_is_compacting, false);
3008 ceph_assert(old_is_comp);
7c673cae
FG
3009}
3010
39ae355f 3011void BlueFS::_pad_bl(bufferlist& bl, uint64_t pad_size)
7c673cae 3012{
39ae355f
TL
3013 pad_size = std::max(pad_size, uint64_t(super.block_size));
3014 uint64_t partial = bl.length() % pad_size;
7c673cae
FG
3015 if (partial) {
3016 dout(10) << __func__ << " padding with 0x" << std::hex
39ae355f
TL
3017 << pad_size - partial << " zeros" << std::dec << dendl;
3018 bl.append_zero(pad_size - partial);
7c673cae
FG
3019 }
3020}
3021
7c673cae 3022
20effc67
TL
3023// Returns log seq that was live before advance.
3024uint64_t BlueFS::_log_advance_seq()
7c673cae 3025{
20effc67
TL
3026 ceph_assert(ceph_mutex_is_locked(dirty.lock));
3027 ceph_assert(ceph_mutex_is_locked(log.lock));
3028 //acquire new seq
3029 // this will became seq_stable once we write
3030 ceph_assert(dirty.seq_stable < dirty.seq_live);
3031 ceph_assert(log.t.seq == log.seq_live);
3032 uint64_t seq = log.seq_live;
3033 log.t.uuid = super.uuid;
3034
3035 ++dirty.seq_live;
3036 ++log.seq_live;
3037 ceph_assert(dirty.seq_live == log.seq_live);
3038 return seq;
3039}
7c673cae 3040
a8e16298 3041
20effc67
TL
3042// Adds to log.t file modifications mentioned in `dirty.files`.
3043// Note: some bluefs ops may have already been stored in log.t transaction.
3044void BlueFS::_consume_dirty(uint64_t seq)
3045{
3046 ceph_assert(ceph_mutex_is_locked(dirty.lock));
3047 ceph_assert(ceph_mutex_is_locked(log.lock));
7c673cae
FG
3048
3049 // log dirty files
20effc67
TL
3050 // we just incremented log_seq. It is now illegal to add to dirty.files[log_seq]
3051 auto lsi = dirty.files.find(seq);
3052 if (lsi != dirty.files.end()) {
3053 dout(20) << __func__ << " " << lsi->second.size() << " dirty.files" << dendl;
7c673cae 3054 for (auto &f : lsi->second) {
20effc67
TL
3055 // fnode here is protected indirectly
3056 // the only path that adds to dirty.files goes from _fsync()
3057 // _fsync() is executed under writer lock,
3058 // and does not exit until syncing log is done
3059 dout(20) << __func__ << " op_file_update_inc " << f.fnode << dendl;
3060 log.t.op_file_update_inc(f.fnode);
7c673cae
FG
3061 }
3062 }
20effc67 3063}
7c673cae 3064
20effc67
TL
3065// Extends log if its free space is smaller then bluefs_min_log_runway.
3066// Returns space available *BEFORE* adding new space. Signed for additional <0 detection.
3067int64_t BlueFS::_maybe_extend_log()
3068{
3069 ceph_assert(ceph_mutex_is_locked(log.lock));
7c673cae 3070 // allocate some more space (before we run out)?
20effc67
TL
3071 // BTW: this triggers `flush()` in the `page_aligned_appender` of `log.writer`.
3072 int64_t runway = log.writer->file->fnode.get_allocated() -
3073 log.writer->get_effective_write_pos();
7c673cae
FG
3074 if (runway < (int64_t)cct->_conf->bluefs_min_log_runway) {
3075 dout(10) << __func__ << " allocating more log runway (0x"
3076 << std::hex << runway << std::dec << " remaining)" << dendl;
20effc67
TL
3077 /*
3078 * Usually, when we are low on space in log, we just allocate new extent,
3079 * put update op(log) to log and we are fine.
3080 * Problem - it interferes with log compaction:
3081 * New log produced in compaction will include - as last op - jump into some offset (anchor) of current log.
3082 * It is assumed that log region (anchor - end) will contain all changes made by bluefs since
3083 * full state capture into new log.
3084 * Putting log update into (anchor - end) region is illegal, because any update there must be compatible with
3085 * both logs, but old log is different then new log.
3086 *
3087 * Possible solutions:
3088 * - stall extending log until we finish compacting and switch log (CURRENT)
3089 * - re-run compaction with more runway for old log
3090 * - add OP_FILE_ADDEXT that adds extent; will be compatible with both logs
3091 */
3092 if (log_forbidden_to_expand.load() == true) {
3093 return -EWOULDBLOCK;
7c673cae 3094 }
20effc67 3095 vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
9f95a23c 3096 int r = _allocate(
20effc67 3097 vselector->select_prefer_bdev(log.writer->file->vselector_hint),
9f95a23c 3098 cct->_conf->bluefs_max_log_runway,
39ae355f 3099 0,
20effc67 3100 &log.writer->file->fnode);
11fdf7f2 3101 ceph_assert(r == 0);
20effc67
TL
3102 vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode);
3103 log.t.op_file_update_inc(log.writer->file->fnode);
7c673cae 3104 }
20effc67
TL
3105 return runway;
3106}
3107
3108void BlueFS::_flush_and_sync_log_core(int64_t runway)
3109{
3110 ceph_assert(ceph_mutex_is_locked(log.lock));
3111 dout(10) << __func__ << " " << log.t << dendl;
7c673cae
FG
3112
3113 bufferlist bl;
11fdf7f2 3114 bl.reserve(super.block_size);
20effc67 3115 encode(log.t, bl);
7c673cae 3116 // pad to block boundary
11fdf7f2
TL
3117 size_t realign = super.block_size - (bl.length() % super.block_size);
3118 if (realign && realign != super.block_size)
3119 bl.append_zero(realign);
3120
1e59de90 3121 logger->inc(l_bluefs_log_write_count, 1);
7c673cae
FG
3122 logger->inc(l_bluefs_logged_bytes, bl.length());
3123
20effc67 3124 if (true) {
f6b5b4d7 3125 ceph_assert(bl.length() <= runway); // if we write this, we will have an unrecoverable data loss
20effc67 3126 // transaction will not fit extents before growth -> data loss on _replay
f6b5b4d7
TL
3127 }
3128
20effc67 3129 log.writer->append(bl);
7c673cae 3130
20effc67
TL
3131 // prepare log for new transactions
3132 log.t.clear();
3133 log.t.seq = log.seq_live;
7c673cae 3134
20effc67
TL
3135 uint64_t new_data = _flush_special(log.writer);
3136 vselector->add_usage(log.writer->file->vselector_hint, new_data);
3137}
7c673cae 3138
20effc67
TL
3139// Clears dirty.files up to (including) seq_stable.
3140void BlueFS::_clear_dirty_set_stable_D(uint64_t seq)
3141{
3142 std::lock_guard dl(dirty.lock);
7c673cae
FG
3143
3144 // clean dirty files
20effc67
TL
3145 if (seq > dirty.seq_stable) {
3146 dirty.seq_stable = seq;
3147 dout(20) << __func__ << " seq_stable " << dirty.seq_stable << dendl;
3148
3149 // undirty all files that were already streamed to log
3150 auto p = dirty.files.begin();
3151 while (p != dirty.files.end()) {
3152 if (p->first > dirty.seq_stable) {
7c673cae
FG
3153 dout(20) << __func__ << " done cleaning up dirty files" << dendl;
3154 break;
3155 }
3156
3157 auto l = p->second.begin();
3158 while (l != p->second.end()) {
3159 File *file = &*l;
20effc67
TL
3160 ceph_assert(file->dirty_seq <= dirty.seq_stable);
3161 dout(20) << __func__ << " cleaned file " << file->fnode.ino << dendl;
3162 file->dirty_seq = dirty.seq_stable;
7c673cae
FG
3163 p->second.erase(l++);
3164 }
3165
11fdf7f2 3166 ceph_assert(p->second.empty());
20effc67 3167 dirty.files.erase(p++);
7c673cae
FG
3168 }
3169 } else {
20effc67 3170 dout(20) << __func__ << " seq_stable " << dirty.seq_stable
7c673cae
FG
3171 << " already >= out seq " << seq
3172 << ", we lost a race against another log flush, done" << dendl;
3173 }
20effc67 3174}
a8e16298 3175
20effc67
TL
3176void BlueFS::_release_pending_allocations(vector<interval_set<uint64_t>>& to_release)
3177{
a8e16298 3178 for (unsigned i = 0; i < to_release.size(); ++i) {
1e59de90
TL
3179 if (to_release[i].empty()) {
3180 continue;
3181 }
3182 /* OK, now we have the guarantee alloc[i] won't be null. */
3183
3184 bool discard_queued = bdev[i]->try_discard(to_release[i]);
3185 if (!discard_queued) {
a8e16298 3186 alloc[i]->release(to_release[i]);
f67539c2
TL
3187 if (is_shared_alloc(i)) {
3188 shared_alloc->bluefs_used -= to_release[i].size();
3189 }
a8e16298
TL
3190 }
3191 }
20effc67
TL
3192}
3193
// Writes the pending log transaction (plus updates for the files recorded as
// dirty at the current seq) to the log and flushes it to the device.
//
// want_seq: when non-zero and already <= dirty.seq_stable, nothing is written
//           and 0 is returned immediately.
// Locking: takes log.lock, then dirty.lock; both are released before return.
// If the log needs more runway while expansion is forbidden
// (_maybe_extend_log() returns -EWOULDBLOCK), waits on log_cond until
// log_forbidden_to_expand is cleared and then retries from the top.
// Always returns 0.
3194int BlueFS::_flush_and_sync_log_LD(uint64_t want_seq)
3195{
3196 int64_t available_runway;
3197 do {
3198 log.lock.lock();
3199 dirty.lock.lock();
3200 if (want_seq && want_seq <= dirty.seq_stable) {
3201 dout(10) << __func__ << " want_seq " << want_seq << " <= seq_stable "
3202 << dirty.seq_stable << ", done" << dendl;
3203 dirty.lock.unlock();
3204 log.lock.unlock();
3205 return 0;
3206 }
3207
3208 available_runway = _maybe_extend_log();
3209 if (available_runway == -EWOULDBLOCK) {
3210 // we are in need of adding runway, but we are during log-switch from compaction
3211 dirty.lock.unlock();
3212 // instead of calling log.lock.unlock(), move ownership of log.lock to `ll`;
// `ll` releases log.lock when this scope ends, and the loop re-acquires it
3213 std::unique_lock<ceph::mutex> ll(log.lock, std::adopt_lock);
3214 while (log_forbidden_to_expand.load()) {
3215 log_cond.wait(ll);
3216 }
3217 } else {
3218 ceph_assert(available_runway >= 0);
3219 }
3220 } while (available_runway < 0);
3221
3222 ceph_assert(want_seq == 0 || want_seq <= dirty.seq_live); // illegal to request seq that was not created yet
3223 uint64_t seq =_log_advance_seq();
3224 _consume_dirty(seq);
// take over the pending releases; they are processed after the log is synced
3225 vector<interval_set<uint64_t>> to_release(dirty.pending_release.size());
3226 to_release.swap(dirty.pending_release);
3227 dirty.lock.unlock();
3228
3229 _flush_and_sync_log_core(available_runway);
3230 _flush_bdev(log.writer);
3231 logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
3232 //now log.lock is no longer needed
3233 log.lock.unlock();
3234
3235 _clear_dirty_set_stable_D(seq);
3236 _release_pending_allocations(to_release);
a8e16298 3237
7c673cae 3238 _update_logger_stats();
20effc67
TL
3239 return 0;
3240}
3241
3242// Flushes log and immediately adjusts log_writer pos.
3243int BlueFS::_flush_and_sync_log_jump_D(uint64_t jump_to,
3244 int64_t available_runway)
3245{
3246 ceph_assert(ceph_mutex_is_locked(log.lock));
3247
3248 ceph_assert(jump_to);
3249 // we synchronize writing to log, by lock to log.lock
3250
3251 dirty.lock.lock();
3252 uint64_t seq =_log_advance_seq();
3253 _consume_dirty(seq);
3254 vector<interval_set<uint64_t>> to_release(dirty.pending_release.size());
3255 to_release.swap(dirty.pending_release);
3256 dirty.lock.unlock();
3257 _flush_and_sync_log_core(available_runway);
7c673cae 3258
20effc67
TL
3259 dout(10) << __func__ << " jumping log offset from 0x" << std::hex
3260 << log.writer->pos << " -> 0x" << jump_to << std::dec << dendl;
3261 log.writer->pos = jump_to;
3262 vselector->sub_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size);
3263 log.writer->file->fnode.size = jump_to;
3264 vselector->add_usage(log.writer->file->vselector_hint, log.writer->file->fnode.size);
3265
3266 _flush_bdev(log.writer);
3267
3268 _clear_dirty_set_stable_D(seq);
3269 _release_pending_allocations(to_release);
3270
3271 logger->set(l_bluefs_log_bytes, log.writer->file->fnode.size);
3272 _update_logger_stats();
7c673cae
FG
3273 return 0;
3274}
3275
f67539c2
TL
// Moves pending data out of this writer into a bufferlist that is returned
// to the caller.
//
// partial: when true, the cached tail_block is moved to the front of the
//          result before data from `buffer` is taken
// length:  total number of bytes wanted in the result (tail_block content
//          included when `partial` is set)
// super:   supplies block_size / block_mask() used for the tail handling
//
// If the assembled data has a non-zero remainder according to
// ~super.block_mask() (presumably the offset within a block - confirm against
// bluefs_super_t), the result is zero-padded by (block_size - remainder) and a
// copy of that unaligned tail is kept in tail_block; otherwise tail_block is
// cleared.
3276ceph::bufferlist BlueFS::FileWriter::flush_buffer(
3277 CephContext* const cct,
3278 const bool partial,
3279 const unsigned length,
3280 const bluefs_super_t& super)
3281{
20effc67 3282 ceph_assert(ceph_mutex_is_locked(this->lock) || file->fnode.ino <= 1);
f67539c2
TL
3283 ceph::bufferlist bl;
3284 if (partial) {
3285 tail_block.splice(0, tail_block.length(), &bl);
3286 }
// take from `buffer` whatever is still missing to reach `length`
3287 const auto remaining_len = length - bl.length();
3288 buffer.splice(0, remaining_len, &bl);
3289 if (buffer.length()) {
3290 dout(20) << " leaving 0x" << std::hex << buffer.length() << std::dec
3291 << " unflushed" << dendl;
3292 }
3293 if (const unsigned tail = bl.length() & ~super.block_mask(); tail) {
3294 const auto padding_len = super.block_size - tail;
3295 dout(20) << __func__ << " caching tail of 0x"
3296 << std::hex << tail
3297 << " and padding block with 0x" << padding_len
3298 << " buffer.length() " << buffer.length()
3299 << std::dec << dendl;
3300 // We need to go through the `buffer_appender` to get a chance to
3301 // preserve in-memory contiguity and not mess with the alignment.
3302 // Otherwise a costly rebuild could happen in e.g. `KernelDevice`.
3303 buffer_appender.append_zero(padding_len);
// move the just-appended zeros from the end of `buffer` to the end of `bl`
3304 buffer.splice(buffer.length() - padding_len, padding_len, &bl);
3305 // Deep copy the tail here. This allows to avoid costlier copy on
3306 // bufferlist rebuild in e.g. `KernelDevice` and minimizes number
3307 // of memory allocations.
3308 // The alternative approach would be to place the entire tail and
3309 // padding on a dedicated, 4 KB long memory chunk. This shouldn't
3310 // trigger the rebuild while still being less expensive.
3311 buffer_appender.substr_of(bl, bl.length() - padding_len - tail, tail);
// and move that copy from the end of `buffer` into tail_block
3312 buffer.splice(buffer.length() - tail, tail, &tail_block);
3313 } else {
3314 tail_block.clear();
3315 }
3316 return bl;
3317}
3318
20effc67 3319int BlueFS::_signal_dirty_to_log_D(FileWriter *h)
522d829b 3320{
20effc67
TL
3321 ceph_assert(ceph_mutex_is_locked(h->lock));
3322 std::lock_guard dl(dirty.lock);
1e59de90
TL
3323 if (h->file->deleted) {
3324 dout(10) << __func__ << " deleted, no-op" << dendl;
3325 return 0;
3326 }
3327
522d829b
TL
3328 h->file->fnode.mtime = ceph_clock_now();
3329 ceph_assert(h->file->fnode.ino >= 1);
20effc67
TL
3330 if (h->file->dirty_seq <= dirty.seq_stable) {
3331 h->file->dirty_seq = dirty.seq_live;
3332 dirty.files[h->file->dirty_seq].push_back(*h->file);
3333 dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
522d829b
TL
3334 << " (was clean)" << dendl;
3335 } else {
20effc67 3336 if (h->file->dirty_seq != dirty.seq_live) {
522d829b 3337 // need re-dirty, erase from list first
20effc67
TL
3338 ceph_assert(dirty.files.count(h->file->dirty_seq));
3339 auto it = dirty.files[h->file->dirty_seq].iterator_to(*h->file);
3340 dirty.files[h->file->dirty_seq].erase(it);
3341 h->file->dirty_seq = dirty.seq_live;
3342 dirty.files[h->file->dirty_seq].push_back(*h->file);
3343 dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
522d829b
TL
3344 << " (was " << h->file->dirty_seq << ")" << dendl;
3345 } else {
20effc67 3346 dout(20) << __func__ << " dirty_seq = " << dirty.seq_live
522d829b
TL
3347 << " (unchanged, do nothing) " << dendl;
3348 }
3349 }
3350 return 0;
3351}
3352
20effc67 3353void BlueFS::flush_range(FileWriter *h, uint64_t offset, uint64_t length)/*_WF*/
7c673cae 3354{
20effc67
TL
3355 _maybe_check_vselector_LNF();
3356 std::unique_lock hl(h->lock);
3357 _flush_range_F(h, offset, length);
3358}
3359
3360int BlueFS::_flush_range_F(FileWriter *h, uint64_t offset, uint64_t length)
3361{
3362 ceph_assert(ceph_mutex_is_locked(h->lock));
3363 ceph_assert(h->file->num_readers.load() == 0);
3364 ceph_assert(h->file->fnode.ino > 1);
3365
7c673cae
FG
3366 dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
3367 << " 0x" << offset << "~" << length << std::dec
3368 << " to " << h->file->fnode << dendl;
f67539c2
TL
3369 if (h->file->deleted) {
3370 dout(10) << __func__ << " deleted, no-op" << dendl;
3371 return 0;
3372 }
7c673cae 3373
20effc67 3374 bool buffered = cct->_conf->bluefs_buffered_io;
7c673cae
FG
3375
3376 if (offset + length <= h->pos)
3377 return 0;
3378 if (offset < h->pos) {
3379 length -= h->pos - offset;
3380 offset = h->pos;
3381 dout(10) << " still need 0x"
3382 << std::hex << offset << "~" << length << std::dec
3383 << dendl;
3384 }
20effc67 3385 std::lock_guard file_lock(h->file->lock);
11fdf7f2 3386 ceph_assert(offset <= h->file->fnode.size);
7c673cae
FG
3387
3388 uint64_t allocated = h->file->fnode.get_allocated();
9f95a23c 3389 vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
7c673cae
FG
3390 // do not bother to dirty the file if we are overwriting
3391 // previously allocated extents.
7c673cae
FG
3392 if (allocated < offset + length) {
3393 // we should never run out of log space here; see the min runway check
3394 // in _flush_and_sync_log.
9f95a23c 3395 int r = _allocate(vselector->select_prefer_bdev(h->file->vselector_hint),
7c673cae 3396 offset + length - allocated,
39ae355f 3397 0,
94b18763 3398 &h->file->fnode);
7c673cae
FG
3399 if (r < 0) {
3400 derr << __func__ << " allocated: 0x" << std::hex << allocated
3401 << " offset: 0x" << offset << " length: 0x" << length << std::dec
3402 << dendl;
9f95a23c 3403 vselector->add_usage(h->file->vselector_hint, h->file->fnode); // undo
11fdf7f2 3404 ceph_abort_msg("bluefs enospc");
7c673cae
FG
3405 return r;
3406 }
522d829b 3407 h->file->is_dirty = true;
7c673cae
FG
3408 }
3409 if (h->file->fnode.size < offset + length) {
3410 h->file->fnode.size = offset + length;
20effc67 3411 h->file->is_dirty = true;
7c673cae 3412 }
20effc67 3413
522d829b 3414 dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl;
20effc67
TL
3415 int res = _flush_data(h, offset, length, buffered);
3416 vselector->add_usage(h->file->vselector_hint, h->file->fnode);
3417 return res;
3418}
7c673cae 3419
20effc67
TL
3420int BlueFS::_flush_data(FileWriter *h, uint64_t offset, uint64_t length, bool buffered)
3421{
3422 if (h->file->fnode.ino > 1) {
3423 ceph_assert(ceph_mutex_is_locked(h->lock));
3424 ceph_assert(ceph_mutex_is_locked(h->file->lock));
3425 }
7c673cae
FG
3426 uint64_t x_off = 0;
3427 auto p = h->file->fnode.seek(offset, &x_off);
11fdf7f2 3428 ceph_assert(p != h->file->fnode.extents.end());
7c673cae
FG
3429 dout(20) << __func__ << " in " << *p << " x_off 0x"
3430 << std::hex << x_off << std::dec << dendl;
3431
3432 unsigned partial = x_off & ~super.block_mask();
7c673cae
FG
3433 if (partial) {
3434 dout(20) << __func__ << " using partial tail 0x"
3435 << std::hex << partial << std::dec << dendl;
7c673cae
FG
3436 x_off -= partial;
3437 offset -= partial;
3438 length += partial;
3439 dout(20) << __func__ << " waiting for previous aio to complete" << dendl;
3440 for (auto p : h->iocv) {
3441 if (p) {
3442 p->aio_wait();
3443 }
3444 }
3445 }
7c673cae 3446
f67539c2
TL
3447 auto bl = h->flush_buffer(cct, partial, length, super);
3448 ceph_assert(bl.length() >= length);
9f95a23c 3449 h->pos = offset + length;
f67539c2 3450 length = bl.length();
9f95a23c 3451
1e59de90
TL
3452 logger->inc(l_bluefs_write_count, 1);
3453 logger->inc(l_bluefs_write_bytes, length);
3454
7c673cae
FG
3455 switch (h->writer_type) {
3456 case WRITER_WAL:
1e59de90 3457 logger->inc(l_bluefs_write_count_wal, 1);
7c673cae
FG
3458 logger->inc(l_bluefs_bytes_written_wal, length);
3459 break;
3460 case WRITER_SST:
1e59de90 3461 logger->inc(l_bluefs_write_count_sst, 1);
7c673cae
FG
3462 logger->inc(l_bluefs_bytes_written_sst, length);
3463 break;
3464 }
3465
3466 dout(30) << "dump:\n";
3467 bl.hexdump(*_dout);
3468 *_dout << dendl;
3469
7c673cae 3470 uint64_t bloff = 0;
11fdf7f2 3471 uint64_t bytes_written_slow = 0;
7c673cae 3472 while (length > 0) {
1e59de90
TL
3473 logger->inc(l_bluefs_write_disk_count, 1);
3474
11fdf7f2 3475 uint64_t x_len = std::min(p->length - x_off, length);
7c673cae
FG
3476 bufferlist t;
3477 t.substr_of(bl, bloff, x_len);
7c673cae 3478 if (cct->_conf->bluefs_sync_write) {
11fdf7f2 3479 bdev[p->bdev]->write(p->offset + x_off, t, buffered, h->write_hint);
7c673cae 3480 } else {
11fdf7f2
TL
3481 bdev[p->bdev]->aio_write(p->offset + x_off, t, h->iocv[p->bdev], buffered, h->write_hint);
3482 }
3483 h->dirty_devs[p->bdev] = true;
3484 if (p->bdev == BDEV_SLOW) {
3485 bytes_written_slow += t.length();
7c673cae 3486 }
11fdf7f2 3487
7c673cae
FG
3488 bloff += x_len;
3489 length -= x_len;
3490 ++p;
3491 x_off = 0;
3492 }
f67539c2
TL
3493 if (bytes_written_slow) {
3494 logger->inc(l_bluefs_bytes_written_slow, bytes_written_slow);
3495 }
7c673cae
FG
3496 for (unsigned i = 0; i < MAX_BDEV; ++i) {
3497 if (bdev[i]) {
11fdf7f2 3498 if (h->iocv[i] && h->iocv[i]->has_pending_aios()) {
7c673cae
FG
3499 bdev[i]->aio_submit(h->iocv[i]);
3500 }
3501 }
3502 }
3503 dout(20) << __func__ << " h " << h << " pos now 0x"
3504 << std::hex << h->pos << std::dec << dendl;
3505 return 0;
3506}
3507
11fdf7f2 3508#ifdef HAVE_LIBAIO
7c673cae
FG
3509// we need to retire old completed aios so they don't stick around in
3510// memory indefinitely (along with their bufferlist refs).
3511void BlueFS::_claim_completed_aios(FileWriter *h, list<aio_t> *ls)
3512{
3513 for (auto p : h->iocv) {
3514 if (p) {
3515 ls->splice(ls->end(), p->running_aios);
3516 }
3517 }
3518 dout(10) << __func__ << " got " << ls->size() << " aios" << dendl;
3519}
3520
// Blocks until all aios queued on every io context of writer `h` have
// completed, logging how long the wait took.
20effc67 3521void BlueFS::_wait_for_aio(FileWriter *h)
7c673cae
FG
3522{
3523 // NOTE: this is safe to call without a lock, as long as our reference is
3524 // stable.
f67539c2
TL
3525 utime_t start;
// NOTE(review): the assignment below sits between the opening of the log
// statement and its dendl; presumably this places it inside the macro's
// "should we log at this level" block so the clock is only read when level 10
// logging is enabled - confirm against the dout macro definitions before
// reordering these three lines.
3526 lgeneric_subdout(cct, bluefs, 10) << __func__;
3527 start = ceph_clock_now();
3528 *_dout << " " << h << dendl;
7c673cae
FG
3529 for (auto p : h->iocv) {
3530 if (p) {
3531 p->aio_wait();
3532 }
3533 }
11fdf7f2 3534 dout(10) << __func__ << " " << h << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 3535}
11fdf7f2 3536#endif
7c673cae 3537
20effc67
TL
3538void BlueFS::append_try_flush(FileWriter *h, const char* buf, size_t len)/*_WF_LNF_NF_LD_D*/
3539{
3540 bool flushed_sum = false;
3541 {
3542 std::unique_lock hl(h->lock);
3543 size_t max_size = 1ull << 30; // cap to 1GB
3544 while (len > 0) {
3545 bool need_flush = true;
3546 auto l0 = h->get_buffer_length();
3547 if (l0 < max_size) {
3548 size_t l = std::min(len, max_size - l0);
3549 h->append(buf, l);
3550 buf += l;
3551 len -= l;
3552 need_flush = h->get_buffer_length() >= cct->_conf->bluefs_min_flush_size;
3553 }
3554 if (need_flush) {
3555 bool flushed = false;
3556 int r = _flush_F(h, true, &flushed);
3557 ceph_assert(r == 0);
3558 flushed_sum |= flushed;
3559 // make sure we've made any progress with flush hence the
3560 // loop doesn't iterate forever
3561 ceph_assert(h->get_buffer_length() < max_size);
3562 }
3563 }
3564 }
3565 if (flushed_sum) {
3566 _maybe_compact_log_LNF_NF_LD_D();
3567 }
3568}
3569
3570void BlueFS::flush(FileWriter *h, bool force)/*_WF_LNF_NF_LD_D*/
f6b5b4d7
TL
3571{
3572 bool flushed = false;
20effc67
TL
3573 int r;
3574 {
3575 std::unique_lock hl(h->lock);
3576 r = _flush_F(h, force, &flushed);
3577 ceph_assert(r == 0);
3578 }
f6b5b4d7 3579 if (r == 0 && flushed) {
20effc67 3580 _maybe_compact_log_LNF_NF_LD_D();
f6b5b4d7 3581 }
f6b5b4d7
TL
3582}
3583
20effc67 3584int BlueFS::_flush_F(FileWriter *h, bool force, bool *flushed)
7c673cae 3585{
20effc67 3586 ceph_assert(ceph_mutex_is_locked(h->lock));
f67539c2 3587 uint64_t length = h->get_buffer_length();
7c673cae 3588 uint64_t offset = h->pos;
f6b5b4d7
TL
3589 if (flushed) {
3590 *flushed = false;
3591 }
7c673cae
FG
3592 if (!force &&
3593 length < cct->_conf->bluefs_min_flush_size) {
3594 dout(10) << __func__ << " " << h << " ignoring, length " << length
3595 << " < min_flush_size " << cct->_conf->bluefs_min_flush_size
3596 << dendl;
3597 return 0;
3598 }
3599 if (length == 0) {
3600 dout(10) << __func__ << " " << h << " no dirty data on "
3601 << h->file->fnode << dendl;
3602 return 0;
3603 }
3604 dout(10) << __func__ << " " << h << " 0x"
3605 << std::hex << offset << "~" << length << std::dec
3606 << " to " << h->file->fnode << dendl;
11fdf7f2 3607 ceph_assert(h->pos <= h->file->fnode.size);
20effc67 3608 int r = _flush_range_F(h, offset, length);
f6b5b4d7
TL
3609 if (flushed) {
3610 *flushed = true;
3611 }
3612 return r;
7c673cae
FG
3613}
3614
20effc67
TL
3615// Flush for bluefs special files.
3616// Does not add extents to h.
3617// Does not mark h as dirty.
3618// we do not need to dirty the log file (or it's compacting
3619// replacement) when the file size changes because replay is
3620// smart enough to discover it on its own.
3621uint64_t BlueFS::_flush_special(FileWriter *h)
3622{
3623 ceph_assert(h->file->fnode.ino <= 1);
3624 uint64_t length = h->get_buffer_length();
3625 uint64_t offset = h->pos;
3626 uint64_t new_data = 0;
3627 ceph_assert(length + offset <= h->file->fnode.get_allocated());
3628 if (h->file->fnode.size < offset + length) {
3629 new_data = offset + length - h->file->fnode.size;
3630 h->file->fnode.size = offset + length;
3631 }
3632 _flush_data(h, offset, length, false);
3633 return new_data;
3634}
3635
3636int BlueFS::truncate(FileWriter *h, uint64_t offset)/*_WF_L*/
7c673cae 3637{
20effc67 3638 std::lock_guard hl(h->lock);
7c673cae
FG
3639 dout(10) << __func__ << " 0x" << std::hex << offset << std::dec
3640 << " file " << h->file->fnode << dendl;
3641 if (h->file->deleted) {
3642 dout(10) << __func__ << " deleted, no-op" << dendl;
3643 return 0;
3644 }
3645
3646 // we never truncate internal log files
11fdf7f2 3647 ceph_assert(h->file->fnode.ino > 1);
7c673cae 3648
7c673cae
FG
3649 // truncate off unflushed data?
3650 if (h->pos < offset &&
f67539c2 3651 h->pos + h->get_buffer_length() > offset) {
7c673cae
FG
3652 dout(20) << __func__ << " tossing out last " << offset - h->pos
3653 << " unflushed bytes" << dendl;
11fdf7f2 3654 ceph_abort_msg("actually this shouldn't happen");
7c673cae 3655 }
f67539c2 3656 if (h->get_buffer_length()) {
20effc67 3657 int r = _flush_F(h, true);
7c673cae
FG
3658 if (r < 0)
3659 return r;
3660 }
3661 if (offset == h->file->fnode.size) {
3662 return 0; // no-op!
3663 }
3664 if (offset > h->file->fnode.size) {
11fdf7f2 3665 ceph_abort_msg("truncate up not supported");
7c673cae 3666 }
11fdf7f2 3667 ceph_assert(h->file->fnode.size >= offset);
20effc67
TL
3668 _flush_bdev(h);
3669
3670 std::lock_guard ll(log.lock);
9f95a23c 3671 vselector->sub_usage(h->file->vselector_hint, h->file->fnode.size);
7c673cae 3672 h->file->fnode.size = offset;
1e59de90 3673 h->file->is_dirty = true;
9f95a23c 3674 vselector->add_usage(h->file->vselector_hint, h->file->fnode.size);
20effc67 3675 log.t.op_file_update_inc(h->file->fnode);
7c673cae
FG
3676 return 0;
3677}
3678
20effc67 3679int BlueFS::fsync(FileWriter *h)/*_WF_WD_WLD_WLNF_WNF*/
7c673cae 3680{
20effc67
TL
3681 _maybe_check_vselector_LNF();
3682 std::unique_lock hl(h->lock);
3683 uint64_t old_dirty_seq = 0;
3684 {
1e59de90
TL
3685 dout(10) << __func__ << " " << h << " " << h->file->fnode
3686 << " dirty " << h->file->is_dirty << dendl;
20effc67
TL
3687 int r = _flush_F(h, true);
3688 if (r < 0)
3689 return r;
3690 _flush_bdev(h);
3691 if (h->file->is_dirty) {
3692 _signal_dirty_to_log_D(h);
3693 h->file->is_dirty = false;
3694 }
3695 {
3696 std::lock_guard dl(dirty.lock);
3697 if (dirty.seq_stable < h->file->dirty_seq) {
3698 old_dirty_seq = h->file->dirty_seq;
3699 dout(20) << __func__ << " file metadata was dirty (" << old_dirty_seq
3700 << ") on " << h->file->fnode << ", flushing log" << dendl;
3701 }
3702 }
522d829b 3703 }
7c673cae 3704 if (old_dirty_seq) {
20effc67 3705 _flush_and_sync_log_LD(old_dirty_seq);
7c673cae 3706 }
20effc67
TL
3707 _maybe_compact_log_LNF_NF_LD_D();
3708
7c673cae
FG
3709 return 0;
3710}
3711
20effc67 3712// be careful - either h->file->lock or log.lock must be taken
39ae355f 3713void BlueFS::_flush_bdev(FileWriter *h, bool check_mutext_locked)
7c673cae 3714{
39ae355f
TL
3715 if (check_mutext_locked) {
3716 if (h->file->fnode.ino > 1) {
3717 ceph_assert(ceph_mutex_is_locked(h->lock));
3718 } else if (h->file->fnode.ino == 1) {
3719 ceph_assert(ceph_mutex_is_locked(log.lock));
3720 }
20effc67 3721 }
11fdf7f2
TL
3722 std::array<bool, MAX_BDEV> flush_devs = h->dirty_devs;
3723 h->dirty_devs.fill(false);
3724#ifdef HAVE_LIBAIO
7c673cae
FG
3725 if (!cct->_conf->bluefs_sync_write) {
3726 list<aio_t> completed_ios;
3727 _claim_completed_aios(h, &completed_ios);
20effc67 3728 _wait_for_aio(h);
7c673cae 3729 completed_ios.clear();
7c673cae 3730 }
20effc67
TL
3731#endif
3732 _flush_bdev(flush_devs);
7c673cae
FG
3733}
3734
20effc67 3735void BlueFS::_flush_bdev(std::array<bool, MAX_BDEV>& dirty_bdevs)
11fdf7f2
TL
3736{
3737 // NOTE: this is safe to call without a lock.
3738 dout(20) << __func__ << dendl;
3739 for (unsigned i = 0; i < MAX_BDEV; i++) {
3740 if (dirty_bdevs[i])
3741 bdev[i]->flush();
3742 }
3743}
3744
20effc67 3745void BlueFS::_flush_bdev()
7c673cae
FG
3746{
3747 // NOTE: this is safe to call without a lock.
3748 dout(20) << __func__ << dendl;
f67539c2
TL
3749 for (unsigned i = 0; i < MAX_BDEV; i++) {
3750 // alloc space from BDEV_SLOW is unexpected.
3751 // So most cases we don't alloc from BDEV_SLOW and so avoiding flush not-used device.
3752 if (bdev[i] && (i != BDEV_SLOW || _get_used(i))) {
3753 bdev[i]->flush();
3754 }
7c673cae
FG
3755 }
3756}
3757
eafe8130
TL
3758const char* BlueFS::get_device_name(unsigned id)
3759{
3760 if (id >= MAX_BDEV) return "BDEV_INV";
3761 const char* names[] = {"BDEV_WAL", "BDEV_DB", "BDEV_SLOW", "BDEV_NEWWAL", "BDEV_NEWDB"};
3762 return names[id];
3763}
3764
7c673cae 3765int BlueFS::_allocate(uint8_t id, uint64_t len,
39ae355f
TL
3766 uint64_t alloc_unit,
3767 bluefs_fnode_t* node,
3768 size_t alloc_attempts,
3769 bool permit_dev_fallback)
3770{
3771 dout(10) << __func__ << " len 0x" << std::hex << len
3772 << " au 0x" << alloc_unit
3773 << std::dec << " from " << (int)id
3774 << " cooldown " << cooldown_deadline
3775 << dendl;
11fdf7f2 3776 ceph_assert(id < alloc.size());
b32b8144 3777 int64_t alloc_len = 0;
a8e16298 3778 PExtentVector extents;
11fdf7f2 3779 uint64_t hint = 0;
f67539c2 3780 int64_t need = len;
39ae355f
TL
3781 bool shared = is_shared_alloc(id);
3782 auto shared_unit = shared_alloc ? shared_alloc->alloc_unit : 0;
3783 bool was_cooldown = false;
7c673cae 3784 if (alloc[id]) {
39ae355f
TL
3785 if (!alloc_unit) {
3786 alloc_unit = alloc_size[id];
3787 }
3788 // do not attempt shared_allocator with bluefs alloc unit
3789 // when cooling down, fallback to slow dev alloc unit.
3790 if (shared && alloc_unit != shared_unit) {
3791 if (duration_cast<seconds>(real_clock::now().time_since_epoch()).count() <
3792 cooldown_deadline) {
3793 logger->inc(l_bluefs_alloc_shared_size_fallbacks);
3794 alloc_unit = shared_unit;
3795 was_cooldown = true;
3796 } else if (cooldown_deadline.fetch_and(0)) {
3797 // we might get false cooldown_deadline reset at this point
3798 // but that's mostly harmless.
3799 dout(1) << __func__ << " shared allocation cooldown period elapsed"
3800 << dendl;
3801 }
3802 }
3803 need = round_up_to(len, alloc_unit);
94b18763
FG
3804 if (!node->extents.empty() && node->extents.back().bdev == id) {
3805 hint = node->extents.back().end();
11fdf7f2 3806 }
39ae355f 3807 ++alloc_attempts;
b32b8144 3808 extents.reserve(4); // 4 should be (more than) enough for most allocations
39ae355f 3809 alloc_len = alloc[id]->allocate(need, alloc_unit, hint, &extents);
b32b8144 3810 }
f67539c2
TL
3811 if (alloc_len < 0 || alloc_len < need) {
3812 if (alloc[id]) {
3813 if (alloc_len > 0) {
3814 alloc[id]->release(extents);
3815 }
39ae355f
TL
3816 if (!was_cooldown && shared) {
3817 auto delay_s = cct->_conf->bluefs_failed_shared_alloc_cooldown;
3818 cooldown_deadline = delay_s +
3819 duration_cast<seconds>(real_clock::now().time_since_epoch()).count();
3820 dout(1) << __func__ << " shared allocation cooldown set for "
3821 << delay_s << "s"
3822 << dendl;
3823 }
f67539c2
TL
3824 dout(1) << __func__ << " unable to allocate 0x" << std::hex << need
3825 << " on bdev " << (int)id
3826 << ", allocator name " << alloc[id]->get_name()
3827 << ", allocator type " << alloc[id]->get_type()
3828 << ", capacity 0x" << alloc[id]->get_capacity()
3829 << ", block size 0x" << alloc[id]->get_block_size()
39ae355f 3830 << ", alloc unit 0x" << alloc_unit
f67539c2
TL
3831 << ", free 0x" << alloc[id]->get_free()
3832 << ", fragmentation " << alloc[id]->get_fragmentation()
3833 << ", allocated 0x" << (alloc_len > 0 ? alloc_len : 0)
3834 << std::dec << dendl;
20effc67 3835 } else {
39ae355f
TL
3836 dout(20) << __func__ << " alloc-id not set on index="<< (int)id
3837 << " unable to allocate 0x" << std::hex << need
20effc67 3838 << " on bdev " << (int)id << std::dec << dendl;
b32b8144 3839 }
39ae355f
TL
3840 if (alloc[id] && shared && alloc_unit != shared_unit) {
3841 alloc_unit = shared_unit;
3842 dout(20) << __func__ << " fallback to bdev "
3843 << (int)id
3844 << " with alloc unit 0x" << std::hex << alloc_unit
3845 << std::dec << dendl;
3846 logger->inc(l_bluefs_alloc_shared_size_fallbacks);
3847 return _allocate(id,
3848 len,
3849 alloc_unit,
3850 node,
3851 alloc_attempts,
3852 permit_dev_fallback);
3853 } else if (permit_dev_fallback && id != BDEV_SLOW && alloc[id + 1]) {
f67539c2 3854 dout(20) << __func__ << " fallback to bdev "
20effc67 3855 << (int)id + 1
f67539c2 3856 << dendl;
39ae355f
TL
3857 if (alloc_attempts > 0 && is_shared_alloc(id + 1)) {
3858 logger->inc(l_bluefs_alloc_shared_dev_fallbacks);
3859 }
3860 return _allocate(id + 1,
3861 len,
3862 0, // back to default alloc unit
3863 node,
3864 alloc_attempts,
3865 permit_dev_fallback);
11fdf7f2 3866 } else {
f67539c2
TL
3867 derr << __func__ << " allocation failed, needed 0x" << std::hex << need
3868 << dendl;
11fdf7f2 3869 }
f67539c2 3870 return -ENOSPC;
11fdf7f2 3871 } else {
f67539c2
TL
3872 uint64_t used = _get_used(id);
3873 if (max_bytes[id] < used) {
3874 logger->set(max_bytes_pcounters[id], used);
3875 max_bytes[id] = used;
3876 }
39ae355f 3877 if (shared) {
f67539c2 3878 shared_alloc->bluefs_used += alloc_len;
11fdf7f2 3879 }
7c673cae
FG
3880 }
3881
3882 for (auto& p : extents) {
94b18763 3883 node->append_extent(bluefs_extent_t(id, p.offset, p.length));
7c673cae
FG
3884 }
3885
3886 return 0;
3887}
3888
20effc67 3889int BlueFS::preallocate(FileRef f, uint64_t off, uint64_t len)/*_LF*/
7c673cae 3890{
20effc67
TL
3891 std::lock_guard ll(log.lock);
3892 std::lock_guard fl(f->lock);
7c673cae
FG
3893 dout(10) << __func__ << " file " << f->fnode << " 0x"
3894 << std::hex << off << "~" << len << std::dec << dendl;
3895 if (f->deleted) {
3896 dout(10) << __func__ << " deleted, no-op" << dendl;
3897 return 0;
3898 }
11fdf7f2 3899 ceph_assert(f->fnode.ino > 1);
7c673cae
FG
3900 uint64_t allocated = f->fnode.get_allocated();
3901 if (off + len > allocated) {
3902 uint64_t want = off + len - allocated;
9f95a23c 3903
20effc67 3904 vselector->sub_usage(f->vselector_hint, f->fnode);
9f95a23c
TL
3905 int r = _allocate(vselector->select_prefer_bdev(f->vselector_hint),
3906 want,
39ae355f 3907 0,
9f95a23c
TL
3908 &f->fnode);
3909 vselector->add_usage(f->vselector_hint, f->fnode);
7c673cae
FG
3910 if (r < 0)
3911 return r;
20effc67
TL
3912
3913 log.t.op_file_update_inc(f->fnode);
7c673cae
FG
3914 }
3915 return 0;
3916}
3917
20effc67 3918void BlueFS::sync_metadata(bool avoid_compact)/*_LNF_NF_LD_D*/
7c673cae 3919{
20effc67
TL
3920 bool can_skip_flush;
3921 {
3922 std::lock_guard ll(log.lock);
3923 std::lock_guard dl(dirty.lock);
3924 can_skip_flush = log.t.empty() && dirty.files.empty();
3925 }
3926 if (can_skip_flush) {
7c673cae 3927 dout(10) << __func__ << " - no pending log events" << dendl;
11fdf7f2 3928 } else {
f67539c2
TL
3929 utime_t start;
3930 lgeneric_subdout(cct, bluefs, 10) << __func__;
3931 start = ceph_clock_now();
3932 *_dout << dendl;
20effc67
TL
3933 _flush_bdev(); // FIXME?
3934 _flush_and_sync_log_LD();
11fdf7f2 3935 dout(10) << __func__ << " done in " << (ceph_clock_now() - start) << dendl;
7c673cae 3936 }
7c673cae 3937
f6b5b4d7 3938 if (!avoid_compact) {
20effc67 3939 _maybe_compact_log_LNF_NF_LD_D();
f6b5b4d7
TL
3940 }
3941}
3942
20effc67 3943void BlueFS::_maybe_compact_log_LNF_NF_LD_D()
f6b5b4d7
TL
3944{
3945 if (!cct->_conf->bluefs_replay_recovery_disable_compact &&
20effc67 3946 _should_start_compact_log_L_N()) {
39ae355f 3947 auto t0 = mono_clock::now();
7c673cae 3948 if (cct->_conf->bluefs_compact_log_sync) {
20effc67 3949 _compact_log_sync_LNF_LD();
7c673cae 3950 } else {
20effc67 3951 _compact_log_async_LD_LNF_D();
7c673cae 3952 }
39ae355f 3953 logger->tinc(l_bluefs_compaction_lat, mono_clock::now() - t0);
7c673cae 3954 }
7c673cae
FG
3955}
3956
3957int BlueFS::open_for_write(
b3b6e05e
TL
3958 std::string_view dirname,
3959 std::string_view filename,
7c673cae 3960 FileWriter **h,
1e59de90 3961 bool overwrite)/*_LND*/
7c673cae 3962{
20effc67
TL
3963 _maybe_check_vselector_LNF();
3964 FileRef file;
3965 bool create = false;
3966 bool truncate = false;
3967 mempool::bluefs::vector<bluefs_extent_t> pending_release_extents;
3968 {
1e59de90
TL
3969 std::lock_guard ll(log.lock);
3970 std::lock_guard nl(nodes.lock);
7c673cae 3971 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
20effc67 3972 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
7c673cae 3973 DirRef dir;
20effc67 3974 if (p == nodes.dir_map.end()) {
7c673cae
FG
3975 // implicitly create the dir
3976 dout(20) << __func__ << " dir " << dirname
3977 << " does not exist" << dendl;
3978 return -ENOENT;
3979 } else {
3980 dir = p->second;
3981 }
3982
7c673cae
FG
3983 map<string,FileRef>::iterator q = dir->file_map.find(filename);
3984 if (q == dir->file_map.end()) {
3985 if (overwrite) {
3986 dout(20) << __func__ << " dir " << dirname << " (" << dir
3987 << ") file " << filename
3988 << " does not exist" << dendl;
3989 return -ENOENT;
3990 }
9f95a23c 3991 file = ceph::make_ref<File>();
7c673cae 3992 file->fnode.ino = ++ino_last;
20effc67 3993 nodes.file_map[ino_last] = file;
b3b6e05e 3994 dir->file_map[string{filename}] = file;
7c673cae
FG
3995 ++file->refs;
3996 create = true;
20effc67 3997 logger->set(l_bluefs_num_files, nodes.file_map.size());
7c673cae
FG
3998 } else {
3999 // overwrite existing file?
4000 file = q->second;
4001 if (overwrite) {
4002 dout(20) << __func__ << " dir " << dirname << " (" << dir
4003 << ") file " << filename
4004 << " already exists, overwrite in place" << dendl;
4005 } else {
4006 dout(20) << __func__ << " dir " << dirname << " (" << dir
4007 << ") file " << filename
4008 << " already exists, truncate + overwrite" << dendl;
9f95a23c 4009 vselector->sub_usage(file->vselector_hint, file->fnode);
7c673cae 4010 file->fnode.size = 0;
20effc67 4011 pending_release_extents.swap(file->fnode.extents);
f6b5b4d7 4012 truncate = true;
94b18763
FG
4013
4014 file->fnode.clear_extents();
7c673cae
FG
4015 }
4016 }
11fdf7f2 4017 ceph_assert(file->fnode.ino > 1);
7c673cae
FG
4018
4019 file->fnode.mtime = ceph_clock_now();
9f95a23c 4020 file->vselector_hint = vselector->get_hint_by_dir(dirname);
f6b5b4d7
TL
4021 if (create || truncate) {
4022 vselector->add_usage(file->vselector_hint, file->fnode); // update file count
4023 }
9f95a23c 4024
7c673cae 4025 dout(20) << __func__ << " mapping " << dirname << "/" << filename
9f95a23c
TL
4026 << " vsel_hint " << file->vselector_hint
4027 << dendl;
20effc67 4028
1e59de90
TL
4029 log.t.op_file_update(file->fnode);
4030 if (create)
4031 log.t.op_dir_link(dirname, filename, file->fnode.ino);
4032
4033 std::lock_guard dl(dirty.lock);
4034 for (auto& p : pending_release_extents) {
4035 dirty.pending_release[p.bdev].insert(p.offset, p.length);
4036 }
20effc67 4037 }
7c673cae
FG
4038 *h = _create_writer(file);
4039
4040 if (boost::algorithm::ends_with(filename, ".log")) {
4041 (*h)->writer_type = BlueFS::WRITER_WAL;
4042 if (logger && !overwrite) {
4043 logger->inc(l_bluefs_files_written_wal);
4044 }
4045 } else if (boost::algorithm::ends_with(filename, ".sst")) {
4046 (*h)->writer_type = BlueFS::WRITER_SST;
4047 if (logger) {
4048 logger->inc(l_bluefs_files_written_sst);
4049 }
4050 }
4051
4052 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
4053 return 0;
4054}
4055
4056BlueFS::FileWriter *BlueFS::_create_writer(FileRef f)
4057{
4058 FileWriter *w = new FileWriter(f);
4059 for (unsigned i = 0; i < MAX_BDEV; ++i) {
4060 if (bdev[i]) {
4061 w->iocv[i] = new IOContext(cct, NULL);
7c673cae
FG
4062 }
4063 }
4064 return w;
4065}
4066
20effc67 4067void BlueFS::_drain_writer(FileWriter *h)
7c673cae
FG
4068{
4069 dout(10) << __func__ << " " << h << " type " << h->writer_type << dendl;
f67539c2 4070 //h->buffer.reassign_to_mempool(mempool::mempool_bluefs_file_writer);
7c673cae
FG
4071 for (unsigned i=0; i<MAX_BDEV; ++i) {
4072 if (bdev[i]) {
11fdf7f2
TL
4073 if (h->iocv[i]) {
4074 h->iocv[i]->aio_wait();
20effc67 4075 delete h->iocv[i];
11fdf7f2 4076 }
7c673cae
FG
4077 }
4078 }
522d829b
TL
4079 // sanity
4080 if (h->file->fnode.size >= (1ull << 30)) {
4081 dout(10) << __func__ << " file is unexpectedly large:" << h->file->fnode << dendl;
4082 }
20effc67
TL
4083}
4084
4085void BlueFS::_close_writer(FileWriter *h)
4086{
4087 _drain_writer(h);
4088 delete h;
4089}
4090void BlueFS::close_writer(FileWriter *h)
4091{
4092 {
4093 std::lock_guard l(h->lock);
4094 _drain_writer(h);
4095 }
7c673cae
FG
4096 delete h;
4097}
4098
522d829b
TL
4099uint64_t BlueFS::debug_get_dirty_seq(FileWriter *h)
4100{
20effc67 4101 std::lock_guard l(h->lock);
522d829b
TL
4102 return h->file->dirty_seq;
4103}
4104
4105bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev)
4106{
20effc67 4107 std::lock_guard l(h->lock);
522d829b
TL
4108 return h->dirty_devs[dev];
4109}
4110
7c673cae 4111int BlueFS::open_for_read(
b3b6e05e
TL
4112 std::string_view dirname,
4113 std::string_view filename,
7c673cae 4114 FileReader **h,
20effc67 4115 bool random)/*_N*/
7c673cae 4116{
20effc67
TL
4117 _maybe_check_vselector_LNF();
4118 std::lock_guard nl(nodes.lock);
7c673cae
FG
4119 dout(10) << __func__ << " " << dirname << "/" << filename
4120 << (random ? " (random)":" (sequential)") << dendl;
20effc67
TL
4121 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4122 if (p == nodes.dir_map.end()) {
7c673cae
FG
4123 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4124 return -ENOENT;
4125 }
4126 DirRef dir = p->second;
4127
4128 map<string,FileRef>::iterator q = dir->file_map.find(filename);
4129 if (q == dir->file_map.end()) {
4130 dout(20) << __func__ << " dir " << dirname << " (" << dir
4131 << ") file " << filename
4132 << " not found" << dendl;
4133 return -ENOENT;
4134 }
4135 File *file = q->second.get();
4136
4137 *h = new FileReader(file, random ? 4096 : cct->_conf->bluefs_max_prefetch,
4138 random, false);
4139 dout(10) << __func__ << " h " << *h << " on " << file->fnode << dendl;
4140 return 0;
4141}
4142
4143int BlueFS::rename(
b3b6e05e 4144 std::string_view old_dirname, std::string_view old_filename,
20effc67 4145 std::string_view new_dirname, std::string_view new_filename)/*_LND*/
7c673cae 4146{
20effc67
TL
4147 std::lock_guard ll(log.lock);
4148 std::lock_guard nl(nodes.lock);
7c673cae
FG
4149 dout(10) << __func__ << " " << old_dirname << "/" << old_filename
4150 << " -> " << new_dirname << "/" << new_filename << dendl;
20effc67
TL
4151 map<string,DirRef>::iterator p = nodes.dir_map.find(old_dirname);
4152 if (p == nodes.dir_map.end()) {
7c673cae
FG
4153 dout(20) << __func__ << " dir " << old_dirname << " not found" << dendl;
4154 return -ENOENT;
4155 }
4156 DirRef old_dir = p->second;
4157 map<string,FileRef>::iterator q = old_dir->file_map.find(old_filename);
4158 if (q == old_dir->file_map.end()) {
4159 dout(20) << __func__ << " dir " << old_dirname << " (" << old_dir
4160 << ") file " << old_filename
4161 << " not found" << dendl;
4162 return -ENOENT;
4163 }
4164 FileRef file = q->second;
4165
20effc67
TL
4166 p = nodes.dir_map.find(new_dirname);
4167 if (p == nodes.dir_map.end()) {
7c673cae
FG
4168 dout(20) << __func__ << " dir " << new_dirname << " not found" << dendl;
4169 return -ENOENT;
4170 }
4171 DirRef new_dir = p->second;
4172 q = new_dir->file_map.find(new_filename);
4173 if (q != new_dir->file_map.end()) {
4174 dout(20) << __func__ << " dir " << new_dirname << " (" << old_dir
4175 << ") file " << new_filename
4176 << " already exists, unlinking" << dendl;
11fdf7f2 4177 ceph_assert(q->second != file);
20effc67
TL
4178 log.t.op_dir_unlink(new_dirname, new_filename);
4179 _drop_link_D(q->second);
7c673cae
FG
4180 }
4181
4182 dout(10) << __func__ << " " << new_dirname << "/" << new_filename << " "
4183 << " " << file->fnode << dendl;
4184
b3b6e05e
TL
4185 new_dir->file_map[string{new_filename}] = file;
4186 old_dir->file_map.erase(string{old_filename});
7c673cae 4187
20effc67
TL
4188 log.t.op_dir_link(new_dirname, new_filename, file->fnode.ino);
4189 log.t.op_dir_unlink(old_dirname, old_filename);
7c673cae
FG
4190 return 0;
4191}
4192
20effc67 4193int BlueFS::mkdir(std::string_view dirname)/*_LN*/
7c673cae 4194{
20effc67
TL
4195 std::lock_guard ll(log.lock);
4196 std::lock_guard nl(nodes.lock);
7c673cae 4197 dout(10) << __func__ << " " << dirname << dendl;
20effc67
TL
4198 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4199 if (p != nodes.dir_map.end()) {
7c673cae
FG
4200 dout(20) << __func__ << " dir " << dirname << " exists" << dendl;
4201 return -EEXIST;
4202 }
20effc67
TL
4203 nodes.dir_map[string{dirname}] = ceph::make_ref<Dir>();
4204 log.t.op_dir_create(dirname);
7c673cae
FG
4205 return 0;
4206}
4207
20effc67 4208int BlueFS::rmdir(std::string_view dirname)/*_LN*/
7c673cae 4209{
20effc67
TL
4210 std::lock_guard ll(log.lock);
4211 std::lock_guard nl(nodes.lock);
7c673cae 4212 dout(10) << __func__ << " " << dirname << dendl;
20effc67
TL
4213 auto p = nodes.dir_map.find(dirname);
4214 if (p == nodes.dir_map.end()) {
7c673cae
FG
4215 dout(20) << __func__ << " dir " << dirname << " does not exist" << dendl;
4216 return -ENOENT;
4217 }
4218 DirRef dir = p->second;
4219 if (!dir->file_map.empty()) {
4220 dout(20) << __func__ << " dir " << dirname << " not empty" << dendl;
4221 return -ENOTEMPTY;
4222 }
20effc67
TL
4223 nodes.dir_map.erase(string{dirname});
4224 log.t.op_dir_remove(dirname);
7c673cae
FG
4225 return 0;
4226}
4227
20effc67 4228bool BlueFS::dir_exists(std::string_view dirname)/*_N*/
7c673cae 4229{
20effc67
TL
4230 std::lock_guard nl(nodes.lock);
4231 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4232 bool exists = p != nodes.dir_map.end();
7c673cae
FG
4233 dout(10) << __func__ << " " << dirname << " = " << (int)exists << dendl;
4234 return exists;
4235}
4236
b3b6e05e 4237int BlueFS::stat(std::string_view dirname, std::string_view filename,
20effc67 4238 uint64_t *size, utime_t *mtime)/*_N*/
7c673cae 4239{
20effc67 4240 std::lock_guard nl(nodes.lock);
7c673cae 4241 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
20effc67
TL
4242 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4243 if (p == nodes.dir_map.end()) {
7c673cae
FG
4244 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4245 return -ENOENT;
4246 }
4247 DirRef dir = p->second;
4248 map<string,FileRef>::iterator q = dir->file_map.find(filename);
4249 if (q == dir->file_map.end()) {
4250 dout(20) << __func__ << " dir " << dirname << " (" << dir
4251 << ") file " << filename
4252 << " not found" << dendl;
4253 return -ENOENT;
4254 }
4255 File *file = q->second.get();
4256 dout(10) << __func__ << " " << dirname << "/" << filename
4257 << " " << file->fnode << dendl;
4258 if (size)
4259 *size = file->fnode.size;
4260 if (mtime)
4261 *mtime = file->fnode.mtime;
4262 return 0;
4263}
4264
b3b6e05e 4265int BlueFS::lock_file(std::string_view dirname, std::string_view filename,
20effc67 4266 FileLock **plock)/*_LN*/
7c673cae 4267{
20effc67
TL
4268 std::lock_guard ll(log.lock);
4269 std::lock_guard nl(nodes.lock);
7c673cae 4270 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
20effc67
TL
4271 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4272 if (p == nodes.dir_map.end()) {
7c673cae
FG
4273 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4274 return -ENOENT;
4275 }
4276 DirRef dir = p->second;
b3b6e05e 4277 auto q = dir->file_map.find(filename);
9f95a23c 4278 FileRef file;
7c673cae
FG
4279 if (q == dir->file_map.end()) {
4280 dout(20) << __func__ << " dir " << dirname << " (" << dir
4281 << ") file " << filename
4282 << " not found, creating" << dendl;
9f95a23c 4283 file = ceph::make_ref<File>();
7c673cae
FG
4284 file->fnode.ino = ++ino_last;
4285 file->fnode.mtime = ceph_clock_now();
20effc67 4286 nodes.file_map[ino_last] = file;
b3b6e05e 4287 dir->file_map[string{filename}] = file;
20effc67 4288 logger->set(l_bluefs_num_files, nodes.file_map.size());
7c673cae 4289 ++file->refs;
20effc67
TL
4290 log.t.op_file_update(file->fnode);
4291 log.t.op_dir_link(dirname, filename, file->fnode.ino);
7c673cae 4292 } else {
9f95a23c 4293 file = q->second;
7c673cae
FG
4294 if (file->locked) {
4295 dout(10) << __func__ << " already locked" << dendl;
11fdf7f2 4296 return -ENOLCK;
7c673cae
FG
4297 }
4298 }
4299 file->locked = true;
4300 *plock = new FileLock(file);
4301 dout(10) << __func__ << " locked " << file->fnode
4302 << " with " << *plock << dendl;
4303 return 0;
4304}
4305
20effc67 4306int BlueFS::unlock_file(FileLock *fl)/*_N*/
7c673cae 4307{
20effc67 4308 std::lock_guard nl(nodes.lock);
7c673cae 4309 dout(10) << __func__ << " " << fl << " on " << fl->file->fnode << dendl;
11fdf7f2 4310 ceph_assert(fl->file->locked);
7c673cae
FG
4311 fl->file->locked = false;
4312 delete fl;
4313 return 0;
4314}
4315
20effc67 4316int BlueFS::readdir(std::string_view dirname, vector<string> *ls)/*_N*/
7c673cae 4317{
b3b6e05e
TL
4318 // dirname may contain a trailing /
4319 if (!dirname.empty() && dirname.back() == '/') {
4320 dirname.remove_suffix(1);
4321 }
20effc67 4322 std::lock_guard nl(nodes.lock);
7c673cae
FG
4323 dout(10) << __func__ << " " << dirname << dendl;
4324 if (dirname.empty()) {
4325 // list dirs
20effc67
TL
4326 ls->reserve(nodes.dir_map.size() + 2);
4327 for (auto& q : nodes.dir_map) {
7c673cae
FG
4328 ls->push_back(q.first);
4329 }
4330 } else {
4331 // list files in dir
20effc67
TL
4332 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4333 if (p == nodes.dir_map.end()) {
7c673cae
FG
4334 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4335 return -ENOENT;
4336 }
4337 DirRef dir = p->second;
4338 ls->reserve(dir->file_map.size() + 2);
4339 for (auto& q : dir->file_map) {
4340 ls->push_back(q.first);
4341 }
4342 }
4343 ls->push_back(".");
4344 ls->push_back("..");
4345 return 0;
4346}
4347
20effc67 4348int BlueFS::unlink(std::string_view dirname, std::string_view filename)/*_LND*/
7c673cae 4349{
20effc67
TL
4350 std::lock_guard ll(log.lock);
4351 std::lock_guard nl(nodes.lock);
7c673cae 4352 dout(10) << __func__ << " " << dirname << "/" << filename << dendl;
20effc67
TL
4353 map<string,DirRef>::iterator p = nodes.dir_map.find(dirname);
4354 if (p == nodes.dir_map.end()) {
7c673cae
FG
4355 dout(20) << __func__ << " dir " << dirname << " not found" << dendl;
4356 return -ENOENT;
4357 }
4358 DirRef dir = p->second;
4359 map<string,FileRef>::iterator q = dir->file_map.find(filename);
4360 if (q == dir->file_map.end()) {
4361 dout(20) << __func__ << " file " << dirname << "/" << filename
4362 << " not found" << dendl;
4363 return -ENOENT;
4364 }
4365 FileRef file = q->second;
4366 if (file->locked) {
4367 dout(20) << __func__ << " file " << dirname << "/" << filename
4368 << " is locked" << dendl;
4369 return -EBUSY;
4370 }
b3b6e05e 4371 dir->file_map.erase(string{filename});
20effc67
TL
4372 log.t.op_dir_unlink(dirname, filename);
4373 _drop_link_D(file);
7c673cae
FG
4374 return 0;
4375}
d2e6a577
FG
4376
4377bool BlueFS::wal_is_rotational()
4378{
94b18763
FG
4379 if (bdev[BDEV_WAL]) {
4380 return bdev[BDEV_WAL]->is_rotational();
4381 } else if (bdev[BDEV_DB]) {
4382 return bdev[BDEV_DB]->is_rotational();
4383 }
4384 return bdev[BDEV_SLOW]->is_rotational();
d2e6a577 4385}
9f95a23c 4386
1d09f67e
TL
4387bool BlueFS::db_is_rotational()
4388{
4389 if (bdev[BDEV_DB]) {
4390 return bdev[BDEV_DB]->is_rotational();
4391 }
4392 return bdev[BDEV_SLOW]->is_rotational();
4393}
4394
f6b5b4d7
TL
4395/*
4396 Algorithm.
4397 do_replay_recovery_read is used when bluefs log abruptly ends, but it seems that more data should be there.
4398 Idea is to search disk for definition of extents that will be accompanied with bluefs log in future,
4399 and try if using it will produce healthy bluefs transaction.
4400 We encode already known bluefs log extents and search disk for these bytes.
4401 When we find it, we decode following bytes as extent.
4402 We read that whole extent and then check if merged with existing log part gives a proper bluefs transaction.
4403 */
20effc67 4404int BlueFS::_do_replay_recovery_read(FileReader *log_reader,
f6b5b4d7
TL
4405 size_t replay_pos,
4406 size_t read_offset,
4407 size_t read_len,
4408 bufferlist* bl) {
4409 dout(1) << __func__ << " replay_pos=0x" << std::hex << replay_pos <<
4410 " needs 0x" << read_offset << "~" << read_len << std::dec << dendl;
4411
4412 bluefs_fnode_t& log_fnode = log_reader->file->fnode;
4413 bufferlist bin_extents;
f67539c2 4414 ::encode(log_fnode.extents, bin_extents);
f6b5b4d7
TL
4415 dout(2) << __func__ << " log file encoded extents length = " << bin_extents.length() << dendl;
4416
4417 // cannot process if too small to effectively search
4418 ceph_assert(bin_extents.length() >= 32);
4419 bufferlist last_32;
4420 last_32.substr_of(bin_extents, bin_extents.length() - 32, 32);
4421
4422 //read fixed part from replay_pos to end of bluefs_log extents
4423 bufferlist fixed;
4424 uint64_t e_off = 0;
4425 auto e = log_fnode.seek(replay_pos, &e_off);
4426 ceph_assert(e != log_fnode.extents.end());
20effc67
TL
4427 int r = _bdev_read(e->bdev, e->offset + e_off, e->length - e_off, &fixed, ioc[e->bdev],
4428 cct->_conf->bluefs_buffered_io);
f6b5b4d7
TL
4429 ceph_assert(r == 0);
4430 //capture dev of last good extent
4431 uint8_t last_e_dev = e->bdev;
4432 uint64_t last_e_off = e->offset;
4433 ++e;
4434 while (e != log_fnode.extents.end()) {
20effc67
TL
4435 r = _bdev_read(e->bdev, e->offset, e->length, &fixed, ioc[e->bdev],
4436 cct->_conf->bluefs_buffered_io);
f6b5b4d7
TL
4437 ceph_assert(r == 0);
4438 last_e_dev = e->bdev;
4439 ++e;
4440 }
4441 ceph_assert(replay_pos + fixed.length() == read_offset);
4442
4443 dout(2) << __func__ << " valid data in log = " << fixed.length() << dendl;
4444
4445 struct compare {
4446 bool operator()(const bluefs_extent_t& a, const bluefs_extent_t& b) const {
4447 if (a.bdev < b.bdev) return true;
4448 if (a.offset < b.offset) return true;
4449 return a.length < b.length;
4450 }
4451 };
4452 std::set<bluefs_extent_t, compare> extents_rejected;
4453 for (int dcnt = 0; dcnt < 3; dcnt++) {
4454 uint8_t dev = (last_e_dev + dcnt) % MAX_BDEV;
4455 if (bdev[dev] == nullptr) continue;
4456 dout(2) << __func__ << " processing " << get_device_name(dev) << dendl;
4457 interval_set<uint64_t> disk_regions;
4458 disk_regions.insert(0, bdev[dev]->get_size());
20effc67 4459 for (auto f : nodes.file_map) {
f6b5b4d7
TL
4460 auto& e = f.second->fnode.extents;
4461 for (auto& p : e) {
4462 if (p.bdev == dev) {
4463 disk_regions.erase(p.offset, p.length);
4464 }
4465 }
4466 }
4467 size_t disk_regions_count = disk_regions.num_intervals();
4468 dout(5) << __func__ << " " << disk_regions_count << " regions to scan on " << get_device_name(dev) << dendl;
4469
4470 auto reg = disk_regions.lower_bound(last_e_off);
4471 //for all except first, start from beginning
4472 last_e_off = 0;
4473 if (reg == disk_regions.end()) {
4474 reg = disk_regions.begin();
4475 }
4476 const uint64_t chunk_size = 4 * 1024 * 1024;
4477 const uint64_t page_size = 4096;
4478 const uint64_t max_extent_size = 16;
4479 uint64_t overlay_size = last_32.length() + max_extent_size;
4480 for (size_t i = 0; i < disk_regions_count; reg++, i++) {
4481 if (reg == disk_regions.end()) {
4482 reg = disk_regions.begin();
4483 }
4484 uint64_t pos = reg.get_start();
4485 uint64_t len = reg.get_len();
4486
4487 std::unique_ptr<char[]> raw_data_p{new char[page_size + chunk_size]};
4488 char* raw_data = raw_data_p.get();
4489 memset(raw_data, 0, page_size);
4490
4491 while (len > last_32.length()) {
4492 uint64_t chunk_len = len > chunk_size ? chunk_size : len;
4493 dout(5) << __func__ << " read "
20effc67
TL
4494 << get_device_name(dev) << ":0x" << std::hex << pos << "+" << chunk_len
4495 << std::dec << dendl;
4496 r = _bdev_read_random(dev, pos, chunk_len,
4497 raw_data + page_size, cct->_conf->bluefs_buffered_io);
f6b5b4d7
TL
4498 ceph_assert(r == 0);
4499
4500 //search for fixed_last_32
4501 char* chunk_b = raw_data + page_size;
4502 char* chunk_e = chunk_b + chunk_len;
4503
4504 char* search_b = chunk_b - overlay_size;
4505 char* search_e = chunk_e;
4506
4507 for (char* sp = search_b; ; sp += last_32.length()) {
4508 sp = (char*)memmem(sp, search_e - sp, last_32.c_str(), last_32.length());
4509 if (sp == nullptr) {
4510 break;
4511 }
4512
4513 char* n = sp + last_32.length();
4514 dout(5) << __func__ << " checking location 0x" << std::hex << pos + (n - chunk_b) << std::dec << dendl;
4515 bufferlist test;
4516 test.append(n, std::min<size_t>(max_extent_size, chunk_e - n));
4517 bluefs_extent_t ne;
4518 try {
4519 bufferlist::const_iterator p = test.begin();
f67539c2 4520 ::decode(ne, p);
f6b5b4d7
TL
4521 } catch (buffer::error& e) {
4522 continue;
4523 }
4524 if (extents_rejected.count(ne) != 0) {
4525 dout(5) << __func__ << " extent " << ne << " already refected" <<dendl;
4526 continue;
4527 }
4528 //insert as rejected already. if we succeed, it wouldn't make difference.
4529 extents_rejected.insert(ne);
4530
4531 if (ne.bdev >= MAX_BDEV ||
4532 bdev[ne.bdev] == nullptr ||
4533 ne.length > 16 * 1024 * 1024 ||
4534 (ne.length & 4095) != 0 ||
4535 ne.offset + ne.length > bdev[ne.bdev]->get_size() ||
4536 (ne.offset & 4095) != 0) {
4537 dout(5) << __func__ << " refusing extent " << ne << dendl;
4538 continue;
4539 }
4540 dout(5) << __func__ << " checking extent " << ne << dendl;
4541
4542 //read candidate extent - whole
4543 bufferlist candidate;
4544 candidate.append(fixed);
20effc67
TL
4545 r = _bdev_read(ne.bdev, ne.offset, ne.length, &candidate, ioc[ne.bdev],
4546 cct->_conf->bluefs_buffered_io);
f6b5b4d7
TL
4547 ceph_assert(r == 0);
4548
4549 //check if transaction & crc is ok
4550 bluefs_transaction_t t;
4551 try {
f67539c2
TL
4552 bufferlist::const_iterator p = candidate.begin();
4553 ::decode(t, p);
f6b5b4d7
TL
4554 }
4555 catch (buffer::error& e) {
4556 dout(5) << __func__ << " failed match" << dendl;
4557 continue;
4558 }
4559
4560 //success, it seems a probable candidate
4561 uint64_t l = std::min<uint64_t>(ne.length, read_len);
4562 //trim to required size
4563 bufferlist requested_read;
4564 requested_read.substr_of(candidate, fixed.length(), l);
4565 bl->append(requested_read);
4566 dout(5) << __func__ << " successful extension of log " << l << "/" << read_len << dendl;
4567 log_fnode.append_extent(ne);
4568 log_fnode.recalc_allocated();
4569 log_reader->buf.pos += l;
4570 return l;
4571 }
4572 //save overlay for next search
4573 memcpy(search_b, chunk_e - overlay_size, overlay_size);
4574 pos += chunk_len;
4575 len -= chunk_len;
4576 }
4577 }
4578 }
4579 return 0;
4580}
4581
20effc67
TL
4582void BlueFS::_check_vselector_LNF() {
4583 BlueFSVolumeSelector* vs = vselector->clone_empty();
4584 if (!vs) {
4585 return;
4586 }
4587 std::lock_guard ll(log.lock);
4588 std::lock_guard nl(nodes.lock);
4589 // Checking vselector is under log, nodes and file(s) locks,
4590 // so any modification of vselector must be under at least one of those locks.
4591 for (auto& f : nodes.file_map) {
4592 f.second->lock.lock();
4593 vs->add_usage(f.second->vselector_hint, f.second->fnode);
4594 }
4595 bool res = vselector->compare(vs);
4596 if (!res) {
4597 dout(0) << "Current:";
4598 vselector->dump(*_dout);
4599 *_dout << dendl;
4600 dout(0) << "Expected:";
4601 vs->dump(*_dout);
4602 *_dout << dendl;
4603 }
4604 ceph_assert(res);
4605 for (auto& f : nodes.file_map) {
4606 f.second->lock.unlock();
4607 }
4608 delete vs;
4609}
4610
f67539c2 4611size_t BlueFS::probe_alloc_avail(int dev, uint64_t alloc_size)
9f95a23c 4612{
f67539c2
TL
4613 size_t total = 0;
4614 auto iterated_allocation = [&](size_t off, size_t len) {
4615 //only count in size that is alloc_size aligned
4616 size_t dist_to_alignment;
4617 size_t offset_in_block = off & (alloc_size - 1);
4618 if (offset_in_block == 0)
4619 dist_to_alignment = 0;
4620 else
4621 dist_to_alignment = alloc_size - offset_in_block;
4622 if (dist_to_alignment >= len)
4623 return;
4624 len -= dist_to_alignment;
4625 total += p2align(len, alloc_size);
4626 };
4627 if (alloc[dev]) {
1e59de90 4628 alloc[dev]->foreach(iterated_allocation);
9f95a23c 4629 }
f67539c2 4630 return total;
9f95a23c 4631}
9f95a23c
TL
4632// ===============================================
4633// OriginalVolumeSelector
4634
f6b5b4d7
TL
4635void* OriginalVolumeSelector::get_hint_for_log() const {
4636 return reinterpret_cast<void*>(BlueFS::BDEV_WAL);
9f95a23c 4637}
b3b6e05e 4638void* OriginalVolumeSelector::get_hint_by_dir(std::string_view dirname) const {
9f95a23c
TL
4639 uint8_t res = BlueFS::BDEV_DB;
4640 if (dirname.length() > 5) {
4641 // the "db.slow" and "db.wal" directory names are hard-coded at
4642 // match up with bluestore. the slow device is always the second
4643 // one (when a dedicated block.db device is present and used at
4644 // bdev 0). the wal device is always last.
a4b75251 4645 if (boost::algorithm::ends_with(dirname, ".slow") && slow_total) {
9f95a23c 4646 res = BlueFS::BDEV_SLOW;
a4b75251 4647 } else if (boost::algorithm::ends_with(dirname, ".wal") && wal_total) {
9f95a23c
TL
4648 res = BlueFS::BDEV_WAL;
4649 }
4650 }
4651 return reinterpret_cast<void*>(res);
4652}
4653
4654uint8_t OriginalVolumeSelector::select_prefer_bdev(void* hint)
4655{
4656 return (uint8_t)(reinterpret_cast<uint64_t>(hint));
4657}
4658
4659void OriginalVolumeSelector::get_paths(const std::string& base, paths& res) const
4660{
4661 res.emplace_back(base, db_total);
522d829b
TL
4662 res.emplace_back(base + ".slow",
4663 slow_total ? slow_total : db_total); // use fake non-zero value if needed to
4664 // avoid RocksDB complains
9f95a23c
TL
4665}
4666
4667#undef dout_prefix
4668#define dout_prefix *_dout << "OriginalVolumeSelector: "
4669
4670void OriginalVolumeSelector::dump(ostream& sout) {
4671 sout<< "wal_total:" << wal_total
4672 << ", db_total:" << db_total
4673 << ", slow_total:" << slow_total
4674 << std::endl;
4675}
f67539c2
TL
4676
4677// ===============================================
4678// FitToFastVolumeSelector
4679
4680void FitToFastVolumeSelector::get_paths(const std::string& base, paths& res) const {
4681 res.emplace_back(base, 1); // size of the last db_path has no effect
4682}